Example #1
    def create_pytorch_job(self, namespace, pytorchjob):
        """Create the provided PyTorchJob in the specified namespace.
        The PyTorchJob version is defined in PYTORCH_JOB_VERSION in kubeflow.pytorch.constants.
        The version PyTorchJob need to be installed before creating the PyTorchJob.

        :param namespace: The custom resource
        :param pytorchjob: The JSON schema of the Resource to create
        :returns: object: Created TFJob.

        """
        pytorchjob_client = PyTorchJobClient()
        try:
            return pytorchjob_client.create(pytorchjob, namespace=namespace)
        except client.rest.ApiException:
            raise RuntimeError(
                "Failed to create PyTorchJob. Perhaps the PyTorchJob CRD version "
                "{} is not installed? (If you use a different version, you can pass "
                "it as an environment variable called `PYTORCH_JOB_VERSION`.)".format(
                    constants.PYTORCH_JOB_VERSION))
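A minimal usage sketch for the method above, assuming the surrounding class is exposed as a wrapper called KubeflowClient and that a plain dict is accepted as the PyTorchJob body (the wrapper class name, job name, and container image below are hypothetical):

# Hedged usage sketch: the wrapper class name, job name and image are assumptions.
pytorchjob_body = {
    "apiVersion": "kubeflow.org/v1",
    "kind": "PyTorchJob",
    "metadata": {"name": "mnist-train"},
    "spec": {
        "pytorchReplicaSpecs": {
            "Master": {
                "replicas": 1,
                "restartPolicy": "OnFailure",
                "template": {
                    "spec": {
                        "containers": [{
                            "name": "pytorch",                    # the PyTorch operator expects this container name
                            "image": "example.com/mnist:latest",  # hypothetical image
                        }]
                    }
                },
            }
        }
    },
}

kubeflow_client = KubeflowClient()  # hypothetical class exposing create_pytorch_job()
created = kubeflow_client.create_pytorch_job("default", pytorchjob_body)
print(created["metadata"]["name"])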
Example #2
def launch_job(client: PyTorchJobClient, job: V1PyTorchJob):
    """
    Launch PyTorchJob on kubeflow pipeline

    """

    ret = client.create(job)  # created PyTorchJob returned by the client (accessed as a dict below)
    LOGGER.info('Launched PyTorchJob %s', ret)
    job_name = ret['metadata']['name']
    namespace = ret['metadata']['namespace']

    LOGGER.debug('setup sigterm handler')
    delete_job_func = _get_delete_pytorch_job_func(client, job_name, namespace)
    signal.signal(signal.SIGTERM, delete_job_func)

    job = client.wait_for_condition(
        job_name, ['Created', 'Failed'], namespace=namespace,
        status_callback=lambda x: LOGGER.debug(
            'PyTorchJob Conditions\n %s',
            x.get("status", {}).get("conditions", ['None Condition'])[-1]))

    if job.get("status", {}).get("conditions", [])[0]['type'] == 'Failed':
        LOGGER.error('Cancelling PyTorchJob: %s', job_name)
        LOGGER.error('Unexpected condition. Please check the job status below:')
        LOGGER.error(job)

        sys.exit(1)

    LOGGER.info('PyTorchJob created: %s', job_name)
    for _pname in client.get_pod_names(job_name, namespace=namespace):
        LOGGER.info('Pod name: %s', _pname)

    master_pod_name = list(client.get_pod_names(job_name, namespace=namespace, master=True))[0]
    master_pod = client.core_api.read_namespaced_pod(master_pod_name, namespace, pretty='true')

    LOGGER.debug('master pod spec')
    LOGGER.debug(master_pod)

    labels = utils.get_labels(job_name, master=True)
    LOGGER.info('waiting until the master pod is running; target selector: %s', labels)
    w = watch.Watch()
    last_pod_info = None
    for event in w.stream(client.core_api.list_namespaced_pod,
                          namespace,
                          label_selector=utils.to_selector(labels)):
        last_pod_info = event['object']  # type: V1Pod
        # status.conditions may be None before the pod has any conditions.
        conditions = last_pod_info.status.conditions or []
        LOGGER.debug("Event: %s %s %s %s",
                     event['type'],
                     last_pod_info.metadata.name,
                     last_pod_info.status.phase,
                     conditions[-1] if conditions else 'none')

        if last_pod_info.status.phase in ['Succeeded', 'Failed', 'Unknown'] or (
                last_pod_info.status.phase == 'Running'
                and conditions
                and conditions[-1].type == 'PodScheduled'):
            w.stop()

    if last_pod_info.status.phase in ['Failed', 'Unknown']:
        LOGGER.error('Cancelling PyTorchJob: %s', job_name)
        LOGGER.error('master pod status: %s', last_pod_info.status.phase)
        LOGGER.error('Please check the pod status below:')
        LOGGER.error(last_pod_info)

        sys.exit(1)

    LOGGER.info('start watching the PyTorchJob master pod log')
    for line in client.core_api.read_namespaced_pod_log(master_pod_name,
                                                        namespace,
                                                        container='pytorch',
                                                        follow=True,
                                                        _preload_content=False).stream():
        LOGGER.info(line.decode('utf-8')[:-1])

    client.wait_for_job(job_name, namespace=namespace)

    LOGGER.info('Delete PyTorchJob')
    delete_job_func()

    LOGGER.info('Launched job has finished')
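The helper _get_delete_pytorch_job_func referenced above is not shown in this example. A minimal sketch of what it might look like, assuming PyTorchJobClient.delete(name, namespace=...) is used for cleanup; the returned callable accepts optional arguments so it can double as the SIGTERM handler registered in launch_job():

def _get_delete_pytorch_job_func(client: PyTorchJobClient, job_name: str, namespace: str):
    """Build a cleanup callable for the given PyTorchJob (sketch, not the original helper)."""
    def _delete(*_signal_args):
        # Accepts the (signum, frame) arguments passed by signal handlers,
        # but also works when called with no arguments.
        LOGGER.info('Deleting PyTorchJob %s in namespace %s', job_name, namespace)
        client.delete(job_name, namespace=namespace)

    return _delete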