def handle_pytorch_experiment(experiment, spawner, response):
    """Create DB job records for the tasks started by a PyTorch experiment.

    `response` maps task types to the pod payloads returned by the spawner;
    the master is always present, workers may be empty.
    """
    # Master task: read its uuid back from the pod labels.
    master = response[TaskType.MASTER]
    master_uuid = uuid.UUID(master['pod']['metadata']['labels']['job_uuid'])
    create_job(job_uuid=master_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)

    cluster, is_distributed = spawner.spec.cluster_def
    worker_resources = PytorchSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)

    # One job record per worker pod; resources are looked up by worker index.
    for idx, worker in enumerate(response[TaskType.WORKER]):
        worker_uuid = uuid.UUID(worker['pod']['metadata']['labels']['job_uuid'])
        create_job(job_uuid=worker_uuid,
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   resources=worker_resources.get(idx))
def handle_base_experiment(experiment, spawner, response):
    """Create the DB job record for a plain (non-distributed) experiment.

    In the default case only a master pod is created by the experiment
    spawner, so a single job record is enough.
    """
    master = response[TaskType.MASTER]
    master_uuid = uuid.UUID(master['pod']['metadata']['labels']['job_uuid'])
    create_job(job_uuid=master_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)
def start_notebook(notebook):
    """Schedule a notebook job on Kubernetes and persist its definition.

    Marks the notebook SCHEDULED up front; on any failure sets FAILED with a
    human-readable message and returns early.
    """
    # Update job status to show that it's started.
    notebook.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job)
    except ValueError as e:
        _logger.warning('Could not start the notebook, %s', e)
        # Fixed typo: "was note found" -> "was not found".
        notebook.set_status(JobLifeCycle.FAILED,
                            message='External git repo was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', job_docker_image)

    spawner = NotebookSpawner(
        project_name=notebook.project.unique_name,
        project_uuid=notebook.project.uuid.hex,
        job_name=notebook.unique_name,
        job_uuid=notebook.uuid.hex,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True)

    try:
        results = spawner.start_notebook(image=job_docker_image,
                                         resources=notebook.resources,
                                         node_selectors=notebook.node_selectors)
    except ApiException as e:
        _logger.warning('Could not start notebook, please check your polyaxon spec %s', e)
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start notebook, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Catch-all boundary: surface the exception class in the status message.
        _logger.warning('Could not start notebook, please check your polyaxon spec %s', e)
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start notebook encountered an {} exception.'.format(
                e.__class__.__name__))
        return

    notebook.definition = get_job_definition(results)
    notebook.save()
def start_dockerizer(build_job):
    """Schedule a dockerizer (image build) job on Kubernetes.

    Returns True when the spawner started the job, False on any failure.
    """
    # Update job status to show that it's started.
    build_job.set_status(JobLifeCycle.SCHEDULED)

    spawner = DockerizerSpawner(project_name=build_job.project.unique_name,
                                project_uuid=build_job.project.uuid.hex,
                                job_name=build_job.unique_name,
                                job_uuid=build_job.uuid.hex,
                                k8s_config=settings.K8S_CONFIG,
                                namespace=settings.K8S_NAMESPACE,
                                in_cluster=True)

    def _fail(message):
        # Single place to flip the job into FAILED state.
        build_job.set_status(JobLifeCycle.FAILED, message=message)

    try:
        results = spawner.start_dockerizer(
            resources=build_job.resources,
            node_selectors=build_job.node_selectors)
        auditor.record(event_type=BUILD_JOB_STARTED,
                       instance=build_job,
                       target='project')
    except ApiException as exc:
        _logger.warning('Could not start build job, please check your polyaxon spec %s', exc)
        _fail('Could not start build job, encountered a Kubernetes ApiException.')
        return False
    except Exception as exc:
        _logger.warning('Could not start build job, please check your polyaxon spec %s', exc)
        _fail('Could not start build job encountered an {} exception.'.format(
            exc.__class__.__name__))
        return False

    build_job.definition = get_job_definition(results)
    build_job.save()
    return True
def start_tensorboard(tensorboard):
    """Schedule a tensorboard job on Kubernetes and persist its definition."""
    # Update job status to show that it's started.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(project_name=tensorboard.project.unique_name,
                                 project_uuid=tensorboard.project.uuid.hex,
                                 job_name=tensorboard.unique_name,
                                 job_uuid=tensorboard.uuid.hex,
                                 k8s_config=settings.K8S_CONFIG,
                                 namespace=settings.K8S_NAMESPACE,
                                 in_cluster=True)

    try:
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            resources=tensorboard.resources,
            node_selectors=tensorboard.node_selectors)
    except ApiException as exc:
        _logger.warning('Could not start tensorboard, please check your polyaxon spec %s', exc)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message='Could not start tensorboard, encountered a Kubernetes ApiException.')
        return
    except Exception as exc:
        # Catch-all boundary: expose the exception class in the status message.
        _logger.warning('Could not start tensorboard, please check your polyaxon spec %s', exc)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message='Could not start tensorboard encountered an {} exception.'.format(
                exc.__class__.__name__))
        return

    tensorboard.definition = get_job_definition(results)
    tensorboard.save()
def start_notebook(notebook):
    """Schedule a notebook job on Kubernetes.

    Marks the notebook SCHEDULED first; on success saves the job definition
    and returns. Any failure is collected into `error` and the FAILED status
    (message + traceback) is applied once in the `finally` clause.
    """
    # Update job status to show that it's started.
    notebook.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the notebook.', exc_info=True)
        notebook.set_status(JobLifeCycle.FAILED,
                            message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', job_docker_image)

    spawner = NotebookSpawner(project_name=notebook.project.unique_name,
                              project_uuid=notebook.project.uuid.hex,
                              job_name=notebook.unique_name,
                              job_uuid=notebook.uuid.hex,
                              k8s_config=conf.get('K8S_CONFIG'),
                              namespace=conf.get('K8S_NAMESPACE'),
                              job_docker_image=job_docker_image,
                              in_cluster=True)
    error = {}
    try:
        mount_code_in_notebooks = conf.get('MOUNT_CODE_IN_NOTEBOOKS')
        results = spawner.start_notebook(
            persistence_outputs=notebook.persistence_outputs,
            persistence_data=notebook.persistence_data,
            outputs_refs_jobs=notebook.outputs_refs_jobs,
            outputs_refs_experiments=notebook.outputs_refs_experiments,
            resources=notebook.resources,
            secret_refs=notebook.secret_refs,
            configmap_refs=notebook.configmap_refs,
            node_selector=notebook.node_selector,
            affinity=notebook.affinity,
            tolerations=notebook.tolerations,
            backend=notebook.backend,
            mount_code_in_notebooks=mount_code_in_notebooks)
        notebook.definition = get_job_definition(results)
        notebook.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as e:
        _logger.error('Could not start the notebook, please check your volume definitions',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            # Reconstructed string literal that was broken across lines in source.
            'message': 'Could not start the job, encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start notebook encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            notebook.set_status(JobLifeCycle.FAILED,
                                message=error.get('message'),
                                traceback=error.get('traceback'))
def start_dockerizer(build_job):
    """Schedule a dockerizer (image build) job on Kubernetes.

    Returns True when the spawner started the job; on failure records the
    FAILED status with message and traceback via the `finally` clause and
    returns None.
    """
    # Update job status to show that it's started.
    build_job.set_status(JobLifeCycle.SCHEDULED)

    spawner = DockerizerSpawner(project_name=build_job.project.unique_name,
                                project_uuid=build_job.project.uuid.hex,
                                job_name=build_job.unique_name,
                                job_uuid=build_job.uuid.hex,
                                k8s_config=conf.get('K8S_CONFIG'),
                                namespace=conf.get('K8S_NAMESPACE'),
                                in_cluster=True)

    error = {}
    try:
        results = spawner.start_dockerizer(
            resources=build_job.resources,
            node_selector=build_job.node_selector,
            affinity=build_job.affinity,
            tolerations=build_job.tolerations)
        auditor.record(event_type=BUILD_JOB_STARTED, instance=build_job)
        build_job.definition = get_job_definition(results)
        build_job.save(update_fields=['definition'])
        return True
    except ApiException:
        _logger.error('Could not start build job, please check your polyaxon spec',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a Kubernetes ApiException.'
        }
    except VolumeNotFoundError as exc:
        _logger.error('Could not start build job, please check your volume definitions.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a volume definition problem. %s' % exc
        }
    except Exception as exc:
        _logger.error('Could not start build job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job encountered an {} exception.'.format(
                exc.__class__.__name__)
        }
    finally:
        # Applied on every failure path; the success path returned above.
        if error.get('raised'):
            build_job.set_status(JobLifeCycle.FAILED,
                                 message=error.get('message'),
                                 traceback=error.get('traceback'))
def start_job(job):
    """Schedule a generic job on Kubernetes.

    Marks the job SCHEDULED first; on success saves the job definition and
    returns. Failures are collected into `error` and the FAILED status is
    applied once in the `finally` clause.
    """
    # Update job status to show that it's started.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED,
                       message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    error = {}
    try:
        results = spawner.start_job(
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            resources=job.resources,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations)
        job.definition = get_job_definition(results)
        job.save()
        return
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as e:
        _logger.error('Could not start the job, please check your volume definitions.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            # Reconstructed string literal that was broken across lines in source.
            'message': 'Could not start the job, encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start job encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            job.set_status(JobLifeCycle.FAILED,
                           message=error.get('message'),
                           traceback=error.get('traceback'))
def start_job(job):
    """Schedule a generic job on Kubernetes and persist its definition.

    Marks the job SCHEDULED up front; on any failure sets FAILED with a
    human-readable message and returns early.
    """
    # Update job status to show that it's started.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except ValueError as e:
        _logger.warning('Could not start the job, %s', e)
        # Fixed typo: "was note found" -> "was not found".
        job.set_status(JobLifeCycle.FAILED,
                       message='External git repo was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        results = spawner.start_job(
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            resources=job.resources,
            node_selectors=job.node_selectors)
    except ApiException as e:
        _logger.warning('Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job, encountered a Kubernetes ApiException.')
        return
    except VolumeNotFoundError as e:
        _logger.warning('Could not start the job, please check your volume definitions %s', e)
        job.set_status(JobLifeCycle.FAILED,
                       message='Could not start the job, '
                               'encountered a volume definition problem. %s' % e)
        # Consistency fix: every failure path in this function returns None;
        # this branch previously returned False.
        return
    except Exception as e:
        _logger.warning('Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job encountered an {} exception.'.format(
                e.__class__.__name__))
        return

    job.definition = get_job_definition(results)
    job.save()
def start_tensorboard(tensorboard):
    """Schedule a tensorboard job on Kubernetes.

    On success saves the job definition and returns; failures are gathered
    into `error` and the FAILED status applied once in the `finally` clause.
    """
    # Update job status to show that it's started.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(project_name=tensorboard.project.unique_name,
                                 project_uuid=tensorboard.project.uuid.hex,
                                 job_name=tensorboard.unique_name,
                                 job_uuid=tensorboard.uuid.hex,
                                 k8s_config=settings.K8S_CONFIG,
                                 namespace=settings.K8S_NAMESPACE,
                                 in_cluster=True)

    error = {}
    try:
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            outputs_path=tensorboard.outputs_path,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations)
        tensorboard.definition = get_job_definition(results)
        tensorboard.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error('Could not start tensorboard, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as exc:
        _logger.error('Could not start the tensorboard, please check your volume definitions.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a volume definition problem. %s' % exc,
        }
    except Exception as exc:
        _logger.error('Could not start tensorboard, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start tensorboard encountered an {} exception.'.format(
                exc.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            tensorboard.set_status(JobLifeCycle.FAILED,
                                   message=error.get('message'),
                                   traceback=error.get('traceback'))
def handle_base_experiment(response):
    """Record the job definition for the single master task of an experiment."""
    master = response[TaskType.MASTER]
    # The job uuid is carried on the pod labels set by the spawner.
    master_uuid = uuid.UUID(master['pod']['metadata']['labels']['job_uuid'])
    set_job_definition(job_uuid=master_uuid,
                       definition=get_job_definition(master))
def start_tensorboard(tensorboard):
    """Schedule a tensorboard job on Kubernetes.

    On success saves the job definition and returns; failures are gathered
    into `error` and the FAILED status applied once in the `finally` clause.
    """
    # Update job status to show that it's started.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(project_name=tensorboard.project.unique_name,
                                 project_uuid=tensorboard.project.uuid.hex,
                                 job_name=tensorboard.unique_name,
                                 job_uuid=tensorboard.uuid.hex,
                                 k8s_config=conf.get(K8S_CONFIG),
                                 namespace=conf.get(K8S_NAMESPACE),
                                 job_docker_image=tensorboard.build_image,
                                 in_cluster=True)

    error = {}
    # outputs_path yields both the store specs and the concrete paths to mount.
    outputs_specs, tensorboard_paths = tensorboard.outputs_path
    try:
        results = spawner.start_tensorboard(
            outputs_path=tensorboard_paths,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_specs=outputs_specs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            labels=tensorboard.labels,
            annotations=tensorboard.annotations,
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations,
            max_restarts=get_max_restart(tensorboard.max_restarts,
                                         conf.get(MAX_RESTARTS_TENSORBOARDS)),
            reconcile_url=get_tensorboard_reconcile_url(tensorboard.unique_name))
        tensorboard.definition = get_job_definition(results)
        tensorboard.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error('Could not start tensorboard, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except StoreNotFoundError as e:
        _logger.error('Could not start the tensorboard, please check your volume definitions.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            # Reconstructed string literal that was broken across lines in source.
            'message': 'Could not start the job, encountered a volume definition problem. %s' % e,
        }
    except TensorboardValidation as e:
        _logger.error('Could not start the tensorboard, '
                      'some experiments require authenticating to stores with different access.',
                      exc_info=True)
        error = {
            'raised': True,
            # Validation errors are expected user errors: no traceback recorded.
            'traceback': None,
            'message': 'Could not start the tensorboard, '
                       'some experiments require authenticating '
                       'to stores with different access. %s' % e,
        }
    except Exception as e:
        _logger.error('Could not start tensorboard, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start tensorboard encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            tensorboard.set_status(JobLifeCycle.FAILED,
                                   message=error.get('message'),
                                   traceback=error.get('traceback'))
def start_notebook(notebook):
    """Schedule a notebook job on Kubernetes and persist its definition.

    Marks the notebook SCHEDULED up front; on any failure sets FAILED with a
    human-readable message and returns early.
    """
    # Update job status to show that it's started.
    notebook.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the notebook.', exc_info=True)
        notebook.set_status(JobLifeCycle.FAILED,
                            message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', job_docker_image)

    spawner = NotebookSpawner(project_name=notebook.project.unique_name,
                              project_uuid=notebook.project.uuid.hex,
                              job_name=notebook.unique_name,
                              job_uuid=notebook.uuid.hex,
                              k8s_config=settings.K8S_CONFIG,
                              namespace=settings.K8S_NAMESPACE,
                              in_cluster=True)

    try:
        # Commits are only possible when a repos claim exists or the notebook
        # is pinned to specific nodes.
        allow_commits = False
        if settings.REPOS_CLAIM_NAME or notebook.node_selector:
            allow_commits = True
        results = spawner.start_notebook(
            image=job_docker_image,
            persistence_outputs=notebook.persistence_outputs,
            persistence_data=notebook.persistence_data,
            outputs_refs_jobs=notebook.outputs_refs_jobs,
            outputs_refs_experiments=notebook.outputs_refs_experiments,
            resources=notebook.resources,
            node_selector=notebook.node_selector,
            affinity=notebook.affinity,
            tolerations=notebook.tolerations,
            allow_commits=allow_commits)
    except ApiException:
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start notebook, encountered a Kubernetes ApiException.')
        return
    except VolumeNotFoundError as e:
        _logger.error('Could not start the notebook, please check your volume definitions',
                      exc_info=True)
        # Reconstructed string literal that was broken across lines in source.
        notebook.set_status(JobLifeCycle.FAILED,
                            message='Could not start the notebook, '
                                    'encountered a volume definition problem. %s' % e)
        # Consistency fix: every failure path in this function returns None;
        # this branch previously returned False.
        return
    except Exception as e:
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start notebook encountered an {} exception.'.format(
                e.__class__.__name__))
        return

    notebook.definition = get_job_definition(results)
    notebook.save()
def start_job(job):
    """Schedule a generic job on Kubernetes.

    Resolves the registry and built image first, then spawns the job; on
    success saves the definition and returns. Failures after spawning are
    gathered into `error` and applied once in the `finally` clause.
    """
    # Update job status to show that it's started.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        registry_spec = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the job, please check your registry configuration.')
        return

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job,
                                               registry_host=registry_spec.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED,
                       message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(project_name=job.project.unique_name,
                         project_uuid=job.project.uuid.hex,
                         job_name=job.unique_name,
                         job_uuid=job.uuid.hex,
                         k8s_config=conf.get(K8S_CONFIG),
                         namespace=conf.get(K8S_NAMESPACE),
                         job_docker_image=job_docker_image,
                         in_cluster=True,
                         use_sidecar=True,
                         log_level=job.specification.log_level)

    error = {}
    try:
        results = spawner.start_job(
            container_cmd_callback=job.specification.run.get_container_cmd,
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            secret_refs=job.secret_refs,
            config_map_refs=job.config_map_refs,
            resources=job.resources,
            labels=job.labels,
            annotations=job.annotations,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations,
            max_restarts=get_max_restart(job.max_restarts,
                                         conf.get(MAX_RESTARTS_JOBS)),
            reconcile_url=get_job_reconcile_url(job.unique_name))
        job.definition = get_job_definition(results)
        job.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            # Reconstructed string literal that was broken across lines in source.
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except StoreNotFoundError as e:
        _logger.error('Could not start the job, please check your volume definitions.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start job encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            job.set_status(JobLifeCycle.FAILED,
                           message=error.get('message'),
                           traceback=error.get('traceback'))
def start_dockerizer(build_job):
    """Schedule a dockerizer (image build) job on Kubernetes.

    Picks the spawner class from the build backend; returns True on success,
    otherwise records FAILED (message + traceback) in the `finally` clause.
    """
    # Update job status to show that it's started.
    build_job.set_status(JobLifeCycle.SCHEDULED)

    spawner_class = get_spawner_class(build_job.backend)
    # Native (or unset) backend builds push to the local in-cluster registry.
    local_build = build_job.backend in {BuildBackend.NATIVE, None}

    spawner = spawner_class(
        project_name=build_job.project.unique_name,
        project_uuid=build_job.project.uuid.hex,
        job_name=build_job.unique_name,
        job_uuid=build_job.uuid.hex,
        commit=build_job.commit,
        from_image=build_job.build_image,
        dockerfile_path=build_job.build_dockerfile,
        context_path=build_job.build_context,
        image_tag=build_job.uuid.hex,
        image_name=get_image_name(build_job, local=local_build),
        build_steps=build_job.build_steps,
        env_vars=build_job.build_env_vars,
        nocache=build_job.build_nocache,
        in_cluster_registry=conf.get('REGISTRY_IN_CLUSTER'),
        spec=build_job.specification,
        k8s_config=conf.get('K8S_CONFIG'),
        namespace=conf.get('K8S_NAMESPACE'),
        in_cluster=True,
        use_sidecar=True)

    error = {}
    try:
        results = spawner.start_dockerizer(
            resources=build_job.resources,
            node_selector=build_job.node_selector,
            affinity=build_job.affinity,
            tolerations=build_job.tolerations)
        auditor.record(event_type=BUILD_JOB_STARTED, instance=build_job)
        build_job.definition = get_job_definition(results)
        build_job.save(update_fields=['definition'])
        return True
    except ApiException:
        _logger.error('Could not start build job, please check your polyaxon spec',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a Kubernetes ApiException.'
        }
    except VolumeNotFoundError as e:
        _logger.error('Could not start build job, please check your volume definitions.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            # Reconstructed string literal that was broken across lines in source.
            'message': 'Could not start build job, encountered a volume definition problem. %s' % e
        }
    except Exception as e:
        _logger.error('Could not start build job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            build_job.set_status(JobLifeCycle.FAILED,
                                 message=error.get('message'),
                                 traceback=error.get('traceback'))
def handle_tensorflow_experiment(experiment, spawner, response):
    """Create DB job records for the master/worker/ps tasks of a TF experiment.

    `response` maps task types to the pod payloads returned by the spawner.
    Per-task resources, node selectors, affinities and tolerations are looked
    up by task index from the experiment specification.
    """
    # Master task.
    master = response[TaskType.MASTER]
    master_uuid = uuid.UUID(master['pod']['metadata']['labels']['job_uuid'])
    create_job(job_uuid=master_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources,
               node_selector=spawner.spec.master_node_selector,
               affinity=spawner.spec.master_affinity,
               tolerations=spawner.spec.master_tolerations)

    cluster, is_distributed = spawner.spec.cluster_def

    def _spec(getter):
        # Every TensorflowSpecification getter takes the same three kwargs.
        return getter(environment=spawner.spec.environment,
                      cluster=cluster,
                      is_distributed=is_distributed)

    # Worker tasks.
    worker_resources = _spec(TensorflowSpecification.get_worker_resources)
    worker_node_selectors = _spec(TensorflowSpecification.get_worker_node_selectors)
    worker_affinities = _spec(TensorflowSpecification.get_worker_affinities)
    worker_tolerations = _spec(TensorflowSpecification.get_worker_tolerations)

    for idx, worker in enumerate(response[TaskType.WORKER]):
        worker_uuid = uuid.UUID(worker['pod']['metadata']['labels']['job_uuid'])
        create_job(job_uuid=worker_uuid,
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   sequence=idx,
                   resources=worker_resources.get(idx),
                   node_selector=worker_node_selectors.get(idx),
                   affinity=worker_affinities.get(idx),
                   tolerations=worker_tolerations.get(idx))

    # Parameter-server tasks.
    ps_resources = _spec(TensorflowSpecification.get_ps_resources)
    ps_node_selectors = _spec(TensorflowSpecification.get_ps_node_selectors)
    ps_affinities = _spec(TensorflowSpecification.get_ps_affinities)
    ps_tolerations = _spec(TensorflowSpecification.get_ps_tolerations)

    for idx, ps in enumerate(response[TaskType.PS]):
        ps_uuid = uuid.UUID(ps['pod']['metadata']['labels']['job_uuid'])
        create_job(job_uuid=ps_uuid,
                   experiment=experiment,
                   definition=get_job_definition(ps),
                   role=TaskType.PS,
                   sequence=idx,
                   resources=ps_resources.get(idx),
                   node_selector=ps_node_selectors.get(idx),
                   affinity=ps_affinities.get(idx),
                   tolerations=ps_tolerations.get(idx))
def start_dockerizer(build_job):
    """Schedule a dockerizer (image build) job on Kubernetes.

    Resolves the container registry for the build backend, then spawns the
    build. Returns True on success; otherwise records FAILED (message +
    traceback) in the `finally` clause.
    """
    # Update job status to show that it's started.
    build_job.set_status(JobLifeCycle.SCHEDULED)

    spawner_class = get_spawner_class(build_job.backend)

    try:
        registry_spec = get_registry_context(build_backend=build_job.backend)
    except ContainerRegistryError:
        build_job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the dockerizer job, please check your registry configuration.')
        return

    spawner = spawner_class(
        project_name=build_job.project.unique_name,
        project_uuid=build_job.project.uuid.hex,
        job_name=build_job.unique_name,
        job_uuid=build_job.uuid.hex,
        commit=build_job.commit,
        from_image=build_job.build_image,
        dockerfile_path=build_job.build_dockerfile,
        context_path=build_job.build_context,
        image_tag=build_job.uuid.hex,
        image_name=get_image_name(build_job=build_job,
                                  registry_host=registry_spec.host),
        build_steps=build_job.build_steps,
        env_vars=build_job.build_env_vars,
        lang_env=build_job.build_lang_env,
        nocache=build_job.build_nocache,
        insecure=registry_spec.insecure,
        creds_secret_ref=registry_spec.secret,
        creds_secret_items=registry_spec.secret_items,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        in_cluster=True,
        use_sidecar=True,
        log_level=build_job.specification.log_level)

    error = {}
    try:
        results = spawner.start_dockerizer(
            secret_refs=build_job.secret_refs,
            config_map_refs=build_job.config_map_refs,
            resources=build_job.resources,
            labels=build_job.labels,
            annotations=build_job.annotations,
            node_selector=build_job.node_selector,
            affinity=build_job.affinity,
            tolerations=build_job.tolerations,
            max_restarts=get_max_restart(build_job.max_restarts,
                                         conf.get(MAX_RESTARTS_BUILD_JOBS)),
            reconcile_url=get_build_reconcile_url(build_job.unique_name))
        auditor.record(event_type=BUILD_JOB_STARTED, instance=build_job)
        build_job.definition = get_job_definition(results)
        build_job.save(update_fields=['definition'])
        return True
    except ApiException:
        _logger.error('Could not start build job, please check your polyaxon spec',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a Kubernetes ApiException.'
        }
    except StoreNotFoundError as exc:
        _logger.error('Could not start build job, please check your volume definitions.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a volume definition problem. %s' % exc
        }
    except Exception as exc:
        _logger.error('Could not start build job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job encountered an {} exception.'.format(
                exc.__class__.__name__)
        }
    finally:
        if error.get('raised'):
            build_job.set_status(JobLifeCycle.FAILED,
                                 message=error.get('message'),
                                 traceback=error.get('traceback'))