def create_master(self):
    """Create the job for the master task (``TaskType.MASTER``, index 0).

    Every per-task setting (command/args, env vars, resources, annotations,
    node selector, affinity, tolerations) is looked up for the master task
    and forwarded to ``self._create_job`` along with the restart limit
    computed by ``get_max_restart``.

    Returns:
        The value returned by ``self._create_job``.
    """
    # All getters are queried for the same task, so build the kwargs once.
    master_task = {'task_type': TaskType.MASTER, 'task_idx': 0}

    command, args = self.get_pod_command_args(**master_task)
    job_settings = {
        'command': command,
        'args': args,
        'env_vars': self.get_env_vars(**master_task),
        'resources': self.get_resources(**master_task),
        'annotations': self.get_annotations(**master_task),
        'node_selector': self.get_node_selector(**master_task),
        'affinity': self.get_affinity(**master_task),
        'tolerations': self.get_tolerations(**master_task),
        'max_restarts': get_max_restart(self.spec.max_restarts,
                                        conf.get(MAX_RESTARTS_EXPERIMENTS)),
    }
    job_settings.update(master_task)
    return self._create_job(add_service=self.MASTER_SERVICE, **job_settings)
def create_multi_jobs(self, task_type, add_service):
    """Create one job per pod of the given task type.

    Args:
        task_type: Task type whose pods should be created; it is passed to
            every per-task getter and to ``self._create_job``.
        add_service: Forwarded unchanged to ``self._create_job``.

    Returns:
        A list with the result of ``self._create_job`` for each task index
        in ``range(self.get_n_pods(task_type=task_type))``, in index order.
    """
    n_pods = self.get_n_pods(task_type=task_type)
    # The restart limit does not depend on the task index: resolve it once.
    restart_limit = get_max_restart(self.spec.max_restarts,
                                    conf.get(MAX_RESTARTS_EXPERIMENTS))

    def _create_for_index(task_idx):
        """Gather the settings of a single task index and create its job."""
        task = {'task_type': task_type, 'task_idx': task_idx}
        command, args = self.get_pod_command_args(**task)
        env_vars = self.get_env_vars(**task)
        resources = self.get_resources(**task)
        annotations = self.get_annotations(**task)
        node_selector = self.get_node_selector(**task)
        affinity = self.get_affinity(**task)
        tolerations = self.get_tolerations(**task)
        return self._create_job(command=command,
                                args=args,
                                env_vars=env_vars,
                                resources=resources,
                                annotations=annotations,
                                node_selector=node_selector,
                                affinity=affinity,
                                tolerations=tolerations,
                                add_service=add_service,
                                max_restarts=restart_limit,
                                **task)

    return [_create_for_index(task_idx) for task_idx in range(n_pods)]
def start_notebook(notebook):
    """Mark a notebook as scheduled and spawn it.

    Steps, in order:

    1. Set the notebook status to ``JobLifeCycle.SCHEDULED``.
    2. Resolve the registry context; on ``ContainerRegistryError`` the
       notebook is set to ``FAILED`` and the function returns.
    3. Resolve the image name/tag from ``notebook.build_job``; on
       ``ValueError``/``AttributeError`` the notebook is set to ``FAILED``
       and the function returns.
    4. Start a ``NotebookSpawner`` and save the resulting job definition on
       the notebook.

    Any exception raised in step 4 is logged and converted into a ``FAILED``
    status carrying a message and the formatted traceback.

    Returns:
        None on every path.
    """
    # Flag the notebook as scheduled before doing any work.
    notebook.set_status(JobLifeCycle.SCHEDULED)

    try:
        registry = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the notebook, please check your registry configuration.')
        return

    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job,
                                               registry_host=registry.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the notebook.', exc_info=True)
        notebook.set_status(JobLifeCycle.FAILED, message='Image info was not found.')
        return

    docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', docker_image)

    project = notebook.project
    spawner = NotebookSpawner(
        project_name=project.unique_name,
        project_uuid=project.uuid.hex,
        job_name=notebook.unique_name,
        job_uuid=notebook.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=docker_image,
        in_cluster=True,
    )

    try:
        mount_code = conf.get(NOTEBOOKS_MOUNT_CODE)
        spawned = spawner.start_notebook(
            persistence_outputs=notebook.persistence_outputs,
            persistence_data=notebook.persistence_data,
            outputs_refs_jobs=notebook.outputs_refs_jobs,
            outputs_refs_experiments=notebook.outputs_refs_experiments,
            resources=notebook.resources,
            labels=notebook.labels,
            annotations=notebook.annotations,
            secret_refs=notebook.secret_refs,
            config_map_refs=notebook.config_map_refs,
            node_selector=notebook.node_selector,
            affinity=notebook.affinity,
            tolerations=notebook.tolerations,
            backend=notebook.backend,
            max_restarts=get_max_restart(notebook.max_restarts,
                                         conf.get(MAX_RESTARTS_NOTEBOOKS)),
            reconcile_url=get_notebook_reconcile_url(notebook.unique_name),
            mount_code_in_notebooks=mount_code,
        )
        notebook.definition = get_job_definition(spawned)
        notebook.save(update_fields=['definition'])
        # Success: nothing below this try statement must run.
        return
    except ApiException:
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        # The traceback has to be captured while the exception is being handled.
        failure_traceback = traceback.format_exc()
        failure_message = 'Could not start the job, encountered a Kubernetes ApiException.'
    except StoreNotFoundError as e:
        _logger.error('Could not start the notebook, please check your volume definitions',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a volume definition problem. %s' % e)
    except Exception as e:
        _logger.error('Could not start notebook, please check your polyaxon spec.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        exc_name = e.__class__.__name__
        failure_message = 'Could not start notebook encountered an {} exception.'.format(
            exc_name)

    # Only reached after one of the handlers above ran to completion.
    notebook.set_status(JobLifeCycle.FAILED,
                        message=failure_message,
                        traceback=failure_traceback)
def start_dockerizer(build_job):
    """Mark a build job as scheduled and spawn its dockerizer.

    Steps, in order:

    1. Set the build job status to ``JobLifeCycle.SCHEDULED``.
    2. Pick the spawner class for ``build_job.backend``.
    3. Resolve the registry context; on ``ContainerRegistryError`` the build
       job is set to ``FAILED`` and the function returns.
    4. Start the dockerizer, record a ``BUILD_JOB_STARTED`` event and save the
       resulting job definition on the build job.

    Any exception raised in step 4 is logged and converted into a ``FAILED``
    status carrying a message and the formatted traceback.

    Returns:
        True when step 4 completes; None (implicitly) on every failure path.
    """
    # Flag the build job as scheduled before doing any work.
    build_job.set_status(JobLifeCycle.SCHEDULED)
    dockerizer_spawner_cls = get_spawner_class(build_job.backend)

    try:
        registry = get_registry_context(build_backend=build_job.backend)
    except ContainerRegistryError:
        build_job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the dockerizer job, '
                    'please check your registry configuration.')
        return

    project = build_job.project
    spawner = dockerizer_spawner_cls(
        project_name=project.unique_name,
        project_uuid=project.uuid.hex,
        job_name=build_job.unique_name,
        job_uuid=build_job.uuid.hex,
        commit=build_job.commit,
        from_image=build_job.build_image,
        dockerfile_path=build_job.build_dockerfile,
        context_path=build_job.build_context,
        image_tag=build_job.uuid.hex,
        image_name=get_image_name(build_job=build_job, registry_host=registry.host),
        build_steps=build_job.build_steps,
        env_vars=build_job.build_env_vars,
        lang_env=build_job.build_lang_env,
        nocache=build_job.build_nocache,
        insecure=registry.insecure,
        creds_secret_ref=registry.secret,
        creds_secret_items=registry.secret_items,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        in_cluster=True,
        use_sidecar=True,
        log_level=build_job.specification.log_level,
    )

    try:
        spawned = spawner.start_dockerizer(
            secret_refs=build_job.secret_refs,
            config_map_refs=build_job.config_map_refs,
            resources=build_job.resources,
            labels=build_job.labels,
            annotations=build_job.annotations,
            node_selector=build_job.node_selector,
            affinity=build_job.affinity,
            tolerations=build_job.tolerations,
            max_restarts=get_max_restart(build_job.max_restarts,
                                         conf.get(MAX_RESTARTS_BUILD_JOBS)),
            reconcile_url=get_build_reconcile_url(build_job.unique_name),
        )
        auditor.record(event_type=BUILD_JOB_STARTED, instance=build_job)
        build_job.definition = get_job_definition(spawned)
        build_job.save(update_fields=['definition'])
        # Success: nothing below this try statement must run.
        return True
    except ApiException:
        _logger.error('Could not start build job, please check your polyaxon spec',
                      exc_info=True)
        # The traceback has to be captured while the exception is being handled.
        failure_traceback = traceback.format_exc()
        failure_message = 'Could not start build job, encountered a Kubernetes ApiException.'
    except StoreNotFoundError as e:
        _logger.error('Could not start build job, please check your volume definitions.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        failure_message = (
            'Could not start build job, encountered a volume definition problem. %s' % e)
    except Exception as e:
        _logger.error('Could not start build job, please check your polyaxon spec.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        exc_name = e.__class__.__name__
        failure_message = 'Could not start build job encountered an {} exception.'.format(
            exc_name)

    # Only reached after one of the handlers above ran to completion.
    build_job.set_status(JobLifeCycle.FAILED,
                         message=failure_message,
                         traceback=failure_traceback)
def start_job(job):
    """Mark a job as scheduled and spawn it.

    Steps, in order:

    1. Set the job status to ``JobLifeCycle.SCHEDULED``.
    2. Resolve the registry context; on ``ContainerRegistryError`` the job is
       set to ``FAILED`` and the function returns.
    3. Resolve the image name/tag from ``job.build_job``; on
       ``ValueError``/``AttributeError`` the job is set to ``FAILED`` and the
       function returns.
    4. Start a ``JobSpawner`` and save the resulting job definition on the job.

    Any exception raised in step 4 is logged and converted into a ``FAILED``
    status carrying a message and the formatted traceback.

    Returns:
        None on every path.
    """
    # Flag the job as scheduled before doing any work.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        registry = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the job, please check your registry configuration.')
        return

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job,
                                               registry_host=registry.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED, message='Image info was not found.')
        return

    docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', docker_image)

    project = job.project
    spawner = JobSpawner(
        project_name=project.unique_name,
        project_uuid=project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=docker_image,
        in_cluster=True,
        use_sidecar=True,
        log_level=job.specification.log_level,
    )

    try:
        spawned = spawner.start_job(
            container_cmd_callback=job.specification.run.get_container_cmd,
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            secret_refs=job.secret_refs,
            config_map_refs=job.config_map_refs,
            resources=job.resources,
            labels=job.labels,
            annotations=job.annotations,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations,
            max_restarts=get_max_restart(job.max_restarts, conf.get(MAX_RESTARTS_JOBS)),
            reconcile_url=get_job_reconcile_url(job.unique_name),
        )
        job.definition = get_job_definition(spawned)
        job.save(update_fields=['definition'])
        # Success: nothing below this try statement must run.
        return
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        # The traceback has to be captured while the exception is being handled.
        failure_traceback = traceback.format_exc()
        failure_message = 'Could not start the job, encountered a Kubernetes ApiException.'
    except StoreNotFoundError as e:
        _logger.error('Could not start the job, please check your volume definitions.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a volume definition problem. %s' % e)
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        exc_name = e.__class__.__name__
        failure_message = 'Could not start job encountered an {} exception.'.format(exc_name)

    # Only reached after one of the handlers above ran to completion.
    job.set_status(JobLifeCycle.FAILED,
                   message=failure_message,
                   traceback=failure_traceback)
def start_tensorboard(tensorboard):
    """Mark a tensorboard as scheduled and spawn it.

    Steps, in order:

    1. Set the tensorboard status to ``JobLifeCycle.SCHEDULED``.
    2. Build a ``TensorboardSpawner`` using ``tensorboard.build_image``.
    3. Unpack ``tensorboard.outputs_path`` into outputs specs and paths
       (done outside the guarded section, so errors here propagate).
    4. Start the tensorboard and save the resulting job definition.

    Any exception raised in step 4 is logged and converted into a ``FAILED``
    status with a message; the formatted traceback is attached for every
    case except ``TensorboardValidation``, which passes ``traceback=None``.

    Returns:
        None on every path.
    """
    # Flag the tensorboard as scheduled before doing any work.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    project = tensorboard.project
    spawner = TensorboardSpawner(
        project_name=project.unique_name,
        project_uuid=project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=tensorboard.build_image,
        in_cluster=True,
    )

    outputs_specs, tensorboard_paths = tensorboard.outputs_path

    try:
        spawned = spawner.start_tensorboard(
            outputs_path=tensorboard_paths,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_specs=outputs_specs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            labels=tensorboard.labels,
            annotations=tensorboard.annotations,
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations,
            max_restarts=get_max_restart(tensorboard.max_restarts,
                                         conf.get(MAX_RESTARTS_TENSORBOARDS)),
            reconcile_url=get_tensorboard_reconcile_url(tensorboard.unique_name),
        )
        tensorboard.definition = get_job_definition(spawned)
        tensorboard.save(update_fields=['definition'])
        # Success: nothing below this try statement must run.
        return
    except ApiException:
        _logger.error('Could not start tensorboard, please check your polyaxon spec.',
                      exc_info=True)
        # The traceback has to be captured while the exception is being handled.
        failure_traceback = traceback.format_exc()
        failure_message = 'Could not start the job, encountered a Kubernetes ApiException.'
    except StoreNotFoundError as e:
        _logger.error('Could not start the tensorboard, please check your volume definitions.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a volume definition problem. %s' % e)
    except TensorboardValidation as e:
        _logger.error(
            'Could not start the tensorboard, '
            'some experiments require authenticating to stores with different access.',
            exc_info=True)
        # No traceback is reported for validation failures.
        failure_traceback = None
        failure_message = ('Could not start the tensorboard, '
                           'some experiments require authenticating '
                           'to stores with different access. %s' % e)
    except Exception as e:
        _logger.error('Could not start tensorboard, please check your polyaxon spec.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        exc_name = e.__class__.__name__
        failure_message = 'Could not start tensorboard encountered an {} exception.'.format(
            exc_name)

    # Only reached after one of the handlers above ran to completion.
    tensorboard.set_status(JobLifeCycle.FAILED,
                           message=failure_message,
                           traceback=failure_traceback)