def stop_job(project_name, project_uuid, job_name, job_uuid):
    """Stop a job through a ``JobSpawner`` built from the given identifiers.

    The spawner is configured with the ``K8S_CONFIG`` and ``K8S_NAMESPACE``
    values read from ``conf`` and is created with ``in_cluster=True``.

    Returns:
        Whatever ``JobSpawner.stop_job()`` returns.
    """
    return JobSpawner(
        project_name=project_name,
        project_uuid=project_uuid,
        job_name=job_name,
        job_uuid=job_uuid,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        in_cluster=True,
    ).stop_job()
def stop_job(project_name, project_uuid, job_name, job_uuid, specification):
    """Stop a job through a ``JobSpawner`` built from the given identifiers.

    ``specification`` is forwarded to the spawner as ``spec``. The Kubernetes
    config and namespace are taken from ``settings``, and the spawner is
    created with ``in_cluster=True``.

    Returns:
        Whatever ``JobSpawner.stop_job()`` returns.
    """
    return JobSpawner(
        project_name=project_name,
        project_uuid=project_uuid,
        job_name=job_name,
        job_uuid=job_uuid,
        spec=specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    ).stop_job()
def stop_job(job, update_status=False):
    """Stop ``job`` through a ``JobSpawner`` and optionally mark it as stopped.

    Args:
        job: object exposing ``project``, ``unique_name``, ``uuid``,
            ``specification`` and ``set_status`` as used below.
        update_status: when truthy, the job status is set to
            ``JobLifeCycle.STOPPED`` after the spawner call.
    """
    JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    ).stop_job()

    if not update_status:
        return

    # Record on the job itself that it has been stopped.
    job.set_status(status=JobLifeCycle.STOPPED, message='Job was stopped')
def start_job(job):
    """Start ``job`` through a ``JobSpawner`` and store the resulting definition.

    Steps:
      1. Mark the job as ``JobLifeCycle.SCHEDULED``.
      2. Resolve the docker image from ``job.build_job``; on ``ValueError``
         the job is marked ``FAILED`` and the function returns.
      3. Ask the spawner to start the job; on any exception the job is
         marked ``FAILED`` and the function returns.
      4. On success, save the definition derived from the spawner results
         on the job.

    Returns:
        None. The outcome is reported through ``job.set_status``.
    """
    # Update job status to show that it is started.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except ValueError as e:
        # The messages here used to mention a "notebook" and contained a
        # typo ("note found"); this function starts a job.
        _logger.warning('Could not start the job, %s', e)
        job.set_status(JobLifeCycle.FAILED,
                       message='External git repo was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        results = spawner.start_job(resources=job.resources,
                                    node_selectors=job.node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Catch-all so that any spawner failure leaves the job in FAILED
        # rather than stuck in SCHEDULED.
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job encountered an {} exception.'.format(
                e.__class__.__name__))
        return

    job.definition = get_job_definition(results)
    job.save()
def start_job(job):
    """Start ``job`` through a ``JobSpawner`` and store the resulting definition.

    The job is first marked ``SCHEDULED``. If the image info cannot be
    resolved, or if starting the job or saving its definition raises, the job
    is marked ``FAILED`` (with a message and, for spawner failures, the
    formatted traceback) and the function returns ``None``.
    """
    # Flag the job as scheduled before doing any work.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED, message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    # The traceback has to be captured inside each handler, while the
    # exception is still the one being handled.
    failure_trace = None
    failure_message = None
    try:
        results = spawner.start_job(
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            resources=job.resources,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations)
        job.definition = get_job_definition(results)
        job.save()
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a Kubernetes ApiException.')
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the job, please check your volume definitions.',
            exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a volume definition problem. %s' % e)
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = 'Could not start job encountered an {} exception.'.format(
            e.__class__.__name__)
    else:
        # Started and saved without error: nothing left to report.
        return

    job.set_status(JobLifeCycle.FAILED,
                   message=failure_message,
                   traceback=failure_trace)
def start_job(job):
    """Start ``job`` through a ``JobSpawner`` and store the resulting definition.

    The job is first marked ``SCHEDULED``. It is marked ``FAILED`` and the
    function returns ``None`` if the registry context cannot be obtained, if
    the image info cannot be resolved, or if starting the job or saving its
    definition raises. Spawner failures also record the formatted traceback.
    """
    # Flag the job as scheduled before doing any work.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        registry_spec = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the job, please check your registry configuration.')
        return

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job,
                                               registry_host=registry_spec.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED, message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        log_level=job.specification.log_level)

    # The traceback has to be captured inside each handler, while the
    # exception is still the one being handled.
    failure_trace = None
    failure_message = None
    try:
        # Arguments are gathered inside the ``try`` so that an error while
        # reading them off the job is reported as a failed start too.
        start_kwargs = dict(
            container_cmd_callback=job.specification.run.get_container_cmd,
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            secret_refs=job.secret_refs,
            config_map_refs=job.config_map_refs,
            resources=job.resources,
            labels=job.labels,
            annotations=job.annotations,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations,
            max_restarts=get_max_restart(job.max_restarts,
                                         conf.get(MAX_RESTARTS_JOBS)),
            reconcile_url=get_job_reconcile_url(job.unique_name))
        results = spawner.start_job(**start_kwargs)
        job.definition = get_job_definition(results)
        job.save(update_fields=['definition'])
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a Kubernetes ApiException.')
    except StoreNotFoundError as e:
        _logger.error(
            'Could not start the job, please check your volume definitions.',
            exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a volume definition problem. %s' % e)
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = 'Could not start job encountered an {} exception.'.format(
            e.__class__.__name__)
    else:
        # Started and saved without error: nothing left to report.
        return

    job.set_status(JobLifeCycle.FAILED,
                   message=failure_message,
                   traceback=failure_trace)