def start_tensorboard(tensorboard):
    """Ask the tensorboard spawner to start the given tensorboard job.

    A ``TensorboardSpawner`` is built from the job's project/job names and
    uuids plus the ``settings.K8S_CONFIG`` / ``settings.K8S_NAMESPACE`` values,
    and is then asked to start the tensorboard with the job's image, resources
    and node selectors.

    On success the value of ``get_job_definition(results)`` is stored on
    ``tensorboard.definition`` and the job is saved. If the spawner call
    raises, a warning is logged, the job status is set to
    ``JobLifeCycle.FAILED`` with an explanatory message, and nothing is saved.

    Args:
        tensorboard: The tensorboard job to start.

    Returns:
        None on every path.
    """
    spawner = TensorboardSpawner(
        project_name=tensorboard.project.unique_name,
        project_uuid=tensorboard.project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    )

    # Both failure branches log with the same template; only the status
    # message stored on the job differs.
    log_template = 'Could not start tensorboard, please check your polyaxon spec %s'

    try:
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            resources=tensorboard.resources,
            node_selectors=tensorboard.node_selectors,
        )
    except ApiException as api_error:
        logger.warning(log_template, api_error)
        failure_message = (
            'Could not start tensorboard, encountered a Kubernetes ApiException.'
        )
    except Exception as unexpected_error:
        logger.warning(log_template, unexpected_error)
        failure_message = (
            'Could not start tensorboard encountered an {} exception.'.format(
                unexpected_error.__class__.__name__)
        )
    else:
        # Only reached when the spawner call did not raise; errors raised here
        # are not handled by the branches above and propagate to the caller.
        tensorboard.definition = get_job_definition(results)
        tensorboard.save()
        return

    tensorboard.set_status(JobLifeCycle.FAILED, message=failure_message)
def start_tensorboard(tensorboard):
    """Schedule the tensorboard job and ask the spawner to start it.

    The job status is first set to ``JobLifeCycle.SCHEDULED``. A
    ``TensorboardSpawner`` is then built from the job's project/job names and
    uuids plus the ``settings.K8S_CONFIG`` / ``settings.K8S_NAMESPACE`` values
    and asked to start the tensorboard.

    On success the value of ``get_job_definition(results)`` is stored on
    ``tensorboard.definition`` and the job is saved. If anything inside the
    start/save sequence raises an ``Exception``, the error is logged with its
    traceback and the job status is set to ``JobLifeCycle.FAILED`` together
    with a message and the formatted traceback.

    Args:
        tensorboard: The tensorboard job to start.

    Returns:
        None on every path.
    """
    # Mark the job as scheduled before the spawner is asked to start it.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(
        project_name=tensorboard.project.unique_name,
        project_uuid=tensorboard.project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    )

    # Stays None on success; becomes (message, formatted traceback) on failure.
    failure = None
    try:
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            outputs_path=tensorboard.outputs_path,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations,
        )
        tensorboard.definition = get_job_definition(results)
        tensorboard.save()
    except ApiException:
        _logger.error('Could not start tensorboard, please check your polyaxon spec.',
                      exc_info=True)
        # The traceback has to be captured while the exception is being handled.
        formatted_trace = traceback.format_exc()
        failure = (
            'Could not start the job, encountered a Kubernetes ApiException.',
            formatted_trace,
        )
    except VolumeNotFoundError as volume_error:
        _logger.error('Could not start the tensorboard, please check your volume definitions.',
                      exc_info=True)
        formatted_trace = traceback.format_exc()
        failure = (
            'Could not start the job, encountered a volume definition problem. %s'
            % volume_error,
            formatted_trace,
        )
    except Exception as unexpected_error:
        _logger.error('Could not start tensorboard, please check your polyaxon spec.',
                      exc_info=True)
        formatted_trace = traceback.format_exc()
        failure = (
            'Could not start tensorboard encountered an {} exception.'.format(
                unexpected_error.__class__.__name__),
            formatted_trace,
        )

    if failure is None:
        return

    failure_message, failure_trace = failure
    tensorboard.set_status(JobLifeCycle.FAILED,
                           message=failure_message,
                           traceback=failure_trace)
def start_tensorboard(tensorboard):
    """Schedule the tensorboard job and ask the spawner to start it.

    The job status is first set to ``JobLifeCycle.SCHEDULED``. A
    ``TensorboardSpawner`` is built from the job's project/job names and uuids
    plus the ``settings.K8S_CONFIG`` / ``settings.K8S_NAMESPACE`` values. The
    node selectors passed to the spawner come from ``get_node_selector``,
    called with the job's own ``node_selectors`` and
    ``settings.NODE_SELECTORS_EXPERIMENTS`` as the default.

    On success the value of ``get_job_definition(results)`` is stored on
    ``tensorboard.definition`` and the job is saved. If resolving the node
    selectors or starting the tensorboard raises, a warning is logged, the job
    status is set to ``JobLifeCycle.FAILED`` with an explanatory message, and
    nothing is saved.

    Args:
        tensorboard: The tensorboard job to start.

    Returns:
        None on every path, success and failure alike. Failures are reported
        through the job status, not through the return value.
    """
    # Mark the job as scheduled before the spawner is asked to start it.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(
        project_name=tensorboard.project.unique_name,
        project_uuid=tensorboard.project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    )

    try:
        node_selectors = get_node_selector(
            node_selector=tensorboard.node_selectors,
            default_node_selector=settings.NODE_SELECTORS_EXPERIMENTS)
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            outputs_path=tensorboard.outputs_path,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            node_selectors=node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start tensorboard, please check your polyaxon spec %s', e)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message='Could not start tensorboard, encountered a Kubernetes ApiException.')
        return
    except VolumeNotFoundError as e:
        _logger.warning(
            'Could not start the tensorboard, '
            'please check your volume definitions %s', e)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the tensorboard, '
                    'encountered a volume definition problem. %s' % e)
        # Bare return (this branch previously returned False): every other
        # path, including success, returns None, so a distinct falsy value
        # here carried no usable signal for callers.
        return
    except Exception as e:
        _logger.warning(
            'Could not start tensorboard, please check your polyaxon spec %s', e)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message='Could not start tensorboard encountered an {} exception.'.format(
                e.__class__.__name__))
        return

    tensorboard.definition = get_job_definition(results)
    tensorboard.save()
def start_tensorboard(tensorboard):
    """Schedule the tensorboard job and ask the spawner to start it.

    The job status is first set to ``JobLifeCycle.SCHEDULED``. A
    ``TensorboardSpawner`` is built from the job's project/job names and
    uuids, the ``K8S_CONFIG`` / ``K8S_NAMESPACE`` values read through
    ``conf.get``, and the job's ``build_image``. ``tensorboard.outputs_path``
    is unpacked into the outputs specs and the tensorboard paths that are
    handed to the spawner.

    On success the value of ``get_job_definition(results)`` is stored on
    ``tensorboard.definition`` and only that field is saved. If anything
    inside the start/save sequence raises an ``Exception``, the error is
    logged with its traceback and the job status is set to
    ``JobLifeCycle.FAILED`` with a message; the formatted traceback is
    attached as well, except for ``TensorboardValidation`` where ``None`` is
    passed.

    Args:
        tensorboard: The tensorboard job to start.

    Returns:
        None on every path.
    """
    # Mark the job as scheduled before the spawner is asked to start it.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(
        project_name=tensorboard.project.unique_name,
        project_uuid=tensorboard.project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=tensorboard.build_image,
        in_cluster=True)

    # Filled in by the except branches; inspected in ``finally`` to decide
    # whether the job must be flagged as failed.
    error = {}
    # NOTE: this unpacking happens outside the try block, so an exception
    # raised here propagates to the caller without the job being marked FAILED.
    outputs_specs, tensorboard_paths = tensorboard.outputs_path
    try:
        results = spawner.start_tensorboard(
            outputs_path=tensorboard_paths,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_specs=outputs_specs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,  # TODO: resources
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations)
        tensorboard.definition = get_job_definition(results)
        tensorboard.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the tensorboard, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a volume definition problem. %s' % e,
        }
    except TensorboardValidation as e:
        _logger.error(
            'Could not start the tensorboard, '
            'some experiments require authenticating to stores with different access.',
            exc_info=True)
        error = {
            'raised': True,
            # No traceback is stored for validation failures.
            'traceback': None,
            # The final fragment is kept as one literal on a single line; it
            # was previously split by a line break inside the quotes.
            'message': 'Could not start the tensorboard, '
                       'some experiments require authenticating '
                       'to stores with different access. %s' % e,
        }
    except Exception as e:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start tensorboard encountered an {} exception.'.format(
                e.__class__.__name__)
        }
    finally:
        # Also runs after the successful ``return`` above; ``error`` is still
        # empty in that case, so the status is left untouched.
        if error.get('raised'):
            tensorboard.set_status(JobLifeCycle.FAILED,
                                   message=error.get('message'),
                                   traceback=error.get('traceback'))