def get_init_container(self):
    """Build the pod init containers that prepare the outputs path.

    Returns an empty list for RESUME clones (they keep writing to the
    original experiment's outputs), a COPY init container for COPY clones,
    and a CREATE init container for fresh experiments.
    """
    # RESUME clones reuse the original outputs path; nothing to initialize.
    if self.original_name is not None and self.cloning_strategy == CloningStrategy.RESUME:
        return []
    if self.original_name is not None and self.cloning_strategy == CloningStrategy.COPY:
        # COPY clones duplicate the original experiment's outputs.
        command = InitCommands.COPY
        original_outputs_path = get_experiment_outputs_path(
            experiment_name=self.original_name)
    else:
        command = InitCommands.CREATE
        original_outputs_path = None
    outputs_path = get_experiment_outputs_path(
        experiment_name=self.experiment_name)
    # NOTE(review): the outputs mount is built from DATA_VOLUME/DATA_ROOT;
    # presumably outputs live under the data volume in this version —
    # confirm this is not meant to be an outputs-specific volume/root.
    outputs_volume_mount = get_volume_mount(
        volume=constants.DATA_VOLUME,
        volume_mount=settings.DATA_ROOT)
    return [
        client.V1Container(
            name=self.init_container_name,
            image=self.init_docker_image,
            command=["/bin/sh", "-c"],
            args=to_list(
                get_output_args(
                    command=command,
                    outputs_path=outputs_path,
                    original_outputs_path=original_outputs_path)),
            volume_mounts=[outputs_volume_mount])
    ]
def get_init_container(self):
    """Pod init container for setting outputs path."""
    is_clone = self.original_name is not None
    if is_clone and self.cloning_strategy == CloningStrategy.RESUME:
        # Resumed runs keep writing to the original outputs; no init needed.
        return []
    if is_clone and self.cloning_strategy == CloningStrategy.COPY:
        command = InitCommands.COPY
        original_outputs_path = get_experiment_outputs_path(
            experiment_name=self.original_name)
    else:
        command = InitCommands.CREATE
        original_outputs_path = None

    outputs_path = get_experiment_outputs_path(experiment_name=self.experiment_name)
    mount = get_volume_mount(volume=constants.DATA_VOLUME,
                             volume_mount=settings.DATA_ROOT)
    init_args = get_output_args(command=command,
                                outputs_path=outputs_path,
                                original_outputs_path=original_outputs_path)
    container = client.V1Container(
        name=self.init_container_name,
        image=self.init_docker_image,
        command=["/bin/sh", "-c"],
        args=to_list(init_args),
        volume_mounts=[mount])
    return [container]
def get_init_container(self, persistence_outputs):
    """Pod init container for setting outputs path."""
    cloning = self.cloning_strategy
    if self.original_name is not None and cloning == CloningStrategy.RESUME:
        # Resumed clones write directly into the original outputs path.
        return []
    if self.original_name is not None and cloning == CloningStrategy.COPY:
        command = InitCommands.COPY
        original_outputs_path = get_experiment_outputs_path(
            persistence_outputs=persistence_outputs,
            experiment_name=self.original_name)
    else:
        command = InitCommands.CREATE
        original_outputs_path = None

    outputs_path = get_experiment_outputs_path(
        persistence_outputs=persistence_outputs,
        experiment_name=self.experiment_name)
    # Only the mounts are needed here; the volumes themselves are declared
    # on the pod spec.
    _, outputs_volume_mount = get_pod_outputs_volume(
        persistence_outputs=persistence_outputs)
    init_args = to_list(get_output_args(
        command=command,
        outputs_path=outputs_path,
        original_outputs_path=original_outputs_path))
    return [
        client.V1Container(name=self.init_container_name,
                           image=self.init_docker_image,
                           command=["/bin/sh", "-c"],
                           args=init_args,
                           volume_mounts=outputs_volume_mount)
    ]
def get(self, request, *args, **kwargs):
    """Stream a single file from the experiment's outputs as a download.

    Expects a `path` query parameter naming the file inside the experiment
    outputs. Raises ValidationError when `path` is missing; returns 404 when
    the archived file cannot be found on disk.
    """
    filepath = request.query_params.get('path')
    if not filepath:
        raise ValidationError('Files view expect a path to the file.')
    experiment_outputs_path = get_experiment_outputs_path(
        persistence_outputs=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name,
        original_name=self.experiment.original_unique_name,
        cloning_strategy=self.experiment.cloning_strategy)
    # Materialize the requested file in the archive root for streaming.
    download_filepath = archive_outputs_file(
        persistence_outputs=self.experiment.persistence_outputs,
        outputs_path=experiment_outputs_path,
        namepath=self.experiment.unique_name,
        filepath=filepath)
    filename = os.path.basename(download_filepath)
    chunk_size = 8192
    try:
        wrapped_file = FileWrapper(open(download_filepath, 'rb'), chunk_size)
        response = StreamingHttpResponse(
            wrapped_file,
            content_type=mimetypes.guess_type(download_filepath)[0])
        response['Content-Length'] = os.path.getsize(download_filepath)
        response['Content-Disposition'] = "attachment; filename={}".format(
            filename)
        return response
    except FileNotFoundError:
        # Bug fix: this view serves outputs files, not log files — the
        # original messages were copy-pasted from the logs view.
        _logger.warning('Outputs file not found: filepath=%s', download_filepath)
        return Response(status=status.HTTP_404_NOT_FOUND,
                        data='Outputs file not found: filepath={}'.format(
                            download_filepath))
def get_config_map(namespace,
                   project_name,
                   experiment_group_name,
                   experiment_name,
                   project_uuid,
                   experiment_group_uuid,
                   experiment_uuid,
                   original_name,
                   cloning_strategy,
                   cluster_def,
                   declarations,
                   log_level):
    """Build the k8s ConfigMap that exposes experiment settings to job pods.

    The map carries the (JSON-serialized) cluster definition, declarations,
    experiment labels, log level, API url, and the outputs/logs/data paths.
    """
    name = constants.CONFIG_MAP_NAME.format(experiment_uuid=experiment_uuid)
    labels = get_map_labels(project_name, experiment_group_name, experiment_name,
                            project_uuid, experiment_group_uuid, experiment_uuid)
    metadata = client.V1ObjectMeta(name=name, labels=labels, namespace=namespace)
    # The outputs path accounts for cloned experiments (original/strategy).
    experiment_outputs_path = get_experiment_outputs_path(
        experiment_name=experiment_name,
        original_name=original_name,
        cloning_strategy=cloning_strategy)
    experiment_logs_path = get_experiment_logs_path(experiment_name)
    experiment_data_path = get_project_data_path(project_name)
    data = {
        constants.CONFIG_MAP_CLUSTER_KEY_NAME: json.dumps(cluster_def),
        # NOTE(review): json.dumps never returns a falsy string (None -> 'null'),
        # so the `or '{}'` fallback is dead — possibly meant to be
        # `json.dumps(declarations) if declarations else '{}'`. Confirm intent.
        constants.CONFIG_MAP_DECLARATIONS_KEY_NAME: json.dumps(declarations) or '{}',
        constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME: json.dumps(labels),
        constants.CONFIG_MAP_LOG_LEVEL_KEY_NAME: log_level,
        API_KEY_NAME: get_settings_api_url(),
        constants.CONFIG_MAP_EXPERIMENT_OUTPUTS_PATH_KEY_NAME: experiment_outputs_path,
        constants.CONFIG_MAP_EXPERIMENT_LOGS_PATH_KEY_NAME: experiment_logs_path,
        constants.CONFIG_MAP_EXPERIMENT_DATA_PATH_KEY_NAME: experiment_data_path,
    }
    return client.V1ConfigMap(api_version=k8s_constants.K8S_API_VERSION_V1,
                              kind=k8s_constants.K8S_CONFIG_MAP_KIND,
                              metadata=metadata,
                              data=data)
def get_experiments_outputs_spec(self):
    """Build outputs specs for every experiment related to this object.

    Returns None when there are no experiments; otherwise a dict mapping
    experiment id -> OutputsRefsSpec(path, persistence).
    """
    from libs.paths.experiments import get_experiment_outputs_path
    if not self.experiments.count():
        return None
    # Surface 'persistence.outputs' from the JSON field so it can be
    # selected via values_list below.
    annotation = {
        'persistence_outputs': KeyTransform('outputs', 'persistence')
    }
    query = self.experiments.annotate(**annotation)
    # Tuple layout: (experiment_id, group_id, username, project_name,
    # persistence_outputs) — indexed positionally below.
    experiment_data = query.values_list('id', 'experiment_group__id',
                                        'project__user__username',
                                        'project__name',
                                        'persistence_outputs')
    outputs_spec_data = {}
    for data in experiment_data:
        project_name = PROJECT_UNIQUE_NAME_FORMAT.format(user=data[2],
                                                         project=data[3])
        if data[1]:
            # Grouped experiment: unique name is nested under the group.
            group_name = GROUP_UNIQUE_NAME_FORMAT.format(
                project_name=project_name,
                id=data[1])
            experiment_name = EXPERIMENT_UNIQUE_NAME_FORMAT.format(
                parent_name=group_name,
                id=data[0])
        else:
            # Independent experiment: parented directly on the project.
            experiment_name = EXPERIMENT_UNIQUE_NAME_FORMAT.format(
                parent_name=project_name,
                id=data[0])
        outputs_path = get_experiment_outputs_path(
            persistence_outputs=data[4],
            experiment_name=experiment_name)
        outputs_spec_data[data[0]] = OutputsRefsSpec(path=outputs_path,
                                                     persistence=data[4])
    return outputs_spec_data
def get_pod_container(self,
                      volume_mounts,
                      env_vars=None,
                      command=None,
                      args=None,
                      persistence_outputs=None,
                      persistence_data=None,
                      outputs_refs_jobs=None,
                      outputs_refs_experiments=None,
                      secret_refs=None,
                      configmap_refs=None,
                      resources=None,
                      ephemeral_token=None):
    """Build the main job container for a task pod.

    Combines caller-provided env vars with job env vars (paths, refs,
    ephemeral token), cluster/declarations/labels vars, and resource vars;
    wires in secret/configmap `envFrom` references and the task's ports.
    """
    assert self.cluster_def is not None
    # Env vars preparations
    env_vars = to_list(env_vars, check_none=True)
    outputs_path = get_experiment_outputs_path(
        persistence_outputs=persistence_outputs,
        experiment_name=self.experiment_name,
        original_name=self.original_name,
        cloning_strategy=self.cloning_strategy)
    env_vars += get_job_env_vars(
        persistence_outputs=persistence_outputs,
        outputs_path=outputs_path,
        persistence_data=persistence_data,
        log_level=self.log_level,
        logs_path=get_experiment_logs_path(self.experiment_name, temp=False),
        outputs_refs_jobs=outputs_refs_jobs,
        outputs_refs_experiments=outputs_refs_experiments,
        ephemeral_token=ephemeral_token,
    )
    env_vars += [
        get_env_var(name=constants.CONFIG_MAP_CLUSTER_KEY_NAME,
                    value=json.dumps(self.cluster_def)),
        # NOTE(review): declarations is passed as-is (not json.dumps'ed) —
        # presumably already a string/serializable value; confirm.
        get_env_var(name=constants.CONFIG_MAP_DECLARATIONS_KEY_NAME,
                    value=self.declarations),
        get_env_var(name=constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME,
                    value=json.dumps(self.experiment_labels)),
    ]
    env_vars += get_resources_env_vars(resources=resources)
    # Env from configmap and secret refs
    env_from = get_pod_env_from(secret_refs=secret_refs,
                                configmap_refs=configmap_refs)
    ports = [
        client.V1ContainerPort(container_port=port) for port in self.ports
    ]
    return client.V1Container(name=self.job_container_name,
                              image=self.job_docker_image,
                              command=command,
                              args=args,
                              ports=ports,
                              env=env_vars,
                              env_from=env_from,
                              resources=get_resources(resources),
                              volume_mounts=volume_mounts)
def test_experiment_outputs_path_creation_deletion(self):
    """Outputs path appears on creation and disappears on deletion."""
    outputs_path = get_experiment_outputs_path(self.experiment.unique_name)
    # Must not pre-exist.
    assert os.path.exists(outputs_path) is False
    create_experiment_outputs_path(self.experiment.unique_name)
    assert os.path.exists(outputs_path) is True
    delete_experiment_outputs(self.experiment.unique_name)
    # Deleted again after cleanup.
    assert os.path.exists(outputs_path) is False
def test_copying_an_experiment(self):
    """Cloning with the COPY strategy must copy the original outputs files."""
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        experiment1 = ExperimentFactory()

    # We create some outputs files for the experiment
    path = create_experiment_outputs_path(
        persistence_outputs=experiment1.persistence_outputs,
        experiment_name=experiment1.unique_name)
    # Bug fix: use a context manager so the handle is closed deterministically;
    # the original leaked the file object returned by open().
    with open(os.path.join(path, 'file'), 'w+'):
        pass

    # Create a new experiment that is a clone of the previous
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        experiment2 = ExperimentFactory(original_experiment=experiment1)

    # Check that outputs path for experiment2 does not exist yet
    experiment2_outputs_path = get_experiment_outputs_path(
        persistence_outputs=experiment2.persistence_outputs,
        experiment_name=experiment2.unique_name)
    assert os.path.exists(experiment2_outputs_path) is False

    # Handle restart should create the outputs and copy the content of experiment 1
    copy_experiment(experiment2)
    assert os.path.exists(experiment2_outputs_path) is True
    assert os.path.exists(os.path.join(experiment2_outputs_path, 'file')) is True
def test_experiment_outputs_path_creation_deletion(self):
    """Round-trip: create the outputs path, then delete it."""
    name = self.experiment.unique_name
    path = get_experiment_outputs_path(name)
    assert os.path.exists(path) is False
    create_experiment_outputs_path(name)
    assert os.path.exists(path) is True
    delete_experiment_outputs(name)
    assert os.path.exists(path) is False
def archive_experiment_outputs(persistence_outputs, experiment_name):
    """Create a tar.gz archive of an experiment's outputs.

    Returns the archive root directory and the archive file name.
    """
    check_archive_path(settings.OUTPUTS_ARCHIVE_ROOT)
    outputs_path = get_experiment_outputs_path(
        persistence_outputs=persistence_outputs,
        experiment_name=experiment_name)
    files = get_files_in_path(outputs_path)
    # Dots are not safe in the archive file name; replace them.
    tar_name = "{}.tar.gz".format(experiment_name.replace('.', '_'))
    tar_path = os.path.join(settings.OUTPUTS_ARCHIVE_ROOT, tar_name)
    create_tarfile(files=files, tar_path=tar_path)
    return settings.OUTPUTS_ARCHIVE_ROOT, tar_name
def get_env_vars(self, task_type, task_idx):
    """Build the TF_CONFIG env var for a distributed TensorFlow task."""
    model_dir = get_experiment_outputs_path(
        persistence_outputs=self.persistence_config.outputs,
        experiment_name=self.experiment_name,
        cloning_strategy=self.cloning_strategy)
    tf_config = {
        'cluster': self.get_cluster(),
        'task': {'type': task_type, 'index': task_idx},
        'model_dir': model_dir,
        'environment': 'cloud',
    }
    return get_env_var(name='TF_CONFIG', value=tf_config)
def get_named_experiment_outputs_path(experiment):
    """Outputs spec list and tensorboard path for one experiment."""
    outputs_path = get_experiment_outputs_path(
        persistence_outputs=experiment.persistence_outputs,
        experiment_name=experiment.unique_name,
        original_name=experiment.original_unique_name,
        cloning_strategy=experiment.cloning_strategy)
    spec = OutputsRefsSpec(path=outputs_path,
                           persistence=experiment.persistence_outputs)
    # Tensorboard expects "<name>:<path>" entries.
    tensorboard_path = '{}:{}'.format(experiment.unique_name, outputs_path)
    return [spec], tensorboard_path
def test_project_outputs_path_creation_deletion(self):
    """Deleting project outputs removes the experiment outputs beneath it."""
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        experiment = ExperimentFactory(user=self.project.user, project=self.project)
    create_experiment_outputs_path(experiment.unique_name)
    create_experiment_logs_path(experiment.unique_name)
    experiment_path = get_experiment_outputs_path(experiment.unique_name)
    project_path = get_project_outputs_path(self.project.unique_name)
    assert os.path.exists(experiment_path) is True
    assert os.path.exists(project_path) is True
    delete_project_outputs(self.project.unique_name)
    assert os.path.exists(experiment_path) is False
    assert os.path.exists(project_path) is False
def test_experiment_group_outputs_path_creation_deletion(self):
    """Deleting group outputs removes the experiment outputs beneath it."""
    experiment = ExperimentFactory(user=self.project.user,
                                   project=self.project,
                                   experiment_group=self.experiment_group)
    create_experiment_outputs_path(experiment.unique_name)
    experiment_path = get_experiment_outputs_path(experiment.unique_name)
    group_path = get_experiment_group_outputs_path(
        self.experiment_group.unique_name)
    assert os.path.exists(experiment_path) is True
    assert os.path.exists(group_path) is True
    delete_experiment_group_outputs(self.experiment_group.unique_name)
    assert os.path.exists(experiment_path) is False
    assert os.path.exists(group_path) is False
def get(self, request, *args, **kwargs):
    """Archive the experiment outputs and redirect to the download path."""
    # Audit the download before serving it.
    auditor.record(event_type=EXPERIMENT_OUTPUTS_DOWNLOADED,
                   instance=self.experiment,
                   actor_id=self.request.user.id,
                   actor_name=self.request.user.username)
    outputs_path = get_experiment_outputs_path(
        persistence_outputs=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name,
        original_name=self.experiment.original_unique_name,
        cloning_strategy=self.experiment.cloning_strategy)
    archived_path, archive_name = archive_outputs(
        outputs_path=outputs_path,
        name=self.experiment.unique_name)
    return self.redirect(path='{}/{}'.format(archived_path, archive_name))
def outputs_path(self):
    """Resolve the outputs path for whichever entity this ref points at.

    Checks, in precedence order: experiment, experiment group, project.
    Imports are kept local to each branch to avoid import cycles.
    """
    if self.experiment:
        from libs.paths.experiments import get_experiment_outputs_path
        return get_experiment_outputs_path(
            experiment_name=self.experiment.unique_name,
            original_name=self.experiment.original_unique_name,
            cloning_strategy=self.experiment.cloning_strategy)
    if self.experiment_group:
        from libs.paths.experiment_groups import get_experiment_group_outputs_path
        return get_experiment_group_outputs_path(
            experiment_group_name=self.experiment_group.unique_name)
    # Fallback: project-level outputs.
    from libs.paths.projects import get_project_outputs_path
    return get_project_outputs_path(project_name=self.project.unique_name)
def test_experiment_group_outputs_path_creation_deletion(self):
    """Group outputs deletion also wipes the contained experiment outputs."""
    experiment = ExperimentFactory(user=self.project.user,
                                   project=self.project,
                                   experiment_group=self.experiment_group)
    create_experiment_outputs_path(experiment.unique_name)
    exp_outputs = get_experiment_outputs_path(experiment.unique_name)
    grp_outputs = get_experiment_group_outputs_path(
        self.experiment_group.unique_name)
    # Both paths exist after creation...
    assert os.path.exists(exp_outputs) is True
    assert os.path.exists(grp_outputs) is True
    delete_experiment_group_outputs(self.experiment_group.unique_name)
    # ...and both are gone after group deletion.
    assert os.path.exists(exp_outputs) is False
    assert os.path.exists(grp_outputs) is False
def get_pod_container(self,
                      volume_mounts,
                      env_vars=None,
                      command=None,
                      args=None,
                      persistence_outputs=None,
                      persistence_data=None,
                      outputs_refs_jobs=None,
                      outputs_refs_experiments=None,
                      resources=None):
    """Build the main job container for a task pod.

    Merges caller env vars with job env vars (paths, data paths, outputs
    refs), cluster/declarations/labels vars, and resource vars; exposes
    the task's ports on the container.
    """
    assert self.cluster_def is not None
    env_vars = get_list(env_vars)
    outputs_path = get_experiment_outputs_path(
        persistence_outputs=persistence_outputs,
        experiment_name=self.experiment_name,
        original_name=self.original_name,
        cloning_strategy=self.cloning_strategy)
    env_vars += get_job_env_vars(
        log_level=self.log_level,
        outputs_path=outputs_path,
        data_paths=get_data_paths(persistence_data),
        logs_path=get_experiment_logs_path(self.experiment_name),
        outputs_refs_jobs=outputs_refs_jobs,
        outputs_refs_experiments=outputs_refs_experiments)
    env_vars += [
        get_env_var(name=constants.CONFIG_MAP_CLUSTER_KEY_NAME,
                    value=json.dumps(self.cluster_def)),
        # NOTE(review): declarations is passed as-is (not json.dumps'ed) —
        # presumably already a string/serializable value; confirm.
        get_env_var(name=constants.CONFIG_MAP_DECLARATIONS_KEY_NAME,
                    value=self.declarations),
        get_env_var(name=constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME,
                    value=json.dumps(self.experiment_labels)),
    ]
    env_vars += get_resources_env_vars(resources=resources)
    ports = [
        client.V1ContainerPort(container_port=port) for port in self.ports
    ]
    return client.V1Container(name=self.job_container_name,
                              image=self.job_docker_image,
                              command=command,
                              args=args,
                              ports=ports,
                              env=env_vars,
                              resources=get_resources(resources),
                              volume_mounts=volume_mounts)
def get_pod_container(self,
                      volume_mounts,
                      env_vars=None,
                      command=None,
                      args=None,
                      resources=None):
    """Build the main job container for a task pod.

    Merges caller env vars with job env vars (outputs/logs/data paths),
    cluster/declarations/labels vars, and — when resources are requested —
    resource env vars; exposes the task's ports on the container.
    """
    assert self.cluster_def is not None
    env_vars = get_list(env_vars)
    outputs_path = get_experiment_outputs_path(
        experiment_name=self.experiment_name,
        original_name=self.original_name,
        cloning_strategy=self.cloning_strategy)
    env_vars += get_job_env_vars(
        log_level=self.log_level,
        outputs_path=outputs_path,
        logs_path=get_experiment_logs_path(self.experiment_name),
        data_path=get_experiment_data_path(self.experiment_name),
        project_data_path=get_project_data_path(project_name=self.project_name)
    )
    env_vars += [
        get_env_var(name=constants.CONFIG_MAP_CLUSTER_KEY_NAME,
                    value=json.dumps(self.cluster_def)),
        # NOTE(review): declarations is passed as-is (not json.dumps'ed) —
        # presumably already a string/serializable value; confirm.
        get_env_var(name=constants.CONFIG_MAP_DECLARATIONS_KEY_NAME,
                    value=self.declarations),
        get_env_var(name=constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME,
                    value=json.dumps(self.experiment_labels)),
    ]
    # Resource env vars are only added when resources were requested.
    if resources:
        env_vars += get_resources_env_vars(resources=resources)
    ports = [client.V1ContainerPort(container_port=port) for port in self.ports]
    return client.V1Container(name=self.job_container_name,
                              image=self.job_docker_image,
                              command=command,
                              args=args,
                              ports=ports,
                              env=env_vars,
                              resources=get_resources(resources),
                              volume_mounts=volume_mounts)
def get_config_map(namespace,
                   project_name,
                   experiment_group_name,
                   experiment_name,
                   project_uuid,
                   experiment_group_uuid,
                   experiment_uuid,
                   original_name,
                   cloning_strategy,
                   cluster_def,
                   declarations,
                   log_level):
    """Build the k8s ConfigMap holding the experiment's runtime settings."""
    labels = get_map_labels(project_name, experiment_group_name, experiment_name,
                            project_uuid, experiment_group_uuid, experiment_uuid)
    metadata = client.V1ObjectMeta(
        name=constants.CONFIG_MAP_NAME.format(uuid=experiment_uuid),
        labels=labels,
        namespace=namespace)
    # Resolve the experiment's paths (outputs accounts for cloning).
    outputs_path = get_experiment_outputs_path(experiment_name=experiment_name,
                                               original_name=original_name,
                                               cloning_strategy=cloning_strategy)
    logs_path = get_experiment_logs_path(experiment_name)
    data_path = get_project_data_path(project_name)
    data = {
        constants.CONFIG_MAP_CLUSTER_KEY_NAME: json.dumps(cluster_def),
        constants.CONFIG_MAP_DECLARATIONS_KEY_NAME: json.dumps(declarations) or '{}',
        constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME: json.dumps(labels),
        constants.CONFIG_MAP_LOG_LEVEL_KEY_NAME: log_level,
        API_KEY_NAME: get_settings_api_url(),
        constants.CONFIG_MAP_RUN_OUTPUTS_PATH_KEY_NAME: outputs_path,
        constants.CONFIG_MAP_RUN_LOGS_PATH_KEY_NAME: logs_path,
        constants.CONFIG_MAP_RUN_DATA_PATH_KEY_NAME: data_path,
    }
    return client.V1ConfigMap(api_version=k8s_constants.K8S_API_VERSION_V1,
                              kind=k8s_constants.K8S_CONFIG_MAP_KIND,
                              metadata=metadata,
                              data=data)
def get(self, request, *args, **kwargs):
    """List the experiment's outputs tree, optionally under a `path` sub-dir.

    Raises ValidationError (400) when the store volume cannot be loaded or
    when listing fails; returns the store listing with status 200 otherwise.
    """
    store_manager = get_outputs_store(
        persistence_outputs=self.experiment.persistence_outputs)
    experiment_outputs_path = get_experiment_outputs_path(
        persistence_outputs=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name,
        original_name=self.experiment.original_unique_name,
        cloning_strategy=self.experiment.cloning_strategy)
    # Narrow the listing to the requested sub-path when provided.
    if request.query_params.get('path'):
        experiment_outputs_path = os.path.join(
            experiment_outputs_path,
            request.query_params.get('path'))
    try:
        data = store_manager.ls(experiment_outputs_path)
    except VolumeNotFoundError:
        raise ValidationError(
            'Store manager could not load the volume requested,'
            ' to get the outputs data.')
    except Exception:
        # Deliberately broad: any store failure becomes a 400 rather than a
        # 500; the original error details are swallowed here.
        raise ValidationError(
            'Experiment outputs path does not exists or bad configuration.'
        )
    return Response(data=data, status=200)