def test_import_library_require_permissions():
    """Verify library creation (import) is off by default."""
    app = _mock_app()
    sa_session = app.model.context

    u = model.User(email="*****@*****.**", password="******")

    library = model.Library(name="my library 1", description="my library description", synopsis="my synopsis")
    root_folder = model.LibraryFolder(name="my library 1", description='folder description')
    library.root_folder = root_folder
    sa_session.add_all((library, root_folder))
    sa_session.flush()

    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, app=app) as export_store:
        export_store.export_library(library)

    error_caught = False
    try:
        import_model_store = store.get_import_model_store_for_directory(temp_directory, app=app, user=u)
        import_model_store.perform_import()
    except AssertionError:
        # TODO: throw and catch a better exception...
        error_caught = True

    assert error_caught
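# For contrast, a minimal sketch of the positive path (an assumption built only from the
# calls used in the tests in this module, not a test from the original file): passing
# ImportOptions(allow_library_creation=True) lets the same export import cleanly.
def _sketch_import_library_with_permission(temp_directory, app, u):
    import_options = store.ImportOptions(allow_library_creation=True)
    import_model_store = store.get_import_model_store_for_directory(
        temp_directory, app=app, user=u, import_options=import_options)
    import_model_store.perform_import()  # no AssertionError once library creation is allowed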
def test_edit_metadata_files():
    app = _mock_app(store_by="uuid")
    sa_session = app.model.context

    u = model.User(email="*****@*****.**", password="******")
    h = model.History(name="Test History", user=u)
    d1 = _create_datasets(sa_session, h, 1, extension="bam")[0]
    sa_session.add_all((h, d1))
    sa_session.flush()

    index = NamedTemporaryFile("w")
    index.write("cool bam index")
    metadata_dict = {"bam_index": MetadataTempFile.from_JSON({"kwds": {}, "filename": index.name})}
    d1.metadata.from_JSON_dict(json_dict=metadata_dict)
    assert d1.metadata.bam_index
    assert isinstance(d1.metadata.bam_index, model.MetadataFile)

    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, app=app, for_edit=True, strip_metadata_files=False) as export_store:
        export_store.add_dataset(d1)

    import_history = model.History(name="Test History for Import", user=u)
    sa_session.add(import_history)
    sa_session.flush()
    _perform_import_from_directory(temp_directory, app, u, import_history, store.ImportOptions(allow_edit=True))
def test_model_create_context_persist_error_hda():
    work_directory = mkdtemp()
    with open(os.path.join(work_directory, "file1.txt"), "w") as f:
        f.write("hello world\nhello world line 2")
    target = {
        "destination": {"type": "hdas"},
        "elements": [{
            "error_message": "Failed to download some URL I guess",
        }],
    }
    app = _mock_app(store_by="uuid")
    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, serialize_dataset_objects=True) as export_store:
        persist_target_to_export_store(target, export_store, app.object_store, work_directory)

    import_history = _import_directory_to_history(app, temp_directory, work_directory)
    assert len(import_history.datasets) == 1
    imported_hda = import_history.datasets[0]
    assert imported_hda.state == "error"
    assert imported_hda.info == "Failed to download some URL I guess"
def setup_job(self, history, store_directory, include_hidden=False, include_deleted=False, compressed=True):
    """Perform setup for a job that exports a history into an archive.

    Generates attribute files for the export and returns a command line for
    running the job. The command line includes the command, inputs, and
    options; it does not include the output file because that must be set at
    runtime.
    """
    app = self.app

    # Symlink files on export; on the worker the files will be tarred up in a dereferenced manner.
    with store.DirectoryModelExportStore(store_directory, app=app, export_files="symlink") as export_store:
        export_store.export_history(history, include_hidden=include_hidden, include_deleted=include_deleted)

    #
    # Create and return command line for running tool.
    #
    options = f"--galaxy-version '{VERSION_MAJOR}'"
    if compressed:
        options += " -G"
    return f"{options} {store_directory}"
def setup_job(self, jeha, include_hidden=False, include_deleted=False):
    """Perform setup for a job that exports a history into an archive.

    Generates attribute files for the export, sets the corresponding
    attributes on the jeha object, and returns a command line for running the
    job. The command line includes the command, inputs, and options; it does
    not include the output file because that must be set at runtime.
    """
    app = self.app

    #
    # Create attributes/metadata files for export.
    #
    jeha.dataset.create_extra_files_path()
    temp_output_dir = jeha.dataset.extra_files_path

    history = jeha.history
    history_attrs_filename = os.path.join(temp_output_dir, ATTRS_FILENAME_HISTORY)
    jeha.history_attrs_filename = history_attrs_filename

    # Symlink files on export; on the worker the files will be tarred up in a dereferenced manner.
    with store.DirectoryModelExportStore(temp_output_dir, app=app, export_files="symlink") as export_store:
        export_store.export_history(history, include_hidden=include_hidden, include_deleted=include_deleted)

    #
    # Create and return command line for running tool.
    #
    options = "--galaxy-version '%s'" % VERSION_MAJOR
    if jeha.compressed:
        options += " -G"
    return "{} {}".format(options, temp_output_dir)
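# Usage sketch for the directory-based setup_job variant above (hedged: `exporter`,
# `history`, and `output_path` are hypothetical names; the docstring only states that
# the output file is supplied at runtime, so the final assembly below is an assumption).
def _sketch_build_export_command(exporter, history, output_path):
    store_directory = mkdtemp()
    options_and_input = exporter.setup_job(history, store_directory, compressed=True)
    return f"{options_and_input} {output_path}"  # hypothetical final command assembly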
def test_import_export_composite_datasets():
    app = _mock_app()
    sa_session = app.model.context

    u = model.User(email="*****@*****.**", password="******")
    h = model.History(name="Test History", user=u)

    d1 = _create_datasets(sa_session, h, 1, extension="html")[0]
    d1.dataset.create_extra_files_path()
    sa_session.add_all((h, d1))
    sa_session.flush()

    primary = NamedTemporaryFile("w")
    primary.write("cool primary file")
    primary.flush()
    app.object_store.update_from_file(d1.dataset, file_name=primary.name, create=True, preserve_symlinks=True)

    composite1 = NamedTemporaryFile("w")
    composite1.write("cool composite file")
    composite1.flush()
    app.object_store.update_from_file(
        d1.dataset,
        extra_dir=os.path.normpath(os.path.join(d1.extra_files_path, "parent_dir")),
        alt_name="child_file",
        file_name=composite1.name,
        create=True,
        preserve_symlinks=True,
    )

    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, app=app, export_files="copy") as export_store:
        export_store.add_dataset(d1)

    import_history = model.History(name="Test History for Import", user=u)
    sa_session.add(import_history)
    sa_session.flush()
    _perform_import_from_directory(temp_directory, app, u, import_history)
    assert len(import_history.datasets) == 1
    import_dataset = import_history.datasets[0]
    root_extra_files_path = import_dataset.extra_files_path
    assert len(os.listdir(root_extra_files_path)) == 1
    assert os.listdir(root_extra_files_path)[0] == "parent_dir"
    composite_sub_dir = os.path.join(root_extra_files_path, "parent_dir")
    child_files = os.listdir(composite_sub_dir)
    assert len(child_files) == 1
    with open(os.path.join(composite_sub_dir, child_files[0]), "r") as f:
        contents = f.read()
    assert contents == "cool composite file"
def test_persist_target_hdca():
    work_directory = mkdtemp()
    with open(os.path.join(work_directory, "file1.txt"), "w") as f:
        f.write("hello world\nhello world line 2")
    with open(os.path.join(work_directory, "file2.txt"), "w") as f:
        f.write("file 2 contents")
    target = {
        "destination": {"type": "hdca"},
        "name": "My HDCA",
        "collection_type": "list",
        "elements": [{
            "filename": "file1.txt",
            "ext": "txt",
            "dbkey": "hg19",
            "info": "dataset info",
            "name": "my file",
        }, {
            "filename": "file2.txt",
            "ext": "txt",
            "dbkey": "hg18",
            "info": "dataset info 2",
            "name": "my file 2",
        }],
    }
    app = _mock_app(store_by="uuid")
    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, serialize_dataset_objects=True) as export_store:
        persist_target_to_export_store(target, export_store, app.object_store, work_directory)

    import_history = _import_directory_to_history(app, temp_directory, work_directory)
    assert len(import_history.dataset_collections) == 1
    assert len(import_history.datasets) == 2
    import_hdca = import_history.dataset_collections[0]
    datasets = import_hdca.dataset_instances
    assert len(datasets) == 2
    dataset0 = datasets[0]
    dataset1 = datasets[1]
    with open(dataset0.file_name, "r") as f:
        assert f.read().startswith("hello world\n")
    with open(dataset1.file_name, "r") as f:
        assert f.read().startswith("file 2 contents")
def _setup_simple_export(export_kwds):
    app = _mock_app()

    u, h, d1, d2, j = _setup_simple_cat_job(app)

    sa_session = app.model.context

    import_history = model.History(name="Test History for Import", user=u)
    sa_session.add(import_history)

    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, app=app, **export_kwds) as export_store:
        export_store.add_dataset(d1)
        export_store.add_dataset(d2)

    return app, h, temp_directory, import_history
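# Sketch of how a test could consume _setup_simple_export (assumes the
# _perform_import_from_directory helper used by the other tests in this module;
# the expected dataset count is an assumption based on the two datasets exported above).
def _sketch_roundtrip_simple_export():
    app, h, temp_directory, import_history = _setup_simple_export({"export_files": "copy"})
    _perform_import_from_directory(temp_directory, app, import_history.user, import_history)
    assert len(import_history.datasets) == 2  # d1 and d2 from the exported history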
def _import_library_target(target, work_directory):
    app = _mock_app(store_by="uuid")
    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, app=app, serialize_dataset_objects=True) as export_store:
        persist_target_to_export_store(target, export_store, app.object_store, work_directory)

    u = model.User(email="*****@*****.**", password="******")

    import_options = store.ImportOptions(allow_dataset_object_edit=True, allow_library_creation=True)
    import_model_store = store.get_import_model_store_for_directory(
        temp_directory, app=app, user=u, import_options=import_options)
    import_model_store.perform_import()

    sa_session = app.model.context
    return sa_session
def test_model_create_context_persist_hdas():
    work_directory = mkdtemp()
    with open(os.path.join(work_directory, "file1.txt"), "w") as f:
        f.write("hello world\nhello world line 2")
    target = {
        "destination": {"type": "hdas"},
        "elements": [{
            "filename": "file1.txt",
            "ext": "txt",
            "dbkey": "hg19",
            "name": "my file",
            "md5": "e5d21b1ea57fc9a31f8ea0110531bf3d",
            "tags": ["name:value"],
        }],
    }
    app = _mock_app()
    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, serialize_dataset_objects=True) as export_store:
        persist_target_to_export_store(target, export_store, app.object_store, work_directory)

    import_history = _import_directory_to_history(app, temp_directory, work_directory)
    assert len(import_history.datasets) == 1
    imported_hda = import_history.datasets[0]
    assert imported_hda.ext == "txt"
    assert imported_hda.name == "my file"
    assert imported_hda.metadata.data_lines == 2
    assert len(imported_hda.dataset.hashes) == 1
    assert imported_hda.dataset.hashes[0].hash_value == "e5d21b1ea57fc9a31f8ea0110531bf3d"
    tags = imported_hda.tags
    assert len(tags) == 1
    assert tags[0].value == "value"

    with open(imported_hda.file_name) as f:
        assert f.read().startswith("hello world\n")
def test_import_export_library():
    """Test basics of library, library folder, and library dataset import/export."""
    app = _mock_app()
    sa_session = app.model.context

    u = model.User(email="*****@*****.**", password="******")

    library = model.Library(name="my library 1", description="my library description", synopsis="my synopsis")
    root_folder = model.LibraryFolder(name="my library 1", description='folder description')
    library.root_folder = root_folder
    sa_session.add_all((library, root_folder))
    sa_session.flush()

    subfolder = model.LibraryFolder(name="sub folder 1", description="sub folder")
    root_folder.add_folder(subfolder)
    sa_session.add(subfolder)

    ld = model.LibraryDataset(folder=root_folder, name="my name", info="my library dataset")
    ldda = model.LibraryDatasetDatasetAssociation(create_dataset=True, flush=False)
    ld.library_dataset_dataset_association = ldda
    root_folder.add_library_dataset(ld)
    sa_session.add(ld)
    sa_session.add(ldda)
    sa_session.flush()

    assert len(root_folder.datasets) == 1
    assert len(root_folder.folders) == 1

    temp_directory = mkdtemp()
    with store.DirectoryModelExportStore(temp_directory, app=app) as export_store:
        export_store.export_library(library)

    import_model_store = store.get_import_model_store_for_directory(
        temp_directory, app=app, user=u, import_options=store.ImportOptions(allow_library_creation=True))
    import_model_store.perform_import()

    all_libraries = sa_session.query(model.Library).all()
    assert len(all_libraries) == 2, len(all_libraries)
    all_lddas = sa_session.query(model.LibraryDatasetDatasetAssociation).all()
    assert len(all_lddas) == 2, len(all_lddas)

    new_library = [lib for lib in all_libraries if lib.id != library.id][0]
    assert new_library.name == "my library 1"
    assert new_library.description == "my library description"
    assert new_library.synopsis == "my synopsis"

    new_root = new_library.root_folder
    assert new_root
    assert new_root.name == "my library 1"
    assert len(new_root.folders) == 1
    assert len(new_root.datasets) == 1
def test_import_export_edit_collection(): """Test modifying existing collections with imports.""" app = _mock_app() sa_session = app.model.context u = model.User(email="*****@*****.**", password="******") h = model.History(name="Test History", user=u) c1 = model.DatasetCollection(collection_type="list", populated=False) hc1 = model.HistoryDatasetCollectionAssociation( history=h, hid=1, collection=c1, name="HistoryCollectionTest1") sa_session.add(hc1) sa_session.add(h) sa_session.flush() import_history = model.History(name="Test History for Import", user=u) sa_session.add(import_history) temp_directory = mkdtemp() with store.DirectoryModelExportStore(temp_directory, app=app, for_edit=True) as export_store: export_store.add_dataset_collection(hc1) # Fabric editing metadata for collection... collections_metadata_path = os.path.join(temp_directory, store.ATTRS_FILENAME_COLLECTIONS) datasets_metadata_path = os.path.join(temp_directory, store.ATTRS_FILENAME_DATASETS) with open(collections_metadata_path, "r") as f: hdcas_metadata = json.load(f) assert len(hdcas_metadata) == 1 hdca_metadata = hdcas_metadata[0] assert hdca_metadata assert "id" in hdca_metadata assert "collection" in hdca_metadata collection_metadata = hdca_metadata["collection"] assert "populated_state" in collection_metadata assert collection_metadata[ "populated_state"] == model.DatasetCollection.populated_states.NEW collection_metadata[ "populated_state"] = model.DatasetCollection.populated_states.OK d1 = model.HistoryDatasetAssociation(extension="txt", create_dataset=True, flush=False) d1.hid = 1 d2 = model.HistoryDatasetAssociation(extension="txt", create_dataset=True, flush=False) d2.hid = 2 serialization_options = model.SerializationOptions(for_edit=True) dataset_list = [ d1.serialize(app.security, serialization_options), d2.serialize(app.security, serialization_options) ] dc = model.DatasetCollection( id=collection_metadata["id"], collection_type="list", element_count=2, ) dc.populated_state = model.DatasetCollection.populated_states.OK dce1 = model.DatasetCollectionElement( element=d1, element_index=0, element_identifier="first", ) dce2 = model.DatasetCollectionElement( element=d2, element_index=1, element_identifier="second", ) dc.elements = [dce1, dce2] with open(datasets_metadata_path, "w") as datasets_f: json.dump(dataset_list, datasets_f) hdca_metadata["collection"] = dc.serialize(app.security, serialization_options) with open(collections_metadata_path, "w") as collections_f: json.dump(hdcas_metadata, collections_f) _perform_import_from_directory(temp_directory, app, u, import_history, store.ImportOptions(allow_edit=True)) sa_session.refresh(c1) assert c1.populated_state == model.DatasetCollection.populated_states.OK, c1.populated_state assert len(c1.elements) == 2
def set_metadata_portable(): import galaxy.model tool_job_working_directory = os.path.abspath(os.getcwd()) metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata") galaxy.model.metadata.MetadataTempFile.tmp_dir = metadata_tmp_files_dir metadata_params_path = os.path.join("metadata", "params.json") try: with open(metadata_params_path, "r") as f: metadata_params = json.load(f) except IOError: raise Exception("Failed to find metadata/params.json from cwd [%s]" % tool_job_working_directory) datatypes_config = metadata_params["datatypes_config"] job_metadata = metadata_params["job_metadata"] provided_metadata_style = metadata_params.get("provided_metadata_style") max_metadata_value_size = metadata_params.get( "max_metadata_value_size") or 0 outputs = metadata_params["outputs"] datatypes_registry = validate_and_load_datatypes_config(datatypes_config) tool_provided_metadata = load_job_metadata(job_metadata, provided_metadata_style) def set_meta(new_dataset_instance, file_dict): set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size) object_store_conf_path = os.path.join("metadata", "object_store_conf.json") extended_metadata_collection = os.path.exists(object_store_conf_path) object_store = None job_context = None version_string = "" export_store = None if extended_metadata_collection: from galaxy.tool_util.parser.stdio import ToolStdioRegex, ToolStdioExitCode tool_dict = metadata_params["tool"] stdio_exit_code_dicts, stdio_regex_dicts = tool_dict[ "stdio_exit_codes"], tool_dict["stdio_regexes"] stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts)) stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts)) with open(object_store_conf_path, "r") as f: config_dict = json.load(f) from galaxy.objectstore import build_object_store_from_config assert config_dict is not None object_store = build_object_store_from_config(None, config_dict=config_dict) galaxy.model.Dataset.object_store = object_store outputs_directory = os.path.join(tool_job_working_directory, "outputs") if not os.path.exists(outputs_directory): outputs_directory = tool_job_working_directory # TODO: constants... if os.path.exists(os.path.join(outputs_directory, "tool_stdout")): with open(os.path.join(outputs_directory, "tool_stdout"), "rb") as f: tool_stdout = f.read() with open(os.path.join(outputs_directory, "tool_stderr"), "rb") as f: tool_stderr = f.read() elif os.path.exists(os.path.join(outputs_directory, "stdout")): # Puslar style working directory. with open(os.path.join(outputs_directory, "stdout"), "rb") as f: tool_stdout = f.read() with open(os.path.join(outputs_directory, "stderr"), "rb") as f: tool_stderr = f.read() job_id_tag = metadata_params["job_id_tag"] # TODO: this clearly needs to be refactored, nothing in runners should be imported here.. 
from galaxy.job_execution.output_collect import default_exit_code_file, read_exit_code_from exit_code_file = default_exit_code_file(".", job_id_tag) tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag) from galaxy.tool_util.output_checker import check_output, DETECTED_JOB_STATE check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output( stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr, tool_exit_code, job_id_tag) if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs( ): final_job_state = galaxy.model.Job.states.OK else: final_job_state = galaxy.model.Job.states.ERROR from pulsar.client.staging import COMMAND_VERSION_FILENAME version_string = "" if os.path.exists(COMMAND_VERSION_FILENAME): version_string = open(COMMAND_VERSION_FILENAME).read() # TODO: handle outputs_to_working_directory? from galaxy.util.expressions import ExpressionContext job_context = ExpressionContext( dict(stdout=tool_stdout, stderr=tool_stderr)) # Load outputs. import_model_store = store.imported_store_for_metadata( 'metadata/outputs_new', object_store=object_store) export_store = store.DirectoryModelExportStore( 'metadata/outputs_populated', serialize_dataset_objects=True, for_edit=True) for output_name, output_dict in outputs.items(): if extended_metadata_collection: dataset_instance_id = output_dict["id"] dataset = import_model_store.sa_session.query( galaxy.model.HistoryDatasetAssociation).find( dataset_instance_id) assert dataset is not None else: filename_in = os.path.join("metadata/metadata_in_%s" % output_name) dataset = cPickle.load(open(filename_in, 'rb')) # load DatasetInstance filename_kwds = os.path.join("metadata/metadata_kwds_%s" % output_name) filename_out = os.path.join("metadata/metadata_out_%s" % output_name) filename_results_code = os.path.join("metadata/metadata_results_%s" % output_name) override_metadata = os.path.join("metadata/metadata_override_%s" % output_name) dataset_filename_override = output_dict["filename_override"] # Same block as below... 
set_meta_kwds = stringify_dictionary_keys( json.load(open(filename_kwds)) ) # load kwds; need to ensure our keywords are not unicode try: dataset.dataset.external_filename = dataset_filename_override store_by = metadata_params.get("object_store_store_by", "id") extra_files_dir_name = "dataset_%s_files" % getattr( dataset.dataset, store_by) files_path = os.path.abspath( os.path.join(tool_job_working_directory, "working", extra_files_dir_name)) dataset.dataset.external_extra_files_path = files_path file_dict = tool_provided_metadata.get_dataset_meta( output_name, dataset.dataset.id, dataset.dataset.uuid) if 'ext' in file_dict: dataset.extension = file_dict['ext'] # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles override_metadata = json.load(open(override_metadata)) for metadata_name, metadata_file_override in override_metadata: if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override): metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override) setattr(dataset.metadata, metadata_name, metadata_file_override) if output_dict.get("validate", False): set_validated_state(dataset) set_meta(dataset, file_dict) if extended_metadata_collection: meta = tool_provided_metadata.get_dataset_meta( output_name, dataset.dataset.id, dataset.dataset.uuid) if meta: context = ExpressionContext(meta, job_context) else: context = job_context # Lazy and unattached # if getattr(dataset, "hidden_beneath_collection_instance", None): # dataset.visible = False dataset.blurb = 'done' dataset.peek = 'no peek' dataset.info = (dataset.info or '') if context['stdout'].strip(): # Ensure white space between entries dataset.info = dataset.info.rstrip( ) + "\n" + context['stdout'].strip() if context['stderr'].strip(): # Ensure white space between entries dataset.info = dataset.info.rstrip( ) + "\n" + context['stderr'].strip() dataset.tool_version = version_string dataset.set_size() if 'uuid' in context: dataset.dataset.uuid = context['uuid'] object_store.update_from_file(dataset.dataset, create=True) from galaxy.job_execution.output_collect import collect_extra_files collect_extra_files(object_store, dataset, ".") if galaxy.model.Job.states.ERROR == final_job_state: dataset.blurb = "error" dataset.mark_unhidden() else: # If the tool was expected to set the extension, attempt to retrieve it if dataset.ext == 'auto': dataset.extension = context.get('ext', 'data') dataset.init_meta(copy_from=dataset) # This has already been done: # else: # self.external_output_metadata.load_metadata(dataset, output_name, self.sa_session, working_directory=self.working_directory, remote_metadata_directory=remote_metadata_directory) line_count = context.get('line_count', None) try: # Certain datatype's set_peek methods contain a line_count argument dataset.set_peek(line_count=line_count) except TypeError: # ... 
and others don't dataset.set_peek() from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_KEYS for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS: if context_key in context: context_value = context[context_key] setattr(dataset, context_key, context_value) if extended_metadata_collection: export_store.add_dataset(dataset) else: cPickle.dump(dataset, open(filename_out, 'wb+')) else: dataset.metadata.to_JSON_dict( filename_out) # write out results of set_meta json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+')) # setting metadata has succeeded except Exception: json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+')) # setting metadata has failed somehow if extended_metadata_collection: # discover extra outputs... from galaxy.job_execution.output_collect import collect_dynamic_outputs, collect_primary_datasets, SessionlessJobContext job_context = SessionlessJobContext( metadata_params, tool_provided_metadata, object_store, export_store, import_model_store, os.path.join(tool_job_working_directory, "working")) output_collections = {} for name, output_collection in metadata_params[ "output_collections"].items(): output_collections[name] = import_model_store.sa_session.query( galaxy.model.HistoryDatasetCollectionAssociation).find( output_collection["id"]) outputs = {} for name, output in metadata_params["outputs"].items(): outputs[name] = import_model_store.sa_session.query( galaxy.model.HistoryDatasetAssociation).find(output["id"]) input_ext = json.loads(metadata_params["job_params"].get( "__input_ext", '"data"')) collect_primary_datasets( job_context, outputs, input_ext=input_ext, ) collect_dynamic_outputs(job_context, output_collections) if export_store: export_store._finalize() write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
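# A small sketch of the ExpressionContext layering used above: per-dataset metadata from
# the tool is consulted first, and missing keys fall back to the job-wide stdout/stderr
# context (the values below are hypothetical).
from galaxy.util.expressions import ExpressionContext

def _sketch_context_layering():
    job_context = ExpressionContext(dict(stdout="tool finished", stderr=""))
    meta = {"ext": "txt"}  # hypothetical per-dataset entry from tool-provided metadata
    context = ExpressionContext(meta, job_context)
    assert context["ext"] == "txt"               # resolved from the dataset-level layer
    assert context["stdout"] == "tool finished"  # falls through to the job-level layer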
def set_metadata_portable(): tool_job_working_directory = os.path.abspath(os.getcwd()) metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata") MetadataTempFile.tmp_dir = metadata_tmp_files_dir metadata_params_path = os.path.join("metadata", "params.json") try: with open(metadata_params_path) as f: metadata_params = json.load(f) except OSError: raise Exception( f"Failed to find metadata/params.json from cwd [{tool_job_working_directory}]" ) datatypes_config = metadata_params["datatypes_config"] job_metadata = metadata_params["job_metadata"] provided_metadata_style = metadata_params.get("provided_metadata_style") max_metadata_value_size = metadata_params.get( "max_metadata_value_size") or 0 outputs = metadata_params["outputs"] datatypes_registry = validate_and_load_datatypes_config(datatypes_config) tool_provided_metadata = load_job_metadata(job_metadata, provided_metadata_style) def set_meta(new_dataset_instance, file_dict): set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size) object_store_conf_path = os.path.join("metadata", "object_store_conf.json") extended_metadata_collection = os.path.exists(object_store_conf_path) object_store = None job_context = None version_string = "" export_store = None final_job_state = Job.states.OK if extended_metadata_collection: tool_dict = metadata_params["tool"] stdio_exit_code_dicts, stdio_regex_dicts = tool_dict[ "stdio_exit_codes"], tool_dict["stdio_regexes"] stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts)) stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts)) with open(object_store_conf_path) as f: config_dict = json.load(f) assert config_dict is not None object_store = build_object_store_from_config(None, config_dict=config_dict) Dataset.object_store = object_store outputs_directory = os.path.join(tool_job_working_directory, "outputs") if not os.path.exists(outputs_directory): outputs_directory = tool_job_working_directory # TODO: constants... if os.path.exists(os.path.join(outputs_directory, "tool_stdout")): with open(os.path.join(outputs_directory, "tool_stdout"), "rb") as f: tool_stdout = f.read() with open(os.path.join(outputs_directory, "tool_stderr"), "rb") as f: tool_stderr = f.read() elif os.path.exists(os.path.join(tool_job_working_directory, "stdout")): with open(os.path.join(tool_job_working_directory, "stdout"), "rb") as f: tool_stdout = f.read() with open(os.path.join(tool_job_working_directory, "stderr"), "rb") as f: tool_stderr = f.read() elif os.path.exists(os.path.join(outputs_directory, "stdout")): # Puslar style output directory? Was this ever used - did this ever work? with open(os.path.join(outputs_directory, "stdout"), "rb") as f: tool_stdout = f.read() with open(os.path.join(outputs_directory, "stderr"), "rb") as f: tool_stderr = f.read() else: wdc = os.listdir(tool_job_working_directory) odc = os.listdir(outputs_directory) error_desc = "Failed to find tool_stdout or tool_stderr for this job, cannot collect metadata" error_extra = f"Working dir contents [{wdc}], output directory contents [{odc}]" log.warn(f"{error_desc}. 
{error_extra}") raise Exception(error_desc) job_id_tag = metadata_params["job_id_tag"] exit_code_file = default_exit_code_file(".", job_id_tag) tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag) check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output( stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr, tool_exit_code, job_id_tag) if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs( ): final_job_state = Job.states.OK else: final_job_state = Job.states.ERROR version_string = "" if os.path.exists(COMMAND_VERSION_FILENAME): version_string = open(COMMAND_VERSION_FILENAME).read() expression_context = ExpressionContext( dict(stdout=tool_stdout, stderr=tool_stderr)) # Load outputs. export_store = store.DirectoryModelExportStore( 'metadata/outputs_populated', serialize_dataset_objects=True, for_edit=True, strip_metadata_files=False, serialize_jobs=False) try: import_model_store = store.imported_store_for_metadata( 'metadata/outputs_new', object_store=object_store) except AssertionError: # Remove in 21.09, this should only happen for jobs that started on <= 20.09 and finish now import_model_store = None job_context = SessionlessJobContext( metadata_params, tool_provided_metadata, object_store, export_store, import_model_store, os.path.join(tool_job_working_directory, "working"), final_job_state=final_job_state, ) unnamed_id_to_path = {} for unnamed_output_dict in job_context.tool_provided_metadata.get_unnamed_outputs( ): destination = unnamed_output_dict["destination"] elements = unnamed_output_dict["elements"] destination_type = destination["type"] if destination_type == 'hdas': for element in elements: filename = element.get('filename') if filename: unnamed_id_to_path[element['object_id']] = os.path.join( job_context.job_working_directory, filename) for output_name, output_dict in outputs.items(): dataset_instance_id = output_dict["id"] klass = getattr( galaxy.model, output_dict.get('model_class', 'HistoryDatasetAssociation')) dataset = None if import_model_store: dataset = import_model_store.sa_session.query(klass).find( dataset_instance_id) if dataset is None: # legacy check for jobs that started before 21.01, remove on 21.05 filename_in = os.path.join(f"metadata/metadata_in_{output_name}") import pickle dataset = pickle.load(open(filename_in, 'rb')) # load DatasetInstance assert dataset is not None filename_kwds = os.path.join(f"metadata/metadata_kwds_{output_name}") filename_out = os.path.join(f"metadata/metadata_out_{output_name}") filename_results_code = os.path.join( f"metadata/metadata_results_{output_name}") override_metadata = os.path.join( f"metadata/metadata_override_{output_name}") dataset_filename_override = output_dict["filename_override"] # pre-20.05 this was a per job parameter and not a per dataset parameter, drop in 21.XX legacy_object_store_store_by = metadata_params.get( "object_store_store_by", "id") # Same block as below... 
set_meta_kwds = stringify_dictionary_keys( json.load(open(filename_kwds)) ) # load kwds; need to ensure our keywords are not unicode try: dataset.dataset.external_filename = unnamed_id_to_path.get( dataset_instance_id, dataset_filename_override) store_by = output_dict.get("object_store_store_by", legacy_object_store_store_by) extra_files_dir_name = f"dataset_{getattr(dataset.dataset, store_by)}_files" files_path = os.path.abspath( os.path.join(tool_job_working_directory, "working", extra_files_dir_name)) dataset.dataset.external_extra_files_path = files_path file_dict = tool_provided_metadata.get_dataset_meta( output_name, dataset.dataset.id, dataset.dataset.uuid) if 'ext' in file_dict: dataset.extension = file_dict['ext'] # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles override_metadata = json.load(open(override_metadata)) for metadata_name, metadata_file_override in override_metadata: if MetadataTempFile.is_JSONified_value(metadata_file_override): metadata_file_override = MetadataTempFile.from_JSON( metadata_file_override) setattr(dataset.metadata, metadata_name, metadata_file_override) if output_dict.get("validate", False): set_validated_state(dataset) if dataset_instance_id not in unnamed_id_to_path: # We're going to run through set_metadata in collect_dynamic_outputs with more contextual metadata, # so skip set_meta here. set_meta(dataset, file_dict) if extended_metadata_collection: meta = tool_provided_metadata.get_dataset_meta( output_name, dataset.dataset.id, dataset.dataset.uuid) if meta: context = ExpressionContext(meta, expression_context) else: context = expression_context # Lazy and unattached # if getattr(dataset, "hidden_beneath_collection_instance", None): # dataset.visible = False dataset.blurb = 'done' dataset.peek = 'no peek' dataset.info = (dataset.info or '') if context['stdout'].strip(): # Ensure white space between entries dataset.info = f"{dataset.info.rstrip()}\n{context['stdout'].strip()}" if context['stderr'].strip(): # Ensure white space between entries dataset.info = f"{dataset.info.rstrip()}\n{context['stderr'].strip()}" dataset.tool_version = version_string dataset.set_size() if 'uuid' in context: dataset.dataset.uuid = context['uuid'] if dataset_filename_override and dataset_filename_override != dataset.file_name: # This has to be a job with outputs_to_working_directory set. # We update the object store with the created output file. object_store.update_from_file( dataset.dataset, file_name=dataset_filename_override, create=True) collect_extra_files(object_store, dataset, ".") if Job.states.ERROR == final_job_state: dataset.blurb = "error" dataset.mark_unhidden() else: # If the tool was expected to set the extension, attempt to retrieve it if dataset.ext == 'auto': dataset.extension = context.get('ext', 'data') dataset.init_meta(copy_from=dataset) # This has already been done: # else: # self.external_output_metadata.load_metadata(dataset, output_name, self.sa_session, working_directory=self.working_directory, remote_metadata_directory=remote_metadata_directory) line_count = context.get('line_count', None) try: # Certain datatype's set_peek methods contain a line_count argument dataset.set_peek(line_count=line_count) except TypeError: # ... 
and others don't dataset.set_peek() for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS: if context_key in context: context_value = context[context_key] setattr(dataset, context_key, context_value) # We never want to persist the external_filename. dataset.dataset.external_filename = None export_store.add_dataset(dataset) else: dataset.metadata.to_JSON_dict( filename_out) # write out results of set_meta json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+')) # setting metadata has succeeded except Exception: json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+')) # setting metadata has failed somehow if extended_metadata_collection: # discover extra outputs... output_collections = {} for name, output_collection in metadata_params[ "output_collections"].items(): output_collections[name] = import_model_store.sa_session.query( HistoryDatasetCollectionAssociation).find( output_collection["id"]) outputs = {} for name, output in metadata_params["outputs"].items(): klass = getattr( galaxy.model, output.get('model_class', 'HistoryDatasetAssociation')) outputs[name] = import_model_store.sa_session.query(klass).find( output["id"]) input_ext = json.loads(metadata_params["job_params"].get( "__input_ext", '"data"')) collect_primary_datasets( job_context, outputs, input_ext=input_ext, ) collect_dynamic_outputs(job_context, output_collections) if export_store: export_store._finalize() write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
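# Sketch: reading back the per-output status files written above. Each
# metadata_results_<output_name> file holds a JSON (ok, message) pair; the default
# output name below is hypothetical.
import json

def _sketch_read_metadata_result(output_name="out_file1"):
    with open(f"metadata/metadata_results_{output_name}") as f:
        ok, message = json.load(f)
    if not ok:
        raise Exception(f"Setting metadata failed: {message}")
    return message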
def set_metadata_portable(): tool_job_working_directory = os.path.abspath(os.getcwd()) metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata") MetadataTempFile.tmp_dir = metadata_tmp_files_dir metadata_params = get_metadata_params(tool_job_working_directory) datatypes_config = metadata_params["datatypes_config"] job_metadata = metadata_params["job_metadata"] provided_metadata_style = metadata_params.get("provided_metadata_style") max_metadata_value_size = metadata_params.get("max_metadata_value_size") or 0 max_discovered_files = metadata_params.get("max_discovered_files") outputs = metadata_params["outputs"] datatypes_registry = validate_and_load_datatypes_config(datatypes_config) tool_provided_metadata = load_job_metadata(job_metadata, provided_metadata_style) def set_meta(new_dataset_instance, file_dict): set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size) try: object_store = get_object_store(tool_job_working_directory=tool_job_working_directory) except (FileNotFoundError, AssertionError): object_store = None extended_metadata_collection = bool(object_store) job_context = None version_string = None export_store = None final_job_state = Job.states.OK job_messages = [] if extended_metadata_collection: tool_dict = metadata_params["tool"] stdio_exit_code_dicts, stdio_regex_dicts = tool_dict["stdio_exit_codes"], tool_dict["stdio_regexes"] stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts)) stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts)) outputs_directory = os.path.join(tool_job_working_directory, "outputs") if not os.path.exists(outputs_directory): outputs_directory = tool_job_working_directory # TODO: constants... locations = [ (outputs_directory, 'tool_'), (tool_job_working_directory, ''), (outputs_directory, ''), # # Pulsar style output directory? Was this ever used - did this ever work? ] for directory, prefix in locations: if os.path.exists(os.path.join(directory, f"{prefix}stdout")): with open(os.path.join(directory, f"{prefix}stdout"), 'rb') as f: tool_stdout = f.read(MAX_STDIO_READ_BYTES) with open(os.path.join(directory, f"{prefix}stderr"), 'rb') as f: tool_stderr = f.read(MAX_STDIO_READ_BYTES) break else: if os.path.exists(os.path.join(tool_job_working_directory, 'task_0')): # We have a task splitting job tool_stdout = b'' tool_stderr = b'' paths = Path(tool_job_working_directory).glob('task_*') for path in paths: with open(path / 'outputs' / 'tool_stdout', 'rb') as f: task_stdout = f.read(MAX_STDIO_READ_BYTES) if task_stdout: tool_stdout = b"%s[%s stdout]\n%s\n" % (tool_stdout, path.name.encode(), task_stdout) with open(path / 'outputs' / 'tool_stderr', 'rb') as f: task_stderr = f.read(MAX_STDIO_READ_BYTES) if task_stderr: tool_stderr = b"%s[%s stdout]\n%s\n" % (tool_stderr, path.name.encode(), task_stderr) else: wdc = os.listdir(tool_job_working_directory) odc = os.listdir(outputs_directory) error_desc = "Failed to find tool_stdout or tool_stderr for this job, cannot collect metadata" error_extra = f"Working dir contents [{wdc}], output directory contents [{odc}]" log.warn(f"{error_desc}. 
{error_extra}") raise Exception(error_desc) job_id_tag = metadata_params["job_id_tag"] exit_code_file = default_exit_code_file(".", job_id_tag) tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag) check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output(stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr, tool_exit_code, job_id_tag) if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs(): final_job_state = Job.states.OK else: final_job_state = Job.states.ERROR version_string_path = os.path.join('outputs', COMMAND_VERSION_FILENAME) version_string = collect_shrinked_content_from_path(version_string_path) expression_context = ExpressionContext(dict(stdout=tool_stdout[:255], stderr=tool_stderr[:255])) # Load outputs. export_store = store.DirectoryModelExportStore('metadata/outputs_populated', serialize_dataset_objects=True, for_edit=True, strip_metadata_files=False, serialize_jobs=True) try: import_model_store = store.imported_store_for_metadata('metadata/outputs_new', object_store=object_store) except AssertionError: # Remove in 21.09, this should only happen for jobs that started on <= 20.09 and finish now import_model_store = None tool_script_file = os.path.join(tool_job_working_directory, 'tool_script.sh') job = None if import_model_store and export_store: job = next(iter(import_model_store.sa_session.objects[Job].values())) job_context = SessionlessJobContext( metadata_params, tool_provided_metadata, object_store, export_store, import_model_store, os.path.join(tool_job_working_directory, "working"), final_job_state=final_job_state, max_discovered_files=max_discovered_files, ) if extended_metadata_collection: # discover extra outputs... output_collections = {} for name, output_collection in metadata_params["output_collections"].items(): # TODO: remove HistoryDatasetCollectionAssociation fallback on 22.01, model_class used to not be serialized prior to 21.09 model_class = output_collection.get('model_class', 'HistoryDatasetCollectionAssociation') collection = import_model_store.sa_session.query(getattr(galaxy.model, model_class)).find(output_collection["id"]) output_collections[name] = collection output_instances = {} for name, output in metadata_params["outputs"].items(): klass = getattr(galaxy.model, output.get('model_class', 'HistoryDatasetAssociation')) output_instances[name] = import_model_store.sa_session.query(klass).find(output["id"]) input_ext = json.loads(metadata_params["job_params"].get("__input_ext") or '"data"') try: collect_primary_datasets( job_context, output_instances, input_ext=input_ext, ) collect_dynamic_outputs(job_context, output_collections) except MaxDiscoveredFilesExceededError as e: final_job_state = Job.states.ERROR job_messages.append(str(e)) if job: job.job_messages = job_messages job.state = final_job_state if os.path.exists(tool_script_file): with open(tool_script_file) as command_fh: command_line_lines = [] for i, line in enumerate(command_fh): if i == 0 and line.endswith('COMMAND_VERSION 2>&1;'): # Don't record version command as part of command line continue command_line_lines.append(line) job.command_line = "".join(command_line_lines).strip() export_store.export_job(job, include_job_data=False) unnamed_id_to_path = {} for unnamed_output_dict in job_context.tool_provided_metadata.get_unnamed_outputs(): destination = unnamed_output_dict["destination"] elements = unnamed_output_dict["elements"] destination_type = destination["type"] if destination_type == 'hdas': 
for element in elements: filename = element.get('filename') object_id = element.get('object_id') if filename and object_id: unnamed_id_to_path[object_id] = os.path.join(job_context.job_working_directory, filename) for output_name, output_dict in outputs.items(): dataset_instance_id = output_dict["id"] klass = getattr(galaxy.model, output_dict.get('model_class', 'HistoryDatasetAssociation')) dataset = None if import_model_store: dataset = import_model_store.sa_session.query(klass).find(dataset_instance_id) if dataset is None: # legacy check for jobs that started before 21.01, remove on 21.05 filename_in = os.path.join(f"metadata/metadata_in_{output_name}") import pickle dataset = pickle.load(open(filename_in, 'rb')) # load DatasetInstance assert dataset is not None filename_kwds = os.path.join(f"metadata/metadata_kwds_{output_name}") filename_out = os.path.join(f"metadata/metadata_out_{output_name}") filename_results_code = os.path.join(f"metadata/metadata_results_{output_name}") override_metadata = os.path.join(f"metadata/metadata_override_{output_name}") dataset_filename_override = output_dict["filename_override"] # pre-20.05 this was a per job parameter and not a per dataset parameter, drop in 21.XX legacy_object_store_store_by = metadata_params.get("object_store_store_by", "id") # Same block as below... set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds))) # load kwds; need to ensure our keywords are not unicode try: external_filename = unnamed_id_to_path.get(dataset_instance_id, dataset_filename_override) if not os.path.exists(external_filename): matches = glob.glob(external_filename) assert len(matches) == 1, f"More than one file matched by output glob '{external_filename}'" external_filename = matches[0] assert safe_contains(tool_job_working_directory, external_filename), f"Cannot collect output '{external_filename}' from outside of working directory" created_from_basename = os.path.relpath(external_filename, os.path.join(tool_job_working_directory, 'working')) dataset.dataset.created_from_basename = created_from_basename # override filename if we're dealing with outputs to working directory and dataset is not linked to link_data_only = metadata_params.get("link_data_only") if not link_data_only: # Only set external filename if we're dealing with files in job working directory. 
# Fixes link_data_only uploads dataset.dataset.external_filename = external_filename store_by = output_dict.get("object_store_store_by", legacy_object_store_store_by) extra_files_dir_name = f"dataset_{getattr(dataset.dataset, store_by)}_files" files_path = os.path.abspath(os.path.join(tool_job_working_directory, "working", extra_files_dir_name)) dataset.dataset.external_extra_files_path = files_path file_dict = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid) if 'ext' in file_dict: dataset.extension = file_dict['ext'] # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles override_metadata = json.load(open(override_metadata)) for metadata_name, metadata_file_override in override_metadata: if MetadataTempFile.is_JSONified_value(metadata_file_override): metadata_file_override = MetadataTempFile.from_JSON(metadata_file_override) setattr(dataset.metadata, metadata_name, metadata_file_override) if output_dict.get("validate", False): set_validated_state(dataset) if dataset_instance_id not in unnamed_id_to_path: # We're going to run through set_metadata in collect_dynamic_outputs with more contextual metadata, # so skip set_meta here. set_meta(dataset, file_dict) if extended_metadata_collection: collect_extra_files(object_store, dataset, ".") dataset.state = dataset.dataset.state = final_job_state if extended_metadata_collection: if not link_data_only and os.path.getsize(external_filename): # Here we might be updating a disk based objectstore when outputs_to_working_directory is used, # or a remote object store from its cache path. object_store.update_from_file(dataset.dataset, file_name=external_filename, create=True) # TODO: merge expression_context into tool_provided_metadata so we don't have to special case this (here and in _finish_dataset) meta = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid) if meta: context = ExpressionContext(meta, expression_context) else: context = expression_context dataset.blurb = 'done' dataset.peek = 'no peek' dataset.info = (dataset.info or '') if context['stdout'].strip(): # Ensure white space between entries dataset.info = f"{dataset.info.rstrip()}\n{context['stdout'].strip()}" if context['stderr'].strip(): # Ensure white space between entries dataset.info = f"{dataset.info.rstrip()}\n{context['stderr'].strip()}" dataset.tool_version = version_string if 'uuid' in context: dataset.dataset.uuid = context['uuid'] if not final_job_state == Job.states.ERROR: line_count = context.get('line_count', None) try: # Certain datatype's set_peek methods contain a line_count argument dataset.set_peek(line_count=line_count) except TypeError: # ... and others don't dataset.set_peek() for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS: if context_key in context: context_value = context[context_key] setattr(dataset, context_key, context_value) # We only want to persist the external_filename if the dataset has been linked in. 
if not link_data_only: dataset.dataset.external_filename = None dataset.dataset.extra_files_path = None export_store.add_dataset(dataset) else: dataset.metadata.to_JSON_dict(filename_out) # write out results of set_meta json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+')) # setting metadata has succeeded except Exception: json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+')) # setting metadata has failed somehow if export_store: export_store._finalize() write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
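# For orientation, the job-directory layout that set_metadata_portable relies on,
# reconstructed only from the paths referenced in this module (a sketch, not an
# exhaustive listing):
#
#   metadata/params.json                     datatypes config, outputs, tool stdio specs, job params
#   metadata/object_store_conf.json          present only when extended metadata collection is enabled
#   metadata/outputs_new/                    model store imported before metadata is set
#   metadata/outputs_populated/              model store written via DirectoryModelExportStore
#   metadata/metadata_kwds_<output>          keyword arguments passed to set_meta
#   metadata/metadata_out_<output>           serialized metadata written by set_meta (non-extended path)
#   metadata/metadata_results_<output>       JSON (ok, message) status per output
#   metadata/metadata_override_<output>      JSONified MetadataTempFile overrides
#   outputs/tool_stdout, outputs/tool_stderr captured tool output (with older fallback locations)
#   outputs/<COMMAND_VERSION_FILENAME>       tool version string collected into version_string
#   tool_script.sh                           used to reconstruct job.command_line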