def prepare_specs(self):
    with self.yaml_path.open("r") as file:
        specs = yaml.safe_load(file)

    self.instruction_name = Util.check_instruction_type(specs, DataSimulationTool.__name__, self.expected_instruction)
    self.export_format = Util.check_export_format(specs, DataSimulationTool.__name__, self.instruction_name)

    ParameterValidator.assert_keys_present(specs["definitions"], ["datasets"], DataSimulationTool.__name__, "definitions/datasets")
    ParameterValidator.assert_type_and_value(specs['definitions']['datasets'], dict, DataSimulationTool.__name__, "definitions/datasets")

    dataset_names = list(specs['definitions']['datasets'].keys())
    assert len(dataset_names) == 1, f"{DataSimulationTool.__name__}: one dataset has to be defined under definitions/datasets, got " \
                                    f"{dataset_names} instead."

    self.dataset_name = dataset_names[0]

    Util.check_paths(specs, DataSimulationTool.__name__)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)

def build(cls, **kwargs):
    ParameterValidator.assert_keys_present(list(kwargs.keys()), ['metadata_file', 'name', 'repertoire_ids', 'metadata_fields'],
                                           RepertoireDataset.__name__, "repertoire dataset")

    repertoires = []
    metadata_df = pd.read_csv(kwargs['metadata_file'], comment=Constants.COMMENT_SIGN)

    for index, row in metadata_df.iterrows():
        filename = Path(kwargs['metadata_file']).parent / row['filename']
        # fall back to the dataset root if the file is no longer under the 'repertoires' subfolder
        if not filename.is_file() and 'repertoires' in str(filename):
            filename = filename.parent.parent / Path(row['filename']).name
        repertoire = Repertoire(data_filename=filename, metadata_filename=filename.parent / f'{filename.stem}_metadata.yaml',
                                identifier=row['identifier'])
        repertoires.append(repertoire)

    if "repertoire_ids" in kwargs.keys() and "repertoires" not in kwargs.keys() and kwargs['repertoire_ids'] is not None:
        assert all(rep.identifier == kwargs['repertoire_ids'][i] for i, rep in enumerate(repertoires)), \
            f"{RepertoireDataset.__name__}: repertoire ids from the iml_dataset file and metadata file don't match for the dataset " \
            f"{kwargs['name']} with identifier {kwargs['identifier']}."

    return RepertoireDataset(**{**kwargs, **{"repertoires": repertoires}})

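# Hedged usage sketch for RepertoireDataset.build above: the metadata CSV rows point at
# per-repertoire data files; all paths, identifiers and fields below are hypothetical.
#
#   filename,identifier,subject_id
#   repertoires/rep1.npy,r1,s1
#   repertoires/rep2.npy,r2,s2
#
# dataset = RepertoireDataset.build(metadata_file=Path("result/metadata.csv"), name="d1",
#                                   repertoire_ids=["r1", "r2"], metadata_fields=["subject_id"])
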
def _prepare_specs(self):
    with self.yaml_path.open("r") as file:
        specs = yaml.safe_load(file)

    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], GalaxyTrainMLModel.__name__,
                                           "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], GalaxyTrainMLModel.__name__,
                                                "YAML specification")

    ParameterValidator.assert_type_and_value(specs["instructions"], dict, GalaxyTrainMLModel.__name__, "instructions")

    assert len(list(specs["instructions"].keys())) == 1, f"{GalaxyTrainMLModel.__name__}: one instruction has to be specified under " \
                                                         f"`instructions`, got the following instead: {list(specs['instructions'].keys())}."

    self.instruction_name = list(specs["instructions"].keys())[0]

    ParameterValidator.assert_type_and_value(specs['instructions'][self.instruction_name], dict, GalaxyTrainMLModel.__name__,
                                             self.instruction_name)
    ParameterValidator.assert_keys_present(specs['instructions'][self.instruction_name].keys(), ['type'], GalaxyTrainMLModel.__name__,
                                           self.instruction_name)

    # TrainMLModelInstruction.__name__[:-11] strips the "Instruction" suffix, leaving "TrainMLModel"
    assert specs['instructions'][self.instruction_name]['type'] == TrainMLModelInstruction.__name__[:-11], \
        f"{GalaxyTrainMLModel.__name__}: instruction `type` under {self.instruction_name} has to be {TrainMLModelInstruction.__name__[:-11]} " \
        f"for this tool."

    assert len(specs['instructions'][self.instruction_name]['labels']) == 1, \
        f"{GalaxyTrainMLModel.__name__}: one label has to be specified under " \
        f"`labels`, got the following instead: {specs['instructions'][self.instruction_name]['labels']}."

    Util.check_paths(specs, GalaxyTrainMLModel.__name__)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)

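# A minimal sketch of the specs dict (as produced by yaml.safe_load) that would pass the
# checks in _prepare_specs above; dataset/instruction names and parameters are
# hypothetical, and a full TrainMLModel run would need more instruction keys than shown.
example_specs = {
    "definitions": {"datasets": {"my_dataset": {"format": "AIRR", "params": {"path": "data/"}}}},
    "instructions": {"train_instruction": {"type": "TrainMLModel", "labels": ["disease_status"]}},
}
# self.instruction_name would then become "train_instruction".
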
def build_object(cls, **kwargs):
    ParameterValidator.assert_keys_present(list(kwargs.keys()), ['file_format', 'name'], DesignMatrixExporter.__name__,
                                           DesignMatrixExporter.__name__)
    ParameterValidator.assert_in_valid_list(kwargs['file_format'], ['npy', 'csv', 'npy.zip', 'csv.zip', 'hdf5.zip'],
                                            DesignMatrixExporter.__name__, 'file_format')

    return DesignMatrixExporter(**kwargs)

def _check_dataset(self, specs):
    ParameterValidator.assert_keys_present(specs["definitions"].keys(), ['datasets'], DatasetGenerationTool.__name__, 'definitions')

    assert len(specs['definitions']['datasets'].keys()) == 1, \
        f"{DatasetGenerationTool.__name__}: only one dataset can be defined with this Galaxy tool, got these " \
        f"instead: {list(specs['definitions']['datasets'].keys())}."

    assert len(specs['instructions'].keys()) == 1, \
        f"{DatasetGenerationTool.__name__}: only one instruction of type DatasetExport can be defined with this Galaxy tool, got these " \
        f"instructions instead: {list(specs['instructions'].keys())}."

def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:
    ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)

    # discovered parser class names end in "Parser"; cls[:-6] strips that suffix to get the instruction type
    valid_instructions = [cls[:-6] for cls in ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")]
    ParameterValidator.assert_in_valid_list(instruction["type"], valid_instructions, "InstructionParser", "type")

    default_params = DefaultParamsLoader.load("instructions/", instruction["type"])
    instruction = {**default_params, **instruction}
    parser = ReflectionHandler.get_class_by_name("{}Parser".format(instruction["type"]), "instruction_parsers/")()
    instruction_object = parser.parse(key, instruction, symbol_table, path)

    symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)

    return instruction, symbol_table

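# Hedged usage sketch for parse_instruction above: the dict carries a "type" key that
# selects the "{type}Parser" class via reflection, and user-specified values are merged
# over the type's default parameters; the instruction name and values are hypothetical.
example_instruction = {"type": "DatasetExport", "datasets": ["d1"], "export_formats": ["AIRR"]}
# parsed, symbol_table = InstructionParser.parse_instruction("my_export", example_instruction, symbol_table, result_path)
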
def check_instruction_type(specs: dict, tool_name, expected_instruction) -> str:
    ParameterValidator.assert_keys_present(list(specs.keys()), ['definitions', 'instructions'], tool_name, "YAML specification")

    assert len(list(specs['instructions'].keys())) == 1, f"{tool_name}: multiple instructions were given " \
                                                         f"({str(list(specs['instructions'].keys()))[1:-1]}), but only one instruction of type " \
                                                         f"{expected_instruction} should be specified."

    instruction_name = list(specs['instructions'].keys())[0]
    instruction_type = specs['instructions'][instruction_name]['type']
    assert instruction_type == expected_instruction, \
        f"{tool_name}: instruction type has to be '{expected_instruction}', got {instruction_type} instead."

    return instruction_name

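# A minimal sketch of a specs dict check_instruction_type would accept; the instruction
# name and type below are hypothetical placeholders.
example_specs = {
    "definitions": {"datasets": {"d1": {}}},
    "instructions": {"my_instruction": {"type": "DatasetExport"}},
}
# check_instruction_type(example_specs, "DatasetGenerationTool", "DatasetExport")  # -> "my_instruction"
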
def build_object(cls, **kwargs): ParameterValidator.assert_keys_present(list(kwargs.keys()), ["implanted_motifs_per_label"], "MotifSeedRecovery", "MotifSeedRecovery report") implanted_motifs_per_label = kwargs["implanted_motifs_per_label"] ParameterValidator.assert_type_and_value( implanted_motifs_per_label, dict, "MotifSeedRecovery", f"implanted_motifs_per_label") for label_name in implanted_motifs_per_label.keys(): ParameterValidator.assert_type_and_value( implanted_motifs_per_label[label_name], dict, "MotifSeedRecovery", f"implanted_motifs_per_label/{label_name}") ParameterValidator.assert_keys_present( implanted_motifs_per_label[label_name].keys(), ["hamming_distance", "seeds", "gap_sizes"], "MotifSeedRecovery", f"implanted_motifs_per_label/{label_name}") ParameterValidator.assert_type_and_value( implanted_motifs_per_label[label_name]["hamming_distance"], bool, "MotifSeedRecovery", f"implanted_motifs_per_label/{label_name}/hamming_distance") ParameterValidator.assert_type_and_value( implanted_motifs_per_label[label_name]["gap_sizes"], list, "MotifSeedRecovery", f"implanted_motifs_per_label/{label_name}/gap_sizes") ParameterValidator.assert_type_and_value( implanted_motifs_per_label[label_name]["seeds"], list, "MotifSeedRecovery", f"implanted_motifs_per_label/{label_name}/seeds") for gap_size in implanted_motifs_per_label[label_name][ "gap_sizes"]: ParameterValidator.assert_type_and_value( gap_size, int, "MotifSeedRecovery", f"implanted_motifs_per_label/{label_name}/gap_sizes", min_inclusive=0) for seed in implanted_motifs_per_label[label_name]["seeds"]: ParameterValidator.assert_type_and_value( seed, str, "MotifSeedRecovery", f"implanted_motifs_per_label/{label_name}/seeds") return MotifSeedRecovery(implanted_motifs_per_label)
def check_export_format(specs: dict, tool_name: str, instruction_name: str):
    ParameterValidator.assert_keys_present(list(specs['instructions'][instruction_name].keys()), ["export_formats"], tool_name,
                                           f"{instruction_name}/export_formats")
    ParameterValidator.assert_type_and_value(specs['instructions'][instruction_name]["export_formats"], list, tool_name,
                                             f"{instruction_name}/export_formats")

    assert len(specs['instructions'][instruction_name]["export_formats"]) == 1, \
        f"{tool_name}: only one format can be specified under export_formats parameter under " \
        f"{instruction_name}/export_formats, got {specs['instructions'][instruction_name]['export_formats']} instead."

    return specs['instructions'][instruction_name]["export_formats"][0]

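# Hedged sketch of an instructions block check_export_format would accept (exactly one
# entry under export_formats); the names and the format below are hypothetical.
example_specs = {"instructions": {"my_instruction": {"type": "DatasetExport", "export_formats": ["AIRR"]}}}
# check_export_format(example_specs, "DataSimulationTool", "my_instruction")  # -> "AIRR"
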
def _check_instruction(self, specs):
    instruction_name = Util.check_instruction_type(specs, DatasetGenerationTool.__name__, DatasetExportInstruction.__name__[:-11])

    for key in ['datasets', 'export_formats']:
        ParameterValidator.assert_keys_present(list(specs['instructions'][instruction_name].keys()), [key],
                                               DatasetGenerationTool.__name__, instruction_name)
        ParameterValidator.assert_type_and_value(specs["instructions"][instruction_name][key], list, DatasetGenerationTool.__name__,
                                                 f"{instruction_name}/{key}")
        assert len(specs['instructions'][instruction_name][key]) == 1, \
            f"{DatasetGenerationTool.__name__}: this tool accepts only one item under {key}, got {specs['instructions'][instruction_name][key]} " \
            f"instead."

def _check_specs(self):
    with open(self.yaml_path, "r") as file:
        specs = yaml.safe_load(file)  # safe_load instead of the unsafe yaml.load, which is deprecated without an explicit Loader

    instruction_name = Util.check_instruction_type(specs, GalaxyMLApplicationTool.__name__, MLApplicationInstruction.__name__[:-11])

    ParameterValidator.assert_keys_present(list(specs['instructions'][instruction_name].keys()), ["dataset", "config_path"],
                                           GalaxyMLApplicationTool.__name__, instruction_name)

    assert os.path.isfile(specs['instructions'][instruction_name]['config_path']), \
        f"{GalaxyMLApplicationTool.__name__}: file specified under 'config_path' parameter " \
        f"({specs['instructions'][instruction_name]['config_path']}) is not available. Please check if it was correctly uploaded or if the file" \
        f" name is correct."

def _check_dataset_specs(self, workflow_specification, location):
    ParameterValidator.assert_type_and_value(workflow_specification['definitions'], dict, location, 'definitions')
    ParameterValidator.assert_keys_present(workflow_specification['definitions'].keys(), ['datasets'], location, 'definitions')
    ParameterValidator.assert_type_and_value(workflow_specification['definitions']['datasets'], dict, location, 'datasets')

    dataset_names = list(workflow_specification['definitions']['datasets'].keys())
    # formatting the full list (rather than dataset_names[0]) keeps the message valid even when no dataset is given
    assert len(dataset_names) > 1, \
        f"MultiDatasetBenchmarkTool: multiple datasets have to be specified for this tool, got {dataset_names} instead. " \
        f"If only one dataset is needed, consider using the training instruction directly."

def _update_specs(self):
    with self.yaml_path.open('r') as file:
        specs = yaml.safe_load(file)

    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], DatasetGenerationTool.__name__,
                                           "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"],
                                                DatasetGenerationTool.__name__, "YAML specification")

    self._check_dataset(specs)
    self._check_instruction(specs)

    Util.check_paths(specs, DatasetGenerationTool.__name__)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)

def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
    """
    Creates a dataset from the metadata file and a list of repertoire files, and exports the dataset as a pickle file.

    Arguments:
        import_class: class to use for import
        params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
        dataset_name: user-defined name of the dataset

    Returns:
        RepertoireDataset object that was created
    """
    metadata = pd.read_csv(params.metadata_file, sep=",")

    ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                           f'{dataset_name}: params: metadata_file')

    PathBuilder.build(params.result_path / "repertoires/")

    # import repertoires in parallel, one task per metadata row
    arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
    with Pool(params.number_of_processes) as pool:
        repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

    new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

    # every metadata column except 'filename' is a potential label
    potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
    dataset = RepertoireDataset(labels={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)

    PickleExporter.export(dataset, params.result_path)

    return dataset

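# Hedged sketch of the metadata CSV layout import_repertoire_dataset expects: a
# 'filename' column plus arbitrary label columns; the column and file names below are
# hypothetical.
import pandas as pd

example_metadata = pd.DataFrame({"filename": ["rep1.tsv", "rep2.tsv"],
                                 "subject_id": ["s1", "s2"],
                                 "disease_status": ["healthy", "sick"]})
potential_labels = list(set(example_metadata.columns.tolist()) - {"filename"})
# every column except 'filename' becomes a potential label, e.g.
# ['subject_id', 'disease_status'] (set order may vary), each mapped to its unique values
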
def prepare_specs(self):
    with self.yaml_path.open("r") as file:
        specs = yaml.safe_load(file)

    self.instruction_name = Util.check_instruction_type(specs, DataSimulationTool.__name__, self.expected_instruction)
    self.export_format = Util.check_export_format(specs, DataSimulationTool.__name__, self.instruction_name)

    ParameterValidator.assert_keys_present(specs["definitions"], ["datasets"], DataSimulationTool.__name__, "definitions/datasets")
    ParameterValidator.assert_type_and_value(specs['definitions']['datasets'], dict, DataSimulationTool.__name__,
                                             "definitions/datasets")

    self.dataset_name = "dataset"
    Util.update_dataset_key(specs, DataSimulationTool.__name__, self.dataset_name)

    Util.check_paths(specs, DataSimulationTool.__name__)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)

def build_object(cls, **kwargs):
    ParameterValidator.assert_keys(kwargs.keys(), ['reference_path', 'comparison_attributes', 'name', 'label'],
                                   ReferenceSequenceOverlap.__name__, f"reports: {kwargs['name'] if 'name' in kwargs else ''}")

    kwargs['reference_path'] = Path(kwargs['reference_path'])
    assert kwargs['reference_path'].is_file(), f"{ReferenceSequenceOverlap.__name__}: 'reference_path' for report {kwargs['name']} is not " \
                                               f"a valid file path."

    reference_sequences_df = pd.read_csv(kwargs['reference_path'])
    attributes = reference_sequences_df.columns.tolist()
    # each comparison attribute must be a column in the reference file
    ParameterValidator.assert_keys_present(expected_values=kwargs['comparison_attributes'], values=attributes,
                                           location=ReferenceSequenceOverlap.__name__,
                                           parameter_name='columns in file under reference_path')

    return ReferenceSequenceOverlap(**kwargs)

def parse_signals(signals: dict, symbol_table: SymbolTable):
    for key, signal_spec in signals.items():
        ParameterValidator.assert_keys_present(signal_spec.keys(), SignalParser.VALID_KEYS, "SignalParser", key)

        implanting_strategy = SignalParser._get_implanting_strategy(key, signal_spec)

        ParameterValidator.assert_keys(signal_spec["motifs"], symbol_table.get_keys_by_type(SymbolType.MOTIF), "SignalParser",
                                       f"motifs in signal {key}", False)

        signal_motifs = [symbol_table.get(motif_id) for motif_id in signal_spec["motifs"]]
        signal = Signal(key, signal_motifs, implanting_strategy)
        symbol_table.add(key, SymbolType.SIGNAL, signal)

    return symbol_table, signals

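# Hedged sketch of a signals dict parse_signals could consume, assuming "motif1" was
# already registered in the symbol table as a motif; the signal name, implanting
# strategy and weights below are hypothetical placeholders.
example_signals = {
    "signal1": {
        "motifs": ["motif1"],
        "implanting": "HealthySequence",
        "sequence_position_weights": None,
    }
}
# symbol_table, parsed_signals = SignalParser.parse_signals(example_signals, symbol_table)
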
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> DatasetExportInstruction:
    location = "DatasetExportParser"
    ParameterValidator.assert_keys(list(instruction.keys()), DatasetExportParser.REQUIRED_KEYS + DatasetExportParser.OPTIONAL_KEYS,
                                   location, key, False)
    ParameterValidator.assert_keys_present(list(instruction.keys()), DatasetExportParser.REQUIRED_KEYS, location, key)

    valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, "Exporter", 'dataset_export/')
    ParameterValidator.assert_all_in_valid_list(instruction["export_formats"], valid_formats, location, "export_formats")
    ParameterValidator.assert_all_in_valid_list(instruction["datasets"], symbol_table.get_keys_by_type(SymbolType.DATASET), location,
                                                "datasets")

    # 'export_format' in the comprehension avoids shadowing the 'key' parameter used for the instruction name
    return DatasetExportInstruction(datasets=[symbol_table.get(dataset_key) for dataset_key in instruction["datasets"]],
                                    exporters=[ReflectionHandler.get_class_by_name(f"{export_format}Exporter", "dataset_export/")
                                               for export_format in instruction["export_formats"]],
                                    preprocessing_sequence=symbol_table.get(instruction["preprocessing_sequence"])
                                    if "preprocessing_sequence" in instruction else None,
                                    name=key)

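# Hedged sketch of an instruction block this parse method would accept, assuming a
# dataset "d1" is already in the symbol table; "preprocessing_sequence" is optional,
# and the names/formats are hypothetical (valid formats are discovered via reflection).
example_instruction = {"type": "DatasetExport", "datasets": ["d1"], "export_formats": ["AIRR"]}
# instruction_object = DatasetExportParser().parse("my_export_instruction", example_instruction, symbol_table)
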
def _check_instruction_specs(self, workflow_specification, location):
    ParameterValidator.assert_type_and_value(workflow_specification['instructions'], dict, location, 'instructions')

    instruction_names = list(workflow_specification['instructions'].keys())
    assert len(instruction_names) == 1, f"MultiDatasetBenchmarkTool: there can be only one instruction specified for this tool. " \
                                        f"Currently the following instructions are specified: {instruction_names}."

    ParameterValidator.assert_keys_present(workflow_specification['instructions'][instruction_names[0]].keys(), ['type', 'datasets'],
                                           location, instruction_names[0])

    instruction_type = workflow_specification['instructions'][instruction_names[0]]['type']
    assert instruction_type == 'TrainMLModel', \
        f"MultiDatasetBenchmarkTool: this tool works only with instruction of type 'TrainMLModel', got {instruction_type} instead."

    datasets_in_instruction = workflow_specification['instructions'][instruction_names[0]]['datasets']
    assert len(datasets_in_instruction) > 1, \
        f'{location}: this tool takes multiple dataset names as input, but only {len(datasets_in_instruction)} were provided: ' \
        f'{datasets_in_instruction}.'

def _get_implanting_strategy(key: str, signal: dict) -> SignalImplantingStrategy:
    # discovered class names end in "Implanting"; cls[:-10] strips that suffix to get the strategy name
    valid_strategies = [cls[:-10] for cls in ReflectionHandler.discover_classes_by_partial_name("Implanting",
                                                                                                "simulation/signal_implanting_strategy/")]
    ParameterValidator.assert_in_valid_list(signal["implanting"], valid_strategies, "SignalParser", key)

    defaults = DefaultParamsLoader.load("signal_implanting_strategy/", f"{signal['implanting']}Implanting")
    signal = {**defaults, **signal}

    ParameterValidator.assert_keys_present(list(signal.keys()), ["motifs", "implanting", "sequence_position_weights"],
                                           SignalParser.__name__, key)

    implanting_comp = None
    if 'implanting_computation' in signal:
        implanting_comp = signal['implanting_computation'].lower()
        ParameterValidator.assert_in_valid_list(implanting_comp, [el.name.lower() for el in ImplantingComputation],
                                                SignalParser.__name__, 'implanting_computation')
        implanting_comp = ImplantingComputation[implanting_comp.upper()]

    implanting_strategy = ReflectionHandler.get_class_by_name(f"{signal['implanting']}Implanting")(GappedMotifImplanting(),
                                                                                                   signal["sequence_position_weights"],
                                                                                                   implanting_comp)

    return implanting_strategy
