Exemplo n.º 1
0
    def _prepare_specs(self):
        """Load the YAML specification, validate its structure for the Galaxy train-ML-model tool,
        and rewrite result paths in place."""
        with self.yaml_path.open("r") as file:
            specs = yaml.safe_load(file)

        location = GalaxyTrainMLModel.__name__

        # top level must contain definitions + instructions; only "output" may appear in addition
        ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], location, "YAML specification")
        ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], location,
                                                    "YAML specification")

        ParameterValidator.assert_type_and_value(specs["instructions"], dict, location, "instructions")

        instruction_names = list(specs["instructions"].keys())
        assert len(instruction_names) == 1, f"{GalaxyTrainMLModel.__name__}: one instruction has to be specified under " \
                                            f"`instructions`, got the following instead: {instruction_names}."

        self.instruction_name = instruction_names[0]
        instruction_specs = specs["instructions"][self.instruction_name]

        ParameterValidator.assert_type_and_value(instruction_specs, dict, location, self.instruction_name)
        ParameterValidator.assert_keys_present(instruction_specs.keys(), ["type"], location, self.instruction_name)

        # this tool only supports the TrainMLModel instruction (class name minus "Instruction" suffix)
        expected_type = TrainMLModelInstruction.__name__[:-11]
        assert instruction_specs["type"] == expected_type, \
            f"{GalaxyTrainMLModel.__name__}: instruction `type` under {self.instruction_name} has to be {expected_type} " \
            f"for this tool."

        assert len(instruction_specs["labels"]) == 1, f"{GalaxyTrainMLModel.__name__}: one label has to be specified under " \
                                                      f"`labels`, got the following instead: {instruction_specs['labels']}."
        Util.check_paths(specs, location)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
Exemplo n.º 2
0
    def _prepare_parameters(motif_filepath: str, match_v_genes: bool, sum_counts: bool, name: str = None):
        """Validate MatchedRegexEncoder parameters, read the motif file header, and return the
        resolved parameters as a dict (including the chains derived from the regex columns)."""
        location = "MatchedRegexEncoder"

        ParameterValidator.assert_type_and_value(match_v_genes, bool, location, "match_v_genes")
        ParameterValidator.assert_type_and_value(sum_counts, bool, location, "sum_counts")

        motif_filepath = Path(motif_filepath)
        assert motif_filepath.is_file(), f"MatchedRegexEncoder: the file {motif_filepath} does not exist. " \
                                         f"Specify the correct path under motif_filepath."

        # read only the header row to discover which columns are present
        file_columns = list(pd.read_csv(motif_filepath, sep="\t", iterator=False, dtype=str, nrows=0).columns)

        valid_columns = ["id"] + [f"{chain.value}V" for chain in Chain] + [f"{chain.value}_regex" for chain in Chain]
        ParameterValidator.assert_all_in_valid_list(file_columns, valid_columns, location, "motif_filepath (column names)")

        # chains are inferred from the "<chain>_regex" column names
        chains = [column.split("_")[0] for column in file_columns if column.endswith("_regex")]
        if match_v_genes:
            # matching V genes requires a "<chain>V" column per regex column
            for chain in chains:
                assert f"{chain}V" in file_columns, f"MatchedRegexEncoder: expected column {chain}V to be present in the columns of motif_filepath. " \
                                                    f"Remove {chain}_regex from columns, or set match_v_genes to False."

        return {
            "motif_filepath": motif_filepath,
            "match_v_genes": match_v_genes,
            "sum_counts": sum_counts,
            "chains": chains,
            "name": name
        }
Exemplo n.º 3
0
    def _prepare_report_config(self, instruction_key, instruction, split_key,
                               symbol_table):
        if "reports" in instruction[split_key] and len(
                instruction[split_key]["reports"]) > 0:
            location = f"{instruction_key}/{split_key}/reports"
            report_types = list(signature(ReportConfig).parameters.keys())
            ParameterValidator.assert_all_in_valid_list(
                instruction[split_key]["reports"].keys(), report_types,
                location, "reports")

            for report_type in instruction[split_key]["reports"]:
                ParameterValidator.assert_type_and_value(
                    instruction[split_key]["reports"][report_type], list,
                    f"{location}/{report_type}", report_type)

            report_config_input = {
                report_type: {
                    report_id: symbol_table.get(report_id)
                    for report_id in instruction[split_key]["reports"]
                    [report_type]
                }
                for report_type in instruction[split_key]["reports"]
            }
        else:
            report_config_input = {}

        return report_config_input
Exemplo n.º 4
0
    def import_dataset(params, name: str) -> SequenceDataset:
        """
        Returns randomly generated sequence dataset according to the parameters;

        YAML specification:

            result_path: path/where/to/store/results/
            sequence_count: 100 # number of random sequences to generate
            length_probabilities:
                14: 0.8 # 80% of all generated sequences will have length 14
                15: 0.2 # 20% of all generated sequences will have length 15
            labels:
                epitope1: # label name
                    True: 0.5 # 50% of the sequences will have class True
                    False: 0.5 # 50% of the sequences will have class False
                epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                    1: 0.3 # 30% of the generated sequences will have class 1
                    0: 0.7 # 70% of the generated sequences will have class 0

        """
        # Fix: docstring previously described a "receptor dataset" and a
        # `chain_1_length_probabilities` key, neither of which matched the code:
        # this returns a SequenceDataset and only accepts `length_probabilities`.
        valid_keys = [
            "sequence_count", "length_probabilities", "labels", "result_path"
        ]
        # reject any unexpected parameter keys before generating
        ParameterValidator.assert_all_in_valid_list(
            list(params.keys()), valid_keys, "RandomSequenceDatasetImport",
            "params")

        return RandomDatasetGenerator.generate_sequence_dataset(
            sequence_count=params["sequence_count"],
            length_probabilities=params["length_probabilities"],
            labels=params["labels"],
            path=params["result_path"])
Exemplo n.º 5
0
    def build_object(cls, **kwargs):
        """Create a TrainingPerformance report; when no metrics are given, all Metric values are used."""
        location = "TrainingPerformance"
        valid_metrics = [metric.name for metric in Metric]

        name = kwargs.get("name")
        requested_metrics = kwargs.get("metrics", valid_metrics)
        # normalize to upper case so the comparison against Metric names is case-insensitive
        metrics = [metric.upper() for metric in requested_metrics]

        ParameterValidator.assert_all_in_valid_list(metrics, valid_metrics, location, 'metrics')

        return TrainingPerformance(set(metrics), name=name)
Exemplo n.º 6
0
    def import_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
        """Validate the parameter keys and generate a random repertoire dataset from them."""
        valid_keys = ["result_path", "repertoire_count", "sequence_count_probabilities",
                      "sequence_length_probabilities", "labels"]
        ParameterValidator.assert_all_in_valid_list(list(params.keys()), valid_keys,
                                                    "RandomRepertoireDatasetImport", "params")

        return RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=params["repertoire_count"],
            sequence_count_probabilities=params["sequence_count_probabilities"],
            sequence_length_probabilities=params["sequence_length_probabilities"],
            labels=params["labels"],
            path=params["result_path"])
Exemplo n.º 7
0
    def _update_specs(self):
        """Load the YAML spec, validate its top-level keys plus dataset/instruction sections,
        and rewrite result paths in place."""
        with self.yaml_path.open('r') as file:
            specs = yaml.safe_load(file)

        tool_name = DatasetGenerationTool.__name__

        ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"],
                                               tool_name, "YAML specification")
        ParameterValidator.assert_all_in_valid_list(specs.keys(),
                                                    ["definitions", "instructions", "output"],
                                                    tool_name, "YAML specification")

        # delegate section-specific validation to dedicated checks
        self._check_dataset(specs)
        self._check_instruction(specs)

        Util.check_paths(specs, tool_name)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
Exemplo n.º 8
0
    def _parse_ml_method(ml_method_id: str, ml_specification) -> tuple:
        """
        Build an ML method instance from its YAML specification.

        Arguments:
            ml_method_id: user-chosen identifier of the method in the specification
            ml_specification: either a bare class name (str) or a dict mapping the
                class name to its parameters, optionally alongside model selection keys

        Returns:
            tuple of (method instance, fully resolved specification dict)
        """

        # basic names of all concrete MLMethod subclasses that may be used as the method key
        valid_class_values = ReflectionHandler.all_nonabstract_subclass_basic_names(
            MLMethod, "", "ml_methods/")

        # a bare string is shorthand for "this method class with default parameters"
        if type(ml_specification) is str:
            ml_specification = {ml_specification: {}}

        # merge shared MLMethod-level defaults under the user's settings
        # (presumably model_selection_cv and model_selection_n_folds — the key-count
        # assertion below relies on exactly two default keys; TODO confirm)
        ml_specification = {
            **DefaultParamsLoader.load("ml_methods/", "MLMethod"),
            **ml_specification
        }
        ml_specification_keys = list(ml_specification.keys())

        ParameterValidator.assert_all_in_valid_list(
            list(ml_specification_keys),
            ["model_selection_cv", "model_selection_n_folds"] +
            valid_class_values, "MLParser", ml_method_id)

        # everything that is not a model selection option names the ML method class
        non_default_keys = [
            key for key in ml_specification.keys()
            if key not in ["model_selection_cv", "model_selection_n_folds"]
        ]

        # 2 default keys + exactly 1 method key == 3 total keys
        assert len(ml_specification_keys) == 3, f"MLParser: ML method {ml_method_id} was not correctly specified. Expected at least 1 key " \
                                                f"(ML method name), got {len(ml_specification_keys) - 2} instead: " \
                                                f"{str([key for key in non_default_keys])[1:-1]}."

        ml_method_class_name = non_default_keys[0]
        ml_method_class = ReflectionHandler.get_class_by_name(
            ml_method_class_name, "ml_methods/")

        # fill the method's own default parameters under the user-provided ones
        ml_specification[ml_method_class_name] = {
            **DefaultParamsLoader.load("ml_methods/",
                                       ml_method_class_name,
                                       log_if_missing=False),
            **ml_specification[ml_method_class_name]
        }

        # instantiate the method; the returned params replace the raw spec entry so the
        # specification reflects exactly what was used to create the instance
        method, params = MLParser.create_method_instance(
            ml_specification, ml_method_class, ml_method_id)
        ml_specification[ml_method_class_name] = params
        method.name = ml_method_id

        return method, ml_specification
Exemplo n.º 9
0
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: Path = None) -> SubsamplingInstruction:
        """Validate a subsampling instruction specification and build a SubsamplingInstruction."""
        location = SubsamplingParser.__name__
        valid_keys = ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"]
        ParameterValidator.assert_keys(instruction.keys(), valid_keys, location, key)

        # the referenced dataset must already exist in the symbol table
        dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
        ParameterValidator.assert_in_valid_list(instruction['dataset'], dataset_keys,
                                                location, f'{key}/dataset')

        dataset = symbol_table.get(instruction['dataset'])
        sizes = instruction['subsampled_dataset_sizes']
        ParameterValidator.assert_type_and_value(sizes, list, location,
                                                 f'{key}/subsampled_dataset_sizes')
        # each subsample size must lie between 1 and the dataset's example count
        ParameterValidator.assert_all_type_and_value(sizes, int, location,
                                                     f'{key}/subsampled_dataset_sizes', 1,
                                                     dataset.get_example_count())

        valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, 'Exporter', "dataset_export/")
        export_formats = instruction['dataset_export_formats']
        ParameterValidator.assert_type_and_value(export_formats, list, location,
                                                 f"{key}/dataset_export_formats")
        ParameterValidator.assert_all_in_valid_list(export_formats, valid_export_formats,
                                                    location, f"{key}/dataset_export_formats")

        exporters = [ReflectionHandler.get_class_by_name(export_format + "Exporter", "dataset_export/")
                     for export_format in export_formats]

        return SubsamplingInstruction(dataset=dataset,
                                      subsampled_dataset_sizes=sizes,
                                      dataset_export_formats=exporters,
                                      name=key)
Exemplo n.º 10
0
    def parse_exporters(self, instruction):
        """Resolve the exporter classes named under `export_formats`;
        returns None when no export formats are specified."""
        if instruction["export_formats"] is None:
            return None

        class_path = "dataset_export/"
        valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, 'Exporter', class_path)
        ParameterValidator.assert_all_in_valid_list(instruction["export_formats"],
                                                    valid_formats,
                                                    location="SimulationParser",
                                                    parameter_name="export_formats")
        # map each format name to its "<Format>Exporter" class
        return [ReflectionHandler.get_class_by_name(f"{item}Exporter", class_path)
                for item in instruction["export_formats"]]
Exemplo n.º 11
0
    def parse_encoder(key: str, specs: dict):
        """Resolve the encoder class and its parameters from the specification under `key`."""
        class_path = "encodings"
        valid_encoders = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DatasetEncoder, "Encoder", class_path)
        encoder = ObjectParser.get_class(specs, valid_encoders, "Encoder",
                                         class_path, "EncodingParser", key)
        # strip the "Encoder" suffix to find the default-parameter file name
        params = ObjectParser.get_all_params(specs, class_path, encoder.__name__[:-7], key)

        # only the encoder's __init__ parameters (minus self) are valid spec keys
        init_params = inspect.signature(encoder.__init__).parameters.keys()
        required_params = [param for param in init_params if param != "self"]
        ParameterValidator.assert_all_in_valid_list(
            params.keys(), required_params, "EncoderParser",
            f"{key}/{encoder.__name__.replace('Encoder', '')}")

        return encoder, params
Exemplo n.º 12
0
    def build_object(cls, **kwargs):
        """Create a Coefficients report from keyword arguments, validating the cutoff and
        n_largest settings when the corresponding plot options are requested."""
        location = "Coefficients"
        coefs_to_plot = [coef.upper() for coef in kwargs["coefs_to_plot"]]
        name = kwargs.get("name")

        valid_settings = [item.name.upper() for item in CoefficientPlottingSetting]
        ParameterValidator.assert_all_in_valid_list(coefs_to_plot, valid_settings,
                                                    location, "coefs_to_plot")

        # cutoff values are required (and validated) only when CUTOFF plotting is requested
        cutoff = []
        if CoefficientPlottingSetting.CUTOFF.name in coefs_to_plot:
            cutoff = kwargs["cutoff"]
            ParameterValidator.assert_type_and_value(cutoff, list, location, "cutoff")
            ParameterValidator.assert_all_type_and_value(cutoff, Number, location, "cutoff",
                                                         min_inclusive=1e-15)

        # n_largest values are required (and validated) only when N_LARGEST plotting is requested
        n_largest = []
        if CoefficientPlottingSetting.N_LARGEST.name in coefs_to_plot:
            n_largest = kwargs["n_largest"]
            ParameterValidator.assert_type_and_value(n_largest, list, location, "n_largest")
            ParameterValidator.assert_all_type_and_value(n_largest, int, location, "n_largest",
                                                         min_inclusive=1)

        coefs = CoefficientPlottingSettingList()
        for keyword in coefs_to_plot:
            coefs.append(CoefficientPlottingSetting[keyword.upper()])

        return Coefficients(coefs_to_plot=coefs, cutoff=cutoff,
                            n_largest=n_largest, name=name)
Exemplo n.º 13
0
    def _extract_reports(self):
        """Read benchmark report definitions from the specification file, validate them,
        and store the instantiated report objects on self.reports."""
        with self.specification_path.open("r") as file:
            workflow_specification = yaml.safe_load(file)

        # assumes the first instruction carries the benchmark_reports list — TODO confirm
        first_instruction = list(workflow_specification['instructions'].values())[0]
        report_keys = first_instruction['benchmark_reports']

        defined_reports = workflow_specification['definitions']['reports']
        ParameterValidator.assert_all_in_valid_list(report_keys,
                                                    list(defined_reports.keys()),
                                                    MultiDatasetBenchmarkTool.__name__,
                                                    "benchmark_reports")

        # keep only the report definitions that were referenced by the instruction
        reports = {name: spec for name, spec in defined_reports.items() if name in report_keys}
        symbol_table, _ = ReportParser.parse_reports(reports, SymbolTable())
        self.reports = [entry.item for entry in symbol_table.get_by_type(SymbolType.REPORT)]
Exemplo n.º 14
0
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: Path = None) -> DatasetExportInstruction:
        """Validate a dataset export instruction specification and build a DatasetExportInstruction."""
        location = "DatasetExportParser"
        allowed_keys = DatasetExportParser.REQUIRED_KEYS + DatasetExportParser.OPTIONAL_KEYS
        ParameterValidator.assert_keys(list(instruction.keys()), allowed_keys, location, key, False)
        ParameterValidator.assert_keys_present(list(instruction.keys()),
                                               DatasetExportParser.REQUIRED_KEYS, location, key)

        valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, "Exporter", 'dataset_export/')
        ParameterValidator.assert_all_in_valid_list(instruction["export_formats"],
                                                    valid_formats, location, "export_formats")
        ParameterValidator.assert_all_in_valid_list(instruction["datasets"],
                                                    symbol_table.get_keys_by_type(SymbolType.DATASET),
                                                    location, "datasets")

        datasets = [symbol_table.get(dataset_key) for dataset_key in instruction["datasets"]]
        # loop variable deliberately named export_format to avoid shadowing the `key` parameter
        exporters = [ReflectionHandler.get_class_by_name(f"{export_format}Exporter", "dataset_export/")
                     for export_format in instruction["export_formats"]]
        preprocessing_sequence = symbol_table.get(instruction["preprocessing_sequence"]) \
            if "preprocessing_sequence" in instruction else None

        return DatasetExportInstruction(datasets=datasets,
                                        exporters=exporters,
                                        preprocessing_sequence=preprocessing_sequence,
                                        name=key)