Example #1
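An integration test for sequence classification: the dataset is encoded with continuous 3-mer frequencies (relative frequencies over unique reads), a LogisticRegression model is trained through TrainMLModelInstruction, and both the assessment and selection loops use SplitConfig(SplitType.RANDOM, 1, 0.5), i.e., a single random split with 50% of the data for training, optimized on balanced accuracy.
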
    def test(self):

        path = EnvironmentSettings.tmp_test_path + "integration_sequence_classification/"
        dataset = self.create_dataset(path)

        os.environ["cache_type"] = "test"
        encoder_params = {
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "k": 3
        }

        hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(
            dataset, **encoder_params),
                               encoder_params=encoder_params,
                               ml_method=LogisticRegression(),
                               ml_params={
                                   "model_selection_cv": False,
                                   "model_selection_n_folds": -1
                               },
                               preproc_sequence=[])

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        instruction = TrainMLModelInstruction(
            dataset, GridSearch([hp_setting]), [hp_setting],
            SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
            SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)

        result = instruction.run(result_path=path)

        shutil.rmtree(path)
Example #2
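A unit test for manual splitting: ten repertoires (rep_0 through rep_9) are generated, two metadata CSVs assign six of them to training and four to test by subject_id, and ManualSplitter._split_repertoire_dataset is expected to reproduce exactly that partition, including the per-split metadata files.
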
    def test__split_repertoire_dataset(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "manual_splitter/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {4: 1}, {3: 1}, {}, path)

        train_metadata = pd.DataFrame({"subject_id": ["rep_1", "rep_2", "rep_4", "rep_5", "rep_9", "rep_7"]})
        train_metadata.to_csv(path + "train.csv")

        test_metadata = pd.DataFrame({"subject_id": ["rep_0", "rep_3", "rep_6", "rep_8"]})
        test_metadata.to_csv(path + "test.csv")

        train_datasets, test_datasets = ManualSplitter._split_repertoire_dataset(
            DataSplitterParams(dataset, SplitType.MANUAL, split_count=1, paths=[path + 'result/'],
                               split_config=SplitConfig(manual_config=ManualSplitConfig(path + "train.csv",
                                                                                        path + "test.csv"),
                                                        split_count=1, split_strategy=SplitType.MANUAL)))

        self.assertEqual(1, len(train_datasets))
        self.assertEqual(1, len(test_datasets))
        self.assertEqual(6, train_datasets[0].get_example_count())
        self.assertEqual(4, test_datasets[0].get_example_count())
        self.assertTrue(all(subject_id in ["rep_1", "rep_2", "rep_4", "rep_5", "rep_9", "rep_7"]
                            for subject_id in train_datasets[0].get_metadata(["subject_id"])["subject_id"]))
        self.assertTrue(all(subject_id in ["rep_0", "rep_3", "rep_6", "rep_8"]
                            for subject_id in test_datasets[0].get_metadata(["subject_id"])["subject_id"]))
        self.assertTrue(os.path.isfile(train_datasets[0].metadata_file))
        self.assertTrue(os.path.isfile(test_datasets[0].metadata_file))

        shutil.rmtree(path)
Example #3
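A helper that finalizes SplitConfig objects before cross-validation: for LOOCV the split count becomes the number of examples, for k-fold the train/validation portion is (k - 1)/k of the dataset, and otherwise it is determined by training_percentage. If the selection loop also uses LOOCV, its split count is set to the number of examples left after the assessment split.
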
    def _update_split_configs(
            self, assessment: SplitConfig, selection: SplitConfig,
            dataset: Dataset) -> Tuple[SplitConfig, SplitConfig]:

        if assessment.split_strategy == SplitType.LOOCV:
            assessment.split_count = dataset.get_example_count()
            train_val_example_count = assessment.split_count - 1
        elif assessment.split_strategy == SplitType.K_FOLD:
            train_val_example_count = int(dataset.get_example_count() *
                                          (assessment.split_count - 1) /
                                          assessment.split_count)
        else:
            train_val_example_count = int(dataset.get_example_count() *
                                          assessment.training_percentage)

        if selection.split_strategy == SplitType.LOOCV:
            selection.split_count = train_val_example_count

        return assessment, selection
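
The branch logic above is simple arithmetic; here is a minimal standalone sketch, assuming a hypothetical dataset of 100 examples (not part of the original code):

    example_count = 100  # hypothetical dataset size

    # K_FOLD with split_count=5: (k - 1) / k of the data remains for training/validation
    k_fold_train_val = int(example_count * (5 - 1) / 5)  # 80

    # LOOCV: split_count equals the example count, one example held out per split
    loocv_train_val = example_count - 1  # 99

    # any other strategy (e.g. RANDOM) uses training_percentage, here 0.7
    random_train_val = int(example_count * 0.7)  # 70

    print(k_fold_train_val, loocv_train_val, random_train_val)  # 80 99 70

Example #4
A fixture builder used by the tests: 34 identical repertoires with two labels (l1 with values 1/2, l2 with values 0/1) are assembled into a RepertoireDataset, a Word2Vec (k=3, vector_size 4) plus LogisticRegression HPSetting is defined, and a full TrainMLModelInstruction run with single random 50/50 splits returns the resulting TrainMLModelState.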
    def _create_state_object(self, path):
        repertoires, metadata = RepertoireBuilder.build(sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
                                                        path=path,
                                                        labels={
                                                            # l1 alternates 1/2; l2 runs four 0/0/1/1 blocks, then alternates 0/1
                                                            "l1": [1, 2] * 17,
                                                            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                    params={"l1": [1, 2], "l2": [0, 1]})
        enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                                 LogisticRegression(),
                                 {"model_selection_cv": False, "model_selection_n_folds": -1},
                                 [])]

        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          SplitConfig(SplitType.RANDOM, 1, 0.5),
                                          SplitConfig(SplitType.RANDOM, 1, 0.5),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)

        state = process.run(result_path=path)

        return state
Example #5
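Runs a TrainMLModelInstruction through SemanticModel: 32 repertoires alternate between two sequence patterns and carry a matching binary label, a Word2Vec encoder (vector_size 8, 3-mers) feeds a LogisticRegression model, and both CV loops use a single random 50/50 split.
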
    def test_run(self):

        path = EnvironmentSettings.root_path + "test/tmp/smmodel/"
        PathBuilder.build(path)
        # 32 repertoires alternating between the two sequence patterns, labelled 1 and 2 accordingly
        repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]] * 16, path,
                                                        {"default": [1, 2] * 16})
        dataset = RepertoireDataset(repertoires=repertoires,
                                    params={"default": [1, 2]},
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("default", [1, 2])

        enc_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
        hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                                 LogisticRegression(),
                                 {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

        split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
        split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

        instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                              split_config_assessment,
                                              split_config_selection,
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                              label_config, path)
        semantic_model = SemanticModel([instruction], path)

        semantic_model.run()

        shutil.rmtree(path)
Example #6
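The parser that turns the assessment/selection part of an instruction specification into a SplitConfig: defaults are merged in, the split strategy and training percentage are read, a configuration that leaves no data for evaluation (training_percentage of 1 with more than one setting) is rejected with an actionable error, and optional manual and leave-one-out configs are passed through.
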
    def _parse_split_config(self, instruction_key, instruction: dict,
                            split_key: str, symbol_table: SymbolTable,
                            settings_count: int) -> SplitConfig:

        try:

            default_params = DefaultParamsLoader.load("instructions/",
                                                      SplitConfig.__name__)
            report_config_input = self._prepare_report_config(
                instruction_key, instruction, split_key, symbol_table)
            instruction[split_key] = {
                **default_params,
                **instruction[split_key]
            }

            split_strategy = SplitType[instruction[split_key]
                                       ["split_strategy"].upper()]
            training_percentage = float(
                instruction[split_key]["training_percentage"]
            ) if split_strategy == SplitType.RANDOM else -1

            if split_strategy == SplitType.RANDOM and training_percentage == 1 and settings_count > 1:
                raise ValueError(
                    f"{TrainMLModelParser.__name__}: all data under {instruction_key}/{split_key} was specified to be used for "
                    f"training, but {settings_count} settings were specified for evaluation. Please define a test/validation set by "
                    f"reducing the training percentage (e.g., to 0.7) or use only one hyperparameter setting to run the analysis."
                )

            return SplitConfig(
                split_strategy=split_strategy,
                split_count=int(instruction[split_key]["split_count"]),
                training_percentage=training_percentage,
                reports=ReportConfig(**report_config_input),
                manual_config=ManualSplitConfig(
                    **instruction[split_key]["manual_config"])
                if "manual_config" in instruction[split_key] else None,
                leave_one_out_config=LeaveOneOutConfig(
                    **instruction[split_key]["leave_one_out_config"])
                if "leave_one_out_config" in instruction[split_key] else None)

        except KeyError as key_error:
            raise KeyError(
                f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under {split_key}."
            )
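
For orientation, a minimal sketch of the dictionary shape this method reads from instruction[split_key] once the defaults are merged in; the key names are taken from the accesses above, while the values are illustrative only:

    # illustrative only: the split section consumed by _parse_split_config
    split_section = {
        "split_strategy": "random",       # upper-cased and looked up in SplitType
        "split_count": 1,
        "training_percentage": 0.7,       # consulted only when the strategy is RANDOM
        # "manual_config": {...},         # optional; forwarded to ManualSplitConfig
        # "leave_one_out_config": {...},  # optional; forwarded to LeaveOneOutConfig
    }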
Example #7
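A test for the CVFeaturePerformance report, which compares performance across values of an encoder hyperparameter (here p_value_threshold). The first generate_report call on an incomplete state is expected to raise a RuntimeWarning; once the assessment and selection states are populated with random accuracies for the two remaining settings, the report should yield one figure and two tables.
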
    def test_generate(self):
        path = EnvironmentSettings.tmp_test_path + "cv_feature_performance/"

        state = TrainMLModelState(
            assessment=SplitConfig(split_count=5,
                                   split_strategy=SplitType.K_FOLD),
            selection=SplitConfig(split_count=10,
                                  split_strategy=SplitType.K_FOLD),
            optimization_metric=Metric.ACCURACY,
            label_configuration=LabelConfiguration(
                labels=[Label(name="CMV", values=[True, False])]),
            hp_settings=[
                HPSetting(encoder_params={"p_value_threshold": 0.001},
                          encoder_name="e1",
                          encoder=SequenceAbundanceEncoder([], 0, 0, 0),
                          preproc_sequence=[],
                          ml_method_name="ml1",
                          ml_method=ProbabilisticBinaryClassifier(10, 0.1),
                          ml_params={}),
                HPSetting(encoder_params={"p_value_threshold": 0.01},
                          encoder_name="e2",
                          encoder=SequenceAbundanceEncoder([], 0, 0, 0),
                          preproc_sequence=[],
                          ml_method_name="ml1",
                          ml_method=ProbabilisticBinaryClassifier(10, 0.1),
                          ml_params={}),
                HPSetting(encoder_params={"p_value_threshold": 0.01},
                          encoder=SequenceAbundanceEncoder([], 0, 0, 0),
                          preproc_sequence=[],
                          ml_method=ProbabilisticBinaryClassifier(10, 0.01),
                          ml_params={})
            ],
            dataset=None,
            hp_strategy=None,
            metrics=None)

        report = CVFeaturePerformance("p_value_threshold",
                                      state,
                                      path,
                                      is_feature_axis_categorical=True,
                                      name="report1")
        with self.assertWarns(RuntimeWarning):
            report.generate_report()

        state.hp_settings = state.hp_settings[:2]

        state.assessment_states = [
            HPAssessmentState(i, None, None, None, state.label_configuration)
            for i in range(state.assessment.split_count)
        ]
        for assessment_state in state.assessment_states:
            assessment_state.label_states["CMV"] = HPLabelState("CMV", [])
            assessment_state.label_states["CMV"].assessment_items = {
                setting.get_key():
                HPItem(performance={'accuracy': random.uniform(0.5, 1)},
                       hp_setting=setting)
                for setting in state.hp_settings
            }
            assessment_state.label_states[
                "CMV"].selection_state = HPSelectionState(
                    [], [], "", GridSearch(state.hp_settings))
            assessment_state.label_states["CMV"].selection_state.hp_items = {
                str(setting): [
                    HPItem(performance={'accuracy': random.uniform(0.5, 1)},
                           hp_setting=setting)
                    for _ in range(state.selection.split_count)
                ]
                for setting in state.hp_settings
            }

        report.state = state

        report_result = report.generate_report()

        self.assertTrue(isinstance(report_result, ReportResult))
        self.assertEqual(2, len(report_result.output_tables))
        self.assertEqual(1, len(report_result.output_figures))
        self.assertTrue(os.path.isfile(report_result.output_figures[0].path))
        self.assertTrue(os.path.isfile(report_result.output_tables[0].path))
        self.assertTrue(os.path.isfile(report_result.output_tables[1].path))

        shutil.rmtree(path)
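Example #8
An end-to-end TrainMLModelInstruction run with two competing HPSettings (Word2Vec with LogisticRegression versus Word2Vec with SVM plus a ClonesPerRepertoireFilter preprocessing step), a SequenceLengthDistribution report attached to the data splits of both CV loops, and two labels; the test checks that the returned TrainMLModelState contains assessment results for both labels.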
    def test_run(self):

        path = EnvironmentSettings.tmp_test_path + "hpoptimproc/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
            path=path,
            labels={
                "l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    params={
                                        "l1": [1, 2],
                                        "l2": [0, 1]
                                    })
        enc1 = {
            "k": 3,
            "model_type": ModelType.SEQUENCE.name,
            "vector_size": 4
        }
        enc2 = {
            "k": 3,
            "model_type": ModelType.SEQUENCE.name,
            "vector_size": 6
        }
        hp_settings = [
            HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                      LogisticRegression(), {
                          "model_selection_cv": False,
                          "model_selection_n_folds": -1
                      }, []),
            HPSetting(
                Word2VecEncoder.build_object(dataset, **enc2), enc2, SVM(), {
                    "model_selection_cv": False,
                    "model_selection_n_folds": -1
                },
                [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)])
        ]

        report = SequenceLengthDistribution()
        label_config = LabelConfiguration(
            [Label("l1", [1, 2]), Label("l2", [0, 1])])

        process = TrainMLModelInstruction(
            dataset, GridSearch(hp_settings), hp_settings,
            SplitConfig(SplitType.RANDOM,
                        1,
                        0.5,
                        reports=ReportConfig(data_splits={"seqlen": report})),
            SplitConfig(SplitType.RANDOM,
                        1,
                        0.5,
                        reports=ReportConfig(data_splits={"seqlen": report})),
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config,
            path)

        state = process.run(result_path=path)

        self.assertTrue(isinstance(state, TrainMLModelState))
        self.assertEqual(1, len(state.assessment_states))
        self.assertTrue("l1" in state.assessment_states[0].label_states)
        self.assertTrue("l2" in state.assessment_states[0].label_states)

        shutil.rmtree(path)