def test(self):
    """Smoke-test TrainMLModelInstruction with a k-mer frequency encoder and logistic regression.

    The dataset comes from ``self.create_dataset``; the instruction is run once with a
    single hyperparameter setting and the output folder is removed afterwards.
    """
    path = EnvironmentSettings.tmp_test_path + "integration_sequence_classification/"
    dataset = self.create_dataset(path)

    os.environ["cache_type"] = "test"

    encoder_params = dict(
        normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
        reads=ReadsType.UNIQUE.name,
        sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
        k=3,
    )
    encoder = KmerFrequencyEncoder.build_object(dataset, **encoder_params)
    hp_setting = HPSetting(
        encoder=encoder,
        encoder_params=encoder_params,
        ml_method=LogisticRegression(),
        ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
        preproc_sequence=[],
    )

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    # Same argument order as the original call: strategy and settings first, then the
    # assessment and selection split configurations.
    hp_strategy = GridSearch([hp_setting])
    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())

    instruction = TrainMLModelInstruction(
        dataset,
        hp_strategy,
        [hp_setting],
        assessment_split,
        selection_split,
        {Metric.BALANCED_ACCURACY},
        Metric.BALANCED_ACCURACY,
        lc,
        path,
    )

    result = instruction.run(result_path=path)

    shutil.rmtree(path)
def test__split_repertoire_dataset(self):
    """Check that ManualSplitter splits a repertoire dataset according to train/test metadata CSVs."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "manual_splitter/")
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {4: 1}, {3: 1}, {}, path)

    train_ids = ["rep_1", "rep_2", "rep_4", "rep_5", "rep_9", "rep_7"]
    test_ids = ["rep_0", "rep_3", "rep_6", "rep_8"]
    train_csv = path + "train.csv"
    test_csv = path + "test.csv"

    # Write the manual split definition files consumed by the splitter.
    train_metadata = pd.DataFrame({"subject_id": train_ids})
    train_metadata.to_csv(train_csv)
    test_metadata = pd.DataFrame({"subject_id": test_ids})
    test_metadata.to_csv(test_csv)

    split_config = SplitConfig(
        manual_config=ManualSplitConfig(train_csv, test_csv),
        split_count=1,
        split_strategy=SplitType.MANUAL,
    )
    splitter_params = DataSplitterParams(
        dataset,
        SplitType.MANUAL,
        split_count=1,
        paths=[path + 'result/'],
        split_config=split_config,
    )
    train_datasets, test_datasets = ManualSplitter._split_repertoire_dataset(splitter_params)

    self.assertEqual(1, len(train_datasets))
    self.assertEqual(1, len(test_datasets))

    train_split = train_datasets[0]
    test_split = test_datasets[0]

    self.assertEqual(6, train_split.get_example_count())
    self.assertEqual(4, test_split.get_example_count())

    found_train_ids = train_split.get_metadata(["subject_id"])["subject_id"]
    self.assertTrue(all(subject_id in train_ids for subject_id in found_train_ids))
    found_test_ids = test_split.get_metadata(["subject_id"])["subject_id"]
    self.assertTrue(all(subject_id in test_ids for subject_id in found_test_ids))

    self.assertTrue(os.path.isfile(train_split.metadata_file))
    self.assertTrue(os.path.isfile(test_split.metadata_file))

    shutil.rmtree(path)
def _update_split_configs(self, assessment: SplitConfig, selection: SplitConfig, dataset: Dataset) -> Tuple[SplitConfig, SplitConfig]:
    """Resolve leave-one-out split counts from the size of the dataset.

    Both configs are modified in place and also returned.

    Args:
        assessment: split configuration of the outer (assessment) loop; for LOOCV its
            ``split_count`` is set to the number of examples in ``dataset``.
        selection: split configuration of the inner (selection) loop; for LOOCV its
            ``split_count`` is set to the estimated number of examples that remain for
            training/validation after the assessment split.
        dataset: dataset whose ``get_example_count()`` drives the computation.

    Returns:
        The ``(assessment, selection)`` pair.
    """
    example_count = dataset.get_example_count()
    assessment_strategy = assessment.split_strategy

    if assessment_strategy == SplitType.LOOCV:
        # One split per example; each split holds out exactly one example.
        assessment.split_count = example_count
        train_val_example_count = assessment.split_count - 1
    elif assessment_strategy == SplitType.K_FOLD:
        fold_count = assessment.split_count
        # Size of the k-1 training folds, truncated to an integer.
        train_val_example_count = int(example_count * (fold_count - 1) / fold_count)
    else:
        train_val_example_count = int(example_count * assessment.training_percentage)

    if selection.split_strategy == SplitType.LOOCV:
        selection.split_count = train_val_example_count

    return assessment, selection
def _create_state_object(self, path):
    """Run a small TrainMLModelInstruction and return the state it produces.

    Builds 34 identical repertoires with two labels ("l1" and "l2"), encodes them with
    Word2Vec, fits logistic regression on a single random 50/50 split for both
    assessment and selection, and returns the result of ``run``.
    """
    l1_values = [1, 2] * 17
    l2_values = [0, 0, 1, 1] * 4 + [0, 1] * 9
    # One repertoire per label entry, all with the same three sequences.
    sequences = [["AAA", "CCC", "DDD"] for _ in range(len(l1_values))]

    repertoires, metadata = RepertoireBuilder.build(
        sequences=sequences,
        path=path,
        labels={"l1": l1_values, "l2": l2_values},
    )

    dataset = RepertoireDataset(
        repertoires=repertoires,
        metadata_file=metadata,
        params={"l1": [1, 2], "l2": [0, 1]},
    )

    enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    setting = HPSetting(
        Word2VecEncoder.build_object(dataset, **enc_params),
        enc_params,
        LogisticRegression(),
        {"model_selection_cv": False, "model_selection_n_folds": -1},
        [],
    )
    hp_settings = [setting]

    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    hp_strategy = GridSearch(hp_settings)
    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5)
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5)

    process = TrainMLModelInstruction(
        dataset,
        hp_strategy,
        hp_settings,
        assessment_split,
        selection_split,
        {Metric.BALANCED_ACCURACY},
        Metric.BALANCED_ACCURACY,
        label_config,
        path,
    )

    return process.run(result_path=path)
def test_run(self):
    """Run a SemanticModel that wraps a single TrainMLModelInstruction and clean up afterwards."""
    path = EnvironmentSettings.root_path + "test/tmp/smmodel/"
    PathBuilder.build(path)

    label_values = [1, 2] * 16
    # Repertoires alternate between two sequences and one sequence, one per label entry.
    sequences = [
        ["AAA", "CCC"] if index % 2 == 0 else ["TTTT"]
        for index in range(len(label_values))
    ]
    repertoires, metadata = RepertoireBuilder.build(sequences, path, {"default": label_values})

    dataset = RepertoireDataset(
        repertoires=repertoires,
        params={"default": [1, 2]},
        metadata_file=metadata,
    )

    label_config = LabelConfiguration()
    label_config.add_label("default", [1, 2])

    enc_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
    hp_settings = [
        HPSetting(
            Word2VecEncoder.build_object(dataset, **enc_params),
            dict(enc_params),
            LogisticRegression(),
            {"model_selection_cv": False, "model_selection_n_folds": -1},
            [],
        )
    ]

    split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

    instruction = TrainMLModelInstruction(
        dataset,
        GridSearch(hp_settings),
        hp_settings,
        split_config_assessment,
        split_config_selection,
        {Metric.BALANCED_ACCURACY},
        Metric.BALANCED_ACCURACY,
        label_config,
        path,
    )

    semantic_model = SemanticModel([instruction], path)
    semantic_model.run()

    shutil.rmtree(path)
def _parse_split_config(self, instruction_key, instruction: dict, split_key: str, symbol_table: SymbolTable,
                        settings_count: int) -> SplitConfig:
    """Build the SplitConfig described under ``instruction[split_key]``.

    Default parameters loaded through ``DefaultParamsLoader`` are merged underneath the
    user-supplied values, and the merged dict is written back to ``instruction[split_key]``,
    so the caller's ``instruction`` is updated in place.

    Args:
        instruction_key: name of the instruction; used for report preparation and in error messages.
        instruction: the instruction specification; must contain ``split_key``.
        split_key: key of the split section inside ``instruction`` to parse.
        symbol_table: passed on to ``_prepare_report_config``.
        settings_count: number of hyperparameter settings defined for the instruction.

    Returns:
        The SplitConfig built from the merged parameters. ``training_percentage`` is only
        read for the RANDOM split strategy and is -1 otherwise.

    Raises:
        ValueError: if a RANDOM split uses all data for training (``training_percentage == 1``)
            while more than one hyperparameter setting has to be evaluated.
        KeyError: if a required parameter is missing under ``split_key``. The original
            KeyError is attached as ``__cause__``.
    """
    try:
        default_params = DefaultParamsLoader.load("instructions/", SplitConfig.__name__)
        report_config_input = self._prepare_report_config(instruction_key, instruction, split_key, symbol_table)

        # User-specified values take precedence over the defaults.
        instruction[split_key] = {**default_params, **instruction[split_key]}
        split_params = instruction[split_key]

        # NOTE(review): assuming SplitType is an Enum, an unknown strategy name also raises
        # KeyError here and is reported below as "was not defined" - confirm this is intended.
        split_strategy = SplitType[split_params["split_strategy"].upper()]

        if split_strategy == SplitType.RANDOM:
            training_percentage = float(split_params["training_percentage"])
        else:
            training_percentage = -1

        if split_strategy == SplitType.RANDOM and training_percentage == 1 and settings_count > 1:
            raise ValueError(
                f"{TrainMLModelParser.__name__}: all data under {instruction_key}/{split_key} was specified to be used for "
                f"training, but {settings_count} settings were specified for evaluation. Please define a test/validation set by "
                f"reducing the training percentage (e.g., to 0.7) or use only one hyperparameter setting to run the analysis."
            )

        if "manual_config" in split_params:
            manual_config = ManualSplitConfig(**split_params["manual_config"])
        else:
            manual_config = None

        if "leave_one_out_config" in split_params:
            leave_one_out_config = LeaveOneOutConfig(**split_params["leave_one_out_config"])
        else:
            leave_one_out_config = None

        return SplitConfig(
            split_strategy=split_strategy,
            split_count=int(split_params["split_count"]),
            training_percentage=training_percentage,
            reports=ReportConfig(**report_config_input),
            manual_config=manual_config,
            leave_one_out_config=leave_one_out_config,
        )

    except KeyError as key_error:
        # Chain explicitly so the traceback shows the failed lookup as the direct cause.
        raise KeyError(
            f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under {split_key}."
        ) from key_error
def test_generate(self):
    """Exercise CVFeaturePerformance.generate_report on a hand-built TrainMLModelState.

    The first call, made with three hyperparameter settings and no assessment states,
    is expected to emit a RuntimeWarning. The state is then reduced to two settings and
    filled with random accuracies, after which the report must produce two tables and
    one figure on disk.
    """
    path = EnvironmentSettings.tmp_test_path + "cv_feature_performance/"

    label_name = "CMV"

    hp_settings = [
        HPSetting(
            encoder_params={"p_value_threshold": 0.001},
            encoder_name="e1",
            encoder=SequenceAbundanceEncoder([], 0, 0, 0),
            preproc_sequence=[],
            ml_method_name="ml1",
            ml_method=ProbabilisticBinaryClassifier(10, 0.1),
            ml_params={},
        ),
        HPSetting(
            encoder_params={"p_value_threshold": 0.01},
            encoder_name="e2",
            encoder=SequenceAbundanceEncoder([], 0, 0, 0),
            preproc_sequence=[],
            ml_method_name="ml1",
            ml_method=ProbabilisticBinaryClassifier(10, 0.1),
            ml_params={},
        ),
        HPSetting(
            encoder_params={"p_value_threshold": 0.01},
            encoder=SequenceAbundanceEncoder([], 0, 0, 0),
            preproc_sequence=[],
            ml_method=ProbabilisticBinaryClassifier(10, 0.01),
            ml_params={},
        ),
    ]

    state = TrainMLModelState(
        assessment=SplitConfig(split_count=5, split_strategy=SplitType.K_FOLD),
        selection=SplitConfig(split_count=10, split_strategy=SplitType.K_FOLD),
        optimization_metric=Metric.ACCURACY,
        label_configuration=LabelConfiguration(labels=[Label(name=label_name, values=[True, False])]),
        hp_settings=hp_settings,
        dataset=None,
        hp_strategy=None,
        metrics=None,
    )

    report = CVFeaturePerformance(
        "p_value_threshold",
        state,
        path,
        is_feature_axis_categorical=True,
        name="report1",
    )

    with self.assertWarns(RuntimeWarning):
        report.generate_report()

    # Keep only the first two settings for the second run.
    state.hp_settings = state.hp_settings[:2]

    state.assessment_states = [
        HPAssessmentState(split_index, None, None, None, state.label_configuration)
        for split_index in range(state.assessment.split_count)
    ]

    for assessment_state in state.assessment_states:
        assessment_state.label_states[label_name] = HPLabelState("CMV", [])
        label_state = assessment_state.label_states[label_name]

        # One randomly scored assessment item per hyperparameter setting.
        assessment_items = {}
        for setting in state.hp_settings:
            assessment_items[setting.get_key()] = HPItem(
                performance={'accuracy': random.uniform(0.5, 1)},
                hp_setting=setting,
            )
        label_state.assessment_items = assessment_items

        label_state.selection_state = HPSelectionState([], [], "", GridSearch(state.hp_settings))

        # One randomly scored selection item per setting and selection split.
        selection_items = {}
        for setting in state.hp_settings:
            selection_items[str(setting)] = [
                HPItem(performance={'accuracy': random.uniform(0.5, 1)}, hp_setting=setting)
                for _ in range(state.selection.split_count)
            ]
        label_state.selection_state.hp_items = selection_items

    report.state = state

    report_result = report.generate_report()

    self.assertTrue(isinstance(report_result, ReportResult))
    self.assertEqual(2, len(report_result.output_tables))
    self.assertEqual(1, len(report_result.output_figures))

    figure = report_result.output_figures[0]
    first_table, second_table = report_result.output_tables[0], report_result.output_tables[1]
    self.assertTrue(os.path.isfile(figure.path))
    self.assertTrue(os.path.isfile(first_table.path))
    self.assertTrue(os.path.isfile(second_table.path))

    shutil.rmtree(path)
def test_run(self):
    """Run TrainMLModelInstruction with two hyperparameter settings and two labels.

    Uses 34 identical repertoires, Word2Vec encodings of size 4 and 6 (the second
    combined with an SVM and a clone-count filter), a sequence length distribution
    report on the data splits, and a single random 50/50 split on both levels.
    Verifies the type of the returned state and that both labels were processed.
    """
    path = EnvironmentSettings.tmp_test_path + "hpoptimproc/"
    PathBuilder.build(path)

    l1_values = [1, 2] * 17
    l2_values = [0, 0, 1, 1] * 4 + [0, 1] * 9
    # One repertoire per label entry, all with the same three sequences.
    sequences = [["AAA", "CCC", "DDD"] for _ in range(len(l1_values))]

    repertoires, metadata = RepertoireBuilder.build(
        sequences=sequences,
        path=path,
        labels={"l1": l1_values, "l2": l2_values},
    )

    dataset = RepertoireDataset(
        repertoires=repertoires,
        metadata_file=metadata,
        params={"l1": [1, 2], "l2": [0, 1]},
    )

    enc1 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    enc2 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 6}

    logistic_setting = HPSetting(
        Word2VecEncoder.build_object(dataset, **enc1),
        enc1,
        LogisticRegression(),
        {"model_selection_cv": False, "model_selection_n_folds": -1},
        [],
    )
    svm_setting = HPSetting(
        Word2VecEncoder.build_object(dataset, **enc2),
        enc2,
        SVM(),
        {"model_selection_cv": False, "model_selection_n_folds": -1},
        [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)],
    )
    hp_settings = [logistic_setting, svm_setting]

    report = SequenceLengthDistribution()
    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    hp_strategy = GridSearch(hp_settings)
    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig(data_splits={"seqlen": report}))
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig(data_splits={"seqlen": report}))

    process = TrainMLModelInstruction(
        dataset,
        hp_strategy,
        hp_settings,
        assessment_split,
        selection_split,
        {Metric.BALANCED_ACCURACY},
        Metric.BALANCED_ACCURACY,
        label_config,
        path,
    )

    state = process.run(result_path=path)

    self.assertIsInstance(state, TrainMLModelState)
    self.assertEqual(1, len(state.assessment_states))

    label_states = state.assessment_states[0].label_states
    self.assertIn("l1", label_states)
    self.assertIn("l2", label_states)

    shutil.rmtree(path)