def test_generate_receptor_dataset(self):
    path = EnvironmentSettings.tmp_test_path / "random_receptor_dataset_generation/"
    dataset = RandomDatasetGenerator.generate_receptor_dataset(receptor_count=100,
                                                               chain_1_length_probabilities={4: 0.5, 5: 0.5},
                                                               chain_2_length_probabilities={4: 0.5, 5: 0.5},
                                                               labels={"HLA": {"A": 0.5, "B": 0.5}},
                                                               path=path)

    self.assertEqual(ReceptorDataset, type(dataset))
    self.assertEqual(100, dataset.get_example_count())

    for receptor in dataset.get_data():
        # wrap the generator in all(): a bare generator passed to assertTrue is always truthy
        self.assertTrue(all(len(sequence_aa) in [4, 5]
                            for sequence_aa in [receptor.alpha.amino_acid_sequence,
                                                receptor.beta.amino_acid_sequence]))
        self.assertTrue(receptor.metadata["HLA"] in ["A", "B"])

    shutil.rmtree(path)

def test__split_repertoire_dataset(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "manual_splitter/")
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {4: 1}, {3: 1}, {}, path)

    train_metadata = pd.DataFrame({"subject_id": ["rep_1", "rep_2", "rep_4", "rep_5", "rep_9", "rep_7"]})
    train_metadata.to_csv(path / "train.csv")

    test_metadata = pd.DataFrame({"subject_id": ["rep_0", "rep_3", "rep_6", "rep_8"]})
    test_metadata.to_csv(path / "test.csv")

    train_datasets, test_datasets = ManualSplitter._split_repertoire_dataset(
        DataSplitterParams(dataset, SplitType.MANUAL, split_count=1, paths=[path / 'result/'],
                           split_config=SplitConfig(manual_config=ManualSplitConfig(path / "train.csv",
                                                                                    path / "test.csv"),
                                                    split_count=1, split_strategy=SplitType.MANUAL)))

    self.assertEqual(1, len(train_datasets))
    self.assertEqual(1, len(test_datasets))
    self.assertEqual(6, train_datasets[0].get_example_count())
    self.assertEqual(4, test_datasets[0].get_example_count())
    self.assertTrue(all(subject_id in ["rep_1", "rep_2", "rep_4", "rep_5", "rep_9", "rep_7"]
                        for subject_id in train_datasets[0].get_metadata(["subject_id"])["subject_id"]))
    self.assertTrue(all(subject_id in ["rep_0", "rep_3", "rep_6", "rep_8"]
                        for subject_id in test_datasets[0].get_metadata(["subject_id"])["subject_id"]))
    self.assertTrue(os.path.isfile(train_datasets[0].metadata_file))
    self.assertTrue(os.path.isfile(test_datasets[0].metadata_file))

    shutil.rmtree(path)

def test_generate_repertoire_dataset(self):
    path = EnvironmentSettings.tmp_test_path / "random_repertoire_dataset_generation/"
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(repertoire_count=100,
                                                                 sequence_count_probabilities={5: 0.5, 6: 0.5},
                                                                 sequence_length_probabilities={4: 0.5, 5: 0.5},
                                                                 labels={"HLA": {"A": 0.5, "B": 0.5}},
                                                                 path=path)

    self.assertEqual(RepertoireDataset, type(dataset))
    self.assertEqual(100, dataset.get_example_count())

    for repertoire in dataset.repertoires:
        self.assertTrue(repertoire.get_element_count() in [5, 6])
        self.assertTrue(all(len(sequence_aa) in [4, 5]
                            for sequence_aa in repertoire.get_sequence_aas().tolist()))
        self.assertTrue(repertoire.metadata["HLA"] in ["A", "B"])

    shutil.rmtree(path)

def test(self):
    path = EnvironmentSettings.tmp_test_path / "integration_sequence_classification/"
    dataset = RandomDatasetGenerator.generate_sequence_dataset(50, {4: 1}, {'l1': {1: 0.5, 2: 0.5}},
                                                               path / 'data')

    os.environ["cache_type"] = "test"

    encoder_params = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3
    }

    hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                           encoder_params=encoder_params,
                           ml_method=LogisticRegression(),
                           ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                           preproc_sequence=[])

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    instruction = TrainMLModelInstruction(dataset, GridSearch([hp_setting]), [hp_setting],
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)

    instruction.run(result_path=path)

    shutil.rmtree(path)

def test_encode(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "atchley_kmer_encoding/")
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(3, {1: 1}, {4: 1},
                                                                 {"l1": {True: 0.4, False: 0.6}},
                                                                 path / "dataset")

    encoder = AtchleyKmerEncoder.build_object(dataset, **{"k": 2, "skip_first_n_aa": 1, "skip_last_n_aa": 1,
                                                          "abundance": "RELATIVE_ABUNDANCE",
                                                          "normalize_all_features": False})
    encoded_dataset = encoder.encode(dataset, EncoderParams(path / "result",
                                                            LabelConfiguration(labels=[Label("l1")])))

    self.assertEqual((3, 11, 3), encoded_dataset.encoded_data.examples.shape)
    self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

    shutil.rmtree(path)

def test_get_metadata(self):
    path = EnvironmentSettings.tmp_test_path / "sequence_dataset/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_receptor_dataset(2, {2: 1.}, {2: 1.},
                                                               {"l1": {"True": 1.}, "l2": {"2": 1.}}, path)

    self.assertTrue("l1" in dataset.get_label_names())
    self.assertTrue("l2" in dataset.get_label_names())

    self.assertTrue(np.array_equal(['True', 'True'], dataset.get_metadata(['l1'])['l1']))
    self.assertTrue(np.array_equal(['2', '2'], dataset.get_metadata(['l1', 'l2'])['l2']))

    shutil.rmtree(path)

def _make_dataset(self, path, size) -> RepertoireDataset:
    random_dataset = RandomDatasetGenerator.generate_repertoire_dataset(repertoire_count=size,
                                                                        sequence_count_probabilities={100: 1.},
                                                                        sequence_length_probabilities={5: 1.},
                                                                        labels={}, path=path)

    signals = [Signal(identifier="disease",
                      motifs=[Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="AAA")],
                      implanting_strategy=HealthySequenceImplanting(implanting_computation=ImplantingComputation.ROUND,
                                                                    implanting=GappedMotifImplanting())),
               Signal(identifier="HLA",
                      motifs=[Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")],
                      implanting_strategy=HealthySequenceImplanting(implanting_computation=ImplantingComputation.ROUND,
                                                                    implanting=GappedMotifImplanting())),
               Signal(identifier="age",
                      motifs=[Motif(identifier="m3", instantiation=GappedKmerInstantiation(), seed="GGG")],
                      implanting_strategy=HealthySequenceImplanting(implanting_computation=ImplantingComputation.ROUND,
                                                                    implanting=GappedMotifImplanting()))]

    simulation = Simulation([Implanting(dataset_implanting_rate=0.2, signals=signals, name='i1',
                                        repertoire_implanting_rate=0.25),
                             Implanting(dataset_implanting_rate=0.2, signals=[signals[0], signals[1]], name='i2',
                                        repertoire_implanting_rate=0.25),
                             Implanting(dataset_implanting_rate=0.1, signals=[signals[0]], name='i3',
                                        repertoire_implanting_rate=0.25),
                             Implanting(dataset_implanting_rate=0.2, signals=[signals[2]], name='i4',
                                        repertoire_implanting_rate=0.25),
                             Implanting(dataset_implanting_rate=0.1, signals=[signals[1]], name='i5',
                                        repertoire_implanting_rate=0.25)])

    dataset = SignalImplanter.run(SimulationState(signals=signals, dataset=random_dataset, formats=['Pickle'],
                                                  result_path=path, name='my_synthetic_dataset',
                                                  simulation=simulation))

    return dataset

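# A hypothetical invocation of the helper above, e.g. from a test method; the
# temporary path and dataset size are illustrative assumptions, not values fixed
# by SignalImplanter:
#
#     dataset = self._make_dataset(PathBuilder.build(EnvironmentSettings.tmp_test_path / "simulated/"), size=10)
#     self.assertEqual(10, dataset.get_example_count())
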
def test_generate(self):
    path = EnvironmentSettings.tmp_test_path / "relevant_sequence_exporter/"
    PathBuilder.build(path)

    df = pd.DataFrame({"v_genes": ["TRBV1-1", "TRBV1-1"], 'j_genes': ["TRBJ1-1", "TRBJ1-2"],
                       "sequence_aas": ['ACCF', "EEFG"]})
    df.to_csv(path / 'sequences.csv', index=False)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(2, {2: 1}, {4: 1}, {}, path / "data")
    dataset.encoded_data = EncodedData(examples=None,
                                       info={'relevant_sequence_path': path / 'sequences.csv'},
                                       encoding="SequenceAbundanceEncoder")

    report_result = RelevantSequenceExporter(dataset, path / "result", 'somename').generate_report()

    self.assertEqual(1, len(report_result.output_tables))
    self.assertTrue(os.path.isfile(report_result.output_tables[0].path))
    self.assertTrue(all(col in ["v_call", "j_call", "cdr3_aa"]
                        for col in pd.read_csv(report_result.output_tables[0].path).columns))

    shutil.rmtree(path)

def test_generate(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "receptor_dataset_overview/")
    dataset = RandomDatasetGenerator.generate_receptor_dataset(100,
                                                               {9: 0.3, 10: 0.4, 11: 0.1, 12: 0.2},
                                                               {9: 0.1, 10: 0.2, 11: 0.4, 12: 0.3},
                                                               {}, path / "dataset")

    report = ReceptorDatasetOverview(200, dataset, path / "result", "receptor_overview")
    result = report.generate_report()

    self.assertTrue(os.path.isfile(path / "result/sequence_length_distribution.html"))
    self.assertTrue(os.path.isfile(path / "result/sequence_length_distribution_chain_alpha.csv"))
    self.assertTrue(os.path.isfile(path / "result/sequence_length_distribution_chain_beta.csv"))
    self.assertTrue(isinstance(result, ReportResult))

    shutil.rmtree(path)

def test_run(self):
    path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1},
                                                                 {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')
    ml_method = LogisticRegression()
    encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                        SequenceEncodingType.CONTINUOUS_KMER, 3,
                                        scale_to_zero_mean=True, scale_to_unit_variance=True)
    label_config = LabelConfiguration([Label("l1", [1, 2])])

    enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config,
                                                        filename="tmp_enc_dataset.pickle", pool_size=4))
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    hp_setting = HPSetting(encoder, {"normalization_type": "relative_frequency", "reads": "unique",
                                     "sequence_encoding": "continuous_kmer", "k": 3,
                                     "scale_to_zero_mean": True, "scale_to_unit_variance": True},
                           ml_method, {}, [], 'enc1', 'ml1')

    PathBuilder.build(path / 'result/instr1/')
    shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
    shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(path / 'result/')

    predictions_path = path / "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))

    df = pd.read_csv(predictions_path)
    self.assertEqual(50, df.shape[0])

    shutil.rmtree(path)

def import_dataset(params, name: str) -> SequenceDataset:
    """
    Returns a randomly generated sequence dataset according to the parameters;

    YAML specification:

        result_path: path/where/to/store/results/
        sequence_count: 100 # number of random sequences to generate
        length_probabilities:
            14: 0.8 # 80% of the generated sequences will have length 14
            15: 0.2 # 20% of the generated sequences will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the sequences will have class True
                False: 0.5 # 50% of the sequences will have class False
            epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                1: 0.3 # 30% of the generated sequences will have class 1
                0: 0.7 # 70% of the generated sequences will have class 0
    """
    valid_keys = ["sequence_count", "length_probabilities", "labels", "result_path"]
    ParameterValidator.assert_all_in_valid_list(list(params.keys()), valid_keys,
                                                "RandomSequenceDatasetImport", "params")

    return RandomDatasetGenerator.generate_sequence_dataset(sequence_count=params["sequence_count"],
                                                            length_probabilities=params["length_probabilities"],
                                                            labels=params["labels"],
                                                            path=params["result_path"])

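# A minimal sketch of driving the importer above directly from Python with a params
# dict mirroring the YAML specification; the result path, sequence count, and label
# name ("epitope1") are illustrative assumptions, not values required by the importer.
def example_random_sequence_import() -> SequenceDataset:
    params = {
        "result_path": EnvironmentSettings.tmp_test_path / "random_sequence_import_example/",
        "sequence_count": 100,
        "length_probabilities": {14: 0.8, 15: 0.2},  # 80% of sequences of length 14, 20% of length 15
        "labels": {"epitope1": {True: 0.5, False: 0.5}}  # classes assigned independently per sequence
    }
    return import_dataset(params, "example_dataset")
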
def test_run(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "subsampling/")
    dataset = RandomDatasetGenerator.generate_receptor_dataset(200,
                                                               labels={"epitope": {"A": 0.5, "B": 0.5}},
                                                               path=path,
                                                               chain_1_length_probabilities={3: 1},
                                                               chain_2_length_probabilities={4: 1})
    dataset.name = "d1"

    inst = SubsamplingInstruction(dataset=dataset, subsampled_dataset_sizes=[100, 50],
                                  dataset_export_formats=[PickleExporter], name="subsampling_inst")

    state = inst.run(path / "result/")

    self.assertEqual(2, len(state.subsampled_datasets))
    self.assertEqual(100, state.subsampled_datasets[0].get_example_count())
    self.assertEqual(50, state.subsampled_datasets[1].get_example_count())
    self.assertTrue(all(os.path.isfile(state.subsampled_dataset_paths[name]['pickle'])
                        for name in [dataset.name for dataset in state.subsampled_datasets]))

    shutil.rmtree(path)

def test_import_receptors(self):
    path = EnvironmentSettings.tmp_test_path / "iml_import_receptors/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {2: 1}, {3: 1}, {}, path)
    dataset.name = "d1"
    ImmuneMLExporter.export(dataset, path)

    receptor_dataset = ImmuneMLImport.import_dataset({"path": path / "d1.iml_dataset"}, "dataset_name")

    self.assertEqual(10, len(list(receptor_dataset.get_data())))

    shutil.rmtree(path)

def test_generate(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "kernel_sequence_logo/")
    dataset = RandomDatasetGenerator.generate_receptor_dataset(receptor_count=500,
                                                               chain_1_length_probabilities={4: 1},
                                                               chain_2_length_probabilities={4: 1},
                                                               labels={"CMV": {True: 0.5, False: 0.5}},
                                                               path=path / "dataset")
    enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode(
        dataset, EncoderParams(path / "result", LabelConfiguration([Label("CMV", [True, False])])))

    cnn = ReceptorCNN(kernel_count=2, kernel_size=[3], positional_channels=3, sequence_type="amino_acid",
                      device="cpu", number_of_threads=4, random_seed=1, learning_rate=0.01,
                      iteration_count=10, l1_weight_decay=0.1, evaluate_at=5, batch_size=100,
                      training_percentage=0.8, l2_weight_decay=0.0)
    cnn.fit(enc_dataset.encoded_data, "CMV")

    report = KernelSequenceLogo(method=cnn, result_path=path / "logos/")
    report.generate_report()

    # one logo image and one CSV per chain and kernel
    for chain in ["alpha", "beta"]:
        for kernel_index in [1, 2]:
            self.assertTrue(os.path.isfile(path / f"logos/{chain}_kernel_3_{kernel_index}.png"))
            self.assertTrue(os.path.isfile(path / f"logos/{chain}_kernel_3_{kernel_index}.csv"))

    self.assertTrue(os.path.isfile(path / "logos/fully_connected_layer_weights.csv"))
    self.assertTrue(os.path.isfile(path / "logos/fully_connected_layer_weights.html"))

    shutil.rmtree(path)

def test_export_receptor_dataset(self):
    path = EnvironmentSettings.tmp_test_path / "pickleexporter_receptor/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {2: 1}, {3: 1}, {}, path)
    dataset.name = "d1"
    PickleExporter.export(dataset, path)

    with open(path / f"{dataset.name}.iml_dataset", "rb") as file:
        dataset2 = pickle.load(file)

    self.assertTrue(isinstance(dataset2, ReceptorDataset))
    self.assertEqual(10, dataset2.get_example_count())

    shutil.rmtree(path)

def test_export_receptor_dataset(self):
    path = EnvironmentSettings.tmp_test_path / "imlexporter_receptor/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {2: 1}, {3: 1}, {}, path)
    dataset.name = "d1"
    element_ids = dataset.get_example_ids()
    ImmuneMLExporter.export(dataset, path)

    with open(path / f"{dataset.name}.iml_dataset", "r") as file:
        dataset2 = yaml.safe_load(file)

    self.assertEqual('ReceptorDataset', dataset2['dataset_class'])
    self.assertEqual(element_ids, dataset2['element_ids'])

    shutil.rmtree(path)

def test_parse(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "subsampling_parser/")
    dataset = RandomDatasetGenerator.generate_receptor_dataset(30, {3: 1}, {2: 1}, {}, path)

    symbol_table = SymbolTable()
    symbol_table.add("d1", SymbolType.DATASET, dataset)

    SubsamplingParser().parse('inst1', {'dataset': 'd1', 'type': 'Subsampling',
                                        'subsampled_dataset_sizes': [10, 20],
                                        'dataset_export_formats': ['Pickle']}, symbol_table)

    # requested subsample size exceeds the dataset size (50 > 30)
    with self.assertRaises(AssertionError):
        SubsamplingParser().parse('inst1', {'dataset': 'd1', 'type': 'Subsampling',
                                            'subsampled_dataset_sizes': [10, 50],
                                            'dataset_export_formats': ['Pickle']}, symbol_table)

    # unknown dataset name
    with self.assertRaises(AssertionError):
        SubsamplingParser().parse('inst1', {'dataset': 'd2', 'type': 'Subsampling',
                                            'subsampled_dataset_sizes': [10, 20],
                                            'dataset_export_formats': ['Pickle']}, symbol_table)

    # unknown export format
    with self.assertRaises(AssertionError):
        SubsamplingParser().parse('inst1', {'dataset': 'd2', 'type': 'Subsampling',
                                            'subsampled_dataset_sizes': [10, 20],
                                            'dataset_export_formats': ['Random']}, symbol_table)

    shutil.rmtree(path)

def import_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
    """Returns a randomly generated repertoire dataset according to the parameters."""
    valid_keys = ["result_path", "repertoire_count", "sequence_count_probabilities",
                  "sequence_length_probabilities", "labels"]
    ParameterValidator.assert_all_in_valid_list(list(params.keys()), valid_keys,
                                                "RandomRepertoireDatasetImport", "params")

    return RandomDatasetGenerator.generate_repertoire_dataset(repertoire_count=params["repertoire_count"],
                                                              sequence_count_probabilities=params["sequence_count_probabilities"],
                                                              sequence_length_probabilities=params["sequence_length_probabilities"],
                                                              labels=params["labels"],
                                                              path=params["result_path"])

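# A minimal sketch of the analogous call for repertoire datasets; the counts,
# probabilities, and label name ("CD") are illustrative assumptions.
def example_random_repertoire_import() -> RepertoireDataset:
    params = {
        "result_path": EnvironmentSettings.tmp_test_path / "random_repertoire_import_example/",
        "repertoire_count": 20,
        "sequence_count_probabilities": {10: 1.},   # every repertoire contains 10 sequences
        "sequence_length_probabilities": {12: 1.},  # every sequence has length 12
        "labels": {"CD": {True: 0.5, False: 0.5}}   # repertoires split evenly between the two classes
    }
    return import_dataset(params, "example_dataset")
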
def test_run(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "dataset_export_instruction/")
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {10: 1}, {12: 1}, {}, path)
    dataset.name = "d1"

    count_filter = CountPerSequenceFilter(low_count_limit=1, remove_without_count=True,
                                          remove_empty_repertoires=True, batch_size=100)

    instruction = DatasetExportInstruction(datasets=[dataset], preprocessing_sequence=[count_filter],
                                           exporters=[AIRRExporter], name="export_instr")

    result_path = path / "generated/"
    state = instruction.run(result_path=result_path)

    self.assertTrue(isinstance(state, DatasetExportState))
    self.assertEqual(1, len(state.datasets))
    self.assertEqual(1, len(state.formats))
    self.assertEqual("AIRR", state.formats[0])

    self.assertTrue(os.path.isdir(result_path))
    self.assertEqual(1, len(list(glob(str(state.result_path / "*/")))))
    self.assertEqual(1, len(list(glob(str(state.result_path / f"{dataset.name}/*/")))))
    self.assertTrue(os.path.isdir(str(state.result_path / f"{dataset.name}/AIRR/")))
    self.assertTrue(os.path.isfile(str(state.result_path / f"{dataset.name}/AIRR/metadata.csv")))
    self.assertEqual(10, len(list(glob(str(state.result_path / f"{dataset.name}/AIRR/repertoires/*")))))

    shutil.rmtree(path)

def test_generate(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "gliph2_export")
    dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {3: 1}, {2: 1},
                                                               {"epitope": {"ep1": 0.4, "ep2": 0.6}}, path)

    report_result = GLIPH2Exporter(dataset, path / "result", "somename", "epitope").generate_report()

    self.assertEqual(1, len(report_result.output_tables))
    self.assertTrue(os.path.isfile(report_result.output_tables[0].path))

    df = pd.read_csv(report_result.output_tables[0].path, sep="\t")
    self.assertTrue(all(col in ["CDR3b", "TRBV", "TRBJ", "CDR3a", "subject:condition", "count"]
                        for col in df.columns))
    self.assertEqual(10, df.shape[0])

    shutil.rmtree(path)

def test_run1(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "api_galaxy_yaml_tool1/")
    result_path = path / "result/"

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {10: 1}, {12: 1}, {}, result_path)
    dataset.name = "d1"
    ImmuneMLExporter.export(dataset, result_path)

    specs = {
        "definitions": {
            "datasets": {
                "new_d1": {
                    "format": "ImmuneML",
                    "params": {
                        "metadata_file": str(result_path / "d1_metadata.csv")
                    }
                }
            },
        },
        "instructions": {
            "inst1": {
                "type": "DatasetExport",
                "datasets": ["new_d1"],
                "export_formats": ["AIRR"]
            }
        }
    }

    specs_path = path / "specs.yaml"
    with open(specs_path, "w") as file:
        yaml.dump(specs, file)

    run_immuneML(Namespace(**{"specification_path": specs_path, "result_path": result_path / 'result/',
                              'tool': "GalaxyYamlTool"}))

    self.assertTrue(os.path.exists(result_path / "result/inst1/dataset/AIRR"))

    shutil.rmtree(path)

def test_run_with_receptors(self):
    path = PathBuilder.build(EnvironmentSettings.root_path / "test/tmp/signalImplanter_receptor/")
    dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {10: 1}, {12: 1}, {}, path / "dataset/")

    motif1 = Motif(identifier="motif1", instantiation=GappedKmerInstantiation(),
                   seed_chain1="AAA", name_chain1=Chain.ALPHA, seed_chain2="CCC", name_chain2=Chain.BETA)
    signal1 = Signal(identifier="signal1", motifs=[motif1],
                     implanting_strategy=ReceptorImplanting(GappedMotifImplanting()))
    simulation = Simulation([Implanting(dataset_implanting_rate=0.5, signals=[signal1])])

    sim_state = SimulationState(dataset=dataset, result_path=path, simulation=simulation,
                                signals=[signal1], formats=["ImmuneML"])

    new_dataset = SignalImplanter.run(sim_state)

    self.assertEqual(10, new_dataset.get_example_count())
    self.assertEqual(5, len([receptor for receptor in new_dataset.get_data(40)
                             if receptor.metadata["signal1"] is True]))

    shutil.rmtree(path)

def test_run(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "dataset_export_instruction/")
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {10: 1}, {12: 1}, {}, path)
    dataset.name = "d1"

    instruction = DatasetExportInstruction(datasets=[dataset], exporters=[AIRRExporter],
                                           name="export_instr")

    result_path = path / "generated/"
    state = instruction.run(result_path=result_path)

    self.assertTrue(isinstance(state, DatasetExportState))
    self.assertEqual(1, len(state.datasets))
    self.assertEqual(1, len(state.formats))
    self.assertEqual("AIRR", state.formats[0])

    self.assertTrue(os.path.isdir(result_path))
    self.assertEqual(1, len(list(glob(str(state.result_path / "*/")))))
    self.assertEqual(1, len(list(glob(str(state.result_path / f"{dataset.name}/*/")))))
    self.assertTrue(os.path.isdir(str(state.result_path / f"{dataset.name}/AIRR/")))
    self.assertTrue(os.path.isfile(str(state.result_path / f"{dataset.name}/AIRR/metadata.csv")))
    self.assertEqual(10, len(list(glob(str(state.result_path / f"{dataset.name}/AIRR/repertoires/*")))))

    shutil.rmtree(path)

def test_generate_sequence_dataset(self):
    path = EnvironmentSettings.tmp_test_path / "random_sequence_dataset_generation/"
    dataset = RandomDatasetGenerator.generate_sequence_dataset(sequence_count=100,
                                                               length_probabilities={4: 0.5, 5: 0.5},
                                                               labels={"HLA": {"A": 0.5, "B": 0.5}},
                                                               path=path)

    self.assertEqual(SequenceDataset, type(dataset))
    self.assertEqual(100, dataset.get_example_count())

    for sequence in dataset.get_data():
        self.assertTrue(len(sequence.amino_acid_sequence) in [4, 5])
        self.assertTrue(sequence.get_attribute("HLA") in ["A", "B"])

    shutil.rmtree(path)

def test_fit(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "kmermil")

    repertoire_count = 10
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(repertoire_count=repertoire_count,
                                                                 sequence_count_probabilities={2: 1},
                                                                 sequence_length_probabilities={4: 1},
                                                                 labels={"l1": {True: 0.5, False: 0.5}},
                                                                 path=path / "dataset")
    enc_dataset = AtchleyKmerEncoder(2, 1, 1, 'relative_abundance', False).encode(
        dataset, EncoderParams(path / "result", LabelConfiguration([Label("l1", [True, False])])))

    cls = AtchleyKmerMILClassifier(iteration_count=10, threshold=-0.0001, evaluate_at=2,
                                   use_early_stopping=False, random_seed=1, learning_rate=0.01,
                                   zero_abundance_weight_init=True, number_of_threads=8)
    cls.fit(enc_dataset.encoded_data, "l1")

    predictions = cls.predict(enc_dataset.encoded_data, "l1")
    self.assertEqual(repertoire_count, len(predictions["l1"]))
    self.assertEqual(repertoire_count, len([pred for pred in predictions["l1"] if isinstance(pred, bool)]))

    predictions_proba = cls.predict_proba(enc_dataset.encoded_data, "l1")
    self.assertEqual(repertoire_count, np.rint(np.sum(predictions_proba["l1"])))
    self.assertEqual(repertoire_count, predictions_proba["l1"].shape[0])

    cls.store(path / "model_storage", feature_names=enc_dataset.encoded_data.feature_names)

    cls2 = AtchleyKmerMILClassifier(iteration_count=10, threshold=-0.0001, evaluate_at=2,
                                    use_early_stopping=False, random_seed=1, learning_rate=0.01,
                                    zero_abundance_weight_init=True, number_of_threads=8)
    cls2.load(path / "model_storage")

    # compare all stored attributes except the internal logistic regression module,
    # which does not support direct equality comparison
    cls2_vars = vars(cls2)
    del cls2_vars["logistic_regression"]
    cls_vars = vars(cls)
    del cls_vars["logistic_regression"]

    for item, value in cls_vars.items():
        if not isinstance(value, np.ndarray):
            loaded_value = cls2_vars[item]
            self.assertEqual(value, loaded_value)

    model = cls.get_model("l1")
    self.assertEqual(vars(cls), model)

    shutil.rmtree(path)

def test_run(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "api_galaxy_yaml_tool/")
    result_path = path / "result/"

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {10: 1}, {12: 1}, {}, result_path)
    dataset.name = "d1"
    PickleExporter.export(dataset, result_path)

    specs = {
        "definitions": {
            "datasets": {
                "new_d1": {
                    "format": "Pickle",
                    "params": {"metadata_file": str(result_path / "d1_metadata.csv")}
                },
                "d2": {
                    "format": "RandomRepertoireDataset",
                    "params": {
                        "repertoire_count": 50,
                        "sequence_length_probabilities": {10: 1},
                        "sequence_count_probabilities": {10: 1},
                        "labels": {"CD": {True: 0.5, False: 0.5}}
                    }
                }
            },
            "encodings": {
                "e1": {"Word2Vec": {"k": 3, "model_type": "sequence", "vector_size": 8}},
                "e2": {"Word2Vec": {"k": 3, "model_type": "sequence", "vector_size": 10}}
            },
            "ml_methods": {
                "simpleLR": {
                    "LogisticRegression": {"penalty": "l1"},
                    "model_selection_cv": False,
                    "model_selection_n_folds": -1
                }
            },
        },
        "instructions": {
            "inst1": {
                "type": "DatasetExport",
                "datasets": ["new_d1", "d2"],
                "export_formats": ["AIRR"]
            },
            "inst2": {
                "type": "TrainMLModel",
                "settings": [{"encoding": "e1", "ml_method": "simpleLR"},
                             {"encoding": "e2", "ml_method": "simpleLR"}],
                "assessment": {"split_strategy": "random", "split_count": 1, "training_percentage": 0.7},
                "selection": {"split_strategy": "random", "split_count": 2, "training_percentage": 0.7},
                "labels": ["CD"],
                "dataset": "d2",
                "strategy": "GridSearch",
                "metrics": ["accuracy", "auc"],
                "reports": [],
                "number_of_processes": 10,
                "optimization_metric": "accuracy",
                "refit_optimal_model": False,
                "store_encoded_data": False
            }
        }
    }

    specs_path = path / "specs.yaml"
    with open(specs_path, "w") as file:
        yaml.dump(specs, file)

    run_immuneML(Namespace(**{"specification_path": specs_path, "result_path": result_path / 'result/',
                              'tool': "GalaxyYamlTool"}))

    self.assertTrue(os.path.exists(result_path / "result/inst1/new_d1/AIRR"))
    self.assertTrue(os.path.exists(result_path / "result/inst1/d2/AIRR"))
    self.assertTrue(os.path.exists(result_path / "result/d2"))

    shutil.rmtree(path)