def test_generate_receptor_dataset(self):
        """Generated receptor dataset has the requested size, chain lengths and label values."""
        path = EnvironmentSettings.tmp_test_path / "random_receptor_dataset_generation/"

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=100,
            chain_1_length_probabilities={
                4: 0.5,
                5: 0.5
            },
            chain_2_length_probabilities={
                4: 0.5,
                5: 0.5
            },
            labels={"HLA": {
                "A": 0.5,
                "B": 0.5
            }},
            path=path)

        self.assertEqual(ReceptorDataset, type(dataset))

        self.assertEqual(100, dataset.get_example_count())
        for receptor in dataset.get_data():
            # BUG FIX: the original passed a bare generator expression to assertTrue,
            # which is always truthy and never evaluated the length condition;
            # wrapping it in all() makes the assertion actually check both chains.
            self.assertTrue(
                all(len(sequence_aa) in [4, 5] for sequence_aa in [
                    receptor.alpha.amino_acid_sequence,
                    receptor.beta.amino_acid_sequence
                ]))
            self.assertTrue(receptor.metadata["HLA"] in ["A", "B"])

        shutil.rmtree(path)
# Example 2
    def test__split_repertoire_dataset(self):
        """Manual split assigns repertoires to train/test exactly as listed in the csv files."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "manual_splitter/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {4: 1}, {3: 1}, {}, path)

        train_ids = ["rep_1", "rep_2", "rep_4", "rep_5", "rep_9", "rep_7"]
        test_ids = ["rep_0", "rep_3", "rep_6", "rep_8"]
        pd.DataFrame({"subject_id": train_ids}).to_csv(path / "train.csv")
        pd.DataFrame({"subject_id": test_ids}).to_csv(path / "test.csv")

        splitter_params = DataSplitterParams(
            dataset, SplitType.MANUAL, split_count=1, paths=[path / 'result/'],
            split_config=SplitConfig(manual_config=ManualSplitConfig(path / "train.csv",
                                                                     path / "test.csv"),
                                     split_count=1, split_strategy=SplitType.MANUAL))
        train_datasets, test_datasets = ManualSplitter._split_repertoire_dataset(splitter_params)

        self.assertEqual(1, len(train_datasets))
        self.assertEqual(1, len(test_datasets))
        self.assertEqual(6, train_datasets[0].get_example_count())
        self.assertEqual(4, test_datasets[0].get_example_count())
        self.assertTrue(all(subject in train_ids
                            for subject in train_datasets[0].get_metadata(["subject_id"])["subject_id"]))
        self.assertTrue(all(subject in test_ids
                            for subject in test_datasets[0].get_metadata(["subject_id"])["subject_id"]))
        self.assertTrue(os.path.isfile(train_datasets[0].metadata_file))
        self.assertTrue(os.path.isfile(test_datasets[0].metadata_file))

        shutil.rmtree(path)
    def test_generate_repertoire_dataset(self):
        """Generated repertoire dataset matches requested counts, lengths and label values."""
        path = EnvironmentSettings.tmp_test_path / "random_repertoire_dataset_generation/"

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=100,
            sequence_count_probabilities={5: 0.5, 6: 0.5},
            sequence_length_probabilities={4: 0.5, 5: 0.5},
            labels={"HLA": {"A": 0.5, "B": 0.5}},
            path=path)

        self.assertEqual(RepertoireDataset, type(dataset))
        self.assertEqual(100, dataset.get_example_count())

        for repertoire in dataset.repertoires:
            # element count is drawn from {5, 6}; sequence lengths from {4, 5}
            self.assertTrue(repertoire.get_element_count() in (5, 6))
            self.assertTrue(all(len(aa_sequence) in [4, 5]
                                for aa_sequence in repertoire.get_sequence_aas().tolist()))
            self.assertTrue(repertoire.metadata["HLA"] in ["A", "B"])

        shutil.rmtree(path)
# Example 4
    def test(self):
        """Integration test: k-mer frequency encoding + logistic regression through TrainMLModel.

        The run itself is the assertion — it must complete without raising.
        """
        path = EnvironmentSettings.tmp_test_path / "integration_sequence_classification/"
        dataset = RandomDatasetGenerator.generate_sequence_dataset(50, {4: 1}, {'l1': {1: 0.5, 2: 0.5}}, path / 'data')

        os.environ["cache_type"] = "test"
        encoder_params = {
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "sequence_type": SequenceType.AMINO_ACID.name,
            "k": 3
        }

        hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params), encoder_params=encoder_params,
                               ml_method=LogisticRegression(), ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                               preproc_sequence=[])

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        instruction = TrainMLModelInstruction(dataset, GridSearch([hp_setting]), [hp_setting],
                                              SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                              SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)

        # FIX: the returned state was bound to an unused local (`result`); drop the binding
        instruction.run(result_path=path)

        shutil.rmtree(path)
# Example 5
    def test_encode(self):
        """AtchleyKmerEncoder produces a 3D encoding of the expected shape with a zeroed last entry."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "atchley_kmer_encoding/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            3, {1: 1}, {4: 1}, {"l1": {True: 0.4, False: 0.6}}, path / "dataset")

        encoder_settings = {
            "k": 2,
            "skip_first_n_aa": 1,
            "skip_last_n_aa": 1,
            "abundance": "RELATIVE_ABUNDANCE",
            "normalize_all_features": False
        }
        encoder = AtchleyKmerEncoder.build_object(dataset, **encoder_settings)

        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(path / "result",
                          LabelConfiguration(labels=[Label("l1")])))

        self.assertEqual((3, 11, 3),
                         encoded_dataset.encoded_data.examples.shape)
        self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

        shutil.rmtree(path)
# Example 6
    def test_get_metadata(self):
        """Label names and per-example metadata values are retrievable from a receptor dataset."""
        path = EnvironmentSettings.tmp_test_path / "sequence_dataset/"
        PathBuilder.build(path)

        # both labels have a single class with probability 1, so every example gets it
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            2, {2: 1.}, {2: 1.}, {"l1": {"True": 1.}, "l2": {"2": 1.}}, path)

        for label_name in ("l1", "l2"):
            self.assertTrue(label_name in dataset.get_label_names())

        self.assertTrue(
            np.array_equal(['True', 'True'],
                           dataset.get_metadata(['l1'])['l1']))
        self.assertTrue(
            np.array_equal(['2', '2'],
                           dataset.get_metadata(['l1', 'l2'])['l2']))

        shutil.rmtree(path)
    def _make_dataset(self, path, size) -> RepertoireDataset:
        """Build a synthetic repertoire dataset of `size` repertoires with three implanted signals.

        Generates a random dataset (100 sequences of length 5 per repertoire, no labels),
        defines three signals (disease/HLA/age with motif seeds AAA/CCC/GGG), implants
        them according to the simulation groups below, and returns the implanted dataset.
        """
        random_dataset = RandomDatasetGenerator.generate_repertoire_dataset(repertoire_count=size, sequence_count_probabilities={100: 1.},
                                                                            sequence_length_probabilities={5: 1.}, labels={}, path=path)

        # one single-motif signal per seed; all use healthy-sequence implanting with ROUND computation
        signals = [Signal(identifier="disease", motifs=[Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="AAA")],
                          implanting_strategy=HealthySequenceImplanting(implanting_computation=ImplantingComputation.ROUND,
                                                                        implanting=GappedMotifImplanting())),
                   Signal(identifier="HLA", motifs=[Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")],
                          implanting_strategy=HealthySequenceImplanting(implanting_computation=ImplantingComputation.ROUND,
                                                                        implanting=GappedMotifImplanting())),
                   Signal(identifier="age", motifs=[Motif(identifier="m3", instantiation=GappedKmerInstantiation(), seed="GGG")],
                          implanting_strategy=HealthySequenceImplanting(implanting_computation=ImplantingComputation.ROUND,
                                                                        implanting=GappedMotifImplanting()))]

        # dataset_implanting_rates sum to 0.8 — presumably ~20% of repertoires receive no
        # signal at all (TODO confirm against SignalImplanter semantics)
        simulation = Simulation([Implanting(dataset_implanting_rate=0.2, signals=signals, name='i1', repertoire_implanting_rate=0.25),
                                 Implanting(dataset_implanting_rate=0.2, signals=[signals[0], signals[1]], name='i2', repertoire_implanting_rate=0.25),
                                 Implanting(dataset_implanting_rate=0.1, signals=[signals[0]], name='i3', repertoire_implanting_rate=0.25),
                                 Implanting(dataset_implanting_rate=0.2, signals=[signals[2]], name='i4', repertoire_implanting_rate=0.25),
                                 Implanting(dataset_implanting_rate=0.1, signals=[signals[1]], name='i5', repertoire_implanting_rate=0.25)
                                 ])

        dataset = SignalImplanter.run(SimulationState(signals=signals, dataset=random_dataset, formats=['Pickle'], result_path=path,
                                                      name='my_synthetic_dataset', simulation=simulation))

        return dataset
    def test_generate(self):
        """RelevantSequenceExporter writes one output table with renamed columns."""
        path = EnvironmentSettings.tmp_test_path / "relevant_sequence_exporter/"
        PathBuilder.build(path)

        sequences = pd.DataFrame({"v_genes": ["TRBV1-1", "TRBV1-1"],
                                  'j_genes': ["TRBJ1-1", "TRBJ1-2"],
                                  "sequence_aas": ['ACCF', "EEFG"]})
        sequences.to_csv(path / 'sequences.csv', index=False)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(2, {2: 1}, {4: 1}, {}, path / "data")
        dataset.encoded_data = EncodedData(examples=None,
                                           info={'relevant_sequence_path': path / 'sequences.csv'},
                                           encoding="SequenceAbundanceEncoder")

        report_result = RelevantSequenceExporter(dataset, path / "result", 'somename').generate_report()

        self.assertEqual(1, len(report_result.output_tables))
        table_path = report_result.output_tables[0].path
        self.assertTrue(os.path.isfile(table_path))

        exported_columns = pd.read_csv(table_path).columns
        self.assertTrue(all(col in ["v_call", "j_call", "cdr3_aa"] for col in exported_columns))

        shutil.rmtree(path)
# Example 9
    def test_generate(self):
        """ReceptorDatasetOverview writes a length-distribution plot and per-chain csv files."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "receptor_dataset_overview/")

        chain_1_lengths = {9: 0.3, 10: 0.4, 11: 0.1, 12: 0.2}
        chain_2_lengths = {9: 0.1, 10: 0.2, 11: 0.4, 12: 0.3}
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            100, chain_1_lengths, chain_2_lengths, {}, path / "dataset")

        report = ReceptorDatasetOverview(200, dataset, path / "result",
                                         "receptor_overview")
        result = report.generate_report()

        expected_files = ["result/sequence_length_distribution.html",
                          "result/sequence_length_distribution_chain_alpha.csv",
                          "result/sequence_length_distribution_chain_beta.csv"]
        for file_name in expected_files:
            self.assertTrue(os.path.isfile(path / file_name))

        self.assertTrue(isinstance(result, ReportResult))

        shutil.rmtree(path)
    def test_run(self):
        """Train an ML method outside an instruction, then apply it via MLApplicationInstruction.

        Encodes a random repertoire dataset with k-mer frequencies, fits logistic
        regression, copies the fitted vectorizer/scaler pickles into the instruction's
        expected result layout, and checks that predictions are produced for all 50 examples.
        """
        path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE, SequenceEncodingType.CONTINUOUS_KMER, 3,
                                            scale_to_zero_mean=True, scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config, filename="tmp_enc_dataset.pickle", pool_size=4))
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        hp_setting = HPSetting(encoder, {"normalization_type": "relative_frequency", "reads": "unique", "sequence_encoding": "continuous_kmer",
                                         "k": 3, "scale_to_zero_mean": True, "scale_to_unit_variance": True}, ml_method, {}, [], 'enc1', 'ml1')

        # the instruction looks for the fitted vectorizer/scaler under result/instr1/;
        # presumably the encode() call above wrote them to `path` — copy them into place
        # (TODO confirm against KmerFreqRepertoireEncoder)
        PathBuilder.build(path / 'result/instr1/')
        shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
        shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
        ml_app.run(path / 'result/')

        predictions_path = path / "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        # one prediction row per repertoire
        df = pd.read_csv(predictions_path)
        self.assertEqual(50, df.shape[0])

        shutil.rmtree(path)
# Example 11
    def import_dataset(params, name: str) -> SequenceDataset:
        """
        Returns a randomly generated sequence dataset according to the parameters.

        YAML specification:

            result_path: path/where/to/store/results/
            sequence_count: 100 # number of random sequences to generate
            length_probabilities:
                14: 0.8 # 80% of all generated sequences will have length 14
                15: 0.2 # 20% of all generated sequences will have length 15
            labels:
                epitope1: # label name
                    True: 0.5 # 50% of the sequences will have class True
                    False: 0.5 # 50% of the sequences will have class False
                epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                    1: 0.3 # 30% of the generated sequences will have class 1
                    0: 0.7 # 70% of the generated sequences will have class 0

        """
        # only these keys may appear in params; anything else raises via the validator
        valid_keys = [
            "sequence_count", "length_probabilities", "labels", "result_path"
        ]
        ParameterValidator.assert_all_in_valid_list(
            list(params.keys()), valid_keys, "RandomSequenceDatasetImport",
            "params")

        return RandomDatasetGenerator.generate_sequence_dataset(
            sequence_count=params["sequence_count"],
            length_probabilities=params["length_probabilities"],
            labels=params["labels"],
            path=params["result_path"])
# Example 12
    def test_run(self):
        """SubsamplingInstruction produces one subsampled dataset per requested size and exports each."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "subsampling/")
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            200,
            labels={"epitope": {"A": 0.5, "B": 0.5}},
            path=path,
            chain_1_length_probabilities={3: 1},
            chain_2_length_probabilities={4: 1})
        dataset.name = "d1"

        instruction = SubsamplingInstruction(dataset=dataset,
                                             subsampled_dataset_sizes=[100, 50],
                                             dataset_export_formats=[PickleExporter],
                                             name="subsampling_inst")

        state = instruction.run(path / "result/")

        self.assertEqual(2, len(state.subsampled_datasets))
        for expected_size, subsampled in zip([100, 50], state.subsampled_datasets):
            self.assertEqual(expected_size, subsampled.get_example_count())

        exported_names = [d.name for d in state.subsampled_datasets]
        self.assertTrue(
            all(os.path.isfile(state.subsampled_dataset_paths[name]['pickle'])
                for name in exported_names))

        shutil.rmtree(path)
# Example 13
    def test_import_receptors(self):
        """A receptor dataset exported with ImmuneMLExporter can be re-imported in full."""
        path = EnvironmentSettings.tmp_test_path / "iml_import_receptors/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {2: 1}, {3: 1}, {}, path)
        dataset.name = "d1"
        ImmuneMLExporter.export(dataset, path)

        imported = ImmuneMLImport.import_dataset({"path": path / "d1.iml_dataset"}, "dataset_name")

        self.assertEqual(10, len(list(imported.get_data())))

        shutil.rmtree(path)
# Example 14
    def test_generate(self):
        """Fit a ReceptorCNN on one-hot encoded receptors, then check KernelSequenceLogo outputs.

        For each of the 2 kernels of size 3, a logo image (png) and a weight table (csv)
        is expected for each chain (alpha/beta), plus csv/html files for the fully
        connected layer weights.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "kernel_sequence_logo/")
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=500,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={"CMV": {
                True: 0.5,
                False: 0.5
            }},
            path=path / "dataset")
        enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode(
            dataset,
            EncoderParams(path / "result",
                          LabelConfiguration([Label("CMV", [True, False])])))
        # small CNN with a fixed random seed so kernel files are produced deterministically
        cnn = ReceptorCNN(kernel_count=2,
                          kernel_size=[3],
                          positional_channels=3,
                          sequence_type="amino_acid",
                          device="cpu",
                          number_of_threads=4,
                          random_seed=1,
                          learning_rate=0.01,
                          iteration_count=10,
                          l1_weight_decay=0.1,
                          evaluate_at=5,
                          batch_size=100,
                          training_percentage=0.8,
                          l2_weight_decay=0.0)
        cnn.fit(enc_dataset.encoded_data, "CMV")

        report = KernelSequenceLogo(method=cnn, result_path=path / "logos/")
        report.generate_report()

        # file names follow <chain>_kernel_<size>_<index>.<ext>
        self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_1.png"))
        self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_2.png"))
        self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_1.png"))
        self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_2.png"))
        self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_1.csv"))
        self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_2.csv"))
        self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_1.csv"))
        self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_2.csv"))
        self.assertTrue(
            os.path.isfile(path / "logos/fully_connected_layer_weights.csv"))
        self.assertTrue(
            os.path.isfile(path / "logos/fully_connected_layer_weights.html"))

        shutil.rmtree(path)
# Example 15
    def test_export_receptor_dataset(self):
        """PickleExporter round-trip: the exported receptor dataset unpickles with the same size."""
        path = EnvironmentSettings.tmp_test_path / "pickleexporter_receptor/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {2: 1}, {3: 1}, {}, path)
        dataset.name = "d1"
        PickleExporter.export(dataset, path)

        # pickle.load is acceptable here only because the file was just written by this test
        with open(path / f"{dataset.name}.iml_dataset", "rb") as file:
            reloaded = pickle.load(file)

        self.assertTrue(isinstance(reloaded, ReceptorDataset))
        self.assertEqual(10, reloaded.get_example_count())

        shutil.rmtree(path)
# Example 16
    def test_export_receptor_dataset(self):
        """ImmuneMLExporter writes a yaml descriptor holding the dataset class and element ids."""
        path = EnvironmentSettings.tmp_test_path / "imlexporter_receptor/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {2: 1}, {3: 1}, {}, path)
        dataset.name = "d1"
        element_ids = dataset.get_example_ids()
        ImmuneMLExporter.export(dataset, path)

        with open(path / f"{dataset.name}.iml_dataset", "r") as file:
            descriptor = yaml.safe_load(file)

        self.assertEqual('ReceptorDataset', descriptor['dataset_class'])
        self.assertEqual(element_ids, descriptor['element_ids'])

        shutil.rmtree(path)
# Example 17
    def test_parse(self):
        """SubsamplingParser accepts a valid spec and rejects bad sizes, unknown datasets and formats."""
        # FIX: join with pathlib's / operator like the rest of the file — the original
        # f-string concatenation onto a Path drops the directory separator because
        # str(Path) has no trailing slash
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / 'subsampling_parser/')
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            30, {3: 1}, {2: 1}, {}, path)

        symbol_table = SymbolTable()
        symbol_table.add("d1", SymbolType.DATASET, dataset)

        # valid specification parses without error
        SubsamplingParser().parse(
            'inst1', {
                'dataset': 'd1',
                'type': 'Subsampling',
                'subsampled_dataset_sizes': [10, 20],
                'dataset_export_formats': ['Pickle']
            }, symbol_table)

        # subsampled size larger than the dataset (50 > 30) must be rejected
        with self.assertRaises(AssertionError):
            SubsamplingParser().parse(
                'inst1', {
                    'dataset': 'd1',
                    'type': 'Subsampling',
                    'subsampled_dataset_sizes': [10, 50],
                    'dataset_export_formats': ['Pickle']
                }, symbol_table)

        # unknown dataset key must be rejected
        with self.assertRaises(AssertionError):
            SubsamplingParser().parse(
                'inst1', {
                    'dataset': 'd2',
                    'type': 'Subsampling',
                    'subsampled_dataset_sizes': [10, 20],
                    'dataset_export_formats': ['Pickle']
                }, symbol_table)

        # unknown export format must be rejected
        with self.assertRaises(AssertionError):
            SubsamplingParser().parse(
                'inst1', {
                    'dataset': 'd2',
                    'type': 'Subsampling',
                    'subsampled_dataset_sizes': [10, 20],
                    'dataset_export_formats': ['Random']
                }, symbol_table)

        shutil.rmtree(path)
# Example 18
    def import_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
        """Validate `params` against the allowed keys and build a random repertoire dataset."""
        valid_keys = [
            "result_path", "repertoire_count", "sequence_count_probabilities",
            "sequence_length_probabilities", "labels"
        ]
        # raises if params contains any key outside valid_keys
        ParameterValidator.assert_all_in_valid_list(
            list(params.keys()), valid_keys, "RandomRepertoireDatasetImport",
            "params")

        return RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=params["repertoire_count"],
            sequence_count_probabilities=params["sequence_count_probabilities"],
            sequence_length_probabilities=params["sequence_length_probabilities"],
            labels=params["labels"],
            path=params["result_path"])
# Example 19
    def test_run(self):
        """DatasetExportInstruction with a preprocessing filter exports the dataset in AIRR format."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "dataset_export_instruction/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            10, {10: 1}, {12: 1}, {}, path)
        dataset.name = "d1"

        # FIX: renamed local from `filter` to avoid shadowing the builtin
        count_filter = CountPerSequenceFilter(low_count_limit=1,
                                              remove_without_count=True,
                                              remove_empty_repertoires=True,
                                              batch_size=100)
        instruction = DatasetExportInstruction(datasets=[dataset],
                                               preprocessing_sequence=[count_filter],
                                               exporters=[AIRRExporter],
                                               name="export_instr")

        result_path = path / "generated/"
        state = instruction.run(result_path=result_path)

        self.assertTrue(isinstance(state, DatasetExportState))
        self.assertEqual(1, len(state.datasets))
        self.assertEqual(1, len(state.formats))
        self.assertEqual("AIRR", state.formats[0])

        # one dataset folder, one format folder, metadata file and one file per repertoire
        self.assertTrue(os.path.isdir(result_path))
        self.assertEqual(1, len(list(glob(str(state.result_path / "*/")))))
        self.assertEqual(
            1, len(list(glob(str(state.result_path / f"{dataset.name}/*/")))))
        self.assertTrue(
            os.path.isdir(str(state.result_path / f"{dataset.name}/AIRR/")))
        self.assertTrue(
            os.path.isfile(
                str(state.result_path / f"{dataset.name}/AIRR/metadata.csv")))
        self.assertEqual(
            10,
            len(
                list(
                    glob(
                        str(state.result_path /
                            f"{dataset.name}/AIRR/repertoires/*")))))

        shutil.rmtree(path)
# Example 20
    def test_generate(self):
        """GLIPH2Exporter produces a single tab-separated table with one row per receptor."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "gliph2_export")
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            10, {3: 1}, {2: 1}, {"epitope": {"ep1": 0.4, "ep2": 0.6}}, path)
        report_result = GLIPH2Exporter(dataset, path / "result", "somename",
                                       "epitope").generate_report()

        self.assertEqual(1, len(report_result.output_tables))
        table_path = report_result.output_tables[0].path
        self.assertTrue(os.path.isfile(table_path))

        exported = pd.read_csv(table_path, sep="\t")
        allowed_columns = [
            "CDR3b", "TRBV", "TRBJ", "CDR3a", "subject:condition", "count"
        ]
        self.assertTrue(all(col in allowed_columns for col in exported.columns))
        self.assertEqual(10, exported.shape[0])

        shutil.rmtree(path)
# Example 21
    def test_run1(self):
        """End-to-end GalaxyYamlTool run: export a dataset, re-import it via a yaml spec, export as AIRR."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "api_galaxy_yaml_tool1/")
        result_path = path / "result/"

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {10: 1}, {12: 1}, {}, result_path)
        dataset.name = "d1"
        ImmuneMLExporter.export(dataset, result_path)

        # spec re-imports the just-exported dataset by its metadata file and exports it as AIRR
        specs = {
            "definitions": {
                "datasets": {
                    "new_d1": {
                        "format": "ImmuneML",
                        "params": {
                            "metadata_file": str(result_path / "d1_metadata.csv")
                        }
                    }
                },
            },
            "instructions": {
                "inst1": {
                    "type": "DatasetExport",
                    "datasets": ["new_d1"],
                    "export_formats": ["AIRR"]
                }
            }
        }

        specs_path = path / "specs.yaml"
        with open(specs_path, "w") as file:
            yaml.dump(specs, file)

        # invoke the immuneML entry point the same way the Galaxy wrapper would
        run_immuneML(Namespace(**{"specification_path": specs_path, "result_path": result_path / 'result/', 'tool': "GalaxyYamlTool"}))

        self.assertTrue(os.path.exists(result_path / "result/inst1/dataset/AIRR"))

        shutil.rmtree(path)
# Example 22
    def test_run_with_receptors(self):
        """Signal implanting with rate 0.5 on 10 receptors marks exactly half of them."""
        path = PathBuilder.build(EnvironmentSettings.root_path /
                                 "test/tmp/signalImplanter_receptor/")

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            10, {10: 1}, {12: 1}, {}, path / "dataset/")

        motif = Motif(identifier="motif1",
                      instantiation=GappedKmerInstantiation(),
                      seed_chain1="AAA",
                      name_chain1=Chain.ALPHA,
                      seed_chain2="CCC",
                      name_chain2=Chain.BETA)
        signal = Signal(identifier="signal1",
                        motifs=[motif],
                        implanting_strategy=ReceptorImplanting(
                            GappedMotifImplanting()))

        simulation = Simulation(
            [Implanting(dataset_implanting_rate=0.5, signals=[signal])])
        sim_state = SimulationState(dataset=dataset,
                                    result_path=path,
                                    simulation=simulation,
                                    signals=[signal],
                                    formats=["ImmuneML"])

        new_dataset = SignalImplanter.run(sim_state)

        self.assertEqual(10, new_dataset.get_example_count())
        implanted = [receptor for receptor in new_dataset.get_data(40)
                     if receptor.metadata["signal1"] is True]
        self.assertEqual(5, len(implanted))

        shutil.rmtree(path)
# Example 23
    def test_run(self):
        """DatasetExportInstruction exports the repertoire dataset to AIRR format with metadata."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "dataset_export_instruction/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            10, {10: 1}, {12: 1}, {}, path)
        dataset.name = "d1"
        instruction = DatasetExportInstruction(datasets=[dataset],
                                               exporters=[AIRRExporter],
                                               name="export_instr")

        result_path = path / "generated/"
        state = instruction.run(result_path=result_path)

        self.assertTrue(isinstance(state, DatasetExportState))
        self.assertEqual(1, len(state.datasets))
        self.assertEqual(1, len(state.formats))
        self.assertEqual("AIRR", state.formats[0])
        self.assertTrue(os.path.isdir(result_path))

        # one dataset folder containing one format folder with metadata and repertoire files
        airr_dir = state.result_path / f"{dataset.name}/AIRR/"
        self.assertEqual(1, len(list(glob(str(state.result_path / "*/")))))
        self.assertEqual(
            1, len(list(glob(str(state.result_path / f"{dataset.name}/*/")))))
        self.assertTrue(os.path.isdir(str(airr_dir)))
        self.assertTrue(os.path.isfile(str(airr_dir / "metadata.csv")))
        self.assertEqual(10, len(list(glob(str(airr_dir / "repertoires/*")))))

        shutil.rmtree(path)
    def test_generate_sequence_dataset(self):
        """Generated sequence dataset has the requested size, lengths and label values."""
        path = EnvironmentSettings.tmp_test_path / "random_sequence_dataset_generation/"

        dataset = RandomDatasetGenerator.generate_sequence_dataset(
            sequence_count=100,
            length_probabilities={4: 0.5, 5: 0.5},
            labels={"HLA": {"A": 0.5, "B": 0.5}},
            path=path)

        self.assertEqual(SequenceDataset, type(dataset))
        self.assertEqual(100, dataset.get_example_count())

        for sequence in dataset.get_data():
            self.assertTrue(len(sequence.amino_acid_sequence) in [4, 5])
            self.assertTrue(sequence.get_attribute("HLA") in ["A", "B"])

        shutil.rmtree(path)
    def test_fit(self):
        """Train AtchleyKmerMILClassifier on random data, check predictions, and verify a store/load round-trip."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "kmermil")

        n_repertoires = 10
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=n_repertoires,
            sequence_count_probabilities={2: 1},
            sequence_length_probabilities={4: 1},
            labels={"l1": {True: 0.5, False: 0.5}},
            path=path / "dataset")

        encoded = AtchleyKmerEncoder(2, 1, 1, 'relative_abundance', False).encode(
            dataset,
            EncoderParams(path / "result",
                          LabelConfiguration([Label("l1", [True, False])])))

        # Shared hyperparameters so the freshly-constructed and reloaded classifiers match exactly.
        classifier_kwargs = dict(iteration_count=10,
                                 threshold=-0.0001,
                                 evaluate_at=2,
                                 use_early_stopping=False,
                                 random_seed=1,
                                 learning_rate=0.01,
                                 zero_abundance_weight_init=True,
                                 number_of_threads=8)

        cls = AtchleyKmerMILClassifier(**classifier_kwargs)
        cls.fit(encoded.encoded_data, "l1")

        # Class predictions: one boolean per repertoire.
        predictions = cls.predict(encoded.encoded_data, "l1")
        self.assertEqual(n_repertoires, len(predictions["l1"]))
        self.assertEqual(
            n_repertoires,
            len([p for p in predictions["l1"] if isinstance(p, bool)]))

        # Per-repertoire class probabilities should sum to 1, so the total rounds to the repertoire count.
        predictions_proba = cls.predict_proba(encoded.encoded_data, "l1")
        self.assertEqual(n_repertoires, np.rint(np.sum(predictions_proba["l1"])))
        self.assertEqual(n_repertoires, predictions_proba["l1"].shape[0])

        cls.store(path / "model_storage",
                  feature_names=encoded.encoded_data.feature_names)

        cls2 = AtchleyKmerMILClassifier(**classifier_kwargs)
        cls2.load(path / "model_storage")

        # Compare the two instances attribute-by-attribute, skipping the fitted
        # regression object (not directly comparable) and numpy arrays.
        cls2_vars = vars(cls2)
        del cls2_vars["logistic_regression"]
        cls_vars = vars(cls)
        del cls_vars["logistic_regression"]

        for attr_name, attr_value in cls_vars.items():
            if not isinstance(attr_value, np.ndarray):
                self.assertEqual(attr_value, cls2_vars[attr_name])

        model = cls.get_model("l1")
        self.assertEqual(vars(cls), model)

        shutil.rmtree(path)
# Example #26
    def test_run(self):
        """End-to-end GalaxyYamlTool run: import a pickled dataset, generate a random one,
        train LR models over two encodings, and export AIRR files."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "api_galaxy_yaml_tool/")
        result_path = path / "result/"

        # Pre-export a pickled dataset so the spec can import it via its metadata file.
        repertoire_dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            10, {10: 1}, {12: 1}, {}, result_path)
        repertoire_dataset.name = "d1"
        PickleExporter.export(repertoire_dataset, result_path)

        datasets = {
            "new_d1": {
                "format": "Pickle",
                "params": {
                    "metadata_file": str(result_path / "d1_metadata.csv")
                }
            },
            "d2": {
                "format": "RandomRepertoireDataset",
                "params": {
                    "repertoire_count": 50,
                    "sequence_length_probabilities": {10: 1},
                    "sequence_count_probabilities": {10: 1},
                    "labels": {"CD": {True: 0.5, False: 0.5}}
                }
            }
        }

        # Two Word2Vec encodings that differ only in vector size.
        encodings = {
            "e1": {"Word2Vec": {"k": 3, "model_type": "sequence", "vector_size": 8}},
            "e2": {"Word2Vec": {"k": 3, "model_type": "sequence", "vector_size": 10}}
        }

        ml_methods = {
            "simpleLR": {
                "LogisticRegression": {"penalty": "l1"},
                "model_selection_cv": False,
                "model_selection_n_folds": -1
            }
        }

        instructions = {
            "inst1": {
                "type": "DatasetExport",
                "datasets": ["new_d1", "d2"],
                "export_formats": ["AIRR"]
            },
            "inst2": {
                "type": "TrainMLModel",
                "settings": [
                    {"encoding": "e1", "ml_method": "simpleLR"},
                    {"encoding": "e2", "ml_method": "simpleLR"}
                ],
                "assessment": {
                    "split_strategy": "random",
                    "split_count": 1,
                    "training_percentage": 0.7
                },
                "selection": {
                    "split_strategy": "random",
                    "split_count": 2,
                    "training_percentage": 0.7
                },
                "labels": ["CD"],
                "dataset": "d2",
                "strategy": "GridSearch",
                "metrics": ["accuracy", "auc"],
                "reports": [],
                "number_of_processes": 10,
                "optimization_metric": "accuracy",
                "refit_optimal_model": False,
                "store_encoded_data": False
            }
        }

        specs = {
            "definitions": {
                "datasets": datasets,
                "encodings": encodings,
                "ml_methods": ml_methods
            },
            "instructions": instructions
        }

        specs_path = path / "specs.yaml"
        with open(specs_path, "w") as specs_file:
            yaml.dump(specs, specs_file)

        run_immuneML(
            Namespace(
                **{
                    "specification_path": specs_path,
                    "result_path": result_path / "result/",
                    "tool": "GalaxyYamlTool"
                }))

        # Both datasets must have been exported in AIRR format, and d2 generated on disk.
        self.assertTrue(
            os.path.exists(result_path / "result/inst1/new_d1/AIRR"))
        self.assertTrue(os.path.exists(result_path / "result/inst1/d2/AIRR"))
        self.assertTrue(os.path.exists(result_path / "result/d2"))

        shutil.rmtree(path)