Example #1
    def create_dataset(self):
        path = Path(
            os.path.relpath(EnvironmentSettings.root_path /
                            "test/tmp/immunemlapp/initial_dataset"))
        PathBuilder.build(path)

        repertoire_count = 30
        repertoires, metadata = RepertoireBuilder.build(
            [["AA", "AAAA", "AAAA", "AAA"] for i in range(repertoire_count)],
            path, {
                "CD": [
                    'yes' if i % 2 == 0 else 'no'
                    for i in range(repertoire_count)
                ],
                "CMV": [
                    True if i % 2 == 1 else False
                    for i in range(repertoire_count)
                ]
            }, [[{
                "chain": "A" if i % 2 == 0 else "B",
                "count": random.randint(2, 5)
            } for i in range(4)] for j in range(repertoire_count)])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={
                                        "CD": [True, False],
                                        "CMV": [True, False]
                                    },
                                    name="d1")
        PickleExporter.export(dataset, path)

        return path / "d1.iml_dataset"
Example #2
    def import_sequence_dataset(import_class, params, dataset_name: str):
        PathBuilder.build(params.result_path)

        filenames = ImportHelper.get_sequence_filenames(params.path, dataset_name)

        file_index = 0
        dataset_filenames = []
        dataset_params = {}
        items = None

        for index, filename in enumerate(filenames):
            new_items = ImportHelper.import_items(import_class, filename, params)
            items = np.append(items, new_items) if items is not None else new_items
            dataset_params = ImportHelper.extract_sequence_dataset_params(items, params)

            while len(items) > params.sequence_file_size or (index == len(filenames) - 1 and len(items) > 0):
                dataset_filenames.append(params.result_path / "batch_{}.pickle".format(file_index))
                ImportHelper.store_sequence_items(dataset_filenames, items, params.sequence_file_size)
                items = items[params.sequence_file_size:]
                file_index += 1

        init_kwargs = {"filenames": dataset_filenames, "file_size": params.sequence_file_size, "name": dataset_name, "labels": dataset_params}

        dataset = ReceptorDataset(**init_kwargs) if params.paired else SequenceDataset(**init_kwargs)

        PickleExporter.export(dataset, params.result_path)

        return dataset
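
Note on the loop above: sequences accumulate in items until more than sequence_file_size of them are buffered (or the last input file has been read), then full batches are written out and sliced off the front of the array. A minimal standalone sketch of the same chunking pattern, with made-up names rather than immuneML API:

    def split_into_batches(items, batch_size):
        # yield successive slices of at most batch_size elements
        start = 0
        while start < len(items):
            yield items[start:start + batch_size]
            start += batch_size

    # list(split_into_batches(list(range(10)), 4)) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
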
Example #3
    def make_random_dataset(self, path):
        alphabet = EnvironmentSettings.get_sequence_alphabet()
        sequences = [["".join([rn.choice(alphabet) for i in range(20)]) for i in range(100)] for i in range(40)]

        repertoires, metadata = RepertoireBuilder.build(sequences, path, subject_ids=[i % 2 for i in range(len(sequences))])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        PickleExporter.export(dataset, path)
Example #4
    def create_dummy_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path, labels={"label1": ["val1", "val2"], "label2": ["val1", "val2"]})

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        dataset.name = "my_dataset"
        PickleExporter.export(dataset, path)

        return f"{dataset.name}.iml_dataset"
    def import_dataset(params, dataset_name: str) -> ReceptorDataset:
        generic_params = DatasetImportParams.build_object(**params)

        filenames = ImportHelper.get_sequence_filenames(generic_params.path, dataset_name)

        PathBuilder.build(generic_params.result_path, warn_if_exists=True)

        dataset = SingleLineReceptorImport._import_from_files(filenames, generic_params)
        dataset.name = dataset_name
        dataset.labels = ImportHelper.extract_sequence_dataset_params(params=generic_params)

        PickleExporter.export(dataset, generic_params.result_path)

        return dataset
Example #6
    def test_load_receptors(self):
        path = EnvironmentSettings.tmp_test_path / "pickle_import_receptors/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            10, {2: 1}, {3: 1}, {}, path)
        dataset.name = "d1"
        PickleExporter.export(dataset, path)

        receptor_dataset = PickleImport.import_dataset(
            {"path": path / "d1.iml_dataset"}, "dataset_name")

        self.assertEqual(10, len(list(receptor_dataset.get_data())))

        shutil.rmtree(path)
Example #7
    def test_export_receptor_dataset(self):
        path = EnvironmentSettings.tmp_test_path / "pickleexporter_receptor/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            10, {2: 1}, {3: 1}, {}, path)
        dataset.name = "d1"
        PickleExporter.export(dataset, path)

        with open(path / f"{dataset.name}.iml_dataset", "rb") as file:
            dataset2 = pickle.load(file)

        self.assertTrue(isinstance(dataset2, ReceptorDataset))
        self.assertEqual(10, dataset2.get_example_count())

        shutil.rmtree(path)
Example #8
    def import_repertoire_dataset(import_class, params: DatasetImportParams,
                                  dataset_name: str) -> RepertoireDataset:
        """
        Function to create a dataset from the metadata and a list of repertoire files and exports dataset pickle file

        Arguments:
            import_class: class to use for import
            params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
            dataset_name: user-defined name of the dataset

        Returns:
            RepertoireDataset object that was created
        """
        metadata = pd.read_csv(params.metadata_file, sep=",")

        ParameterValidator.assert_keys_present(
            metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
            f'{dataset_name}: params: metadata_file')

        PathBuilder.build(params.result_path / "repertoires/")

        arguments = [(import_class, row, params)
                     for index, row in metadata.iterrows()]
        with Pool(params.number_of_processes) as pool:
            repertoires = pool.starmap(ImportHelper.load_repertoire_as_object,
                                       arguments)

        new_metadata_file = ImportHelper.make_new_metadata_file(
            repertoires, metadata, params.result_path, dataset_name)

        potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
        dataset = RepertoireDataset(labels={
            key: list(set(metadata[key].values.tolist()))
            for key in potential_labels
        },
                                    repertoires=repertoires,
                                    metadata_file=new_metadata_file,
                                    name=dataset_name)

        PickleExporter.export(dataset, params.result_path)

        return dataset
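
The Pool(params.number_of_processes) block above parallelizes repertoire loading: starmap unpacks each (import_class, row, params) tuple into the positional arguments of ImportHelper.load_repertoire_as_object, one call per metadata row. A minimal standalone sketch of that pattern using only the standard library (the worker and data are placeholders, not immuneML code):

    from multiprocessing import Pool

    def load_row(name, index, scale):
        # placeholder worker standing in for ImportHelper.load_repertoire_as_object
        return f"{name}_{index * scale}"

    if __name__ == "__main__":
        arguments = [("rep", i, 2) for i in range(4)]
        with Pool(2) as pool:
            results = pool.starmap(load_row, arguments)
        print(results)  # ['rep_0', 'rep_2', 'rep_4', 'rep_6']
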
Example #9
    def test_export(self):
        path = EnvironmentSettings.tmp_test_path / "pickleexporter/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        PickleExporter.export(dataset, path)

        with open(path / f"{dataset.name}.iml_dataset", "rb") as file:
            dataset2 = pickle.load(file)

        shutil.rmtree(path)

        self.assertTrue(isinstance(dataset2, RepertoireDataset))
        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual("rep_0",
                         dataset2.get_data()[0].metadata["subject_id"])
Example #10
    def prepare_dataset(self, path):
        PathBuilder.build(path)
        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
            path=path,
            labels={
                "l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={
                                        "l1": [1, 2],
                                        "l2": [0, 1]
                                    },
                                    name="dataset1")
        PickleExporter.export(dataset, path)
Example #11
    def test_run(self):

        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "api_galaxy_yaml_tool/")
        result_path = path / "result/"

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            10, {10: 1}, {12: 1}, {}, result_path)
        dataset.name = "d1"
        PickleExporter.export(dataset, result_path)

        specs = {
            "definitions": {
                "datasets": {
                    "new_d1": {
                        "format": "Pickle",
                        "params": {
                            "metadata_file":
                            str(result_path / "d1_metadata.csv")
                        }
                    },
                    "d2": {
                        "format": "RandomRepertoireDataset",
                        "params": {
                            "repertoire_count": 50,
                            "sequence_length_probabilities": {
                                10: 1
                            },
                            "sequence_count_probabilities": {
                                10: 1
                            },
                            "labels": {
                                "CD": {
                                    True: 0.5,
                                    False: 0.5
                                }
                            }
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "Word2Vec": {
                            "k": 3,
                            "model_type": "sequence",
                            "vector_size": 8,
                        }
                    },
                    "e2": {
                        "Word2Vec": {
                            "k": 3,
                            "model_type": "sequence",
                            "vector_size": 10,
                        }
                    },
                },
                "ml_methods": {
                    "simpleLR": {
                        "LogisticRegression": {
                            "penalty": "l1"
                        },
                        "model_selection_cv": False,
                        "model_selection_n_folds": -1,
                    }
                },
            },
            "instructions": {
                "inst1": {
                    "type": "DatasetExport",
                    "datasets": ["new_d1", 'd2'],
                    "export_formats": ["AIRR"]
                },
                "inst2": {
                    "type":
                    "TrainMLModel",
                    "settings": [{
                        "encoding": "e1",
                        "ml_method": "simpleLR"
                    }, {
                        "encoding": "e2",
                        "ml_method": "simpleLR"
                    }],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 2,
                        "training_percentage": 0.7
                    },
                    "labels": ["CD"],
                    "dataset":
                    "d2",
                    "strategy":
                    "GridSearch",
                    "metrics": ["accuracy", "auc"],
                    "reports": [],
                    "number_of_processes":
                    10,
                    "optimization_metric":
                    "accuracy",
                    'refit_optimal_model':
                    False,
                    "store_encoded_data":
                    False
                }
            }
        }

        specs_path = path / "specs.yaml"
        with open(specs_path, "w") as file:
            yaml.dump(specs, file)

        run_immuneML(
            Namespace(
                **{
                    "specification_path": specs_path,
                    "result_path": result_path / 'result/',
                    'tool': "GalaxyYamlTool"
                }))

        self.assertTrue(
            os.path.exists(result_path / "result/inst1/new_d1/AIRR"))
        self.assertTrue(os.path.exists(result_path / "result/inst1/d2/AIRR"))
        self.assertTrue(os.path.exists(result_path / "result/d2"))

        shutil.rmtree(path)
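
Each of these tests serializes a nested specs dict with yaml.dump before handing the file to immuneML. For plain dicts of built-in types the dump/load round-trip is lossless; a minimal sketch:

    import yaml

    specs = {"definitions": {"datasets": {"d1": {"format": "Pickle"}}},
             "instructions": {}}

    with open("specs.yaml", "w") as file:
        yaml.dump(specs, file)

    with open("specs.yaml") as file:
        assert yaml.safe_load(file) == specs  # loads back as an equal dict
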
Example #12
    def store(encoded_dataset, params: EncoderParams):
        PickleExporter.export(encoded_dataset, params.result_path)
Example #13
    def test_parse_yaml_file(self):
        path = EnvironmentSettings.root_path / "test/tmp/parser/"
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["AAA", "CCC"], ["TTTT"]], path, {"default": [1, 2]})[0],
                                    labels={"default": [1, 2]})
        PickleExporter.export(dataset, path)

        spec = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "Pickle",
                        "params": {
                            "path": str(path / f"{dataset.name}.iml_dataset"),
                        }
                    }
                },
                "encodings": {
                    "a1": {
                        "Word2Vec": {
                            "k": 3,
                            "model_type": "sequence",
                            "vector_size": 8,
                        }
                    },
                    "a2": "Word2Vec"
                },
                "ml_methods": {
                    "simpleLR": {
                        "LogisticRegression": {
                            "penalty": "l1"
                        },
                        "model_selection_cv": False,
                        "model_selection_n_folds": -1,
                    },
                    "simpleLR2": "LogisticRegression"
                },
                "reports": {
                    "rep1": "SequenceLengthDistribution"
                }
            },
            "instructions": {}
        }

        PathBuilder.build(path)

        specs_filename = path / "tmp_yaml_spec.yaml"

        with specs_filename.open("w") as file:
            yaml.dump(spec, file, default_flow_style=False)

        symbol_table, _ = ImmuneMLParser.parse_yaml_file(specs_filename,
                                                         result_path=path)

        self.assertTrue(
            all([
                symbol_table.contains(key)
                for key in ["simpleLR", "rep1", "a1", "d1"]
            ]))
        self.assertTrue(isinstance(symbol_table.get("d1"), RepertoireDataset))

        with self.assertRaises(YAMLError):
            with specs_filename.open("r") as file:
                specs_text = file.readlines()
            specs_text[0] = "        definitions:"
            with specs_filename.open("w") as file:
                file.writelines(specs_text)

            ImmuneMLParser.parse_yaml_file(specs_filename, result_path=path)

        shutil.rmtree(path)
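
The tail of Example #13 deliberately over-indents the first line of the spec so that parse_yaml_file hits invalid YAML and raises YAMLError. The same failure can be reproduced with PyYAML alone; a minimal sketch:

    import yaml

    broken = "        definitions:\ndatasets: {}"  # root keys at mismatched indents
    try:
        yaml.safe_load(broken)
    except yaml.YAMLError as error:
        print("rejected:", type(error).__name__)
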
Example #14
    def test_encoding(self):

        path = EnvironmentSettings.tmp_test_path / "integration_test_emerson_encoding/"
        PathBuilder.build(path)

        ref_path = path / "reference.csv"
        pd.DataFrame({
            "sequence_aas": ["GGG", "III", "TTT", "EFEF"],
            "v_alleles": ["TRBV6-1*01"] * 4,
            "j_alleles": ["TRBJ2-7"] * 4
        }).to_csv(ref_path, index=False)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
             ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]] * 4,
            labels={"l1": [True, True, False, False] * 4},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={"l1": [True, False]})
        PickleExporter.export(dataset, path)

        specs = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "Pickle",
                        "params": {
                            "path": str(path / f"{dataset.name}.iml_dataset"),
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "SequenceAbundance": {
                            'comparison_attributes':
                            ["sequence_aas", "v_alleles", "j_alleles"]
                        }
                    }
                },
                "ml_methods": {
                    "knn": {
                        "KNN": {
                            "n_neighbors": 1
                        },
                    }
                },
                "reports": {
                    "r1": {
                        "ReferenceSequenceOverlap": {
                            "reference_path":
                            str(ref_path),
                            'comparison_attributes':
                            ["sequence_aas", "v_alleles", "j_alleles"]
                        }
                    }
                }
            },
            "instructions": {
                "inst1": {
                    "type": "TrainMLModel",
                    "settings": [{
                        "encoding": "e1",
                        "ml_method": "knn"
                    }],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7,
                        "reports": {}
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7,
                    },
                    "labels": [{
                        "l1": {
                            "positive_class": True
                        }
                    }],
                    "dataset": "d1",
                    "strategy": "GridSearch",
                    "metrics": ["accuracy"],
                    "number_of_processes": 2,
                    "reports": ["r1"],
                    "optimization_metric": "balanced_accuracy",
                    "refit_optimal_model": True,
                    "store_encoded_data": False
                }
            }
        }

        specs_file = path / "specs.yaml"
        with open(specs_file, "w") as file:
            yaml.dump(specs, file)

        app = ImmuneMLApp(specs_file, path / "result")
        app.run()

        shutil.rmtree(path)
Example #15
    def test_generate(self):

        path = EnvironmentSettings.tmp_test_path / "disease_assoc_seq_cv/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"]] * 7,
            labels={"l1": [True, False] * 7},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={"l1": [True, False]})
        PickleExporter.export(dataset, path)

        specs = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "Pickle",
                        "params": {
                            "path": str(path / f"{dataset.name}.iml_dataset"),
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "SequenceAbundance": {
                            "p_value_threshold": 0.5
                        }
                    }
                },
                "ml_methods": {
                    "knn": {
                        "KNN": {
                            "n_neighbors": 1
                        },
                    }
                },
                "reports": {
                    "r1": {
                        "DiseaseAssociatedSequenceCVOverlap": {
                            "compare_in_selection": True,
                            "compare_in_assessment": True
                        }
                    }
                }
            },
            "instructions": {
                "inst1": {
                    "type": "TrainMLModel",
                    "settings": [{
                        "encoding": "e1",
                        "ml_method": "knn"
                    }],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.5,
                        "reports": {}
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.5,
                    },
                    "labels": [{
                        "l1": {
                            "positive_class": True
                        }
                    }],
                    "dataset": "d1",
                    "strategy": "GridSearch",
                    "metrics": ["accuracy"],
                    "number_of_processes": 2,
                    "reports": ["r1"],
                    "optimization_metric": "balanced_accuracy",
                    "refit_optimal_model": True,
                    "store_encoded_data": False
                }
            }
        }

        specs_file = path / "specs.yaml"
        with open(specs_file, "w") as file:
            yaml.dump(specs, file)

        app = ImmuneMLApp(specs_file, path / "result/")
        state = app.run()[0]

        self.assertEqual(1, len(state.report_results))
        self.assertTrue(len(state.report_results[0].output_figures) > 0)
        self.assertTrue(len(state.report_results[0].output_tables) > 0)

        for fig in state.report_results[0].output_figures:
            self.assertTrue(os.path.isfile(fig.path))
        for table in state.report_results[0].output_tables:
            self.assertTrue(os.path.isfile(table.path))

        shutil.rmtree(path)