Example #1
    def test_load_repertoire_with_stop_codon(self):
        path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        self.write_dummy_files(path, True)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "igor")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["import_with_stop_codon"] = True
        params["metadata_file"] = path + "metadata.csv"

        dataset_stop_codons = IGoRImport.import_dataset(
            params, "igor_dataset_stop")

        self.assertEqual(2, dataset_stop_codons.get_example_count())
        self.assertEqual(len(dataset_stop_codons.repertoires[0].sequences), 2)
        self.assertEqual(len(dataset_stop_codons.repertoires[1].sequences), 2)

        self.assertEqual(dataset_stop_codons.repertoires[0].sequences[0].amino_acid_sequence,
                         "ARVNRHIVVVTAIMTG*NWFDP")

        shutil.rmtree(path)
Example #2
    def standard_scale(scaler_file: str, design_matrix, with_mean: bool = True):
        """
        scale to zero mean and unit variance on feature level
        :param scaler_file: path to scaler file fitted on train set or where the resulting scaler file will be stored
        :param design_matrix: rows -> examples, columns -> features
        :param with_mean: whether to scale to zero mean or not (could lose sparsity if scaled)
        :return: scaled design matrix
        """

        if with_mean and hasattr(design_matrix, "todense"):
            scaled_design_matrix = design_matrix.todense()
        else:
            scaled_design_matrix = design_matrix

        if os.path.isfile(scaler_file):
            with open(scaler_file, 'rb') as file:
                scaler = pickle.load(file)
                scaled_design_matrix = scaler.transform(scaled_design_matrix)
        else:
            scaler = StandardScaler(with_mean=with_mean)
            scaled_design_matrix = scaler.fit_transform(scaled_design_matrix)

            PathBuilder.build(os.path.dirname(scaler_file))

            with open(scaler_file, 'wb') as file:
                pickle.dump(scaler, file)

        return scaled_design_matrix
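
The method above follows a fit-once, reuse-later pattern: the first call fits a scaler and pickles it, and every later call loads the pickle and only transforms. A minimal, self-contained sketch of that pattern using just scikit-learn and the standard library; the file name and toy matrices are made up for illustration:

import pickle

import numpy as np
from sklearn.preprocessing import StandardScaler

scaler_file = "scaler.pkl"  # hypothetical path
train = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
test = np.array([[2.0, 3.0]])

# first call: no scaler on disk, so fit on the train matrix and persist it
scaler = StandardScaler(with_mean=True)
scaled_train = scaler.fit_transform(train)
with open(scaler_file, "wb") as file:
    pickle.dump(scaler, file)

# later call: load the fitted scaler and only transform, so the test matrix
# is scaled with the train-set statistics rather than its own
with open(scaler_file, "rb") as file:
    scaler = pickle.load(file)
scaled_test = scaler.transform(test)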
Example #3
    def __init__(self, specification_path: str, result_path: str):
        self._specification_path = specification_path
        self._result_path = os.path.relpath(result_path) + "/"

        PathBuilder.build(self._result_path)

        self._cache_path = f"{self._result_path}cache/"
Example #4
    def build_path(self, path: str = None):
        if path is None:
            path = EnvironmentSettings.root_path + "quickstart/"
            if os.path.isdir(path):
                shutil.rmtree(path)
            PathBuilder.build(path)
        return path
Example #5
    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)
        data_long_format = DataReshaper.reshape(self.dataset)
        table_result = self._write_results_table(data_long_format)
        report_output_fig = self._safe_plot(data_long_format=data_long_format)
        output_figures = None if report_output_fig is None else [report_output_fig]
        return ReportResult(self.name, output_figures, [table_result])
Example #6
    def test_make_subset(self):
        sequences = []
        for i in range(100):
            sequences.append(
                ReceptorSequence(amino_acid_sequence="AAA", identifier=str(i)))

        path = EnvironmentSettings.tmp_test_path + "element_generator_subset/"
        PathBuilder.build(path)

        for i in range(10):
            with open("{}batch{}.pkl".format(path, i), "wb") as file:
                sequences_to_pickle = sequences[i * 10:(i + 1) * 10]
                pickle.dump(sequences_to_pickle, file)

        d = SequenceDataset(
            filenames=["{}batch{}.pkl".format(path, i) for i in range(10)],
            file_size=10)

        indices = [1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92]

        d2 = d.make_subset(indices, path, SequenceDataset.TRAIN)

        for batch in d2.get_batch(1000):
            for sequence in batch:
                self.assertTrue(int(sequence.identifier) in indices)

        self.assertEqual(15, d2.get_example_count())

        shutil.rmtree(path)
Example #7
    def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
        """
        Creates a dataset from a metadata file and a list of repertoire files, and exports the dataset as a pickle file.

        Arguments:
            import_class: class to use for import
            params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
            dataset_name: user-defined name of the dataset

        Returns:
            RepertoireDataset object that was created
        """
        metadata = pd.read_csv(params.metadata_file, sep=",")

        ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                               f'{dataset_name}: params: metadata_file')

        PathBuilder.build(params.result_path + "repertoires/")

        arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
        with Pool(params.number_of_processes) as pool:
            repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

        new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

        potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
        dataset = RepertoireDataset(params={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                    repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)

        PickleExporter.export(dataset, params.result_path)

        return dataset
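
The import above parallelizes over metadata rows with Pool.starmap, passing one (import_class, row, params) tuple per repertoire. A minimal, self-contained sketch of that dispatch pattern; load_row and the row tuples below are hypothetical stand-ins for load_repertoire_as_object and the real arguments:

from multiprocessing import Pool

def load_row(name: str, count: int) -> str:
    # stand-in for ImportHelper.load_repertoire_as_object
    return f"{name}:{count}"

if __name__ == "__main__":
    rows = [("rep1", 10), ("rep2", 20), ("rep3", 30)]
    with Pool(2) as pool:
        # starmap unpacks each tuple into load_row's arguments and
        # preserves the input order in the results
        results = pool.starmap(load_row, rows)
    print(results)  # ['rep1:10', 'rep2:20', 'rep3:30']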
Example #8
    def create_dataset(self, path, dataset_size: int = 50):

        sequences = []

        for i in range(dataset_size):
            if i % 2 == 0:
                sequences.append(
                    ReceptorSequence(
                        amino_acid_sequence="AAACCC",
                        identifier=str(i),
                        metadata=SequenceMetadata(custom_params={"l1": 1})))
            else:
                sequences.append(
                    ReceptorSequence(
                        amino_acid_sequence="ACACAC",
                        identifier=str(i),
                        metadata=SequenceMetadata(custom_params={"l1": 2})))

        PathBuilder.build(path)
        filename = "{}sequences.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = SequenceDataset(params={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")
        return dataset
Example #9
    def test_encode(self):
        path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                         ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                         ["CCC", "FFF", "MMM"],
                                                         ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
                                                        labels={"l1": [True, True, False, False]}, path=path)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

        encoder = SequenceAbundanceEncoder.build_object(dataset, **{
            "comparison_attributes": ["sequence_aas"],
            "p_value_threshold": 0.4, "sequence_batch_size": 4, "repertoire_batch_size": 8
        })

        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

        self.assertTrue(np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]), encoded_dataset.encoded_data.examples))

        encoder.p_value_threshold = 0.05

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

        self.assertTrue(np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]), encoded_dataset.encoded_data.examples))

        shutil.rmtree(path)
Example #10
    def test_import_sequences(self):
        path = EnvironmentSettings.tmp_test_path + "importseqsiris/sequences.csv"
        PathBuilder.build(os.path.dirname(path))
        with open(path, "w") as file:
            file.write(
                "Cell type	Clonotype ID	Chain: TRA (1)	TRA - V gene (1)	TRA - D gene (1)	TRA - J gene (1)	Chain: TRA (2)	TRA - V gene (2)	\
                TRA - D gene (2)	TRA - J gene (2)	Chain: TRB (1)	TRB - V gene (1)	TRB - D gene (1)	TRB - J gene (1)	Chain: TRB (2)	TRB - V \
                gene (2)	TRB - D gene (2)	TRB - J gene (2)\n\
                TCR_AB	181	LVGG	TRAV4*01	null	TRAJ4*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	591	AL	TRAV9-2*01	null	TRAJ21*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	1051	VVNII	TRAV12-1*01	null	TRAJ3*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	1341	LNKLT	TRAV2*01	null	TRAJ10*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	1411	AVLY	TRAV8-3*01	null	TRAJ18*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	1421	AT	TRAV12-3*01	null	TRAJ17*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	1671	AVLI	TRAV12*01	null	TRAJ33*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	1901	LVGKLI	TRAV4*01	null	TRAJ4*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	2021	YSSASKII	TRAV2-1*01	null	TRAJ3*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	2251	ARLY	TRAV4/DV5*01	null	TRAJ18*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	2791	IEFN	TRAV26-1*01	null	TRAJ20*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	3031	TLGRLY	TRAV8-3*01	null	TRAJ18*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	3241	AVGLY	TRAV8-3*01	null	TRAJ18*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	3511	KII	TRAV12-1*01	null	TRAJ3*01	null	null	null	null	null	null	null	null	null	null	null	null\n\
                TCR_AB	3821	LVGD	TRAV8*01	null	TRAJ4*01	null	null	null	null	null	null	null	null	null	null	null	null\n"
            )

        sequences = IRISSequenceImport.import_items(path)

        self.assertEqual(15, len(sequences))
        self.assertTrue(
            all(
                isinstance(sequence, ReceptorSequence)
                for sequence in sequences))
        self.assertEqual("LVGG", sequences[0].get_sequence())

        shutil.rmtree(os.path.dirname(path))
Example #11
    def test_import_paired_sequences(self):
        path = EnvironmentSettings.tmp_test_path + "importseqsiris/sequences.csv"
        PathBuilder.build(os.path.dirname(path))

        with open(path, "w") as file:
            file.write(
                "Cell type	Clonotype ID	Chain: TRA (1)	TRA - V gene (1)	TRA - D gene (1)	TRA - J gene (1)	Chain: TRA (2)	TRA - V gene (2)	\
                TRA - D gene (2)	TRA - J gene (2)	Chain: TRB (1)	TRB - V gene (1)	TRB - D gene (1)	TRB - J gene (1)	Chain: TRB (2)	TRB - V \
                gene (2)	TRB - D gene (2)	TRB - J gene (2)\n\
                TCR_AB	540891	ATDIWSNFGNEKLT	TRAV17*01		TRAJ48*01	null	null	null	null	SARVRNYQETQY	TRBV20-1*01	TRBD1*01	TRBJ2-5*01	null	null	null	null\n\
                TCR_AB	540892	AASAGDDKII	TRAV29/DV5*01		TRAJ30*01	null	null	null	null	ASRPTGTVDYEQY	TRBV5-1*01	TRBD1*01	TRBJ2-7*01	null	null	null	null\n\
                TCR_AB	540893	AAYTSGTYKYI	TRAV8-1*01		TRAJ40*01	null	null	null	null	ASSLTGMNTEAF	TRBV11-1*01	TRBD2*01	TRBJ1-1*01	null	null	null	null\n\
                TCR_AB	54084	ALLSRSGGYQKVT	TRAV12-2*01		TRAJ13*02	null	null	null	null	SARDNQETQY	TRBV20-1*01	TRBD1*01	TRBJ2-5*01	null	null	null	null\n\
                TCR_AB	540895	AYRSRIQGAQKLV	TRAV38-2/DV8*01		TRAJ54*01	null	null	null	null	ASSHGTSGSGEQY	TRBV7-9*01	TRBD2*02	TRBJ2-7*01	null	null	null	null\n"
            )

        paired_sequences = IRISSequenceImport.import_items(path, paired=True)

        self.assertEqual(5, len(paired_sequences))
        self.assertTrue(
            all(
                isinstance(sequence, TCABReceptor)
                for sequence in paired_sequences))

        self.assertEqual("ATDIWSNFGNEKLT",
                         paired_sequences[0].alpha.get_sequence())
        self.assertEqual("SARVRNYQETQY",
                         paired_sequences[0].beta.get_sequence())

        shutil.rmtree(os.path.dirname(path))
Example #12
    def test_sequence_flattened(self):
        path = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"

        PathBuilder.build(path)

        dataset = self.construct_test_flatten_dataset(path)

        encoder = OneHotEncoder.build_object(dataset, **{"use_positional_info": False, "distance_to_seq_middle": None, "flatten": True})

        encoded_data = encoder.encode(dataset, EncoderParams(
            result_path=path,
            label_config=LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")]),
            pool_size=1,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        self.assertTrue(isinstance(encoded_data, SequenceDataset))

        onehot_a = [1.0] + [0.0] * 19
        onehot_t = [0.0] * 16 + [1.0] + [0.0] * 3

        self.assertListEqual(list(encoded_data.encoded_data.examples[0]), onehot_a+onehot_a+onehot_a+onehot_t+onehot_t+onehot_t)
        self.assertListEqual(list(encoded_data.encoded_data.examples[1]), onehot_a+onehot_t+onehot_a+onehot_t+onehot_a+onehot_t)

        self.assertListEqual(list(encoded_data.encoded_data.feature_names), [f"{pos}_{char}" for pos in range(6) for char in EnvironmentSettings.get_sequence_alphabet()])
        shutil.rmtree(path)
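
The flattened one-hot layout asserted above maps each residue to a 20-dimensional indicator vector and concatenates them, so a length-6 sequence becomes one 120-dimensional row. A small sketch under the assumption that the alphabet is the 20 amino acids in alphabetical order (which makes 'A' index 0 and 'T' index 16, matching onehot_a and onehot_t):

import numpy as np

alphabet = sorted("ACDEFGHIKLMNPQRSTVWY")  # assumed amino acid alphabet

def one_hot_flatten(sequence: str) -> np.ndarray:
    # one row per position, one column per alphabet letter
    matrix = np.zeros((len(sequence), len(alphabet)))
    for position, letter in enumerate(sequence):
        matrix[position, alphabet.index(letter)] = 1.0
    return matrix.flatten()

encoded = one_hot_flatten("AAATTT")
assert list(encoded[:20]) == [1.0] + [0.0] * 19               # 'A' at position 0
assert list(encoded[-20:]) == [0.0] * 16 + [1.0] + [0.0] * 3  # 'T' at position 5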
Example #13
    def test(self):
        path = EnvironmentSettings.tmp_test_path + "onehot_sequence/"
        PathBuilder.build(path)

        dataset, lc = self._construct_test_dataset(path)

        encoder = OneHotEncoder.build_object(dataset, **{"use_positional_info": False,
                                                         "distance_to_seq_middle": None,
                                                         "flatten": False})

        encoded_data = encoder.encode(dataset, EncoderParams(
            result_path=f"{path}encoded/",
            label_config=lc,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        self.assertTrue(isinstance(encoded_data, SequenceDataset))

        onehot_a = [1] + [0] * 19
        onehot_t = [0] * 16 + [1] + [0] * 3
        onehot_empty = [0] * 20

        self.assertListEqual([list(item) for item in encoded_data.encoded_data.examples[0]], [onehot_a for i in range(4)])
        self.assertListEqual([list(item) for item in encoded_data.encoded_data.examples[1]], [onehot_a, onehot_t, onehot_a, onehot_empty])
        self.assertListEqual([list(item) for item in encoded_data.encoded_data.examples[2]], [onehot_a, onehot_t, onehot_t, onehot_empty])

        self.assertListEqual(encoded_data.encoded_data.example_ids, [receptor.identifier for receptor in dataset.get_data()])
        self.assertDictEqual(encoded_data.encoded_data.labels,
                             {"l1": [receptor_seq.get_attribute("l1") for receptor_seq in dataset.get_data()],
                              "l2": [receptor_seq.get_attribute("l2") for receptor_seq in dataset.get_data()]})

        shutil.rmtree(path)
Example #14
    def test_load_sequence_dataset(self):
        """Test dataset content with and without a header included in the input file"""
        path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        self.write_dummy_files(path, False)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "igor")
        params["is_repertoire"] = False
        params["paired"] = False
        params["result_path"] = path
        params["path"] = path
        params["import_with_stop_codon"] = True

        dataset = IGoRImport.import_dataset(params, "igor_seq_dataset")

        seqs = [sequence for sequence in dataset.get_data()]

        self.assertEqual(4, dataset.get_example_count())

        self.assertEqual(
            "GCGAGACGTGTCTAGGGAGGATATTGTAGTAGTACCAGCTGCTATGACGGGCGGTCCGGTAGTACTACTTTGACTAC",
            seqs[0].nucleotide_sequence)
        self.assertEqual(
            "GCGAGAGGCTTCCATGGAACTACAGTAACTACGTTTGTAGGCTGTAGTACTACATGGACGTC",
            seqs[1].nucleotide_sequence)
        self.assertEqual(
            "GCGAGAGTTAATCGGCATATTGTGGTGGTGACTGCTATTATGACCGGGTAAAACTGGTTCGACCCC",
            seqs[2].nucleotide_sequence)
        self.assertEqual(
            "GCGAGAGATAGGTGGTCAACCCCAGTATTACGATATTTTGACTGGTGGACCCCGCCCTACTACTACTACATGGACGTC",
            seqs[3].nucleotide_sequence)

        shutil.rmtree(path)
Example #15
    def test_import_repertoire_dataset(self):
        path = EnvironmentSettings.root_path + "test/tmp/io_10xGenomics/"
        PathBuilder.build(path)
        self.create_dummy_dataset(path, add_metadata=True)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/",
            "tenx_genomics")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["metadata_file"] = path + "metadata.csv"

        dataset = TenxGenomicsImport.import_dataset(params,
                                                    "tenx_dataset_repertoire")

        self.assertEqual(2, dataset.get_example_count())

        self.assertEqual(len(dataset.repertoires[0].sequences), 2)
        self.assertEqual(len(dataset.repertoires[1].sequences), 4)

        self.assertEqual(
            dataset.repertoires[0].sequences[0].amino_acid_sequence,
            "ALSGTGGYKVV")
        self.assertListEqual([Chain.ALPHA, Chain.BETA],
                             list(dataset.repertoires[0].get_chains()))
        self.assertListEqual([2, 4], list(dataset.repertoires[0].get_counts()))

        shutil.rmtree(path)
Example #16
    def test_repertoire_export(self):
        path = EnvironmentSettings.tmp_test_path + "airr_exporter_repertoire/"
        PathBuilder.build(path)

        repertoire, metadata_path = self.create_dummy_repertoire(path)
        dataset = RepertoireDataset(repertoires=[repertoire],
                                    metadata_file=metadata_path)

        path_exported = f"{path}exported/"
        AIRRExporter.export(dataset, path_exported)

        resulting_data = pd.read_csv(
            path_exported + f"repertoires/{repertoire.identifier}.tsv",
            sep="\t")

        self.assertListEqual(list(resulting_data["sequence_id"]),
                             ["receptor_1", "receptor_2"])
        self.assertListEqual(list(resulting_data["cdr3"]),
                             ["GCTGCTGCT", "GGTGGTGGT"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAA", "GGG"])
        self.assertListEqual(list(resulting_data["v_call"]),
                             ["TRBV1", "TRAV2*01"])
        self.assertListEqual(list(resulting_data["j_call"]),
                             ["TRBJ1", "TRAJ2"])
        self.assertListEqual(list(resulting_data["d_call"]),
                             ["TRBD1", "TRAD2"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRB", "TRA"])
        self.assertListEqual(list(resulting_data["duplicate_count"]), [5, 15])
        self.assertListEqual(list(resulting_data["custom_test"]),
                             ["cust1", "cust2"])
        self.assertListEqual(list(resulting_data["productive"]), ['T', nan])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F', nan])

        shutil.rmtree(path)
Example #17
    def test_import_receptor_dataset(self):
        path = EnvironmentSettings.root_path + "test/tmp/io_10xGenomics/"
        PathBuilder.build(path)
        self.create_dummy_dataset(path, add_metadata=False)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/",
            "tenx_genomics")
        params["is_repertoire"] = False
        params["paired"] = True
        params["result_path"] = path
        params["path"] = path
        params["sequence_file_size"] = 1
        params["receptor_chains"] = "TRA_TRB"

        dataset = TenxGenomicsImport.import_dataset(params,
                                                    "tenx_dataset_receptor")

        self.assertEqual(2, dataset.get_example_count())
        self.assertEqual(2, len(dataset.get_filenames()))

        data = dataset.get_data(1)
        for receptor in data:
            self.assertTrue(receptor.alpha.amino_acid_sequence in
                            ["ALSGTGGYKVV", "AIVGNTGKLI"])
            self.assertTrue(receptor.beta.amino_acid_sequence in
                            ["ASSLYGGPEVF", "ASSFATNSDYT"])

        shutil.rmtree(path)
Example #18
    def test_load_repertoire_dataset_minimal(self):
        # loading with minimal data (no dual genes, no duplicate V/J segments)

        number_of_repertoires = 5

        path = EnvironmentSettings.tmp_test_path + "importseqsiris_mini/"
        PathBuilder.build(path)
        self._create_dummy_data(path, number_of_repertoires=number_of_repertoires, add_metadata=True)

        # case: minimal dataset (all dual chains and all genes = False)
        dataset = IRISImport.import_dataset({"is_repertoire": True, "result_path": path, "metadata_file": path + "metadata.csv", "path": path,
                                             "import_dual_chains": False, "import_all_gene_combinations": False, "separator": "\t",
                                             "extra_columns_to_load": ["extra_col"], "receptor_chains": "TRA_TRB"}, "iris_dataset")

        self.assertEqual(number_of_repertoires, dataset.get_example_count())
        self.assertEqual(number_of_repertoires, len(dataset.get_data()))

        for repertoire in dataset.get_data(2):
            self.assertTrue(repertoire.metadata["label1"] in {0, 1})
            self.assertEqual(7, len(repertoire.sequences))  # 6 alpha + 1 beta
            self.assertEqual(1, len(repertoire.receptors))  # 1 alpha/beta pair (dual chain (1))
            self.assertListEqual([Chain.ALPHA for i in range(6)] + [Chain.BETA], list(repertoire.get_chains()))
            self.assertEqual(None, repertoire.get_counts())

        shutil.rmtree(path)
Example #19
    def import_sequence_dataset(import_class, params, dataset_name: str):
        PathBuilder.build(params.result_path)

        filenames = ImportHelper.get_sequence_filenames(params.path, dataset_name)

        file_index = 0
        dataset_filenames = []
        dataset_params = {}
        items = None

        for index, filename in enumerate(filenames):
            new_items = ImportHelper.import_items(import_class, filename, params)
            items = np.append(items, new_items) if items is not None else new_items
            dataset_params = ImportHelper.extract_sequence_dataset_params(items, params)

            while len(items) > params.sequence_file_size or (index == len(filenames) - 1 and len(items) > 0):
                dataset_filenames.append(params.result_path + "batch_{}.pickle".format(file_index))
                ImportHelper.store_sequence_items(dataset_filenames, items, params.sequence_file_size)
                items = items[params.sequence_file_size:]
                file_index += 1

        init_kwargs = {"filenames": dataset_filenames, "file_size": params.sequence_file_size, "name": dataset_name, "params": dataset_params}

        dataset = ReceptorDataset(**init_kwargs) if params.paired else SequenceDataset(**init_kwargs)

        PickleExporter.export(dataset, params.result_path)

        return dataset
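
The while loop above flushes accumulated items into fixed-size batches as files are read, with a final partial flush after the last file. A self-contained sketch of just that batching logic, with file writing replaced by list appends:

def batch_items(files: list, file_size: int) -> list:
    batches, items = [], []
    for index, new_items in enumerate(files):
        items.extend(new_items)
        # flush whenever a full batch has accumulated, plus one final
        # (possibly partial) flush after the last input file
        while len(items) > file_size or (index == len(files) - 1 and len(items) > 0):
            batches.append(items[:file_size])
            items = items[file_size:]
    return batches

print(batch_items([[1, 2, 3], [4, 5], [6]], file_size=2))
# [[1, 2], [3, 4], [5, 6]]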
Example #20
    def _write_paired_matches(self, paired_matches_path) -> List[ReportOutput]:
        PathBuilder.build(paired_matches_path)

        report_outputs = []
        for i in range(len(self.dataset.encoded_data.example_ids)):  # todo don't mention subject in the name twice
            filename = "example_{}_".format(
                self.dataset.encoded_data.example_ids[i])
            filename += "_".join([
                "{label}_{value}".format(label=label, value=values[i])
                for label, values in self.dataset.encoded_data.labels.items()
            ])
            filename += ".csv"
            filename = os.path.join(paired_matches_path, filename)

            if self.dataset.encoded_data.encoding == "MatchedReceptorsEncoder":
                self._write_paired_receptor_matches_for_repertoire(
                    self.dataset.encoded_data.examples[i], filename)
            elif self.dataset.encoded_data.encoding == "MatchedRegexEncoder":
                self._write_paired_regex_matches_for_repertoire(
                    self.dataset.encoded_data.examples[i], filename)

            report_outputs.append(
                ReportOutput(
                    filename,
                    f"example {self.dataset.encoded_data.example_ids[i]} paired matches"
                ))

        return report_outputs
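
The filename built above concatenates the example id with every label/value pair. A runnable sketch of the same construction with hypothetical ids and labels:

import os

example_ids = ["subj1", "subj2"]
labels = {"l1": ["yes", "no"], "l2": [1, 2]}

for i in range(len(example_ids)):
    filename = "example_{}_".format(example_ids[i])
    filename += "_".join(f"{label}_{values[i]}" for label, values in labels.items())
    filename += ".csv"
    print(os.path.join("matches", filename))  # "matches" is a hypothetical output dir
# matches/example_subj1_l1_yes_l2_1.csv
# matches/example_subj2_l1_no_l2_2.csv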
Example #21
    def make_dummy_dataset(self, path, add_metadata):
        rep1text = """Clone ID	Senior Author	TRAJ Gene	TRAV Gene	CDR3A AA Sequence	TRBV Gene	TRBD Gene	TRBJ Gene	CDR3B AA Sequence	Antigen Protein	Antigen Gene	Antigen Species	Antigen Peptide AA #	Epitope Peptide	MHC Class	HLA Restriction	Counts
1E6	Sewell	TRAJ12	TRAV12-3	CAMRGDSSYKLIF	TRBV12-4	TRBD2	TRBJ2-4	CASSLWEKLAKNIQYF	PPI	INS	Human	12-24	ALWGPDPAAA	MHC I	A*02:01	1
4.13	Nepom	TRAJ44	TRAV19	CALSENRGGTASKLTF	TRBV5-1	TRBD1	TRBJ1-1	CASSLVGGPSSEAFF	GAD		Human	555-567		MHC II	DRB1*04:01	3
5	Roep	TRAJ6	TRAV21	CAVKRTGGSYIPTF	TRBV11-2	TRBD1	TRBJ2-2	CASSSFWGSDTGELFF	Insulin B		Human	9-23		MHC II	DQ8	7
D222D 2	Mallone	TRAJ36*01	TRAV17*01	CAVTGANNLFF	TRBV19*01	TRBD1*01	TRBJ2-2*01	CASSIEGPTGELFF	Zinc Transporter 8	ZnT8	Human	185-194	AVAANIVLTV	MHC I	A*02:01	2
GSE.20D11	Nakayama	TRAJ4	TRAV12-3	CAILSGGYNKLIF	TRBV2	TRBD2	TRBJ2-5	CASSAETQYF	Insulin B		Human	9-23		MHC II	DQ8	10
GSE.6H9	Nakayama	TRAJ40	TRAV26-1	CIVRVDSGTYKYIF	TRBV7-2	TRBD2	TRBJ2-1	CASSLTAGLASTYNEQFF	Insulin B		Human	9-23		MHC II	DQ8/DQ8	nan
iGRP 32	DiLorenzo	TRAJ48	TRAV12-1	CVVNILSNFGNEKLTF	TRBV20/OR9-2	TRBD1	TRBJ2-1	CSASRQGWVNEQFF	IGRP		Human	265-273		MHC I	A*02:01	1
MART-1	TBD	TRAJ23	TRAV12-2	CAVNFGGGKLIF	TRBV6-4	TRBD2	TRBJ1-1	CASSLSFGTEAFF	Melan A		Human	27-35	ELAGIGILTV	MHC I	A2	3
MHB10.3	TBD	TRAJ27	TRAV4	CLVGDSLNTNAGKSTF	TRBV29-1	TRBD2	TRBJ2-2	CSVEDRNTGELFF	Insulin B		Human	11-30		MHC II	DRB1*03:01	NA
PM1#11	TBD	TRAJ54	TRAV35	CAGHSIIQGAQKLVF	TRBV5-1	TRBD2	TRBJ2-1	CASGRSSYNEQFF	GAD		Human	339-352		MHC II	DRB1*03:01	2
R164	Nepom	TRAJ56	TRAV19	CALSEEGGGANSKLTF	TRBV5-1	TRBD2	TRBJ1-6	CASSLAGGANSPLHF	GAD		Human	555-567		MHC II	DRB1*04:01	1
SD32.5	Boehm	TRAJ23	TRAV26-1	CIVRVSSAYYNQGGKLIF	TRBV27	TRBD2	TRBJ2-3	CASSPRANTDTQYF	Insulin A		Human	5-21		MHC II	DRB1*04:01	1
SD52.c1	Boehm	TRAJ27	TRAV4	CLVGDSLNTNAGKSTF	TRBV27	TRBD1	TRBJ1-5	CASSWSSIGNQPQHF	PPI	INS	Human	C18-A1		MHC II	DRB1*04:01	1
T1D#10 C8	TBD	TRAJ26	TRAV12-3	CATAYGQNFVF	TRBV4-1	TRBD2	TRBJ2-2	CASSRGGGNTGELFF	Insulin B		Human	9-23		MHC II	DQ8	1
T1D#3 C8	TBD	TRAJ23	TRAV17	CATDAGYNQGGKLIF	TRBV5-1	TRBD2	TRBJ1-3	CASSAGNTIYF	Insulin B		Human	9-23		MHC II	DQ8	1"""
        PathBuilder.build(path)

        with open(path + "rep1.tsv", "w") as file:
            file.writelines(rep1text)

        if add_metadata:
            with open(path + "metadata.csv", "w") as file:
                file.writelines(
                    """filename,chain,subject_id,coeliac status (yes/no)
rep1.tsv,TRA,1234e,no"""
                )
Example #22
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        rep_map = {}
        repertoires = []
        indices_to_keep = []

        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])

        for index, repertoire in enumerate(processed_dataset.get_data()):
            if repertoire.metadata["subject_id"] in rep_map:
                sequences = np.append(
                    repertoire.sequences,
                    rep_map[repertoire.metadata["subject_id"]].sequences)
                del rep_map[repertoire.metadata["subject_id"]]
                repertoires.append(
                    SubjectRepertoireCollector.store_repertoire(
                        params["result_path"], repertoire, sequences))
            else:
                rep_map[repertoire.metadata["subject_id"]] = repertoire
                indices_to_keep.append(index)

        for key in rep_map:
            repertoires.append(
                SubjectRepertoireCollector.store_repertoire(
                    params["result_path"], rep_map[key],
                    rep_map[key].sequences))

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(
            dataset, indices_to_keep, params["result_path"])

        return processed_dataset
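
The processor above holds the first repertoire per subject_id back in rep_map, merges when a second repertoire for the same subject appears, and emits the leftovers unchanged. A minimal sketch of that bookkeeping with plain dicts in place of Repertoire objects:

def collect_by_subject(repertoires: list) -> list:
    pending, merged = {}, []
    for repertoire in repertoires:
        subject = repertoire["subject_id"]
        if subject in pending:
            # second repertoire for this subject: merge and release
            merged.append({"subject_id": subject,
                           "sequences": repertoire["sequences"] + pending.pop(subject)["sequences"]})
        else:
            pending[subject] = repertoire
    merged.extend(pending.values())  # subjects seen only once pass through
    return merged

reps = [{"subject_id": "1", "sequences": ["AAA"]},
        {"subject_id": "2", "sequences": ["CCC"]},
        {"subject_id": "1", "sequences": ["GGG"]}]
print(collect_by_subject(reps))
# [{'subject_id': '1', 'sequences': ['GGG', 'AAA']}, {'subject_id': '2', 'sequences': ['CCC']}]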
Example #23
    def create_dummy_dataset(self, path, add_metadata):
        rep1text = """nucleotide	aminoAcid	count (templates/reads)	frequencyCount (%)	cdr3Length	vMaxResolved	vFamilyName	vGeneName	vGeneAllele	vFamilyTies	vGeneNameTies	vGeneAlleleTies	dMaxResolved	dFamilyName	dGeneName	dGeneAllele	dFamilyTies	dGeneNameTies	dGeneAlleleTies	jMaxResolved	jFamilyName	jGeneName	jGeneAllele	jFamilyTies	jGeneNameTies	jGeneAlleleTies	vDeletion	n1Insertion	d5Deletion	d3Deletion	n2Insertion	jDeletion	vIndex	n1Index	dIndex	n2Index	jIndex	estimatedNumberGenomes	sequenceStatus	cloneResolved	vOrphon	dOrphon	jOrphon	vFunction	dFunction	jFunction	fractionNucleated	vAlignLength	vAlignSubstitutionCount	vAlignSubstitutionIndexes	vAlignSubstitutionGeneThreePrimeIndexes	vSeqWithMutations
GCCATCCCCAACCAGACAGCTCTTTACTTCTGTGCCACCAGTGATCAACTTAACCGTTGGGGGACCGGGGAGCTGTTTTTTGGAGAA	CATSDQLNRWGTGELFF	38	0.0017525250196006087	51	TCRBV24	TCRBV24				TCRBV24-01,TCRBV24-or09_02						TCRBD01,TCRBD02	TCRBD01-01,TCRBD02-01		TCRBJ02-02*01	TCRBJ02	TCRBJ02-02	01				3	0	6	1	13	5	30	45	58	-1	63	38	In	VDJ												
GGGTTGGAGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCAAGGACGGCGACACCGGGGAGCTGTTTTTTGGAGAA	CASKDGDTGELFF	48	0.002213715814232348	39	TCRBV06	TCRBV06				TCRBV06-02,TCRBV06-03						TCRBD01,TCRBD02	TCRBD01-01,TCRBD02-01		TCRBJ02-02*01	TCRBJ02	TCRBJ02-02	01				7	4	1	7	1	3	42	52	53	57	61	48	In	VDJ												
AGGCCCTCACATACCTCTCAGTACCTCTGTGCCAGCAGTGGGGAGGGACAGGGGGTATTTGGTGGCACTGAAGCTTTCTTTGGACAA	CASSGEGQGVFGGTEAFF	37	0.001706405940137435	54	TCRBV25-01*01	TCRBV25	TCRBV25-01	01				TCRBD01-01*01	TCRBD01	TCRBD01-01	01				TCRBJ01-01*01	TCRBJ01	TCRBJ01-01	01				4	10	0	1	4	4	27	40	44	55	65	37	In	VDJ												
GAGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGTGAGGAGGTAGGGGGCAATCAGCCCCAGCATTTTGGTGAT	CASSEEVGGNQPQHF	53	0.0024443112115482175	45	TCRBV06-01*01	TCRBV06	TCRBV06-01	01								TCRBD01,TCRBD02	TCRBD01-01,TCRBD02-01		TCRBJ01-05*01	TCRBJ01	TCRBJ01-05	01				3	0	5	2	6	2	36	50	56	-1	61	53	In	VDJ												
GAGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGTGAATTACAGGAAGGTTATGAGACCCAGTACTTCGGGCCA	CASSELQEGYETQYF	28	0.0012913342249688696	45	TCRBV06-01*01	TCRBV06	TCRBV06-01	01				TCRBD01-01*01	TCRBD01	TCRBD01-01	01				TCRBJ02-05*01	TCRBJ02	TCRBJ02-05	01				2	8	3	4	2	5	36	51	53	58	66	28	In	VDJ												
TTGGAGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCAGTTTCCTAGCGGACCCCGGAGAGCAGTTCTTCGGGCCA	CASSFLADPGEQFF	16	7.379052714107826E-4	42	TCRBV06	TCRBV06				TCRBV06-02,TCRBV06-03		TCRBD02-01	TCRBD02	TCRBD02-01				01,02	TCRBJ02-01*01	TCRBJ02	TCRBJ02-01	01				4	8	4	5	2	10	39	52	54	61	69	16	In	VDJ												
CAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCTCACTTTGGGGTCGGAGGTATGGCTACACCTTCGGTTCG	CASSSLWGRRYGYTF	72	0.003320573721348522	45	TCRBV07-09	TCRBV07	TCRBV07-09				01,03	TCRBD02-01*02	TCRBD02	TCRBD02-01	02				TCRBJ01-02*01	TCRBJ01	TCRBJ01-02	01				4	0	10	1	12	5	36	49	61	-1	66	72	In	VDJ												
AGCAACATGAGCCCTGAAGACAGCAGCATATATCTCTGCAGCGTTTTGGACCTCCCGACCCAAACAGATACGCAGTATTTTGGCCCA	CSVLDLPTQTDTQYF	14	6.456671124844348E-4	45	TCRBV29-01*01	TCRBV29	TCRBV29-01	01								TCRBD01,TCRBD02	TCRBD01-01,TCRBD02-01		TCRBJ02-03*01	TCRBJ02	TCRBJ02-03	01				5	12	1	7	2	3	36	45	47	51	63	14	In	VDJ												
CAGCGCACACAGCAGGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCTTAAGGCTAGCGGGAGTGGAGACCCAGTACTTCGGGCCA	CASSLRLAGVETQYF	26	0.0011990960660425217	45	TCRBV07-02*01	TCRBV07	TCRBV07-02	01				TCRBD02-01*02	TCRBD02	TCRBD02-01	02				TCRBJ02-05*01	TCRBJ02	TCRBJ02-05	01				2	2	4	2	3	5	36	51	54	64	66	26	In	VDJ												
CTGGAGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGCAGCGGTCCAGGGATGGAGACCCAGTACTTCGGGCCA	CASSSGPGMETQYF	13	5.995480330212608E-4	42	TCRBV06-01*01	TCRBV06	TCRBV06-01	01								TCRBD01,TCRBD02	TCRBD01-01,TCRBD02-01		TCRBJ02-05*01	TCRBJ02	TCRBJ02-05	01				6	3	4	3	8	5	39	50	58	63	66	13	In	VDJ												
TCTAAGAAGCTCCTTCTCAGTGACTCTGGCTTCTATCTCTGTGCCTGGAGTGCTATAGCGGATTACAATGAGCAGTTCTTCGGGCCA	CAWSAIADYNEQFF	8	3.689526357053913E-4	42	TCRBV30-01*01	TCRBV30	TCRBV30-01	01				TCRBD02-01	TCRBD02	TCRBD02-01				01,02	TCRBJ02-01*01	TCRBJ02	TCRBJ02-01	01				1	2	5	5	3	4	39	52	55	61	63	8	In	VDJ												
TCCCTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTTTAATAGATACGCAGTATTTTGGCCCA	CASSLIDTQYF	16	7.379052714107826E-4	33	TCRBV28-01*01	TCRBV28	TCRBV28-01	01											TCRBJ02-03*01	TCRBJ02	TCRBJ02-03	01				2	2	0	0	0	5	48	-1	-1	63	65	16	In	VJ												
ATCCGGTCCACAAAGCTGGAGGACTCAGCCATGTACTTCTGTGCCAGCAGATCGGGACAGGGATGGGATGAGCAGTTCTTCGGGCCA	CASRSGQGWDEQFF	8	3.689526357053913E-4	42	TCRBV02-01*01	TCRBV02	TCRBV02-01	01				TCRBD01-01*01	TCRBD01	TCRBD01-01	01				TCRBJ02-01*01	TCRBJ02	TCRBJ02-01	01				6	5	0	3	3	8	39	50	53	62	67	8	In	VDJ												
ATCAATTCCCTGGAGCTTGGTGACTCTGCTGTGTATTTCTGTGCCAGCAGCCCTAGCGGAGACACCGGGGAGCTGTTTTTTGGAGAA	CASSPSGDTGELFF	28	0.0012913342249688696	42	TCRBV03	TCRBV03				TCRBV03-01,TCRBV03-02		TCRBD02-01	TCRBD02	TCRBD02-01				01,02	TCRBJ02-02*01	TCRBJ02	TCRBJ02-02	01				4	2	4	5	0	3	39	-1	52	59	61	28	In	VDJ												
GGTCCACAAAGCTGGAGGACTCAGCCATGTACTTCTGTGCCAGCAGTCCCGGGGGACGGGGCTTCATACGAGCAGTACTTCGGGCCG		8	3.689526357053913E-4	46	TCRBV02-01*01	TCRBV02	TCRBV02-01	01				TCRBD02-01*01	TCRBD02	TCRBD02-01	01				TCRBJ02-07*01	TCRBJ02	TCRBJ02-07	01				5	11	8	2	2	4	35	47	49	55	66	8	Out	VDJ												
GAGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCAGTTCCGACAGCGGTCCCTACAATGAGCAGTTCTTCGGGCCA	CASSSDSGPYNEQFF	7	3.228335562422174E-4	45	TCRBV06	TCRBV06				TCRBV06-02,TCRBV06-03						TCRBD01,TCRBD02	TCRBD01-01,TCRBD02-01		TCRBJ02-01*01	TCRBJ02	TCRBJ02-01	01				4	5	2	5	2	2	36	49	51	56	61	7	In	VDJ												
GGGTTGGAGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCAGTCCAGGGGACACCGGGGAGCTGTTTTTTGGAGAA	CASSPGDTGELFF	1	4.611907946317391E-5	39	TCRBV06	TCRBV06				TCRBV06-02,TCRBV06-03		TCRBD01-01*01	TCRBD01	TCRBD01-01	01				TCRBJ02-02*01	TCRBJ02	TCRBJ02-02	01				5	0	4	2	1	3	42	54	55	-1	61	1	In	VDJ												
CTGAACATGAGCTCCTTGGAGCTGGGGGACTCAGCCCTGTACTTCTGTGCCAGCAGCTTACGCACAGATACGCAGTATTTTGGCCCA	CASSLRTDTQYF	9	4.1507171516856525E-4	36	TCRBV13-01*01	TCRBV13	TCRBV13-01	01											TCRBJ02-03*01	TCRBJ02	TCRBJ02-03	01				2	1	0	0	0	1	45	-1	-1	60	61	9	In	VJ												
AAGAAGCTCCTTCTCAGTGACTCTGGCTTCTATCTCTGTGCCTGGAGTGTACGTCCGGGCGCAGGGTACGAGCAGTACTTCGGGCCG	CAWSVRPGAGYEQYF	1	4.611907946317391E-5	45	TCRBV30-01*01	TCRBV30	TCRBV30-01	01				TCRBD01-01*01	TCRBD01	TCRBD01-01	01				TCRBJ02-07*01	TCRBJ02	TCRBJ02-07	01				0	0	4	3	11	4	36	50	61	-1	66	1	In	VDJ												"""

        PathBuilder.build(path)

        with open(path + "rep1.tsv", "w") as file:
            file.writelines(rep1text)

        if add_metadata:
            with open(path + "metadata.csv", "w") as file:
                file.writelines(
                    """filename,chain,subject_id,coeliac status (yes/no)
rep1.tsv,TRA,1234a,no""")
Example #24
    def test_generate(self):
        path = EnvironmentSettings.root_path + "test/tmp/motifseedrecovery/"
        PathBuilder.build(path)

        report = self._create_report(path)

        # Running the report
        result = report.generate_report()

        self.assertIsInstance(result, ReportResult)
        self.assertEqual(result.output_tables[0].path,
                         path + "motif_seed_recovery.csv")
        self.assertEqual(result.output_figures[0].path,
                         path + "motif_seed_recovery.html")

        # Actual tests
        self.assertTrue(os.path.isfile(path + "motif_seed_recovery.csv"))
        self.assertTrue(os.path.isfile(path + "motif_seed_recovery.html"))

        written_data = pd.read_csv(path + "motif_seed_recovery.csv")

        self.assertListEqual(list(written_data.columns),
                             ["features", "max_seed_overlap", "coefficients"])
        self.assertListEqual(list(written_data["coefficients"]),
                             [i for i in range(5)])
        self.assertListEqual(list(written_data["features"]),
                             ["AAA", "AAC", "CKJ", "KSA", "AKJ"])
        self.assertListEqual(list(written_data["max_seed_overlap"]),
                             [3, 2, 0, 1, 1])

        shutil.rmtree(path)
Example #25
    def test_parse_receptor_dataset(self):
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
                """
        path = EnvironmentSettings.root_path + "test/tmp/dslimportparservdj/"
        data_path = EnvironmentSettings.root_path + "test/tmp/dslimportparservdj/receptor_data/"
        PathBuilder.build(data_path)

        with open(data_path + "receptors.tsv", "w") as file:
            file.writelines(file_content)

        st, desc = ImportParser.parse(
            {
                "datasets": {
                    "d1": {
                        "format": "VDJdb",
                        "params": {
                            "is_repertoire": False,
                            "paired": True,
                            "receptor_chains": "TRA_TRB",
                            "path": data_path
                        }
                    }
                }
            }, SymbolTable(), path)

        dataset = st.get("d1")
        self.assertTrue(isinstance(dataset, ReceptorDataset))
        self.assertEqual(2, dataset.get_example_count())

        shutil.rmtree(path)
Example #26
    def test_process(self):
        path = EnvironmentSettings.root_path + "test/tmp/clones_per_repertoire_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]], path)[0])

        dataset1 = ClonesPerRepertoireFilter.process(dataset, {
            "lower_limit": 3,
            "result_path": path
        })
        self.assertEqual(2, dataset1.get_example_count())

        dataset2 = ClonesPerRepertoireFilter.process(dataset, {
            "upper_limit": 2,
            "result_path": path
        })
        self.assertEqual(1, dataset2.get_example_count())

        self.assertRaises(AssertionError, ClonesPerRepertoireFilter.process,
                          dataset, {
                              "lower_limit": 10,
                              "result_path": path
                          })

        shutil.rmtree(path)
Example #27
    def test_create_model(self):
        test_path = EnvironmentSettings.root_path + "test/tmp/w2v_test_tmp/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA")
        sequence2 = ReceptorSequence("CASSCCC")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        model_creator = KmerPairModelCreator()
        model = model_creator.create_model(dataset=dataset, k=2, vector_size=16, batch_size=1,
                                           model_path=test_path + "model.model")

        self.assertTrue(isinstance(model, Word2Vec))
        self.assertTrue("CA" in model.wv.vocab)
        self.assertEqual(400, len(model.wv.vocab))

        shutil.rmtree(test_path)
Example #28
    def run(self):
        print("Starting MultiDatasetBenchmarkTool...", flush=True)
        PathBuilder.build(self.result_path)
        specs = self._split_specs_file()
        self._extract_reports()
        instruction_states = {}
        for index, specs_name in enumerate(specs.keys()):
            print(f"Running nested cross-validation on dataset {specs_name} ({index + 1}/{len(specs)})...",
                  flush=True)
            app = ImmuneMLApp(specification_path=specs[specs_name],
                              result_path=f"{self.result_path}/{specs_name}/")
            instruction_states[specs_name] = app.run()[0]
            print(f"Finished nested cross-validation on dataset {specs_name} ({index + 1}/{len(specs)})...",
                  flush=True)

        print(
            "Running reports on the results of nested cross-validation on all datasets...",
            flush=True)
        report_results = self._run_reports(instruction_states)
        print("Finished reports, now generating HTML output...", flush=True)
        MultiDatasetBenchmarkHTMLBuilder.build(
            report_results, self.result_path, {
                specs_name: f"{self.result_path}/{specs_name}/"
                for specs_name in specs.keys()
            })
        print("MultiDatasetBenchmarkTool finished.", flush=True)
Example #29
    def test_implant_in_repertoire(self):
        path = EnvironmentSettings.tmp_test_path + "healthysequenceimplanting/"
        PathBuilder.build(path)

        repertoire = Repertoire.build_from_sequence_objects(
            [
                ReceptorSequence(amino_acid_sequence="ACDFQ", identifier="1"),
                ReceptorSequence(amino_acid_sequence="TGCDF", identifier="2")
            ],
            path=path,
            metadata={"subject_id": "1"})
        implanting = HealthySequenceImplanting(
            GappedMotifImplanting(),
            implanting_computation=ImplantingComputation.ROUND)
        signal = Signal("1", [Motif("m1", GappedKmerInstantiation(), "CCC")],
                        implanting)

        repertoire2 = implanting.implant_in_repertoire(repertoire, 0.5, signal,
                                                       path)

        new_sequences = [
            sequence.get_sequence() for sequence in repertoire2.sequences
        ]
        self.assertTrue("ACDFQ" in new_sequences or "TGCDF" in new_sequences)
        self.assertTrue(any(["CCC" in sequence for sequence in new_sequences]))

        shutil.rmtree(path)
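
With ImplantingComputation.ROUND and a rate of 0.5, round(0.5 * 2) = 1 of the two sequences receives the signal, which is what the assertions above check. A toy sketch of that repertoire-level computation; replacing a whole sequence with a fixed motif-bearing string is a deliberate simplification of GappedMotifImplanting:

import random

def implant(sequences: list, rate: float, motif: str) -> list:
    # ROUND semantics: round(0.5 * 2) == 1 sequence gets the signal
    number_to_implant = round(rate * len(sequences))
    chosen = set(random.sample(range(len(sequences)), number_to_implant))
    return [motif if i in chosen else seq for i, seq in enumerate(sequences)]

new_sequences = implant(["ACDFQ", "TGCDF"], rate=0.5, motif="ACCCQ")  # hypothetical motif carrier
assert any("CCC" in sequence for sequence in new_sequences)
assert "ACDFQ" in new_sequences or "TGCDF" in new_sequences  # one original survives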
Example #30
    def test_load_repertoire(self):
        """Test dataset content with and without a header included in the input file"""
        path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        self.write_dummy_files(path, True)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "igor")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["metadata_file"] = path + "metadata.csv"

        dataset = IGoRImport.import_dataset(params, "igor_repertoire_dataset")

        self.assertEqual(2, dataset.get_example_count())
        self.assertEqual(len(dataset.repertoires[0].sequences), 1)
        self.assertEqual(len(dataset.repertoires[1].sequences), 1)

        self.assertEqual(
            dataset.repertoires[0].sequences[0].amino_acid_sequence,
            "ARDRWSTPVLRYFDWWTPPYYYYMDV")

        self.assertListEqual(list(dataset.repertoires[0].get_counts()), [1])
        self.assertEqual(dataset.repertoires[0].get_chains(), None)

        shutil.rmtree(path)