Example #1
    def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        labels = params.label_config.get_labels_by_name()

        assert len(labels) == 1, \
            "SequenceAbundanceEncoder: this encoding works only for a single label."

        examples = self._calculate_sequence_abundance(dataset,
                                                      self.comparison_data,
                                                      labels[0], params)

        encoded_data = EncodedData(
            examples,
            dataset.get_metadata([labels[0]])
            if params.encode_labels else None,
            dataset.get_repertoire_ids(), [
                SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE
            ],
            encoding=SequenceAbundanceEncoder.__name__,
            info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        encoded_dataset = RepertoireDataset(labels=dataset.labels,
                                            encoded_data=encoded_data,
                                            repertoires=dataset.repertoires)

        return encoded_dataset
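
For readability, the positional EncodedData call above can be rewritten with keyword arguments. The parameter names below are taken from the MatchedReceptorsEncoder example (Example #3); treat this as a sketch of the mapping rather than a verified drop-in replacement:

    encoded_data = EncodedData(
        examples=examples,
        labels=dataset.get_metadata([labels[0]]) if params.encode_labels else None,
        example_ids=dataset.get_repertoire_ids(),
        feature_names=[SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                       SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE],
        encoding=SequenceAbundanceEncoder.__name__,
        info={'relevant_sequence_path': self.relevant_sequence_csv_path})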
Example #2
    def process_dataset(self,
                        dataset: RepertoireDataset,
                        result_path: Path = None):
        self.check_dataset_type(dataset, [RepertoireDataset],
                                "ClonesPerRepertoireFilter")
        self.result_path = result_path if result_path is not None else self.result_path

        processed_dataset = dataset.clone()
        repertoires, indices = [], []

        for index, repertoire in enumerate(dataset.get_data()):
            if self.lower_limit != -1 and len(
                    repertoire.sequences) < self.lower_limit:
                continue
            if self.upper_limit != -1 and len(
                    repertoire.sequences) > self.upper_limit:
                continue
            repertoires.append(dataset.repertoires[index])
            indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = self._build_new_metadata(
            dataset, indices)

        self.check_dataset_not_empty(processed_dataset,
                                     "ClonesPerRepertoireFilter")

        return processed_dataset
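
A minimal usage sketch for the filter above, assuming its constructor accepts the lower_limit and upper_limit attributes that process_dataset reads, with -1 disabling a bound (an assumption inferred from the method body); RepertoireBuilder and PathBuilder are used the same way as in the other examples here:

    path = EnvironmentSettings.tmp_test_path / "clones_filter_demo"
    PathBuilder.build(path)

    # toy dataset: one repertoire with 1 sequence, one with 3
    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC", "GG", "TT"]], path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    # keep only repertoires with at least 2 clones (constructor signature is an assumption)
    filtered = ClonesPerRepertoireFilter(lower_limit=2, upper_limit=-1).process_dataset(dataset, path / "filtered")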
Example #3
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        encoded_dataset = RepertoireDataset(
            repertoires=dataset.repertoires,
            labels=dataset.labels,
            metadata_file=dataset.metadata_file)

        feature_annotations = self._get_feature_info()
        encoded_repertoires, labels, example_ids = self._encode_repertoires(
            dataset, params)

        encoded_dataset.add_encoded_data(
            EncodedData(
                # examples contains a np.ndarray with counts
                examples=encoded_repertoires,
                # example_ids contains a list of repertoire identifiers
                example_ids=example_ids,
                # feature_names contains a list of reference receptor identifiers
                feature_names=[
                    "{receptor_id}.{chain}".format(
                        receptor_id=row["receptor_id"], chain=row["chain"])
                    for index, row in feature_annotations.iterrows()
                ],
                # feature_annotations contains a PD dataframe with sequence and VDJ gene usage per reference receptor
                feature_annotations=feature_annotations,
                labels=labels,
                encoding=MatchedReceptorsEncoder.__name__))

        return encoded_dataset
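
The feature names built above join receptor_id and chain with a dot, one name per reference chain. A self-contained pandas illustration with a hypothetical two-receptor annotation table:

    import pandas as pd

    feature_annotations = pd.DataFrame({"receptor_id": ["r1", "r1", "r2", "r2"],
                                        "chain": ["alpha", "beta", "alpha", "beta"]})
    names = ["{receptor_id}.{chain}".format(receptor_id=row["receptor_id"], chain=row["chain"])
             for index, row in feature_annotations.iterrows()]
    assert names == ["r1.alpha", "r1.beta", "r2.alpha", "r2.beta"]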
Example #4
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
        PathBuilder.build(path)

        reps = [
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAA", identifier="1")],
                path=path,
                metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAC", identifier="2")],
                path=path,
                metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAC", identifier="3")],
                path=path,
                metadata={"subject_id": "patient3"})
        ]

        dataset = RepertoireDataset(repertoires=reps)

        dataset2 = SubjectRepertoireCollector.process(
            dataset, {"result_path": path / "result"})

        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual(3, len(dataset.get_data()))

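        # patient1's two repertoires are merged into one with 2 sequences; patient3's single repertoire keeps 1 sequence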
        values = [2, 1]
        for index, rep in enumerate(dataset2.get_data()):
            self.assertEqual(values[index], len(rep.sequences))

        shutil.rmtree(path)
Example #5
    def create_dummy_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path, labels={"label1": ["val1", "val2"], "label2": ["val1", "val2"]})

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        dataset.name = "my_dataset"
        PickleExporter.export(dataset, path)

        return f"{dataset.name}.iml_dataset"
Example #6
    def test_run(self):
        path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"],
             ["AA"], ["CC"], ["AA"], ["CC"]], path)[0])
        dataset.encoded_data = EncodedData(
            examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3],
                               [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
            labels={
                "l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3],
                "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
            })

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 3])

        label = Label(name='l1', values=[1, 3])

        method1 = LogisticRegression()
        method1.fit(dataset.encoded_data, label=label)

        res = MLMethodAssessment.run(
            MLMethodAssessmentParams(
                dataset=dataset,
                method=method1,
                metrics={
                    Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO
                },
                optimization_metric=Metric.LOG_LOSS,
                predictions_path=path / "predictions.csv",
                label=label,
                ml_score_path=path / "ml_score.csv",
                split_index=1,
                path=path))

        self.assertTrue(isinstance(res, dict))
        self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

        self.assertTrue(os.path.isfile(path / "ml_score.csv"))

        df = pd.read_csv(path / "ml_score.csv")
        self.assertTrue(df.shape[0] == 1)

        df = pd.read_csv(path / "predictions.csv")
        self.assertEqual(12, df.shape[0])

        shutil.rmtree(path)
Example #7
 def process_dataset(self, dataset: RepertoireDataset):
     extract_fn = self.build_matching_fn()
     repertoire_count = dataset.get_example_count()
     for index, repertoire in enumerate(dataset.get_data()):
         self.process_repertoire(repertoire, str(repertoire.identifier),
                                 extract_fn)
         logging.info("Repertoire {} ({}/{}) processed.".format(
             repertoire.identifier, index + 1, repertoire_count))
         logging.info(
             f"Currently, there are {self.item_count} items in the comparison data matrix."
         )
     self.merge_tmp_batches_to_matrix()
Example #8
    def create_datasets(self, path: Path):
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"]], path, {
                "l1": [1, 0, 1, 0],
                "l2": [2, 3, 2, 3]
            })

        main_dataset = RepertoireDataset(repertoires=repertoires,
                                         metadata_file=metadata)
        sub_dataset = main_dataset.make_subset([0, 1],
                                               path=path,
                                               dataset_type="subset")
        return main_dataset, sub_dataset
Example #9
 def export_updated_metadata(dataset: RepertoireDataset, result_path: Path,
                             repertoire_folder: str):
     df = pd.read_csv(dataset.metadata_file, comment=Constants.COMMENT_SIGN)
     identifiers = (df["identifier"].values.tolist()
                    if "identifier" in df.columns else dataset.get_example_ids())
     df["filename"] = [
         str(
             Path(repertoire_folder) /
             f"{repertoire.data_filename.stem}.tsv")
         for repertoire in dataset.get_data()
     ]
     df['identifier'] = identifiers
     df.to_csv(result_path / "metadata.csv", index=False)
Example #10
    def test_get_metadata_fields(self):

        path = EnvironmentSettings.tmp_test_path / "repertoire_dataset/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["BB"]], path, {"l1": [1, 2], "hla": ["A", "B"]}, subject_ids=["d1", "d2"])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

        self.assertTrue("l1" in dataset.get_metadata_fields())
        self.assertTrue("hla" in dataset.get_metadata_fields())
        self.assertTrue("subject_id" in dataset.get_metadata_fields())

        shutil.rmtree(path)
Example #11
    def build_labels(self, dataset: RepertoireDataset,
                     params: EncoderParams) -> dict:

        lbl = ["repertoire_identifier"]
        lbl.extend(params.label_config.get_labels_by_name())

        tmp_labels = dataset.get_metadata(lbl, return_df=True)
        tmp_labels = tmp_labels.iloc[pd.Index(
            tmp_labels['repertoire_identifier']).get_indexer(
                dataset.get_repertoire_ids())]
        tmp_labels = tmp_labels.to_dict("list")
        del tmp_labels["repertoire_identifier"]

        return tmp_labels
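
The iloc/get_indexer combination above reorders the metadata rows so that they match the order of the repertoire ids. A self-contained pandas illustration of the same trick with made-up identifiers:

    import pandas as pd

    df = pd.DataFrame({"repertoire_identifier": ["b", "a", "c"], "l1": [2, 1, 3]})
    repertoire_ids = ["a", "b", "c"]  # hypothetical order returned by get_repertoire_ids()
    reordered = df.iloc[pd.Index(df["repertoire_identifier"]).get_indexer(repertoire_ids)]
    assert reordered["l1"].tolist() == [1, 2, 3]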
Example #12
    def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
        # Rows = repertoires, Columns = reference chains (two per reference receptor)
        encoded_repertoires = np.zeros(
            (dataset.get_example_count(), len(self.reference_receptors) * 2),
            dtype=int)
        labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertoires[i] = self._match_repertoire_to_receptors(repertoire)

            if labels is not None:
                for label_name in params.label_config.get_labels_by_name():
                    labels[label_name].append(repertoire.metadata[label_name])

        return encoded_repertoires, labels, dataset.get_repertoire_ids()
Example #13
    def test_generate(self):
        path = EnvironmentSettings.root_path / "test/tmp/featuredistribution/"
        PathBuilder.build(path)

        dataset = self._create_dummy_encoded_data(path)

        report = FeatureComparison.build_object(**{"dataset": dataset,
                                                     "result_path": path,
                                                     "comparison_label": "patient"})

        self.assertTrue(report.check_prerequisites())

        result = report.generate_report()

        self.assertIsInstance(result, ReportResult)

        self.assertEqual(result.output_figures[0].path, path / "feature_comparison.html")
        self.assertEqual(result.output_tables[0].path, path / "feature_values.csv")

        content = pd.read_csv(path / "feature_values.csv")
        self.assertListEqual(list(content.columns),
                             ["patient", "example_id", "sequence", "feature", "value"])

        # the report should build successfully, but check_prerequisites should return False when the data is not encoded
        report = FeatureDistribution.build_object(**{"dataset": RepertoireDataset(),
                                                     "result_path": path})

        self.assertFalse(report.check_prerequisites())

        shutil.rmtree(path)
Example #14
    def _create_dummy_encoded_data(self, path):
        n_subjects = 50
        n_features = 30

        kmers = [''.join(random.choices(string.ascii_uppercase, k=3)) for i in range(n_features)]

        encoded_data = {
            'examples': sparse.csr_matrix(
                np.random.normal(50, 10, n_subjects * n_features).reshape((n_subjects, n_features))),
            'example_ids': [''.join(random.choices(string.ascii_uppercase, k=4)) for i in range(n_subjects)],
            'labels': {},
            'feature_names': kmers,
            'feature_annotations': pd.DataFrame({
                "sequence": kmers
            }),
            'encoding': "random"
        }

        metadata_filepath = path / "metadata.csv"

        metadata = pd.DataFrame({"patient": np.array([i % 2 == 0 for i in range(n_subjects)])})
        metadata.to_csv(metadata_filepath, index=False)

        dataset = RepertoireDataset(encoded_data=EncodedData(**encoded_data), metadata_file=metadata_filepath)

        return dataset
Example #15
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        Preprocessor.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
        processed_dataset = dataset.clone()
        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            # keep a repertoire only if it satisfies every limit present in params
            if "lower_limit" in params and len(repertoire.sequences) < params["lower_limit"]:
                continue
            if "upper_limit" in params and len(repertoire.sequences) > params["upper_limit"]:
                continue
            repertoires.append(dataset.repertoires[index])
            indices.append(index)
        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

        return processed_dataset
Example #16
    def _construct_test_repertoiredataset(self, path, positional):
        receptors1 = ReceptorSequenceList()
        receptors2 = ReceptorSequenceList()

        if positional:
            for seq in [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                        ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]:
                receptors1.append(seq)
            receptors2.append(ReceptorSequence("TTTTTTTTTTTTT", identifier="1"))
        else:
            for seq in [ReceptorSequence("AAAA", identifier="1"),
                        ReceptorSequence("ATA", identifier="2"),
                        ReceptorSequence("ATA", identifier="3")]:
                receptors1.append(seq)
            for seq in [ReceptorSequence("ATA", identifier="1"),
                        ReceptorSequence("TAA", identifier="2")]:
                receptors2.append(seq)

        rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                      metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)

        rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                      metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        return dataset, lc
Example #17
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        SubjectRepertoireCollector.check_dataset_type(dataset, [RepertoireDataset], "SubjectRepertoireCollector")

        rep_map = {}
        repertoires = []
        indices_to_keep = []

        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])

        for index, repertoire in enumerate(processed_dataset.get_data()):
            if repertoire.metadata["subject_id"] in rep_map.keys():
                sequences = np.append(repertoire.sequences, rep_map[repertoire.metadata["subject_id"]].sequences)
                del rep_map[repertoire.metadata["subject_id"]]
                repertoires.append(SubjectRepertoireCollector.store_repertoire(
                    params["result_path"], repertoire, sequences))
            else:
                rep_map[repertoire.metadata["subject_id"]] = repertoire
                indices_to_keep.append(index)

        for key in rep_map.keys():
            repertoires.append(SubjectRepertoireCollector.store_repertoire(params["result_path"], rep_map[key], rep_map[key].sequences))

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(dataset, indices_to_keep, params["result_path"])

        return processed_dataset
Example #18
    def create_dataset(self):
        path = Path(
            os.path.relpath(EnvironmentSettings.root_path /
                            "test/tmp/immunemlapp/initial_dataset"))
        PathBuilder.build(path)

        repertoire_count = 30
        repertoires, metadata = RepertoireBuilder.build(
            [["AA", "AAAA", "AAAA", "AAA"] for i in range(repertoire_count)],
            path, {
                "CD": [
                    'yes' if i % 2 == 0 else 'no'
                    for i in range(repertoire_count)
                ],
                "CMV": [
                    True if i % 2 == 1 else False
                    for i in range(repertoire_count)
                ]
            }, [[{
                "chain": "A" if i % 2 == 0 else "B",
                "count": random.randint(2, 5)
            } for i in range(4)] for j in range(repertoire_count)])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={
                                        "CD": ["yes", "no"],
                                        "CMV": [True, False]
                                    },
                                    name="d1")
        PickleExporter.export(dataset, path)

        return path / "d1.iml_dataset"
Example #19
    def test_repertoire_export(self):
        path = EnvironmentSettings.tmp_test_path / "airr_exporter_repertoire/"
        PathBuilder.build(path)

        repertoire, metadata_path = self.create_dummy_repertoire(path)
        dataset = RepertoireDataset(repertoires=[repertoire], metadata_file=metadata_path)

        path_exported = path / "exported"
        AIRRExporter.export(dataset, path_exported)

        resulting_data = pd.read_csv(path_exported / f"repertoires/{repertoire.identifier}.tsv", sep="\t")

        self.assertListEqual(list(resulting_data["sequence_id"]), ["receptor_1", "receptor_2"])
        self.assertListEqual(list(resulting_data["cdr3"]), ["GCTGCTGCT", "GGTGGTGGT"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAA", "GGG"])
        self.assertListEqual(list(resulting_data["v_call"]), ["TRBV1", "TRAV2*01"])
        self.assertListEqual(list(resulting_data["j_call"]), ["TRBJ1", "TRAJ2"])
        self.assertListEqual(list(resulting_data["d_call"]), ["TRBD1", "TRAD2"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRB", "TRA"])
        self.assertListEqual(list(resulting_data["duplicate_count"]), [5, 15])
        self.assertListEqual(list(resulting_data["custom_test"]), ["cust1", "cust2"])
        self.assertListEqual(list(resulting_data["productive"]), ['T', 'F'])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F', 'F'])

        shutil.rmtree(path)
Example #20
    def test_get_normalized_sequence_lengths(self):
        path = EnvironmentSettings.root_path / "test/tmp/datareports/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAA", identifier="1"),
                ReceptorSequence(amino_acid_sequence="AAAA", identifier="2"),
                ReceptorSequence(amino_acid_sequence="AAAAA", identifier="3"),
                ReceptorSequence(amino_acid_sequence="AAA", identifier="4")
            ],
            path=path, metadata={})
        rep2 = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAA", identifier="5"),
                ReceptorSequence(amino_acid_sequence="AAAA", identifier="6"),
                ReceptorSequence(amino_acid_sequence="AAAA", identifier="7"),
                ReceptorSequence(amino_acid_sequence="AAA", identifier="8")
            ],
            path=path, metadata={})

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        sld = SequenceLengthDistribution(dataset, 1, path)

        result = sld.generate_report()
        self.assertTrue(os.path.isfile(result.output_figures[0].path))

        shutil.rmtree(path)
Example #21
    def _create_dummy_data(self, path, dataset_type):
        PathBuilder.build(path)
        dataset = None

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1" for i in range(5)],
            j_genes=["J1-1" for i in range(5)],
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                "custom_2": ["CUST-A" for i in range(3)] + ["CUST-B" for i in range(2)]
            },
            cell_ids=["1", "1", "1", "2", "2"],
            path=path)

        if dataset_type == "receptor":

            dataset = ReceptorDataset.build_from_objects(
                test_repertoire.receptors, 100, path, name="receptor_dataset")
            dataset.identifier = 'receptor_dataset'

        elif dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            dataset = RepertoireDataset(repertoires=[test_repertoire])

        return dataset
Example #22
    def _create_report(self, path):
        report = ConfounderAnalysis.build_object(
            metadata_labels=["age", "HLA"], name='test')

        report.ml_details_path = path / "ml_details.yaml"
        report.label = Label("disease")
        report.result_path = path
        encoder = KmerFrequencyEncoder.build_object(
            RepertoireDataset(), **{
                "normalization_type":
                NormalizationType.RELATIVE_FREQUENCY.name,
                "reads": ReadsType.UNIQUE.name,
                "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                "k": 3,
                'sequence_type': SequenceType.AMINO_ACID.name
            })
        report.train_dataset = self._encode_dataset(
            encoder, self._make_dataset(path / "train", size=100), path)
        report.test_dataset = self._encode_dataset(
            encoder, self._make_dataset(path / "test", size=40), path,
            learn_model=False)
        report.method = self._create_dummy_lr_model(
            path, report.train_dataset.encoded_data, Label("disease"))

        return report
Example #23
    def _merge_repertoires(self, dataset: RepertoireDataset):
        rep_map = {}
        repertoires, indices_to_keep = [], []
        processed_dataset = dataset.clone()

        for index, repertoire in enumerate(processed_dataset.get_data()):
            if repertoire.metadata["subject_id"] in rep_map.keys():
                sequences = np.append(
                    repertoire.sequences,
                    rep_map[repertoire.metadata["subject_id"]].sequences)
                del rep_map[repertoire.metadata["subject_id"]]
                repertoires.append(
                    self._store_repertoire(repertoire, sequences))
            else:
                rep_map[repertoire.metadata["subject_id"]] = repertoire
                indices_to_keep.append(index)

        for key in rep_map.keys():
            repertoires.append(
                self._store_repertoire(rep_map[key], rep_map[key].sequences))

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = self._build_new_metadata(
            dataset, indices_to_keep)

        return processed_dataset
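
The rep_map bookkeeping above merges repertoires per subject: the first repertoire seen for a subject is parked in the map, a later one pops it out and concatenates the sequences, and the final loop flushes subjects that appeared only once. A plain-Python sketch of the same pattern with dummy sequence lists:

    rep_map, merged = {}, []
    for subject_id, sequences in [("s1", ["AAA"]), ("s2", ["CCC"]), ("s1", ["GGG"])]:
        if subject_id in rep_map:
            merged.append((subject_id, sequences + rep_map.pop(subject_id)))
        else:
            rep_map[subject_id] = sequences
    merged.extend(rep_map.items())  # subjects seen only once keep their own sequences
    assert merged == [("s1", ["GGG", "AAA"]), ("s2", ["CCC"])]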
Example #24
    def test_create_model(self):
        test_path = EnvironmentSettings.root_path / "test/tmp/w2v_test_tmp/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA")
        sequence2 = ReceptorSequence("CASSCCC")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        model_creator = KmerPairModelCreator()
        model = model_creator.create_model(dataset=dataset,
                                           k=2,
                                           vector_size=16,
                                           batch_size=1,
                                           model_path=test_path / "model.model")

        self.assertTrue(isinstance(model, Word2Vec))
        self.assertTrue("CA" in model.wv.vocab)
        self.assertEqual(400, len(model.wv.vocab))

        shutil.rmtree(test_path)
Example #25
    def make_random_dataset(self, path):
        alphabet = EnvironmentSettings.get_sequence_alphabet()
        sequences = [["".join(rn.choice(alphabet) for _ in range(20)) for _ in range(100)] for _ in range(40)]

        repertoires, metadata = RepertoireBuilder.build(sequences, path, subject_ids=[i % 2 for i in range(len(sequences))])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        PickleExporter.export(dataset, path)
Example #26
    def _create_dummy_data(self, path, dataset_type):
        PathBuilder.build(path)
        dataset = None

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1" for i in range(5)],
            j_genes=["J1-1" for i in range(5)],
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                "custom_2": ["CUST-A" for i in range(3)] + ["CUST-B" for i in range(2)]
            },
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            receptordataset_filename = path / "receptors.pkl"
            with open(receptordataset_filename, "wb") as file:
                pickle.dump(test_repertoire.receptors, file)

            dataset = ReceptorDataset(filenames=[receptordataset_filename],
                                      identifier="receptor_dataset")

        elif dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            dataset = RepertoireDataset(repertoires=[test_repertoire])

        return dataset
Example #27
    def test_exporter(self):
        dataset = RepertoireDataset(encoded_data=EncodedData(examples=csr_matrix(np.arange(12).reshape(3, 4)),
                                                             labels={"l1": [1, 0, 1], "l2": [0, 0, 1]},
                                                             example_ids=[0, 1, 2],
                                                             feature_names=["f1", "f2", "f3", "f4"],
                                                             encoding="test_encoding"))

        path = EnvironmentSettings.tmp_test_path / "designmatrixexporterreport/"

        report = DesignMatrixExporter(dataset=dataset, result_path=path,
                                      name="design_matrix", file_format='csv')
        report.generate_report()
        self.assertTrue(os.path.isfile(path / "design_matrix.csv"))
        report.file_format = 'csv.zip'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.csv.zip"))

        report.file_format = 'npy'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.npy"))
        report.file_format = 'npy.zip'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.npy.zip"))

        report.file_format = 'hdf5'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.hdf5"))
        report.file_format = 'hdf5.zip'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.hdf5.zip"))
        shutil.rmtree(path)

        with self.assertRaises(AssertionError):
            DesignMatrixExporter.build_object(**{'file_format': "random"})
Example #28
    def test_generate(self):
        dataset = RepertoireDataset(encoded_data=EncodedData(examples=csr_matrix(np.arange(12).reshape(3, 4)),
                                                             labels={"l1": [1, 0, 1], "l2": [0, 0, 1]},
                                                             example_ids=[0, 1, 2],
                                                             feature_names=["f1", "f2", "f3", "f4"],
                                                             encoding="test_encoding"))

        path = EnvironmentSettings.tmp_test_path / "designmatrixexporterreport/"

        report = DesignMatrixExporter(dataset, path, name='report', file_format='csv')
        report.generate_report()
        self.assertTrue(os.path.isfile(path / "design_matrix.csv"))

        self.assertTrue(os.path.isfile(path / "labels.csv"))
        self.assertTrue(os.path.isfile(path / "encoding_details.yaml"))

        matrix = pd.read_csv(path / "design_matrix.csv", sep=",").values
        self.assertTrue(np.array_equal(matrix, np.arange(12).reshape(3, 4)))

        labels = pd.read_csv(path / "labels.csv", sep=",").values
        self.assertTrue(np.array_equal(labels, np.array([[1, 0], [0, 0], [1, 1]])))

        with open(path / "encoding_details.yaml", "r") as file:
            loaded = yaml.safe_load(file)

        self.assertTrue("feature_names" in loaded)
        self.assertTrue("encoding" in loaded)
        self.assertTrue("example_ids" in loaded)

        self.assertTrue(np.array_equal(loaded["example_ids"], np.array([0, 1, 2])))
        self.assertTrue(np.array_equal(loaded["feature_names"], np.array(["f1", "f2", "f3", "f4"])))
        self.assertEqual("test_encoding", loaded["encoding"])

        shutil.rmtree(path)
Example #29
    def test_process(self):

        path = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
            path=path, metadata={})
        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
            path=path, metadata={})

        metadata = pd.DataFrame({"CD": [1, 0]})
        metadata.to_csv(path / "metadata.csv")

        dataset = RepertoireDataset(repertoires=[rep1, rep2],
                                    metadata_file=path / "metadata.csv")

        dataset2 = ChainRepertoireFilter.process(
            dataset, {
                "keep_chain": "ALPHA",
                "result_path": path / "results"
            })

        self.assertEqual(1, len(dataset2.get_data()))
        self.assertEqual(2, len(dataset.get_data()))

        metadata_dict = dataset2.get_metadata(["CD"])
        self.assertEqual(1, len(metadata_dict["CD"]))
        self.assertEqual(1, metadata_dict["CD"][0])

        for rep in dataset2.get_data():
            self.assertEqual("AAA", rep.sequences[0].get_sequence())

        self.assertRaises(AssertionError, ChainRepertoireFilter.process,
                          dataset, {
                              "keep_chain": "GAMMA",
                              "result_path": path / "results"
                          })

        shutil.rmtree(path)
Example #30
    def create_comparison_data(self,
                               dataset: RepertoireDataset) -> ComparisonData:

        comparison_data = ComparisonData(dataset.get_repertoire_ids(),
                                         self.matching_columns,
                                         self.sequence_batch_size, self.path)
        comparison_data.process_dataset(dataset)

        return comparison_data
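
A minimal sketch of driving the helper above from an encoder instance. The attribute names come from the method body, but the values, and the idea of setting them directly on the encoder, are assumptions for illustration only:

    # hypothetical setup: sequence attributes to match on, batch size for the
    # temporary comparison batches, and a working directory for the matrix
    encoder.matching_columns = ["sequence_aas"]  # assumed attribute value
    encoder.sequence_batch_size = 100000
    encoder.path = path  # a pathlib.Path working directory

    # builds the ComparisonData matrix by processing the dataset repertoire by
    # repertoire (see process_dataset in Example #7 above)
    comparison_data = encoder.create_comparison_data(dataset)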