예제 #1
0
    def _create_new_sequences(self, sequences, new_sequence_count,
                              signal) -> List[ReceptorSequence]:
        new_sequences = sequences[:-new_sequence_count]

        for _ in range(new_sequence_count):

            motif = random.choice(signal.motifs)
            motif_instance = motif.instantiate_motif()
            annotation = SequenceAnnotation([
                ImplantAnnotation(signal_id=signal.id,
                                  motif_id=motif.identifier,
                                  motif_instance=motif_instance.instance,
                                  position=0)
            ])
            metadata = SequenceMetadata(v_gene="TRBV6-1",
                                        j_gene="TRBJ2-7",
                                        count=1,
                                        chain="B")

            new_sequences.append(
                ReceptorSequence(amino_acid_sequence=motif_instance.instance,
                                 annotation=annotation,
                                 metadata=metadata))

        return new_sequences
예제 #2
0
 def create_gapped_kmers_from_sequence(sequence: ReceptorSequence,
                                       k_left: int,
                                       max_gap: int,
                                       k_right: int = None,
                                       min_gap: int = 0):
     """Create gapped k-mers for a receptor sequence object.

     The string obtained from ``sequence.get_sequence()`` is handed to
     ``KmerHelper.create_gapped_kmers_from_string`` together with the
     remaining arguments, which are passed through unchanged.
     """
     raw_sequence = sequence.get_sequence()
     return KmerHelper.create_gapped_kmers_from_string(
         raw_sequence, k_left, max_gap, k_right, min_gap)
    def test_process(self):
        """Check ChainRepertoireFilter on two single-sequence repertoires.

        One repertoire holds a chain "A" sequence, the other a chain "B"
        sequence. Filtering with keep_chain "ALPHA" must yield one repertoire
        (sequence "AAA") with matching metadata while leaving the input dataset
        at two repertoires; filtering with "GAMMA" must raise AssertionError.
        """
        path = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
        PathBuilder.build(path)

        chain_a_sequence = ReceptorSequence(
            "AAA", metadata=SequenceMetadata(chain="A"), identifier="1")
        rep1 = Repertoire.build_from_sequence_objects([chain_a_sequence],
                                                      path=path,
                                                      metadata={})
        chain_b_sequence = ReceptorSequence(
            "AAC", metadata=SequenceMetadata(chain="B"), identifier="2")
        rep2 = Repertoire.build_from_sequence_objects([chain_b_sequence],
                                                      path=path,
                                                      metadata={})

        metadata_file = path / "metadata.csv"
        pd.DataFrame({"CD": [1, 0]}).to_csv(metadata_file)

        dataset = RepertoireDataset(repertoires=[rep1, rep2],
                                    metadata_file=metadata_file)

        filtered = ChainRepertoireFilter.process(
            dataset, {"keep_chain": "ALPHA", "result_path": path / "results"})

        self.assertEqual(1, len(filtered.get_data()))
        # the original dataset must not be altered by the filter
        self.assertEqual(2, len(dataset.get_data()))

        cd_values = filtered.get_metadata(["CD"])["CD"]
        self.assertEqual(1, len(cd_values))
        self.assertEqual(1, cd_values[0])

        for kept_repertoire in filtered.get_data():
            self.assertEqual("AAA",
                             kept_repertoire.sequences[0].get_sequence())

        with self.assertRaises(AssertionError):
            ChainRepertoireFilter.process(
                dataset,
                {"keep_chain": "GAMMA", "result_path": path / "results"})

        shutil.rmtree(path)
예제 #4
0
    def test_run(self):
        """Run DataEncoder with a Word2Vec encoder on a two-repertoire dataset.

        Verifies that the result is a RepertoireDataset whose encoded examples
        contain two rows.
        """
        path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1")],
            metadata={"l1": 1, "l2": 2},
            path=path)
        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="2")],
            metadata={"l1": 0, "l2": 3},
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(
            dataset,
            k=3,
            model_type=ModelType.SEQUENCE.name,
            vector_size=6)

        encoder_params = EncoderParams(model={},
                                       pool_size=2,
                                       label_config=lc,
                                       result_path=path,
                                       filename="dataset.csv")
        run_params = DataEncoderParams(dataset=dataset,
                                       encoder=encoder,
                                       encoder_params=encoder_params,
                                       store_encoded_data=False)
        res = DataEncoder.run(run_params)

        self.assertIsInstance(res, RepertoireDataset)
        self.assertEqual(2, res.encoded_data.examples.shape[0])

        shutil.rmtree(path)
예제 #5
0
    def construct_test_flatten_dataset(self, path):
        """Build a two-sequence SequenceDataset under ``path``.

        Each sequence carries a custom parameter ``l1`` (1 and 2 respectively).
        The directory is created via PathBuilder before the dataset is built.
        """
        # (amino acid sequence, identifier, value of custom parameter "l1")
        sequence_specs = [("AAATTT", "1", 1), ("ATATAT", "2", 2)]
        sequences = [
            ReceptorSequence(
                amino_acid_sequence=amino_acids,
                identifier=sequence_id,
                metadata=SequenceMetadata(custom_params={"l1": label}))
            for amino_acids, sequence_id, label in sequence_specs
        ]

        PathBuilder.build(path)

        dataset = SequenceDataset.build(sequences=sequences,
                                        file_size=10,
                                        path=path)
        return dataset
예제 #6
0
 def test_create_IMGT_kmers_from_sequence(self):
     """Check the (k-mer, position) pairs produced for "CASSRYUF" with k=3."""
     kmers = KmerHelper.create_IMGT_kmers_from_sequence(
         ReceptorSequence("CASSRYUF"), 3,
         sequence_type=SequenceType.AMINO_ACID)

     expected_pairs = [("CAS", 105), ("ASS", 106), ("SSR", 107),
                       ("SRY", 108), ("RYU", 114), ("YUF", 115)]
     for pair in expected_pairs:
         self.assertIn(pair, kmers)
예제 #7
0
    def create_dummy_repertoire(self, path):
        """Create a two-sequence repertoire plus a metadata CSV under ``path``.

        Returns:
            A tuple of the built repertoire and the path of the written
            ``metadata.csv`` file.
        """
        beta_metadata = SequenceMetadata(
            v_gene="TRBV1",
            j_gene="TRBJ1",
            chain=Chain.BETA,
            count=5,
            region_type="IMGT_CDR3",
            frame_type="IN",
            custom_params={"d_call": "TRBD1", "custom_test": "cust1"})
        beta_sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                         nucleotide_sequence="GCTGCTGCT",
                                         identifier="receptor_1",
                                         metadata=beta_metadata)

        alpha_metadata = SequenceMetadata(
            v_gene="TRAV2",
            v_allele="TRAV2*01",
            j_gene="TRAJ2",
            chain=Chain.ALPHA,
            count=15,
            frame_type=None,
            region_type="IMGT_CDR3",
            custom_params={"d_call": "TRAD2", "custom_test": "cust2"})
        alpha_sequence = ReceptorSequence(amino_acid_sequence="GGG",
                                          nucleotide_sequence="GGTGGTGGT",
                                          identifier="receptor_2",
                                          metadata=alpha_metadata)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[beta_sequence, alpha_sequence],
            path=path,
            metadata={"subject_id": "REP1"})

        # one-row metadata table describing the repertoire file
        metadata_path = path / "metadata.csv"
        metadata_table = pd.DataFrame({
            "filename": [f"{repertoire.identifier}_data.npy"],
            "subject_id": ["1"],
            "repertoire_identifier": [repertoire.identifier]
        })
        metadata_table.to_csv(metadata_path, index=False)

        return repertoire, metadata_path
예제 #8
0
    def test_encode(self):
        """Encode a two-repertoire dataset with Word2VecEncoder (k=3, vector size 16).

        Verifies the shape of the encoded examples, the "T1D" labels and the
        concrete encoder class returned by ``build_object``.
        """
        test_path = EnvironmentSettings.root_path / "test/tmp/w2v/"
        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        rep1 = Repertoire.build_from_sequence_objects(
            [sequence1, sequence2], test_path,
            {"T1D": "T1D", "subject_id": "1"})
        rep2 = Repertoire.build_from_sequence_objects(
            [sequence1], test_path,
            {"T1D": "CTL", "subject_id": "2"})

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(dataset,
                                               k=3,
                                               model_type="sequence",
                                               vector_size=16)

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)
        encoded = encoded_dataset.encoded_data

        self.assertIsNotNone(encoded)
        self.assertTrue(encoded.examples.shape[0] == 2)
        self.assertTrue(encoded.examples.shape[1] == 16)
        self.assertTrue(len(encoded.labels["T1D"]) == 2)
        self.assertTrue(encoded.labels["T1D"][0] == "T1D")
        self.assertIsInstance(encoder, W2VRepertoireEncoder)

        shutil.rmtree(test_path)
예제 #9
0
    def test_create_sentences_from_repertoire(self):
        """Check sentences of 3-mers created from a three-sequence repertoire."""
        path = EnvironmentSettings.tmp_test_path / "kmer/"
        PathBuilder.build(path)

        repertoire_sequences = [
            ReceptorSequence(amino_acid_sequence=amino_acids)
            for amino_acids in ("AACT", "ACCT", "AACT")
        ]
        rep = Repertoire.build_from_sequence_objects(repertoire_sequences,
                                                     path, {})

        sentences = KmerHelper.create_sentences_from_repertoire(rep, 3)

        self.assertEqual(3, len(sentences))
        # the first sequence "AACT" yields exactly the k-mers AAC and ACT
        first_sentence_ok = (len(sentences[0]) == 2
                             and "AAC" in sentences[0]
                             and "ACT" in sentences[0])
        self.assertTrue(first_sentence_ok)

        shutil.rmtree(path)
    def test(self):
        """Encode four pickled TCAB receptors with KmerFreqReceptorEncoder (k=3).

        Verifies the number of encoded examples, the example ids, that
        receptors 1 and 3 (identical chains) get equal encodings, and that the
        expected chain-prefixed feature names are present.
        """
        # (alpha chain, beta chain, identifier)
        receptor_specs = [("AAACCC", "AAACCC", "1"),
                          ("AAA", "CCC", "2"),
                          ("AAACCC", "AAACCC", "3"),
                          ("AAA", "CCC", "4")]
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa),
                         beta=ReceptorSequence(amino_acid_sequence=beta_aa),
                         identifier=receptor_id)
            for alpha_aa, beta_aa, receptor_id in receptor_specs
        ]

        path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
        PathBuilder.build(path)
        filename = path / "receptors.pkl"
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = ReceptorDataset(labels={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")

        encoder = KmerFreqReceptorEncoder.build_object(
            dataset,
            normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
            sequence_type=SequenceType.AMINO_ACID.name,
            k=3)

        encoder_params = EncoderParams(result_path=path / "2/",
                                       label_config=lc,
                                       pool_size=2,
                                       learn_model=True,
                                       model={},
                                       filename="dataset.csv",
                                       encode_labels=False)
        encoded_dataset = encoder.encode(dataset, encoder_params)
        encoded = encoded_dataset.encoded_data

        self.assertEqual(4, encoded.examples.shape[0])
        for identifier in ['1', '2', '3', '4']:
            self.assertIn(identifier, encoded.example_ids)
        self.assertTrue(
            numpy.array_equal(encoded.examples[0].A, encoded.examples[2].A))
        for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"]:
            self.assertIn(feature_name, encoded.feature_names)

        shutil.rmtree(path)
    def _construct_test_dataset(self, path, dataset_size: int = 50):
        """Build a two-receptor ReceptorDataset and a matching label configuration.

        ``dataset_size`` is accepted but not used by this helper.

        Returns:
            A tuple ``(dataset, label_configuration)`` where the label
            configuration declares label ``l1`` with values 1 and 2.
        """
        # (alpha chain, beta chain, value of label "l1", identifier)
        receptor_specs = [("AAAA", "ATA", 1, "1"), ("ATA", "ATT", 2, "2")]
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa),
                         beta=ReceptorSequence(amino_acid_sequence=beta_aa),
                         metadata={"l1": label},
                         identifier=receptor_id)
            for alpha_aa, beta_aa, label, receptor_id in receptor_specs
        ]

        PathBuilder.build(path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 2])

        return ReceptorDataset.build(receptors, 2, path), label_config
예제 #12
0
    def get_formatted_node_metadata(self, seq: ReceptorSequence):
        """Assemble the node metadata row for one receptor chain.

        The row contains, in order: the sequence, chain + V gene part before
        the first '-', chain + V gene, chain + J gene part before the first
        '-', chain + J gene, followed by one value for each name in
        ``self.additional_node_attributes``. An attribute for which
        ``seq.get_attribute`` raises KeyError is reported with a warning and
        represented as None.
        """
        chain = seq.get_attribute('chain').value
        v_gene = seq.get_attribute('v_gene')
        j_gene = seq.get_attribute('j_gene')

        extra_values = []
        for attr in self.additional_node_attributes:
            try:
                attribute_value = seq.get_attribute(attr)
            except KeyError:
                attribute_value = None
                warnings.warn(
                    f"CytoscapeNetworkExporter: additional metadata attribute {attr} was not found for some receptor chain(s), "
                    f"value None was used instead.")
            extra_values.append(attribute_value)

        v_prefix = v_gene.split('-')[0]
        j_prefix = j_gene.split('-')[0]

        node_row = [
            seq.get_sequence(),
            f"{chain}{v_prefix}",
            f"{chain}{v_gene}",
            f"{chain}{j_prefix}",
            f"{chain}{j_gene}",
        ]
        node_row.extend(extra_values)
        return node_row
예제 #13
0
 def test_create_IMGT_gapped_kmers_from_sequence(self):
     """Check the gapped (k-mer, position) pairs produced for "CASSRYUF"."""
     kmers = KmerHelper.create_IMGT_gapped_kmers_from_sequence(
         ReceptorSequence("CASSRYUF"), 2, 1, 1, 1)

     expected_pairs = [('CA.S', 105), ('AS.R', 106), ('SS.Y', 107),
                       ('SR.U', 108), ('RY.F', 114)]
     all_present = all(pair in kmers for pair in expected_pairs)
     self.assertTrue(all_present)
예제 #14
0
    def test_encode_sequence(self):
        """IdentitySequenceEncoder must return ["AAA"] for each tested frame type."""
        for frame_type in ("OUT", "STOP", "IN"):
            sequence = ReceptorSequence(
                amino_acid_sequence="AAA",
                metadata=SequenceMetadata(frame_type=frame_type))
            enc = IdentitySequenceEncoder()
            params = EncoderParams(model={},
                                   label_config=LabelConfiguration(),
                                   result_path="")

            self.assertEqual(["AAA"], enc.encode_sequence(sequence, params))
예제 #15
0
    def _construct_test_repertoiredataset(self, path, positional):
        """Build a two-repertoire dataset and its label configuration.

        ``positional`` selects which fixed set of sequences is placed into the
        two repertoires.

        Returns:
            A tuple ``(dataset, label_configuration)``; the label configuration
            declares ``l1`` with values [1, 2] and ``l2`` with values [0, 3].
        """
        if positional:
            first_sequences = [
                ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
            ]
            second_sequences = [
                ReceptorSequence("TTTTTTTTTTTTT", identifier="1"),
            ]
        else:
            first_sequences = [
                ReceptorSequence("AAAA", identifier="1"),
                ReceptorSequence("ATA", identifier="2"),
                ReceptorSequence("ATA", identifier='3'),
            ]
            second_sequences = [
                ReceptorSequence("ATA", identifier="1"),
                ReceptorSequence("TAA", identifier="2"),
            ]

        receptors1 = ReceptorSequenceList()
        for seq in first_sequences:
            receptors1.append(seq)

        receptors2 = ReceptorSequenceList()
        for seq in second_sequences:
            receptors2.append(seq)

        rep1 = Repertoire.build_from_sequence_objects(
            receptors1,
            metadata={"l1": 1, "l2": 2, "subject_id": "1"},
            path=path)
        rep2 = Repertoire.build_from_sequence_objects(
            receptors2,
            metadata={"l1": 0, "l2": 3, "subject_id": "2"},
            path=path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 2])
        label_config.add_label("l2", [0, 3])

        return RepertoireDataset(repertoires=[rep1, rep2]), label_config
예제 #16
0
    def create_from_record(cls, record: np.void):
        """Rebuild a TCGDReceptor from a structured record.

        The record must contain a ``version`` field equal to
        ``TCGDReceptor.version``. Fields prefixed with ``gamma_`` / ``delta_``
        are selected and renamed to the plain ReceptorSequence record names
        before each chain is reconstructed.

        Raises:
            NotImplementedError: if the version field is missing or differs.
        """
        version_matches = ('version' in record.dtype.names
                           and record['version'] == TCGDReceptor.version)
        if not version_matches:
            raise NotImplementedError(
                f"Supported ({TCGDReceptor.version}) and available version differ, but there is no converter available."
            )

        def _chain_record(prefix):
            # select the prefixed fields of one chain and strip the prefix
            chain_record = record[[
                prefix + name
                for name in ReceptorSequence.get_record_names()
            ]]
            chain_record.dtype.names = ReceptorSequence.get_record_names()
            return chain_record

        gamma_record = _chain_record('gamma_')
        delta_record = _chain_record('delta_')

        return TCGDReceptor(
            gamma=ReceptorSequence.create_from_record(gamma_record),
            delta=ReceptorSequence.create_from_record(delta_record),
            identifier=record['identifier'],
            metadata=json.loads(record['metadata']))
예제 #17
0
    def create_from_record(cls, record):
        """Rebuild a BCKReceptor from a structured record.

        The record must contain a ``version`` field equal to
        ``BCKReceptor.version``. Fields prefixed with ``heavy_`` / ``kappa_``
        are selected and renamed to the plain ReceptorSequence record names
        before each chain is reconstructed.

        Raises:
            NotImplementedError: if the version field is missing or differs.
        """
        version_matches = ('version' in record.dtype.names
                           and record['version'] == BCKReceptor.version)
        if not version_matches:
            raise NotImplementedError(
                f"Supported ({BCKReceptor.version}) and available version differ, but there is no converter available."
            )

        def _chain_record(prefix):
            # select the prefixed fields of one chain and strip the prefix
            chain_record = record[[
                prefix + name
                for name in ReceptorSequence.get_record_names()
            ]]
            chain_record.dtype.names = ReceptorSequence.get_record_names()
            return chain_record

        heavy_record = _chain_record('heavy_')
        kappa_record = _chain_record('kappa_')

        return BCKReceptor(
            heavy=ReceptorSequence.create_from_record(heavy_record),
            kappa=ReceptorSequence.create_from_record(kappa_record),
            identifier=record['identifier'],
            metadata=json.loads(record['metadata']))
예제 #18
0
    def test_split_dataset(self):
        """Split ten receptors by the "subject" metadata value (i % 3).

        Expects three train/test pairs where test set ``i`` holds only
        receptors of subject ``i`` and train set ``i`` holds none of them.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "leave_one_out_splitter/")
        receptors = [
            TCABReceptor(ReceptorSequence(), ReceptorSequence(),
                         {"subject": index % 3})
            for index in range(10)
        ]

        filename = path / "batch1.pickle"
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        dataset = ReceptorDataset(filenames=[filename])

        result_paths = [path / f"result_{i}/" for i in range(1, 4)]
        split_config = SplitConfig(
            SplitType.LEAVE_ONE_OUT_STRATIFICATION,
            split_count=3,
            leave_one_out_config=LeaveOneOutConfig("subject", 1))
        params = DataSplitterParams(dataset,
                                    SplitType.LEAVE_ONE_OUT_STRATIFICATION,
                                    3,
                                    paths=result_paths,
                                    split_config=split_config)

        train_datasets, test_datasets = LeaveOneOutSplitter.split_dataset(
            params)

        self.assertEqual(3, len(train_datasets))
        self.assertEqual(3, len(test_datasets))

        for subject in range(3):
            held_out = test_datasets[subject].get_data()
            self.assertTrue(
                all(receptor.metadata["subject"] == subject
                    for receptor in held_out))
            remaining = train_datasets[subject].get_data()
            self.assertTrue(
                all(receptor.metadata["subject"] != subject
                    for receptor in remaining))

        shutil.rmtree(path)
예제 #19
0
    def test_encode_sequence(self):
        """Check 3-mer encoding of "CASSVFRTY" and the result for a too-short sequence."""

        def encode(amino_acids):
            # fresh sequence and encoder parameters for every call
            sequence = ReceptorSequence(amino_acid_sequence=amino_acids)
            params = EncoderParams(model={"k": 3},
                                   label_config=LabelConfiguration(),
                                   result_path="",
                                   pool_size=4)
            return KmerSequenceEncoder.encode_sequence(sequence, params)

        result = encode("CASSVFRTY")

        for kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
            self.assertIn(kmer, result)
        self.assertEqual(7, len(result))

        # "AC" is shorter than k=3; the test expects None in that case
        self.assertIsNone(encode("AC"))
예제 #20
0
    def match_sequence(self, sequence: ReceptorSequence,
                       reference_sequences: list, max_distance: int) -> dict:
        """Collect the reference sequences that match ``sequence``.

        A reference is included when ``self.matches_sequence(sequence,
        reference, max_distance)`` is truthy.

        Returns:
            A dict with the matching reference sequence strings, the query
            sequence string and the query's v_gene, j_gene and chain metadata.
        """
        matches = []
        for reference in reference_sequences:
            if self.matches_sequence(sequence, reference, max_distance):
                matches.append(reference.get_sequence())

        summary = {"matching_sequences": matches}
        summary["sequence"] = sequence.get_sequence()
        summary["v_gene"] = sequence.metadata.v_gene
        summary["j_gene"] = sequence.metadata.j_gene
        summary["chain"] = sequence.metadata.chain
        return summary
    def test_process(self):
        """Check SubjectRepertoireCollector on three repertoires of two subjects.

        Two repertoires share subject "patient1"; the processed dataset must
        hold two repertoires with 2 and 1 sequences, while the input dataset
        keeps its three repertoires.
        """
        path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
        PathBuilder.build(path)

        # (amino acid sequence, sequence identifier, subject id)
        repertoire_specs = [("AAA", "1", "patient1"),
                            ("AAC", "2", "patient1"),
                            ("AAC", "3", "patient3")]
        reps = [
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence(amino_acids, identifier=sequence_id)],
                path=path,
                metadata={"subject_id": subject_id})
            for amino_acids, sequence_id, subject_id in repertoire_specs
        ]

        dataset = RepertoireDataset(repertoires=reps)

        collected = SubjectRepertoireCollector().process_dataset(
            dataset, path / "result")

        self.assertEqual(2, len(collected.get_data()))
        self.assertEqual(3, len(dataset.get_data()))

        expected_sizes = [2, 1]
        for position, rep in enumerate(collected.get_data()):
            self.assertEqual(expected_sizes[position], len(rep.sequences))

        shutil.rmtree(path)
예제 #22
0
    def test_match_repertoire(self):
        """Match a four-sequence repertoire against two reference sequences.

        Checks the keys of the result, the number of matches per repertoire
        sequence, the repertoire metadata, and the clonal percentage summary.
        """
        path = EnvironmentSettings.root_path / "test/tmp/seqmatchrep/"
        PathBuilder.build(path)

        # (amino acid sequence, identifier, count)
        repertoire_specs = [("AAAAAA", "1", 3), ("CCCCCC", "2", 2),
                            ("AAAACC", "3", 1), ("TADQVF", "4", 4)]
        repertoire_sequences = [
            ReceptorSequence(amino_acid_sequence=amino_acids,
                             identifier=sequence_id,
                             metadata=SequenceMetadata(chain="A", count=count))
            for amino_acids, sequence_id, count in repertoire_specs
        ]
        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=repertoire_sequences,
            metadata={"CD": True},
            path=path)

        sequences = [
            ReceptorSequence(amino_acids, metadata=SequenceMetadata(chain="A"))
            for amino_acids in ("AAAACA", "TADQV")
        ]

        matcher = SequenceMatcher()
        result = matcher.match_repertoire(repertoire, 0, sequences, 2,
                                          SequenceMatchingSummaryType.COUNT)

        for key in ("sequences", "repertoire", "repertoire_index"):
            self.assertIn(key, result)

        self.assertEqual(4, len(result["sequences"]))
        expected_match_counts = [1, 0, 1, 1]
        for position, expected in enumerate(expected_match_counts):
            self.assertEqual(
                expected,
                len(result["sequences"][position]["matching_sequences"]))

        matched_total = sum(1 for entry in result["sequences"]
                            if len(entry["matching_sequences"]) > 0)
        self.assertEqual(3, matched_total)
        self.assertTrue(result["metadata"]["CD"])

        result = matcher.match_repertoire(
            repertoire, 0, sequences, 2,
            SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
        self.assertEqual(0.8, result["clonal_percentage"])

        shutil.rmtree(path)
예제 #23
0
    def test_match(self):
        """Match a one-repertoire dataset against two reference sequences.

        Checks that the result lists one repertoire, carries its metadata, and
        that the fourth repertoire sequence has exactly one match.
        """
        path = EnvironmentSettings.root_path / "test/tmp/seqmatch/"
        PathBuilder.build(path)

        def make_sequence(amino_acids, j_gene, sequence_id):
            # all test sequences share chain "A" and V gene "V1"
            return ReceptorSequence(amino_acids,
                                    metadata=SequenceMetadata(chain="A",
                                                              v_gene="V1",
                                                              j_gene=j_gene),
                                    identifier=sequence_id)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                make_sequence("AAAAAA", "J2", "3"),
                make_sequence("CCCCCC", "J2", "4"),
                make_sequence("AAAACC", "J2", "5"),
                make_sequence("TADQVF", "J3", "6"),
            ],
            metadata={"CD": True},
            path=path)

        dataset = RepertoireDataset(repertoires=[repertoire])
        sequences = [
            make_sequence("AAAACA", "J2", "1"),
            make_sequence("TADQV", "J3", "2"),
        ]

        matcher = SequenceMatcher()
        result = matcher.match(dataset, sequences, 2,
                               SequenceMatchingSummaryType.PERCENTAGE)

        self.assertIn("repertoires", result)
        fourth_sequence = result["repertoires"][0]["sequences"][3]
        self.assertEqual(1, len(fourth_sequence["matching_sequences"]))
        self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
        self.assertEqual(1, len(result["repertoires"]))

        shutil.rmtree(path)
    def test_encode_sequence(self):
        """Check IMGT gapped k-mers (k_left=1, max_gap=1) for two sequences."""

        def encode(amino_acids):
            # build a fresh sequence and fresh parameters for every call
            sequence = ReceptorSequence(amino_acids, None, None)
            params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                                   label_config=LabelConfiguration(),
                                   result_path="")
            return IMGTGappedKmerEncoder.encode_sequence(sequence, params)

        expected_short = {
            'AH///105', 'HC///106', 'CD///107', 'DE///116', 'A.C///105',
            'H.D///106', 'C.E///107'
        }
        self.assertEqual(expected_short, set(encode("AHCDE")))

        expected_long = {
            'CA///105', 'AS///106', 'SS///107', 'SP///108', 'PR///109',
            'RE///110', 'ER///111', 'RA///111.001', 'AT///112.002',
            'TY///112.001', 'YE///112', 'EQ///113', 'QC///114', 'CA///115',
            'AY///116', 'C.S///105', 'A.S///106', 'S.P///107', 'S.R///108',
            'P.E///109', 'R.R///110', 'E.A///111', 'R.T///111.001',
            'A.Y///112.002', 'T.E///112.001', 'Y.Q///112', 'E.C///113',
            'Q.A///114', 'C.Y///115'
        }
        self.assertEqual(expected_long, set(encode("CASSPRERATYEQCAY")))
예제 #25
0
    def _make_sequence_object(self, row, load_implants: bool = False):
        """Build a ReceptorSequence from one structured row.

        Every known column is read from ``row`` when present in
        ``row.dtype.names``; missing columns become None, except the frame
        type, which falls back to "IN". The count is also None when
        ``NumpyHelper.is_nan_or_empty`` reports it as such. Columns listed in
        ``self.fields`` but not in ``Repertoire.FIELDS`` are collected into
        ``custom_params``.

        When ``load_implants`` is true, each truthy value of a column outside
        ``Repertoire.FIELDS`` is parsed with ``ast.literal_eval`` and turned
        into an ImplantAnnotation; values that fail to parse or construct are
        skipped silently.
        """
        fields = row.dtype.names

        def _column(name, default=None):
            # value of a column if the row has it, otherwise the default
            return row[name] if name in fields else default

        implants = []
        if load_implants:
            for key in fields:
                if key in Repertoire.FIELDS:
                    continue
                raw_annotation = row[key]
                if not raw_annotation:
                    continue
                try:
                    implants.append(
                        ImplantAnnotation(**ast.literal_eval(raw_annotation)))
                except (SyntaxError, ValueError, TypeError):
                    # not an implant annotation; ignore this column value
                    pass

        if "counts" in fields and not NumpyHelper.is_nan_or_empty(
                row['counts']):
            count = row["counts"]
        else:
            count = None

        custom_params = {
            key: _column(key)
            for key in set(self.fields) - set(Repertoire.FIELDS)
        }

        metadata = SequenceMetadata(v_gene=_column("v_genes"),
                                    j_gene=_column("j_genes"),
                                    v_subgroup=_column("v_subgroups"),
                                    j_subgroup=_column("j_subgroups"),
                                    v_allele=_column("v_alleles"),
                                    j_allele=_column("j_alleles"),
                                    chain=_column("chains"),
                                    count=count,
                                    region_type=_column("region_types"),
                                    frame_type=_column("frame_types", "IN"),
                                    cell_id=_column("cell_ids"),
                                    custom_params=custom_params)

        return ReceptorSequence(
            amino_acid_sequence=_column("sequence_aas"),
            nucleotide_sequence=_column("sequences"),
            identifier=_column("sequence_identifiers"),
            metadata=metadata,
            annotation=SequenceAnnotation(implants=implants))
예제 #26
0
    def process_iris_chain(row, chain, dual_chain_id, all_genes):
        """Create the receptor sequences of one chain from a single row.

        The V and J genes are read from the columns "TR<chain> - V gene (1)"
        and "TR<chain> - J gene (1)", where several genes are separated by
        " | ". The amino acid sequence is read from the column
        "Chain: TR<chain> (<dual_chain_id>)".

        :param row: mapping from column name to string value
        :param chain: chain letter used in the column names; it is also
            stripped from the gene names
        :param dual_chain_id: index of the chain within the row; stored in
            the metadata custom_params under "dual_chain_id"
        :param all_genes: if truthy, one sequence is created for every V/J
            combination; otherwise a single V/J pair is used
        :return: ReceptorSequenceList with the created sequences
        """
        chain_prefix = "TR{}".format(chain)

        def alleles_in(column):
            # Drop the "TR<chain>" prefix and any remaining chain letter.
            return {gene.replace(chain_prefix, "").replace(chain, "") for gene in row[column].split(" | ")}

        def build_metadata(v_allele, j_allele):
            return SequenceMetadata(v_gene=v_allele.split(Constants.ALLELE_DELIMITER)[0], v_allele=v_allele, v_subgroup=v_allele.split("-")[0],
                                    j_gene=j_allele.split(Constants.ALLELE_DELIMITER)[0], j_allele=j_allele, j_subgroup=j_allele.split("-")[0], chain=chain,
                                    custom_params={"dual_chain_id": dual_chain_id})

        v_alleles = alleles_in("TR{} - V gene (1)".format(chain))
        j_alleles = alleles_in("TR{} - J gene (1)".format(chain))

        if all_genes:
            allele_pairs = [(v_allele, j_allele) for v_allele in v_alleles for j_allele in j_alleles]
        else:
            # set.pop() hands back an arbitrary element, which serves as the
            # "random" V and J gene choice here.
            allele_pairs = [(v_alleles.pop(), j_alleles.pop())]

        sequences = ReceptorSequenceList()
        for v_allele, j_allele in allele_pairs:
            metadata = build_metadata(v_allele, j_allele)
            sequences.append(ReceptorSequence(amino_acid_sequence=row[f"Chain: TR{chain} ({dual_chain_id})"], metadata=metadata))

        return sequences
예제 #27
0
    def test_encode_sequence(self):
        """Check IMGTGappedKmerEncoder.encode_sequence on two sequences.

        Each case pairs an amino acid sequence with the exact set of
        features expected for k_left=1 and max_gap=1. Features are written
        as the (possibly gapped) k-mer, a "-" and a position label.
        """
        cases = [
            ("AHCDE", {
                'AH-105', 'HC-106', 'CD-107', 'DE-116', 'A.C-105', 'H.D-106',
                'C.E-107'
            }),
            ("CASSPRERATYEQCAY", {
                'CA-105', 'AS-106', 'SS-107', 'SP-108', 'PR-109', 'RE-110',
                'ER-111', 'RA-111.001', 'AT-112.002', 'TY-112.001', 'YE-112',
                'EQ-113', 'QC-114', 'CA-115', 'AY-116', 'C.S-105', 'A.S-106',
                'S.P-107', 'S.R-108', 'P.E-109', 'R.R-110', 'E.A-111',
                'R.T-111.001', 'A.Y-112.002', 'T.E-112.001', 'Y.Q-112',
                'E.C-113', 'Q.A-114', 'C.Y-115'
            }),
        ]

        for amino_acids, expected_features in cases:
            # New sequence and parameter objects for every case.
            receptor_sequence = ReceptorSequence(amino_acids, None, None)
            encoder_params = EncoderParams(model={
                "k_left": 1,
                "max_gap": 1
            },
                                           label_config=LabelConfiguration(),
                                           result_path="")
            features = IMGTGappedKmerEncoder.encode_sequence(
                receptor_sequence, encoder_params)

            self.assertEqual(expected_features, set(features))
예제 #28
0
 def create_dummy_sequencedataset(self, path):
     """Create a SequenceDataset holding three fixed receptor sequences.

     The dataset is built in the "sequences" directory under ``path``: one
     alpha-chain sequence ("1a") and two beta-chain sequences ("1b", "2b")
     that share the same amino acids and gene calls.

     :param path: directory supporting the ``/`` operator
     :return: the result of SequenceDataset.build
     """

     def make_sequence(amino_acids, identifier, v_gene, j_gene, chain,
                       custom_params):
         # All dummy sequences are in-frame; only the arguments vary.
         return ReceptorSequence(amino_acid_sequence=amino_acids,
                                 identifier=identifier,
                                 metadata=SequenceMetadata(
                                     v_gene=v_gene,
                                     j_gene=j_gene,
                                     chain=chain,
                                     frame_type="IN",
                                     custom_params=custom_params))

     sequences = [
         make_sequence("AAATTT", "1a", "TRAV1", "TRAJ1", Chain.ALPHA, {
             "d_call": "TRAD1",
             "custom1": "cust1"
         }),
         make_sequence("ATATAT", "1b", "TRBV1", "TRBJ1", Chain.BETA, {
             "d_call": "TRBD1",
             "custom2": "cust1"
         }),
         make_sequence("ATATAT", "2b", "TRBV1", "TRBJ1", Chain.BETA, {
             "d_call": "TRBD1",
             "custom2": "cust1"
         }),
     ]

     sequences_path = path / "sequences"
     PathBuilder.build(sequences_path)

     # NOTE(review): the positional 2 is presumably a per-file/batch size -
     # confirm against SequenceDataset.build.
     return SequenceDataset.build(sequences, 2, sequences_path)
예제 #29
0
    def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
        """Build one repertoire per list of sequences and write a metadata file.

        :param sequences: list with one list of amino acid sequences per repertoire
        :param path: directory for the output; repertoires are built in its
            "repertoires" subdirectory and "metadata.csv" is written into it
        :param labels: optional dict mapping a label name to a list with one
            value per repertoire; values are added to each repertoire's
            metadata and to the columns of metadata.csv
        :param seq_metadata: optional list parallel to ``sequences`` holding,
            for every sequence, a dict of SequenceMetadata keyword arguments;
            when omitted, every sequence gets the same fixed TRB metadata
        :param subject_ids: optional list with one subject id per repertoire;
            when omitted, ids "rep_0", "rep_1", ... are generated
        :return: tuple (list of repertoires, path to metadata.csv)
        """
        if subject_ids is not None:
            assert len(subject_ids) == len(sequences)

        if seq_metadata is not None:
            assert len(sequences) == len(seq_metadata)
            for sequence_list, metadata_list in zip(sequences, seq_metadata):
                assert len(sequence_list) == len(metadata_list)

        PathBuilder.build(path)
        rep_path = PathBuilder.build(path / "repertoires")
        metadata_file = path / "metadata.csv"

        # Metadata given to every sequence when the caller supplies none.
        default_seq_metadata = dict(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01",
                                    j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01",
                                    count=1, chain="TRB", region_type="IMGT_CDR3")

        if subject_ids is None:
            subject_ids = []

        repertoires = []
        for rep_index, sequence_list in enumerate(sequences):
            rep_sequences = ReceptorSequenceList()

            # Generate the subject id for this repertoire if none was given.
            if len(subject_ids) < len(sequences):
                subject_ids.append("rep_" + str(rep_index))

            for seq_index, sequence in enumerate(sequence_list):
                metadata_kwargs = default_seq_metadata if seq_metadata is None else seq_metadata[rep_index][seq_index]
                rep_sequences.append(ReceptorSequence(amino_acid_sequence=sequence,
                                                      metadata=SequenceMetadata(**metadata_kwargs),
                                                      identifier=str(seq_index)))

            rep_metadata = {} if labels is None else {key: labels[key][rep_index] for key in labels}
            # The subject id takes precedence over a label with the same name.
            rep_metadata["subject_id"] = subject_ids[rep_index]

            repertoires.append(Repertoire.build_from_sequence_objects(rep_sequences, rep_path, rep_metadata,
                                                                      filename_base=f"rep_{rep_index}"))

        columns = {"filename": [rep.data_filename for rep in repertoires],
                   "subject_id": subject_ids,
                   "repertoire_identifier": [rep.identifier for rep in repertoires]}
        if labels is not None:
            # Label columns come last and replace same-named columns above.
            columns.update(labels)

        pd.DataFrame(columns).to_csv(metadata_file, index=False)

        return repertoires, metadata_file
예제 #30
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Creates k-mers of length params.model["k"] from a sequence through
        KmerHelper.create_kmers_from_sequence.
        :param sequence: ReceptorSequence; its get_sequence() value determines the length check
        :param params: EncoderParams (where params.model["k"] is used)
        :return: the k-mers returned by KmerHelper, or None (after logging a warning)
                 if the sequence is shorter than k
        """
        k = params.model["k"]

        if len(sequence.get_sequence()) >= k:
            return KmerHelper.create_kmers_from_sequence(sequence, k)

        # Too short to contain even a single k-mer: report it and skip the sequence.
        logging.warning('KmerSequenceEncoder: Sequence length is less than k. Ignoring sequence...')
        return None