Пример #1
0
    def create_dataset(self, path, dataset_size: int = 50):
        """
        Build a pickled sequence dataset made of two alternating example types.

        Sequences with an even index get the amino acid sequence "AAACCC" and
        the custom parameter l1=1, sequences with an odd index get "ACACAC"
        and l1=2. All of them are pickled into the single file
        "<path>sequences.pkl".

        :param path: prefix for the pickle file; it is concatenated directly
            with the file name, so it has to end with a path separator
        :param dataset_size: number of sequences to generate
        :return: SequenceDataset with identifier "d1" referring to that file
        """
        # (amino acid sequence, value of "l1") for even / odd indices
        variants = (("AAACCC", 1), ("ACACAC", 2))

        sequences = []
        for i in range(dataset_size):
            amino_acids, label = variants[i % 2]
            sequences.append(
                ReceptorSequence(
                    amino_acid_sequence=amino_acids,
                    identifier=str(i),
                    metadata=SequenceMetadata(custom_params={"l1": label})))

        PathBuilder.build(path)
        filename = "{}sequences.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        return SequenceDataset(params={"l1": [1, 2]},
                               filenames=[filename],
                               identifier="d1")
Пример #2
0
    def test_encode_sequence(self):
        """
        Encoding "CASSVFRTY" with k=3 has to yield its seven overlapping
        3-mers, while a sequence shorter than k is encoded as None.
        """
        def build_params():
            # a fresh parameter object with k=3 for every call
            return EncoderParams(model={"k": 3},
                                 label_config=LabelConfiguration(),
                                 result_path="",
                                 pool_size=4)

        encoded = KmerSequenceEncoder.encode_sequence(
            ReceptorSequence(amino_acid_sequence="CASSVFRTY"), build_params())

        for kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
            self.assertTrue(kmer in encoded)

        self.assertEqual(7, len(encoded))

        # two residues cannot form a 3-mer
        encoded_short = KmerSequenceEncoder.encode_sequence(
            ReceptorSequence(amino_acid_sequence="AC"), build_params())
        self.assertEqual(encoded_short, None)
Пример #3
0
    def create_dummy_sequencedataset(self, path):
        """
        Build a small sequence dataset: one alpha chain ("1a") followed by two
        beta chains ("1b", "2b") that differ only in their identifier.

        :param path: prefix to which "sequences" is appended before it is
            handed to SequenceDataset.build
        :return: the result of SequenceDataset.build for the three sequences
        """
        def beta_chain(identifier):
            # the two beta chains share everything except the identifier
            return ReceptorSequence(
                amino_acid_sequence="ATATAT",
                identifier=identifier,
                metadata=SequenceMetadata(
                    v_gene="TRBV1",
                    j_gene="TRBJ1",
                    chain=Chain.BETA,
                    frame_type="IN",
                    custom_params={"d_call": "TRBD1", "custom2": "cust1"}))

        alpha_chain = ReceptorSequence(
            amino_acid_sequence="AAATTT",
            identifier="1a",
            metadata=SequenceMetadata(
                v_gene="TRAV1",
                j_gene="TRAJ1",
                chain=Chain.ALPHA,
                frame_type="IN",
                custom_params={"d_call": "TRAD1", "custom1": "cust1"}))

        sequences = [alpha_chain, beta_chain("1b"), beta_chain("2b")]
        target = "{}sequences".format(path)
        return SequenceDataset.build(sequences, 2, target)
Пример #4
0
 def create_IMGT_gapped_kmers_from_sequence(sequence: ReceptorSequence,
                                            k_left: int,
                                            max_gap: int,
                                            k_right: int = None,
                                            min_gap: int = 0):
     """
     Create gapped k-mers from a sequence and pair each one with a position.

     Every residue is first annotated with the position produced by
     PositionHelper.gen_imgt_positions_from_length; the gapped k-mers are then
     built from these (residue, position) pairs.

     :param sequence: ReceptorSequence to split into gapped k-mers
     :param k_left: passed on as k_left to create_gapped_kmers_from_string
     :param max_gap: passed on as max_gap
     :param k_right: passed on as k_right
     :param min_gap: passed on as min_gap
     :return: list of (k-mer string, position) tuples, or None if the helper
         returned None
     """
     residues = sequence.get_sequence()
     imgt_positions = PositionHelper.gen_imgt_positions_from_length(
         len(residues))
     annotated = list(zip(residues, imgt_positions))

     gapped = KmerHelper.create_gapped_kmers_from_string(annotated,
                                                         k_left=k_left,
                                                         max_gap=max_gap,
                                                         k_right=k_right,
                                                         min_gap=min_gap)
     if gapped is None:
         return None

     result = []
     for kmer in gapped:
         # items that are not (residue, position) tuples are joined as they
         # are -- presumably gap markers, verify against KmerHelper
         text = ''.join(
             [item[0] if isinstance(item, tuple) else item for item in kmer])

         # non-tuple items count as 1000 so they never win the minimum
         lowest = min(
             [item[1] if isinstance(item, tuple) else 1000 for item in kmer])

         if int(lowest) != 112:
             position = lowest
         else:
             # when the smallest position truncates to 112, the largest
             # position truncating to 112 is reported instead
             # NOTE(review): presumably due to IMGT numbering around
             # position 112 -- confirm
             position = max([
                 item[1] for item in kmer
                 if isinstance(item, tuple) and int(item[1]) == 112
             ])

         result.append((text, position))
     return result
Пример #5
0
    def test_create_model(self):
        """
        Train a k-mer pair model on two small repertoires and check the type
        and the vocabulary of the returned model.

        Everything after the temporary directory has been created runs inside
        try/finally, so the directory is removed even if an assertion fails.
        """
        test_path = EnvironmentSettings.root_path + "test/tmp/w2v_test_tmp/"
        PathBuilder.build(test_path)

        try:
            sequence1 = ReceptorSequence("CASSVFA")
            sequence2 = ReceptorSequence("CASSCCC")

            metadata1 = {"T1D": "T1D", "subject_id": "1"}
            rep1 = Repertoire.build_from_sequence_objects(
                [sequence1, sequence2], test_path, metadata1)

            metadata2 = {"T1D": "CTL", "subject_id": "2"}
            rep2 = Repertoire.build_from_sequence_objects(
                [sequence1], test_path, metadata2)

            dataset = RepertoireDataset(repertoires=[rep1, rep2])

            model_creator = KmerPairModelCreator()
            model = model_creator.create_model(
                dataset=dataset,
                k=2,
                vector_size=16,
                batch_size=1,
                model_path=test_path + "model.model")

            # assertIsInstance / assertIn report the offending value on failure
            self.assertIsInstance(model, Word2Vec)
            self.assertIn("CA", model.wv.vocab)
            # NOTE(review): 400 is presumably 20 amino acids squared, i.e.
            # every possible 2-mer -- confirm against KmerPairModelCreator
            self.assertEqual(400, len(model.wv.vocab))
        finally:
            shutil.rmtree(test_path)
Пример #6
0
    def test_implant_in_repertoire(self):
        """
        Implant the motif "CCC" into a two-sequence repertoire with rate 0.5
        and check that one original sequence survives while at least one
        sequence of the new repertoire contains the motif.

        The temporary directory is removed in a finally clause so that a
        failing assertion does not leave files behind.
        """
        path = EnvironmentSettings.tmp_test_path + "healthysequenceimplanting/"
        PathBuilder.build(path)

        try:
            repertoire = Repertoire.build_from_sequence_objects(
                [
                    ReceptorSequence(amino_acid_sequence="ACDFQ",
                                     identifier="1"),
                    ReceptorSequence(amino_acid_sequence="TGCDF",
                                     identifier="2")
                ],
                path=path,
                metadata={"subject_id": "1"})
            implanting = HealthySequenceImplanting(
                GappedMotifImplanting(),
                implanting_computation=ImplantingComputation.ROUND)
            signal = Signal(
                "1", [Motif("m1", GappedKmerInstantiation(), "CCC")],
                implanting)

            repertoire2 = implanting.implant_in_repertoire(
                repertoire, 0.5, signal, path)

            new_sequences = [
                sequence.get_sequence() for sequence in repertoire2.sequences
            ]
            # at least one of the two input sequences must be left unchanged
            self.assertTrue("ACDFQ" in new_sequences
                            or "TGCDF" in new_sequences)
            # a generator lets any() stop at the first match
            self.assertTrue(
                any("CCC" in sequence for sequence in new_sequences))
        finally:
            shutil.rmtree(path)
Пример #7
0
    def test_create_kmers_from_sequence(self):
        """
        A 7-residue sequence has to give its five overlapping 3-mers; a
        sequence shorter than k has to give an empty collection.
        """
        full_sequence = ReceptorSequence(amino_acid_sequence="ABCDEFG")
        kmers = KmerHelper.create_kmers_from_sequence(full_sequence, 3)

        expected = ("ABC", "BCD", "CDE", "DEF", "EFG")
        self.assertTrue(all(kmer in kmers for kmer in expected))
        self.assertEqual(5, len(kmers))

        short_sequence = ReceptorSequence(amino_acid_sequence="AB")
        kmers = KmerHelper.create_kmers_from_sequence(short_sequence, 3)
        self.assertTrue(len(kmers) == 0)
Пример #8
0
    def test_encode_sequence(self):
        """
        The identity encoder has to return ["AAA"] for the sequence "AAA"
        with each of the frame types OUT, STOP and IN.
        """
        for frame_type in ("OUT", "STOP", "IN"):
            sequence = ReceptorSequence(
                amino_acid_sequence="AAA",
                metadata=SequenceMetadata(frame_type=frame_type))
            enc = IdentitySequenceEncoder()

            encoded = enc.encode_sequence(
                sequence,
                EncoderParams(model={},
                              label_config=LabelConfiguration(),
                              result_path=""))

            self.assertEqual(["AAA"], encoded)
Пример #9
0
    def test_get_sequence(self):
        """
        With the sequence type set to AMINO_ACID, get_sequence has to return
        the amino acid string of a sequence that also carries nucleotides.
        """
        receptor_sequence = ReceptorSequence(
            amino_acid_sequence="CAS",
            nucleotide_sequence="TGTGCTTCC",
        )

        EnvironmentSettings.set_sequence_type(SequenceType.AMINO_ACID)

        observed = receptor_sequence.get_sequence()
        self.assertEqual(observed, "CAS")
Пример #10
0
    def test_implant(self):
        """
        Implant gapped and ungapped motif instances into a sequence of ten
        "A" residues and check where the motif ends up, with and without
        explicit sequence position weights.
        """
        strategy = GappedMotifImplanting()

        def implant(instance, **extra):
            # always start from a fresh all-"A" sequence of length 10
            return strategy.implant(
                ReceptorSequence(amino_acid_sequence="AAAAAAAAAA"),
                {
                    "signal_id": "1",
                    "motif_id": "1",
                    "motif_instance": instance
                },
                **extra)

        def start_weights():
            # a new dict per call, as each call site had its own literal
            return {105: 0.8, 106: 0.2}

        # "CC/T" with gap 2: the implanted pattern is expected as "CCAAT"
        gapped_instance = MotifInstance("CC/T", 2)

        implanted = implant(gapped_instance)
        self.assertTrue(implanted.get_sequence().find("CCAAT") > -1)
        self.assertEqual(10, len(implanted.get_sequence()))

        implanted = implant(gapped_instance,
                            sequence_position_weights=start_weights())
        self.assertTrue(-1 < implanted.get_sequence().find("CCAAT") < 2)
        self.assertEqual(10, len(implanted.get_sequence()))

        # motif without a gap
        implanted = implant(MotifInstance("CCT", 0),
                            sequence_position_weights=start_weights())
        self.assertTrue(-1 < implanted.get_sequence().find("CCT") < 2)
        self.assertEqual(10, len(implanted.get_sequence()))

        # gap marker present but gap size 0: the marker must not be emitted
        implanted = implant(MotifInstance("C/T", 0),
                            sequence_position_weights=start_weights())
        self.assertTrue(-1 < implanted.get_sequence().find("CT") < 2)
        self.assertTrue("/" not in implanted.get_sequence())
Пример #11
0
 def create_IMGT_kmers_from_sequence(sequence: ReceptorSequence, k: int):
     """
     Create the k-mers of a sequence and pair each one with a position.

     Every residue is annotated with the position produced by
     PositionHelper.gen_imgt_positions_from_length before the k-mers are
     built from the (residue, position) pairs.

     :param sequence: ReceptorSequence to split into k-mers
     :param k: k-mer length passed on to KmerHelper.create_kmers_from_string
     :return: list of (k-mer string, position) tuples
     """
     residues = sequence.get_sequence()
     imgt_positions = PositionHelper.gen_imgt_positions_from_length(
         len(residues))
     annotated = list(zip(residues, imgt_positions))

     result = []
     for kmer in KmerHelper.create_kmers_from_string(annotated, k):
         text = ''.join([item[0] for item in kmer])
         positions = [item[1] for item in kmer]

         first = min(positions)
         if int(first) != 112:
             anchor = first
         else:
             # when the smallest position truncates to 112, report the
             # largest position that truncates to 112 instead
             # NOTE(review): presumably due to IMGT numbering around
             # position 112 -- confirm
             anchor = max([pos for pos in positions if int(pos) == 112])

         result.append((text, anchor))
     return result
Пример #12
0
 def matches_sequence(self, original_sequence: ReceptorSequence, reference_sequence: ReceptorSequence, max_distance):
     """
     Decide whether a sequence matches a reference sequence.

     :param original_sequence: ReceptorSequence that is checked
     :param reference_sequence: ReceptorSequence it is compared against
     :param max_distance: largest edit distance between the two sequences that still counts as a match
     :return: truthy if the chains are equal, matches_gene accepts both the v genes and the j genes, and the
              edit distance between the sequences is at most max_distance
     """
     reference_meta = reference_sequence.metadata
     original_meta = original_sequence.metadata

     # checks are ordered so that the edit distance is only computed when
     # chain and genes already agree
     return (reference_meta.chain == original_meta.chain
             and self.matches_gene(reference_meta.v_gene, original_meta.v_gene)
             and self.matches_gene(reference_meta.j_gene, original_meta.j_gene)
             and edit_distance(original_sequence.get_sequence(),
                               reference_sequence.get_sequence()) <= max_distance)
Пример #13
0
    def test_implant_in_sequence(self):
        """
        Implanting the motif "CCC" into "ACDFQ" has to keep the sequence
        length and produce a sequence that contains the motif.
        """
        implanting = HealthySequenceImplanting(
            GappedMotifImplanting(),
            implanting_computation=ImplantingComputation.ROUND)
        motif = Motif("m1", GappedKmerInstantiation(), "CCC")
        signal = Signal("1", [motif], implanting)

        original = ReceptorSequence(amino_acid_sequence="ACDFQ")
        implanted = implanting.implant_in_sequence(original, signal)

        self.assertEqual(len(original.get_sequence()),
                         len(implanted.get_sequence()))
        self.assertTrue("CCC" in implanted.get_sequence())
Пример #14
0
    def construct_test_flatten_dataset(self, path):
        """
        Build a pickled two-sequence dataset: "AAATTT" with l1=1 and "ATATAT"
        with l1=2, stored in the single file "<path>sequences.pkl".

        :param path: prefix for the pickle file; it is concatenated directly
            with the file name, so it has to end with a path separator
        :return: SequenceDataset with identifier "d1" referring to that file
        """
        # (amino acid sequence, identifier, value of "l1")
        examples = (("AAATTT", "1", 1), ("ATATAT", "2", 2))
        sequences = [
            ReceptorSequence(
                amino_acid_sequence=amino_acids,
                identifier=identifier,
                metadata=SequenceMetadata(custom_params={"l1": label}))
            for amino_acids, identifier, label in examples
        ]

        PathBuilder.build(path)
        filename = "{}sequences.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        return SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
Пример #15
0
    def create_dummy_receptordataset(self, path):
        """
        Build a receptor dataset of two alpha/beta receptors ("1" and "2").

        The chains of receptor "1" carry the custom parameter "custom1", the
        chains of receptor "2" carry "custom2".

        :param path: prefix to which "receptors" is appended before it is
            handed to ReceptorDataset.build
        :return: the result of ReceptorDataset.build for the two receptors
        """
        def alpha_chain(amino_acids, identifier, custom_key):
            return ReceptorSequence(
                amino_acid_sequence=amino_acids,
                identifier=identifier,
                metadata=SequenceMetadata(
                    v_gene="TRAV1",
                    j_gene="TRAJ1",
                    chain=Chain.ALPHA,
                    frame_type="IN",
                    custom_params={"d_call": "TRAD1", custom_key: "cust1"}))

        def beta_chain(amino_acids, identifier, custom_key):
            return ReceptorSequence(
                amino_acid_sequence=amino_acids,
                identifier=identifier,
                metadata=SequenceMetadata(
                    v_gene="TRBV1",
                    j_gene="TRBJ1",
                    chain=Chain.BETA,
                    frame_type="IN",
                    custom_params={"d_call": "TRBD1", custom_key: "cust1"}))

        first = TCABReceptor(identifier="1",
                             alpha=alpha_chain("AAATTT", "1a", "custom1"),
                             beta=beta_chain("ATATAT", "1b", "custom1"))
        second = TCABReceptor(identifier="2",
                              alpha=alpha_chain("AAAAAA", "2a", "custom2"),
                              beta=beta_chain("AAAAAA", "2b", "custom2"))

        target = "{}receptors".format(path)
        return ReceptorDataset.build([first, second], 2, target)
Пример #16
0
    def test_create_sentences_from_repertoire(self):
        """
        A repertoire of three 4-residue sequences has to give three sentences
        for k=3, the first of which consists of "AAC" and "ACT".

        The temporary directory is removed in a finally clause so that a
        failing assertion does not leave files behind.
        """
        path = EnvironmentSettings.tmp_test_path + "kmer/"
        PathBuilder.build(path)

        try:
            rep = Repertoire.build_from_sequence_objects(
                [ReceptorSequence(amino_acid_sequence="AACT"),
                 ReceptorSequence(amino_acid_sequence="ACCT"),
                 ReceptorSequence(amino_acid_sequence="AACT")], path, {})

            sentences = KmerHelper.create_sentences_from_repertoire(rep, 3)

            self.assertEqual(3, len(sentences))
            # separate assertions show which of the conditions failed
            self.assertEqual(2, len(sentences[0]))
            self.assertIn("AAC", sentences[0])
            self.assertIn("ACT", sentences[0])
        finally:
            shutil.rmtree(path)
Пример #17
0
 def create_gapped_kmers_from_sequence(sequence: ReceptorSequence,
                                       k_left: int,
                                       max_gap: int,
                                       k_right: int = None,
                                       min_gap: int = 0):
     """
     Create gapped k-mers from the string returned by sequence.get_sequence().

     This only unwraps the sequence and delegates to
     KmerHelper.create_gapped_kmers_from_string.

     :param sequence: ReceptorSequence to split into gapped k-mers
     :param k_left: passed on as k_left
     :param max_gap: passed on as max_gap
     :param k_right: passed on as k_right
     :param min_gap: passed on as min_gap
     :return: whatever create_gapped_kmers_from_string returns
     """
     plain_sequence = sequence.get_sequence()
     return KmerHelper.create_gapped_kmers_from_string(plain_sequence,
                                                       k_left=k_left,
                                                       max_gap=max_gap,
                                                       k_right=k_right,
                                                       min_gap=min_gap)
Пример #18
0
    def test_make_subset(self):
        """
        Split 100 sequences into ten pickled batches, take a subset of 15
        indices and check that the subset holds exactly those examples.

        The temporary directory is removed in a finally clause so that a
        failing assertion does not leave files behind.
        """
        sequences = [
            ReceptorSequence(amino_acid_sequence="AAA", identifier=str(i))
            for i in range(100)
        ]

        path = EnvironmentSettings.tmp_test_path + "element_generator_subset/"
        PathBuilder.build(path)

        try:
            # one file per batch of ten consecutive sequences
            filenames = ["{}batch{}.pkl".format(path, i) for i in range(10)]
            for i, filename in enumerate(filenames):
                with open(filename, "wb") as file:
                    pickle.dump(sequences[i * 10:(i + 1) * 10], file)

            d = SequenceDataset(filenames=filenames, file_size=10)

            indices = [
                1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92
            ]

            d2 = d.make_subset(indices, path, SequenceDataset.TRAIN)

            # identifiers equal the original positions, so every identifier
            # in the subset has to be one of the requested indices
            for batch in d2.get_batch(1000):
                for sequence in batch:
                    self.assertIn(int(sequence.identifier), indices)

            self.assertEqual(15, d2.get_example_count())
        finally:
            shutil.rmtree(path)
Пример #19
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Encode a sequence as its gapped k-mers combined with their positions.

        The gapped k-mers come from
        KmerHelper.create_IMGT_gapped_kmers_from_sequence; the parts of each
        returned item are converted to strings and joined with
        Constants.FEATURE_DELIMITER.

        :param sequence: ReceptorSequence
        :param params: EncoderParams; from its "model" the keys "k_left", "k_right" (defaults to k_left),
                        "max_gap" and "min_gap" (defaults to 0) are read
        :return: list of strings, one per gapped k-mer, or None (after a warning) if the sequence is shorter
                 than k_left + k_right + max_gap
        """
        model = params.model
        k_left = model.get('k_left')
        k_right = model.get('k_right', k_left)
        max_gap = model.get('max_gap')
        min_gap = model.get('min_gap', 0)

        sequence_length = len(sequence.get_sequence())
        too_short = sequence_length < k_left + k_right + max_gap
        if too_short:
            warnings.warn(
                'Sequence length is less than k_left + k_right + max_gap. Ignoring sequence'
            )
            return None

        kmers_with_positions = KmerHelper.create_IMGT_gapped_kmers_from_sequence(
            sequence,
            k_left=k_left,
            max_gap=max_gap,
            min_gap=min_gap,
            k_right=k_right)

        features = []
        for kmer in kmers_with_positions:
            parts = [str(part) for part in kmer]
            features.append(Constants.FEATURE_DELIMITER.join(parts))
        return features
Пример #20
0
    def test_run(self):
        """
        Run DataEncoder with a Word2Vec encoder on two single-sequence
        repertoires and check that an encoded RepertoireDataset with two
        examples comes back.

        The temporary directory is removed in a finally clause so that a
        failing assertion does not leave files behind.
        """
        path = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
        PathBuilder.build(path)

        try:
            rep1 = Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAA", identifier="1")],
                metadata={
                    "l1": 1,
                    "l2": 2
                },
                path=path)

            rep2 = Repertoire.build_from_sequence_objects(
                [ReceptorSequence("ATA", identifier="2")],
                metadata={
                    "l1": 0,
                    "l2": 3
                },
                path=path)

            lc = LabelConfiguration()
            lc.add_label("l1", [1, 2])
            lc.add_label("l2", [0, 3])

            dataset = RepertoireDataset(repertoires=[rep1, rep2])
            encoder = Word2VecEncoder.build_object(
                dataset, **{
                    "k": 3,
                    "model_type": ModelType.SEQUENCE.name,
                    "vector_size": 6
                })

            res = DataEncoder.run(
                DataEncoderParams(dataset=dataset,
                                  encoder=encoder,
                                  encoder_params=EncoderParams(
                                      model={},
                                      pool_size=2,
                                      label_config=lc,
                                      result_path=path,
                                      filename="dataset.csv"),
                                  store_encoded_data=False))

            # assertIsInstance / assertEqual report the actual value on failure
            self.assertIsInstance(res, RepertoireDataset)
            # one encoded example per repertoire
            self.assertEqual(2, res.encoded_data.examples.shape[0])
        finally:
            shutil.rmtree(path)
    def test_encode_sequence(self):
        """
        Check the gapped k-mer encoding of "ABCDEFG" for a symmetric model
        (k_left=3, max_gap=1; run twice) and an asymmetric one (k_left=2,
        k_right=3, gap exactly 1), the feature names, and that a model too
        long for the sequence gives None.
        """
        def params_for(model):
            return EncoderParams(model=model,
                                 label_config=LabelConfiguration(),
                                 result_path="")

        # factories, so that every EncoderParams gets its own model dict
        def symmetric():
            return {"k_left": 3, "max_gap": 1}

        def oversized():
            return {"k_left": 10, "max_gap": 1}

        def asymmetric():
            return {"k_left": 2, "max_gap": 1, "min_gap": 1, "k_right": 3}

        def check_encoding(model_factory, expected_kmers):
            result = GappedKmerSequenceEncoder.encode_sequence(
                sequence, params_for(model_factory()))
            self.assertEqual(expected_kmers, set(result))
            result = GappedKmerSequenceEncoder.get_feature_names(
                params_for(model_factory()))
            self.assertEqual({'sequence'}, set(result))

        def check_too_short():
            self.assertEqual(
                GappedKmerSequenceEncoder.encode_sequence(
                    sequence, params_for(oversized())),
                None)

        sequence = ReceptorSequence("ABCDEFG", None, None)
        check_encoding(symmetric, {'ABC.EFG', 'ABCDEF', 'BCDEFG'})
        check_too_short()

        sequence.amino_acid_sequence = "ABCDEFG"
        check_encoding(symmetric, {'ABC.EFG', 'ABCDEF', 'BCDEFG'})
        check_too_short()

        sequence.amino_acid_sequence = "ABCDEFG"
        check_encoding(asymmetric, {'AB.DEF', 'BC.EFG'})
Пример #22
0
    def test_process(self):
        """
        Filter a two-repertoire dataset with keep_chain "ALPHA" and check that
        only the chain "A" repertoire and its metadata row remain, that the
        input dataset is untouched, and that keep_chain "GAMMA" raises an
        AssertionError.

        The temporary directory is removed in a finally clause so that a
        failing assertion does not leave files behind.
        """
        path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
        PathBuilder.build(path)

        try:
            rep1 = Repertoire.build_from_sequence_objects(
                [
                    ReceptorSequence("AAA",
                                     metadata=SequenceMetadata(chain="A"),
                                     identifier="1")
                ],
                path=path,
                metadata={})
            rep2 = Repertoire.build_from_sequence_objects(
                [
                    ReceptorSequence("AAC",
                                     metadata=SequenceMetadata(chain="B"),
                                     identifier="2")
                ],
                path=path,
                metadata={})

            metadata = pd.DataFrame({"CD": [1, 0]})
            metadata.to_csv(path + "metadata.csv")

            dataset = RepertoireDataset(repertoires=[rep1, rep2],
                                        metadata_file=path + "metadata.csv")

            dataset2 = ChainRepertoireFilter.process(
                dataset, {
                    "keep_chain": "ALPHA",
                    "result_path": path + "results/"
                })

            self.assertEqual(1, len(dataset2.get_data()))
            # the input dataset must not be modified by the filter
            self.assertEqual(2, len(dataset.get_data()))

            metadata_dict = dataset2.get_metadata(["CD"])
            self.assertEqual(1, len(metadata_dict["CD"]))
            self.assertEqual(1, metadata_dict["CD"][0])

            for rep in dataset2.get_data():
                self.assertEqual("AAA", rep.sequences[0].get_sequence())

            # no repertoire has this chain, so the filter is expected to fail
            self.assertRaises(AssertionError, ChainRepertoireFilter.process,
                              dataset, {
                                  "keep_chain": "GAMMA",
                                  "result_path": path + "results/"
                              })
        finally:
            shutil.rmtree(path)
Пример #23
0
 def test_create_IMGT_kmers_from_sequence(self):
     """
     Every 3-mer of "CASSRYUF" has to be returned together with the expected
     position.
     """
     sequence = ReceptorSequence("CASSRYUF")
     kmers = KmerHelper.create_IMGT_kmers_from_sequence(sequence, 3)

     expected_pairs = [
         ("CAS", 105),
         ("ASS", 106),
         ("SSR", 107),
         ("SRY", 108),
         ("RYU", 114),
         ("YUF", 115),
     ]
     for pair in expected_pairs:
         self.assertTrue(pair in kmers)
Пример #24
0
    def create_dummy_repertoire(self, path):
        """
        Build a repertoire of one beta and one alpha sequence and write a
        one-row metadata csv file describing it.

        :param path: passed to Repertoire.build_from_sequence_objects and used
            as prefix for "metadata.csv", so it has to end with a path
            separator
        :return: tuple of the repertoire and the path of the metadata file
        """
        beta_sequence = ReceptorSequence(
            amino_acid_sequence="AAA",
            nucleotide_sequence="GCTGCTGCT",
            identifier="receptor_1",
            metadata=SequenceMetadata(
                v_gene="TRBV1",
                j_gene="TRBJ1",
                chain=Chain.BETA,
                count=5,
                region_type="IMGT_CDR3",
                frame_type="IN",
                custom_params={"d_call": "TRBD1", "custom_test": "cust1"}))

        alpha_sequence = ReceptorSequence(
            amino_acid_sequence="GGG",
            nucleotide_sequence="GGTGGTGGT",
            identifier="receptor_2",
            metadata=SequenceMetadata(
                v_gene="TRAV2",
                v_allele="TRAV2*01",
                j_gene="TRAJ2",
                chain=Chain.ALPHA,
                count=15,
                frame_type=None,
                region_type="IMGT_CDR3",
                custom_params={"d_call": "TRAD2", "custom_test": "cust2"}))

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[beta_sequence, alpha_sequence],
            path=path,
            metadata={"subject_id": "REP1"})

        # single row that refers to the repertoire by its identifier
        metadata_file = path + "metadata.csv"
        metadata_table = pd.DataFrame({
            "filename": [f"{repertoire.identifier}_data.npy"],
            "subject_id": ["1"],
            "repertoire_identifier": [repertoire.identifier]
        })
        metadata_table.to_csv(metadata_file, index=False)

        return repertoire, metadata_file
Пример #25
0
    def test_encode(self):
        """Encode a two-repertoire dataset with Word2VecEncoder and check the encoded output.

        Two repertoires labelled "T1D" / "CTL" are written under a temporary
        directory, encoded with k=3, model_type="sequence" and vector_size=16,
        and the test then checks that the result has one 16-dimensional row per
        repertoire, that the "T1D" labels were carried over, and that
        ``build_object`` returned a W2VRepertoireEncoder for a repertoire dataset.
        """
        test_path = EnvironmentSettings.root_path + "test/tmp/w2v/"

        PathBuilder.build(test_path)
        # Registered as soon as the directory exists so it is removed even when
        # an assertion below fails; a trailing rmtree would be skipped then.
        self.addCleanup(shutil.rmtree, test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(dataset,
                                               k=3,
                                               model_type="sequence",
                                               vector_size=16)

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

        encoded_data = encoded_dataset.encoded_data
        self.assertIsNotNone(encoded_data)
        # assertEqual / assertIsInstance report the offending values on failure,
        # unlike assertTrue(a == b) which only reports "False is not true".
        self.assertEqual(2, encoded_data.examples.shape[0])
        self.assertEqual(16, encoded_data.examples.shape[1])
        self.assertEqual(2, len(encoded_data.labels["T1D"]))
        self.assertEqual("T1D", encoded_data.labels["T1D"][0])
        self.assertIsInstance(encoder, W2VRepertoireEncoder)
Пример #26
0
    def generate_receptor_dataset(receptor_count: int, chain_1_length_probabilities: dict, chain_2_length_probabilities: dict, labels: dict,
                                  path: str):
        """
        Generates receptor_count random TCABReceptor objects, pickles them into a single batch file under path and returns a
        ReceptorDataset that points to that file.

        For every receptor, the length of each chain is drawn independently from the corresponding chain_n_length_probabilities
        distribution and the chain is then filled with symbols drawn from the sequence alphabet. Chain 1 becomes the alpha chain
        (gene names prefixed with "TRA") and chain 2 the beta chain (prefixed with "TRB"). Each label is multi-class: every receptor
        gets exactly one class per label, drawn with the given weights, so negative classes have to be listed explicitly. In addition
        to the labels, the receptor metadata gets a "subject" entry of the form subj_<index + 1>.

        An example of input parameters is given below:

        receptor_count: 100 # generate 100 receptors
        chain_1_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 1) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 1) will have length 15
        chain_2_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 2) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 2) will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the receptors will have class True
                False: 0.5 # 50% of the receptors will have class False
            epitope2: # next label with classes that will be assigned to receptors independently of the previous label or other parameters
                1: 0.3 # 30% of the generated receptors will have class 1
                0: 0.7 # 70% of the generated receptors will have class 0
        """
        RandomDatasetGenerator._check_receptor_dataset_generation_params(receptor_count, chain_1_length_probabilities,
                                                                         chain_2_length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        def make_chain(length_probabilities: dict, gene_prefix: str, cell_index: int):
            # The length is drawn before the residues so the order of random draws stays fixed.
            chain_length = random.choices(list(length_probabilities.keys()), length_probabilities.values())[0]
            residues = "".join(random.choices(alphabet, k=chain_length))
            chain_metadata = SequenceMetadata(count=1,
                                              v_subgroup=gene_prefix + "V1",
                                              v_gene=gene_prefix + "V1-1",
                                              v_allele=gene_prefix + "V1-1*01",
                                              j_subgroup=gene_prefix + "J1",
                                              j_gene=gene_prefix + "J1-1",
                                              j_allele=gene_prefix + "J1-1*01",
                                              chain=gene_prefix,
                                              cell_id=cell_index)
            return ReceptorSequence(residues, metadata=chain_metadata)

        receptors = []
        for index in range(receptor_count):
            # Draw order per receptor: alpha chain, beta chain, then one class per label.
            alpha_chain = make_chain(chain_1_length_probabilities, "TRA", index)
            beta_chain = make_chain(chain_2_length_probabilities, "TRB", index)

            receptor_metadata = {}
            for label_name, class_weights in labels.items():
                receptor_metadata[label_name] = random.choices(list(class_weights.keys()), class_weights.values(), k=1)[0]
            receptor_metadata["subject"] = f"subj_{index + 1}"

            receptors.append(TCABReceptor(alpha=alpha_chain, beta=beta_chain, metadata=receptor_metadata))

        # Make sure exactly one separator sits between the directory and the file name.
        directory = path if path[-1] == '/' else path + '/'
        filename = f"{directory}batch01.pickle"

        with open(filename, "wb") as batch_file:
            pickle.dump(receptors, batch_file)

        dataset_params = {}
        for label_name, class_weights in labels.items():
            dataset_params[label_name] = list(class_weights.keys())

        return ReceptorDataset(params=dataset_params, filenames=[filename], file_size=receptor_count)
Пример #27
0
    def test_run(self):
        """Run SignalImplanter on ten identical repertoires and check which ones receive each signal.

        The simulation has two implantings, both with dataset_implanting_rate=0.2:
        "i1" implants signals s1 and s2, "i2" implants s2 only. The test expects
        every repertoire to carry a "signal_<id>" metadata entry for both signals,
        exactly 2 repertoires flagged with s1 and 4 with s2 (presumably 0.2 * 10
        repertoires per implanting - confirm against SignalImplanter), and the
        dataset metadata to list every repertoire's data file.
        """
        path = EnvironmentSettings.root_path + "test/tmp/signalImplanter/"

        os.makedirs(path, exist_ok=True)
        # Registered up front so the directory is removed even when an assertion
        # below fails; a trailing rmtree would be skipped in that case.
        self.addCleanup(shutil.rmtree, path)

        sequences = [ReceptorSequence("ACDEFG", identifier="1"), ReceptorSequence("ACDEFG", identifier="2"),
                     ReceptorSequence("ACDEFG", identifier="3"), ReceptorSequence("ACDEFG", identifier="4")]

        repertoires = [Repertoire.build_from_sequence_objects(sequence_objects=sequences, path=path, metadata={})
                       for _ in range(10)]

        dataset = RepertoireDataset(repertoires=repertoires)

        m1 = Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="CAS")
        m2 = Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")
        s1 = Signal(identifier="s1", motifs=[m1],
                    implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(), implanting_computation=ImplantingComputation.ROUND))
        s2 = Signal(identifier="s2", motifs=[m1, m2],
                    implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(), implanting_computation=ImplantingComputation.ROUND))

        simulation = Simulation([Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5, signals=[s1, s2], name="i1"),
                                 Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5, signals=[s2], name="i2")])

        input_params = SimulationState(dataset=dataset, result_path=path, simulation=simulation, signals=[s1, s2])

        new_dataset = SignalImplanter.run(input_params)

        # "is True" is deliberate: only a real boolean True counts as an implanted signal.
        reps_with_s2 = sum(rep.metadata[f"signal_{s2.id}"] is True for rep in new_dataset.get_data(batch_size=10))
        reps_with_s1 = sum(rep.metadata[f"signal_{s1.id}"] is True for rep in new_dataset.get_data(batch_size=10))

        self.assertEqual(10, len(new_dataset.get_example_ids()))
        self.assertTrue(all(f"signal_{s1.id}" in rep.metadata.keys() for rep in new_dataset.get_data(batch_size=10)))
        self.assertTrue(all(f"signal_{s2.id}" in rep.metadata.keys() for rep in new_dataset.get_data(batch_size=10)))
        # assertEqual shows the actual count on failure, unlike assertTrue(x == n).
        self.assertEqual(4, reps_with_s2)
        self.assertEqual(2, reps_with_s1)

        metadata_filenames = new_dataset.get_metadata(["filename"])["filename"]
        self.assertTrue(all(repertoire.data_filename in metadata_filenames for repertoire in new_dataset.repertoires))
Пример #28
0
    def _construct_test_repertoiredataset(self, path, positional):
        """Build a two-repertoire dataset fixture together with a label configuration.

        Args:
            path: directory handed to ``Repertoire.build_from_sequence_objects``.
            positional: when truthy, the repertoires hold long homopolymer
                sequences (17 x "A" twice, 13 x "T" once); otherwise they hold
                short 3-4 letter sequences.

        Returns:
            A ``(dataset, label_configuration)`` tuple, where ``dataset`` is a
            RepertoireDataset of the two repertoires and ``label_configuration``
            registers the labels "l1" and "l2".
        """
        # (amino acid sequence, identifier) pairs for each repertoire.
        if positional:
            first_specs = [("AAAAAAAAAAAAAAAAA", "1"), ("AAAAAAAAAAAAAAAAA", "1")]
            second_specs = [("TTTTTTTTTTTTT", "1")]
        else:
            first_specs = [("AAAA", "1"), ("ATA", "2"), ("ATA", '3')]
            second_specs = [("ATA", "1"), ("TAA", "2")]

        receptors1 = ReceptorSequenceList()
        for amino_acids, seq_id in first_specs:
            receptors1.append(ReceptorSequence(amino_acids, identifier=seq_id))

        receptors2 = ReceptorSequenceList()
        for amino_acids, seq_id in second_specs:
            receptors2.append(ReceptorSequence(amino_acids, identifier=seq_id))

        first_metadata = {"l1": 1, "l2": 2, "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects(receptors1, metadata=first_metadata, path=path)

        second_metadata = {"l1": 0, "l2": 3, "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects(receptors2, metadata=second_metadata, path=path)

        # NOTE(review): rep2 has l1=0 and rep1 has l2=2, neither of which is among
        # the values registered for that label below - confirm this is intended.
        lc = LabelConfiguration()
        for label_name, label_values in (("l1", [1, 2]), ("l2", [0, 3])):
            lc.add_label(label_name, label_values)

        return RepertoireDataset(repertoires=[rep1, rep2]), lc
Пример #29
0
    def match_sequence(self, sequence: ReceptorSequence, reference_sequences: list, max_distance: int) -> dict:
        """Collect the reference sequences that match ``sequence`` and summarise the query.

        Args:
            sequence: the query sequence.
            reference_sequences: candidates, each tested with ``self.matches_sequence``.
            max_distance: passed through unchanged to ``self.matches_sequence``.

        Returns:
            A dict with the keys "matching_sequences" (the ``get_sequence()``
            value of every matching reference, in input order), "sequence"
            (the query's ``get_sequence()`` value) and "v_gene", "j_gene",
            "chain" (read from the query's metadata).
        """
        matches = []
        for reference in reference_sequences:
            if self.matches_sequence(sequence, reference, max_distance):
                matches.append(reference.get_sequence())

        summary = {"matching_sequences": matches}
        summary["sequence"] = sequence.get_sequence()
        for field in ("v_gene", "j_gene", "chain"):
            summary[field] = getattr(sequence.metadata, field)

        return summary
Пример #30
0
    def get_formatted_node_metadata(self, seq: ReceptorSequence):
        """Return one row of node metadata for ``seq``.

        The row is laid out as
        ``[sequence, v_gene_subgroup, v_gene, j_gene_subgroup, j_gene, *additional]``.
        Each gene name is prefixed with the value of the chain attribute, and the
        subgroup is the part of the gene name before the first "-". One entry is
        appended per name in ``self.additional_node_attributes``; when
        ``seq.get_attribute`` raises KeyError for a name, None is used and a
        warning is issued.
        """
        chain_name = seq.get_attribute('chain').value
        gene_names = [seq.get_attribute(key) for key in ('v_gene', 'j_gene')]

        extras = []
        for attribute_name in self.additional_node_attributes:
            try:
                attribute_value = seq.get_attribute(attribute_name)
            except KeyError:
                # A missing attribute is tolerated: keep the column, fill it with None.
                attribute_value = None
                warnings.warn(
                    f"CytoscapeNetworkExporter: additional metadata attribute {attribute_name} was not found for some receptor chain(s), "
                    f"value None was used instead.")
            extras.append(attribute_value)

        row = [seq.get_sequence()]
        for gene_name in gene_names:
            # Subgroup first (text before the first "-"), then the full gene name.
            row.append(f"{chain_name}{gene_name.split('-')[0]}")
            row.append(f"{chain_name}{gene_name}")

        return row + extras