示例#1
0
    def test_simple_incorporation(self):
        """
        Incorporate exactly one variant into one transcript at a time.

        Reference transcript (per the fixture notes): AAAAACCCCCGGGGG

        For every case only the first generated transcript is inspected:

        var_3 (insertion): expected AAAAACCTTCCCGGGGG
        var_1 (SNP):       expected ATAAACCCCCGGGGG
        var_4 (deletion):  expected AAAAAGGGGG
        """
        adapter = DummyAdapter()

        # (single variant under test, expected transcript sequence)
        cases = (
            (var_3, "AAAAACCTTCCCGGGGG"),  # INSERTIONS
            (var_1, "ATAAACCCCCGGGGG"),    # SNPs
            (var_4, "AAAAAGGGGG"),         # DELETIONS
        )

        for variant, expected_seq in cases:
            generated = Generator.generate_transcripts_from_variants(
                [variant], adapter, EIdentifierTypes.REFSEQ)
            first_transcript = next(generated)
            self.assertEqual(str(first_transcript), expected_seq)
示例#2
0
    def test_heterozygous_variants(self):
        """
        Create multiple transcript variants for a transcript, given a set
        containing heterozygous variants.

        Variants (as noted by the original author):
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)
        NOTE(review): the two rows disagree on the size of the last deletion
        and the asserted sequences drop the whole CCCCC run -- confirm against
        the definitions of var_10, var_11 and var_12.

        Reference AAAAACCCCCGGGGG is expected to yield:
        AAATTTGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTGGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        Reference GGGGGCCCCCAAAAA is expected to yield:
        GGGTTTAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)
        """
        dummy_db = DummyAdapter()

        # Heterozygous DEL, homozygous INS, heterozygous DEL (per docstring).
        dummy_vars = [var_10, var_11, var_12]
        trans = [
            str(t) for t in Generator.generate_transcripts_from_variants(
                dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)
        ]

        expected = [
            "AAATTTGGGGG",
            "AAAAATTTGGGGG",
            "AAATTTCCCCCGGGGG",
            "AAAAATTTCCCCCGGGGG",
            "GGGTTTAAAAA",
            "GGGGGTTTAAAAA",
            "GGGTTTCCCCCAAAAA",
            "GGGGGTTTCCCCCAAAAA",
        ]

        # The eight expected sequences are distinct, so comparing the sorted
        # lists checks both "exactly 8 transcripts" and "each expected
        # sequence is present", and reports the offending entries on failure.
        self.assertEqual(sorted(trans), sorted(expected))
示例#3
0
    def test_proteins_from_variants(self):
        """
        Translate all transcript variants and compare the protein sequences.

        Variants (as noted by the original author):
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)
        NOTE(review): the two rows disagree on the size of the last deletion;
        confirm against the definitions of var_10, var_11 and var_12.

        Transcript variants:
        AAATTTGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTGGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGTTTAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)

        Resulting protein sequences:
        KFG
        KNLG
        KFPPG
        KNFPRG

        GFK
        GGLK
        GFPPK
        GGFPQK
        """
        dummy_db = DummyAdapter()
        dummy_vars = [var_10, var_11, var_12]
        exp_prot = {
            'KFG', 'KNLG', 'KFPPG', 'KNFPRG',
            'GFK', 'GGLK', 'GFPPK', 'GGFPQK',
        }

        transcripts = Generator.generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)
        prot = {
            str(p)
            for p in Generator.generate_proteins_from_transcripts(transcripts)
        }

        # Set equality covers both directions (nothing missing, nothing
        # unexpected) and prints the differing members on failure.
        self.assertEqual(prot, exp_prot)
示例#4
0
    def test_offset_single(self):
        """
        Check offset handling when several variants hit the same transcript.

        There is still only one transcript with one transcript variant, so
        only the first generated transcript is inspected.
        Reference transcript (per the fixture notes): AAAAACCCCCGGGGG

        Within each scenario every variant lies clearly downstream of its
        predecessor.
        """
        adapter = DummyAdapter()

        def first_sequence(variants):
            # Sequence of the first transcript generated for ``variants``.
            generated = Generator.generate_transcripts_from_variants(
                variants, adapter, EIdentifierTypes.REFSEQ)
            return str(next(generated))

        # Scenario 1: INS, SNP, DEL
        self.assertEqual(first_sequence([var_3, var_7, var_6]),
                         "AAAAACCTTCTGGGG")

        # Scenario 2: INS, DEL, INS
        self.assertEqual(first_sequence([var_9, var_4, var_8]),
                         "AATTAAAGGGGGTTT")
示例#5
0
    def test3_protein_from_variants(self):
        """
        Check the fields of transcripts and proteins built from 3 variants.

        Steps:
        1. Generate the transcripts for var_10, var_11 and var_12 and check
           that gene id, transcript id, variants and sequence are filled.
        2. Translate every transcript into a protein (a transcript whose
           translation raises ValueError is skipped), expect 8 proteins and
           check their ids, variants and sequence.
        3. Request the peptides of length 2 for those proteins.
           NOTE(review): the peptide result is not inspected; the uniqueness
           and field checks announced by the original author are still
           missing.
        """
        dummy_db = DummyAdapter()
        dummy_vars = [var_10, var_11, var_12]

        def check_ids(entity):
            # Shared by transcripts and proteins: gene id plus a transcript
            # id of the form "<tsc_1|tsc_2>:FRED2_<single digit>".
            self.assertEqual(entity.gene_id, "gene_1")

            name = entity.transcript_id.split(":FRED2_")
            self.assertEqual(len(name), 2)
            self.assertIn(name[0], ("tsc_1", "tsc_2"))
            self.assertEqual(len(name[1]), 1)
            # isdigit must be *called*; the bare attribute is always truthy.
            self.assertTrue(name[1].isdigit())

        proteins = []
        transcripts = list(generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))
        for trans in transcripts:
            check_ids(trans)

            # check var:
            self.assertIsNotNone(trans.vars)
            self.assertGreater(len(trans.vars), 0)

            # check sequence: compare the length, not the string itself
            self.assertGreater(len(str(trans)), 5)

            # GET PROTS:
            # deliberately ignore transcripts with invalid sequence lengths
            try:
                proteins.append(
                    next(generate_proteins_from_transcripts(trans)))
            except ValueError:
                pass

        self.assertEqual(len(proteins), 8)

        # CHECK Proteins:
        for prot in proteins:
            check_ids(prot)

            orig = prot.orig_transcript
            self.assertEqual(prot.transcript_id, orig.transcript_id)
            prot_vars = set(
                e for subl in prot.vars.itervalues() for e in subl)
            self.assertEqual(len(prot_vars), len(orig.vars))

            # check sequence: compare the length, not the string itself
            self.assertGreater(len(str(prot)), 2)

        # GENERATE Peptides:
        # TODO: nothing is asserted on the peptides yet (and, presumably being
        # a lazy generator, the call alone may not even run the fragmentation).
        peptides = generate_peptides_from_proteins(proteins, 2)
示例#6
0
    def test4_peptides_from_variants(self):
        """
        Fragment proteins built from two heterozygous variants into peptides
        of length 4 and check the peptides and their attached variants.

        Ref transcript: AAAAACCCCCGGGGG
        ref protein:    KNPRG
        ref peps(4):    KNPR, NPRG

        variant1: heterozygous, fs+1 in first aa
        variant2: heterozygous, insertion +2 in last aa

        trans-var1: TKPPGA
        1: peps(4): TKPP, KPPG, PPGA

        trans-var2: KNPRG
        2: peps(4): KNPR, NPRG

        Output:
        -------
        PEPTIDE: PPGA
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(15CC)
                 Variant(1C)
        PEPTIDE: KPPG
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(1C)
        PEPTIDE: TKPP
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(1C)

        PEPTIDE: KNPR
            TRANSCRIPT: tsc_1:FRED2_0
        PEPTIDE: NPRG
            TRANSCRIPT: tsc_1:FRED2_0
        """
        # TODO: the original author noticed stray console output around here.
        # This test itself does not print anything, so re-check the library
        # code if output persists.
        reference_peps = ["KNPR", "NPRG"]
        variant_peps = ["PPGA", "KPPG", "TKPP"]
        expected = reference_peps + variant_peps

        dummy_db = DummyAdapter()
        dummy_vars = [var_13, var_14]

        proteins = []
        transcripts = list(generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))
        for trans in transcripts:
            # GET PROTS:
            # deliberately ignore transcripts with invalid sequence lengths
            try:
                proteins.append(
                    next(generate_proteins_from_transcripts(trans)))
            except ValueError:
                pass

        peptides = list(generate_peptides_from_proteins(proteins, 4))
        sequences = [str(pep) for pep in peptides]

        # The expected peptides are distinct, so comparing the sorted lists
        # checks that all of them were generated and that there are neither
        # duplicates nor extras.
        self.assertEqual(sorted(sequences), sorted(expected))

        # Every variant reported by a peptide for a given protein must be one
        # of that protein's own variants.
        for prot in proteins:
            expected_vars = [
                str(v)
                for var_list in prot.vars.itervalues()
                for v in var_list
            ]
            for p in peptides:
                try:
                    vars_ = [
                        str(v)
                        for v in p.get_variants_by_protein(prot.transcript_id)
                    ]
                except KeyError:
                    # no entry for this protein on the peptide: nothing to
                    # compare
                    continue
                for var in vars_:
                    self.assertIn(var, expected_vars)
示例#7
0
    def test_peptides_from_variants(self):
        """
        Create multiple peptides, given a set containing heterozygous
        variants, and check that the direct route
        (generate_peptides_from_variants) and the step-wise route
        (transcripts -> proteins -> peptides) yield the same peptides.

        Variants (as noted by the original author):
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)
        NOTE(review): the two rows disagree on the size of the last deletion;
        confirm against the definitions of var_10, var_11 and var_12.

        Reference AAAAACCCCCGGGGG:
        AAATTTGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTGGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        Reference GGGGGCCCCCAAAAA:
        GGGTTTAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)

        Resulting protein sequences:
        KFG
        KNLG
        KFPPG
        KNFPRG

        GFK
        GGLK
        GFPPK
        GGFPQK

        Resulting peptides of length 3 (FPP and GFP occur twice, hence 18
        distinct peptides):
        KFG
        KNL, NLG
        KFP, FPP, PPG
        KNF, NFP, FPR, PRG

        GFK
        GGL, GLK
        GFP, FPP, PPK
        GGF, GFP, FPQ, PQK
        """
        dummy_db = DummyAdapter()

        exp_peps = {
            'PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL',
            'KFG', 'GGF', 'FPQ', 'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP',
        }

        # Heterozygous DEL, homozygous INS, heterozygous DEL (per docstring).
        dummy_vars = [var_10, var_11, var_12]

        # Route 1: variants -> peptides in one call.
        peps = {
            str(p)
            for p in Generator.generate_peptides_from_variants(
                dummy_vars, 3, dummy_db, EIdentifierTypes.REFSEQ)
        }

        # Route 2: variants -> transcripts -> proteins -> peptides.
        transcripts = Generator.generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)
        proteins = Generator.generate_proteins_from_transcripts(transcripts)
        peps_from_prot = {
            str(p)
            for p in Generator.generate_peptides_from_proteins(proteins, 3)
        }

        # Set equality checks both directions at once and reports the
        # differing peptides on failure.
        self.assertEqual(peps, peps_from_prot)
        self.assertEqual(peps, exp_peps)