def test_simple_incorporation(self):
    """
    Incorporate a single variant into a single transcript.

    Each case feeds exactly one variant to the transcript generator and
    compares the first generated transcript with the expected sequence:

        var_3 (insertion case) -> AAAAACCTTCCCGGGGG
        var_1 (SNP case)       -> ATAAACCCCCGGGGG
        var_4 (deletion case)  -> AAAAAGGGGG

    The variant objects are defined outside this test; the original notes
    give the reference transcript as AAAAACCCCCGGGGG.
    """
    adapter = DummyAdapter()

    def first_transcript(variants):
        # Only the first transcript yielded by the generator is inspected.
        generated = Generator.generate_transcripts_from_variants(
            variants, adapter, EIdentifierTypes.REFSEQ)
        return generated.next()

    # Insertion
    inserted = first_transcript([var_3])
    self.assertEqual(str(inserted), "AAAAACCTTCCCGGGGG")

    # SNP
    substituted = first_transcript([var_1])
    self.assertEqual(str(substituted), "ATAAACCCCCGGGGG")

    # Deletion
    deleted = first_transcript([var_4])
    self.assertEqual(str(deleted), "AAAAAGGGGG")
def test_heterozygous_variants(self):
    """
    Generate all transcript variants for a variant set that contains
    heterozygous variants.

    Input: var_10, var_11, var_12 (defined outside this test; the original
    notes describe them as HET-DEL, HOM-INS, HET-DEL).

    Exactly eight transcripts are expected. According to the original
    notes they stem from two reference sequences:

        AAAAACCCCCGGGGG
            AAATTTGGGGG           (DEL, INS, DEL)
            AAATTTCCCCCGGGGG      (DEL, INS)
            AAAAATTTGGGGG         (INS, DEL)
            AAAAATTTCCCCCGGGGG    (INS)

        GGGGGCCCCCAAAAA
            GGGTTTAAAAA           (DEL, INS, DEL)
            GGGTTTCCCCCAAAAA      (DEL, INS)
            GGGGGTTTAAAAA         (INS, DEL)
            GGGGGTTTCCCCCAAAAA    (INS)
    """
    dummy_db = DummyAdapter()

    expected = [
        "AAATTTGGGGG",
        "AAAAATTTGGGGG",
        "AAATTTCCCCCGGGGG",
        "AAAAATTTCCCCCGGGGG",
        "GGGTTTAAAAA",
        "GGGGGTTTAAAAA",
        "GGGTTTCCCCCAAAAA",
        "GGGGGTTTCCCCCAAAAA",
    ]

    dummy_vars = [var_10, var_11, var_12]
    trans_gener = Generator.generate_transcripts_from_variants(
        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)
    # Build a real list so that len() and sorting work regardless of
    # whether map() would return a list or a lazy iterator.
    trans = [str(t) for t in trans_gener]

    self.assertEqual(len(trans), 8)
    # Eight results that contain all eight distinct expected sequences are
    # exactly the expected sequences; comparing the sorted lists states
    # that directly and reports the differing entries on failure.
    self.assertEqual(sorted(trans), sorted(expected))
def test_proteins_from_variants(self):
    """
    Translate all transcript variants of a heterozygous variant set into
    proteins and compare the set of protein sequences.

    Input: var_10, var_11, var_12 (defined outside this test; the original
    notes describe them as HET-DEL, HOM-INS, HET-DEL).

    Expected protein sequences, with the transcripts that
    test_heterozygous_variants asserts for the same variant set:

        AAATTTGGGGG           -> KFG
        AAAAATTTGGGGG         -> KNLG
        AAATTTCCCCCGGGGG      -> KFPPG
        AAAAATTTCCCCCGGGGG    -> KNFPRG
        GGGTTTAAAAA           -> GFK
        GGGGGTTTAAAAA         -> GGLK
        GGGTTTCCCCCAAAAA      -> GFPPK
        GGGGGTTTCCCCCAAAAA    -> GGFPQK
    """
    dummy_db = DummyAdapter()
    dummy_vars = [var_10, var_11, var_12]

    exp_prot = {
        'KFG', 'KNLG', 'KFPPG', 'KNFPRG',
        'GFK', 'GGLK', 'GFPPK', 'GGFPQK',
    }

    transcripts = Generator.generate_transcripts_from_variants(
        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)
    prot = {
        str(p)
        for p in Generator.generate_proteins_from_transcripts(transcripts)
    }

    # Set equality covers both "nothing unexpected" and "nothing missing",
    # and unittest prints the differing elements on failure.
    self.assertEqual(prot, exp_prot)
def test_offset_single(self):
    """
    Check offset handling when several variants hit the same transcript.

    Each case passes three variants at once and inspects the first
    generated transcript. The original notes give the reference transcript
    as AAAAACCCCCGGGGG and state that each variant lies clearly downstream
    of its predecessor.
    """
    adapter = DummyAdapter()

    def first_transcript(variants):
        # Only the first transcript yielded by the generator is inspected.
        generated = Generator.generate_transcripts_from_variants(
            variants, adapter, EIdentifierTypes.REFSEQ)
        return generated.next()

    # Case 1: labelled INS, SNP, DEL in the original notes
    combined = first_transcript([var_3, var_7, var_6])
    self.assertEqual(str(combined), "AAAAACCTTCTGGGG")

    # Case 2: labelled INS, DEL, INS in the original notes
    combined = first_transcript([var_9, var_4, var_8])
    self.assertEqual(str(combined), "AATTAAAGGGGGTTT")
def test3_protein_from_variants(self):
    """
    Generate transcripts from three input variants, translate them to
    proteins and check that the id/variant fields of both are filled.

    Steps:
      1. Generate all transcripts for var_10, var_11, var_12 and check
         gene id, transcript id format, attached variants and sequence
         length of each.
      2. Translate every transcript; transcripts whose translation raises
         ValueError are skipped. Eight proteins are expected.
      3. Check gene id, transcript id format, the link back to the
         originating transcript and the sequence length of each protein.
    """
    dummy_db = DummyAdapter()
    dummy_vars = [var_10, var_11, var_12]

    def check_ids(entity):
        # Shared id checks for transcripts and proteins: gene id plus a
        # transcript id of the form "<tsc_1|tsc_2>:FRED2_<single digit>".
        self.assertEqual(entity.gene_id, "gene_1")
        name = entity.transcript_id.split(":FRED2_")
        self.assertEqual(len(name), 2)
        self.assertIn(name[0], ("tsc_1", "tsc_2"))
        # isdigit has to be called; the bare bound method is always truthy.
        self.assertTrue(len(name[1]) == 1 and name[1].isdigit())

    proteins = []
    transcripts = list(generate_transcripts_from_variants(
        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))
    for trans in transcripts:
        check_ids(trans)

        # Variants must be attached to the transcript.
        self.assertIsNotNone(trans.vars)
        self.assertTrue(len(trans.vars) > 0)

        # Compare the sequence length, not the string itself, with the
        # threshold.
        self.assertTrue(len(str(trans)) > 5)

        # Translate; invalid sequence lengths are ignored on purpose.
        try:
            proteins.append(next(generate_proteins_from_transcripts(trans)))
        except ValueError:
            pass

    self.assertEqual(len(proteins), 8)

    for prot in proteins:
        check_ids(prot)

        # The protein must point back to the transcript it came from and
        # carry as many distinct variants as that transcript.
        orig = prot.orig_transcript
        self.assertEqual(prot.transcript_id, orig.transcript_id)
        distinct_vars = set(
            v for var_list in prot.vars.values() for v in var_list)
        self.assertEqual(len(distinct_vars), len(orig.vars))

        # Compare the sequence length, not the string itself.
        self.assertTrue(len(str(prot)) > 2)

    # NOTE(review): the peptide result is never consumed or checked. If
    # generate_peptides_from_proteins is lazy, nothing is executed here.
    peptides = generate_peptides_from_proteins(proteins, 2)
def test4_peptides_from_variants(self):
    """
    Generate peptides of length 4 from two variants and check both the
    peptide sequences and the variants attached to each peptide.

    Notes carried over from the original description (the variant objects
    var_13 and var_14 are defined outside this test):

        reference transcript: AAAAACCCCCGGGGG
        reference protein:    KNPRG   -> peptides KNPR, NPRG
        variant 1: heterozygous, frameshift +1 in the first amino acid
        variant 2: heterozygous, insertion +2 in the last amino acid
        variant protein:      TKPPGA  -> peptides TKPP, KPPG, PPGA

    Expected peptides and their origin according to those notes:

        PPGA  tsc_1:FRED2_3  Variant(15CC), Variant(1C)
        KPPG  tsc_1:FRED2_3  Variant(1C)
        TKPP  tsc_1:FRED2_3  Variant(1C)
        KNPR  tsc_1:FRED2_0
        NPRG  tsc_1:FRED2_0
    """
    peps_trans1 = ["KNPR", "NPRG"]
    peps_trans2 = ["PPGA", "KPPG", "TKPP"]
    expected = peps_trans1 + peps_trans2

    dummy_db = DummyAdapter()
    dummy_vars = [var_13, var_14]

    proteins = []
    transcripts = list(generate_transcripts_from_variants(
        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))
    for trans in transcripts:
        # Translate; invalid sequence lengths are ignored on purpose.
        try:
            proteins.append(next(generate_proteins_from_transcripts(trans)))
        except ValueError:
            pass

    peptides = list(generate_peptides_from_proteins(proteins, 4))
    sequences = [str(pep) for pep in peptides]

    # All expected peptides are present, with no duplicates and no extras.
    # The expected sequences are distinct, so this equals "every expected
    # peptide occurs and the counts match".
    self.assertEqual(sorted(sequences), sorted(expected))

    # Every variant a peptide reports for a protein must be one of the
    # variants of that protein.
    for prot in proteins:
        protein_vars = [
            str(v) for var_list in prot.vars.values() for v in var_list
        ]
        for pep in peptides:
            # A KeyError means the peptide does not belong to this
            # protein; such pairs are skipped.
            try:
                peptide_vars = [
                    str(v)
                    for v in pep.get_variants_by_protein(prot.transcript_id)
                ]
            except KeyError:
                continue
            for var in peptide_vars:
                self.assertIn(var, protein_vars)
def test_peptides_from_variants(self):
    """
    Create peptides of length 3 from a variant set containing heterozygous
    variants, once directly from the variants and once through the
    transcript -> protein -> peptide pipeline; both must give the same set.

    Input: var_10, var_11, var_12 (defined outside this test; the original
    notes describe them as HET-DEL, HOM-INS, HET-DEL).

    Protein sequences listed in the original notes and their 3-mers:

        KFG     -> KFG
        KNLG    -> KNL, NLG
        KFPPG   -> KFP, FPP, PPG
        KNFPRG  -> KNF, NFP, FPR, PRG
        GFK     -> GFK
        GGLK    -> GGL, GLK
        GFPPK   -> GFP, FPP, PPK
        GGFPQK  -> GGF, GFP, FPQ, PQK
    """
    dummy_db = DummyAdapter()

    exp_peps = {
        'PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL',
        'KFG', 'GGF', 'FPQ', 'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP',
    }

    dummy_vars = [var_10, var_11, var_12]

    # Direct route: variants -> peptides.
    peps = {
        str(pep)
        for pep in Generator.generate_peptides_from_variants(
            dummy_vars, 3, dummy_db, EIdentifierTypes.REFSEQ)
    }

    # Stepwise route: variants -> transcripts -> proteins -> peptides.
    transcripts = Generator.generate_transcripts_from_variants(
        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)
    proteins = Generator.generate_proteins_from_transcripts(transcripts)
    peps_from_prot = {
        str(pep)
        for pep in Generator.generate_peptides_from_proteins(proteins, 3)
    }

    # Set equality checks both directions at once and prints the differing
    # elements on failure.
    self.assertEqual(peps, peps_from_prot)
    self.assertEqual(peps, exp_peps)