def testMulticoreAnnotate(self):
    """Annotate two adjacent chr3 mutations with the GAF datasource on 2 cores.

    The datasource proxy is first pickled to ``out/testGAFPickle.pkl``.  The
    two mutations are then annotated through a two-process ``LoggingPool``
    using ``annotate_mutation_global``.  Every result must expose the
    ``transcript_id`` and ``gene`` keys with gene ``PIK3CA``, and the two
    results must keep different start positions.
    """
    gafDatasource = TestUtils.createGafDatasourceProxy(self.config)

    # Test pickling.  The context manager closes the handle even when dump()
    # raises; binary mode is valid for every pickle protocol.
    with open('out/testGAFPickle.pkl', 'wb') as pickle_fp:
        dump(gafDatasource, pickle_fp)

    mutations = []
    for position in ('178866811', '178866812'):
        m = MutationData()
        m.chr = '3'
        m.start = position
        m.end = position
        m.ref_allele = "A"
        m.alt_allele = "C"
        m.build = "hg19"
        mutations.append(m)

    p = LoggingPool(processes=2)
    try:
        result = p.map(annotate_mutation_global,
                       [(gafDatasource, m) for m in mutations])
    finally:
        # Release the worker processes whether or not map() succeeded.
        p.close()
        p.join()

    for r in result:
        self.assertTrue("transcript_id" in r.keys())
        self.assertTrue("gene" in r.keys())
        # assertEqual reports both values on failure.
        self.assertEqual(r["gene"], "PIK3CA")
    self.assertNotEqual(result[0].start, result[1].start)
def testRealWorld(self):
    """Annotate two chr1 G>T mutations with the transcript and COSMIC datasources.

    The first position must report zero overlapping COSMIC mutations.  The
    second position (chr1:150483621) is annotated as well, but nothing is
    asserted about its result.
    """
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    cosmic_ds = TestUtils.createCosmicDatasource(self.config)

    def annotate_at(position):
        # Build a chr1 G>T mutation at ``position`` and run it through the
        # transcript datasource first, then the COSMIC datasource.
        mutation = MutationData()
        for field, value in (('chr', '1'), ('start', position), ('end', position),
                             ('ref_allele', "G"), ('alt_allele', "T")):
            setattr(mutation, field, value)
        return cosmic_ds.annotate_mutation(transcript_ds.annotate_mutation(mutation))

    # These values are not taken from a real world scenario, but are cooked for this test.
    cooked = annotate_at('12941796')
    self.assertTrue(cooked['COSMIC_n_overlapping_mutations'] == '0')

    # 1 150483621 150483621
    annotate_at('150483621')
def test_effect_tx_mode(self):
    """Annotate chr2:219137340 G>T under both transcript-selection modes.

    In best-effect mode the mutation must be a ``Missense_Mutation`` in
    ``PNKD``; in canonical mode the same mutation must be an ``Intron`` in
    ``PNKD``.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    # (transcript mode, expected classification, failure message or None)
    expectations = (
        (TranscriptProvider.TX_MODE_BEST_EFFECT, "Missense_Mutation", None),
        # Canonical mutation was Intron
        (TranscriptProvider.TX_MODE_CANONICAL, "Intron",
         "Canonical no longer is Intron. This test is no longer valid. This failure can come up when changing the GAF datasource."),
    )

    for tx_mode, expected_vc, failure_msg in expectations:
        gafDatasource.set_tx_mode(tx_mode)
        mutation = MutationData()
        for field, value in (('chr', '2'), ('start', '219137340'), ('end', '219137340'),
                             ('ref_allele', 'G'), ('alt_allele', 'T')):
            setattr(mutation, field, value)
        mutation = gafDatasource.annotate_mutation(mutation)
        self.assertTrue(mutation['gene'] == "PNKD")
        self.assertTrue(mutation['variant_classification'] == expected_vc, failure_msg)
def _simple_annotate(self, is_skip_no_alts):
    """Annotate two identical chr1 mutations with no datasources and count the output.

    The first mutation carries an ``alt_allele_seen`` annotation of
    ``"False"``; the second has no such annotation.

    :param is_skip_no_alts: passed through to ``RunSpecification.initialize``.
    :return: how many mutations ``Annotator.annotate_mutations`` yielded.
    """
    spec = RunSpecification()
    spec.initialize(None, None, datasources=[], is_skip_no_alts=is_skip_no_alts)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(spec)

    def build_mutation():
        # Both inputs share the same coordinates and alleles.
        mutation = MutationData()
        for field, value in (("chr", "1"), ("start", "12941796"), ("end", "12941796"),
                             ("alt_allele", "G"), ("ref_allele", "T")):
            setattr(mutation, field, value)
        return mutation

    flagged = build_mutation()
    flagged.createAnnotation("alt_allele_seen", "False")
    unflagged = build_mutation()

    annotated = annotator.annotate_mutations([flagged, unflagged])
    return sum(1 for _ in annotated)
def testMulticoreAnnotate(self):
    """Annotate two adjacent chr3 mutations with the GAF datasource on 2 cores.

    The datasource proxy is first pickled to ``out/testGAFPickle.pkl``.  The
    two mutations are then annotated through a two-process ``LoggingPool``
    using ``annotate_mutation_global``.  Every result must expose the
    ``transcript_id`` and ``gene`` keys with gene ``PIK3CA``, and the two
    results must keep different start positions.
    """
    gafDatasource = TestUtils.createGafDatasourceProxy(self.config)

    # Test pickling.  The context manager closes the handle even when dump()
    # raises; binary mode is valid for every pickle protocol.
    with open('out/testGAFPickle.pkl', 'wb') as pickle_fp:
        dump(gafDatasource, pickle_fp)

    mutations = []
    for position in ('178866811', '178866812'):
        m = MutationData()
        m.chr = '3'
        m.start = position
        m.end = position
        m.ref_allele = "A"
        m.alt_allele = "C"
        m.build = "hg19"
        mutations.append(m)

    p = LoggingPool(processes=2)
    try:
        result = p.map(annotate_mutation_global,
                       [(gafDatasource, m) for m in mutations])
    finally:
        # Release the worker processes whether or not map() succeeded.
        p.close()
        p.join()

    for r in result:
        self.assertTrue("transcript_id" in r.keys())
        self.assertTrue("gene" in r.keys())
        # assertEqual reports both values on failure.
        self.assertEqual(r["gene"], "PIK3CA")
    self.assertNotEqual(result[0].start, result[1].start)
def test_denovo(self):
    """Check de novo start classifications for three chr22 insertions.

    All three mutations start at 22221735 with an empty reference allele;
    they differ in end position and inserted sequence.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    # (end position, inserted bases, expected variant classification)
    cases = (
        (22221737, 'CAT', 'De_novo_Start_OutOfFrame'),
        (22221740, 'AACATAA', 'De_novo_Start_OutOfFrame'),
        (22221739, 'ACATAA', 'De_novo_Start_InFrame'),
    )

    for end_position, inserted, expected_vc in cases:
        mutation = MutationData()
        mutation.start = str(22221735)
        mutation.end = str(end_position)
        mutation.chr = "22"
        mutation.ref_allele = ''
        mutation.alt_allele = inserted
        mutation = gafDatasource.annotate_mutation(mutation)
        self.assertTrue(mutation['variant_classification'] == expected_vc)
def testRealWorld(self):
    """Annotate two chr1 G>T mutations with the transcript and COSMIC datasources.

    The first position must report zero overlapping COSMIC mutations.  The
    second position (chr1:150483621) is annotated as well, but nothing is
    asserted about its result.
    """
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    cosmic_ds = TestUtils.createCosmicDatasource(self.config)

    def annotate_at(position):
        # Build a chr1 G>T mutation at ``position`` and run it through the
        # transcript datasource first, then the COSMIC datasource.
        mutation = MutationData()
        for field, value in (('chr', '1'), ('start', position), ('end', position),
                             ('ref_allele', "G"), ('alt_allele', "T")):
            setattr(mutation, field, value)
        return cosmic_ds.annotate_mutation(transcript_ds.annotate_mutation(mutation))

    # These values are not taken from a real world scenario, but are cooked for this test.
    cooked = annotate_at('12941796')
    self.assertTrue(cooked['COSMIC_n_overlapping_mutations'] == '0')

    # 1 150483621 150483621
    annotate_at('150483621')
def initializeMutFromAttributes(chrom, startPos, endPos, ref, alt, build):
    """Build a ``MutationData`` and normalize it according to its variant type.

    * ``snp``: an empty preceding-bases annotation is added.
    * ``del``: ref allele, start and end are replaced with the values from
      ``MutUtils.retrievePrecedingBasesForDeletions`` and the alt allele
      becomes ``"-"``.
    * ``ins``: alt allele, start and end are replaced with the values from
      ``MutUtils.retrievePrecedingBasesForInsertions`` and the ref allele
      becomes ``"-"``.

    For deletions and insertions each updated field is written both as an
    attribute and as an item of the mutation, and the preceding bases are
    stored under ``MutUtils.PRECEDING_BASES_ANNOTATION_NAME``.  Any other
    variant type is returned untouched.

    :return: the (possibly updated) ``MutationData`` instance.
    """
    mut = MutationData(chrom, startPos, endPos, ref, alt, build)
    variant_kind = MutUtils.determineVariantType(mut)

    if variant_kind == "snp":  # Snps
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue="")
        return mut

    if variant_kind == "del":  # deletion
        preceding, new_ref, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForDeletions(mut)
        replacements = (("ref_allele", new_ref), ("alt_allele", "-"),
                        ("start", new_start), ("end", new_end))
    elif variant_kind == "ins":  # insertion
        preceding, new_alt, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        replacements = (("ref_allele", "-"), ("alt_allele", new_alt),
                        ("start", new_start), ("end", new_end))
    else:
        return mut

    # Write every field through both access paths, attribute first.
    for field, value in replacements:
        setattr(mut, field, value)
        mut[field] = value
    mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                         annotationValue=preceding)
    return mut
def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """Round-trip two insertions through the preceding-bases helpers.

    Each mutation is trimmed with
    ``MutUtils.retrievePrecedingBasesForInsertions``, annotated with the
    preceding bases, and then restored with
    ``MutUtils.retrievePrecedingBaseFromAnnotationForInsertions``.  The
    restored start, ref allele and alt allele must equal the original input.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    for alt_allele in ("GTCT", "GTCTT"):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, trimmed_alt, trimmed_start, trimmed_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = trimmed_alt
        mut.start = trimmed_start
        mut.end = trimmed_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        restored_ref, restored_alt, restored_start = \
            MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)

        self.assertTrue(restored_start == start,
                        "Mut start should be %s but was %s." % (start, restored_start))
        self.assertTrue(restored_ref == ref_allele,
                        "Ref allele should be %s but was %s." % (ref_allele, restored_ref))
        self.assertTrue(restored_alt == alt_allele,
                        "Alt allele should be %s but was %s." % (alt_allele, restored_alt))
def initializeMutFromAttributes(chr, start, end, ref_allele, alt_allele, build):
    """Build a ``MutationData`` and normalize it according to its variant type.

    ``chr``, ``start``, ``end`` and ``build`` are converted with ``str()``
    before the mutation is constructed.  The variant type comes from
    ``TranscriptProviderUtils.infer_variant_type``.

    * xNP (per ``TranscriptProviderUtils.is_xnp``): an empty preceding-bases
      annotation is added.
    * ``VariantClassification.VT_DEL``: ref allele, start and end are replaced
      with the values from ``MutUtils.retrievePrecedingBasesForDeletions``
      and the alt allele becomes ``"-"``.
    * ``VariantClassification.VT_INS``: alt allele, start and end are replaced
      with the values from ``MutUtils.retrievePrecedingBasesForInsertions``
      and the ref allele becomes ``"-"``.

    For deletions and insertions each updated field is written both as an
    attribute and as an item of the mutation, and the preceding bases are
    stored under ``MutUtils.PRECEDING_BASES_ANNOTATION_NAME``.

    :return: the (possibly updated) ``MutationData`` instance.
    """
    mut = MutationData(str(chr), str(start), str(end), ref_allele, alt_allele, str(build))
    variant_kind = TranscriptProviderUtils.infer_variant_type(mut.ref_allele, mut.alt_allele)

    if TranscriptProviderUtils.is_xnp(variant_kind):  # Snps and other xNPs
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue="")

    if variant_kind == VariantClassification.VT_DEL:  # deletion
        preceding, new_ref, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForDeletions(mut)
        replacements = (("ref_allele", new_ref), ("alt_allele", "-"),
                        ("start", new_start), ("end", new_end))
    elif variant_kind == VariantClassification.VT_INS:  # insertion
        preceding, new_alt, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        replacements = (("ref_allele", "-"), ("alt_allele", new_alt),
                        ("start", new_start), ("end", new_end))
    else:
        return mut

    # Write every field through both access paths, attribute first.
    for field, value in replacements:
        setattr(mut, field, value)
        mut[field] = value
    mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                         annotationValue=preceding)
    return mut
def test_denovo(self):
    """Check de novo start classifications for three chr22 insertions.

    All three mutations start at 22221735 with an empty reference allele;
    they differ in end position and inserted sequence.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    # (end position, inserted bases, expected variant classification)
    cases = (
        (22221737, 'CAT', 'De_novo_Start_OutOfFrame'),
        (22221740, 'AACATAA', 'De_novo_Start_OutOfFrame'),
        (22221739, 'ACATAA', 'De_novo_Start_InFrame'),
    )

    for end_position, inserted, expected_vc in cases:
        mutation = MutationData()
        mutation.start = str(22221735)
        mutation.end = str(end_position)
        mutation.chr = "22"
        mutation.ref_allele = ''
        mutation.alt_allele = inserted
        mutation = gafDatasource.annotate_mutation(mutation)
        self.assertTrue(mutation['variant_classification'] == expected_vc)
def _simple_annotate(self, is_skip_no_alts):
    """Annotate two identical chr1 mutations with no datasources and count the output.

    The first mutation carries an ``alt_allele_seen`` annotation of
    ``"False"``; the second has no such annotation.

    :param is_skip_no_alts: passed through to ``RunSpecification.initialize``.
    :return: how many mutations ``Annotator.annotate_mutations`` yielded.
    """
    spec = RunSpecification()
    spec.initialize(None, None, datasources=[], is_skip_no_alts=is_skip_no_alts)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(spec)

    def build_mutation():
        # Both inputs share the same coordinates and alleles.
        mutation = MutationData()
        for field, value in (("chr", "1"), ("start", "12941796"), ("end", "12941796"),
                             ("alt_allele", "G"), ("ref_allele", "T")):
            setattr(mutation, field, value)
        return mutation

    flagged = build_mutation()
    flagged.createAnnotation("alt_allele_seen", "False")
    unflagged = build_mutation()

    annotated = annotator.annotate_mutations([flagged, unflagged])
    return sum(1 for _ in annotated)
def test_effect_tx_mode(self):
    """Annotate chr2:219137340 G>T under both transcript-selection modes.

    In best-effect mode the mutation must be a ``Missense_Mutation`` in
    ``PNKD``; in canonical mode the same mutation must be an ``Intron`` in
    ``PNKD``.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    # (transcript mode, expected classification, failure message or None)
    expectations = (
        (TranscriptProvider.TX_MODE_BEST_EFFECT, "Missense_Mutation", None),
        # Canonical mutation was Intron
        (TranscriptProvider.TX_MODE_CANONICAL, "Intron",
         "Canonical no longer is Intron. This test is no longer valid. This failure can come up when changing the GAF datasource."),
    )

    for tx_mode, expected_vc, failure_msg in expectations:
        gafDatasource.set_tx_mode(tx_mode)
        mutation = MutationData()
        for field, value in (('chr', '2'), ('start', '219137340'), ('end', '219137340'),
                             ('ref_allele', 'G'), ('alt_allele', 'T')):
            setattr(mutation, field, value)
        mutation = gafDatasource.annotate_mutation(mutation)
        self.assertTrue(mutation['gene'] == "PNKD")
        self.assertTrue(mutation['variant_classification'] == expected_vc, failure_msg)
def testRetrievePrecedingBasesForInsertions(self):
    """Check how ``MutUtils.retrievePrecedingBasesForInsertions`` trims two insertions.

    Both cases use ref ``GTC`` at position 1234567.  The alt alleles ``GTCT``
    and ``GTCTT`` must reduce to ``T`` and ``TT``; in both cases the updated
    start must be 1234569 and the updated end 1234570, and the mutation must
    contain the ``_preceding_bases`` annotation.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"
    expected_start = 1234569
    expected_end = 1234570

    for alt_allele, expected_alt in (("GTCT", "T"), ("GTCTT", "TT")):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        self.assertTrue("_preceding_bases" in mut,
                        "_preceding_bases is missing in the mutation data.")
        # Failure messages are formatted from the expected values so that the
        # text can never disagree with the value actually being checked.
        self.assertTrue(mut.start == expected_start,
                        "Mut start should be %s but was %s." % (expected_start, mut.start))
        self.assertTrue(mut.end == expected_end,
                        "Mut end should be %s but was %s." % (expected_end, mut.end))
        self.assertTrue(mut.ref_allele == "-",
                        "Ref allele should be - but was %s." % mut.ref_allele)
        self.assertTrue(mut.alt_allele == expected_alt,
                        "Alt allele should be %s but was %s." % (expected_alt, mut.alt_allele))
def testFlank(self):
    """Annotate a 26-base chr1 window starting at 11042200 and print each classification.

    NOTE(review): despite the name, nothing is asserted here; the variant
    classifications are only collected and printed.  TODO: confirm which
    positions are expected to be flank mutations and assert on ``vcs``.
    """
    # NOTE(review): this block used to carry the comment
    # "chr1:28,233,780-28,233,805 Junction is at chr1:28,233,793 & 94", which
    # does not match the window below -- presumably left over from another
    # test; confirm and document the real region.
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
    vcs = []  # kept so assertions can be added once expectations are known
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    startWindow = 11042200
    for s in range(startWindow, startWindow + len(refs)):
        offset = s - startWindow
        m = MutationData()
        m.start = str(s)
        m.end = str(s)
        m.chr = "1"
        m.ref_allele = refs[offset]
        m.alt_allele = alts[offset]
        m = gafDatasource.annotate_mutation(m)
        vc = m['variant_classification']
        vcs.append(vc)
        # A parenthesised single argument prints identically as a print
        # statement and as a call to the print function.
        print(vc + " " + m.start)
def testAnnotateListOfMutations(self):
    """Feed a one-element mutation list to an Annotator configured only through a runspec.

    No input or output is given to the run specification; the datasources
    are created from the ``dbDir`` setting in the ``DEFAULT`` config section.
    The first annotated mutation must have a non-None ``gene`` value.
    """
    # Locate the datasource directory and create a runspec
    db_dir = self.config.get("DEFAULT", "dbDir")
    datasources = DatasourceFactory.createDatasources(db_dir)
    spec = RunSpecification()
    spec.initialize(None, None, datasources=datasources)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(spec)

    mutation = MutationData()
    for field, value in (("chr", "1"), ("start", "12941796"), ("end", "12941796"),
                         ("alt_allele", "G"), ("ref_allele", "T")):
        setattr(mutation, field, value)

    annotated = annotator.annotate_mutations([mutation])
    first_annotated = next(annotated)
    gene = first_annotated.get("gene", None)
    self.assertTrue(gene is not None)
def testdbNSFPAnnotationWithMissingExactMatch(self):  # SNPs only
    """Annotate chr1:35138 T>C with the dbNSFP test datasource and expect empty values.

    The ``dbNSFP_codonpos``, ``dbNSFP_refcodon`` and ``dbNSFP_cds_strand``
    annotations must each equal an annotation with an empty value and the
    data type listed below.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_6vars_exact_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_6vars_exact_ds.config")
    datasource = DatasourceFactory.createDatasource(ds_config, ds_dir)

    m1 = MutationData()
    for field, value in (("chr", "1"), ("start", "35138"), ("end", "35138"),
                         ("ref_allele", "T"), ("alt_allele", "C")):
        setattr(m1, field, value)
    m1_annotated = datasource.annotate_mutation(m1)

    # (annotation name, expected data type)
    expected_types = (
        ("dbNSFP_codonpos", "Integer"),
        ("dbNSFP_refcodon", "String"),
        ("dbNSFP_cds_strand", "Float"),
    )
    for annotation_name, data_type in expected_types:
        actual = m1_annotated.getAnnotation(annotation_name)
        expected = Annotation(value="", datasourceName="dbNSFP", dataType=data_type,
                              description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                              number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testSpliceSiteWithinNBases(self):
    """Check splice-site classification around a chr21 splice junction.

    Every base of chr21:10998326-10998346 is mutated in turn.  No mutation
    may be classified ``Silent``.  The four positions at window offsets 8-11
    (within 2 bases of the junction) must be ``Splice_Site``, and the checked
    positions on either side must not be.
    """
    # chr21:10,998,326-10,998,346
    # 10,998,336 is a splice site.  (Junction between 10998335 and 336)
    # AGTTCTCCTT C TGGAAAAAAG
    refs = 'AGTTCTCCTTCTGGAAAAAAG'
    alts = 'TCAGACTGAAAATACCCCCCT'
    window_start = 10998326
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    vcs = []
    # The window end is derived from the sequence length (21 bases).
    for s in range(window_start, window_start + len(refs)):
        offset = s - window_start
        m = MutationData()
        m.start = str(s)
        m.end = str(s)
        m.chr = "21"
        m.ref_allele = refs[offset]
        m.alt_allele = alts[offset]
        m = gafDatasource.annotate_mutation(m)

        vc = m['variant_classification']
        self.assertNotEqual(vc, 'Silent',
                            'Silent mutation found when it should be a splice site.')
        vcs.append(vc)
        # A parenthesised single argument prints identically as a print
        # statement and as a call to the print function.
        print(vc + " " + m.start)

    self.assertTrue(all(tmp == "Splice_Site" for tmp in vcs[8:12]),
                    "Not all vcs within 2 bases were splice site: " + str(vcs[8:12]))
    self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[0:8]),
                    "No splice sites should be seen: " + str(vcs[0:8]))
    # NOTE(review): this slice leaves the final window position (offset 20)
    # unchecked -- confirm whether that is intentional.
    self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[12:20]),
                    "No splice sites should be seen: " + str(vcs[12:20]))
def test_validation_correction_valid(self):
    """Render one "Valid" mutation to a TCGA MAF and check the derived validation alleles.

    The four validation allele fields are supplied empty.  In every rendered
    row the matched-normal alleles must agree with each other and with the
    reference, tumor validation allele 1 must equal the reference, and tumor
    validation allele 2 must equal ``Tumor_Seq_Allele2``.
    """
    mutation = MutationData()
    for field, value in (("chr", "3"), ("start", "178948145"), ("end", "178948145"),
                         ("alt_allele", "A"), ("ref_allele", "G")):
        setattr(mutation, field, value)
    for key, value in (('validation_status', "Valid"),
                       ('Match_Norm_Validation_Allele1', ""),
                       ('Match_Norm_Validation_Allele2', ""),
                       ('Tumor_Validation_Allele1', ""),
                       ('Tumor_Validation_Allele2', ""),
                       ('Mutation_Status', "Somatic")):
        mutation[key] = value

    maf_path = os.path.join("out", "test_validation_correction2.maf.tsv")
    renderer_config = os.path.join("configs", "tcgaMAF2.4_output.config")
    renderer = TcgaMafOutputRenderer(maf_path, configFile=renderer_config)
    renderer.renderMutations(iter([mutation]))

    for row in GenericTsvReader(maf_path):
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Match_Norm_Validation_Allele2'],
            "Matched norm alleles did not match.")
        self.assertTrue(
            row['Tumor_Validation_Allele1'] == row['Reference_Allele'],
            "Tumor validation allele 1 did not match reference for a valid validation result.")
        self.assertTrue(
            row['Tumor_Validation_Allele2'] == row['Tumor_Seq_Allele2'],
            "Tumor validation allele 2 did not match Tumor_Seq_Allele2 for a valid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Tumor_Validation_Allele1'],
            "Tumor allele 1 did not match normal alleles for a valid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Reference_Allele'],
            "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
            % (row['Match_Norm_Validation_Allele1'], row['Reference_Allele']))
        self.assertTrue(
            "G" == row['Reference_Allele'],
            "Reference allele should have been G, but was " + row['Reference_Allele'])
        self.assertTrue(
            "A" == row['Tumor_Seq_Allele2'],
            "Alt allele should have been A, but was " + row['Tumor_Seq_Allele2'])
def testFlank(self):
    """Annotate a 26-base chr1 window starting at 11042200 and print each classification.

    NOTE(review): despite the name, nothing is asserted here; the variant
    classifications are only collected and printed.  TODO: confirm which
    positions are expected to be flank mutations and assert on ``vcs``.
    """
    # NOTE(review): this block used to carry the comment
    # "chr1:28,233,780-28,233,805 Junction is at chr1:28,233,793 & 94", which
    # does not match the window below -- presumably left over from another
    # test; confirm and document the real region.
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
    vcs = []  # kept so assertions can be added once expectations are known
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    startWindow = 11042200
    for s in range(startWindow, startWindow + len(refs)):
        offset = s - startWindow
        m = MutationData()
        m.start = str(s)
        m.end = str(s)
        m.chr = "1"
        m.ref_allele = refs[offset]
        m.alt_allele = alts[offset]
        m = gafDatasource.annotate_mutation(m)
        vc = m['variant_classification']
        vcs.append(vc)
        # A parenthesised single argument prints identically as a print
        # statement and as a call to the print function.
        print(vc + " " + m.start)
def testSilentMutationGoingToSpliceSite(self):
    """Annotate each base of chr1:28233780-28233805 and count splice-site and silent calls.

    Classifications are compared case-insensitively.  Exactly 4
    ``splice_site`` and 11 ``silent`` classifications are expected across
    the 26 positions.
    """
    # chr1:28,233,780-28,233,805 Junction is at chr1:28,233,793 & 94
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCGCTGAAAAGAAAA"
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    window_start = 28233780
    tally = {"splice_site": 0, "silent": 0}

    for position in range(window_start, 28233806):
        offset = position - window_start
        mutation = MutationData()
        mutation.start = str(position)
        mutation.end = str(position)
        mutation.chr = "1"
        mutation.ref_allele = refs[offset]
        mutation.alt_allele = alts[offset]
        mutation = gafDatasource.annotate_mutation(mutation)

        distance = abs(28233793 - int(mutation.start))
        vc = mutation['variant_classification']
        lowered = vc.lower()
        if lowered in tally:
            tally[lowered] += 1
        print(vc + " " + mutation.start + " " + str(distance))

    splice_count = tally["splice_site"]
    silent_count = tally["silent"]
    self.assertTrue(splice_count == 4,
                    "Should have seen 4 splice site mutations, but saw: " + str(splice_count))
    self.assertTrue(silent_count == 11,
                    "Should have seen 11 Silent mutations, but saw: " + str(silent_count))
def testAnnotateListOfMutations(self):
    """Feed a one-element mutation list to an Annotator configured only through a runspec.

    No input or output is given to the run specification; the datasources
    are created from the ``dbDir`` setting in the ``DEFAULT`` config section.
    The first annotated mutation must have a non-None ``gene`` value.
    """
    # Locate the datasource directory and create a runspec
    db_dir = self.config.get("DEFAULT", "dbDir")
    datasources = DatasourceFactory.createDatasources(db_dir)
    spec = RunSpecification()
    spec.initialize(None, None, datasources=datasources)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(spec)

    mutation = MutationData()
    for field, value in (("chr", "1"), ("start", "12941796"), ("end", "12941796"),
                         ("alt_allele", "G"), ("ref_allele", "T")):
        setattr(mutation, field, value)

    annotated = annotator.annotate_mutations([mutation])
    first_annotated = next(annotated)
    gene = first_annotated.get("gene", None)
    self.assertTrue(gene is not None)
def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """Round-trip two insertions through the preceding-bases helpers.

    Each mutation is trimmed with
    ``MutUtils.retrievePrecedingBasesForInsertions``, annotated with the
    preceding bases, and then restored with
    ``MutUtils.retrievePrecedingBaseFromAnnotationForInsertions``.  The
    restored start, ref allele and alt allele must equal the original input.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    for alt_allele in ("GTCT", "GTCTT"):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, trimmed_alt, trimmed_start, trimmed_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = trimmed_alt
        mut.start = trimmed_start
        mut.end = trimmed_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        restored_ref, restored_alt, restored_start = \
            MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)

        self.assertTrue(restored_start == start,
                        "Mut start should be %s but was %s." % (start, restored_start))
        self.assertTrue(restored_ref == ref_allele,
                        "Ref allele should be %s but was %s." % (ref_allele, restored_ref))
        self.assertTrue(restored_alt == alt_allele,
                        "Alt allele should be %s but was %s." % (alt_allele, restored_alt))
def test_validation_correction(self):
    """Render one "Invalid" mutation to a TCGA MAF and check the derived validation fields.

    The four validation allele fields are supplied empty.  In every rendered
    row the matched-normal and tumor validation alleles must all agree with
    the reference allele ``G``, and ``Mutation_Status`` must be ``"None"``.
    """
    mutation = MutationData()
    for field, value in (("chr", "3"), ("start", "178948145"), ("end", "178948145"),
                         ("alt_allele", "A"), ("ref_allele", "G")):
        setattr(mutation, field, value)
    for key, value in (('validation_status', "Invalid"),
                       ('Match_Norm_Validation_Allele1', ""),
                       ('Match_Norm_Validation_Allele2', ""),
                       ('Tumor_Validation_Allele1', ""),
                       ('Tumor_Validation_Allele2', ""),
                       ('Mutation_Status', "Somatic")):
        mutation[key] = value

    maf_path = os.path.join("out", "test_validation_correction1.maf.tsv")
    renderer_config = os.path.join("configs", "tcgaMAF2.4_output.config")
    renderer = TcgaMafOutputRenderer(maf_path, configFile=renderer_config)
    renderer.renderMutations(iter([mutation]))

    for row in GenericTsvReader(maf_path):
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Match_Norm_Validation_Allele2'],
            "Matched norm alleles did not match.")
        self.assertTrue(
            row['Tumor_Validation_Allele1'] == row['Tumor_Validation_Allele2'],
            "Tumor alleles did not match for an invalid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Tumor_Validation_Allele2'],
            "Tumor alleles did not match normal alleles for an invalid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Reference_Allele'],
            "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
            % (row['Match_Norm_Validation_Allele1'], row['Reference_Allele']))
        self.assertTrue(
            "G" == row['Reference_Allele'],
            "Reference allele should have been G, but was " + row['Reference_Allele'])
        self.assertTrue(
            "None" == row['Mutation_Status'],
            "Mutation Status must be None when Validation Status is Invalid: "
            + row['Mutation_Status'])
def testAKT1(self):
    """Check that chr14:105246407 G>A is assigned to gene AKT1.

    The position was reported by a website user; a different gene after a
    GAF update may be legitimate but has to be confirmed manually.
    """
    mutation = MutationData()
    for field, value in (('chr', '14'), ('start', '105246407'), ('end', '105246407'),
                         ('ref_allele', 'G'), ('alt_allele', 'A')):
        setattr(mutation, field, value)

    datasource = TestUtils.createTranscriptProviderDatasource(self.config)
    annotated = datasource.annotate_mutation(mutation)

    failure_msg = ("Incorrect gene found: " + annotated['gene'] +
                   " If updating GAF, this may not be an error, but should be confirmed manually.")
    self.assertTrue(annotated['gene'] == "AKT1", failure_msg)
def test_start_codon(self):
    """Annotate chr22:22221729 A>T and expect ``VariantClassification.MISSENSE``."""
    datasource = TestUtils.createTranscriptProviderDatasource(self.config)

    position = str(22221729)
    mutation = MutationData()
    mutation.start = position
    mutation.end = position
    mutation.chr = "22"
    mutation.ref_allele = 'A'
    mutation.alt_allele = 'T'

    annotated = datasource.annotate_mutation(mutation)
    observed_vc = annotated['variant_classification']
    self.assertTrue(observed_vc == VariantClassification.MISSENSE)
def testRetrievePrecedingBasesForInsertions(self):
    """Check how ``MutUtils.retrievePrecedingBasesForInsertions`` trims two insertions.

    Both cases use ref ``GTC`` at position 1234567.  The alt alleles ``GTCT``
    and ``GTCTT`` must reduce to ``T`` and ``TT``; in both cases the updated
    start must be 1234569 and the updated end 1234570, and the mutation must
    contain the ``_preceding_bases`` annotation.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"
    expected_start = 1234569
    expected_end = 1234570

    for alt_allele, expected_alt in (("GTCT", "T"), ("GTCTT", "TT")):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        self.assertTrue("_preceding_bases" in mut,
                        "_preceding_bases is missing in the mutation data.")
        # Failure messages are formatted from the expected values so that the
        # text can never disagree with the value actually being checked.
        self.assertTrue(mut.start == expected_start,
                        "Mut start should be %s but was %s." % (expected_start, mut.start))
        self.assertTrue(mut.end == expected_end,
                        "Mut end should be %s but was %s." % (expected_end, mut.end))
        self.assertTrue(mut.ref_allele == "-",
                        "Ref allele should be - but was %s." % mut.ref_allele)
        self.assertTrue(mut.alt_allele == expected_alt,
                        "Alt allele should be %s but was %s." % (expected_alt, mut.alt_allele))
def testdbNSFPAnnotationWithMissingExactMatch(self):  # SNPs only
    """Annotate chr1:35138 T>C with the dbNSFP test datasource and expect empty values.

    The ``dbNSFP_codonpos``, ``dbNSFP_refcodon`` and ``dbNSFP_cds_strand``
    annotations must each equal an annotation with an empty value and the
    data type listed below.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_6vars_exact_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_6vars_exact_ds.config")
    datasource = DatasourceFactory.createDatasource(ds_config, ds_dir)

    m1 = MutationData()
    for field, value in (("chr", "1"), ("start", "35138"), ("end", "35138"),
                         ("ref_allele", "T"), ("alt_allele", "C")):
        setattr(m1, field, value)
    m1_annotated = datasource.annotate_mutation(m1)

    # (annotation name, expected data type)
    expected_types = (
        ("dbNSFP_codonpos", "Integer"),
        ("dbNSFP_refcodon", "String"),
        ("dbNSFP_cds_strand", "Float"),
    )
    for annotation_name, data_type in expected_types:
        actual = m1_annotated.getAnnotation(annotation_name)
        expected = Annotation(value="", datasourceName="dbNSFP", dataType=data_type,
                              description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                              number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testBasicAnnotation(self):
    """Annotate chr18:48604683 G>A from the small COSMIC TSV and expect disease counts.

    The ``_cosmic_muts_disease_counts`` annotation must be truthy after
    annotation.
    """
    datasource = GenericGenomicMutationDatasource(
        'testdata/small_cosmic_2/cosmic_v65_chr18.tsv')

    mutation = MutationData()
    for field, value in (('chr', '18'), ('start', '48604683'), ('end', '48604683'),
                         ('ref_allele', 'G'), ('alt_allele', 'A')):
        setattr(mutation, field, value)
    mutation.createAnnotation('strand', '+')

    annotated = datasource.annotate_mutation(mutation)
    disease_counts = annotated['_cosmic_muts_disease_counts']
    self.assertTrue(disease_counts, 'Unable to annotate mutation correctly')
def test_simple_annotate(self):
    """Annotate chr1:35138 T>A with the datasource built by _create_test_ds.

    The 'phyloP100way_vertebrate_rankscore' annotation must equal "0.19875".
    """
    source_tsv = "testdata/small_tsv_leveldb/dbNSFP2.4_variant.chr1_cut5000.tsv"
    output_path = os.path.abspath("out/test_simple_annotate_snp_only_leveldb")
    columns = ["chr", "pos(1-coor)", "pos(1-coor)", "ref", "alt"]
    ds = self._create_test_ds(source_tsv, output_path, columns)

    # 1	35138	T	A
    snp = MutationData()
    for field, value in (("chr", "1"),
                         ("start", "35138"),
                         ("end", "35138"),
                         ("ref_allele", "T"),
                         ("alt_allele", "A")):
        setattr(snp, field, value)

    annotated = ds.annotate_mutation(snp)
    rankscore = annotated['phyloP100way_vertebrate_rankscore']
    self.assertTrue(rankscore == "0.19875")
def testMC1R(self):
    """Pin the gene reported by the GAF datasource for chr16:89985913 G>A.

    The assertion currently expects TUBB3; the inline note records that
    MC1R is the result anticipated at some later point.
    """
    snp = MutationData()
    for field, value in (('chr', '16'),
                         ('start', '89985913'),
                         ('end', '89985913'),
                         ('ref_allele', 'G'),
                         ('alt_allele', 'A')):
        setattr(snp, field, value)

    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    annotated = gaf_ds.annotate_mutation(snp)

    # At some point, we would expect this to be MC1R, not TUBB3
    gene = annotated['gene']
    failure_msg = ("Incorrect gene found: " + gene +
                   "  If updating GAF, this may not be an error, but should be confirmed manually.")
    self.assertTrue(gene == "TUBB3", failure_msg)
def test_small_positive_strand_transcript_change(self):
    """Check the rendered 'transcript_change' for two single-base mutations.

    Uses the datasource returned by TestUtils._create_test_gencode_ds.
    """
    ds = TestUtils._create_test_gencode_ds("out/small_positive_strand_")

    # (chr, position, ref, alt, expected transcript change)
    cases = (
        ("22", "22221730", "T", "G", "c.1A>C"),   # Now for a negative strand
        ("3", "178916614", "G", "T", "c.1G>T"),   # positive strand
    )
    for chrom, position, ref, alt, expected_change in cases:
        snp = MutationData()
        snp.chr = chrom
        snp.start = position
        snp.end = position
        snp.ref_allele = ref
        snp.alt_allele = alt

        annotated = ds.annotate_mutation(snp)
        observed_change = annotated['transcript_change']
        self.assertTrue(observed_change == expected_change,
                        "Incorrect transcript change: " + observed_change)
def test_small_positive_strand_transcript_change(self):
    """Check the rendered 'transcript_change' for two single-base mutations.

    Uses the datasource returned by TestUtils._create_test_gencode_v19_ds.
    """
    ds = TestUtils._create_test_gencode_v19_ds("out/small_positive_strand_")

    # (chr, position, ref, alt, expected transcript change)
    cases = (
        ("22", "22221730", "T", "G", "c.1A>C"),   # Now for a negative strand
        ("3", "178916614", "G", "T", "c.1G>T"),   # positive strand
    )
    for chrom, position, ref, alt, expected_change in cases:
        snp = MutationData()
        snp.chr = chrom
        snp.start = position
        snp.end = position
        snp.ref_allele = ref
        snp.alt_allele = alt

        annotated = ds.annotate_mutation(snp)
        observed_change = annotated['transcript_change']
        self.assertTrue(observed_change == expected_change,
                        "Incorrect transcript change: " + observed_change)
def createMutations(self):
    """Yield one MutationData per row of the maflite TSV reader.

    No inputs.  Every column of a row is stored on the mutation as an
    'INPUT' annotation.  A column whose name is a key of
    self._reverseAlternativeDict is stored under the mapped name instead
    of its own.  Values stored under "chr" are passed through
    MutUtils.convertChromosomeStringToMutationDataFormat first.

    After the columns are loaded:
      * ref_allele and alt_allele are stripped of surrounding whitespace;
      * if alt_allele equals ref_allele, the alt allele is replaced by the
        result of self._find_alt_allele_in_other_field;
      * if exactly one of start/end is the empty string, it is copied
        from the other.

    Returns:
        A generator of mutations built from the specified maflite file.
    """
    alias_map = self._reverseAlternativeDict
    allColumns = self._tsvReader.getFieldNames()

    for line in self._tsvReader:
        # We only need to assign fields that are mutation attributes and have a
        # different name in the maflite file.
        mut = MutationData(build=self._build)

        for col in allColumns:
            # Three scenarios:
            #  1) col is name of mutation data field -- simple createAnnotation
            #  2) col name is an alias for a mutation data field -- do lookup then createAnnotation
            #  3) col name is not an alias for a mutation data field -- simple createAnnotation
            if col in alias_map:
                # Scenario 2
                key = alias_map[col]
                self.logger.debug(key + " found from " + col)
            else:
                # Scenario 1 and 3
                key = col

            # Make sure to convert chromosome values.
            val = line[col]
            if key == "chr":
                val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
            mut.createAnnotation(key, val, 'INPUT')

        # Remove any surrounding whitespace if present.
        mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

        # If the alt allele == ref_allele, check that this is not a case where
        # there is an alt_allele2 that is different.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary.  Then this can be removed.
        # Compare by value: `is ""` tests object identity, which is not
        # guaranteed to hold for an empty string read from a file.
        if mut.start != "" and mut.end == "":
            mut.end = mut.start
        if mut.end != "" and mut.start == "":
            mut.start = mut.end

        yield mut
def test_start_codon(self):
    """Annotate chr22:22221729 A>T with the GAF datasource.

    The resulting variant classification must equal
    VariantClassification.MISSENSE.
    """
    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    position = str(22221729)
    snp = MutationData()
    snp.start = position
    snp.end = position
    snp.chr = "22"
    snp.ref_allele = 'A'
    snp.alt_allele = 'T'

    annotated = gaf_ds.annotate_mutation(snp)
    classification = annotated['variant_classification']
    self.assertTrue(classification == VariantClassification.MISSENSE)
def test_simple_annotate_with_nonhuman(self):
    """Annotate chrI:500 C>A with the saccer Ensembl datasource.

    Both 'annotation_transcript' and 'gene' must come back as "YAL069W".
    """
    saccer_ds = self._create_ensembl_ds_from_saccer()

    snp = MutationData()
    for field, value in (("chr", "I"),
                         ("start", "500"),
                         ("end", "500"),
                         ("ref_allele", "C"),
                         ("alt_allele", "A")):
        setattr(snp, field, value)

    annotated = saccer_ds.annotate_mutation(snp)

    transcript = annotated['annotation_transcript']
    self.assertTrue(transcript == "YAL069W")
    gene = annotated['gene']
    self.assertTrue(gene == "YAL069W")
def test_simple_annotate_with_nonhuman(self):
    """Annotate chrI:500 C>A with the saccer Ensembl datasource.

    Both 'annotation_transcript' and 'gene' must come back as "YAL069W".
    """
    saccer_ds = self._create_ensembl_ds_from_saccer()

    snp = MutationData()
    for field, value in (("chr", "I"),
                         ("start", "500"),
                         ("end", "500"),
                         ("ref_allele", "C"),
                         ("alt_allele", "A")):
        setattr(snp, field, value)

    annotated = saccer_ds.annotate_mutation(snp)

    transcript = annotated['annotation_transcript']
    self.assertTrue(transcript == "YAL069W")
    gene = annotated['gene']
    self.assertTrue(gene == "YAL069W")
def testMicroRNA(self):
    """Annotate chr12:31379268 A>G and expect the gene hsa-miR-3194-3p.

    The gene comparison is case-insensitive.
    """
    # uc021qwk.1	chr12:31379258-31379277:-	hsa-miR-3194-3p|?	chr12:31379258-31379277:-	Confidence=100
    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    position = 31379268
    snp = MutationData()
    snp.start = position
    snp.end = position
    snp.chr = "12"
    snp.alt_allele = 'G'
    # This is accurate
    snp.ref_allele = 'A'

    annotated = gaf_ds.annotate_mutation(snp)
    gene = annotated['gene']
    failure_msg = ("Wrong gene (GT: hsa-mir-3194-3p): " + gene +
                   "   -- if updating GAF, this test may fail as this result may not be appropriate.")
    self.assertTrue(gene.lower() == "hsa-mir-3194-3p", failure_msg)
def testFlank2(self):
    """Annotate chr1:228646357 C>T and expect gene HIST3H2A classified as 5'Flank."""
    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    # 1	228646357	nearest Gene=HIST3H2A C>T
    position = str(228646357)
    snp = MutationData()
    snp.start = position
    snp.end = position
    snp.chr = "1"
    snp.ref_allele = 'C'
    snp.alt_allele = 'T'

    annotated = gaf_ds.annotate_mutation(snp)

    gene = annotated['gene']
    gene_msg = ("Wrong gene (GT: HIST3H2A): " + gene +
                "   -- if updating GAF, this test may fail as this gene may not be appropriate.")
    self.assertTrue(gene == "HIST3H2A", gene_msg)

    classification = annotated['variant_classification']
    classification_msg = (
        "Should be 5'Flank, but was " + classification +
        " -- if updating GAF, this test may fail as this test is data specific.  "
        "Also, this may fail if padding parameters are changed.")
    self.assertTrue(classification == "5'Flank", classification_msg)
def testBasicAnnotation(self):
    """Annotate one chr18 SNP from the small COSMIC TSV.

    Passes when the '_cosmic_muts_disease_counts' annotation of the
    annotated mutation is truthy.
    """
    tsv_path = 'testdata/small_cosmic_2/cosmic_v65_chr18.tsv'
    cosmic_ds = GenericGenomicMutationDatasource(tsv_path)

    position = '48604683'
    snp = MutationData()
    snp.chr = '18'
    snp.start = position
    snp.end = position
    snp.ref_allele = 'G'
    snp.alt_allele = 'A'
    snp.createAnnotation('strand', '+')

    annotated = cosmic_ds.annotate_mutation(snp)
    disease_counts = annotated['_cosmic_muts_disease_counts']
    self.assertTrue(disease_counts, 'Unable to annotate mutation correctly')
def test_hgvs_annotations_simple_SNP(self):
    """Check the three HGVS annotations produced for chr22:22221730 T>G (hg19).

    Uses the datasource returned by TestUtils._create_test_gencode_ds.
    The genomic, coding-DNA and protein changes are compared in that order.
    """
    ds = TestUtils._create_test_gencode_ds("out/test_hgvs_annotations_")

    # Now for a negative strand
    snp = MutationData()
    for field, value in (("chr", "22"),
                         ("start", "22221730"),
                         ("end", "22221730"),
                         ("ref_allele", "T"),
                         ("alt_allele", "G"),
                         ("build", "hg19")):
        setattr(snp, field, value)

    annotated = ds.annotate_mutation(snp)

    expected_hgvs = (
        ('HGVS_genomic_change', 'chr22.hg19:g.22221730T>G'),
        ('HGVS_coding_DNA_change', 'ENST00000215832.6:c.1A>C'),
        ('HGVS_protein_change', 'ENSP00000215832:p.Met1Leu'),
    )
    for annotation_name, expected_value in expected_hgvs:
        self.assertEqual(annotated.get(annotation_name, None), expected_value)
def testAKT1(self):
    """Annotate chr14:105246407 G>A with the GAF datasource and expect gene AKT1.

    The position was supplied by a website user.
    """
    snp = MutationData()
    for field, value in (('chr', '14'),
                         ('start', '105246407'),
                         ('end', '105246407'),
                         ('ref_allele', 'G'),
                         ('alt_allele', 'A')):
        setattr(snp, field, value)

    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    annotated = gaf_ds.annotate_mutation(snp)

    gene = annotated['gene']
    failure_msg = ("Incorrect gene found: " + gene +
                   "  If updating GAF, this may not be an error, but should be confirmed manually.")
    self.assertTrue(gene == "AKT1", failure_msg)
def test_hgvs_annotations_simple_SNP(self):
    """Check the three HGVS annotations produced for chr22:22221730 T>G (hg19).

    Uses the datasource returned by TestUtils._create_test_gencode_v19_ds.
    The genomic, coding-DNA and protein changes are compared in that order.
    """
    ds = TestUtils._create_test_gencode_v19_ds("out/test_hgvs_annotations_SNP_")

    # Now for a negative strand
    snp = MutationData()
    for field, value in (("chr", "22"),
                         ("start", "22221730"),
                         ("end", "22221730"),
                         ("ref_allele", "T"),
                         ("alt_allele", "G"),
                         ("build", "hg19")):
        setattr(snp, field, value)

    annotated = ds.annotate_mutation(snp)

    expected_hgvs = (
        ('HGVS_genomic_change', 'chr22.hg19:g.22221730T>G'),
        ('HGVS_coding_DNA_change', 'ENST00000215832.6:c.1A>C'),
        ('HGVS_protein_change', 'ENSP00000215832:p.Met1Leu'),
    )
    for annotation_name, expected_value in expected_hgvs:
        self.assertEqual(annotated.get(annotation_name, None), expected_value)
def testMC1R(self):
    """Pin the gene reported by the GAF datasource for chr16:89985913 G>A.

    The assertion currently expects TUBB3; the inline note records that
    MC1R is the result anticipated at some later point.
    """
    position = '89985913'
    snp = MutationData()
    snp.chr = '16'
    snp.start = position
    snp.end = position
    snp.ref_allele = 'G'
    snp.alt_allele = 'A'

    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    annotated = gaf_ds.annotate_mutation(snp)

    # At some point, we would expect this to be MC1R, not TUBB3
    gene = annotated['gene']
    failure_msg = ("Incorrect gene found: " + gene +
                   "  If updating GAF, this may not be an error, but should be confirmed manually.")
    self.assertTrue(gene == "TUBB3", failure_msg)
def test_no_mapping_file(self):
    """Test that we can still create (from scratch) and instantiate a
    EnsemblDatasource when no protein mapping is specified (i.e. limited
    HGVS support).

    The HGVS genomic and coding-DNA changes are checked as usual; the
    protein change is expected to carry the 'unknown_prot_seq_id' prefix.
    """
    ds = TestUtils._create_test_gencode_ds("out/test_hgvs_annotations_no_mapping_",
                                           protein_id_mapping_file=None)

    # Now for a negative strand
    m = MutationData()
    m.chr = "22"
    m.start = "22221730"
    m.end = "22221730"
    m.ref_allele = "T"
    m.alt_allele = "G"
    m.build = "hg19"

    m2 = ds.annotate_mutation(m)

    self.assertEqual(m2.get('HGVS_genomic_change', None), 'chr22.hg19:g.22221730T>G')
    self.assertEqual(m2.get('HGVS_coding_DNA_change', None), 'ENST00000215832.6:c.1A>C')
    self.assertEqual(m2.get('HGVS_protein_change', None), 'unknown_prot_seq_id:p.Met1Leu')
def test_no_mapping_file(self):
    """Test that we can still create (from scratch) and instantiate a
    EnsemblDatasource when no protein mapping is specified (i.e. limited
    HGVS support).

    The HGVS genomic and coding-DNA changes are checked as usual; the
    protein change is expected to carry the 'unknown_prot_seq_id' prefix.
    """
    ds = TestUtils._create_test_gencode_v19_ds("out/test_hgvs_annotations_no_mapping_file_",
                                               protein_id_mapping_file=None)

    # Now for a negative strand
    m = MutationData()
    m.chr = "22"
    m.start = "22221730"
    m.end = "22221730"
    m.ref_allele = "T"
    m.alt_allele = "G"
    m.build = "hg19"

    m2 = ds.annotate_mutation(m)

    self.assertEqual(m2.get('HGVS_genomic_change', None), 'chr22.hg19:g.22221730T>G')
    self.assertEqual(m2.get('HGVS_coding_DNA_change', None), 'ENST00000215832.6:c.1A>C')
    self.assertEqual(m2.get('HGVS_protein_change', None), 'unknown_prot_seq_id:p.Met1Leu')
def test_protein_position_off_by_one(self, chrom, start, end, ref, alt, gt_prot_change):
    """Check 'protein_change' for one mutation under a custom canonical transcript list.

    The canonical list is read from testdata/tx_exact_uniprot_matches.txt
    (each line cut at its last '.') plus one extra transcript.

    Args:
        chrom, start, end, ref, alt: values assigned to the mutation's
            chr, start, end, ref_allele and alt_allele.
        gt_prot_change: expected value of the 'protein_change' annotation.
    """
    config = TestUtils.createUnitTestConfig()
    transcript_ds = TestUtils.createTranscriptProviderDatasource(config)

    # The context manager closes the file even if reading raises.
    with open("testdata/tx_exact_uniprot_matches.txt", 'r') as cc_txs_fp:
        cc_txs = [tx.rsplit(".", 1)[0] for tx in cc_txs_fp]
    cc_txs.append("ENST00000338368")  # Add a transcript that is not exactly the same, but close
    transcript_ds.set_custom_canonical_txs(cc_txs)

    m = MutationData()
    m.chr = chrom
    m.start = start
    m.end = end
    m.ref_allele = ref
    m.alt_allele = alt

    m2 = transcript_ds.annotate_mutation(m)

    self.assertEqual(m2['protein_change'], gt_prot_change)
def test_canonical_tx_list_empty(self):
    """Check that an empty custom canonical list does not select ENST00000544786.

    chr22:22142650 T>A is annotated once with no custom list set and once
    after set_custom_canonical_txs([]).
    """
    ds = TestUtils._create_test_gencode_v19_ds("out/test_canonical_tx_list_")

    snp = MutationData()
    for field, value in (("chr", "22"),
                         ("start", "22142650"),
                         ("end", "22142650"),
                         ("ref_allele", "T"),
                         ("alt_allele", "A")):
        setattr(snp, field, value)

    # No custom canonical list has been set at this point.
    annotated = ds.annotate_mutation(snp)
    picked_custom_tx = annotated['annotation_transcript'].startswith("ENST00000544786")
    self.assertFalse(picked_custom_tx)
    is_intron = annotated['variant_classification'] == VariantClassification.INTRON
    self.assertFalse(is_intron)

    # Explicitly set an empty list and annotate the same mutation again.
    ds.set_custom_canonical_txs([])
    annotated = ds.annotate_mutation(snp)
    is_missense = annotated['variant_classification'] == VariantClassification.MISSENSE
    self.assertTrue(is_missense)
    picked_custom_tx = annotated['annotation_transcript'].startswith("ENST00000544786")
    self.assertFalse(picked_custom_tx)
def test_simple_annotate(self):
    """Build an EnsemblTranscriptDatasource from the saccer config and annotate one mutation.

    There are no assertions; the test passes if nothing raises.
    """
    config_dir = "testdata/ensembl/saccer/"
    parser = ConfigUtils.createConfigParser(config_dir + "ensembl.config")

    # Read the three settings in the same order as before: title, version, src_file.
    general = {}
    for option in ("title", "version", "src_file"):
        general[option] = parser.get("general", option)

    ensembl_ds = EnsemblTranscriptDatasource(title=general["title"],
                                             version=general["version"],
                                             src_file=general["src_file"])

    snp = MutationData()
    snp.chr = "22"
    snp.start = "22161963"
    snp.end = "22161963"
    snp.ref_allele = "C"
    snp.alt_allele = "A"

    m2 = ensembl_ds.annotate_mutation(snp)
def test_simple_annotate(self):
    """Build an EnsemblTranscriptDatasource from the saccer config and annotate one mutation.

    There are no assertions; the test passes if nothing raises.
    """
    config_dir = "testdata/ensembl/saccer/"
    parser = ConfigUtils.createConfigParser(config_dir + "ensembl.config")

    # Read the three settings in the same order as before: title, version, src_file.
    general = {}
    for option in ("title", "version", "src_file"):
        general[option] = parser.get("general", option)

    ensembl_ds = EnsemblTranscriptDatasource(title=general["title"],
                                             version=general["version"],
                                             src_file=general["src_file"])

    snp = MutationData()
    snp.chr = "22"
    snp.start = "22161963"
    snp.end = "22161963"
    snp.ref_allele = "C"
    snp.alt_allele = "A"

    m2 = ensembl_ds.annotate_mutation(snp)
def testMicroRNA(self):
    """Annotate chr12:31379268 A>G and expect the gene hsa-miR-3194-3p.

    The gene comparison is case-insensitive.
    """
    # uc021qwk.1	chr12:31379258-31379277:-	hsa-miR-3194-3p|?	chr12:31379258-31379277:-	Confidence=100
    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    snp = MutationData()
    for field, value in (('start', 31379268),
                         ('end', 31379268),
                         ('chr', "12"),
                         ('alt_allele', 'G'),
                         ('ref_allele', 'A')):  # This is accurate
        setattr(snp, field, value)

    annotated = gaf_ds.annotate_mutation(snp)
    gene = annotated['gene']
    failure_msg = ("Wrong gene (GT: hsa-mir-3194-3p): " + gene +
                   "   -- if updating GAF, this test may fail as this result may not be appropriate.")
    self.assertTrue(gene.lower() == "hsa-mir-3194-3p", failure_msg)
def testSilentMutationGoingToSpliceSite(self):
    """Count variant classifications across chr1:28233780-28233805.

    Each position is annotated with the ref/alt base at the matching
    offset of the two strings below.  Exactly 4 Splice_Site and 11 Silent
    classifications (compared case-insensitively) are expected.
    """
    # chr1:28,233,780-28,233,805 Junction is at chr1:28,233,793 & 94
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCGCTGAAAAGAAAA"
    vcs = []
    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    splice_site_count = 0
    silent_count = 0
    window_start = 28233780
    junction = 28233793

    # refs and alts both hold 26 bases, one per position of the window.
    for offset, (ref_base, alt_base) in enumerate(zip(refs, alts)):
        position = str(window_start + offset)

        snp = MutationData()
        snp.start = position
        snp.end = position
        snp.chr = "1"
        snp.ref_allele = ref_base
        snp.alt_allele = alt_base

        snp = gaf_ds.annotate_mutation(snp)

        distance = abs(junction - int(snp.start))
        vc = snp['variant_classification']
        vcs.append(vc)

        # A per-position "not Silent" assertion is intentionally left out here.
        vc_lower = vc.lower()
        if vc_lower == "splice_site":
            splice_site_count += 1
        elif vc_lower == "silent":
            silent_count += 1

        print(vc + "  " + snp.start + "  " + str(distance))

    self.assertTrue(splice_site_count == 4,
                    "Should have seen 4 splice site mutations, but saw: " + str(splice_site_count))
    self.assertTrue(silent_count == 11,
                    "Should have seen 11 Silent mutations, but saw: " + str(silent_count))
def test_canonical_tx_list(self):
    """Check that a custom canonical transcript list changes the selected transcript.

    With ["ENST00000544786"] set, chr22:22142650 T>A must be annotated
    on that transcript as an Intron; after resetting the list to [] it
    must be Missense on a different transcript.
    """
    ds = TestUtils._create_test_gencode_v19_ds("out/test_canonical_tx_list_")

    snp = MutationData()
    for field, value in (("chr", "22"),
                         ("start", "22142650"),
                         ("end", "22142650"),
                         ("ref_allele", "T"),
                         ("alt_allele", "A")):
        setattr(snp, field, value)

    ds.set_custom_canonical_txs(["ENST00000544786"])
    # NOTE: tx list overrides best effect
    ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)

    annotated = ds.annotate_mutation(snp)
    picked_custom_tx = annotated['annotation_transcript'].startswith("ENST00000544786")
    self.assertTrue(picked_custom_tx)
    is_intron = annotated['variant_classification'] == VariantClassification.INTRON
    self.assertTrue(is_intron)

    # Clear the custom list and annotate the same mutation again.
    ds.set_custom_canonical_txs([])
    annotated = ds.annotate_mutation(snp)
    is_missense = annotated['variant_classification'] == VariantClassification.MISSENSE
    self.assertTrue(is_missense)
    picked_custom_tx = annotated['annotation_transcript'].startswith("ENST00000544786")
    self.assertFalse(picked_custom_tx)
def testSkippingAltsForSingleMut(self):
    """Check that a lone mutation with alt_allele_seen "False" is not emitted.

    The annotator is initialised with is_skip_no_alts=True and no
    datasources; the first `next` call on its output must raise
    StopIteration.
    """
    run_spec = RunSpecification()
    run_spec.initialize(None, None, datasources=[], is_skip_no_alts=True)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(run_spec)

    unseen = MutationData()
    for field, value in (("chr", "1"),
                         ("start", "12941796"),
                         ("end", "12941796"),
                         ("alt_allele", "G"),
                         ("ref_allele", "T")):
        setattr(unseen, field, value)
    unseen.createAnnotation("alt_allele_seen", "False")

    annotated_muts = annotator.annotate_mutations([unseen])
    self.assertRaises(StopIteration, annotated_muts.next)
def testSkippingAltsForSingleMut(self):
    """Check that a lone mutation with alt_allele_seen "False" is not emitted.

    The annotator is initialised with is_skip_no_alts=True and no
    datasources; the first `next` call on its output must raise
    StopIteration.
    """
    run_spec = RunSpecification()
    run_spec.initialize(None, None, datasources=[], is_skip_no_alts=True)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(run_spec)

    unseen = MutationData()
    for field, value in (("chr", "1"),
                         ("start", "12941796"),
                         ("end", "12941796"),
                         ("alt_allele", "G"),
                         ("ref_allele", "T")):
        setattr(unseen, field, value)
    unseen.createAnnotation("alt_allele_seen", "False")

    annotated_muts = annotator.annotate_mutations([unseen])
    self.assertRaises(StopIteration, annotated_muts.next)
def testFlank2(self):
    """Annotate chr1:228646357 C>T and expect gene HIST3H2A classified as 5'Flank."""
    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    # 1	228646357	nearest Gene=HIST3H2A C>T
    snp = MutationData()
    for field, value in (('start', str(228646357)),
                         ('end', str(228646357)),
                         ('chr', "1"),
                         ('ref_allele', 'C'),
                         ('alt_allele', 'T')):
        setattr(snp, field, value)

    annotated = gaf_ds.annotate_mutation(snp)

    gene = annotated['gene']
    gene_msg = ("Wrong gene (GT: HIST3H2A): " + gene +
                "   -- if updating GAF, this test may fail as this gene may not be appropriate.")
    self.assertTrue(gene == "HIST3H2A", gene_msg)

    classification = annotated['variant_classification']
    classification_msg = (
        "Should be 5'Flank, but was " + classification +
        " -- if updating GAF, this test may fail as this test is data specific.  "
        "Also, this may fail if padding parameters are changed.")
    self.assertTrue(classification == "5'Flank", classification_msg)
def testSpliceSiteWithinNBases(self):
    """Check variant classifications around a splice junction on chr21.

    Every position in chr21:10998326-10998346 is annotated with the
    ref/alt base at the matching offset of the strings below.  No
    position may be classified 'Silent'; the four positions at list
    indices 8-11 must be 'Splice_Site', and indices 0-7 and 12-19 must not.
    """
    # chr21:10,998,326-10,998,346
    # 10,998,336 is a splice site.  (Junction between 10998335 and 336)
    # AGTTCTCCTT C TGGAAAAAAG
    refs = 'AGTTCTCCTTCTGGAAAAAAG'
    alts = 'TCAGACTGAAAATACCCCCCT'
    window_start = 10998326
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    vcs = []
    for s in range(window_start, 10998347):
        m = MutationData()
        m.start = str(s)
        m.end = str(s)
        m.chr = "21"
        m.ref_allele = refs[s - window_start]
        m.alt_allele = alts[s - window_start]

        m = gafDatasource.annotate_mutation(m)

        vc = m['variant_classification']
        self.assertTrue(vc != 'Silent',
                        'Silent mutation found when it should be a splice site.')
        vcs.append(vc)
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print(vc + "  " + m.start)

    self.assertTrue(all(tmp == "Splice_Site" for tmp in vcs[8:12]),
                    "Not all vcs within 2 bases were splice site: " + str(vcs[8:12]))
    self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[0:8]),
                    "No splice sites should be seen: " + str(vcs[0:8]))
    # NOTE(review): vcs holds 21 entries, so this slice leaves the last
    # position (index 20) unchecked -- confirm whether that is intended.
    self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[12:20]),
                    "No splice sites should be seen: " + str(vcs[12:20]))