def test_effect_tx_mode(self):
    """Check that the transcript mode changes the classification of one SNV.

    The same chr2:219137340 G>T mutation is annotated twice with the
    transcript-provider datasource: once in best-effect mode (expected
    Missense_Mutation) and once in canonical mode (expected Intron).  Both
    runs are expected to report the gene PNKD.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    def annotate_in_mode(tx_mode):
        # Switch the mode first, then build a fresh mutation for this run.
        gafDatasource.set_tx_mode(tx_mode)
        m = MutationDataFactory.default_create()
        m.chr = '2'
        m.start = '219137340'
        m.end = '219137340'
        m.ref_allele = 'G'
        m.alt_allele = 'T'
        return gafDatasource.annotate_mutation(m)

    # Canonical mutation was Intron
    m = annotate_in_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(m['gene'], "PNKD")
    self.assertEqual(m['variant_classification'], "Missense_Mutation")

    m = annotate_in_mode(TranscriptProvider.TX_MODE_CANONICAL)
    self.assertEqual(m['gene'], "PNKD")
    self.assertEqual(
        m['variant_classification'], "Intron",
        "Canonical no longer is Intron. This test is no longer valid. This failure can come up when changing the GAF datasource.")
def testRealWorld(self):
    """Test that the full COSMIC datasource can retrieve entries by both gp and gpp."""
    gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
    cosmicDS = TestUtils.createCosmicDatasource(self.config)

    def annotate_snv(position):
        # Build a chr1 G>T SNV at ``position`` and run it through the
        # transcript datasource first, then COSMIC.
        m = MutationDataFactory.default_create()
        m.chr = '1'
        m.start = position
        m.end = position
        m.ref_allele = "G"
        m.alt_allele = "T"
        m = gafDS.annotate_mutation(m)
        return cosmicDS.annotate_mutation(m)

    # These values are not taken from a real world scenario, but are cooked for this test.
    m = annotate_snv('12941796')
    # assertEqual shows the actual count on failure.
    self.assertEqual(m['COSMIC_n_overlapping_mutations'], '0')

    # 1 150483621 150483621
    # NOTE(review): nothing is asserted on this second result, so this call
    # only verifies that annotation does not raise.  The expected COSMIC
    # values are not visible here -- TODO add assertions once confirmed.
    annotate_snv('150483621')
def test_continuous_exons_in_segments(self):
    """Test that all exons are accounted when annotating adjacent segments that skip an exon.

    Two chr22 segments are built; the start of the first and the end of the
    second are passed to the transcript datasource's exon helpers, which
    are expected to return (10, '+') and (8, '-') respectively.
    """
    # SPECC1L 10+	22	24734447	SPECC1L	10+	41783674	TEF	1-	1215.0	-0.04975556624325125	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    # SPECC1L 8-	22	16282318	POTEH	2-	24730543	SPECC1L	8-	433.0	-0.00781166374668759	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    # SPECC1L-ADORA2A	22	24734447	SPECC1L	10+	41783674	TEF	1-	1215.0	-0.04975556624325125	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    seg1 = MutationDataFactory.default_create()
    seg1.chr = "22"
    seg1.start = "24734447"  # Just passed the exon 9 (0-based)
    seg1.end = "41783674"

    seg2 = MutationDataFactory.default_create()
    seg2.chr = "22"
    seg2.start = "16282318"
    seg2.end = "24730543"  # Just passed the exon 8 (0-based)

    # 'ENST00000314328.9' for GENCODE v19
    chosen_tx, transcript_ds = self._get_chosen_tx_and_transcript_ds(seg1.chr, seg1.start)

    # assertEqual prints the actual tuple when the expectation is not met.
    result_tuple = transcript_ds._determine_exons_affected_by_start(seg1.start, chosen_tx)
    self.assertEqual(result_tuple, (10, '+'))

    result_tuple = transcript_ds._determine_exons_affected_by_end(seg2.end, chosen_tx)
    self.assertEqual(result_tuple, (8, '-'))
def test_simple_collapse(self):
    """Ensure simple rules for numeric collapsing are honored

    Two mutations with pipe-delimited annotation values are passed through
    ``ColumnCollapser.update_mutation`` and the resulting values are
    checked.  Numeric results are compared with ``assertAlmostEqual`` so
    the test does not depend on exact binary floating-point representation.
    """
    def make_mutation(annotations):
        # Annotations are created in the order given.
        mut = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
        for name, value in annotations:
            mut.createAnnotation(name, value)
        return mut

    m1 = make_mutation([
        ('ALT_F2R1', "34|36"),
        ('i_t_Foxog', ".509|.511"),
        ('i_tumor_f', ".200|.210"),
        ('hamilcar', "0|0"),
        ('donotcollapse', "1|45"),
    ])
    m2 = make_mutation([
        ('ALT_F2R1', "36|38"),
        ('i_t_Foxog', ".500|.510"),
        ('i_tumor_f', ".100|.110"),
        ('hamilcar', "0.01|0"),
        ('barca', "0.02|0"),
        ('donotcollapse', "100|4500"),
    ])

    cc = ColumnCollapser()

    cc.update_mutation(m1)
    self.assertEqual(m1['ALT_F2R1'], "34")
    self.assertAlmostEqual(float(m1['i_t_Foxog']), float(".510"))
    self.assertAlmostEqual(float(m1['i_tumor_f']), float(".205"))
    self.assertAlmostEqual(float(m1['hamilcar']), float("0"))
    self.assertEqual(m1['donotcollapse'], "1|45")

    cc.update_mutation(m2)
    self.assertEqual(m2['ALT_F2R1'], "36")
    self.assertAlmostEqual(float(m2['i_t_Foxog']), float(".505"))
    self.assertAlmostEqual(float(m2['i_tumor_f']), float(".105"))
    self.assertAlmostEqual(float(m2['hamilcar']), float("0.005"))
    self.assertAlmostEqual(float(m2['barca']), float("0.01"))
    self.assertEqual(m2['donotcollapse'], "100|4500")
def _simple_annotate(self, is_skip_no_alts):
    """Annotate two mutations with no datasources and count the output.

    Both mutations share the same coordinates and alleles.  Only the first
    one carries an ``alt_allele_seen`` annotation (set to "False").

    :param is_skip_no_alts: forwarded to ``RunSpecification.initialize``.
    :return: number of mutations produced by ``Annotator.annotate_mutations``.
    """
    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=[], is_skip_no_alts=is_skip_no_alts)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    def build_mutation():
        mut = MutationDataFactory.default_create()
        mut.chr = "1"
        mut.start = "12941796"
        mut.end = "12941796"
        mut.alt_allele = "G"
        mut.ref_allele = "T"
        return mut

    flagged = build_mutation()
    flagged.createAnnotation("alt_allele_seen", "False")
    unflagged = build_mutation()

    annotated = annotator.annotate_mutations([flagged, unflagged])
    # Consume the result and count the items.
    return sum(1 for _ in annotated)
def test_effect_tx_mode(self):
    """Check that the transcript mode changes the classification of one SNV.

    The same chr2:219137340 G>T mutation is annotated twice with the
    transcript-provider datasource: once in best-effect mode (expected
    Missense_Mutation) and once in canonical mode (expected Intron).  Both
    runs are expected to report the gene PNKD.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(
        self.config)

    def annotate_in_mode(tx_mode):
        # Switch the mode first, then build a fresh mutation for this run.
        gafDatasource.set_tx_mode(tx_mode)
        m = MutationDataFactory.default_create()
        m.chr = '2'
        m.start = '219137340'
        m.end = '219137340'
        m.ref_allele = 'G'
        m.alt_allele = 'T'
        return gafDatasource.annotate_mutation(m)

    # Canonical mutation was Intron
    m = annotate_in_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(m['gene'], "PNKD")
    self.assertEqual(m['variant_classification'], "Missense_Mutation")

    m = annotate_in_mode(TranscriptProvider.TX_MODE_CANONICAL)
    self.assertEqual(m['gene'], "PNKD")
    self.assertEqual(
        m['variant_classification'], "Intron",
        "Canonical no longer is Intron. This test is no longer valid. This failure can come up when changing the GAF datasource."
    )
def testMulticoreAnnotate(self):
    """Test a (too) simple annotating exercise from GAF on 2 cores

    The datasource proxy is first pickled to ``out/testGAFPickle.pkl``, then
    two chr3 A>C SNVs are annotated through a two-process ``LoggingPool``.
    """
    gafDatasource = TestUtils.createGafDatasourceProxy(self.config)

    # Test pickling
    # ``open`` inside ``with`` replaces the ``file(...)`` builtin and makes
    # sure the handle is flushed and closed even when ``dump`` raises.
    with open('out/testGAFPickle.pkl', 'w') as pickle_fp:
        dump(gafDatasource, pickle_fp)

    def make_snv(position):
        mut = MutationDataFactory.default_create()
        mut.chr = '3'
        mut.start = position
        mut.end = position
        mut.ref_allele = "A"
        mut.alt_allele = "C"
        mut.build = "hg19"
        return mut

    m1 = make_snv('178866811')
    m2 = make_snv('178866812')

    p = LoggingPool(processes=2)
    result = p.map(annotate_mutation_global, [(gafDatasource, m1), (gafDatasource, m2)])
    p.close()
    p.join()

    for r in result:
        self.assertTrue("transcript_id" in r.keys())
        self.assertTrue("gene" in r.keys())
        self.assertEqual(r["gene"], "PIK3CA")

    # The two results must correspond to the two distinct input positions.
    self.assertNotEqual(result[0].start, result[1].start)
def test_mutation_combiner(self):
    """Test that attributes and annotations are set properly with combine mutations"""
    # Two adjacent SNVs (positions 100 and 101) that share one annotation
    # name; the second also has an annotation the first lacks.
    first = MutationDataFactory.default_create(
        chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    first.createAnnotation("SomeValue", "value1", "INPUT", "STRING", "a value")

    second = MutationDataFactory.default_create(
        chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    second.createAnnotation("SomeValue", "value2", tags=["IT"])
    second.createAnnotation("AnotherValue", "5")

    factory = MutationDataFactory()
    combined = OnpQueue._combine_mutations([first, second], factory)

    # Expected: one 100-101 mutation with concatenated alleles and the
    # shared annotation joined with '|'.
    expected = MutationDataFactory.default_create(
        chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation(
        "SomeValue", "value1|value2", "INPUT", "STRING", "a value", tags=["IT"])
    expected.createAnnotation("AnotherValue", "5")

    self.assertTrue(combined.attributesEqual(expected))
    self.assertEqual(combined, expected)
def test_annotation_copy_collision(self):
    """Test that annotation copy will use the behavior of the mutation in case of collision due to suffix"""
    def build_colliding_mutation(**factory_kwargs):
        # 'ALT_F2R1_full' already exists, so copying 'ALT_F2R1' with the
        # '_full' suffix collides with it.
        mut = MutationDataFactory.default_create(
            chr="1", start="10000", end="10000", **factory_kwargs)
        mut.createAnnotation('ALT_F2R1', "30|36", annotationSource="TEST")
        mut.createAnnotation('ALT_F2R1_full', "going_to_be_overwritten", annotationSource="TEST")
        return mut

    # Default mutation: the collision is expected to raise.
    strict_mut = build_colliding_mutation()
    collapser = ColumnCollapser()
    try:
        collapser.update_mutation(strict_mut, copy_old_suffix="_full")
    except DuplicateAnnotationException:
        pass
    else:
        self.fail("Did not see duplicate annotation exception")

    # Mutation created with allow_overwriting=True: the old value is replaced.
    lenient_mut = build_colliding_mutation(allow_overwriting=True)
    collapser = ColumnCollapser()
    collapser.update_mutation(lenient_mut, copy_old_suffix="_full")
    self.assertEqual(lenient_mut['ALT_F2R1_full'], "30|36")
    self.assertEqual(lenient_mut['ALT_F2R1'], "30")
def test_continuous_exons_in_segments(self):
    """Test that all exons are accounted when annotating adjacent segments that skip an exon.

    Two chr22 segments are built; the start of the first and the end of the
    second are passed to the transcript datasource's exon helpers, which
    are expected to return (10, '+') and (8, '-') respectively.
    """
    # SPECC1L 10+	22	24734447	SPECC1L	10+	41783674	TEF	1-	1215.0	-0.04975556624325125	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    # SPECC1L 8-	22	16282318	POTEH	2-	24730543	SPECC1L	8-	433.0	-0.00781166374668759	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    # SPECC1L-ADORA2A	22	24734447	SPECC1L	10+	41783674	TEF	1-	1215.0	-0.04975556624325125	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    seg1 = MutationDataFactory.default_create()
    seg1.chr = "22"
    seg1.start = "24734447"  # Just passed the exon 9 (0-based)
    seg1.end = "41783674"

    seg2 = MutationDataFactory.default_create()
    seg2.chr = "22"
    seg2.start = "16282318"
    seg2.end = "24730543"  # Just passed the exon 8 (0-based)

    # 'ENST00000314328.9' for GENCODE v19
    chosen_tx, transcript_ds = self._get_chosen_tx_and_transcript_ds(
        seg1.chr, seg1.start)

    # assertEqual prints the actual tuple when the expectation is not met.
    result_tuple = transcript_ds._determine_exons_affected_by_start(
        seg1.start, chosen_tx)
    self.assertEqual(result_tuple, (10, '+'))

    result_tuple = transcript_ds._determine_exons_affected_by_end(
        seg2.end, chosen_tx)
    self.assertEqual(result_tuple, (8, '-'))
def test_small_positive_strand_transcript_change(self):
    """Test one location on a transcript and make sure that the transcript change rendered properly """
    ds = TestUtils._create_test_gencode_v19_ds(
        "out/small_positive_strand_")

    # (chr, position, ref, alt, expected transcript change)
    cases = (
        # Now for a negative strand
        ("22", "22221730", "T", "G", "c.1A>C"),
        # positive strand
        ("3", "178916614", "G", "T", "c.1G>T"),
    )
    for chrom, position, ref, alt, expected_change in cases:
        mut = MutationDataFactory.default_create()
        mut.chr = chrom
        mut.start = position
        mut.end = position
        mut.ref_allele = ref
        mut.alt_allele = alt
        annotated = ds.annotate_mutation(mut)
        self.assertTrue(
            annotated['transcript_change'] == expected_change,
            "Incorrect transcript change: " + annotated['transcript_change'])
def test_mutation_combiner_ordering(self):
    """Test that ordering of combined attributes matches original order"""
    # Each adjacent SNV carries the same two annotation names with
    # different values.
    first = MutationDataFactory.default_create(
        chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    first.createAnnotation("SomeDepth", "2")
    first.createAnnotation("AnotherDepth", "1")

    second = MutationDataFactory.default_create(
        chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    second.createAnnotation("SomeDepth", "1")
    second.createAnnotation("AnotherDepth", "2")

    factory = MutationDataFactory()
    combined = OnpQueue._combine_mutations([first, second], factory)

    # Values are expected as "<first>|<second>" for every annotation.
    expected = MutationDataFactory.default_create(
        chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation("SomeDepth", "2|1")
    expected.createAnnotation("AnotherDepth", "1|2")

    self.assertTrue(combined.attributesEqual(expected))
    self.assertEqual(combined, expected)
def test_mutation_combiner_identical_annotation(self):
    """Test that annotations with all identical values are not repeated with | between them"""
    sample = "John Doe"

    first = MutationDataFactory.default_create(
        chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    first.createAnnotation("SampleName", sample)

    second = MutationDataFactory.default_create(
        chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    second.createAnnotation("SampleName", sample)

    factory = MutationDataFactory()
    combined = OnpQueue._combine_mutations([first, second], factory)

    # The shared value is expected once, not as "John Doe|John Doe".
    expected = MutationDataFactory.default_create(
        chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation("SampleName", sample)

    self.assertTrue(combined.attributesEqual(expected))
    self.assertEqual(combined, expected)
def test_denovo(self):
    """GAF de novo test """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(
        self.config)

    # (end position, inserted allele, expected variant classification);
    # every case starts at chr22:22221735 with an empty reference allele.
    cases = (
        (22221737, 'CAT', 'De_novo_Start_OutOfFrame'),
        (22221740, 'AACATAA', 'De_novo_Start_OutOfFrame'),
        (22221739, 'ACATAA', 'De_novo_Start_InFrame'),
    )
    for end_position, inserted, expected_vc in cases:
        mut = MutationDataFactory.default_create()
        mut.start = str(22221735)
        mut.end = str(end_position)
        mut.chr = "22"
        mut.ref_allele = ''
        mut.alt_allele = inserted
        mut = gafDatasource.annotate_mutation(mut)
        self.assertTrue(mut['variant_classification'] == expected_vc)
def test_denovo(self):
    """GAF de novo test """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    def classify_insertion(end_position, inserted):
        # Annotate an insertion at chr22:22221735 (empty reference allele)
        # and return its variant classification.
        mut = MutationDataFactory.default_create()
        mut.start = str(22221735)
        mut.end = str(end_position)
        mut.chr = "22"
        mut.ref_allele = ''
        mut.alt_allele = inserted
        annotated = gafDatasource.annotate_mutation(mut)
        return annotated['variant_classification']

    self.assertTrue(classify_insertion(22221737, 'CAT') == 'De_novo_Start_OutOfFrame')
    self.assertTrue(classify_insertion(22221740, 'AACATAA') == 'De_novo_Start_OutOfFrame')
    self.assertTrue(classify_insertion(22221739, 'ACATAA') == 'De_novo_Start_InFrame')
def testMulticoreAnnotate(self):
    """Test a (too) simple annotating exercise from GAF on 2 cores

    The datasource proxy is first pickled to ``out/testGAFPickle.pkl``, then
    two chr3 A>C SNVs are annotated through a two-process ``LoggingPool``.
    """
    gafDatasource = TestUtils.createGafDatasourceProxy(self.config)

    # Test pickling
    # ``open`` inside ``with`` replaces the ``file(...)`` builtin and makes
    # sure the handle is flushed and closed even when ``dump`` raises.
    with open('out/testGAFPickle.pkl', 'w') as pickle_fp:
        dump(gafDatasource, pickle_fp)

    def make_snv(position):
        mut = MutationDataFactory.default_create()
        mut.chr = '3'
        mut.start = position
        mut.end = position
        mut.ref_allele = "A"
        mut.alt_allele = "C"
        mut.build = "hg19"
        return mut

    m1 = make_snv('178866811')
    m2 = make_snv('178866812')

    p = LoggingPool(processes=2)
    result = p.map(annotate_mutation_global, [(gafDatasource, m1),
                                              (gafDatasource, m2)])
    p.close()
    p.join()

    for r in result:
        self.assertTrue("transcript_id" in r.keys())
        self.assertTrue("gene" in r.keys())
        self.assertEqual(r["gene"], "PIK3CA")

    # The two results must correspond to the two distinct input positions.
    self.assertNotEqual(result[0].start, result[1].start)
def testRetrievePrecedingBasesForInsertions(self):
    """Check ``MutUtils.retrievePrecedingBasesForInsertions`` on two insertions.

    Both cases use ref allele "GTC" at position 1234567 and differ only in
    the alt allele ("GTCT" and "GTCTT").  After applying the returned
    values, the mutation is expected to start at 1234569, end at 1234570,
    have ref allele "-" and an alt allele of "T" / "TT" respectively.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    # (input alt allele, expected alt allele after trimming)
    for alt_allele, expected_alt in (("GTCT", "T"), ("GTCTT", "TT")):
        mut = MutationDataFactory.default_create(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(
            annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
            annotationValue=preceding_bases)

        self.assertTrue("_preceding_bases" in mut,
                        "_preceding_bases is missing in the mutation data.")
        # The failure messages now state the value that is actually
        # asserted (they previously quoted 1234570 / 1234571).
        self.assertEqual(mut.start, 1234569,
                         "Mut start should be 1234569 but was %s." % mut.start)
        self.assertEqual(mut.end, 1234570,
                         "Mut end should be 1234570 but was %s." % mut.end)
        self.assertEqual(mut.ref_allele, "-",
                         "Ref allele should be - but was %s." % mut.ref_allele)
        self.assertEqual(mut.alt_allele, expected_alt,
                         "Alt allele should be %s but was %s." % (expected_alt, mut.alt_allele))
def test_mutation_combiner(self):
    """Test that attributes and annotations are set properly with combine mutations"""
    # Two adjacent SNVs (positions 100 and 101) that share one annotation
    # name; the second also has an annotation the first lacks.
    left = MutationDataFactory.default_create(
        chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    left.createAnnotation("SomeValue", "value1", "INPUT", "STRING", "a value")

    right = MutationDataFactory.default_create(
        chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    right.createAnnotation("SomeValue", "value2", tags=["IT"])
    right.createAnnotation("AnotherValue", "5")

    factory = MutationDataFactory()
    merged = OnpQueue._combine_mutations([left, right], factory)

    # Expected: one 100-101 mutation with concatenated alleles and the
    # shared annotation joined with '|'.
    expected = MutationDataFactory.default_create(
        chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation(
        "SomeValue", "value1|value2", "INPUT", "STRING", "a value", tags=["IT"])
    expected.createAnnotation("AnotherValue", "5")

    self.assertTrue(merged.attributesEqual(expected))
    self.assertEqual(merged, expected)
def test_mutation_combiner_identical_annotation(self):
    """Test that annotations with all identical values are not repeated with | between them"""
    left = MutationDataFactory.default_create(
        chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    left.createAnnotation("SampleName", "John Doe")

    right = MutationDataFactory.default_create(
        chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    right.createAnnotation("SampleName", "John Doe")

    factory = MutationDataFactory()
    merged = OnpQueue._combine_mutations([left, right], factory)

    # The shared value is expected once, not as "John Doe|John Doe".
    expected = MutationDataFactory.default_create(
        chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation("SampleName", "John Doe")

    self.assertTrue(merged.attributesEqual(expected))
    self.assertEqual(merged, expected)
def test_canonical_tx_list(self):
    """Test that specifying the canonical list will actually change the transcript selected.

    The same chr22:22142650 T>A mutation is annotated with a custom
    canonical transcript list (expected Intron on ENST00000544786) and then
    with the list cleared (expected Missense on a different transcript).
    """
    ds = TestUtils._create_test_gencode_v19_ds(
        "out/test_canonical_tx_list_")
    m = MutationDataFactory.default_create()
    m.chr = "22"
    m.start = "22142650"
    m.end = "22142650"
    m.ref_allele = "T"
    m.alt_allele = "A"

    ds.set_custom_canonical_txs(["ENST00000544786"])
    ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)

    # NOTE: tx list overrides best effect
    m2 = ds.annotate_mutation(m)
    self.assertTrue(
        m2['annotation_transcript'].startswith("ENST00000544786"))
    # assertEqual shows the classification that was actually produced.
    self.assertEqual(
        m2['variant_classification'], VariantClassification.INTRON)

    ds.set_custom_canonical_txs([])
    m2 = ds.annotate_mutation(m)
    self.assertEqual(
        m2['variant_classification'], VariantClassification.MISSENSE)
    self.assertFalse(
        m2['annotation_transcript'].startswith("ENST00000544786"))
def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
    """Annotate chr1:35140 with the dbNSFP test datasource and check three annotations.

    The mutation has no ref/alt alleles set.  ``dbNSFP_codonpos``,
    ``dbNSFP_refcodon`` and ``dbNSFP_cds_strand`` are compared against
    expected ``Annotation`` objects via ``isEqual``.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config")
    datasource = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutationDataFactory.default_create()
    mut.chr = "1"
    mut.start = "35140"
    mut.end = "35140"

    annotated = datasource.annotate_mutation(mut)

    # (annotation name, expected value)
    expectations = (
        ("dbNSFP_codonpos", "1|1|1"),
        ("dbNSFP_refcodon", "TAA|TAA|TAA"),
        ("dbNSFP_cds_strand", "-|-|-"),
    )
    for annotation_name, expected_value in expectations:
        observed = annotated.getAnnotation(annotation_name)
        reference = Annotation(value=expected_value, datasourceName="dbNSFP", dataType="String",
                               description="",
                               tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(observed.isEqual(reference), "Annotations do not match.")
def testAnnotateListOfMutations(self):
    """Test that we can initialize an Annotator, without an input or output and then feed mutations, one at a time... using a runspec"""
    # Locate the datasource directory and create a runspec
    dbDir = self.config.get("DEFAULT", "dbDir")
    ds = DatasourceFactory.createDatasources(dbDir)
    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=ds)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    m = MutationDataFactory.default_create()
    m.chr = "1"
    m.start = "12941796"
    m.end = "12941796"
    m.alt_allele = "G"
    m.ref_allele = "T"

    muts = annotator.annotate_mutations([m])
    # The builtin next() works with both the Python 2 ``.next()`` and the
    # Python 3 ``.__next__()`` iterator protocols.
    m2 = next(muts)
    self.assertTrue(m2.get("gene", None) is not None)
def testESPCoverageAnnotationWithSNPAvgMatch(self):
    """Annotate chrX:100075334 with the ESP coverage test datasource and check three annotations.

    ``ESP_AvgAAsampleReadDepth``, ``ESP_TotalAAsamplesCovered`` and
    ``ESP_Chromosome`` are compared against expected ``Annotation`` objects
    via ``isEqual``.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_avg_ds", "hg19")
    ds_config = os.path.join(ds_dir, "small_esp_coverage_avg_ds.config")
    datasource = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutationDataFactory.default_create()
    mut.chr = "X"
    mut.start = "100075334"
    mut.end = "100075334"

    annotated = datasource.annotate_mutation(mut)

    # (annotation name, expected value, expected data type)
    expectations = (
        ("ESP_AvgAAsampleReadDepth", "75.0", "Float"),
        ("ESP_TotalAAsamplesCovered", "692.0", "Float"),
        ("ESP_Chromosome", "X", "String"),
    )
    for annotation_name, expected_value, expected_type in expectations:
        observed = annotated.getAnnotation(annotation_name)
        reference = Annotation(value=expected_value, datasourceName="ESP", dataType=expected_type,
                               description="",
                               tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(observed.isEqual(reference), "Annotations do not match.")
def test_appris_selects_transcript(self):
    """Check which transcript is chosen for a two-base deletion on chr2.

    The mutation (chr2:201722365-201722366, AC>-) is annotated with the
    transcript-provider datasource; the transcript named by the
    ``annotation_transcript`` annotation must exist and have the expected id.
    """
    # Ground truth transcript
    expected_tx_id = 'ENST00000321356.4'

    mut = MutationDataFactory.default_create(
        chr="2", start="201722365", end="201722366",
        ref_allele="AC", alt_allele="-", build="hg19")
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    mut = transcript_ds.annotate_mutation(mut)

    chosen_tx_id = mut['annotation_transcript']
    tx = transcript_ds.get_transcript(chosen_tx_id)
    self.assertTrue(
        tx is not None,
        "Transcript was None when it should have been found. Does the ground truth transcript above need to be updated?")
    self.assertEqual(tx._transcript_id, expected_tx_id)
def testExtentOutOfRangeError(self):
    ''' If a window is specified that extends beyond the beginning or end of a file, truncate the ref_context. Use what is left for gc_content as well.'''
    ds = ReferenceDatasource('testdata/reference_ds',
                             windowSizeRef=6,
                             windowSizeGCContent=5)
    mut = MutationDataFactory.default_create()
    mut.chr = "22"
    mut.start = "4"
    mut.end = "4"

    # "CCCAAGCTAAACCCAGGCCAC"
    expected_context = "CCCAAGCTAA"
    expected_gc = float(5) / float(9)

    annotated = ds.annotate_mutation(mut)

    observed_context = annotated['ref_context']
    self.assertTrue(
        observed_context == expected_context,
        "ref_context was not populated properly: " + str(annotated['ref_context']))

    # gc_content is rounded to 3 decimal places
    gc_error = fabs(float(annotated['gc_content']) - expected_gc)
    self.assertTrue(
        gc_error < .001,
        "gc_content was not populated properly: " + str(annotated['gc_content']))
def testESPCoverageAnnotationWithMissingAnnotationValuesIndelAvgMatch(
        self):
    """Annotate the chrX:100075350-100075356 range with the ESP coverage test datasource.

    ``ESP_AvgSampleReadDepth`` is compared against an expected
    ``Annotation`` (value "91.25") via ``isEqual``.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_avg_ds", "hg19")
    ds_config = os.path.join(ds_dir, "small_esp_coverage_avg_ds.config")
    datasource = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutationDataFactory.default_create()
    mut.chr = "X"
    mut.start = "100075350"
    mut.end = "100075356"

    annotated = datasource.annotate_mutation(mut)
    observed = annotated.getAnnotation("ESP_AvgSampleReadDepth")
    reference = Annotation(
        value="91.25",
        datasourceName="ESP",
        dataType="Float",
        description="",
        tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
        number=None)
    matches = observed.isEqual(reference)
    self.assertTrue(matches, "Annotations do not match.")
def testMixedAnnotation(self):
    """Test that the COSMIC datasource can retrieve entries by both gp and gpp."""
    tabixDir = "testdata/small_cosmic_with_gp_and_gpp/"
    cosmicDS = Cosmic(
        src_file=tabixDir + "small_cosmic_trimmed_for_sorting.txt.tbi.gz",
        title="Cosmic",
        version="test",
        gpp_tabix_file=tabixDir + "small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz")

    # These values are not taken from a real world scenario, but are cooked for this test.
    # Line 9 should get picked up genomic coords
    # Lines 7,8 should get picked up by the protein position
    m = MutationDataFactory.default_create()
    m.createAnnotation("gene", "A2M")
    m.createAnnotation("transcript_protein_position_start", "1300")
    m.createAnnotation("transcript_protein_position_end", "1400")
    m.chr = '12'
    m.start = '9227220'
    m.end = '9227230'
    m = cosmicDS.annotate_mutation(m)

    # assertEqual reports the observed value when the count is wrong.
    self.assertEqual(m['COSMIC_n_overlapping_mutations'], '3')
    # Substring membership instead of ``find(...) != -1``.
    self.assertTrue(
        '1229' in m['COSMIC_overlapping_mutation_AAs'],
        "Could not find the entry specified by genomic coords.")
    self.assertEqual(
        m['COSMIC_overlapping_primary_sites'], "lung(3)",
        "Did not have the correct primary sites annotation (lung(3)): " +
        m['COSMIC_overlapping_primary_sites'])
def testSpliceSiteWithinNBases(self):
    """Test that a silent mutation is changed to splice site w/in 10 bases of a splice site

    Every position in chr21:10998326-10998346 is annotated as an SNV.  No
    position may be classified as 'Silent'; the four positions at window
    indices 8-11 must be 'Splice_Site' and indices 0-7 and 12-19 must not.
    """
    # chr21:10,998,326-10,998,346
    # 10,998,336 is a splice site.  (Junction between 10998335 and 336)
    # AGTTCTCCTT C TGGAAAAAAG
    refs = 'AGTTCTCCTTCTGGAAAAAAG'
    alts = 'TCAGACTGAAAATACCCCCCT'
    window_start = 10998326
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    vcs = []
    # One SNV per reference base; the window end is derived from the
    # sequence length instead of being hard-coded.
    for s in range(window_start, window_start + len(refs)):
        m = MutationDataFactory.default_create()
        m.start = str(s)
        m.end = str(s)
        m.chr = "21"
        m.ref_allele = refs[s - window_start]
        m.alt_allele = alts[s - window_start]

        m = gafDatasource.annotate_mutation(m)

        vc = m['variant_classification']
        self.assertTrue(vc != 'Silent', 'Silent mutation found when it should be a splice site.')
        vcs.append(vc)
        # Parenthesised form is valid as both a Python 2 statement and a
        # Python 3 function call.
        print(vc + "  " + m.start)

    self.assertTrue(all(tmp == "Splice_Site" for tmp in vcs[8:12]),
                    "Not all vcs within 2 bases were splice site: " + str(vcs[8:12]))
    self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[0:8]),
                    "No splice sites should be seen: " + str(vcs[0:8]))
    # NOTE(review): the window holds 21 positions (indices 0-20) but this
    # slice stops at index 19, so the last position is never checked.
    # Left unchanged; confirm whether vcs[12:] was intended.
    self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[12:20]),
                    "No splice sites should be seen: " + str(vcs[12:20]))
def test_not_updating_annotation_source(self):
    """Test that do not have to update annotation source if columns are collapsed"""
    source_name = "TEST"
    mut = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    mut.createAnnotation('ALT_F2R1', "|36", annotationSource=source_name)

    collapser = ColumnCollapser()
    collapser.update_mutation(mut)

    # The datasource recorded on the annotation must be unchanged.
    collapsed_annotation = mut.getAnnotation("ALT_F2R1")
    self.assertEqual(collapsed_annotation.getDatasource(), source_name)
def testFlank(self):
    """Test that we can see a Flank mutation.

    Annotates one SNV per base of ``refs`` starting at chr1:11042200 and
    prints each variant classification.

    NOTE(review): this test currently makes no assertions, so it only
    verifies that annotation does not raise.  The expected classifications
    are not visible here -- TODO assert on ``vcs`` once confirmed.
    """
    # NOTE(review): the original comment here quoted chr1:28,233,780-28,233,805
    # (junction at 28,233,793 & 94), which does not match the window below;
    # it looks copied from another test.
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
    vcs = []
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    startWindow = 11042200
    for s in range(startWindow, startWindow + len(refs)):
        m = MutationDataFactory.default_create()
        m.start = str(s)
        m.end = str(s)
        m.chr = "1"
        m.ref_allele = refs[s - startWindow]
        m.alt_allele = alts[s - startWindow]

        m = gafDatasource.annotate_mutation(m)

        vc = m['variant_classification']
        vcs.append(vc)

        # Parenthesised form is valid as both a Python 2 statement and a
        # Python 3 function call.
        print(vc + "  " + m.start)
def test_validation_correction_valid(self):
    """ Test that the validation allele fields are determined automatically when not specified by the
    user for a valid mutation.

    One mutation with empty validation allele fields is rendered to a TCGA
    MAF file, which is then read back and checked row by row.
    """
    m = MutationDataFactory.default_create()
    m.chr = "3"
    m.start = "178948145"
    m.end = "178948145"
    m.alt_allele = "A"
    m.ref_allele = "G"
    m['validation_status'] = "Valid"
    m['Match_Norm_Validation_Allele1'] = ""
    m['Match_Norm_Validation_Allele2'] = ""
    m['Tumor_Validation_Allele1'] = ""
    m['Tumor_Validation_Allele2'] = ""
    m['Mutation_Status'] = "Somatic"

    output_filename = os.path.join("out", "test_validation_correction2.maf.tsv")

    outputRenderer = TcgaMafOutputRenderer(output_filename,
                                           configFile=os.path.join("configs", "tcgaMAF2.4_output.config"))
    outputRenderer.renderMutations(iter([m]))

    tsv_reader = GenericTsvReader(output_filename)

    rows_checked = 0
    for line_dict in tsv_reader:
        rows_checked += 1
        self.assertEqual(line_dict['Match_Norm_Validation_Allele1'], line_dict['Match_Norm_Validation_Allele2'],
                         "Matched norm alleles did not match.")
        self.assertEqual(line_dict['Tumor_Validation_Allele1'], line_dict['Reference_Allele'],
                         "Tumor validation allele 1 did not match reference for a valid validation result.")
        self.assertEqual(line_dict['Tumor_Validation_Allele2'], line_dict['Tumor_Seq_Allele2'],
                         "Tumor validation allele 2 did not match Tumor_Seq_Allele2 for a valid validation result.")
        self.assertEqual(line_dict['Match_Norm_Validation_Allele1'], line_dict['Tumor_Validation_Allele1'],
                         "Tumor allele 1 did not match normal alleles for a valid validation result.")
        self.assertEqual(line_dict['Match_Norm_Validation_Allele1'], line_dict['Reference_Allele'],
                         "Norm validation alleles did not match reference (norm, reference): (%s, %s)" % (
                             line_dict['Match_Norm_Validation_Allele1'], line_dict['Reference_Allele']))
        self.assertEqual("G", line_dict['Reference_Allele'],
                         "Reference allele should have been G, but was " + line_dict['Reference_Allele'])
        self.assertEqual("A", line_dict['Tumor_Seq_Allele2'],
                         "Alt allele should have been A, but was " + line_dict['Tumor_Seq_Allele2'])

    # Without this guard an empty output file would let the test pass
    # without executing a single check above.
    self.assertTrue(rows_checked > 0, "No rows were rendered to " + output_filename)
def testAddTag(self):
    ''' Test adding a tag to an annotation '''
    annotation_name = "fake1"
    tag = "fakeTag"

    mut = MutationDataFactory.default_create()
    mut.createAnnotation(annotation_name, "1")
    mut.addTagToAnnotation(annotation_name, tag)

    # The tag must be visible through the annotation's tag list.
    tags = mut.getAnnotation(annotation_name).getTags()
    self.assertTrue(tag in tags, "Tag was not added properly.")
def testFlank(self):
    """Test that we can see a Flank mutation.

    Annotates one SNV per base of ``refs`` starting at chr1:11042200 and
    prints each variant classification.

    NOTE(review): this test currently makes no assertions, so it only
    verifies that annotation does not raise.  The expected classifications
    are not visible here -- TODO assert on ``vcs`` once confirmed.
    """
    # NOTE(review): the original comment here quoted chr1:28,233,780-28,233,805
    # (junction at 28,233,793 & 94), which does not match the window below;
    # it looks copied from another test.
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
    vcs = []
    gafDatasource = TestUtils.createTranscriptProviderDatasource(
        self.config)
    startWindow = 11042200
    for s in range(startWindow, startWindow + len(refs)):
        m = MutationDataFactory.default_create()
        m.start = str(s)
        m.end = str(s)
        m.chr = "1"
        m.ref_allele = refs[s - startWindow]
        m.alt_allele = alts[s - startWindow]

        m = gafDatasource.annotate_mutation(m)

        vc = m['variant_classification']
        vcs.append(vc)

        # Parenthesised form is valid as both a Python 2 statement and a
        # Python 3 function call.
        print(vc + "  " + m.start)
def testSilentMutationGoingToSpliceSite(self):
    """Test that a silent mutation within 10 bp of a splice junction should become a splice site"""
    # chr1:28,233,780-28,233,805 Junction is at chr1:28,233,793 & 94
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCGCTGAAAAGAAAA"
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    numSpliceSites = 0
    numSilent = 0
    startWindow = 28233780
    spliceJunction = 28233793
    # One SNP per base of the window; the window length is derived from refs rather than
    # hard-coding the end coordinate (28233806).
    for s in range(startWindow, startWindow + len(refs)):
        m = MutationDataFactory.default_create()
        m.start = str(s)
        m.end = str(s)
        m.chr = "1"
        m.ref_allele = refs[s - startWindow]
        m.alt_allele = alts[s - startWindow]
        m = gafDatasource.annotate_mutation(m)
        distanceFromSpliceSite = abs(spliceJunction - int(m.start))
        vc = m['variant_classification']
        if vc.lower() == "splice_site":
            numSpliceSites += 1
        if vc.lower() == "silent":
            numSilent += 1
        # Single-argument call form prints the same text under Python 2 and Python 3.
        print(vc + " " + m.start + " " + str(distanceFromSpliceSite))
    self.assertTrue(numSpliceSites == 4, "Should have seen 4 splice site mutations, but saw: " + str(numSpliceSites))
    self.assertTrue(numSilent == 11, "Should have seen 11 Silent mutations, but saw: " + str(numSilent))
def test_range_fetch(self):
    """Annotate a chr1 interval with the bigwig datasource and check the resulting score."""
    mut = MutationDataFactory.default_create()
    for name, value in (('chr', '1'), ('start', 78978), ('end', 79000)):
        mut.createAnnotation(name, value)
    self.bigwig_datasource.annotate_mutation(mut)
    score = mut.get('TestBigWig_score')
    self.assertEqual(score, 0.75)
def testPickleable(self):
    """Test that a near-empty MutationData can be pickled"""
    m = MutationDataFactory.default_create()
    m.chr = "2"
    m.createAnnotation("fake1", "1")
    m.addTagToAnnotation("fake1", "fakeTag")
    import cPickle
    # The context manager closes (and therefore flushes) the file even if dump() raises;
    # binary mode is the mode that is safe for every pickle protocol.
    with open("out/testMDPickle.pkl", 'wb') as pickle_fp:
        cPickle.dump(m, pickle_fp)
def testIter(self):
    """Iterating a mutation yields only the annotations that were created or MutationData attributes."""
    mut = MutationDataFactory.default_create()
    mut.createAnnotation("fake1", "1")
    mut.createAnnotation("fake2", "blah blah")
    for key in mut:
        is_created = key in ["fake1", "fake2"]
        is_attribute = key in MutationData.attributes
        self.assertTrue(is_created or is_attribute, "Key not present: " + key)
def test_mutation_combiner_ordering(self):
    """Test that combined annotation values keep the order of the mutations they came from"""
    first = MutationDataFactory.default_create(chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    first.createAnnotation("SomeDepth", "2")
    first.createAnnotation("AnotherDepth", "1")

    second = MutationDataFactory.default_create(chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    second.createAnnotation("SomeDepth", "1")
    second.createAnnotation("AnotherDepth", "2")

    factory = MutationDataFactory()
    combined = OnpQueue._combine_mutations([first, second], factory)

    # The per-mutation values are expected joined with "|" in input order.
    expected = MutationDataFactory.default_create(chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation("SomeDepth", "2|1")
    expected.createAnnotation("AnotherDepth", "1|2")

    self.assertTrue(combined.attributesEqual(expected))
    self.assertEqual(combined, expected)
def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """Round-trip insertions through the two MutUtils preceding-bases helpers.

    For each alt allele, the preceding bases are extracted with
    MutUtils.retrievePrecedingBasesForInsertions, stored on the mutation as an annotation,
    and then MutUtils.retrievePrecedingBaseFromAnnotationForInsertions must give back the
    original start, ref allele and alt allele.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    # One-base and two-base insertions after the same reference bases.
    for alt_allele in ("GTCT", "GTCTT"):
        mut = MutationDataFactory.default_create(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)

        # Rewrite the mutation with "-" as ref allele and keep the removed bases as an annotation.
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        updated_ref_allele, updated_alt_allele, updated_start = \
            MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)
        self.assertTrue(updated_start == start, "Mut start should be %s but was %s." % (start, updated_start))
        self.assertTrue(updated_ref_allele == ref_allele,
                        "Ref allele should be %s but was %s." % (ref_allele, updated_ref_allele))
        self.assertTrue(updated_alt_allele == alt_allele,
                        "Alt allele should be %s but was %s." % (alt_allele, updated_alt_allele))
def test_validation_correction(self):
    """ Check that blank validation allele fields are filled in automatically when a mutation
    with validation status "Invalid" is rendered to a TCGA MAF. """
    mut = MutationDataFactory.default_create()
    mut.chr = "3"
    mut.start = "178948145"
    mut.end = "178948145"
    mut.alt_allele = "A"
    mut.ref_allele = "G"
    mut['validation_status'] = "Invalid"
    # Every validation allele is left blank so that the renderer has to derive it.
    for blank_field in ('Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2',
                        'Tumor_Validation_Allele1', 'Tumor_Validation_Allele2'):
        mut[blank_field] = ""
    mut['Mutation_Status'] = "Somatic"

    maf_path = os.path.join("out", "test_validation_correction1.maf.tsv")
    config_path = os.path.join("configs", "tcgaMAF2.4_output.config")
    renderer = TcgaMafOutputRenderer(maf_path, configFile=config_path)
    renderer.renderMutations(iter([mut]))

    # Read the rendered file back and check every row that was written.
    for row in GenericTsvReader(maf_path):
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Match_Norm_Validation_Allele2'],
            "Matched norm alleles did not match.")
        self.assertTrue(
            row['Tumor_Validation_Allele1'] == row['Tumor_Validation_Allele2'],
            "Tumor alleles did not match for an invalid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Tumor_Validation_Allele2'],
            "Tumor alleles did not match normal alleles for an invalid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Reference_Allele'],
            "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
            % (row['Match_Norm_Validation_Allele1'], row['Reference_Allele']))
        self.assertTrue(
            "G" == row['Reference_Allele'],
            "Reference allele should have been G, but was " + row['Reference_Allele'])
        self.assertTrue(
            "None" == row['Mutation_Status'],
            "Mutation Status must be None when Validation Status is Invalid: " + row['Mutation_Status'])
def testMissingAnnotations(self):
    ''' Tests that annotating a mutation that has a "gene" annotation but no "protein_change"
    annotation raises a MissingAnnotationException mentioning "protein_change". '''
    natvar_path = "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv"
    datasource = GenericGeneProteinPositionDatasource(natvar_path, title="SmallNatVar", version="test")
    mut = MutationDataFactory.default_create()
    # Only "gene" is created; "protein_change" (e.g. "p.S376C") is deliberately left out.
    mut.createAnnotation("gene", "TP53")
    with self.assertRaisesRegexp(MissingAnnotationException, "protein_change"):
        datasource.annotate_mutation(mut)
def testBasicGeneTSVInit(self):
    """ Make sure that we can initialize a simple tsv data source and annotate a gene with it """
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    # Identity check instead of the removed "<>" operator.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationDataFactory.default_create()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
                    "Test gene TSV datasource did not annotate properly.")
def testSetValues(self):
    """Read annotations through the dictionary interface, overwrite one, and read it back."""
    mut = MutationDataFactory.default_create()
    mut.createAnnotation("fake1", "1")
    mut.createAnnotation("fake2", "blah blah")

    retrieval_msg = "Could not properly retrieve annotation using the dictionary interface. "
    self.assertTrue(mut["fake1"] == "1", retrieval_msg + str(mut["fake1"]))
    self.assertTrue(mut["fake2"] == "blah blah", retrieval_msg + str(mut["fake2"]))

    mut["fake2"] = "Whoa"
    self.assertTrue(mut["fake2"] == "Whoa",
                    "Could not properly retrieve annotation using the dictionary interface, after a value change.")
    print(str(mut))
def test_annotation_copy(self):
    """Test that collapsing can keep the pre-collapse value in a backup annotation when a suffix is given."""
    mut = MutationDataFactory.default_create(chr="1", start="10000", end="10000")
    mut.createAnnotation('ALT_F2R1', "|36", annotationSource="TEST")

    collapser = ColumnCollapser()
    collapser.update_mutation(mut, new_annotation_source="foo", copy_old_suffix="_full")

    # The "_full" copy holds the old value and source; the original name holds the collapsed value.
    self.assertEqual(mut["ALT_F2R1_full"], "|36")
    self.assertEqual(mut["ALT_F2R1"], "36")
    self.assertEqual(mut.getAnnotation("ALT_F2R1_full").getDatasource(), "TEST")
    collapsed_source = mut.getAnnotation("ALT_F2R1").getDatasource()
    backup_source = mut.getAnnotation("ALT_F2R1_full").getDatasource()
    self.assertTrue(collapsed_source != backup_source)
def test_cached_annots_dummy_cache(self):
    """Test a CacheManager initialized with None: a store followed by a retrieve should give None."""
    cache_manager = CacheManager()
    cache_manager.initialize(None, "blah", is_read_only=False)

    mut = MutationDataFactory.default_create()
    mut.createAnnotation("blah1", "val1", annotationSource="INPUT")
    mut.createAnnotation("blah2", "val5", annotationSource="some_datasource")

    cache_manager.store_annotations_in_cache(mut)
    cached = cache_manager.retrieve_cached_annotations(mut)
    self.assertTrue(cached is None)
def test_not_5_prime_flank_annotation_positive_strand(self):
    """The chr3:180625088 C>A SNP should be classified as IGR by the transcript datasource."""
    mut = MutationDataFactory.default_create(chr="3", start="180625088", end="180625088",
                                             ref_allele="C", alt_allele="A", build="hg19")
    datasource = TestUtils.createTranscriptProviderDatasource(self.config)
    annotated = datasource.annotate_mutation(mut)
    self.assertEqual(annotated['variant_classification'], "IGR")
def test_3_prime_flank_annotation_negative_strand(self):
    """The chr5:1253255 A>T SNP should be classified as 3'Flank by the transcript datasource."""
    mut = MutationDataFactory.default_create(chr="5", start="1253255", end="1253255",
                                             ref_allele="A", alt_allele="T", build="hg19")
    datasource = TestUtils.createTranscriptProviderDatasource(self.config)
    annotated = datasource.annotate_mutation(mut)
    self.assertEqual(annotated['variant_classification'], "3'Flank")
def testDatasourceCreator(self):
    """ Test that a datasource built from the small_uniprot_prot_seq_ds test config annotates
    UniProt_aapos for a given transcript_id and protein_change.

    NOTE: This test needs to be updated to use sqlite instead of filesystem file. """
    ds_dir = "testdata/small_uniprot_prot_seq_ds/"
    datasource = DatasourceFactory.createDatasource(ds_dir + "small_uniprot_prot_seq_ds.config", ds_dir)
    output_name = "UniProt_aapos"

    mut = MutationDataFactory.default_create()
    mut.createAnnotation('transcript_id', 'uc009vvt.1')
    mut.createAnnotation('protein_change', 'p.T1105A')
    mut = datasource.annotate_mutation(mut)

    self.assertTrue(mut[output_name] == "969", "Did not get proper value (969): " + mut[output_name])
def testHeaderCreation(self):
    """Test that a tcga vcf header can be generated, even from a blank mutation. """
    vcfOR = TcgaVcfOutputRenderer("out/TCGAVCFHeader.out.txt")
    m = MutationDataFactory.default_create()
    m.createAnnotation('center', "broad.mit.edu")
    hdr = vcfOR.createVcfHeader(m)
    self.assertTrue(hdr is not None)
    # "!=" replaces the "<>" operator, which Python 3 no longer accepts.
    self.assertTrue(hdr != "")
    # The center annotation set above must appear somewhere in the header text.
    self.assertTrue(hdr.find("broad.mit.edu") != -1, "Could not find string that should have been in header.")
def testAnnotationSourceIsPopulated(self):
    ''' Tests that the annotation source is neither "Unknown" nor blank for the example tsv datasource. '''
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    # Identity check / "!=" replace the "<>" operator, which Python 3 no longer accepts.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationDataFactory.default_create()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
                    "Test gene TSV datasource did not annotate properly.")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource() != "Unknown",
                    "Annotation source was unknown")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource().strip() != "",
                    "Annotation source was blank")
def test_no_data_fetch(self):
    """Test for a value that is not found in the bigwig.

    Per the original note, the test bigwig only has data for chr1, so a chr13 interval is
    expected to produce None.
    """
    mut = MutationDataFactory.default_create()
    for name, value in (('chr', '13'), ('start', 78978), ('end', 79000)):
        mut.createAnnotation(name, value)
    self.bigwig_datasource.annotate_mutation(mut)
    score = mut.get('TestBigWig_score')
    self.assertEqual(score, None)
def annotate_genes_given_txs(self, txs):
    """
    Given a list of Transcripts, create and annotate dummy mutations that represent only the gene.

    One dummy mutation is created per gene.  It carries gene-level annotations taken from the
    gene's transcripts, plus the collapsed ("|"-joined, sorted, blanks removed) values of the
    annotations produced by self.annotate_transcript for each of those transcripts.  All dummy
    mutations are finally handed to self._annotate_genes.

    :param txs: list of Transcripts
    :type txs: list
    :return: dictionary mapping each gene (as returned by tx.get_gene()) to its dummy mutation
    :rtype: dict
    """
    # Group the transcripts by gene.
    gene_to_tx_dict = defaultdict(list)
    for tx in txs:
        gene_to_tx_dict[tx.get_gene()].append(tx)

    # Only consider annotations that are not INPUT and the datasource is known.
    invalid_annotation_sources = ["INPUT", "OUTPUT", "Unknown"]

    muts_dict = {}
    for gene in sorted(gene_to_tx_dict):
        gene_txs = gene_to_tx_dict[gene]
        # Strand, gene type and contig are taken from the first transcript of the gene.
        first_tx = gene_txs[0]

        m = MutationDataFactory.default_create()
        m.createAnnotation("gene", gene)
        m.createAnnotation("transcripts", ",".join(sorted([tx.get_transcript_id() for tx in gene_txs])))
        m.createAnnotation("strand", first_tx.get_strand())
        m.createAnnotation("class", first_tx.get_gene_type())
        # The dummy protein change ends at the length of the longest protein sequence of the gene.
        endAA = str(max([len(tx.get_protein_seq()) for tx in gene_txs]))
        m.createAnnotation("protein_change", "p.DUMMY1_" + endAA)
        m.createAnnotation("chr", first_tx.get_contig())

        # Annotate each transcript and collapse the relevant transcript annotations for each gene.
        tx_muts_uncollapsed = [self.annotate_transcript(tx) for tx in gene_txs]
        annotation_vals_collapsed = defaultdict(set)
        for tx_mut in tx_muts_uncollapsed:
            # For every annotation on the dummy transcript (tx_mut), collect the set of values
            # seen across the transcripts of the gene.
            for annotation_name in tx_mut.keys():
                if tx_mut.getAnnotation(annotation_name).getDatasource() not in invalid_annotation_sources:
                    annotation_vals_collapsed[annotation_name].add(tx_mut[annotation_name])

        # Create a new annotation that encompasses the transcript data for the gene.
        for new_annotation, collapsed_vals in annotation_vals_collapsed.items():
            # Remove blank values from the set
            str_val = "|".join(sorted(collapsed_vals - set([""])))
            m.createAnnotation(new_annotation, str_val, annotationSource="OUTPUT")
        muts_dict[gene] = m

    self._annotate_genes(muts_dict.values())
    return muts_dict