def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """Round-trip two insertions through the preceding-bases annotation.

    For each alt allele the mutation is first reduced with
    MutUtils.retrievePrecedingBasesForInsertions and the reduced values are
    written back onto the mutation together with the preceding-bases
    annotation.  MutUtils.retrievePrecedingBaseFromAnnotationForInsertions
    is then expected to return the original start, ref allele and alt allele.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    # The same scenario is run for a one-base and a two-base insertion.
    for alt_allele in ("GTCT", "GTCTT"):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)

        reduced = MutUtils.retrievePrecedingBasesForInsertions(mut)
        preceding_bases, reduced_alt, reduced_start, reduced_end = reduced
        mut.ref_allele = "-"
        mut.alt_allele = reduced_alt
        mut.start = reduced_start
        mut.end = reduced_end
        mut.createAnnotation(
            annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
            annotationValue=preceding_bases)

        restored = MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)
        restored_ref, restored_alt, restored_start = restored

        self.assertTrue(
            restored_start == start,
            "Mut start should be %s but was %s." % (start, restored_start))
        self.assertTrue(
            restored_ref == ref_allele,
            "Ref allele should be %s but was %s." % (ref_allele, restored_ref))
        self.assertTrue(
            restored_alt == alt_allele,
            "Alt allele should be %s but was %s." % (alt_allele, restored_alt))
def _simple_annotate(self, is_skip_no_alts):
    """Annotate two identical SNVs with no datasources and count the output.

    Both mutations share the same coordinates and alleles; only the first
    one carries an ``alt_allele_seen`` annotation (set to "False").

    :param is_skip_no_alts: passed straight through to
        RunSpecification.initialize
    :returns: number of mutations produced by Annotator.annotate_mutations
    """
    run_spec = RunSpecification()
    run_spec.initialize(None, None, datasources=[],
                        is_skip_no_alts=is_skip_no_alts)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(run_spec)

    def make_snv():
        # Build the T>G SNV that both inputs have in common.
        snv = MutationData()
        snv.chr = "1"
        snv.start = "12941796"
        snv.end = "12941796"
        snv.alt_allele = "G"
        snv.ref_allele = "T"
        return snv

    flagged = make_snv()
    flagged.createAnnotation("alt_allele_seen", "False")
    unflagged = make_snv()

    annotated = annotator.annotate_mutations([flagged, unflagged])
    # Consume the result one item at a time; only the count matters.
    return sum(1 for _ in annotated)
def testAddTag(self):
    """Test adding a tag to an annotation."""
    annotation_name = "fake1"
    tag = "fakeTag"

    mutation = MutationData()
    mutation.createAnnotation(annotation_name, "1")
    mutation.addTagToAnnotation(annotation_name, tag)

    tags = mutation.getAnnotation(annotation_name).getTags()
    self.assertTrue(tag in tags, "Tag was not added properly.")
def _combine_mutations(mutations):
    """
    Merge multiple adjacent mutations into a single new mutation.

    The combined mutation takes its chromosome from the first input, spans
    from the smallest start to the largest end, and concatenates the ref and
    alt alleles in input order.  Distinct builds, annotation values and tags
    are each de-duplicated, sorted and joined with "|".  Datasource, data
    type, number and description of every annotation are copied from the
    first input mutation that has that annotation.

    :param mutations: an ordered list of MutationData
    :returns: None for an empty list, the single element itself (not a copy)
        for a one-element list, otherwise a new MutationData

    :warning: _combine_mutations does not make any attempt to sanity check input mutations
        it will happily combine overlapping and non-adjacent mutations on disparate chromosomes
    """
    if not mutations:
        return None
    if len(mutations) == 1:
        return mutations[0]

    # special logic for the attributes
    start = min(mut.start for mut in mutations)
    end = max(mut.end for mut in mutations)
    chrom = mutations[0].chr
    ref = "".join(mut.ref_allele for mut in mutations)
    alt = "".join(mut.alt_allele for mut in mutations)
    # Sorted so the joined build string does not depend on set iteration
    # order, matching how values and tags are handled below.
    build = "|".join(sorted(set(mut.build for mut in mutations)))

    # create the new mutation
    newmut = MutationData(chr=chrom, start=start, end=end, ref_allele=ref,
                          alt_allele=alt, build=build)

    # add annotations to the mutation; the names returned by
    # getAttributeNames() were already handled through the constructor above
    allAnnotations = set(flatmap(lambda x: x.keys(), mutations))
    annotationNames = allAnnotations - set(mutations[0].getAttributeNames())
    for annotName in annotationNames:
        annotations = []
        for mut in mutations:
            # Not every mutation has every annotation; a missing one is
            # deliberately skipped rather than treated as an error.
            try:
                annotations.append(mut.getAnnotation(annotName))
            except KeyError:
                pass

        # Falsy values (e.g. empty strings) are left out of the joined value.
        values = sorted(set(x.getValue() for x in annotations if x.getValue()))
        value = "|".join(values)
        tags = sorted(set(flatmap(lambda x: x.getTags(), annotations)))

        first = annotations[0]
        newmut.createAnnotation(annotationName=annotName,
                                annotationValue=value,
                                annotationSource=first.getDatasource(),
                                annotationDataType=first.getDataType(),
                                annotationDescription=first.getDescription(),
                                tags=tags,
                                number=first.getNumber())
    return newmut
def generateTranscriptMuts(gafDS,uniprotDS):
    """Yield one annotated mutation per entry of gafDS's transcript dict.

    Each mutation starts out with only a 'gene' annotation (taken from the
    transcript's entry) and a 'transcript_id' annotation, and is yielded as
    returned by uniprotDS.annotate_mutation.
    """
    transcripts = gafDS.getTranscriptDict()
    for transcript_id in transcripts.keys():
        seed = MutationData()
        seed.createAnnotation('gene', transcripts[transcript_id]['gene'])
        seed.createAnnotation('transcript_id', transcript_id)
        yield uniprotDS.annotate_mutation(seed)
def test_copy(self):
    """Test annotation copy."""
    mutation = MutationData()
    mutation.createAnnotation(
        "foo", "3", "blah_source",
        annotationDescription="testing",
        tags=["superblah"],
        number="A")

    # Note that getAnnotation returns an instance of Annotation, not simply the value
    original = mutation.getAnnotation("foo")
    mutation.createCopyAnnotation(original, "bar")

    self.assertEqual(mutation.getAnnotation("foo"), mutation.getAnnotation("bar"))
def testPickleable(self):
    """Test that a near-empty MutationData can be pickled."""
    m = MutationData()
    m.chr = "2"
    m.createAnnotation("fake1", "1")
    m.addTagToAnnotation("fake1", "fakeTag")

    # Prefer the C implementation where it exists and fall back to the
    # plain module on interpreters that do not ship cPickle.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    # Pickle streams should be written in binary mode, and the context
    # manager makes sure the handle is flushed and closed even if dump()
    # raises.
    with open("out/testMDPickle.pkl", 'wb') as fp:
        pickle.dump(m, fp)
def generateTranscriptMuts(gafDS, uniprotDS):
    """Yield one annotated mutation per entry of gafDS's transcript dict.

    Each mutation starts out with only a 'gene' annotation (taken from the
    transcript's entry) and a 'transcript_id' annotation, and is yielded as
    returned by uniprotDS.annotate_mutation.
    """
    transcripts = gafDS.getTranscriptDict()
    for transcript_id in transcripts.keys():
        seed = MutationData()
        seed.createAnnotation('gene', transcripts[transcript_id]['gene'])
        seed.createAnnotation('transcript_id', transcript_id)
        yield uniprotDS.annotate_mutation(seed)
def testIter(self):
    """Check every key yielded by iterating a MutationData.

    Each key must be one of the two annotations created here or be listed
    in MutationData.attributes.
    """
    created = ["fake1", "fake2"]
    mutation = MutationData()
    mutation.createAnnotation("fake1", "1")
    mutation.createAnnotation("fake2", "blah blah")

    for key in mutation:
        recognised = key in created or key in MutationData.attributes
        self.assertTrue(recognised, "Key not present: " + key)
def testHeaderCreation(self):
    """Test that a tcga vcf header can be generated, even from a blank mutation."""
    vcfOR = TcgaVcfOutputRenderer("out/TCGAVCFHeader.out.txt")
    m = MutationData()
    m.createAnnotation('center', "broad.mit.edu")

    hdr = vcfOR.createVcfHeader(m)

    # "<>" was replaced: it is gone in Python 3, while these forms work on
    # both 2.7 and 3.
    self.assertIsNotNone(hdr)
    self.assertNotEqual(hdr, "")
    self.assertTrue(hdr.find("broad.mit.edu") != -1,
                    "Could not find string that should have been in header.")
def testBasicGeneTSVInit(self):
    """Make sure that we can initialize a simple tsv data source."""
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    # Identity check instead of "<> None": the old operator is gone in
    # Python 3 and an inequality test could be overridden by the datasource.
    self.assertIsNotNone(geneDS, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)

    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
                    "Test gene TSV datasource did not annotate properly.")
def testMissingAnnotations(self):
    """Tests that if the required annotations ("gene", "protein_change", and
    "other_transcripts") are missing, an exception is thrown.

    Only "gene" is supplied here; the expected MissingAnnotationException
    must mention "protein_change".
    """
    natvar_ds = GenericGeneProteinPositionDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv",
        title="SmallNatVar",
        version="test")

    mutation = MutationData()
    mutation.createAnnotation("gene", "TP53")
    # "protein_change" (e.g. "p.S376C") is deliberately not created.

    with self.assertRaisesRegexp(MissingAnnotationException, "protein_change"):
        natvar_ds.annotate_mutation(mutation)
def testMissingAnnotations(self):
    """Tests that if the required annotations ("gene", "protein_change", and
    "other_transcripts") are missing, an exception is thrown.

    Only "gene" is supplied here; the expected MissingAnnotationException
    must mention "protein_change".
    """
    natvar_ds = GenericGeneProteinPositionDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv",
        title="SmallNatVar",
        version="test")

    mutation = MutationData()
    mutation.createAnnotation("gene", "TP53")
    # "protein_change" (e.g. "p.S376C") is deliberately not created.

    with self.assertRaisesRegexp(MissingAnnotationException, "protein_change"):
        natvar_ds.annotate_mutation(mutation)
def testSetValues(self):
    """Read annotations through the dictionary interface, then overwrite one
    value through the same interface and read it back.
    """
    lookup_failure = "Could not properly retrieve annotation using the dictionary interface. "
    initial = [("fake1", "1"), ("fake2", "blah blah")]

    mutation = MutationData()
    for name, value in initial:
        mutation.createAnnotation(name, value)

    for name, value in initial:
        self.assertTrue(mutation[name] == value, lookup_failure + str(mutation[name]))

    mutation["fake2"] = "Whoa"
    self.assertTrue(
        mutation["fake2"] == "Whoa",
        "Could not properly retrieve annotation using the dictionary interface, after a value change.")
    print(str(mutation))
def testAnnotationSourceIsPopulated(self):
    """Tests that the annotation source is not blank for the example tsv datasource."""
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    # All "<>" comparisons were replaced; that operator is gone in Python 3.
    self.assertIsNotNone(geneDS, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
                    "Test gene TSV datasource did not annotate properly.")

    source = m.getAnnotation('CGC_Abridged_Name').getDatasource()
    self.assertTrue(source != "Unknown", "Annotation source was unknown")
    self.assertTrue(source.strip() != "", "Annotation source was blank")
def testRetrievePrecedingBasesForInsertions(self):
    """Check the values produced by retrievePrecedingBasesForInsertions.

    Two insertions after the reference bases "GTC" are reduced with
    MutUtils.retrievePrecedingBasesForInsertions and the results are written
    back onto the mutation.  In both cases the test expects start 1234569,
    end 1234570, ref allele "-" and an alt allele holding only the inserted
    bases.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    # These are the values the assertions have always compared against; the
    # old failure messages quoted different numbers, so the messages are now
    # built from these constants.
    expected_start = 1234569
    expected_end = 1234570

    for alt_allele, expected_alt in (("GTCT", "T"), ("GTCTT", "TT")):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(
            annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
            annotationValue=preceding_bases)

        self.assertTrue("_preceding_bases" in mut,
                        "_preceding_bases is missing in the mutation data.")
        self.assertTrue(
            mut.start == expected_start,
            "Mut start should be %s but was %s." % (expected_start, mut.start))
        self.assertTrue(
            mut.end == expected_end,
            "Mut end should be %s but was %s." % (expected_end, mut.end))
        self.assertTrue(mut.ref_allele == "-",
                        "Ref allele should be - but was %s." % mut.ref_allele)
        self.assertTrue(
            mut.alt_allele == expected_alt,
            "Alt allele should be %s but was %s." % (expected_alt, mut.alt_allele))
def test_cached_annots_dummy_cache(self):
    """Test dummy cache. Also, tests a simple store and retrieve, which should be None."""
    fake_db_dir_key = "blah"
    cache_manager = CacheManager()
    # No cache URL is given (None).
    cache_manager.initialize(None, fake_db_dir_key, is_read_only=False)

    mutation = MutationData()
    mutation.createAnnotation("blah1", "val1", annotationSource="INPUT")
    mutation.createAnnotation("blah2", "val5", annotationSource="some_datasource")

    cache_manager.store_annotations_in_cache(mutation)
    cached = cache_manager.retrieve_cached_annotations(mutation)
    self.assertTrue(cached is None)
def testDatasourceCreator(self):
    """Test that the datasource creator process will work for
    TranscriptToUniProtProteinPositionTransformingDatasource.

    NOTE: This test needs to be updated to use sqlite instead of filesystem file.
    """
    ds_dir = "testdata/small_uniprot_prot_seq_ds/"
    uniprot_ds = DatasourceFactory.createDatasource(
        "testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config",
        ds_dir)
    output_annotation = "UniProt_aapos"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc009vvt.1')
    mutation.createAnnotation('protein_change', 'p.T1105A')
    annotated = uniprot_ds.annotate_mutation(mutation)

    self.assertTrue(
        annotated[output_annotation] == "969",
        "Did not get proper value (969): " + annotated[output_annotation])
def testRangeAnnotation(self):
    """Test a simple case with range."""
    natvar_ds = GenericGeneProteinPositionDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv",
        title="UniProt_NatVar",
        version="2011_09")
    annotationName = "UniProt_NatVar_natural_variations"

    m = MutationData()
    m.createAnnotation("gene", "TP53")
    m.createAnnotation("protein_change", "p.SLEELEE370_376del")  # This is not valid, but does the test.

    # The return value is not used: the assertion below reads from ``m``, so
    # this test relies on the annotation landing on the object passed in.
    natvar_ds.annotate_mutation(m)

    expected = sorted("K -> Q (in a sporadic cancer; somatic mutation).|S -> T (in a sporadic cancer; somatic mutation).|S -> A (in a sporadic cancer; somatic mutation).".split("|"))
    observed = sorted(m[annotationName].split("|"))
    self.assertTrue(observed == expected,
                    "Incorrect annotation value seen: " + m[annotationName])
def testBasicAnnotation(self):
    """Test an extremely simple case."""
    natvar_ds = GenericGeneProteinPositionDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv",
        title="UniProt_NatVar",
        version="2011_09")
    annotationName = "UniProt_NatVar_natural_variations"

    m = MutationData()
    m.createAnnotation("gene", "TP53")
    m.createAnnotation("protein_change", "p.S376C")

    # The return value is not used: the assertion below reads from ``m``, so
    # this test relies on the annotation landing on the object passed in.
    natvar_ds.annotate_mutation(m)

    expected = sorted("S -> T (in a sporadic cancer; somatic mutation).|S -> A (in a sporadic cancer; somatic mutation).".split("|"))
    observed = sorted(m[annotationName].split("|"))
    self.assertTrue(observed == expected,
                    "Incorrect annotation value seen: " + m[annotationName])
def testBasicAnnotationWithChange(self):
    """Test whether we can translate from one coordinate system to another.
    This tests a known change.
    """
    uniprot_ds = TranscriptToUniProtProteinPositionTransformingDatasource(
        title="UniProt",
        version="test",
        src_file="file://testdata/small_uniprot_prot_seq_ds/db")

    # Must correspond to what the datasource is going to generate.
    output_annotation = "UniProt_aapos"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc009vvt.1')
    mutation.createAnnotation('protein_change', 'p.T1105A')
    annotated = uniprot_ds.annotate_mutation(mutation)

    self.assertTrue(
        annotated[output_annotation] == "969",
        "Did not get proper value (969): " + annotated[output_annotation])
def testBasicAnnotation(self):
    """Annotate one chr18 G>A SNV from the small COSMIC tsv and require a
    truthy '_cosmic_muts_disease_counts' value on the result.
    """
    cosmic_ds = GenericGenomicMutationDatasource('testdata/small_cosmic_2/cosmic_v65_chr18.tsv')

    snv = MutationData()
    snv.chr = '18'
    snv.start = '48604683'
    snv.end = '48604683'
    snv.ref_allele = 'G'
    snv.alt_allele = 'A'
    snv.createAnnotation('strand', '+')

    annotated = cosmic_ds.annotate_mutation(snv)
    disease_counts = annotated['_cosmic_muts_disease_counts']
    self.assertTrue(disease_counts, 'Unable to annotate mutation correctly')
def createMutations(self):
    """No inputs. Returns a generator of mutations built from the specified maflite file.

    Every column of every row becomes an 'INPUT' annotation.  Columns found
    in the reverse alias dictionary are stored under the name they map to;
    all other columns keep their own name.  Values stored under "chr" are
    passed through MutUtils.convertChromosomeStringToMutationDataFormat.
    """
    aliasKeys = self._reverseAlternativeDict.keys()
    allColumns = self._tsvReader.getFieldNames()

    for line in self._tsvReader:
        # We only need to assign fields that are mutation attributes and have a different name in the maflite file.
        mut = MutationData(build=self._build)

        for col in allColumns:
            # Three scenarios:
            #   1) col is name of mutation data field -- simple createAnnotation
            #   2) col name is an alias for a mutation data field -- do lookup then createAnnotation
            #   3) col name is not an alias for a mutation data field -- simple createAnnotation
            if col in aliasKeys:
                realKey = self._reverseAlternativeDict[col]
                self.logger.debug(realKey + " found from " + col)
            else:
                # Scenario 1 and 3
                realKey = col

            # Make sure to convert chromosome values.
            val = line[col]
            if realKey == "chr":
                val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
            mut.createAnnotation(realKey, val, 'INPUT')

        # remove any surrounding whitespace if present
        mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

        # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
        # Compared by value: the previous `is ""` / `is not ""` tested object
        # identity, which only matches when the interpreter happens to reuse
        # the same empty-string object.
        if mut.start != "" and mut.end == "":
            mut.end = mut.start
        if mut.end != "" and mut.start == "":
            mut.start = mut.end

        yield mut
def test_cached_annots(self):
    """Test to make sure that we are not storing annotations that should not be cached.
    Also, tests a simple store and retrieve.
    """
    cache_file = "out/shove.managertest.annots.cache"
    fake_db_dir_key = "blah"

    cache_manager = CacheManager()
    cache_manager.initialize("file://" + cache_file, fake_db_dir_key, is_read_only=False)

    mutation = MutationData()
    mutation.createAnnotation("blah1", "val1", annotationSource="INPUT")
    mutation.createAnnotation("blah2", "val5", annotationSource="some_datasource")

    cache_manager.store_annotations_in_cache(mutation)
    cached = cache_manager.retrieve_cached_annotations(mutation)

    # Exactly one annotation is expected back: the one not sourced from INPUT.
    self.assertTrue(len(cached.keys()) == 1)
    self.assertTrue(cached["blah2"].getValue() == "val5")
def test_mutation_combiner(self):
    """Test that attributes and annotations are set properly with combine mutations"""
    first = MutationData(chr=1, start=100, end=100, ref_allele="G", alt_allele="A")
    first.createAnnotation("SomeValue", "value1", "INPUT", "STRING", "a value")

    second = MutationData(chr=1, start=101, end=101, ref_allele="C", alt_allele="T")
    second.createAnnotation("SomeValue", "value2", tags=["IT"])
    second.createAnnotation("AnotherValue", "5")

    combined = OnpQueue._combine_mutations([first, second])

    # Hand-built mutation that the combination is compared against.
    expected = MutationData(chr=1, start=100, end=101, ref_allele="GC", alt_allele="AT")
    expected.createAnnotation("SomeValue", "value1|value2", "INPUT", "STRING", "a value", tags=["IT"])
    expected.createAnnotation("AnotherValue", "5")

    self.assertTrue(combined.attributesEqual(expected))
    self.assertEqual(combined, expected)
def testDatasourceCreator(self):
    """Test that the datasource creator process will work for v1 of
    TranscriptToUniProtProteinPositionTransformingDatasource.

    NOTE: This test needs to be updated to use sqlite instead of filesystem file.
    """
    ds_dir = "testdata/small_uniprot_prot_seq_ds/"
    uniprot_ds = DatasourceFactory.createDatasource(
        "testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config",
        ds_dir)
    output_annotation = "UniProt_aapos"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc009vvt.1')
    mutation.createAnnotation('protein_change', 'p.T1105A')
    annotated = uniprot_ds.annotate_mutation(mutation)

    self.assertTrue(
        annotated[output_annotation] == "969",
        "Did not get proper value (969): " + annotated[output_annotation])
def testBasicGeneTSVInit(self):
    """Make sure that we can initialize a simple tsv data source."""
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config",
        "testdata/small_tsv_ds/")
    # Identity check instead of "<> None": the old operator is gone in
    # Python 3 and an inequality test could be overridden by the datasource.
    self.assertIsNotNone(geneDS, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)

    self.assertTrue(
        m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
        "Test gene TSV datasource did not annotate properly.")
def testBasicAnnotation(self):
    """Annotate one chr18 G>A SNV from the small COSMIC tsv and require a
    truthy '_cosmic_muts_disease_counts' value on the result.
    """
    cosmic_ds = GenericGenomicMutationDatasource(
        'testdata/small_cosmic_2/cosmic_v65_chr18.tsv')

    snv = MutationData()
    snv.chr = '18'
    snv.start = '48604683'
    snv.end = '48604683'
    snv.ref_allele = 'G'
    snv.alt_allele = 'A'
    snv.createAnnotation('strand', '+')

    annotated = cosmic_ds.annotate_mutation(snv)
    disease_counts = annotated['_cosmic_muts_disease_counts']
    self.assertTrue(disease_counts, 'Unable to annotate mutation correctly')
def testSimpleAnnotation(self):
    """Create a dummy mutation and make sure it gets annotated properly."""
    failure_prefix = "Transcript-based annotation did not populate properly: "
    ds_dir = "testdata/small_transcript_tsv_ds/"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc001hms.3')

    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        ds_dir)
    annotated = transcript_ds.annotate_mutation(mutation)

    # Annotation name -> value expected for transcript uc001hms.3.
    checks = (
        ('refseq_test_mRNA_Id', 'NM_022746'),
        ('refseq_test_prot_Id', 'NP_073583'),
    )
    for name, expected in checks:
        self.assertTrue(annotated[name] == expected,
                        failure_prefix + annotated[name])
def testSimpleAnnotation(self):
    """Create a dummy mutation and make sure it gets annotated properly."""
    failure_prefix = "Transcript-based annotation did not populate properly: "
    ds_dir = "testdata/small_transcript_tsv_ds/"

    mutation = MutationData()
    mutation.createAnnotation("transcript_id", "uc001hms.3")

    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        ds_dir)
    annotated = transcript_ds.annotate_mutation(mutation)

    # Annotation name -> value expected for transcript uc001hms.3.
    checks = (
        ("refseq_test_mRNA_Id", "NM_022746"),
        ("refseq_test_prot_Id", "NP_073583"),
    )
    for name, expected in checks:
        self.assertTrue(annotated[name] == expected,
                        failure_prefix + annotated[name])
def testBasicAnnotationWithChange(self):
    """Test whether we can translate from one coordinate system to another (v1).
    This tests a known change.
    """
    uniprot_ds = TranscriptToUniProtProteinPositionTransformingDatasource(
        title="UniProt",
        version="test",
        src_file="file://testdata/small_uniprot_prot_seq_ds/db")

    # Must correspond to what the datasource is going to generate.
    output_annotation = "UniProt_aapos"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc009vvt.1')
    mutation.createAnnotation('protein_change', 'p.T1105A')
    annotated = uniprot_ds.annotate_mutation(mutation)

    self.assertTrue(
        annotated[output_annotation] == "969",
        "Did not get proper value (969): " + annotated[output_annotation])
def test_basic_annotation_no_change_2(self):
    """Test whether we can translate from one coordinate system to another (v2 ... 2014).
    This tests no change.
    """
    uniprot_ds = TranscriptToUniProtProteinPositionTransformingDatasource(
        title="UniProt",
        version="test",
        src_file="file://testdata/small_uniprot_prot_seq_ds_blastp_2014/db")

    # Must correspond to what the datasource is going to generate
    output_annotation = "UniProt_aapos"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'ENST00000264990.6')
    mutation.createAnnotation('protein_change', 'p.S50T')
    annotated = uniprot_ds.annotate_mutation(mutation)

    self.assertTrue(
        annotated[output_annotation] == "50",
        "Did not get proper value (50): " + annotated[output_annotation])
def testPopulatedButNullValuesInInitNLod(self):
    """Test that if init_n_lod is "." or "", there is no error.

    The same sequence of raw values is pushed through _extract_lod for both
    the init_n_lod and the t_lod_fstar annotation.
    """
    m = MutationData()
    vcfOR = TcgaVcfOutputRenderer("out/blank.vcf")

    # (raw annotation value, lod the test expects), applied in this order
    # after the annotation has been created with an empty string.
    later_values = (
        ('.', 50),
        ('6', 6),
        ('6.8', 6),
        ('-12.8', -12),
    )

    for annotation_name in ("init_n_lod", "t_lod_fstar"):
        m.createAnnotation(annotation_name, "")
        self.assertEqual(vcfOR._extract_lod(m, annotation_name), 50)

        for raw, expected in later_values:
            m[annotation_name] = raw
            self.assertEqual(vcfOR._extract_lod(m, annotation_name), expected)
def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """Round-trip two insertions through the preceding-bases annotation.

    For each alt allele the mutation is first reduced with
    MutUtils.retrievePrecedingBasesForInsertions and the reduced values are
    written back onto the mutation together with the preceding-bases
    annotation.  MutUtils.retrievePrecedingBaseFromAnnotationForInsertions
    is then expected to return the original start, ref allele and alt allele.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    # The same scenario is run for a one-base and a two-base insertion.
    for alt_allele in ("GTCT", "GTCTT"):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)

        reduced = MutUtils.retrievePrecedingBasesForInsertions(mut)
        preceding_bases, reduced_alt, reduced_start, reduced_end = reduced
        mut.ref_allele = "-"
        mut.alt_allele = reduced_alt
        mut.start = reduced_start
        mut.end = reduced_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        restored = MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)
        restored_ref, restored_alt, restored_start = restored

        self.assertTrue(restored_start == start,
                        "Mut start should be %s but was %s." % (start, restored_start))
        self.assertTrue(restored_ref == ref_allele,
                        "Ref allele should be %s but was %s." % (ref_allele, restored_ref))
        self.assertTrue(restored_alt == alt_allele,
                        "Alt allele should be %s but was %s." % (alt_allele, restored_alt))
def testSetValues(self):
    """Read annotations through the dictionary interface, then overwrite one
    value through the same interface and read it back.
    """
    lookup_failure = "Could not properly retrieve annotation using the dictionary interface. "
    initial = [("fake1", "1"), ("fake2", "blah blah")]

    mutation = MutationData()
    for name, value in initial:
        mutation.createAnnotation(name, value)

    for name, value in initial:
        self.assertTrue(mutation[name] == value,
                        lookup_failure + str(mutation[name]))

    mutation["fake2"] = "Whoa"
    self.assertTrue(
        mutation["fake2"] == "Whoa",
        "Could not properly retrieve annotation using the dictionary interface, after a value change.")
    print(str(mutation))
def test_basic_annotation_with_change(self):
    """Test whether we can translate from one coordinate system to another (v2 2014).
    This tests a known change.
    """
    uniprot_ds = TranscriptToUniProtProteinPositionTransformingDatasource(
        title="UniProt",
        version="test",
        src_file="file://testdata/small_uniprot_prot_seq_ds_blastp_2014/db")

    # Must correspond to what the datasource is going to generate.
    # Case under test: ENST00000545482.1_Silent_p.S178S
    output_annotation = "UniProt_aapos"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'ENST00000545482.1')
    mutation.createAnnotation('protein_change', 'p.S178S')
    annotated = uniprot_ds.annotate_mutation(mutation)

    self.assertTrue(
        annotated[output_annotation] == "293",
        "Did not get proper value (293): " + annotated[output_annotation])
def createMutations(self):
    """No inputs. Returns a generator of mutations built from the specified maflite file.

    Every column of every row becomes an 'INPUT' annotation.  Columns found
    in the reverse alias dictionary are stored under the name they map to;
    all other columns keep their own name.  Values stored under "chr" are
    passed through MutUtils.convertChromosomeStringToMutationDataFormat.
    """
    aliasKeys = self._reverseAlternativeDict.keys()
    allColumns = self._tsvReader.getFieldNames()

    for line in self._tsvReader:
        # We only need to assign fields that are mutation attributes and have a different name in the maflite file.
        mut = MutationData(build=self._build)

        for col in allColumns:
            # Three scenarios:
            #   1) col is name of mutation data field -- simple createAnnotation
            #   2) col name is an alias for a mutation data field -- do lookup then createAnnotation
            #   3) col name is not an alias for a mutation data field -- simple createAnnotation
            if col in aliasKeys:
                realKey = self._reverseAlternativeDict[col]
                self.logger.debug(realKey + " found from " + col)
            else:
                # Scenario 1 and 3
                realKey = col

            # Make sure to convert chromosome values.
            val = line[col]
            if realKey == "chr":
                val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
            mut.createAnnotation(realKey, val, 'INPUT')

        # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
        # Compared by value: the previous `is ""` / `is not ""` tested object
        # identity, which only matches when the interpreter happens to reuse
        # the same empty-string object.
        if mut.start != "" and mut.end == "":
            mut.end = mut.start
        if mut.end != "" and mut.start == "":
            mut.start = mut.end

        yield mut
def testRetrieveMissingAnnotations(self):
    """Test simple case.

    First asks for annotations that all exist (empty result expected), then
    for a mix of existing and missing names.
    """
    mutation = MutationData()
    for name in ("a1", "a2", "a3", "a4"):
        mutation.createAnnotation(name, "1")

    none_missing = MutUtils.retrieveMissingAnnotations(mutation, ["a3", "a2"])
    self.assertIsNotNone(none_missing)
    self.assertTrue(len(none_missing) == 0,
                    "Result was not empty: " + str(none_missing))

    some_missing = MutUtils.retrieveMissingAnnotations(
        mutation, ["zztop", "a1", "blah", "dummy"])
    self.assertTrue(some_missing[0] == "blah", "Result was not sorted")
    all_reported = all(name in some_missing for name in ("blah", "dummy", "zztop"))
    self.assertTrue(
        all_reported,
        "Incorrect elements (Truth: [zztop, blah, dummy]): " + str(some_missing))
def initializeMutFromAttributes(chr, start, end, ref_allele, alt_allele, build):
    """Build a MutationData and attach the preceding-bases annotation.

    chr, start, end and build are converted with str() before the mutation
    is constructed.  The variant type inferred from the two alleles decides
    what happens next:

    * xNP (SNPs and other xNPs): the annotation is created with "".
    * deletion: alleles and coordinates are replaced by the output of
      MutUtils.retrievePrecedingBasesForDeletions, with alt allele "-".
    * insertion: alleles and coordinates are replaced by the output of
      MutUtils.retrievePrecedingBasesForInsertions, with ref allele "-".

    :returns: the (possibly rewritten) MutationData
    """
    mut = MutationData(str(chr), str(start), str(end), ref_allele, alt_allele, str(build))
    varType = TranscriptProviderUtils.infer_variant_type(mut.ref_allele, mut.alt_allele)
    annotation_name = MutUtils.PRECEDING_BASES_ANNOTATION_NAME

    def rewrite(new_ref, new_alt, new_start, new_end, preceding_bases):
        # Each field is set as an attribute and then again through the
        # dictionary interface, in this fixed order.
        updates = (
            ("ref_allele", new_ref),
            ("alt_allele", new_alt),
            ("start", new_start),
            ("end", new_end),
        )
        for field, value in updates:
            setattr(mut, field, value)
            mut[field] = value
        mut.createAnnotation(annotationName=annotation_name,
                             annotationValue=preceding_bases)

    if TranscriptProviderUtils.is_xnp(varType):  # Snps and other xNPs
        mut.createAnnotation(annotationName=annotation_name, annotationValue="")

    if varType == VariantClassification.VT_DEL:  # deletion
        preceding_bases, updated_ref_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForDeletions(mut)
        rewrite(updated_ref_allele, "-", updated_start, updated_end, preceding_bases)
    elif varType == VariantClassification.VT_INS:  # insertion
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        rewrite("-", updated_alt_allele, updated_start, updated_end, preceding_bases)

    return mut
def testMixedAnnotation(self):
    """Test that the COSMIC datasource can retrieve entries by both gp and gpp."""
    tabixDir = "testdata/small_cosmic_with_gp_and_gpp/"
    gp_file = tabixDir + "small_cosmic_trimmed_for_sorting.txt.tbi.gz"
    gpp_file = tabixDir + "small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz"
    cosmicDS = Cosmic(src_file=gp_file, title="Cosmic", version="test",
                      gpp_tabix_file=gpp_file)

    # These values are not taken from a real world scenario, but are cooked for this test.
    # Line 9 should get picked up genomic coords
    # Lines 7,8 should get picked up by the protein position
    mutation = MutationData()
    mutation.createAnnotation("gene", "A2M")
    mutation.createAnnotation("transcript_protein_position_start", "1300")
    mutation.createAnnotation("transcript_protein_position_end", "1400")
    mutation.chr = '12'
    mutation.start = '9227220'
    mutation.end = '9227230'

    annotated = cosmicDS.annotate_mutation(mutation)

    self.assertTrue(annotated['COSMIC_n_overlapping_mutations'] == '3')
    self.assertTrue(
        annotated['COSMIC_overlapping_mutation_AAs'].find('1229') != -1,
        "Could not find the entry specified by genomic coords.")
    self.assertTrue(
        annotated['COSMIC_overlapping_primary_sites'] == "lung(3)",
        "Did not have the correct primary sites annotation (lung(3)): " +
        annotated['COSMIC_overlapping_primary_sites'])
def initializeMutFromAttributes(chrom, startPos, endPos, ref, alt, build):
    """Create a MutationData from raw attributes and annotate its preceding bases.

    The variant type reported by ``MutUtils.determineVariantType`` decides
    what happens next:

    * "snp": an empty preceding-bases annotation is created.
    * "del": ref allele, start and end come from
      ``MutUtils.retrievePrecedingBasesForDeletions``; the alt allele becomes "-".
    * "ins": alt allele, start and end come from
      ``MutUtils.retrievePrecedingBasesForInsertions``; the ref allele becomes "-".

    For "del" and "ins" each updated field is written both as an attribute
    and as an item of the mutation, and the preceding bases are stored as an
    annotation.

    :return: the (possibly updated) MutationData instance
    """
    mutation = MutationData(chrom, startPos, endPos, ref, alt, build)
    kind = MutUtils.determineVariantType(mutation)

    def _rewrite(new_ref, new_alt, new_start, new_end, bases):
        # Store every field twice (attribute + item), then record the bases.
        mutation.ref_allele = new_ref
        mutation["ref_allele"] = new_ref
        mutation.alt_allele = new_alt
        mutation["alt_allele"] = new_alt
        mutation.start = new_start
        mutation["start"] = new_start
        mutation.end = new_end
        mutation["end"] = new_end
        mutation.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                                  annotationValue=bases)

    if kind == "snp":
        # Snps carry no preceding bases.
        mutation.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                                  annotationValue="")

    if kind == "del":
        bases, trimmed_ref, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForDeletions(mutation)
        _rewrite(trimmed_ref, "-", new_start, new_end, bases)
    elif kind == "ins":
        bases, trimmed_alt, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mutation)
        _rewrite("-", trimmed_alt, new_start, new_end, bases)

    return mutation
def testRetrievePrecedingBasesForInsertions(self):
    """Check the values produced by MutUtils.retrievePrecedingBasesForInsertions.

    Two insertions sharing the reference allele "GTC" are exercised (alt
    alleles "GTCT" and "GTCTT").  After applying the returned values to the
    mutation, the test expects the preceding-bases annotation to be present,
    the ref allele to be "-", the alt allele to be reduced to the inserted
    bases, and the same start/end coordinates in both cases.

    The failure messages are built from the expected values so that they can
    never disagree with what is actually asserted.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    # Both cases are asserted against the same coordinates.
    expected_start = 1234569
    expected_end = 1234570

    for alt_allele, expected_alt_allele in (("GTCT", "T"), ("GTCTT", "TT")):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        self.assertTrue("_preceding_bases" in mut,
                        "_preceding_bases is missing in the mutation data.")
        self.assertTrue(mut.start == expected_start,
                        "Mut start should be %s but was %s." % (expected_start, mut.start))
        self.assertTrue(mut.end == expected_end,
                        "Mut end should be %s but was %s." % (expected_end, mut.end))
        self.assertTrue(mut.ref_allele == "-",
                        "Ref allele should be - but was %s." % mut.ref_allele)
        self.assertTrue(mut.alt_allele == expected_alt_allele,
                        "Alt allele should be %s but was %s." % (expected_alt_allele, mut.alt_allele))
def testSkippingAltsForSingleMut(self):
    """Test a simple case where a single mutation with alt_allele_seen of False is not produced.

    The annotator is initialized from a run specification with no datasources
    and ``is_skip_no_alts=True``; the iterator it returns for the single input
    mutation is expected to be exhausted immediately.
    """
    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=[], is_skip_no_alts=True)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    m = MutationData()
    m.chr = "1"
    m.start = "12941796"
    m.end = "12941796"
    m.alt_allele = "G"
    m.ref_allele = "T"
    m.createAnnotation("alt_allele_seen", "False")

    muts = annotator.annotate_mutations([m])

    # Use the next() builtin rather than the Python-2-only ``.next`` method so
    # the check works on both Python 2.6+ and Python 3 iterators.
    self.assertRaises(StopIteration, next, muts)
def testDefaultAnnotations(self):
    """Check how manual overrides and default annotations are combined.

    Three mutations are pushed through ``_applyManualAnnotations`` and then
    ``_applyDefaultAnnotations``.  The assertions expect that:

    * "test3" always ends up with the override value, never the default;
    * an empty or missing "test2" receives the default value;
    * a non-empty "test2" keeps its original value;
    * "test1", which has neither default nor override, is left untouched.
    """
    annotator = Annotator()
    defaults = {"test2": "foo2", "test3": "Should not be seen"}
    manual_overrides = {'test3': 'foo3'}

    first = MutationData()
    first.createAnnotation("test1", "foo1")
    first.createAnnotation("test2", "")

    second = MutationData()
    second.createAnnotation("test1", "")

    third = MutationData()
    third.createAnnotation("test1", "")
    third.createAnnotation("test2", "foo2-original")

    overridden = annotator._applyManualAnnotations([first, second, third], manual_overrides)
    defaulted = annotator._applyDefaultAnnotations(overridden, defaults)

    # Drain the resulting iterable, checking the override on the way.
    results = []
    for mutation in defaulted:
        self.assertTrue(mutation['test3'] == "foo3", "Override did not work")
        results.append(mutation)

    # One row per input mutation: expected (test1, test2, test3).
    expected_rows = (
        ("foo1", "foo2", "foo3"),
        ("", "foo2", "foo3"),
        ("", "foo2-original", "foo3"),
    )
    for index, expected in enumerate(expected_rows):
        for key, value in zip(("test1", "test2", "test3"), expected):
            self.assertTrue(results[index][key] == value)
def test_phasing_info_missing(self):
    """Check PhasingUtils.has_phasing_information against partial annotations.

    Only the mutation carrying both "phasing_id" and "phasing_genotype" is
    expected to be reported as having phasing information.
    """
    missing_genotype = MutationData()
    complete = MutationData()
    missing_everything = MutationData()
    missing_id = MutationData()

    missing_genotype.createAnnotation("phasing_id", "blah")
    complete.createAnnotation("phasing_id", "blah")
    complete.createAnnotation("phasing_genotype", "0|1")
    missing_id.createAnnotation("phasing_genotype", "0|1")

    has_phasing = PhasingUtils.has_phasing_information
    self.assertFalse(has_phasing(missing_genotype))
    self.assertTrue(has_phasing(complete))
    self.assertFalse(has_phasing(missing_everything))
    self.assertFalse(has_phasing(missing_id))
def testInternalFields(self):
    """Check that an unlisted annotation is rendered with an "i_" prefix.

    "TEST" is not expected to be among the required or optional columns of
    the TCGA MAF 2.4 config, so the rendered header should contain "i_TEST"
    instead of "TEST".
    """
    config_path = 'configs/tcgaMAF2.4_output.config'
    maf_path = "out/testInternalFields_v2.4.maf.tsv"

    mutation = MutationData()
    mutation.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
    # "gene" is a real annotation and should not be considered internal.
    mutation.createAnnotation("gene", "EGFR")

    renderer = TcgaMafOutputRenderer(maf_path, configFile=config_path)
    renderer.renderMutations(iter([mutation]), ['No comments'])

    # Guard the assumption the header checks below rely on.
    config = ConfigUtils.createConfigParser(config_path)
    required_columns = config.get("general", "requiredColumns")
    self.assertTrue("Hugo_Symbol" in required_columns,
                    " This test assumes that Hugo_Symbol is a required column in the TCGA MAF. If not, the test must be modified.")

    file_info = os.stat(maf_path)
    self.assertTrue(file_info.st_size > 0,
                    "Generated MAF file (" + maf_path + ") is empty.")

    reader = GenericTsvReader(maf_path)
    header_names = reader.getFieldNames()
    self.assertTrue("Hugo_Symbol" in header_names, "Hugo_Symbol not found in output headers")
    self.assertTrue("TEST" not in header_names,
                    "TEST was found in output headers when it should have been renamed to i_TEST")
    self.assertTrue("i_TEST" in header_names, "i_TEST not found in output headers")
def testInternalFieldsSkipPrepend(self):
    """Test that no prepending of "i_" is honored.

    With ``OptionConstants.NO_PREPEND`` enabled, an annotation that is not a
    standard column ("TEST") is expected to appear in the header under its
    own name rather than as "i_TEST".

    The output goes to a file dedicated to this test: sharing the file name
    with testInternalFields would let a stale file from that test satisfy the
    existence and size checks here.
    """
    configFilename = 'configs/tcgaMAF2.4_output.config'
    outputFilename = "out/testInternalFieldsSkipPrepend_v2.4.maf.tsv"

    m = MutationData()
    m.createAnnotation("TEST", "THIS IS A TEST", "TESTING")

    # The next annotation is real and should not be considered internal.
    m.createAnnotation("gene", "EGFR")

    outputRenderer = TcgaMafOutputRenderer(outputFilename, configFile=configFilename,
                                           other_options={OptionConstants.NO_PREPEND: True})
    outputRenderer.renderMutations(iter([m]), ['No comments'])

    configFile = ConfigUtils.createConfigParser(configFilename)
    requiredColumns = configFile.get("general", "requiredColumns")
    self.assertTrue("Hugo_Symbol" in requiredColumns,
                    " This test assumes that Hugo_Symbol is a required column in the TCGA MAF. If not, the test must be modified.")

    statinfo = os.stat(outputFilename)
    self.assertTrue(statinfo.st_size > 0,
                    "Generated MAF file (" + outputFilename + ") is empty.")

    tsvReader = GenericTsvReader(outputFilename)
    headers = tsvReader.getFieldNames()
    self.assertTrue("Hugo_Symbol" in headers, "Hugo_Symbol not found in output headers")
    self.assertTrue("i_TEST" not in headers,
                    "i_TEST was found in output headers when prepend was disabled.")
    self.assertTrue("TEST" in headers, "TEST was not found in output headers.")
def testBasicAnnotation(self):
    """Test an extremely simple case.

    A TP53 mutation with protein change p.S376C is annotated by a
    GenericGeneProteinPositionDatasource built from the simple UniProt
    natural-variation test file.  The resulting
    "UniProt_NatVar_natural_variations" annotation is compared, ignoring the
    order of its "|"-separated entries, against the two expected variations.
    """
    datasource = GenericGeneProteinPositionDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv",
        title="UniProt_NatVar", version="2011_09")

    m = MutationData()
    m.createAnnotation("gene", "TP53")
    m.createAnnotation("protein_change", "p.S376C")
    m.createAnnotation("other_transcripts", "TP53_uc002gig.1_Intron|TP53_uc002gih.2_Intron|TP53_uc010cne.1_RNA|TP53_uc010cnf.1_3'UTR|TP53_uc010cng.1_3'UTR|TP53_uc002gii.1_Missense_Mutation_p.S244C|TP53_uc010cnh.1_3'UTR|TP53_uc010cni.1_3'UTR|TP53_uc002gij.2_Missense_Mutation_p.S376C")

    # The assertion below inspects ``m`` itself, so the return value is not kept.
    datasource.annotate_mutation(m)

    annotationName = "UniProt_NatVar_natural_variations"
    expected = "S -> T (in a sporadic cancer; somatic mutation).|S -> A (in a sporadic cancer; somatic mutation).".split("|")
    self.assertEqual(sorted(m[annotationName].split("|")), sorted(expected),
                     "Incorrect annotation value seen: " + m[annotationName])
def testBasicAnnotate(self):
    """Check a simple annotation with a COSMIC datasource built from two index files.

    The datasource is given both the gp and the gpp tabix files, and the
    annotated mutation is expected to report two overlapping mutations.
    """
    data_dir = "testdata/small_cosmic_with_gp_and_gpp/"
    cosmic = Cosmic(
        src_file=data_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.gz",
        title="Cosmic",
        version="test",
        gpp_tabix_file=data_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz",
    )

    # The values below are cooked for this test rather than taken from a real case.
    mutation = MutationData()
    mutation.createAnnotation("gene", "EGFR")
    for position_annotation in ("transcript_protein_position_start",
                                "transcript_protein_position_end"):
        mutation.createAnnotation(position_annotation, "747")
    mutation.chr = '7'
    mutation.start = '55259560'
    mutation.end = '55259560'

    annotated = cosmic.annotate_mutation(mutation)

    overlap_count = annotated['COSMIC_n_overlapping_mutations']
    self.assertTrue(overlap_count == '2')
def testMixedAnnotation(self):
    """Check that COSMIC entries are retrieved by both gp and gpp lookups.

    A single mutation provides genomic coordinates and a protein position
    range; three overlapping COSMIC entries are expected overall.
    """
    cosmic_dir = "testdata/small_cosmic_with_gp_and_gpp/"
    cosmic = Cosmic(
        src_file=cosmic_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.gz",
        title="Cosmic",
        version="test",
        gpp_tabix_file=cosmic_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz",
    )

    # These values are cooked for this test, not taken from a real world scenario.
    # Line 9 should get picked up by the genomic coords.
    # Lines 7 and 8 should get picked up by the protein position.
    query = MutationData()
    query.createAnnotation("gene", "A2M")
    query.createAnnotation("transcript_protein_position_start", "1300")
    query.createAnnotation("transcript_protein_position_end", "1400")
    query.chr = '12'
    query.start = '9227220'
    query.end = '9227230'

    result = cosmic.annotate_mutation(query)

    overlap_count = result['COSMIC_n_overlapping_mutations']
    self.assertTrue(overlap_count == '3')

    amino_acids = result['COSMIC_overlapping_mutation_AAs']
    found_genomic_entry = amino_acids.find('1229') != -1
    self.assertTrue(found_genomic_entry,
                    "Could not find the entry specified by genomic coords.")

    sites = result['COSMIC_overlapping_primary_sites']
    self.assertTrue(sites == "lung(3)",
                    "Did not have the correct primary sites annotation (lung(3)): " + sites)
def testDuplicateException(self):
    """Check that re-creating an annotation with a different value raises by default.

    Calling createAnnotation a second time for "fake1" with a new value is
    expected to raise DuplicateAnnotationException.
    """
    mutation = MutationData()
    mutation.createAnnotation("fake1", "1")

    # Callable form of assertRaises: the second call must be rejected.
    self.assertRaises(DuplicateAnnotationException,
                      mutation.createAnnotation, "fake1", "blah blah")