def test_overlapping_single_transcripts(self):
    """Query position I:500 of the saccer Ensembl test datasource.

    Exactly one overlapping transcript is expected, and it must report
    gene 'YAL069W'.
    """
    ds_dir = "testdata/ensembl/saccer/"
    config_path = ds_dir + "ensembl.config"
    ensembl_ds = DatasourceFactory.createDatasource(config_path, ds_dir)

    overlapping = ensembl_ds.get_overlapping_transcripts("I", "500", "500")

    self.assertTrue(len(overlapping) == 1)
    first_transcript = overlapping[0]
    self.assertTrue(first_transcript.get_gene() == 'YAL069W')
def testESPCoverageAnnotationWithSNPAvgMatch(self):
    """Annotate a one-base chrX position with the small_esp_coverage_avg_ds test datasource.

    Three annotations written by the datasource are compared (via
    Annotation.isEqual) against hand-built expected Annotation objects.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_avg_ds", "hg19")
    config_path = os.path.join(ds_dir, "small_esp_coverage_avg_ds.config")
    esp_ds = DatasourceFactory.createDatasource(config_path, ds_dir)

    mut = MutationData()
    mut.chr = "X"
    mut.start = "100075334"
    mut.end = "100075334"
    annotated = esp_ds.annotate_mutation(mut)

    # (annotation name, expected value, expected data type)
    expectations = [
        ("ESP_AvgAAsampleReadDepth", "75.0", "Float"),
        ("ESP_TotalAAsamplesCovered", "692.0", "Float"),
        ("ESP_Chromosome", "X", "String"),
    ]
    for name, value, data_type in expectations:
        actual = annotated.getAnnotation(name)
        expected = Annotation(value=value, datasourceName="ESP", dataType=data_type, description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
    """Annotate position 1:35140 using the dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds test datasource.

    No ref/alt alleles are set on the mutation. The codonpos, refcodon and
    cds_strand annotations are compared against pipe-delimited expected values.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutationData()
    mut.chr = "1"
    mut.start = "35140"
    mut.end = "35140"
    annotated = dbnsfp_ds.annotate_mutation(mut)

    # (annotation name, expected value); every expectation is a String annotation.
    expectations = (
        ("dbNSFP_codonpos", "1|1|1"),
        ("dbNSFP_refcodon", "TAA|TAA|TAA"),
        ("dbNSFP_cds_strand", "-|-|-"),
    )
    for annotation_name, expected_value in expectations:
        actual = annotated.getAnnotation(annotation_name)
        expected = Annotation(value=expected_value, datasourceName="dbNSFP", dataType="String", description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def test_no_overwriting_muts(self):
    """Check that annotating fails when overwriting existing annotations is disallowed.

    The run is configured with ALLOW_ANNOTATION_OVERWRITING set to False, so
    annotator.annotate is expected to raise DuplicateAnnotationException.
    """
    # The input already carries a "Who" annotation that this datasource will try to write.
    tj_gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")

    options = {}
    options[OptionConstants.ALLOW_ANNOTATION_OVERWRITING] = False
    options[OptionConstants.NO_PREPEND] = True

    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE",
        "TCGAMAF",
        "testdata/maflite/who_alt1_vs_alt2.maflite",
        "out/who_alt1_vs_alt2.maf.annotated",
        datasource_list=[tj_gene_ds],
        other_opts=options)

    annotator = Annotator()
    annotator.initialize(run_spec)
    self.assertRaises(DuplicateAnnotationException, annotator.annotate)
def test_overwriting_muts(self):
    """Check that annotating succeeds when overwriting existing annotations is allowed.

    With ALLOW_ANNOTATION_OVERWRITING set to True the run must complete, and
    no row of the rendered output may have TJ_Data_Who equal to "Tromokratis".
    """
    # The input already carries a "Who" annotation that this datasource will try to write.
    tj_gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")

    annotated_maf = "out/who_alt1_vs_alt2.maf.annotated"
    options = {}
    options[OptionConstants.ALLOW_ANNOTATION_OVERWRITING] = True
    options[OptionConstants.NO_PREPEND] = True

    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE",
        "TCGAMAF",
        "testdata/maflite/who_alt1_vs_alt2.maflite",
        annotated_maf,
        datasource_list=[tj_gene_ds],
        other_opts=options)

    annotator = Annotator()
    annotator.initialize(run_spec)
    annotator.annotate()

    for row in GenericTsvReader(annotated_maf):
        who = row.get("TJ_Data_Who", "")
        self.assertTrue(who != "Tromokratis")
def _create_test_ds(self, input_tsv, dir_name, index_cols):
    """Build a SNP-only LevelDB datasource from a TSV file and load it.

    Any existing ``<dir_name>/test_snp_leveldb`` directory is deleted and
    recreated, the datasource is created there from ``input_tsv``, and the
    config file written by the creator is loaded through DatasourceFactory.

    :param input_tsv: path of the TSV to index (read with commentPrepend="%").
    :param dir_name: parent directory under which the datasource directory
        is created.
    :param index_cols: column names used as the index; they are excluded
        from the annotation columns.
    :return: the datasource instance produced by DatasourceFactory.
    """
    base_name = "test_snp_leveldb"
    full_name = dir_name + "/" + base_name

    # Start from a clean directory so stale data cannot leak between runs.
    if os.path.exists(full_name):
        shutil.rmtree(full_name)
    os.makedirs(full_name)

    tsv_reader = GenericTsvReader(input_tsv, commentPrepend="%")
    # Work on a copy so the reader's own field-name list is left untouched.
    annotation_cols = copy.copy(tsv_reader.getFieldNames())
    for icol in index_cols:
        if icol in annotation_cols:
            annotation_cols.remove(icol)

    config_filename = full_name + "/" + base_name + ".config"
    ds_creator = SnpOnlyLevelDbDatasourceCreator()
    ds_creator.createDatasource(full_name, input_tsv, ",".join(index_cols), config_filename,
                                "snp_leveldb", base_name, "TEST", "exact", annotation_cols, [])

    # Load the config that was just written. Previously a fixed
    # "out/test_simple_annotate_snp_only_leveldb/..." path was used here,
    # which silently ignored dir_name (the path is unchanged for that
    # historical dir_name value).
    ds = DatasourceFactory.createDatasource(os.path.abspath(config_filename), os.path.dirname(config_filename))
    return ds
def test_overlapping_single_transcripts(self):
    """A single-base query at I:500 should overlap one transcript only.

    The datasource is built from the saccer Ensembl test config, and the
    lone hit is expected to report gene 'YAL069W'.
    """
    saccer_dir = "testdata/ensembl/saccer/"
    datasource = DatasourceFactory.createDatasource(saccer_dir + "ensembl.config", saccer_dir)
    hits = datasource.get_overlapping_transcripts("I", "500", "500")
    num_hits = len(hits)
    self.assertTrue(num_hits == 1)
    gene_name = hits[0].get_gene()
    self.assertTrue(gene_name == 'YAL069W')
def testdbNSFPAnnotationWithMissingOverlapMatch(self):
    # SNPs only
    """Annotate the range 1:35136-35137 with the dbNSFP_chr1_chr3_100vars_overlap_ds test datasource.

    The codonpos, refcodon and cds_strand annotations are all expected to be
    present with an empty value.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_chr3_100vars_overlap_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_chr3_100vars_overlap_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutationDataFactory.default_create()
    mut.chr = "1"
    mut.start = "35136"
    mut.end = "35137"
    annotated = dbnsfp_ds.annotate_mutation(mut)

    for annotation_name in ("dbNSFP_codonpos", "dbNSFP_refcodon", "dbNSFP_cds_strand"):
        actual = annotated.getAnnotation(annotation_name)
        expected = Annotation(value="", datasourceName="dbNSFP", dataType="String", description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testCreationAndAnnotation(self):
    """Create the simple_uniprot_natvar datasource and run one annotation pass with it.

    A transcript-provider datasource is added first, then the natural
    variation datasource. The single output row must list both expected
    natural variations (compared order-independently).
    """
    resultPath = 'out/genericGeneProteinPositionTest.out.tsv'
    transcriptDS = TestUtils.createTranscriptProviderDatasource(self.config)
    natVarDS = DatasourceFactory.createDatasource("testdata/simple_uniprot_natvar/simple_uniprot_natvar.config", "testdata/simple_uniprot_natvar/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
    annotator.setOutputRenderer(SimpleOutputRenderer(resultPath))
    annotator.addDatasource(transcriptDS)
    annotator.addDatasource(natVarDS)
    renderedPath = annotator.annotate()

    # Make sure that some values were populated
    self.assertTrue(os.path.exists(renderedPath))

    natVarColumn = "UniProt_NatVar_natural_variations"
    expectedVariations = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))
    rowCount = 0
    for row in GenericTsvReader(renderedPath):
        observedVariations = sorted(row[natVarColumn].split("|"))
        self.assertTrue(observedVariations == expectedVariations, "Annotation value did not match: " + row[natVarColumn])
        rowCount += 1
    self.assertTrue(rowCount == 1, "Number of mutations incorrect (1): " + str(rowCount))
def testCreationAndAnnotation(self):
    """Build the natural-variation datasource from its config and annotate a tiny maflite file.

    Verifies that the rendered file exists, that it holds exactly one
    mutation, and that its UniProt_NatVar_natural_variations value contains
    the two expected entries regardless of their order.
    """
    outFile = 'out/genericGeneProteinPositionTest.out.tsv'
    datasources = [
        TestUtils.createTranscriptProviderDatasource(self.config),
        DatasourceFactory.createDatasource("testdata/simple_uniprot_natvar/simple_uniprot_natvar.config", "testdata/simple_uniprot_natvar/"),
    ]

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outFile))
    for datasource in datasources:
        annotator.addDatasource(datasource)
    annotatedFile = annotator.annotate()

    # Make sure that some values were populated
    self.assertTrue(os.path.exists(annotatedFile))

    wanted = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))
    numMutations = 0  # stays 0 when the reader yields no rows
    for numMutations, record in enumerate(GenericTsvReader(annotatedFile), 1):
        rawValue = record["UniProt_NatVar_natural_variations"]
        self.assertTrue(sorted(rawValue.split("|")) == wanted, "Annotation value did not match: " + rawValue)
    self.assertTrue(numMutations == 1, "Number of mutations incorrect (1): " + str(numMutations))
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """Read a TCGA MAF through the MAFLITE input creator, annotate it, and check the rendered TCGA MAF.

    The output must be non-empty, contain '#version' in its comments, expose
    the two new i_TJ_Data_* columns, and hold two more lines (data rows plus
    comment lines) than the input.
    """
    inputFilename = "testdata/maf/Patient0.maf.annotated"
    inputCreator = MafliteInputMutationCreator(inputFilename, 'configs/maflite_input.config')
    outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    renderer = TcgaMafOutputRenderer(outputFilename, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(inputCreator)
    annotator.setOutputRenderer(renderer)
    annotator.addDatasource(DatasourceFactory.createDatasource("testdata/thaga_janakari_gene_ds/hg19/tj_data.config", "testdata/thaga_janakari_gene_ds/hg19/"))
    annotator.annotate()

    outputSize = os.stat(outputFilename).st_size
    self.assertTrue(outputSize > 0, "Generated MAF file (" + outputFilename + ") is empty.")

    readerIn = GenericTsvReader(inputFilename)
    readerOut = GenericTsvReader(outputFilename)

    self.assertTrue('#version' in readerOut.getComments(), "First line did not specify a version number")

    outputFields = readerOut.getFieldNames()
    self.assertTrue("i_TJ_Data_Why" in outputFields, "New field missing (i_TJ_Data_Why) from header")
    self.assertTrue("i_TJ_Data_Who" in outputFields, "New field missing (i_TJ_Data_Who) from header")

    # Total line count = data rows + comment lines, for each file.
    linesOut = sum(1 for _ in readerOut)
    linesIn = sum(1 for _ in readerIn)
    linesIn += len(readerIn.getCommentsAsList())
    linesOut += len(readerOut.getCommentsAsList())

    self.assertTrue(linesOut == (linesIn + 2), "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): " + str(linesIn) + ", " + str(linesOut))
def testBasicAnnotation(self):
    """Annotate from a basic tsv of genomic positions (datasource config already present).

    Every third output row is expected to have a blank ORegAnno id; all other
    rows must carry the id 'OREG0013034'.
    """
    resultPath = 'out/genericGenomePositionTest.out.tsv'
    positionDS = DatasourceFactory.createDatasource("testdata/small_genome_position_tsv_ds/oreganno_trim.config", "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(resultPath))
    annotator.addDatasource(positionDS)
    renderedPath = annotator.annotate()

    # Make sure that some values were populated
    self.assertTrue(os.path.exists(renderedPath))

    idColumn = "ORegAnno_hg19.oreganno.id"
    # Two overlap, one does not. Repeat...
    for lineNumber, row in enumerate(GenericTsvReader(renderedPath), 1):
        if lineNumber % 3 == 0:
            self.assertTrue(row[idColumn] == '', "Line " + str(lineNumber) + " should have had blank value, but did not: " + row[idColumn])
            continue
        self.assertFalse(row[idColumn] == '', "Line " + str(lineNumber) + " should not have had blank value, but did.")
        self.assertTrue(row[idColumn] == 'OREG0013034', "Line " + str(lineNumber) + " did not have correct value: " + row[idColumn])
def test_no_overwriting_muts(self):
    """Disallowing annotation overwriting must make the annotation run fail.

    Expects DuplicateAnnotationException from annotator.annotate when
    ALLOW_ANNOTATION_OVERWRITING is False.
    """
    datasource_dir = "testdata/thaga_janakari_gene_ds/hg19/"
    # The input file has a "Who" annotation already, and this datasource will try to write one too.
    who_ds = DatasourceFactory.createDatasource("testdata/thaga_janakari_gene_ds/hg19/tj_data.config", datasource_dir)

    maflite_in = "testdata/maflite/who_alt1_vs_alt2.maflite"
    maf_out = "out/who_alt1_vs_alt2.maf.annotated"
    run_options = dict([
        (OptionConstants.ALLOW_ANNOTATION_OVERWRITING, False),
        (OptionConstants.NO_PREPEND, True),
    ])

    spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE", "TCGAMAF", maflite_in, maf_out,
        datasource_list=[who_ds], other_opts=run_options)

    annotator = Annotator()
    annotator.initialize(spec)
    self.assertRaises(DuplicateAnnotationException, annotator.annotate)
def test_overwriting_muts(self):
    """Allowing annotation overwriting must let the datasource replace an input annotation.

    After the run, no output row may still report "Tromokratis" in the
    TJ_Data_Who column (a missing column is treated as an empty string).
    """
    datasource_dir = "testdata/thaga_janakari_gene_ds/hg19/"
    # The input file has a "Who" annotation already, and this datasource will try to write one too.
    who_ds = DatasourceFactory.createDatasource("testdata/thaga_janakari_gene_ds/hg19/tj_data.config", datasource_dir)

    maflite_in = "testdata/maflite/who_alt1_vs_alt2.maflite"
    maf_out = "out/who_alt1_vs_alt2.maf.annotated"
    run_options = dict([
        (OptionConstants.ALLOW_ANNOTATION_OVERWRITING, True),
        (OptionConstants.NO_PREPEND, True),
    ])

    spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE", "TCGAMAF", maflite_in, maf_out,
        datasource_list=[who_ds], other_opts=run_options)

    annotator = Annotator()
    annotator.initialize(spec)
    annotator.annotate()

    output_rows = GenericTsvReader(maf_out)
    for output_row in output_rows:
        self.assertTrue(output_row.get('TJ_Data_Who', "") != "Tromokratis")
def createCosmicDatasource(config):
    """Create a Cosmic datasource from a config object.

    The datasource directory is read with config.get("COSMIC", "CosmicDir");
    the datasource config is expected at "<that dir>/cosmic.config".

    :param config: object exposing get(section, option).
    :return: the datasource returned by DatasourceFactory.createDatasource.
    """
    cosmic_dir = config.get("COSMIC", "CosmicDir")
    return DatasourceFactory.createDatasource(cosmic_dir + "/cosmic.config", cosmic_dir)
def testBasicAnnotation(self):
    """Annotate from a generic TSV keyed on a transcript annotation.

    Only the output header is checked: both refseq_test_* columns must be
    present.
    """
    # A transcript provider must run first so the transcript annotation exists.
    transcriptProvider = TestUtils.createTranscriptProviderDatasource(config=self.config)
    transcriptTsvDS = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericTranscriptTest.out.tsv'))
    annotator.addDatasource(transcriptProvider)
    annotator.addDatasource(transcriptTsvDS)

    # annotate() returns the path that is read back below.
    renderedPath = annotator.annotate()
    headers = GenericTsvReader(renderedPath).getFieldNames()

    for column in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
        self.assertTrue(column in headers, column + " not found in headers: " + str(headers))
def testESPCoverageAnnotationWithMissingAnnotationValuesIndelAvgMatch(self):
    """Annotate the range X:100075350-100075356 with the small_esp_coverage_avg_ds test datasource.

    The ESP_AvgSampleReadDepth annotation is expected to be the Float "91.25".
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_avg_ds", "hg19")
    ds_config = os.path.join(ds_dir, "small_esp_coverage_avg_ds.config")
    esp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutationData()
    mut.chr = "X"
    mut.start = "100075350"
    mut.end = "100075356"

    annotated = esp_ds.annotate_mutation(mut)
    actual = annotated.getAnnotation("ESP_AvgSampleReadDepth")
    expected = Annotation(
        value="91.25",
        datasourceName="ESP",
        dataType="Float",
        description="",
        tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
        number=None)
    matches = actual.isEqual(expected)
    self.assertTrue(matches, "Annotations do not match.")
def testExampleVcfDBAnnotationWithSNPExactMatch(self):
    """Annotate two chr20 SNPs with the vcf_db_exact test datasource and compare ESP annotations.

    For 20:1110696 A>T the ESP_AF, ESP_AC and ESP_H2 annotations are checked;
    for 20:1230237 T>A the ESP_NS annotation and an empty ESP_AF are checked.
    """
    ds_dir = os.path.join("testdata", "vcf_db_exact", "hg19")
    vcf_ds = DatasourceFactory.createDatasource(os.path.join(ds_dir, "vcf_db_exact.config"), ds_dir)

    # Each case: (mutation attributes, [(annotation name, expected Annotation fields), ...])
    cases = [
        (("20", "1110696", "1110696", "A", "T", "hg19"), [
            ("ESP_AF", dict(value="0.667", dataType="Float", description="Allele Frequency",
                            tags=[TagConstants.INFO, TagConstants.SPLIT], number=-1)),
            ("ESP_AC", dict(value="2,4", dataType="Integer", description="Allele Count",
                            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)),
            ("ESP_H2", dict(value="False", dataType="Flag", description="HapMap2 membership",
                            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=0)),
        ]),
        (("20", "1230237", "1230237", "T", "A", "hg19"), [
            ("ESP_NS", dict(value="3", dataType="Integer", description="Number of Samples With Data",
                            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=1)),
            ("ESP_AF", dict(value="", dataType="Float", description="Allele Frequency",
                            tags=[TagConstants.INFO, TagConstants.SPLIT], number=-1)),
        ]),
    ]

    for (chrom, start, end, ref_allele, alt_allele, build), expectations in cases:
        mut = MutUtils.initializeMutFromAttributes(chrom, start, end, ref_allele, alt_allele, build)
        annotated = vcf_ds.annotate_mutation(mut)
        for annotation_name, fields in expectations:
            actual = annotated.getAnnotation(annotation_name)
            expected = Annotation(datasourceName="ESP", **fields)
            self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def test_hashcode_generation(self):
    """Check that a datasource built from the tj_data config reports the expected hashcode."""
    ds_dir = "testdata/thaga_janakari_gene_ds/hg19/"
    gene_ds = DatasourceFactory.createDatasource("testdata/thaga_janakari_gene_ds/hg19/tj_data.config", ds_dir)
    self.assertTrue(gene_ds is not None, "gene indexed datasource was None.")

    expected_hashcode = "7120edfdc7b29e45191c81c99894afd5"
    self.assertTrue(gene_ds.get_hashcode() == expected_hashcode)
def testBasicDatasourceSorting(self):
    """Check that sortDatasources places the gene-based datasource second and keeps both entries.

    The list is handed to the sorter with the gene-based datasource first and
    the transcript-provider datasource second.
    """
    transcriptProvider = TestUtils.createTranscriptProviderDatasource(self.config)
    geneTsvDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")

    sortedDatasources = DatasourceFactory.sortDatasources([geneTsvDS, transcriptProvider])

    secondDatasource = sortedDatasources[1]
    self.assertTrue(secondDatasource == geneTsvDS, "Sorting is incorrect.")
    self.assertTrue(len(sortedDatasources) == 2, "Sorting altered number of datasources (gt: 2): " + str(len(sortedDatasources)))
def testBasicGeneTSVInit(self):
    """Make sure that we can initialize a simple tsv data source.

    The datasource is created from the small_tsv_ds config, a mutation
    annotated with gene "ABL1" is passed through it, and the resulting
    CGC_Abridged_Name annotation is compared to the expected gene name.
    """
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    # Identity test against None; the former '<>' operator does not exist in Python 3.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationDataFactory.default_create()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1", "Test gene TSV datasource did not annotate properly.")
def testBasicAnnotation(self):
    """Annotate from a basic tsv gene file (trimmed CancerGeneCensus).

    A transcript-provider datasource runs before the tsv datasource, since
    the gene annotation must be populated first. The output header must
    contain two CGC_Abridged_* columns, and every row whose gene is in the
    known gene list must have a non-empty CGC_Abridged_GeneID.
    """
    # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
    knownGenes = [
        'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
        'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC', 'ARHGEF12',
        'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1', 'ATF1', 'ATIC',
        'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B',
    ]

    # We need a gaf data source to annotate gene
    transcriptProvider = TestUtils.createTranscriptProviderDatasource(config=self.config)
    geneTsvDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    resultPath = 'out/genericGeneTest.out.tsv'

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(resultPath))
    annotator.addDatasource(transcriptProvider)
    annotator.addDatasource(geneTsvDS)
    annotator.annotate()

    # Check that there were actual annotations performed.
    reader = GenericTsvReader(resultPath)
    header = reader.getFieldNames()
    self.assertTrue('CGC_Abridged_Other Syndrome/Disease' in header, "'CGC_Other Syndrome/Disease' was not present in the header")
    self.assertTrue('CGC_Abridged_Mutation Type' in header, "'CGC_Abridged_Mutation Type' was not present in the header")

    annotatableRows = 0
    for lineNumber, row in enumerate(reader, 1):
        self.assertTrue('gene' in row)
        if row['gene'] not in knownGenes:
            continue
        self.assertTrue(row['CGC_Abridged_GeneID'] != '', "'CGC_Abridged_GeneID' was missing on a row that should have been populated. Line: " + str(lineNumber))
        annotatableRows += 1

    self.assertTrue(annotatableRows > 0, "Bad data -- cannot test missed detects.")
def testAnnotationSourceIsPopulated(self):
    """Tests that the annotation source is not blank for the example tsv datasource.

    A mutation annotated with gene "ABL1" is passed through the small_tsv_ds
    datasource. The CGC_Abridged_Name annotation must then have the expected
    value and a datasource name that is neither "Unknown" nor blank.
    """
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    # '<>' was removed in Python 3: use an identity test for None and '!=' for strings.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1", "Test gene TSV datasource did not annotate properly.")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource() != "Unknown", "Annotation source was unknown")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource().strip() != "", "Annotation source was blank")
def testAnnotationSourceIsPopulated(self):
    """Tests that the annotation source is not blank for the example tsv datasource.

    After annotating a mutation whose gene is "ABL1", the CGC_Abridged_Name
    annotation is checked for its value and for a datasource name that is
    neither "Unknown" nor empty after stripping whitespace.
    """
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    # The Python-2-only '<>' operator is replaced by 'is not None' / '!=' below.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)

    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1", "Test gene TSV datasource did not annotate properly.")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource() != "Unknown", "Annotation source was unknown")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource().strip() != "", "Annotation source was blank")
def testDatasourceCreator(self):
    """Load the small_uniprot_prot_seq_ds datasource and check the UniProt_aapos annotation.

    A mutation with transcript_id 'uc009vvt.1' and protein_change 'p.T1105A'
    is expected to be annotated with UniProt_aapos "969".

    NOTE: This test needs to be updated to use sqlite instead of filesystem file.
    """
    uniprotDS = DatasourceFactory.createDatasource("testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config", "testdata/small_uniprot_prot_seq_ds/")
    aaposAnnotation = "UniProt_aapos"

    mut = MutationData()
    for annotationName, annotationValue in (('transcript_id', 'uc009vvt.1'), ('protein_change', 'p.T1105A')):
        mut.createAnnotation(annotationName, annotationValue)

    mut = uniprotDS.annotate_mutation(mut)
    self.assertTrue(mut[aaposAnnotation] == "969", "Did not get proper value (969): " + mut[aaposAnnotation])
def test_overlapping_multiple_transcripts_snp(self):
    """Query position I:550 of the saccer Ensembl test datasource.

    Two overlapping transcripts are expected, and none of their transcript
    ids may fall outside {'YAL069W', 'YAL068W-A'}.
    """
    ds_dir = "testdata/ensembl/saccer/"
    ensembl_ds = DatasourceFactory.createDatasource(ds_dir + "ensembl.config", ds_dir)

    overlapping = ensembl_ds.get_overlapping_transcripts("I", "550", "550")
    self.assertTrue(len(overlapping) == 2)

    found_ids = set(tx.get_transcript_id() for tx in overlapping)
    unexpected_ids = found_ids - set(['YAL069W', 'YAL068W-A'])
    self.assertTrue(len(unexpected_ids) == 0)
def test_overlapping_multiple_transcripts_snp(self):
    """A single-base query at I:550 should overlap two transcripts.

    Every transcript id returned must be one of 'YAL069W' or 'YAL068W-A'.
    """
    saccer_dir = "testdata/ensembl/saccer/"
    config_path = saccer_dir + "ensembl.config"
    datasource = DatasourceFactory.createDatasource(config_path, saccer_dir)

    hits = datasource.get_overlapping_transcripts("I", "550", "550")
    self.assertTrue(len(hits) == 2)

    allowed_ids = {'YAL069W', 'YAL068W-A'}
    hit_ids = {hit.get_transcript_id() for hit in hits}
    leftover = hit_ids - allowed_ids
    self.assertTrue(len(leftover) == 0)
def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
    """Check three dbNSFP annotations for position 1:35140 with no ref/alt alleles set.

    Uses the dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds test datasource and
    a mutation from MutationDataFactory.default_create().
    """
    self.logger.info("Initializing dbNSFP")
    ds_name = "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds"
    ds_dir = os.path.join("testdata", ds_name, "hg19")
    dbnsfp_ds = DatasourceFactory.createDatasource(
        os.path.join(ds_dir, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config"),
        ds_dir)

    mut = MutationDataFactory.default_create()
    mut.chr = "1"
    mut.start = "35140"
    mut.end = "35140"
    annotated = dbnsfp_ds.annotate_mutation(mut)

    def check(annotation_name, expected_value):
        # Fetch the annotation first, then build the expectation, then compare.
        actual = annotated.getAnnotation(annotation_name)
        expected = Annotation(
            value=expected_value,
            datasourceName="dbNSFP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")

    check("dbNSFP_codonpos", "1|1|1")
    check("dbNSFP_refcodon", "TAA|TAA|TAA")
    check("dbNSFP_cds_strand", "-|-|-")
def testdbNSFPAnnotationWithMissingExactMatch(self):
    # SNPs only
    """Annotate 1:35138 T>C with the dbNSFP_chr1_6vars_exact_ds test datasource.

    The codonpos, refcodon and cds_strand annotations are expected to be
    present with empty values and the data types listed below.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_6vars_exact_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_6vars_exact_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutationData()
    mut.chr = "1"
    mut.start = "35138"
    mut.end = "35138"
    mut.ref_allele = "T"
    mut.alt_allele = "C"
    annotated = dbnsfp_ds.annotate_mutation(mut)

    # (annotation name, expected data type); the expected value is always empty.
    expectations = [
        ("dbNSFP_codonpos", "Integer"),
        ("dbNSFP_refcodon", "String"),
        ("dbNSFP_cds_strand", "Float"),
    ]
    for annotation_name, data_type in expectations:
        actual = annotated.getAnnotation(annotation_name)
        expected = Annotation(value="", datasourceName="dbNSFP", dataType=data_type, description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testESPCoverageAnnotationWithMissingIndelOverlapMatch(self):
    """Annotate the range X:100075300-100075336 with the small_esp_coverage_overlap_ds test datasource.

    Three annotations are compared against pipe-delimited String values.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_overlap_ds", "hg19")
    ds_config = os.path.join(ds_dir, "small_esp_coverage_overlap_ds.config")
    esp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutationData()
    mut.chr = "X"
    mut.start = "100075300"
    mut.end = "100075336"
    annotated = esp_ds.annotate_mutation(mut)

    # (annotation name, expected pipe-delimited value)
    expectations = (
        ("ESP_AvgAAsampleReadDepth", "75.0|81.0|81.0"),
        ("ESP_TotalAAsamplesCovered", "692|692|692"),
        ("ESP_Chromosome", "X|X|X"),
    )
    for annotation_name, expected_value in expectations:
        actual = annotated.getAnnotation(annotation_name)
        expected = Annotation(value=expected_value, datasourceName="ESP", dataType="String", description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testBasicCosmicInit(self):
    """Create a datasource from the small_cosmic sample directory and annotate one position.

    The mutation at chr 19, position 58858921 is expected to receive
    COSMIC_overlapping_mutation_AAs == 'p.P426P(1)'.
    """
    cosmicDS = DatasourceFactory.createDatasource('testdata/small_cosmic/small_cosmic.config', "testdata/small_cosmic")

    mut = MutationDataFactory.default_create()
    mut.chr = 19
    mut.start = 58858921
    mut.end = 58858921

    mut = cosmicDS.annotate_mutation(mut)

    aaKey = 'COSMIC_overlapping_mutation_AAs'
    self.assertTrue(mut[aaKey] == 'p.P426P(1)', "Did not properly annotate mutation: " + mut[aaKey])
def testTags(self):
    """Check the tags reported by _determine_tags() for the small_esp_ds test datasource.

    Every entry must carry exactly two tags, including INFO and NOT_SPLIT.
    """
    self.logger.info("Initializing ESP6500SI-V2")
    ds_dir = os.path.join("testdata", "small_esp_ds")
    esp_ds = DatasourceFactory.createDatasource(os.path.join(ds_dir, "esp.config"), ds_dir)

    tags_by_id = esp_ds._determine_tags()
    for field_id in tags_by_id:
        field_tags = tags_by_id[field_id]
        num_tags = len(field_tags)
        self.assertTrue(num_tags == 2, "The length of tags is not 2 but %s." % len(field_tags))
        self.assertTrue(TagConstants.INFO in field_tags, "INFO tag is missing for %s." % field_id)
        self.assertTrue(TagConstants.NOT_SPLIT in field_tags, "NOT_SPLIT tag is missing for %s." % field_id)
def testBasicGeneTSVInit(self):
    """Make sure that we can initialize a simple tsv data source.

    Builds the gene-indexed datasource from testdata/small_tsv_ds, then
    annotates a mutation whose gene is ABL1 and checks the resulting
    CGC_Abridged_Name value.
    """
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    # Identity check replaces the deprecated `<>` operator, which is a
    # syntax error on Python 3 and is the wrong way to compare against None.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationDataFactory.default_create()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(
        m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
        "Test gene TSV datasource did not annotate properly.")
def testDatasourceCreator(self):
    """Create the small UniProt protein-sequence datasource and check its output.

    Exercises datasource creation for v1 of
    TranscriptToUniProtProteinPositionTransformingDatasource: a mutation with
    transcript uc009vvt.1 and protein change p.T1105A must be annotated with
    UniProt_aapos "969".

    NOTE: This test needs to be updated to use sqlite instead of filesystem file.
    """
    uniprot_ds = DatasourceFactory.createDatasource(
        "testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config",
        "testdata/small_uniprot_prot_seq_ds/")

    mut = MutationData()
    mut.createAnnotation('transcript_id', 'uc009vvt.1')
    mut.createAnnotation('protein_change', 'p.T1105A')
    mut = uniprot_ds.annotate_mutation(mut)

    aa_position = mut["UniProt_aapos"]
    self.assertTrue(
        aa_position == "969",
        "Did not get proper value (969): " + aa_position)
def test_simple_annotation_without_version_number_in_data_nor_query(self):
    """Annotate by transcript id when the query id carries no version suffix.

    The query is 'uc001hms' and the datasource is the
    small_transcript_tsv_ds_no_version_number test set.
    """
    mut = MutationDataFactory.default_create()
    mut.createAnnotation('transcript_id', 'uc001hms')

    ds_dir = "testdata/small_transcript_tsv_ds_no_version_number/"
    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds_no_version_number/small_transcript_tsv_ds.config",
        ds_dir)
    mut = transcript_ds.annotate_mutation(mut)

    failure_prefix = "Transcript-based annotation did not populate properly: "
    self.assertTrue(
        mut['refseq_test_mRNA_Id'] == 'NM_022746',
        failure_prefix + mut['refseq_test_mRNA_Id'])
    self.assertTrue(
        mut['refseq_test_prot_Id'] == 'NP_073583',
        failure_prefix + mut['refseq_test_prot_Id'])
def testSimpleAnnotation(self):
    """Annotate a dummy mutation by versioned transcript id and check the RefSeq ids."""
    mut = MutationData()
    mut.createAnnotation("transcript_id", "uc001hms.3")

    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")
    mut = transcript_ds.annotate_mutation(mut)

    failure_prefix = "Transcript-based annotation did not populate properly: "
    self.assertTrue(
        mut["refseq_test_mRNA_Id"] == "NM_022746",
        failure_prefix + mut["refseq_test_mRNA_Id"])
    self.assertTrue(
        mut["refseq_test_prot_Id"] == "NP_073583",
        failure_prefix + mut["refseq_test_prot_Id"])
def testDoubleAnnotationError(self):
    """Annotate a maf file that used to raise a duplicate annotation exception.

    The run must complete without throwing that (or any) exception.
    """
    genome_position_ds = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")
    input_creator = MafliteInputMutationCreator('testdata/maflite/testDoubleAnnotate.maf.tsv')
    renderer = SimpleOutputRenderer('out/genericGenomePositionDoubleAnnotationTest.out.tsv')

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(renderer)
    annotator.addDatasource(genome_position_ds)
    rendered_path = annotator.annotate()

    # Only the existence of the rendered file is verified here.
    self.assertTrue(os.path.exists(rendered_path))
def test_simple_annotation_with_version_number_in_data_but_not_query(self):
    """Annotate by transcript id when only the datasource carries a version number.

    The query is the unversioned id 'uc001hms' against small_transcript_tsv_ds.
    """
    mut = MutationDataFactory.default_create()
    mut.createAnnotation("transcript_id", "uc001hms")

    ds_dir = "testdata/small_transcript_tsv_ds/"
    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config", ds_dir)
    mut = transcript_ds.annotate_mutation(mut)

    failure_prefix = "Transcript-based annotation did not populate properly: "
    self.assertTrue(
        mut["refseq_test_mRNA_Id"] == "NM_022746",
        failure_prefix + mut["refseq_test_mRNA_Id"])
    self.assertTrue(
        mut["refseq_test_prot_Id"] == "NP_073583",
        failure_prefix + mut["refseq_test_prot_Id"])
def testBasicDatasourceSorting(self):
    """Test that the GAF datasource is sorted before a gene-based datasource."""
    transcript_provider_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")

    # Deliberately hand the datasources over in the wrong order.
    sorted_datasources = DatasourceFactory.sortDatasources(
        [gene_ds, transcript_provider_ds])

    self.assertTrue(sorted_datasources[1] == gene_ds, "Sorting is incorrect.")
    datasource_count = len(sorted_datasources)
    self.assertTrue(
        datasource_count == 2,
        "Sorting altered number of datasources (gt: 2): " + str(datasource_count))
def testSimpleAnnotation(self):
    """Annotate a dummy mutation by versioned transcript id and check the RefSeq ids."""
    mut = MutationData()
    mut.createAnnotation('transcript_id', 'uc001hms.3')

    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")
    mut = transcript_ds.annotate_mutation(mut)

    failure_prefix = "Transcript-based annotation did not populate properly: "
    self.assertTrue(
        mut['refseq_test_mRNA_Id'] == 'NM_022746',
        failure_prefix + mut['refseq_test_mRNA_Id'])
    self.assertTrue(
        mut['refseq_test_prot_Id'] == 'NP_073583',
        failure_prefix + mut['refseq_test_prot_Id'])
def testExampleVcfDBAnnotationWithMissingIndelExactMatch(self):
    """Annotate a deletion (AGTC -> A at 21:1234567) against the vcf_db_exact datasource.

    Every checked annotation is expected to carry an empty value while still
    reporting the data type, description, tags and number listed below.
    """
    ds_dir = os.path.join("testdata", "vcf_db_exact", "hg19")
    vcf_ds = DatasourceFactory.createDatasource(
        os.path.join(ds_dir, "vcf_db_exact.config"), ds_dir)

    mut = MutUtils.initializeMutFromAttributes(
        "21", "1234567", "1234567", "AGTC", "A", "hg19")
    annotated = vcf_ds.annotate_mutation(mut)

    not_split = [TagConstants.INFO, TagConstants.NOT_SPLIT]
    # (annotation name, data type, description, tags, number)
    expectations = [
        ("ESP_AF", "Float", "Allele Frequency",
         [TagConstants.INFO, TagConstants.SPLIT], -1),
        ("ESP_X", "String", "A random variable, X", not_split, 2),
        ("ESP_H2", "Flag", "HapMap2 membership", not_split, 0),
        ("ESP_Y", "String", "A random variable, Y", not_split, -2),
        ("ESP_Z", "Float", "A random variable, Z", not_split, 3),
    ]
    for name, data_type, description, tags, number in expectations:
        actual = annotated.getAnnotation(name)
        expected = Annotation(
            value="",
            datasourceName="ESP",
            dataType=data_type,
            description=description,
            tags=list(tags),
            number=number)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testExampleVcfDBAnnotationWithIndelAvgMatch(self):
    """Annotate an insertion (GTC -> GTCTTA at 4:1234567) against the vcf_db_avg datasource.

    Checks the value, data type, description, tags and number of five ESP
    annotations produced for the mutation.
    """
    ds_dir = os.path.join("testdata", "vcf_db_avg", "hg19")
    vcf_ds = DatasourceFactory.createDatasource(
        os.path.join(ds_dir, "vcf_db_avg.config"), ds_dir)

    mut = MutUtils.initializeMutFromAttributes(
        "4", "1234567", "1234567", "GTC", "GTCTTA", "hg19")
    annotated = vcf_ds.annotate_mutation(mut)

    not_split = [TagConstants.INFO, TagConstants.NOT_SPLIT]
    # (annotation name, value, data type, description, tags, number)
    expectations = [
        ("ESP_AF", "0.5", "Float", "Allele Frequency",
         [TagConstants.INFO, TagConstants.SPLIT], -1),
        ("ESP_AC", "3.0", "Float", "Allele Count", not_split, None),
        ("ESP_H2", "False|False|False", "String", "HapMap2 membership",
         not_split, None),
        ("ESP_AA", "T", "String", "Ancestral Allele", not_split, 1),
        ("ESP_Z", "2.0,3.0,3.0", "Float", "A random variable, Z", not_split, 3),
    ]
    for name, value, data_type, description, tags, number in expectations:
        actual = annotated.getAnnotation(name)
        expected = Annotation(
            value=value,
            datasourceName="ESP",
            dataType=data_type,
            description=description,
            tags=list(tags),
            number=number)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def test_convert_genomic_space_to_transcript_space(self):
    """Convert genomic coordinates to transcript coordinates on both strands.

    Uses the saccer ensembl test datasource: one transcript on chromosome I
    and one '-' strand transcript on chromosome II.
    """
    config_dir = "testdata/ensembl/saccer/"
    ensembl_ds = DatasourceFactory.createDatasource(
        config_dir + "ensembl.config", config_dir)
    to_transcript_space = TranscriptProviderUtils.convert_genomic_space_to_transcript_space

    # transcript starts at 335.
    chr1_txs = ensembl_ds.get_overlapping_transcripts("I", "350", "350")
    tx_start, tx_end = to_transcript_space("350", "350", chr1_txs[0])
    self.assertTrue(tx_start == tx_end)
    self.assertTrue(tx_start == 16)

    # transcript starts at 764697 (strand is '-').
    chr2_txs = ensembl_ds.get_overlapping_transcripts("II", "764690", "764690")
    tx_start, tx_end = to_transcript_space("764690", "764690", chr2_txs[0])
    self.assertTrue(tx_start == tx_end)
    self.assertTrue(tx_start == 7)

    # An 11-base genomic interval on the same '-' strand transcript.
    tx_start, tx_end = to_transcript_space("764680", "764690", chr2_txs[0])
    self.assertTrue(tx_start == (tx_end - 10))
    self.assertTrue(tx_start == 7)
def test_simple_annotation_without_version_number_in_data(self):
    """Annotate by transcript id when only the query carries a version number.

    The query is 'uc001hms.3' against the
    small_transcript_tsv_ds_no_version_number test set.
    """
    mut = MutationDataFactory.default_create()
    mut.createAnnotation("transcript_id", "uc001hms.3")

    ds_dir = "testdata/small_transcript_tsv_ds_no_version_number/"
    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds_no_version_number/small_transcript_tsv_ds.config",
        ds_dir)
    mut = transcript_ds.annotate_mutation(mut)

    failure_prefix = "Transcript-based annotation did not populate properly: "
    self.assertTrue(
        mut["refseq_test_mRNA_Id"] == "NM_022746",
        failure_prefix + mut["refseq_test_mRNA_Id"])
    self.assertTrue(
        mut["refseq_test_prot_Id"] == "NP_073583",
        failure_prefix + mut["refseq_test_prot_Id"])
def testBasicRefInit(self):
    """Create the sample reference datasource and check the ref_context annotation.

    The datasource is built from the testdata/reference_ds directory and its
    config file, then used to annotate position 22:11.
    """
    expected_context = "CCCAAGCTAAACCCAGGCCAC"
    reference_ds = DatasourceFactory.createDatasource(
        'testdata/reference_ds/reference_ds.config', "testdata/reference_ds")

    mut = MutationDataFactory.default_create()
    mut.chr = "22"
    mut.start = "11"
    mut.end = "11"

    # annotate_mutation hands back the annotated mutation, which replaces mut.
    mut = reference_ds.annotate_mutation(mut)
    self.assertTrue(
        mut['ref_context'] == expected_context,
        "ref_context was not populated properly: " + str(mut['ref_context']))
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """Read a TCGA MAF through MAFLITE, annotate it, and render it as a TCGA MAF.

    Verifies that the output is non-empty, has a '#version' comment, contains
    the two new TJ_Data columns, and has exactly two more lines (comments
    included) than the input.
    """
    input_path = "testdata/maf/Patient0.maf.annotated"
    output_path = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"

    input_creator = MafliteInputMutationCreator(input_path, 'configs/maflite_input.config')
    renderer = TcgaMafOutputRenderer(output_path, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(renderer)
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")
    annotator.addDatasource(gene_ds)
    annotator.annotate()

    self.assertTrue(
        os.stat(output_path).st_size > 0,
        "Generated MAF file (" + output_path + ") is empty.")

    reader_in = GenericTsvReader(input_path)
    reader_out = GenericTsvReader(output_path)

    self.assertTrue(
        reader_out.getComments().find('#version') != -1,
        "First line did not specify a version number")
    self.assertTrue(
        "i_TJ_Data_Why" in reader_out.getFieldNames(),
        "New field missing (i_TJ_Data_Why) from header")
    self.assertTrue(
        "i_TJ_Data_Who" in reader_out.getFieldNames(),
        "New field missing (i_TJ_Data_Who) from header")

    # Count data rows first, then add the comment lines of each file.
    lines_out = sum(1 for _ in reader_out)
    lines_in = sum(1 for _ in reader_in)
    lines_in += len(reader_in.getCommentsAsList())
    lines_out += len(reader_out.getCommentsAsList())

    self.assertTrue(
        lines_out == (lines_in + 2),
        "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): "
        + str(lines_in) + ", " + str(lines_out))
def testDoubleAnnotationError(self):
    """Annotate a maf file that used to raise a duplicate annotation exception.

    The run must complete without throwing that (or any) exception.
    """
    genome_position_ds = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")
    input_creator = MafliteInputMutationCreator(
        'testdata/maflite/testDoubleAnnotate.maf.tsv')
    renderer = SimpleOutputRenderer(
        'out/genericGenomePositionDoubleAnnotationTest.out.tsv')

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(renderer)
    annotator.addDatasource(genome_position_ds)
    rendered_path = annotator.annotate()

    # Only the existence of the rendered file is verified here.
    self.assertTrue(os.path.exists(rendered_path))
def testBasicCosmicInit(self):
    """Create the sample COSMIC datasource and annotate a single position.

    The datasource is built from the testdata/small_cosmic directory and its
    config file; the mutation at 19:58858921 must receive the overlapping
    amino-acid annotation 'p.P426P(1)'.
    """
    config_path = 'testdata/small_cosmic/small_cosmic.config'
    cosmic_ds = DatasourceFactory.createDatasource(config_path, "testdata/small_cosmic")

    mut = MutationDataFactory.default_create()
    mut.chr = 19
    mut.start = 58858921
    mut.end = 58858921
    mut = cosmic_ds.annotate_mutation(mut)

    overlapping_aas = mut['COSMIC_overlapping_mutation_AAs']
    self.assertTrue(
        overlapping_aas == 'p.P426P(1)',
        "Did not properly annotate mutation: " + overlapping_aas)
def main():
    """Write the exons of every gene in a gene-list file to a tab-separated file.

    Reads command-line options via parseOptions(), creates the transcript
    datasource from its config file, and for each gene name (one per line of
    the input file) writes one row per retrieved exon to the output file.
    Genes for which no exons are returned are reported in
    "<outputFilename>.err".
    """
    args = parseOptions()
    input_gene_list_file = args.input_gene_list_file
    transcript_ds = args.transcript_ds_config
    outputFilename = args.outputFilename
    isNonCoding = args.includeNonCoding
    padding = args.padding

    ds = DatasourceFactory.createDatasource(transcript_ds, os.path.dirname(transcript_ds))

    # Context managers guarantee all three files are flushed and closed, even
    # if retrieveExons raises part-way through the gene list.
    with open(input_gene_list_file, 'r') as input_gene_list_file_fp, \
            open(outputFilename, 'w') as outputFileFP, \
            open(outputFilename + ".err", 'w') as errorFileFP:
        for line in input_gene_list_file_fp:
            gene = line.strip()
            exons = ds.retrieveExons(gene, isCodingOnly=(not isNonCoding), padding=int(padding))
            if len(exons) == 0:
                errorFileFP.write("Could not locate " + gene + "\n")
            for e in exons:
                outputFileFP.write('%s\t%s\t%s\t%s\n' % (e[0], e[1], e[2], e[3]))

    print("Done ... " + outputFilename)
def testBasicRefInit(self):
    """Create the sample reference datasource and check the ref_context annotation.

    The datasource is built from the testdata/reference_ds directory and its
    config file, then used to annotate position 22:11.
    """
    expected_context = "CCCAAGCTAAACCCAGGCCAC"
    reference_ds = DatasourceFactory.createDatasource(
        'testdata/reference_ds/reference_ds.config', "testdata/reference_ds")

    mut = MutationDataFactory.default_create()
    mut.chr = "22"
    mut.start = "11"
    mut.end = "11"

    # annotate_mutation hands back the annotated mutation, which replaces mut.
    mut = reference_ds.annotate_mutation(mut)
    self.assertTrue(
        mut['ref_context'] == expected_context,
        "ref_context was not populated properly: " + str(mut['ref_context']))
def testBasicAnnotation(self):
    """Annotate a maflite file using a generic transcript-keyed TSV datasource.

    Only confirms that the expected RefSeq columns appear in the output header.
    """
    # The transcript provider (GAF) datasource runs first so that the
    # transcript annotation exists for the TSV datasource to key on.
    transcript_provider_ds = TestUtils.createTranscriptProviderDatasource(config=self.config)
    transcript_tsv_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(
        MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
    annotator.setOutputRenderer(
        SimpleOutputRenderer("out/genericTranscriptTest.out.tsv"))
    annotator.addDatasource(transcript_provider_ds)
    annotator.addDatasource(transcript_tsv_ds)
    rendered_path = annotator.annotate()

    header_fields = GenericTsvReader(rendered_path).getFieldNames()
    self.assertTrue(
        "refseq_test_mRNA_Id" in header_fields,
        "refseq_test_mRNA_Id not found in headers: " + str(header_fields))
    self.assertTrue(
        "refseq_test_prot_Id" in header_fields,
        "refseq_test_prot_Id not found in headers: " + str(header_fields))
def main():
    """Write the exons of every gene in a gene-list file to a tab-separated file.

    Reads command-line options via parseOptions(), creates the transcript
    datasource from its config file, and for each gene name (one per line of
    the input file) writes one row per retrieved exon to the output file.
    Genes for which no exons are returned are reported in
    "<outputFilename>.err".
    """
    args = parseOptions()
    input_gene_list_file = args.input_gene_list_file
    transcript_ds = args.transcript_ds_config
    outputFilename = args.outputFilename
    isNonCoding = args.includeNonCoding
    padding = args.padding

    ds = DatasourceFactory.createDatasource(transcript_ds, os.path.dirname(transcript_ds))

    # Context managers guarantee all three files are flushed and closed, even
    # if retrieveExons raises part-way through the gene list.
    with open(input_gene_list_file, 'r') as input_gene_list_file_fp, \
            open(outputFilename, 'w') as outputFileFP, \
            open(outputFilename + ".err", 'w') as errorFileFP:
        for line in input_gene_list_file_fp:
            gene = line.strip()
            exons = ds.retrieveExons(gene, isCodingOnly=(not isNonCoding), padding=int(padding))
            if len(exons) == 0:
                errorFileFP.write("Could not locate " + gene + "\n")
            for e in exons:
                outputFileFP.write('%s\t%s\t%s\t%s\n' % (e[0], e[1], e[2], e[3]))

    print("Done ... " + outputFilename)
def testBasicAnnotation(self):
    """Annotate from a basic tsv of genomic positions.

    This tests both single- and multiple-nucleotide variants. The tsv is
    already installed (i.e. proper config file created). Every third output
    line is expected to have a blank ORegAnno id; all other lines must carry
    'OREG0013034'.
    """
    oreganno_field = "ORegAnno_hg19.oreganno.id"
    genome_position_ds = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(
        MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt'))
    annotator.setOutputRenderer(
        SimpleOutputRenderer('out/genericGenomePositionTest.out.tsv'))
    annotator.addDatasource(genome_position_ds)
    rendered_path = annotator.annotate()

    self.assertTrue(os.path.exists(rendered_path))

    # Two overlap, one does not. Repeat...
    for line_number, row in enumerate(GenericTsvReader(rendered_path), 1):
        if line_number % 3 == 0:
            self.assertTrue(
                row[oreganno_field] == '',
                "Line " + str(line_number)
                + " should have had blank value, but did not: "
                + row[oreganno_field])
            continue
        self.assertFalse(
            row[oreganno_field] == '',
            "Line " + str(line_number) + " should not have had blank value, but did.")
        self.assertTrue(
            row[oreganno_field] == 'OREG0013034',
            "Line " + str(line_number) + " did not have correct value: "
            + row[oreganno_field])
def testBasicAnnotation(self):
    """Annotate from a basic tsv gene file.

    Use the Gaf to annotate before trying the tsv -- required since the gene
    annotation must be populated. Using trimmed CancerGeneCensus as basis for
    this test. Checks that the CGC columns are in the output header and that
    every row whose gene is in the trimmed census has a CGC_Abridged_GeneID.
    """
    # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
    genesAvailable = ['ABL1','ABL2','ACSL3','AF15Q14','AF1Q','AF3p21','AF5q31','AKAP9','AKT1','AKT2','ALDH2','ALK','ALO17','APC','ARHGEF12','ARHH','ARID1A','ARID2','ARNT','ASPSCR1','ASXL1','ATF1','ATIC','ATM','ATRX','BAP1','BCL10','BCL11A','BCL11B']

    # We need a gaf data source to annotate gene
    gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    outputFilename = 'out/genericGeneTest.out.tsv'

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    annotator.addDatasource(gafDatasource)
    annotator.addDatasource(geneDS)
    annotator.annotate()

    # Check that there were actual annotations performed.
    tsvReader = GenericTsvReader(outputFilename)
    fields = tsvReader.getFieldNames()
    # The failure message now names the field that is actually checked.
    self.assertTrue('CGC_Abridged_Other Syndrome/Disease' in fields, "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
    self.assertTrue('CGC_Abridged_Mutation Type' in fields, "'CGC_Abridged_Mutation Type' was not present in the header")

    ctr = 1
    linesThatShouldBeAnnotated = 0
    for lineDict in tsvReader:
        self.assertTrue('gene' in lineDict)
        if lineDict['gene'] in genesAvailable:
            # `!=` replaces the deprecated `<>` operator (a syntax error on Python 3).
            self.assertTrue(lineDict['CGC_Abridged_GeneID'] != '', "'CGC_Abridged_GeneID' was missing on a row that should have been populated. Line: " + str(ctr))
            linesThatShouldBeAnnotated = linesThatShouldBeAnnotated + 1
        ctr = ctr + 1

    # Guard against test data in which no row could have been annotated.
    self.assertTrue(linesThatShouldBeAnnotated > 0, "Bad data -- cannot test missed detects.")
def testCreateIndexedTsvDatasource(self):
    """Install an indexed TSV datasource into a temp dir and annotate with it.

    Verifies the generated config file (sections, options and their values),
    the presence of the md5 file, and that only the configured annotation
    columns are added to an annotated mutation. The temporary directory is
    removed even when a check fails.
    """
    datasourceFilename = "testdata/ESP6500SI-V2.chr1.snps_indels.head.25.txt"
    datasourceFoldername = "1000Genomes"
    datasourceName = "1000Genomes"
    datasourceType = "indexed_tsv"
    datasourceVersion = "V4.1"
    genomeBuild = "hg19"
    indexColumnNames = "CHROM,POS,POS"
    columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    annotationColumnNames = "DBSNP,EA_AC,AA_AC,TAC"

    tmpDir = tempfile.mkdtemp()
    # try/finally so the temp directory does not leak when the install step
    # raises or any assertion below fails.
    try:
        destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir=destDir, ds_file=datasourceFilename,
                                                 ds_foldername=datasourceFoldername, ds_name=datasourceName,
                                                 ds_type=datasourceType, ds_version=datasourceVersion,
                                                 index_columns=indexColumnNames,
                                                 ds_annotation_columns=annotationColumnNames)

        # From here on the expected src_file is the name of the installed file.
        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        configFilename = os.path.join(destDir, "1000Genomes.config")
        configParser = ConfigUtils.createConfigParser(configFilename)

        self.assertTrue(configParser.has_section("general"), "general section is missing.")

        # (option name, label used in the failure message, expected value)
        expectedOptions = [
            ("type", "type", datasourceType),
            ("src_file", "src_file", datasourceFilename),
            ("title", "title", datasourceName),
            ("version", "version", datasourceVersion),
            ("column_names", "column names", columnNames),
            ("annotation_column_names", "annotation column names", annotationColumnNames),
        ]
        for option, _, _ in expectedOptions:
            self.assertTrue(configParser.has_option("general", option),
                            "%s option is missing in general section." % option)
        for option, label, expected in expectedOptions:
            actual = configParser.get("general", option)
            self.assertEqual(actual, expected,
                             "Expected data source %s is %s but was %s." % (label, expected, actual))

        self.assertTrue(os.path.exists(os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")),
                        "No md5 file was generated.")

        datasource = DatasourceFactory.createDatasource(configFilename, destDir)

        m1 = MutationDataFactory.default_create()
        m1.chr = "1"
        m1.start = "802177"
        m1.end = "802177"
        m1.ref_allele = "T"
        m1.alt_allele = "C"

        m1_annotated = datasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("1000Genomes_AA_AC")
        cur_annotation = Annotation(value="2,866", datasourceName="1000Genomes", dataType="String",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        # Columns that were not listed as annotation columns must be absent.
        annotationNames = ["1000Genomes_CHROM", "1000Genomes_POS", "1000Genomes_REF", "1000Genomes_ALT",
                           "1000Genomes_GWAS_PUBMED"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName not in m1_annotated,
                            "m1_annotated was annotated with %s." % annotationName)

        # Columns that were listed as annotation columns must be present.
        annotationNames = ["1000Genomes_DBSNP", "1000Genomes_EA_AC", "1000Genomes_AA_AC", "1000Genomes_TAC"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName in m1_annotated,
                            "m1_annotated was not annotated with %s value." % annotationName)
    finally:
        MutUtils.removeDir(tmpDir)
def main():
    """Align GENCODE transcript protein sequences against UniProt sequences.

    For every transcript in the GENCODE datasource that has a protein
    sequence, looks up the UniProt entry name through the UniProt transcript
    TSV, fetches the UniProt record from the parsed Swiss-Prot or TrEMBL
    data, and queues an alignment. Transcripts whose protein sequence (minus
    the trailing stop codon) equals the UniProt sequence are also listed in
    the out_tx_matches file. Alignments run in a pool of four worker
    processes and the results are stored in a Shove database at output_file.
    """
    args = parseOptions()
    uniprot_swiss_fname = expanduser(args.swiss_file)
    uniprot_trembl_fname = expanduser(args.trembl_file)
    output_file = args.output_file
    gencode_ds_loc = expanduser(args.gencode_ds)
    out_tx_matches = args.out_tx_matches

    out_tx_matches_fp = open(out_tx_matches, 'w')
    tpool = None
    # try/finally so the matches file and the worker pool are always released.
    try:
        setup_logging()
        logger = logging.getLogger(__name__)
        uniprot_tsv = expanduser(args.uniprot_tsv)
        blast_exe = args.blast_exe

        gencode_ds = DatasourceFactory.createDatasource(configFilename=gencode_ds_loc,
                                                        leafDir=os.path.dirname(gencode_ds_loc))
        uniprotDS = GenericTranscriptDatasource(src_file=uniprot_tsv, title="UniProt", version="2014_12",
                                                geneColumnName="gene")

        tmp_dir = args.temp_pickle_store
        if tmp_dir is None:
            tmp_dir = mkdtemp(prefix="onco_unipickles_")
        swiss_data = parseWithShove(uniprot_swiss_fname, parse_uniprot_data, tmp_dir)
        trembl_data = parseWithShove(uniprot_trembl_fname, parse_uniprot_data, tmp_dir)
        # NOTE(review): alignmentDB is never explicitly synced or closed here;
        # confirm whether Shove needs that to persist the stored results.
        alignmentDB = Shove("file://" + output_file, "simple://")

        # Go through each transcript
        txs = gencode_ds.getTranscriptDict()
        tx_ids = txs.keys()
        num_tx_ids = len(tx_ids)
        # Sets make the per-transcript membership tests constant time instead
        # of scanning a key list on every iteration.
        swissKeys = set(swiss_data.keys())
        tremblKeys = set(trembl_data.keys())

        uniprotEntryNameKey = 'UniProt_uniprot_entry_name'

        numNotInProteinSeqs = 0
        numTranscriptsNotInUniprot = 0
        ctr = 0
        process_list = []
        tpool = Pool(processes=4)
        for tx_id in tx_ids:
            ctr += 1
            if (ctr % 2000) == 0:
                logger.info(str(ctr) + "/" + str(num_tx_ids))

            tx_protein_seq = txs[tx_id].get_protein_seq()
            # Skip transcripts without a usable protein sequence.
            if tx_protein_seq is None or tx_protein_seq.strip() in ("", "*"):
                numNotInProteinSeqs += 1
                continue

            # Create a fake dummy mutation and annotate the gene and the simple_uniprot info
            m = MutationDataFactory.default_create()
            m.createAnnotation('gene', txs[tx_id].get_gene())
            m.createAnnotation('transcript_id', tx_id)
            m = uniprotDS.annotate_mutation(m)
            uniprot_entry_key = m[uniprotEntryNameKey]
            if uniprot_entry_key in swissKeys:
                uniprot_record = swiss_data[uniprot_entry_key]
            elif uniprot_entry_key in tremblKeys:
                uniprot_record = trembl_data[uniprot_entry_key]
            else:
                numTranscriptsNotInUniprot += 1
                continue
            uniprot_seq = uniprot_record.sequence

            # "/bulk/blast-2.2.26/bin/bl2seq" is blast_exe for Lee's laptop VM

            # When doing the comparison, tx protein includes stop codon at the end, uniprot does not.
            if tx_protein_seq[0:-1] == uniprot_seq:
                out_tx_matches_fp.write(tx_id + "\n")

            process_list.append((tx_id, uniprot_entry_key, tx_protein_seq, uniprot_seq, tmp_dir, blast_exe))

        # Running all of the processes....
        logger.info("Running big block of alignments across multicores (" + str(len(process_list)) + " alignments)")
        alignment_data_tuples = tpool.map(run_alignment_given_tuple, process_list)

        logger.info("Storing results")
        for t in alignment_data_tuples:
            alignmentDB[t[0]] = t[1]

        logger.info("Could not get protein seq for " + str(numNotInProteinSeqs) + " transcripts.")
        logger.info("Could not get uniprot seq for " + str(numTranscriptsNotInUniprot) + " transcripts.")
        logger.info("Attempted " + str(ctr) + " muts")
    finally:
        if tpool is not None:
            # close() stops new work being submitted; join() waits for workers to exit.
            tpool.close()
            tpool.join()
        out_tx_matches_fp.close()
parser = ArgumentParser(description=desc, formatter_class=RawDescriptionHelpFormatter, epilog=epilog) parser.add_argument("gencode_ds_loc", type=str, help="Location of the GENCODE datasource config file -- E.g. /bulk/dbDir/gencode_ds/hg19/gencode_ds.config") parser.add_argument("output_file", type=str, help="TSV filename for output. File will be overwritten if it already exists.") args = parser.parse_args() return args if __name__ == '__main__': args = parseOptions() output_file = expanduser(args.output_file) gencode_ds_loc = expanduser(args.gencode_ds_loc) # Instantiate a gencode datasource gencode_ds = DatasourceFactory.createDatasource(configFilename=gencode_ds_loc, leafDir=os.path.dirname(gencode_ds_loc)) # Get all transcript IDs in the datasource txs = gencode_ds.getTranscriptDict() tx_ids = txs.keys() num_tx_ids = len(tx_ids) # Initialize output file fp = file(output_file, 'w') ctr = 0 for tx_id in tx_ids: ctr += 1 if (ctr % 2000) == 0: print(str(ctr) + "/" + str(num_tx_ids)) fp.flush()
m = MutationData() m.createAnnotation('gene', tDict[transcriptID]['gene']) m.createAnnotation('transcript_id', transcriptID) m = uniprotDS.annotate_mutation(m) yield m if __name__ == '__main__': args = parseOptions() uniprot_swiss_fname = expanduser(args.swiss_file) uniprot_trembl_fname = expanduser(args.trembl_file) output_file = args.output_file gencode_ds_loc = expanduser(args.gencode_ds) uniprot_ds_loc = expanduser(args.simple_uniprot_ds) blast_exe = args.blast_exe gencode_ds = DatasourceFactory.createDatasource(configFilename=gencode_ds_loc, leafDir=os.path.dirname(gencode_ds_loc)) uniprotDS = DatasourceFactory.createDatasource(configFilename=uniprot_ds_loc, leafDir=os.path.dirname(uniprot_ds_loc)) tmp_dir = args.temp_pickle_store if tmp_dir is None: tmp_dir = mkdtemp(prefix="onco_unipickles_") swiss_data = parseWithShove(uniprot_swiss_fname, parse_uniprot_data, tmp_dir) trembl_data = parseWithShove(uniprot_trembl_fname, parse_uniprot_data, tmp_dir) alignmentDB = Shove("file://" + output_file, "simple://") # Go through each transcript txs = gencode_ds.getTranscriptDict() tx_ids = txs.keys() num_tx_ids = len(tx_ids) swissKeys = swiss_data.keys() tremblKeys = trembl_data.keys()