def testInitializingDatasources(self):
    """Create the datasources of the configured db dir in multicore and single-core mode.

    Each run must return a non-empty list whose entries are all Datasource
    instances, and both runs must return the same number of datasources.
    Per the original author, this test is RAM intensive and requires the
    default data corpus.
    """
    multicore_list = DatasourceFactory.createDatasources(
        self.config.get("DEFAULT", "dbDir"), "hg19", isMulticore=True)
    self.assertTrue(multicore_list is not None, "Datasource list was None")
    self.assertTrue(len(multicore_list) != 0, "Datasource list was empty")
    for idx, datasource in enumerate(multicore_list):
        self.assertTrue(datasource is not None, "multi core datasource was None: " + str(idx))
        self.assertTrue(isinstance(datasource, Datasource))

    # Only the count is needed from here on; drop the list before the second
    # (equally memory-hungry) set of datasources is created.
    multicore_count = len(multicore_list)
    del multicore_list

    single_core_list = DatasourceFactory.createDatasources(
        self.config.get("DEFAULT", "dbDir"), "hg19", isMulticore=False)
    self.assertTrue(single_core_list is not None, "Datasource list was None")
    self.assertTrue(len(single_core_list) != 0, "Datasource list was empty")
    for idx, datasource in enumerate(single_core_list):
        self.assertTrue(datasource is not None, "single core datasource was None: " + str(idx))
        self.assertTrue(isinstance(datasource, Datasource))

    self.assertTrue(
        multicore_count == len(single_core_list),
        "Length of single core datasource list was not the same as multicore")
    del single_core_list
def testBasicDatasourceSorting(self):
    """Check that sortDatasources puts the gene TSV datasource after the GAF datasource.

    The two datasources are submitted gene-first; the sorted result must
    have the gene datasource at index 1 and still contain two entries.
    """
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    gene_tsv_ds = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config",
        "testdata/small_tsv_ds/")

    # Deliberately in the wrong order.
    sorted_list = DatasourceFactory.sortDatasources([gene_tsv_ds, transcript_ds])

    self.assertTrue(sorted_list[1] == gene_tsv_ds, "Sorting is incorrect.")
    sorted_count = len(sorted_list)
    self.assertTrue(sorted_count == 2, "Sorting altered number of datasources (gt: 2): " + str(sorted_count))
def testAnnotateListOfMutations(self):
    """Annotate mutations fed directly to an Annotator that has no input or output.

    A RunSpecification is initialized with ``None`` for its first two
    arguments plus the datasources created from the configured db dir.  One
    mutation (chr "1", position 12941796, ref "T", alt "G") is passed to
    ``annotate_mutations`` and the first annotated result must carry a
    non-None ``gene`` annotation.
    """
    # Locate the datasource directory and create a runspec
    dbDir = self.config.get("DEFAULT", "dbDir")
    ds = DatasourceFactory.createDatasources(dbDir)
    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=ds)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    m = MutationData()
    m.chr = "1"
    m.start = "12941796"
    m.end = "12941796"
    m.alt_allele = "G"
    m.ref_allele = "T"

    muts = annotator.annotate_mutations([m])

    # Use the builtin next(): the ``.next()`` method exists only on Python 2
    # iterators, while next() works on Python 2.6+ and Python 3 alike.
    m2 = next(muts)
    self.assertTrue(m2.get("gene", None) is not None)
def testESPCoverageAnnotationWithMissingAnnotationValuesIndelAvgMatch(self):
    """Annotate X:100075350-100075356 from the small_esp_coverage_avg_ds test datasource.

    The resulting ESP_AvgSampleReadDepth annotation must be equal to a Float
    annotation with value "91.25".
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")

    ds_dir = os.path.join("testdata", "small_esp_coverage_avg_ds", "hg19")
    ds_config = os.path.join(ds_dir, "small_esp_coverage_avg_ds.config")
    esp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mutation = MutationData()
    mutation.chr = "X"
    mutation.start = "100075350"
    mutation.end = "100075356"
    annotated = esp_ds.annotate_mutation(mutation)

    observed = annotated.getAnnotation("ESP_AvgSampleReadDepth")
    expected = Annotation(
        value="91.25",
        datasourceName="ESP",
        dataType="Float",
        description="",
        tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
        number=None)
    self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def _create_test_ds(self, input_tsv, dir_name, index_cols):
    """Build a SNP-only LevelDB test datasource from a TSV and load it.

    Any existing ``<dir_name>/test_snp_leveldb`` directory is deleted and
    recreated, the datasource is generated into it, and the datasource is
    then loaded from the config file that was just written.

    Args:
        input_tsv: Path of the TSV file to index (read with
            ``commentPrepend="%"``).
        dir_name: Directory under which the ``test_snp_leveldb`` datasource
            directory is created.
        index_cols: Names of the index columns; these are excluded from the
            annotation columns.

    Returns:
        The object returned by ``DatasourceFactory.createDatasource`` for the
        generated config file.
    """
    base_name = "test_snp_leveldb"
    full_name = dir_name + "/" + base_name
    config_filename = full_name + "/" + base_name + ".config"

    # Start from a clean datasource directory.
    if os.path.exists(full_name):
        shutil.rmtree(full_name)
    os.makedirs(full_name)

    # Every column that is not an index column becomes an annotation column.
    tsv_reader = GenericTsvReader(input_tsv, commentPrepend="%")
    annotation_cols = copy.copy(tsv_reader.getFieldNames())
    for icol in index_cols:
        if icol in annotation_cols:
            annotation_cols.remove(icol)

    ds_creator = SnpOnlyLevelDbDatasourceCreator()
    ds_creator.createDatasource(full_name, input_tsv, ",".join(index_cols), config_filename,
                                "snp_leveldb", base_name, "TEST", "exact", annotation_cols, [])

    # Load the config that was generated above.  This used to be a hard-coded
    # path under "out/test_simple_annotate_snp_only_leveldb", which ignored
    # dir_name; for that directory the path built here is the same string.
    ds = DatasourceFactory.createDatasource(os.path.abspath(config_filename), os.path.dirname(config_filename))
    return ds
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """Read a TCGA MAF through the MAFLITE input creator, annotate it and render a TCGA MAF.

    The output must be non-empty, carry a '#version' comment, contain the two
    new i_TJ_Data_* columns, and have exactly two more lines (rows plus
    comments) than the input.
    """
    inputFilename = "testdata/maf/Patient0.maf.annotated"
    outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"

    input_creator = MafliteInputMutationCreator(inputFilename, 'configs/maflite_input.config')
    renderer = TcgaMafOutputRenderer(outputFilename, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(renderer)
    annotator.addDatasource(DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/"))
    annotator.annotate()

    output_size = os.stat(outputFilename).st_size
    self.assertTrue(output_size > 0, "Generated MAF file (" + outputFilename + ") is empty.")

    reader_in = GenericTsvReader(inputFilename)
    reader_out = GenericTsvReader(outputFilename)

    self.assertTrue(reader_out.getComments().find('#version') != -1, "First line did not specify a version number")
    for new_field in ("i_TJ_Data_Why", "i_TJ_Data_Who"):
        self.assertTrue(new_field in reader_out.getFieldNames(), "New field missing (" + new_field + ") from header")

    # Total lines = data rows + comment lines, for each file.
    ctrOut = sum(1 for _ in reader_out)
    ctrIn = sum(1 for _ in reader_in)
    ctrIn += len(reader_in.getCommentsAsList())
    ctrOut += len(reader_out.getCommentsAsList())

    self.assertTrue(ctrOut == (ctrIn + 2), "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): " + str(ctrIn) + ", " + str(ctrOut))
def testBasicDatasourceSorting(self):
    """Verify the ordering produced by DatasourceFactory.sortDatasources.

    A gene TSV datasource and the GAF (transcript provider) datasource are
    passed in gene-first order.  After sorting, the gene datasource must sit
    in the second slot and the list must still have two elements.
    """
    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")

    unsorted_pair = [gene_ds, gaf_ds]
    result = DatasourceFactory.sortDatasources(unsorted_pair)

    gene_is_second = result[1] == gene_ds
    self.assertTrue(gene_is_second, "Sorting is incorrect.")
    self.assertTrue(
        len(result) == 2,
        "Sorting altered number of datasources (gt: 2): " + str(len(result)))
def test_no_overwriting_muts(self):
    """Annotating must raise DuplicateAnnotationException when overwriting is disallowed.

    The run spec is built with ALLOW_ANNOTATION_OVERWRITING set to False.
    """
    # The input already has a "Who" annotation that this datasource will try to write.
    tj_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")

    options = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False,
        OptionConstants.NO_PREPEND: True,
    }
    spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE",
        "TCGAMAF",
        "testdata/maflite/who_alt1_vs_alt2.maflite",
        "out/who_alt1_vs_alt2.maf.annotated",
        datasource_list=[tj_ds],
        other_opts=options,
    )

    annotator = Annotator()
    annotator.initialize(spec)
    self.assertRaises(DuplicateAnnotationException, annotator.annotate)
def testBasicAnnotation(self):
    """Annotate from a generic transcript-indexed TSV datasource.

    Only the header of the rendered output is inspected: both refseq_test
    columns must be present.
    """
    # The transcript provider (gaf) datasource is added ahead of the TSV datasource.
    transcript_provider = TestUtils.createTranscriptProviderDatasource(config=self.config)
    transcript_tsv_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericTranscriptTest.out.tsv'))
    for datasource in (transcript_provider, transcript_tsv_ds):
        annotator.addDatasource(datasource)

    # The output is read back from the filename returned by annotate().
    rendered_filename = annotator.annotate()
    headers = GenericTsvReader(rendered_filename).getFieldNames()

    for column in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
        self.assertTrue(column in headers, column + " not found in headers: " + str(headers))
def createCosmicDatasource(config):
    """Create the COSMIC datasource located via the given configuration.

    The directory is read from option "CosmicDir" of section "COSMIC"; the
    datasource is created from the "cosmic.config" file inside that
    directory and returned.
    """
    cosmic_dir = config.get("COSMIC", "CosmicDir")
    config_path = cosmic_dir + "/cosmic.config"
    return DatasourceFactory.createDatasource(config_path, cosmic_dir)
def test_overlapping_single_transcripts(self):
    """Query position I:500 of the saccer ensembl test datasource.

    Exactly one overlapping transcript is expected and its gene must be
    'YAL069W'.
    """
    ds_dir = "testdata/ensembl/saccer/"
    saccer_ds = DatasourceFactory.createDatasource(ds_dir + "ensembl.config", ds_dir)

    overlapping = saccer_ds.get_overlapping_transcripts("I", "500", "500")

    self.assertTrue(len(overlapping) == 1)
    only_transcript = overlapping[0]
    self.assertTrue(only_transcript.get_gene() == 'YAL069W')
def testBasicAnnotation(self):
    """Annotate from a basic TSV of genomic positions.

    Per the original author this covers single- and multiple-nucleotide
    variants, and the TSV is already installed (its config file exists).
    In the output every third line must have a blank ORegAnno id; all other
    lines must carry 'OREG0013034'.
    """
    id_column = "ORegAnno_hg19.oreganno.id"

    position_ds = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericGenomePositionTest.out.tsv'))
    annotator.addDatasource(position_ds)
    rendered_filename = annotator.annotate()

    # Make sure that some values were populated
    self.assertTrue(os.path.exists(rendered_filename))

    # Two overlap, one does not. Repeat...
    for line_no, row in enumerate(GenericTsvReader(rendered_filename), 1):
        if line_no % 3 == 0:
            self.assertTrue(row[id_column] == '', "Line " + str(line_no) + " should have had blank value, but did not: " + row[id_column])
            continue
        self.assertFalse(row[id_column] == '', "Line " + str(line_no) + " should not have had blank value, but did.")
        self.assertTrue(row[id_column] == 'OREG0013034', "Line " + str(line_no) + " did not have correct value: " + row[id_column])
def test_overwriting_muts(self):
    """Annotating must succeed when the datasource may overwrite an existing annotation.

    The run spec is built with ALLOW_ANNOTATION_OVERWRITING set to True; no
    output row may have a TJ_Data_Who value of "Tromokratis".
    """
    # The input already has a "Who" annotation that this datasource will try to write.
    tj_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")
    rendered_path = "out/who_alt1_vs_alt2.maf.annotated"

    options = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: True,
    }
    spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE",
        "TCGAMAF",
        "testdata/maflite/who_alt1_vs_alt2.maflite",
        rendered_path,
        datasource_list=[tj_ds],
        other_opts=options,
    )

    annotator = Annotator()
    annotator.initialize(spec)
    annotator.annotate()

    for row in GenericTsvReader(rendered_path):
        who = row.get("TJ_Data_Who", "")
        self.assertTrue(who != "Tromokratis")
def testCreationAndAnnotation(self):
    """Create the simple_uniprot_natvar datasource and run a simple annotation with it.

    The output must contain exactly one mutation, whose natural-variation
    annotation holds the two expected "|"-separated entries in any order.
    """
    column = "UniProt_NatVar_natural_variations"
    expected_entries = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))

    transcript_provider = TestUtils.createTranscriptProviderDatasource(self.config)
    natvar_ds = DatasourceFactory.createDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.config",
        "testdata/simple_uniprot_natvar/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericGeneProteinPositionTest.out.tsv'))
    annotator.addDatasource(transcript_provider)
    annotator.addDatasource(natvar_ds)
    rendered_filename = annotator.annotate()

    # Make sure that some values were populated
    self.assertTrue(os.path.exists(rendered_filename))

    row_count = 0
    for row in GenericTsvReader(rendered_filename):
        observed_entries = sorted(row[column].split("|"))
        self.assertTrue(observed_entries == expected_entries, "Annotation value did not match: " + row[column])
        row_count += 1

    self.assertTrue(row_count == 1, "Number of mutations incorrect (1): " + str(row_count))
def testMulticoreNoDatasources(self):
    """Multicore datasource creation on a directory without datasources returns an empty list.

    Per the original author, the point is that this call does not hang.
    """
    datasources = DatasourceFactory.createDatasources('testdata/maflite/', "hg19", True)
    found = len(datasources)
    self.assertTrue(found == 0, "Length of multiDS when there were no datasources was not zero.")
def test_overwriting_muts(self):
    """With overwriting allowed, a datasource may replace an annotation given in the input.

    After annotating who_alt1_vs_alt2.maflite into a TCGA MAF, no output row
    may report 'TJ_Data_Who' as "Tromokratis".
    """
    # The input comes with a "Who" annotation that this datasource will try to write.
    ds_location = "testdata/thaga_janakari_gene_ds/hg19/"
    datasource = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", ds_location)

    source_path = "testdata/maflite/who_alt1_vs_alt2.maflite"
    target_path = "out/who_alt1_vs_alt2.maf.annotated"
    overwrite_allowed = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: True
    }

    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE", "TCGAMAF", source_path, target_path,
        datasource_list=[datasource], other_opts=overwrite_allowed)

    annotator = Annotator()
    annotator.initialize(run_spec)
    annotator.annotate()

    reader = GenericTsvReader(target_path)
    for output_row in reader:
        self.assertTrue(output_row.get('TJ_Data_Who', "") != "Tromokratis")
def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
    """Annotate 1:35140 (no ref/alt alleles set) from the exact-match, no-ref-alt dbNSFP test datasource.

    Three annotations are compared against expected String annotations whose
    values are "|"-joined triples.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mutation = MutationData()
    mutation.chr = "1"
    mutation.start = "35140"
    mutation.end = "35140"
    annotated = dbnsfp_ds.annotate_mutation(mutation)

    # (annotation name, expected value)
    expectations = (
        ("dbNSFP_codonpos", "1|1|1"),
        ("dbNSFP_refcodon", "TAA|TAA|TAA"),
        ("dbNSFP_cds_strand", "-|-|-"),
    )
    for annotation_name, expected_value in expectations:
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(value=expected_value, datasourceName="dbNSFP", dataType="String", description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def test_no_overwriting_muts(self):
    """With overwriting disallowed, annotating over an input-supplied value must fail.

    annotate() is expected to raise DuplicateAnnotationException.
    """
    # The input comes with a "Who" annotation that this datasource will try to write.
    ds_location = "testdata/thaga_janakari_gene_ds/hg19/"
    datasource = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", ds_location)

    source_path = "testdata/maflite/who_alt1_vs_alt2.maflite"
    target_path = "out/who_alt1_vs_alt2.maf.annotated"
    overwrite_forbidden = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False,
        OptionConstants.NO_PREPEND: True
    }

    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE", "TCGAMAF", source_path, target_path,
        datasource_list=[datasource], other_opts=overwrite_forbidden)

    annotator = Annotator()
    annotator.initialize(run_spec)
    with self.assertRaises(DuplicateAnnotationException):
        annotator.annotate()
def testdbNSFPAnnotationWithMissingOverlapMatch(self):
    """Annotate 1:35136-35137 from the overlap-match dbNSFP test datasource (SNPs only).

    All three checked annotations are expected to be String annotations with
    an empty value.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_chr3_100vars_overlap_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_chr3_100vars_overlap_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mutation = MutationDataFactory.default_create()
    mutation.chr = "1"
    mutation.start = "35136"
    mutation.end = "35137"
    annotated = dbnsfp_ds.annotate_mutation(mutation)

    for annotation_name in ("dbNSFP_codonpos", "dbNSFP_refcodon", "dbNSFP_cds_strand"):
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(value="", datasourceName="dbNSFP", dataType="String", description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def testESPCoverageAnnotationWithSNPAvgMatch(self):
    """Annotate the single position X:100075334 from the small_esp_coverage_avg_ds test datasource.

    Two Float annotations and one String annotation are compared against
    their expected values.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_avg_ds", "hg19")
    ds_config = os.path.join(ds_dir, "small_esp_coverage_avg_ds.config")
    esp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mutation = MutationData()
    mutation.chr = "X"
    mutation.start = "100075334"
    mutation.end = "100075334"
    annotated = esp_ds.annotate_mutation(mutation)

    # (annotation name, expected value, expected data type)
    expectations = (
        ("ESP_AvgAAsampleReadDepth", "75.0", "Float"),
        ("ESP_TotalAAsamplesCovered", "692.0", "Float"),
        ("ESP_Chromosome", "X", "String"),
    )
    for annotation_name, expected_value, expected_type in expectations:
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(value=expected_value, datasourceName="ESP", dataType=expected_type, description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def testExampleVcfDBAnnotationWithSNPExactMatch(self):
    """Annotate two SNPs on chromosome 20 from the vcf_db_exact test datasource.

    For 20:1110696 A>T the ESP_AF, ESP_AC and ESP_H2 annotations are checked;
    for 20:1230237 T>A the ESP_NS annotation and an empty ESP_AF are checked.
    """
    ds_dir = os.path.join("testdata", "vcf_db_exact", "hg19")
    vcf_ds = DatasourceFactory.createDatasource(os.path.join(ds_dir, "vcf_db_exact.config"), ds_dir)

    # First mutation: 20:1110696 A>T
    first = MutUtils.initializeMutFromAttributes("20", "1110696", "1110696", "A", "T", "hg19")
    first_annotated = vcf_ds.annotate_mutation(first)
    first_expectations = [
        ("ESP_AF", Annotation(value="0.667", datasourceName="ESP", dataType="Float",
                              description="Allele Frequency",
                              tags=[TagConstants.INFO, TagConstants.SPLIT], number=-1)),
        ("ESP_AC", Annotation(value="2,4", datasourceName="ESP", dataType="Integer",
                              description="Allele Count",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)),
        ("ESP_H2", Annotation(value="False", datasourceName="ESP", dataType="Flag",
                              description="HapMap2 membership",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=0)),
    ]
    for annotation_name, expected in first_expectations:
        observed = first_annotated.getAnnotation(annotation_name)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")

    # Second mutation: 20:1230237 T>A
    second = MutUtils.initializeMutFromAttributes("20", "1230237", "1230237", "T", "A", "hg19")
    second_annotated = vcf_ds.annotate_mutation(second)
    second_expectations = [
        ("ESP_NS", Annotation(value="3", datasourceName="ESP", dataType="Integer",
                              description="Number of Samples With Data",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=1)),
        ("ESP_AF", Annotation(value="", datasourceName="ESP", dataType="Float",
                              description="Allele Frequency",
                              tags=[TagConstants.INFO, TagConstants.SPLIT], number=-1)),
    ]
    for annotation_name, expected in second_expectations:
        observed = second_annotated.getAnnotation(annotation_name)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def test_hashcode_generation(self):
    """Check the hashcode reported by the thaga_janakari gene test datasource.

    The datasource must be created and get_hashcode() must return the
    expected hex string.
    """
    ds_dir = "testdata/thaga_janakari_gene_ds/hg19/"
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", ds_dir)

    self.assertTrue(gene_ds is not None, "gene indexed datasource was None.")

    observed_hash = gene_ds.get_hashcode()
    self.assertTrue(observed_hash == "7120edfdc7b29e45191c81c99894afd5")
def testBasicGeneTSVInit(self):
    """Make sure that a simple gene-indexed TSV datasource can be initialized and used.

    The small_tsv_ds test datasource is created, a default mutation with a
    ``gene`` annotation of "ABL1" is annotated by it, and the resulting
    CGC_Abridged_Name value is checked.
    """
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")

    # Identity check against None.  The former ``<>`` operator exists only in
    # Python 2 and would compare by (possibly overridden) equality.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationDataFactory.default_create()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1", "Test gene TSV datasource did not annotate properly.")
def testBasicAnnotation(self):
    """Annotate from a basic gene-indexed TSV file.

    The GAF (transcript provider) datasource is added before the TSV
    datasource -- per the original author this is required because the gene
    annotation must be populated first.  A trimmed CancerGeneCensus serves as
    the basis of the test.

    The output header must contain two CGC_Abridged columns, every row must
    have a 'gene' field, and every row whose gene is in ``genesAvailable``
    must have a non-empty CGC_Abridged_GeneID.  At least one such row must
    exist.
    """
    # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
    genesAvailable = [
        'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
        'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC', 'ARHGEF12',
        'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1', 'ATF1', 'ATIC',
        'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B'
    ]

    # We need a gaf data source to annotate gene
    gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    outputFilename = 'out/genericGeneTest.out.tsv'

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    annotator.addDatasource(gafDatasource)
    annotator.addDatasource(geneDS)
    annotator.annotate()

    # Check that there were actual annotations performed.
    tsvReader = GenericTsvReader(outputFilename)
    fields = tsvReader.getFieldNames()
    # The failure message now names the column that is actually checked.
    self.assertTrue(
        'CGC_Abridged_Other Syndrome/Disease' in fields,
        "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
    self.assertTrue(
        'CGC_Abridged_Mutation Type' in fields,
        "'CGC_Abridged_Mutation Type' was not present in the header")

    ctr = 1
    linesThatShouldBeAnnotated = 0
    for lineDict in tsvReader:
        self.assertTrue('gene' in lineDict)
        if lineDict['gene'] in genesAvailable:
            # Message kept on one physical line: a plain string literal
            # cannot contain a raw line break.
            self.assertTrue(
                lineDict['CGC_Abridged_GeneID'] != '',
                "'CGC_Abridged_GeneID' was missing on a row that should have been populated. Line: " + str(ctr))
            linesThatShouldBeAnnotated += 1
        ctr += 1

    # Guard against input data in which no row could exercise the check above.
    self.assertTrue(linesThatShouldBeAnnotated > 0, "Bad data -- cannot test missed detects.")
def test_simple_transcript_annotation(self):
    """Retrieve transcript ENST00000215832.6 by id, as the web API backend call /transcript/ would.

    The transcript must be found and its gene must be "MAPK1".
    """
    # http://www.broadinstitute.org/oncotator/transcript/ENST00000215832.6/
    db_dir = self._determine_db_dir()
    annotator = Annotator()
    for datasource in DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False):
        annotator.addDatasource(datasource)

    transcript = annotator.retrieve_transcript_by_id("ENST00000215832.6")

    self.assertTrue(transcript is not None)
    self.assertTrue(transcript.get_gene() == "MAPK1")
def test_querying_transcripts_by_genes(self):
    """Retrieve the transcripts of the genes MAPK1 and PIK3CA.

    More than three transcripts are expected in total.
    """
    db_dir = self._determine_db_dir()
    annotator = Annotator()
    for datasource in DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False):
        annotator.addDatasource(datasource)

    # Step 1 get all of the relevant transcripts
    genes = ["MAPK1", "PIK3CA"]
    transcripts = annotator.retrieve_transcripts_by_genes(genes)

    self.assertTrue(len(transcripts) > 3)
def testAnnotationSourceIsPopulated(self):
    """Test that the annotation source is not blank for the example TSV datasource.

    A mutation with a ``gene`` annotation of "ABL1" is annotated by the
    small_tsv_ds test datasource; the datasource recorded on the resulting
    CGC_Abridged_Name annotation must be neither "Unknown" nor blank.
    """
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")

    # ``is not None`` / ``!=`` replace the ``<>`` operator, which exists only
    # in Python 2.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1", "Test gene TSV datasource did not annotate properly.")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource() != "Unknown", "Annotation source was unknown")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource().strip() != "", "Annotation source was blank")
def testDatasourceCreator(self):
    """Create the small_uniprot_prot_seq_ds datasource and annotate one mutation with it.

    Per the original author this exercises the creation process for
    TranscriptToUniProtProteinPositionTransformingDatasource.  A mutation on
    transcript uc009vvt.1 with protein change p.T1105A must receive a
    UniProt_aapos of "969".

    NOTE: This test needs to be updated to use sqlite instead of filesystem file.
    """
    uniprot_ds = DatasourceFactory.createDatasource(
        "testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config",
        "testdata/small_uniprot_prot_seq_ds/")

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc009vvt.1')
    mutation.createAnnotation('protein_change', 'p.T1105A')
    mutation = uniprot_ds.annotate_mutation(mutation)

    result_key = "UniProt_aapos"
    self.assertTrue(mutation[result_key] == "969", "Did not get proper value (969): " + mutation[result_key])
def test_overlapping_multiple_transcripts_snp(self):
    """Query position I:550 of the saccer ensembl test datasource.

    Two overlapping transcripts are expected, and every returned transcript
    id must be one of 'YAL069W' and 'YAL068W-A'.
    """
    ds_dir = "testdata/ensembl/saccer/"
    saccer_ds = DatasourceFactory.createDatasource(ds_dir + "ensembl.config", ds_dir)

    overlapping = saccer_ds.get_overlapping_transcripts("I", "550", "550")
    self.assertTrue(len(overlapping) == 2)

    found_ids = set(rec.get_transcript_id() for rec in overlapping)
    unexpected_ids = found_ids - {'YAL069W', 'YAL068W-A'}
    self.assertTrue(len(unexpected_ids) == 0)
def test_overlapping_multiple_transcripts_snp(self):
    """Look up the transcripts overlapping I:550 in the saccer ensembl test datasource.

    The lookup must return two records whose transcript ids all belong to
    the set of 'YAL069W' and 'YAL068W-A'.
    """
    config_dir = "testdata/ensembl/saccer/"
    datasource = DatasourceFactory.createDatasource(config_dir + "ensembl.config", config_dir)

    records = datasource.get_overlapping_transcripts("I", "550", "550")
    record_count = len(records)
    self.assertTrue(record_count == 2)

    allowed_ids = set(['YAL069W', 'YAL068W-A'])
    transcript_ids = set()
    for record in records:
        transcript_ids.add(record.get_transcript_id())
    leftover = transcript_ids - allowed_ids
    self.assertTrue(len(leftover) == 0)
def testdbNSFPAnnotationWithMissingExactMatch(self):
    """Annotate 1:35138 T>C from the dbNSFP_chr1_6vars_exact_ds test datasource (SNPs only).

    The three checked annotations must have an empty value; their expected
    data types are Integer, String and Float respectively.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_6vars_exact_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_6vars_exact_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mutation = MutationData()
    mutation.chr = "1"
    mutation.start = "35138"
    mutation.end = "35138"
    mutation.ref_allele = "T"
    mutation.alt_allele = "C"
    annotated = dbnsfp_ds.annotate_mutation(mutation)

    # (annotation name, expected data type); the expected value is always empty.
    expectations = (
        ("dbNSFP_codonpos", "Integer"),
        ("dbNSFP_refcodon", "String"),
        ("dbNSFP_cds_strand", "Float"),
    )
    for annotation_name, expected_type in expectations:
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(
            value="",
            datasourceName="dbNSFP",
            dataType=expected_type,
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
    """Check dbNSFP annotations for 1:35140 on a default mutation with no alleles assigned.

    The exact-match, no-ref-alt dbNSFP test datasource must yield String
    annotations with "|"-joined values for codon position, reference codon
    and CDS strand.
    """
    self.logger.info("Initializing dbNSFP")
    directory = os.path.join(
        "testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19")
    datasource = DatasourceFactory.createDatasource(
        os.path.join(directory, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config"),
        directory)

    query = MutationDataFactory.default_create()
    query.chr = "1"
    query.start = "35140"
    query.end = "35140"
    result = datasource.annotate_mutation(query)

    expected_by_name = [
        ("dbNSFP_codonpos", "1|1|1"),
        ("dbNSFP_refcodon", "TAA|TAA|TAA"),
        ("dbNSFP_cds_strand", "-|-|-"),
    ]
    for name, value in expected_by_name:
        actual = result.getAnnotation(name)
        wanted = Annotation(
            value=value,
            datasourceName="dbNSFP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(actual.isEqual(wanted), "Annotations do not match.")
def test_simple_genes_by_gene_annotation(self):
    """Annotate the gene MAPK1 from its transcripts, as the web API backend call /gene/ would.

    Transcripts must be found for MAPK1 and annotating genes from them must
    produce exactly one entry.
    """
    # http://www.broadinstitute.org/oncotator/gene/MAPK1/
    db_dir = self._determine_db_dir()
    annotator = Annotator()
    for datasource in DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False):
        annotator.addDatasource(datasource)

    transcripts = annotator.retrieve_transcripts_by_genes(["MAPK1"])
    self.assertTranscriptsFound(transcripts)

    genes_to_muts = annotator.annotate_genes_given_txs(transcripts)
    gene_count = len(genes_to_muts.keys())
    self.assertTrue(gene_count == 1)
def testESPCoverageAnnotationWithMissingIndelOverlapMatch(self):
    """Annotate X:100075300-100075336 from the small_esp_coverage_overlap_ds test datasource.

    Each of the three checked annotations is expected to be a String
    annotation holding three "|"-joined values.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_overlap_ds", "hg19")
    ds_config = os.path.join(ds_dir, "small_esp_coverage_overlap_ds.config")
    esp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mutation = MutationData()
    mutation.chr = "X"
    mutation.start = "100075300"
    mutation.end = "100075336"
    annotated = esp_ds.annotate_mutation(mutation)

    # (annotation name, expected value)
    expectations = (
        ("ESP_AvgAAsampleReadDepth", "75.0|81.0|81.0"),
        ("ESP_TotalAAsamplesCovered", "692|692|692"),
        ("ESP_Chromosome", "X|X|X"),
    )
    for annotation_name, expected_value in expectations:
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(
            value=expected_value,
            datasourceName="ESP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def testBasicCosmicInit(self):
    """Create the COSMIC datasource from the sample directory and annotate one position.

    The sample directory conforms to the standard datasource structure,
    including placement of the config file.
    """
    cosmic_config = 'testdata/small_cosmic/small_cosmic.config'
    cosmic_ds = DatasourceFactory.createDatasource(cosmic_config, "testdata/small_cosmic")

    mut = MutationDataFactory.default_create()
    mut.chr = 19
    mut.start = 58858921
    mut.end = 58858921

    annotated = cosmic_ds.annotate_mutation(mut)
    aa_key = 'COSMIC_overlapping_mutation_AAs'
    self.assertTrue(
        annotated[aa_key] == 'p.P426P(1)',
        "Did not properly annotate mutation: " + annotated[aa_key])
def testInitializingDatasources(self):
    """Initialize the database dir in multicore mode, then in single-core mode.

    Both modes must return a non-empty list of Datasource instances, and both
    lists must have the same length.  This test is RAM intensive and requires
    the default data corpus.
    """
    multicore = DatasourceFactory.createDatasources(
        self.config.get("DEFAULT", "dbDir"), "hg19", isMulticore=True)
    self.assertTrue(multicore is not None, "Datasource list was None")
    self.assertTrue(len(multicore) != 0, "Datasource list was empty")
    # Index-based on purpose: no loop variable is left holding a datasource,
    # so the `del` below drops every reference this test holds.
    for idx in range(len(multicore)):
        self.assertTrue(multicore[idx] is not None,
                        "multi core datasource was None: " + str(idx))
        self.assertTrue(isinstance(multicore[idx], Datasource))

    # This test can be memory intensive, so remember the count and release
    # the multicore list before the single-core one is built.
    numMultiDS = len(multicore)
    del multicore

    single_core = DatasourceFactory.createDatasources(
        self.config.get("DEFAULT", "dbDir"), "hg19", isMulticore=False)
    self.assertTrue(single_core is not None, "Datasource list was None")
    self.assertTrue(len(single_core) != 0, "Datasource list was empty")
    for idx in range(len(single_core)):
        self.assertTrue(single_core[idx] is not None,
                        "single core datasource was None: " + str(idx))
        self.assertTrue(isinstance(single_core[idx], Datasource))

    self.assertTrue(
        numMultiDS == len(single_core),
        "Length of single core datasource list was not the same as multicore")
    del single_core
def test_simple_genes_by_region_annotation(self):
    """Test web api backend call /genes/ for a genomic region.

    Retrieves the transcripts overlapping chr22:22,112,223-22,312,558 (two
    genes), annotates them per gene and reads each field that the API
    response is assembled from.  Most fields are only read, not validated;
    a handful are asserted explicitly.
    """
    # http://www.broadinstitute.org/oncotator/genes/chr22_22112223_22312558/
    # Two genes: chr22:22,112,223-22,312,558
    datasource_list = DatasourceFactory.createDatasources(
        self._determine_db_dir(), "hg19", isMulticore=False)
    annotator = Annotator()
    for ds in datasource_list:
        annotator.addDatasource(ds)

    # Here is what the API would call....
    txs = annotator.retrieve_transcripts_by_region("22", 22112223, 22312558)
    self.assertTranscriptsFound(txs)

    mut_dict = annotator.annotate_genes_given_txs(txs)

    # Each mut will be for a separate gene
    for gene in mut_dict.keys():
        mut = mut_dict[gene]
        alt_accessions = mut["UniProt_alt_uniprot_accessions"].split("|")
        tcgascape_amp_peaks = mut["TCGAScape_Amplification_Peaks"].split("|")
        tcgascape_del_peaks = mut["TCGAScape_Deletion_Peaks"].split("|")
        tumorscape_amp_peaks = mut["TUMORScape_Amplification_Peaks"].split("|")
        tumorscape_del_peaks = mut["TUMORScape_Deletion_Peaks"].split("|")
        full_name = mut["HGNC_Approved Name"]
        # NOTE(review): both entries read the same annotation; the second one
        # presumably should read a total-alterations field -- confirm the
        # exact annotation name against the COSMIC_Tissue datasource.
        cosmic = {
            "tissue_types_affected": mut["COSMIC_Tissue_tissue_types_affected"],
            "total_alterations_in_gene": mut["COSMIC_Tissue_tissue_types_affected"],
        }
        # chain() takes the iterables as separate arguments.  Passing one list
        # that wraps both (as was done before) yields the two sub-lists
        # themselves instead of a flat list of aliases.
        alt_aliases = list(itertools.chain(
            mut["HGNC_Previous Symbols"].split(", "),
            mut["HGNC_Synonyms"].split(", ")))
        location = mut["HGNC_Chromosome"]
        uniprot_accession = mut["UniProt_uniprot_accession"]

        transcripts = mut["transcripts"]
        self.assertTrue(transcripts is not None)
        self.assertTrue(len(transcripts) > 0)
        self.assertTrue(transcripts.startswith("ENST"))

        strand = mut["strand"]
        klass = mut["class"]
        uniprot_experimentals = mut["UniProt_AA_experimental_info"].split("|")
        self.assertTrue(uniprot_experimentals is not None)
        uniprot_natural_variations = mut["UniProt_AA_natural_variation"].split("|")
        uniprot_regions = mut["UniProt_AA_region"].split("|")
        uniprot_sites = mut["UniProt_AA_site"].split("|")
        uniprot_go_biological_processes = mut["UniProt_GO_Biological_Process"].split("|")
        uniprot_go_cellular_components = mut["UniProt_GO_Cellular_Component"].split("|")
        self.assertTrue(uniprot_go_cellular_components is not None)
        uniprot_go_molecular_functions = mut["UniProt_GO_Molecular_Function"].split("|")
def testBasicGeneTSVInit(self):
    """Make sure that we can initialize a simple gene-indexed tsv datasource.

    Creates the datasource from testdata/small_tsv_ds, annotates a mutation
    whose 'gene' annotation is ABL1 and checks the resulting
    CGC_Abridged_Name value.
    """
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config",
        "testdata/small_tsv_ds/")
    # Identity check instead of the `<>` operator: `<>` was removed in
    # Python 3, and None should be compared by identity rather than equality.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationDataFactory.default_create()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(
        m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
        "Test gene TSV datasource did not annotate properly.")
def testTags(self):
    """Check the tags that the small ESP datasource determines for each ID.

    Every ID must carry exactly two tags: INFO and NOT_SPLIT.
    """
    self.logger.info("Initializing ESP6500SI-V2")
    esp_dir = os.path.join("testdata", "small_esp_ds")
    esp_config = os.path.join(esp_dir, "esp.config")
    esp_ds = DatasourceFactory.createDatasource(esp_config, esp_dir)

    tags_by_id = esp_ds._determine_tags()
    for field_id in tags_by_id:
        field_tags = tags_by_id[field_id]
        self.assertTrue(len(field_tags) == 2,
                        "The length of tags is not 2 but %s." % len(field_tags))
        self.assertTrue(TagConstants.INFO in field_tags,
                        "INFO tag is missing for %s." % field_id)
        self.assertTrue(TagConstants.NOT_SPLIT in field_tags,
                        "NOT_SPLIT tag is missing for %s." % field_id)
def testDatasourceCreator(self):
    """Test that the datasource creator process works for v1 of
    TranscriptToUniProtProteinPositionTransformingDatasource.

    A mutation carrying transcript uc009vvt.1 and protein change p.T1105A
    must come back with UniProt_aapos set to "969".

    NOTE: This test needs to be updated to use sqlite instead of filesystem file.
    """
    ds_config = "testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config"
    ds_dir = "testdata/small_uniprot_prot_seq_ds/"
    uniprot_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)
    result_key = "UniProt_aapos"

    mut = MutationData()
    for annotation_name, annotation_value in (('transcript_id', 'uc009vvt.1'),
                                              ('protein_change', 'p.T1105A')):
        mut.createAnnotation(annotation_name, annotation_value)

    annotated = uniprot_ds.annotate_mutation(mut)
    self.assertTrue(
        annotated[result_key] == "969",
        "Did not get proper value (969): " + annotated[result_key])
def testSimpleAnnotation(self):
    """Annotate a dummy mutation by transcript id and check the refseq fields.

    The query transcript id (uc001hms.3) includes a version suffix.
    """
    mut = MutationData()
    mut.createAnnotation("transcript_id", "uc001hms.3")

    ds_config = "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config"
    ds_dir = "testdata/small_transcript_tsv_ds/"
    transcript_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)
    annotated = transcript_ds.annotate_mutation(mut)

    # (annotation name, expected value), checked in this order.
    expectations = (
        ("refseq_test_mRNA_Id", "NM_022746"),
        ("refseq_test_prot_Id", "NP_073583"),
    )
    for field, wanted in expectations:
        self.assertTrue(
            annotated[field] == wanted,
            "Transcript-based annotation did not populate properly: " + annotated[field],
        )
def test_simple_annotation_with_version_number_in_data_but_not_query(self):
    """Annotate by transcript id when only the datasource carries a version number.

    The query uses the unversioned id uc001hms and must still pick up the
    refseq annotations.
    """
    mut = MutationDataFactory.default_create()
    mut.createAnnotation("transcript_id", "uc001hms")

    ds_config = "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config"
    ds_dir = "testdata/small_transcript_tsv_ds/"
    transcript_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)
    annotated = transcript_ds.annotate_mutation(mut)

    # (annotation name, expected value), checked in this order.
    expectations = (
        ("refseq_test_mRNA_Id", "NM_022746"),
        ("refseq_test_prot_Id", "NP_073583"),
    )
    for field, wanted in expectations:
        self.assertTrue(
            annotated[field] == wanted,
            "Transcript-based annotation did not populate properly: " + annotated[field],
        )
def testSimpleAnnotation(self):
    """Annotate a dummy mutation by transcript id and check the refseq fields.

    The query transcript id (uc001hms.3) includes a version suffix.
    """
    transcript_mut = MutationData()
    transcript_mut.createAnnotation('transcript_id', 'uc001hms.3')

    config_path = "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config"
    data_dir = "testdata/small_transcript_tsv_ds/"
    transcript_ds = DatasourceFactory.createDatasource(config_path, data_dir)
    result = transcript_ds.annotate_mutation(transcript_mut)

    failure_prefix = "Transcript-based annotation did not populate properly: "
    for key, expected_id in (('refseq_test_mRNA_Id', 'NM_022746'),
                             ('refseq_test_prot_Id', 'NP_073583')):
        self.assertTrue(result[key] == expected_id, failure_prefix + result[key])
def test_simple_annotation_without_version_number_in_data_nor_query(self):
    """Annotate by transcript id when neither the query nor the data is versioned.

    The query uses the unversioned transcript id uc001hms against the
    small_transcript_tsv_ds_no_version_number test datasource, and the refseq
    annotations must still be populated.  (The previous docstring said the
    query carried a version number, which contradicted both the method name
    and the id actually used.)
    """
    m = MutationDataFactory.default_create()
    m.createAnnotation('transcript_id', 'uc001hms')
    transcriptDS = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds_no_version_number/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds_no_version_number/")
    m = transcriptDS.annotate_mutation(m)
    self.assertTrue(
        m['refseq_test_mRNA_Id'] == 'NM_022746',
        "Transcript-based annotation did not populate properly: " + m['refseq_test_mRNA_Id'])
    self.assertTrue(
        m['refseq_test_prot_Id'] == 'NP_073583',
        "Transcript-based annotation did not populate properly: " + m['refseq_test_prot_Id'])
def testDoubleAnnotationError(self):
    """Regression test for a duplicate annotation exception.

    Annotating a maf file that used to trigger that exception must now
    complete without raising it (or anything else) and produce an output file.
    """
    out_path = 'out/genericGenomePositionDoubleAnnotationTest.out.tsv'
    position_ds = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    input_creator = MafliteInputMutationCreator('testdata/maflite/testDoubleAnnotate.maf.tsv')
    annotator.setInputCreator(input_creator)
    output_renderer = SimpleOutputRenderer(out_path)
    annotator.setOutputRenderer(output_renderer)
    annotator.addDatasource(position_ds)

    rendered_path = annotator.annotate()

    # Reaching this point means nothing was raised; only check that the
    # rendered file exists.
    self.assertTrue(os.path.exists(rendered_path))
def testExampleVcfDBAnnotationWithMissingIndelExactMatch(self):
    """Annotate an indel (21:1234567 AGTC>A) with the vcf_db_exact datasource.

    Every checked annotation is expected to carry an empty value, while its
    data type, description, tags and number must still be populated.
    """
    ds_dir = os.path.join("testdata", "vcf_db_exact", "hg19")
    ds_config = os.path.join(ds_dir, "vcf_db_exact.config")
    vcf_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutUtils.initializeMutFromAttributes("21", "1234567", "1234567", "AGTC", "A", "hg19")
    annotated = vcf_ds.annotate_mutation(mut)

    # (annotation name, data type, description, split tag, number)
    expected_rows = (
        ("ESP_AF", "Float", "Allele Frequency", TagConstants.SPLIT, -1),
        ("ESP_X", "String", "A random variable, X", TagConstants.NOT_SPLIT, 2),
        ("ESP_H2", "Flag", "HapMap2 membership", TagConstants.NOT_SPLIT, 0),
        ("ESP_Y", "String", "A random variable, Y", TagConstants.NOT_SPLIT, -2),
        ("ESP_Z", "Float", "A random variable, Z", TagConstants.NOT_SPLIT, 3),
    )
    for name, data_type, description, split_tag, number in expected_rows:
        observed = annotated.getAnnotation(name)
        expected = Annotation(
            value="",
            datasourceName="ESP",
            dataType=data_type,
            description=description,
            tags=[TagConstants.INFO, split_tag],
            number=number)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def testExampleVcfDBAnnotationWithIndelAvgMatch(self):
    """Annotate an indel (4:1234567 GTC>GTCTTA) with the vcf_db_avg datasource.

    Checks value, data type, description, tags and number for five ESP
    annotations.
    """
    ds_dir = os.path.join("testdata", "vcf_db_avg", "hg19")
    ds_config = os.path.join(ds_dir, "vcf_db_avg.config")
    vcf_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    mut = MutUtils.initializeMutFromAttributes("4", "1234567", "1234567", "GTC", "GTCTTA", "hg19")
    annotated = vcf_ds.annotate_mutation(mut)

    # (annotation name, value, data type, description, split tag, number)
    expected_rows = (
        ("ESP_AF", "0.5", "Float", "Allele Frequency", TagConstants.SPLIT, -1),
        ("ESP_AC", "3.0", "Float", "Allele Count", TagConstants.NOT_SPLIT, None),
        ("ESP_H2", "False|False|False", "String", "HapMap2 membership", TagConstants.NOT_SPLIT, None),
        ("ESP_AA", "T", "String", "Ancestral Allele", TagConstants.NOT_SPLIT, 1),
        ("ESP_Z", "2.0,3.0,3.0", "Float", "A random variable, Z", TagConstants.NOT_SPLIT, 3),
    )
    for name, value, data_type, description, split_tag, number in expected_rows:
        observed = annotated.getAnnotation(name)
        expected = Annotation(
            value=value,
            datasourceName="ESP",
            dataType=data_type,
            description=description,
            tags=[TagConstants.INFO, split_tag],
            number=number)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def test_convert_genomic_space_to_transcript_space(self):
    """Check genomic-to-transcript coordinate conversion on the saccer ensembl test data.

    Covers a transcript on contig I, then a '-' strand transcript on contig II
    with both a single position and a ten-base range.
    """
    config_dir = "testdata/ensembl/saccer/"
    ensembl_ds = DatasourceFactory.createDatasource(config_dir + "ensembl.config", config_dir)
    to_tx_space = TranscriptProviderUtils.convert_genomic_space_to_transcript_space

    # Contig I: transcript starts at 335.
    overlapping = ensembl_ds.get_overlapping_transcripts("I", "350", "350")
    tx_start, tx_end = to_tx_space("350", "350", overlapping[0])
    self.assertTrue(tx_start == tx_end)
    self.assertTrue(tx_start == 16)

    # Contig II: transcript starts at 764697 (strand is '-').
    overlapping = ensembl_ds.get_overlapping_transcripts("II", "764690", "764690")
    tx_start, tx_end = to_tx_space("764690", "764690", overlapping[0])
    self.assertTrue(tx_start == tx_end)
    self.assertTrue(tx_start == 7)

    # Same transcript, range query: start is unchanged and end lies 10 further on.
    tx_start, tx_end = to_tx_space("764680", "764690", overlapping[0])
    self.assertTrue(tx_start == (tx_end - 10))
    self.assertTrue(tx_start == 7)
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """Feed a TCGA MAF in through MAFLITE, annotate it and render a TCGA MAF again.

    Checks that the output is non-empty, has a #version comment, contains
    the two new i_TJ_Data_* columns, and has exactly two more lines
    (comments included) than the input.
    """
    inputFilename = "testdata/maf/Patient0.maf.annotated"
    input_creator = MafliteInputMutationCreator(inputFilename, 'configs/maflite_input.config')
    outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    renderer = TcgaMafOutputRenderer(outputFilename, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(renderer)
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")
    annotator.addDatasource(gene_ds)
    annotator.annotate()

    output_size = os.stat(outputFilename).st_size
    self.assertTrue(output_size > 0,
                    "Generated MAF file (" + outputFilename + ") is empty.")

    in_reader = GenericTsvReader(inputFilename)
    out_reader = GenericTsvReader(outputFilename)

    self.assertTrue(out_reader.getComments().find('#version') != -1,
                    "First line did not specify a version number")
    for new_field in ("i_TJ_Data_Why", "i_TJ_Data_Who"):
        self.assertTrue(new_field in out_reader.getFieldNames(),
                        "New field missing (" + new_field + ") from header")

    # Count data rows (output first, then input), then add each file's comment lines.
    n_out = sum(1 for _ in out_reader)
    n_in = sum(1 for _ in in_reader)
    n_in += len(in_reader.getCommentsAsList())
    n_out += len(out_reader.getCommentsAsList())

    self.assertTrue(
        n_out == (n_in + 2),
        "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): "
        + str(n_in) + ", " + str(n_out))
def testDoubleAnnotationError(self):
    """Regression test for a duplicate annotation exception.

    Given a maf file that used to cause that exception, annotation must run
    to completion without throwing it (or any other) and leave an output file.
    """
    rendered_target = 'out/genericGenomePositionDoubleAnnotationTest.out.tsv'
    genome_position_ds = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    maf_input = MafliteInputMutationCreator('testdata/maflite/testDoubleAnnotate.maf.tsv')
    annotator.setInputCreator(maf_input)
    annotator.setOutputRenderer(SimpleOutputRenderer(rendered_target))
    annotator.addDatasource(genome_position_ds)

    result_path = annotator.annotate()

    # No exception so far; the only remaining check is that the output exists.
    output_exists = os.path.exists(result_path)
    self.assertTrue(output_exists)
def test_simple_genes_by_region_annotation(self):
    """Test web api backend call /genes/ for a genomic region.

    Retrieves the transcripts overlapping chr22:22,112,223-22,312,558 (two
    genes), annotates them per gene and reads each field that the API
    response is assembled from.  Most fields are only read, not validated;
    a handful are asserted explicitly.
    """
    # http://www.broadinstitute.org/oncotator/genes/chr22_22112223_22312558/
    # Two genes: chr22:22,112,223-22,312,558
    datasource_list = DatasourceFactory.createDatasources(
        self._determine_db_dir(), "hg19", isMulticore=False)
    annotator = Annotator()
    for ds in datasource_list:
        annotator.addDatasource(ds)

    # Here is what the API would call....
    txs = annotator.retrieve_transcripts_by_region("22", 22112223, 22312558)
    self.assertTranscriptsFound(txs)

    mut_dict = annotator.annotate_genes_given_txs(txs)

    # Each mut will be for a separate gene
    for gene in mut_dict.keys():
        mut = mut_dict[gene]
        alt_accessions = mut['UniProt_alt_uniprot_accessions'].split("|")
        tcgascape_amp_peaks = mut['TCGAScape_Amplification_Peaks'].split("|")
        tcgascape_del_peaks = mut['TCGAScape_Deletion_Peaks'].split("|")
        tumorscape_amp_peaks = mut['TUMORScape_Amplification_Peaks'].split("|")
        tumorscape_del_peaks = mut['TUMORScape_Deletion_Peaks'].split("|")
        full_name = mut['HGNC_Approved Name']
        # NOTE(review): both entries read the same annotation; the second one
        # presumably should read a total-alterations field -- confirm the
        # exact annotation name against the COSMIC_Tissue datasource.
        cosmic = {
            "tissue_types_affected": mut['COSMIC_Tissue_tissue_types_affected'],
            "total_alterations_in_gene": mut["COSMIC_Tissue_tissue_types_affected"],
        }
        # chain() takes the iterables as separate arguments.  Passing one list
        # that wraps both (as was done before) yields the two sub-lists
        # themselves instead of a flat list of aliases.
        alt_aliases = list(itertools.chain(
            mut["HGNC_Previous Symbols"].split(", "),
            mut["HGNC_Synonyms"].split(", ")))
        location = mut["HGNC_Chromosome"]
        uniprot_accession = mut["UniProt_uniprot_accession"]

        transcripts = mut['transcripts']
        self.assertTrue(transcripts is not None)
        self.assertTrue(len(transcripts) > 0)
        self.assertTrue(transcripts.startswith('ENST'))

        strand = mut['strand']
        klass = mut['class']
        uniprot_experimentals = mut['UniProt_AA_experimental_info'].split("|")
        self.assertTrue(uniprot_experimentals is not None)
        uniprot_natural_variations = mut['UniProt_AA_natural_variation'].split("|")
        uniprot_regions = mut['UniProt_AA_region'].split("|")
        uniprot_sites = mut['UniProt_AA_site'].split("|")
        uniprot_go_biological_processes = mut["UniProt_GO_Biological_Process"].split("|")
        uniprot_go_cellular_components = mut["UniProt_GO_Cellular_Component"].split("|")
        self.assertTrue(uniprot_go_cellular_components is not None)
        uniprot_go_molecular_functions = mut["UniProt_GO_Molecular_Function"].split("|")
def test_querying_transcripts_by_region(self):
    """Test web api backend call /transcripts/....

    Retrieves the transcripts overlapping chr4:50164411-60164411 and, for each
    one, gathers the values needed to populate the json described in
    doc/transcript_json_commented.json.txt.  Only the refseq ids and the exon
    count are asserted.
    """
    all_datasources = DatasourceFactory.createDatasources(
        self._determine_db_dir(), "hg19", isMulticore=False)
    annotator = Annotator()
    for datasource in all_datasources:
        annotator.addDatasource(datasource)

    region_txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
    self.assertTranscriptsFound(region_txs)

    to_exon_space = TranscriptProviderUtils.convert_genomic_space_to_exon_space

    ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
    # None of these values are validated.
    for tx in region_txs:
        transcript_id = tx.get_transcript_id()
        tx_start = tx.determine_transcript_start()
        tx_end = tx.determine_transcript_stop()
        gene = tx.get_gene()
        contig = tx.get_contig()
        n_exons = len(tx.get_exons())
        strand = tx.get_strand()
        footprint_start, footprint_end = tx.determine_cds_footprint()
        klass = tx.get_gene_type()
        cds_start = tx.determine_cds_start()
        cds_end = tx.determine_cds_stop()
        gene_id = tx.get_gene_id()
        genomic_coords = [[ex[0], ex[1]] for ex in tx.get_exons()]
        transcript_coords = [
            [to_exon_space(ex[0] + 1, ex[1], tx)] for ex in tx.get_exons()
        ]
        code_len = int(cds_end) - int(cds_start) + 1

        # If refseq datasources are not available, this will fail.
        # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
        dummy_mut = annotator.annotate_transcript(tx)
        refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
        refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

        # Description is unavailable right now
        description = ""

        self.assertTrue(refseq_mRNA_id is not None)
        self.assertTrue(refseq_prot_id is not None)
        self.assertTrue(len(transcript_coords) == n_exons)
def testBasicCosmicInit(self):
    """Very simple test that creates a datasource from a sample datasource directory.

    The directory conforms to the standard datasource structure, including
    placement of the config file.  One position on chromosome 19 is annotated
    and the overlapping-mutation amino acid annotation is checked.
    """
    config_path = 'testdata/small_cosmic/small_cosmic.config'
    data_dir = "testdata/small_cosmic"
    cosmic_datasource = DatasourceFactory.createDatasource(config_path, data_dir)

    query = MutationDataFactory.default_create()
    query.chr = 19
    query.start = 58858921
    query.end = 58858921

    query = cosmic_datasource.annotate_mutation(query)
    self.assertTrue(
        query['COSMIC_overlapping_mutation_AAs'] == 'p.P426P(1)',
        "Did not properly annotate mutation: " + query['COSMIC_overlapping_mutation_AAs'])