示例#1
0
    def testFullSnpVcf(self):
        """ Perform test of a SNP call stats (maflite) all the way through TCGA VCF creation.  Only checks that a file was created.
        """
        outputFilename = "out/TCGAVCFTest.snp.vcf"
        callStatsIn = MafliteInputMutationCreator(
            "testdata/Test.call_stats.trim.txt")
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))

        maflite_ic = MafliteInputMutationCreator(
            "testdata/maflite/Patient0.indel.maf.txt")
        muts = maflite_ic.createMutations()
        vcf_reader = vcf.Reader(open(outputFilename, 'r'))
        for i, m in enumerate(muts):
            rec = vcf_reader.next()
            qual = rec.QUAL

            # All records should have QUAL with a value (i.e. NOT ".")
            self.assertIsNotNone(qual)
    def testCreationAndAnnotation(self):
        """ Test the datasource creation and then do a simple annotation
        """
        outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'

        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        gppDS = DatasourceFactory.createDatasource("testdata/simple_uniprot_natvar/simple_uniprot_natvar.config", "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDS)
        annotator.addDatasource(gppDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 0
        for lineDict in tsvReader:
            colName = "UniProt_NatVar_natural_variations"
            self.assertTrue(sorted(lineDict[colName].split("|")) == sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|")), "Annotation value did not match: " + lineDict[colName])
            ctr += 1

        self.assertTrue(ctr == 1, "Number of mutations incorrect (1): " + str(ctr) )
    def testCreationAndAnnotation(self):
        """ Test the datasource creation and then do a simple annotation
        """
        outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'

        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        gppDS = DatasourceFactory.createDatasource("testdata/simple_uniprot_natvar/simple_uniprot_natvar.config", "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDS)
        annotator.addDatasource(gppDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 0
        for lineDict in tsvReader:
            colName = "UniProt_NatVar_natural_variations"
            self.assertTrue(sorted(lineDict[colName].split("|")) == sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|")), "Annotation value did not match: " + lineDict[colName])
            ctr += 1

        self.assertTrue(ctr == 1, "Number of mutations incorrect (1): " + str(ctr) )
    def testTCGAMAFAsInputAndQuickAnnotate(self):
        """ Test that we can take in a TCGA MAF (using MAFLITE), do annotating, and still render it properly """
        inputFilename = "testdata/maf/Patient0.maf.annotated"
        tmp = MafliteInputMutationCreator(inputFilename, 'configs/maflite_input.config')
        outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
        outputRenderer = TcgaMafOutputRenderer(outputFilename, 'configs/tcgaMAF2.4_output.config')
        annotator = Annotator()
        
        annotator.setInputCreator(tmp)
        annotator.setOutputRenderer(outputRenderer)
        ds = DatasourceFactory.createDatasource("testdata/thaga_janakari_gene_ds/hg19/tj_data.config", "testdata/thaga_janakari_gene_ds/hg19/")
        annotator.addDatasource(ds)
        annotator.annotate()
        
        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + outputFilename + ") is empty.")
        tsvReaderIn = GenericTsvReader(inputFilename)
        tsvReader = GenericTsvReader(outputFilename)
        
        self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")
        self.assertTrue("i_TJ_Data_Why" in tsvReader.getFieldNames(), "New field missing (i_TJ_Data_Why) from header")
        self.assertTrue("i_TJ_Data_Who" in tsvReader.getFieldNames(), "New field missing (i_TJ_Data_Who) from header")
        
        ctrOut = 0
        for lineDict in tsvReader:
            ctrOut += 1
        ctrIn = 0
        for lineDict in tsvReaderIn:
            ctrIn += 1
        ctrIn += len(tsvReaderIn.getCommentsAsList())
        ctrOut += len(tsvReader.getCommentsAsList())

        self.assertTrue(ctrOut == (ctrIn + 2), "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file.  (In,Out): " + str(ctrIn) + ", " + str(ctrOut))
    def testFullIndelVcf(self):
        """ Perform test of a Indel maflite all the way through TCGA VCF creation
        """
        outputFilename = "out/TCGAVCFTest.indel.vcf"
        callStatsIn = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))

        # Check that the deletions have position decremented by one from what is present in the maflite
        #  Checking that 1	36643701 in the maflite (a deletion) becomes 1	36643700 in the vcf, but that the others are
        #  the same.
        maflite_ic = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
        muts = maflite_ic.createMutations()
        vcf_reader = vcf.Reader(open(outputFilename, 'r'))

        vcf_pos = [int(rec.POS) for rec in vcf_reader]
        for m in muts:
            # If the variant is a deletion, then the vcf position should be the same as maflite minus one.  Otherwise, the same.
            is_variant_deletion = (m.alt_allele == "") or (m.alt_allele == "-") or (m.alt_allele == ".")
            if is_variant_deletion:
                self.assertTrue((int(m.start) - 1) in vcf_pos, "Deletion was not correct for " + m.chr + ":" + m.start)
            else:
                self.assertTrue(int(m.start) in vcf_pos, "Insertion was not correct for " + m.chr + ":" + m.start)
示例#6
0
    def testBasicAnnotation(self):
        ''' Test annotation from a generic TSV based on a transcript annotation.  Only confirms the proper headers of the output. '''
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        transcriptDS = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
            "testdata/small_transcript_tsv_ds/")
        outputFilename = 'out/genericTranscriptTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(transcriptDS)
        outputFilename = annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue(
            "refseq_test_mRNA_Id" in headers,
            "refseq_test_mRNA_Id not found in headers: " + str(headers))
        self.assertTrue(
            "refseq_test_prot_Id" in headers,
            "refseq_test_prot_Id not found in headers: " + str(headers))
 def testBasicAnnotation(self):
     ''' Annotate from a basic tsv of Genomic positions.  This tests both single- and multiple-nucleotide variants.  The tsv is already installed (i.e. proper config file created).
     '''
     outputFilename = 'out/genericGenomePositionTest.out.tsv'
     
     gpDS = DatasourceFactory.createDatasource("testdata/small_genome_position_tsv_ds/oreganno_trim.config", "testdata/small_genome_position_tsv_ds/")
     
     annotator = Annotator()
     annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt'))
     annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
     annotator.addDatasource(gpDS)
     testFilename = annotator.annotate()
     
     # Make sure that some values were populated
     self.assertTrue(os.path.exists(testFilename))
     tsvReader = GenericTsvReader(testFilename) 
     
     ctr = 1
     # Two overlap, one does not.  Repeat...
     for lineDict in tsvReader:
         if (ctr % 3 == 0):
             self.assertTrue(lineDict["ORegAnno_hg19.oreganno.id"] == '', "Line " + str(ctr) + " should have had blank value, but did not: " + lineDict["ORegAnno_hg19.oreganno.id"])
         else:
             self.assertFalse(lineDict["ORegAnno_hg19.oreganno.id"] == '', "Line " + str(ctr) + " should not have had blank value, but did.")
             self.assertTrue(lineDict["ORegAnno_hg19.oreganno.id"] == 'OREG0013034', "Line " + str(ctr) + " did not have correct value: " + lineDict["ORegAnno_hg19.oreganno.id"])
         ctr = ctr + 1
    def testFullIndelVcf(self):
        """ Perform test of a Indel maflite all the way through TCGA VCF creation
        """
        outputFilename = "out/TCGAVCFTest.indel.vcf"
        callStatsIn = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))

        # Check that the deletions have position decremented by one from what is present in the maflite
        #  Checking that 1	36643701 in the maflite (a deletion) becomes 1	36643700 in the vcf, but that the others are
        #  the same.
        maflite_ic = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
        muts = maflite_ic.createMutations()
        vcf_reader = vcf.Reader(open(outputFilename, 'r'))

        vcf_pos = [int(rec.POS) for rec in vcf_reader]
        for m in muts:
            # If the variant is a deletion, then the vcf position should be the same as maflite minus one.  Otherwise, the same.
            is_variant_deletion = (m.alt_allele == "") or (m.alt_allele == "-") or (m.alt_allele == ".")
            if is_variant_deletion:
                self.assertTrue((int(m.start) - 1) in vcf_pos, "Deletion was not correct for " + m.chr + ":" + m.start)
            else:
                self.assertTrue(int(m.start) in vcf_pos, "Insertion was not correct for " + m.chr + ":" + m.start)
示例#9
0
 def testVersionHeader(self):
     """ This method simply tests that the version string returned by the annotator does not cause an exception.
         Minimal checking that the returned sting is actually correct.
         Does not attempt to initialize input or output.  Only a gaf datasource.
      """
     annotator = Annotator()
     annotator.addDatasource(TestUtils.createTranscriptProviderDatasource(self.config))
     tmp = annotator.createHeaderString()
     self.assertTrue(tmp.find("Gaf ") != -1 or tmp.find("GENCODE") != -1, "Could not find Gaf or GENCODE version in header string.")
     self.assertTrue(tmp.find("Oncotator") != -1, "Could not find the word Oncotator in header string.")
示例#10
0
 def testVersionHeader(self):
     """ This method simply tests that the version string returned by the annotator does not cause an exception.
         Minimal checking that the returned sting is actually correct.
         Does not attempt to initialize input or output.  Only a gaf datasource.
      """
     annotator = Annotator()
     annotator.addDatasource(TestUtils.createTranscriptProviderDatasource(self.config))
     tmp = annotator.createHeaderString()
     self.assertTrue(tmp.find("Gaf ") != -1 or tmp.find("GENCODE") != -1, "Could not find Gaf or GENCODE version in header string.")
     self.assertTrue(tmp.find("Oncotator") != -1, "Could not find the word Oncotator in header string.")
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.
        '''

        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
            'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC',
            'ARHGEF12', 'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1',
            'ATF1', 'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B'
        ]

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        self.assertTrue(
            'CGC_Abridged_Other Syndrome/Disease' in fields,
            "'CGC_Other Syndrome/Disease' was not present in the header")
        self.assertTrue(
            'CGC_Abridged_Mutation Type' in fields,
            "'CGC_Abridged_Mutation Type' was not present in the header")

        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue('gene' in lineDict.keys())
            if lineDict['gene'] in genesAvailable:
                self.assertTrue(
                    lineDict['CGC_Abridged_GeneID'] != '',
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: "
                    + str(ctr))
                linesThatShouldBeAnnotated += 1
            ctr += 1
        self.assertTrue((linesThatShouldBeAnnotated) > 0,
                        "Bad data -- cannot test missed detects.")
示例#12
0
    def test_simple_transcript_annotation(self):
        """Test web api backend call /transcript/ """
        # http://www.broadinstitute.org/oncotator/transcript/ENST00000215832.6/
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        tx = annotator.retrieve_transcript_by_id("ENST00000215832.6")
        self.assertTrue(tx is not None)
        self.assertTrue(tx.get_gene() == "MAPK1")
示例#13
0
    def test_querying_transcripts_by_genes(self):
        """Test that we can get all of the transcripts for a given set of genes. """

        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        # Step 1 get all of the relevant transcripts
        txs = annotator.retrieve_transcripts_by_genes(["MAPK1", "PIK3CA"])
        self.assertTrue(len(txs) > 3)
示例#14
0
    def test_querying_transcripts_by_genes(self):
        """Test that we can get all of the transcripts for a given set of genes. """

        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        # Step 1 get all of the relevant transcripts
        txs = annotator.retrieve_transcripts_by_genes(["MAPK1", "PIK3CA"])
        self.assertTrue(len(txs) > 3)
示例#15
0
    def test_simple_transcript_annotation(self):
        """Test web api backend call /transcript/ """
        # http://www.broadinstitute.org/oncotator/transcript/ENST00000215832.6/
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        tx = annotator.retrieve_transcript_by_id("ENST00000215832.6")
        self.assertTrue(tx is not None)
        self.assertTrue(tx.get_gene() == "MAPK1")
示例#16
0
    def test_simple_genes_by_gene_annotation(self):
        """Test web api backend call /gene/ """
        # http://www.broadinstitute.org/oncotator/gene/MAPK1/
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        txs = annotator.retrieve_transcripts_by_genes(["MAPK1"])
        self.assertTranscriptsFound(txs)

        mut_dict = annotator.annotate_genes_given_txs(txs)
        self.assertTrue(len(mut_dict.keys()) == 1)
示例#17
0
    def test_simple_genes_by_gene_annotation(self):
        """Test web api backend call /gene/ """
        # http://www.broadinstitute.org/oncotator/gene/MAPK1/
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        txs = annotator.retrieve_transcripts_by_genes(["MAPK1"])
        self.assertTranscriptsFound(txs)

        mut_dict = annotator.annotate_genes_given_txs(txs)
        self.assertTrue(len(mut_dict.keys()) == 1)
示例#18
0
    def test_simple_genes_by_region_annotation(self):
        """Test web api backend call /genes/ """
        # http://www.broadinstitute.org/oncotator/genes/chr22_22112223_22312558/
        # Two genes: chr22:22,112,223-22,312,558
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        # Here is what the API would call....
        txs = annotator.retrieve_transcripts_by_region("22", 22112223, 22312558)
        self.assertTranscriptsFound(txs)

        mut_dict = annotator.annotate_genes_given_txs(txs)

        # Each mut will be for a separate gene
        for gene in mut_dict.keys():
            mut = mut_dict[gene]
            alt_accessions = mut["UniProt_alt_uniprot_accessions"].split("|")
            tcgascape_amp_peaks = mut["TCGAScape_Amplification_Peaks"].split("|")
            tcgascape_del_peaks = mut["TCGAScape_Deletion_Peaks"].split("|")
            tumorscape_amp_peaks = mut["TUMORScape_Amplification_Peaks"].split("|")
            tumorscape_del_peaks = mut["TUMORScape_Deletion_Peaks"].split("|")
            full_name = mut["HGNC_Approved Name"]
            cosmic = {
                "tissue_types_affected": mut["COSMIC_Tissue_tissue_types_affected"],
                "total_alterations_in_gene": mut["COSMIC_Tissue_tissue_types_affected"],
            }
            alt_aliases = list(
                itertools.chain([mut["HGNC_Previous Symbols"].split(", "), mut["HGNC_Synonyms"].split(", ")])
            )
            location = mut["HGNC_Chromosome"]
            uniprot_accession = mut["UniProt_uniprot_accession"]
            transcripts = mut["transcripts"]
            self.assertTrue(transcripts is not None)
            self.assertTrue(len(transcripts) > 0)
            self.assertTrue(transcripts.startswith("ENST"))
            strand = mut["strand"]
            klass = mut["class"]
            uniprot_experimentals = mut["UniProt_AA_experimental_info"].split("|")
            self.assertTrue(uniprot_experimentals is not None)
            uniprot_natural_variations = mut["UniProt_AA_natural_variation"].split("|")
            uniprot_regions = mut["UniProt_AA_region"].split("|")
            uniprot_sites = mut["UniProt_AA_site"].split("|")
            uniprot_go_biological_processes = mut["UniProt_GO_Biological_Process"].split("|")
            uniprot_go_cellular_components = mut["UniProt_GO_Cellular_Component"].split("|")
            self.assertTrue(uniprot_go_cellular_components is not None)
            uniprot_go_molecular_functions = mut["UniProt_GO_Molecular_Function"].split("|")
            pass
示例#19
0
    def testSimpleAnnotationWithExampleVcf(self):
        """
        Tests the ability to do a simple Gaf 3.0 annotation.
        """
        inputFilename = os.path.join(*["testdata", "vcf", "example.vcf"])
        outputFilename = os.path.join("out", "simpleVCF.Gaf.annotated.out.tsv")

        creator = VcfInputMutationCreator(inputFilename)
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename, [])
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.addDatasource(TestUtils.createTranscriptProviderDatasource(self.config))
        annotator.annotate()
 def testDoubleAnnotationError(self):
     ''' Given a maf file that used to cause a duplicate annotation exception, do not throw that (or any) exception. '''
     outputFilename = 'out/genericGenomePositionDoubleAnnotationTest.out.tsv'
     
     gpDS = DatasourceFactory.createDatasource("testdata/small_genome_position_tsv_ds/oreganno_trim.config", "testdata/small_genome_position_tsv_ds/")
     
     
     annotator = Annotator()
     annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/testDoubleAnnotate.maf.tsv'))
     annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
     annotator.addDatasource(gpDS)
     testFilename = annotator.annotate()
     
     
     # Make sure that some values were populated
     self.assertTrue(os.path.exists(testFilename))
    def testFullSnpVcf(self):
        """ Perform test of a SNP call stats (maflite) all the way through TCGA VCF creation.  Only checks that a file was created.
        """
        outputFilename = "out/TCGAVCFTest.snp.vcf"
        callStatsIn = MafliteInputMutationCreator("testdata/Test.call_stats.trim.txt")
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))
    def testFullSnpVcf(self):
        """ Perform test of a SNP call stats (maflite) all the way through TCGA VCF creation.  Only checks that a file was created.
        """
        outputFilename = "out/TCGAVCFTest.snp.vcf"
        callStatsIn = MafliteInputMutationCreator("testdata/Test.call_stats.trim.txt")
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))
示例#23
0
    def testTCGAMAFAsInputAndQuickAnnotate(self):
        """ Test that we can take in a TCGA MAF (using MAFLITE), do annotating, and still render it properly """
        inputFilename = "testdata/maf/Patient0.maf.annotated"
        tmp = MafliteInputMutationCreator(inputFilename,
                                          'configs/maflite_input.config')
        outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
        outputRenderer = TcgaMafOutputRenderer(
            outputFilename, 'configs/tcgaMAF2.4_output.config')
        annotator = Annotator()

        annotator.setInputCreator(tmp)
        annotator.setOutputRenderer(outputRenderer)
        ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        annotator.addDatasource(ds)
        annotator.annotate()

        statinfo = os.stat(outputFilename)
        self.assertTrue(
            statinfo.st_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")
        tsvReaderIn = GenericTsvReader(inputFilename)
        tsvReader = GenericTsvReader(outputFilename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")
        self.assertTrue("i_TJ_Data_Why" in tsvReader.getFieldNames(),
                        "New field missing (i_TJ_Data_Why) from header")
        self.assertTrue("i_TJ_Data_Who" in tsvReader.getFieldNames(),
                        "New field missing (i_TJ_Data_Who) from header")

        ctrOut = 0
        for lineDict in tsvReader:
            ctrOut += 1
        ctrIn = 0
        for lineDict in tsvReaderIn:
            ctrIn += 1
        ctrIn += len(tsvReaderIn.getCommentsAsList())
        ctrOut += len(tsvReader.getCommentsAsList())

        self.assertTrue(
            ctrOut == (ctrIn + 2),
            "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file.  (In,Out): "
            + str(ctrIn) + ", " + str(ctrOut))
    def testDoubleAnnotationError(self):
        ''' Given a maf file that used to cause a duplicate annotation exception, do not throw that (or any) exception. '''
        outputFilename = 'out/genericGenomePositionDoubleAnnotationTest.out.tsv'

        gpDS = DatasourceFactory.createDatasource(
            "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
            "testdata/small_genome_position_tsv_ds/")

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/testDoubleAnnotate.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gpDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
示例#25
0
    def test_simple_genes_by_region_annotation(self):
        """Test web api backend call /genes/ """
        # http://www.broadinstitute.org/oncotator/genes/chr22_22112223_22312558/
        # Two genes: chr22:22,112,223-22,312,558
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        # Here is what the API would call....
        txs = annotator.retrieve_transcripts_by_region("22", 22112223, 22312558)
        self.assertTranscriptsFound(txs)

        mut_dict = annotator.annotate_genes_given_txs(txs)

        # Each mut will be for a separate gene
        for gene in mut_dict.keys():
            mut = mut_dict[gene]
            alt_accessions = mut['UniProt_alt_uniprot_accessions'].split("|")
            tcgascape_amp_peaks = mut['TCGAScape_Amplification_Peaks'].split("|")
            tcgascape_del_peaks = mut['TCGAScape_Deletion_Peaks'].split("|")
            tumorscape_amp_peaks = mut['TUMORScape_Amplification_Peaks'].split("|")
            tumorscape_del_peaks = mut['TUMORScape_Deletion_Peaks'].split("|")
            full_name = mut['HGNC_Approved Name']
            cosmic = {"tissue_types_affected": mut['COSMIC_Tissue_tissue_types_affected'], "total_alterations_in_gene": mut["COSMIC_Tissue_tissue_types_affected"]}
            alt_aliases = list(itertools.chain([mut["HGNC_Previous Symbols"].split(", "), mut["HGNC_Synonyms"].split(", ")]))
            location = mut["HGNC_Chromosome"]
            uniprot_accession = mut["UniProt_uniprot_accession"]
            transcripts = mut['transcripts']
            self.assertTrue(transcripts is not None)
            self.assertTrue(len(transcripts) > 0)
            self.assertTrue(transcripts.startswith('ENST'))
            strand = mut['strand']
            klass = mut['class']
            uniprot_experimentals = mut['UniProt_AA_experimental_info'].split("|")
            self.assertTrue(uniprot_experimentals is not None)
            uniprot_natural_variations = mut['UniProt_AA_natural_variation'].split("|")
            uniprot_regions = mut['UniProt_AA_region'].split("|")
            uniprot_sites = mut['UniProt_AA_site'].split("|")
            uniprot_go_biological_processes = mut["UniProt_GO_Biological_Process"].split("|")
            uniprot_go_cellular_components = mut["UniProt_GO_Cellular_Component"].split("|")
            self.assertTrue(uniprot_go_cellular_components is not None)
            uniprot_go_molecular_functions = mut["UniProt_GO_Molecular_Function"].split("|")
            pass
示例#26
0
    def test_querying_transcripts_by_region(self):
        """Test web api backend call /transcripts/.... """
        datasource_list = DatasourceFactory.createDatasources(
            self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)
        txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
        self.assertTranscriptsFound(txs)

        ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
        # None of these values are validated.
        for tx in txs:
            transcript_id = tx.get_transcript_id()
            tx_start = tx.determine_transcript_start()
            tx_end = tx.determine_transcript_stop()
            gene = tx.get_gene()
            chr = tx.get_contig()
            n_exons = len(tx.get_exons())
            strand = tx.get_strand()
            footprint_start, footprint_end = tx.determine_cds_footprint()
            klass = tx.get_gene_type()
            cds_start = tx.determine_cds_start()
            cds_end = tx.determine_cds_stop()
            id = tx.get_gene_id()
            genomic_coords = [[exon[0], exon[1]] for exon in tx.get_exons()]
            transcript_coords = [[
                TranscriptProviderUtils.convert_genomic_space_to_exon_space(
                    exon[0] + 1, exon[1], tx)
            ] for exon in tx.get_exons()]
            code_len = int(cds_end) - int(cds_start) + 1

            # If refseq datasources are not available, this will fail.
            # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
            dummy_mut = annotator.annotate_transcript(tx)
            refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
            refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

            # Description is unavailable right now
            description = ""

            self.assertTrue(refseq_mRNA_id is not None)
            self.assertTrue(refseq_prot_id is not None)
            self.assertTrue(len(transcript_coords) == n_exons)
    def testAnotherFullSNP(self):
        """Test SNP call stats .  Just make sure no exception is thrown."""
        inputFile = "testdata/maflite/Another.call_stats.txt"
        outputFilename = "out/Another.call_stats.out.vcf"
        callStatsIn = MafliteInputMutationCreator(inputFile)
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))
        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated VCF file (" + outputFilename + ") is empty.")
    def testEmptyInput(self):
        """Make sure that we can generate an empty vcf from an empty maflite"""
        inputFile = "testdata/maflite/empty.maflite"
        outputFilename = "out/empty.vcf"
        callStatsIn = MafliteInputMutationCreator(inputFile)
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))
        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated VCF file (" + outputFilename + ") is empty.")
    def testEmptyInput(self):
        """Make sure that we can generate an empty vcf from an empty maflite"""
        inputFile = "testdata/maflite/empty.maflite"
        outputFilename = "out/empty.vcf"
        callStatsIn = MafliteInputMutationCreator(inputFile)
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))
        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated VCF file (" + outputFilename + ") is empty.")
示例#30
0
    def test_querying_transcripts_by_region(self):
        """Test web api backend call /transcripts/.... """
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)
        txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
        self.assertTranscriptsFound(txs)

        ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
        # None of these values are validated.
        for tx in txs:
            transcript_id = tx.get_transcript_id()
            tx_start = tx.determine_transcript_start()
            tx_end = tx.determine_transcript_stop()
            gene = tx.get_gene()
            chr = tx.get_contig()
            n_exons = len(tx.get_exons())
            strand = tx.get_strand()
            footprint_start, footprint_end = tx.determine_cds_footprint()
            klass = tx.get_gene_type()
            cds_start = tx.determine_cds_start()
            cds_end = tx.determine_cds_stop()
            id = tx.get_gene_id()
            genomic_coords = [[exon[0], exon[1]] for exon in tx.get_exons()]
            transcript_coords = [
                [TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0] + 1, exon[1], tx)]
                for exon in tx.get_exons()
            ]
            code_len = int(cds_end) - int(cds_start) + 1

            # If refseq datasources are not available, this will fail.
            # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
            dummy_mut = annotator.annotate_transcript(tx)
            refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
            refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

            # Description is unavailable right now
            description = ""

            self.assertTrue(refseq_mRNA_id is not None)
            self.assertTrue(refseq_prot_id is not None)
            self.assertTrue(len(transcript_coords) == n_exons)
    def testAnotherFullSNP(self):
        """Test SNP call stats .  Just make sure no exception is thrown."""
        inputFile = "testdata/maflite/Another.call_stats.txt"
        outputFilename = "out/Another.call_stats.out.vcf"
        callStatsIn = MafliteInputMutationCreator(inputFile)
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))
        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated VCF file (" + outputFilename + ") is empty.")
示例#32
0
    def testTCGAMAFRendering(self):
        """
        Tests the ability to render a germline VCF file as a TCGA MAF file.
        """
        inputFilename = os.path.join(*["testdata", "vcf", "example.vcf"])
        outputFilename = os.path.join("out", "example.vcf.maf.annotated")

        creator = VcfInputMutationCreator(inputFilename)
        creator.createMutations()
        renderer = TcgaMafOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.setManualAnnotations(self._createTCGAMAFOverridesForVCF())
        datasources = self._createDatasourceCorpus()
        for ds in datasources:
            annotator.addDatasource(ds)
        filename = annotator.annotate()

        self._validateTcgaMafContents(filename)
    def testBasicAnnotation(self):
        """ Test annotation from a generic TSV based on a transcript annotation.  Only confirms the proper headers of the output. """
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        transcriptDS = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config", "testdata/small_transcript_tsv_ds/"
        )
        outputFilename = "out/genericTranscriptTest.out.tsv"

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(transcriptDS)
        outputFilename = annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("refseq_test_mRNA_Id" in headers, "refseq_test_mRNA_Id not found in headers: " + str(headers))
        self.assertTrue("refseq_test_prot_Id" in headers, "refseq_test_prot_Id not found in headers: " + str(headers))
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv of Genomic positions.  This tests both single- and multiple-nucleotide variants.  The tsv is already installed (i.e. proper config file created).
        '''
        outputFilename = 'out/genericGenomePositionTest.out.tsv'

        gpDS = DatasourceFactory.createDatasource(
            "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
            "testdata/small_genome_position_tsv_ds/")

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/tiny_maflite.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gpDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 1
        # Two overlap, one does not.  Repeat...
        for lineDict in tsvReader:
            if (ctr % 3 == 0):
                self.assertTrue(
                    lineDict["ORegAnno_hg19.oreganno.id"] == '', "Line " +
                    str(ctr) + " should have had blank value, but did not: " +
                    lineDict["ORegAnno_hg19.oreganno.id"])
            else:
                self.assertFalse(
                    lineDict["ORegAnno_hg19.oreganno.id"] == '', "Line " +
                    str(ctr) + " should not have had blank value, but did.")
                self.assertTrue(
                    lineDict["ORegAnno_hg19.oreganno.id"] == 'OREG0013034',
                    "Line " + str(ctr) + " did not have correct value: " +
                    lineDict["ORegAnno_hg19.oreganno.id"])
            ctr = ctr + 1
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.
        ''' 
        
        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = ['ABL1','ABL2','ACSL3','AF15Q14','AF1Q','AF3p21','AF5q31','AKAP9','AKT1','AKT2','ALDH2','ALK','ALO17','APC','ARHGEF12','ARHH','ARID1A','ARID2','ARNT','ASPSCR1','ASXL1','ATF1','ATIC','ATM','ATRX','BAP1','BCL10','BCL11A','BCL11B']
        
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'
        
        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()
        
        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)
        
        fields = tsvReader.getFieldNames()
        self.assertTrue('CGC_Abridged_Other Syndrome/Disease' in fields, "'CGC_Other Syndrome/Disease' was not present in the header")
        self.assertTrue('CGC_Abridged_Mutation Type' in fields, "'CGC_Abridged_Mutation Type' was not present in the header")
        
        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue('gene' in lineDict.keys())
            if lineDict['gene'] in genesAvailable:
                self.assertTrue(lineDict['CGC_Abridged_GeneID'] <> '', "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: " + str(ctr))
                linesThatShouldBeAnnotated = linesThatShouldBeAnnotated + 1
            ctr = ctr + 1
        self.assertTrue((linesThatShouldBeAnnotated) > 0, "Bad data -- cannot test missed detects.")
    def testBasicAnnotation(self):
        """ Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.
        """

        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            "ABL1",
            "ABL2",
            "ACSL3",
            "AF15Q14",
            "AF1Q",
            "AF3p21",
            "AF5q31",
            "AKAP9",
            "AKT1",
            "AKT2",
            "ALDH2",
            "ALK",
            "ALO17",
            "APC",
            "ARHGEF12",
            "ARHH",
            "ARID1A",
            "ARID2",
            "ARNT",
            "ASPSCR1",
            "ASXL1",
            "ATF1",
            "ATIC",
            "ATM",
            "ATRX",
            "BAP1",
            "BCL10",
            "BCL11A",
            "BCL11B",
        ]

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/"
        )
        outputFilename = "out/genericGeneTest.out.tsv"

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        self.assertTrue(
            "CGC_Abridged_Other Syndrome/Disease" in fields,
            "'CGC_Other Syndrome/Disease' was not present in the header",
        )
        self.assertTrue(
            "CGC_Abridged_Mutation Type" in fields, "'CGC_Abridged_Mutation Type' was not present in the header"
        )

        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue("gene" in lineDict.keys())
            if lineDict["gene"] in genesAvailable:
                self.assertTrue(
                    lineDict["CGC_Abridged_GeneID"] != "",
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: " + str(ctr),
                )
                linesThatShouldBeAnnotated += 1
            ctr += 1
        self.assertTrue((linesThatShouldBeAnnotated) > 0, "Bad data -- cannot test missed detects.")