Пример #1
0
    def test_build_ensembl_transcript_index(self):
        """Build the gtf portion of the ensembl transcript db
        """
        # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
        #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
        #
        # grep -Pzo  ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
        #
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        output_filename = "out/test_ensembl_gtf.db"
        protocol = "file"
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
        self.assertTrue(os.path.exists(output_filename))

        shove = Shove(protocol + "://" + output_filename, "memory://")
        self.assertTrue(len(shove.keys()) > 0)
        self.assertTrue("YDR529C" in shove.keys())
        t = shove["YDR529C"]
        self.assertTrue(t.get_seq() is not None)
        self.assertTrue(t.get_seq() is not "")
        self.assertTrue(len(t.get_cds()) > 0)
        self.assertTrue(len(t.get_exons()) > 0)
        MutUtils.removeDir(output_filename)
Пример #2
0
    def test_build_ensembl_transcript_index(self):
        """Build the gtf portion of the ensembl transcript db
        """
        # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
        #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
        #
        # grep -Pzo  ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
        #
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        output_filename = "out/test_ensembl_gtf.db"
        protocol = "file"
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
        self.assertTrue(os.path.exists(output_filename))

        shove = Shove(protocol + "://" + output_filename, "memory://")
        self.assertTrue(len(shove.keys()) > 0)
        self.assertTrue("YDR529C" in shove.keys())
        t = shove["YDR529C"]
        self.assertTrue(t.get_seq() is not None)
        self.assertTrue(t.get_seq() is not "")
        self.assertTrue(len(t.get_cds()) > 0)
        self.assertTrue(len(t.get_exons()) > 0)
        MutUtils.removeDir(output_filename)
Пример #3
0
    def test_build_ensembl_transcripts_by_gene_index(self):
        """Test building an index for getting a transcript given a gene."""
        protocol = "file"
        transcript_index_filename = "out/test_ensembl_gtf_for_gene.db"
        output_filename = "out/test_ensembl_gtf_for_gene.db.gene.idx"
        shutil.rmtree(output_filename,ignore_errors=True)

        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], transcript_index_filename, protocol=protocol)
        genome_build_factory.build_ensembl_transcripts_by_gene_index(transcript_index_filename, output_filename)

        # Now load the index and look something up.
        gene_index = Shove(protocol + "://" + output_filename, optimize=False)
        self.assertTrue(len(gene_index['SEO1']) == 1)
        tx = gene_index['SEO1'][0]

        self.assertTrue(tx.get_transcript_id()=="YAL067C")
Пример #4
0
    def test_build_ensembl_transcripts_by_gene_index(self):
        """Test building an index for getting a transcript given a gene."""
        protocol = "file"
        transcript_index_filename = "out/test_ensembl_gtf_for_gene.db"
        output_filename = "out/test_ensembl_gtf_for_gene.db.gene.idx"
        shutil.rmtree(output_filename,ignore_errors=True)

        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], transcript_index_filename, protocol=protocol)
        genome_build_factory.build_ensembl_transcripts_by_gene_index(transcript_index_filename, output_filename)

        # Now load the index and look something up.
        gene_index = Shove(protocol + "://" + output_filename, optimize=False)
        self.assertTrue(len(gene_index['SEO1']) == 1)
        tx = gene_index['SEO1'][0]

        self.assertTrue(tx.get_transcript_id()=="YAL067C")
Пример #5
0
    def test_build_ensembl_transcripts_by_genomic_location_index(self):
        """Test that we can get an ensembl transcript from a genomic position"""
        protocol = "file"
        transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
        output_filename = "out/test_ensemble_gtf_for_gp.db.idx"
        shutil.rmtree(output_filename, ignore_errors=True)

        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], transcript_index_filename, protocol=protocol)
        genome_build_factory.build_ensembl_transcripts_by_genomic_location_index(transcript_index_filename, output_filename, protocol=protocol)

        # Now load the index and look something up.
        gp_index = Shove(protocol + "://" + output_filename)
        gt_transcript_id = "YAL067C"
        bins = region2bins(1496172, 1496400)

        for bin in bins:
            key = 'I_' + str(bin)
            if key in gp_index.keys():
                self.assertTrue(gp_index[key] == gt_transcript_id)
Пример #6
0
    def test_build_ensembl_transcripts_by_genomic_location_index(self):
        """Test that we can get an ensembl transcript from a genomic position"""
        protocol = "file"
        transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
        output_filename = "out/test_ensemble_gtf_for_gp.db.idx"
        shutil.rmtree(output_filename, ignore_errors=True)

        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], transcript_index_filename, protocol=protocol)
        genome_build_factory.build_ensembl_transcripts_by_genomic_location_index(transcript_index_filename, output_filename, protocol=protocol)

        # Now load the index and look something up.
        gp_index = Shove(protocol + "://" + output_filename)
        gt_transcript_id = "YAL067C"
        bins = region2bins(1496172, 1496400)

        for bin in bins:
            key = 'I_' + str(bin)
            if key in gp_index.keys():
                self.assertTrue(gp_index[key] == gt_transcript_id)