Exemplo n.º 1
0
    def __get_binned_data(self, chr, start, end, type):
        """Return the records stored in every bin that overlaps a region.

        Args:
            chr: Chromosome name used as the top-level key of the binned data.
                "M" is rewritten to "M_rCRS" when the data has no "M" entry
                but does have an "M_rCRS" entry.
            start: Region start, forwarded to ``region2bins``.
            end: Region end, forwarded to ``region2bins``.
            type: "gene" to search ``self.Genes`` or "transcript" to search
                ``self.Transcripts``.

        Returns:
            A list with the records of all overlapping bins.  An empty list is
            returned (and a warning is logged) when the chromosome is not a
            key of the selected data.

        Raises:
            ValueError: If ``type`` is neither "gene" nor "transcript".
            GafInvalidChromosomeValue: If ``chr`` is "M" and the selected data
                has neither an "M" nor an "M_rCRS" entry.
        """
        if type == "gene":
            data = self.Genes
        elif type == "transcript":
            data = self.Transcripts
        else:
            # Without this branch ``data`` stayed unbound and the method failed
            # further down with an unrelated UnboundLocalError.
            raise ValueError(
                "Unknown binned data type: %s (expected 'gene' or 'transcript')" % (str(type))
            )

        # GAF uses M_rCRS whereas mutations are often using M for the chromosome field.
        if chr == "M" and ("M" not in data):
            if "M_rCRS" not in data:
                # Both values must be inside the formatting tuple; the message
                # has two placeholders, so formatting with ``chr`` alone raised
                # TypeError instead of the intended exception.
                raise GafInvalidChromosomeValue(
                    "Unable to process mitochondria mutation with chr: %s   .... data keys are %s"
                    % (str(chr), str(data.keys()))
                )

            # TODO: Verify that it is okay to do this.
            chr = "M_rCRS"

        if chr not in data:
            # NOTE(review): if self.logger is a stdlib logging.Logger, ``warn``
            # is a deprecated alias of ``warning`` -- confirm and switch.
            self.logger.warn("Invalid chromosome value for Gaf search: %s" % (str(chr)))
            return []

        records = []
        for b in region2bins(start, end):
            # A bin with no entries simply contributes nothing.
            records.extend(data[chr].get(b, []))

        return records
Exemplo n.º 2
0
    def __get_binned_data(self, chr, start, end, type):
        """Return the records stored in every bin that overlaps a region.

        Args:
            chr: Chromosome name used as the top-level key of the binned data.
                'M' is rewritten to 'M_rCRS' when the data has no 'M' entry
                but does have an 'M_rCRS' entry.
            start: Region start, forwarded to ``region2bins``.
            end: Region end, forwarded to ``region2bins``.
            type: 'gene' to search ``self.Genes`` or 'transcript' to search
                ``self.Transcripts``.

        Returns:
            A list with the records of all overlapping bins.  An empty list is
            returned (and a warning is logged) when the chromosome is not a
            key of the selected data.

        Raises:
            ValueError: If ``type`` is neither 'gene' nor 'transcript'.
            GafInvalidChromosomeValue: If ``chr`` is 'M' and the selected data
                has neither an 'M' nor an 'M_rCRS' entry.
        """
        if type == 'gene':
            data = self.Genes
        elif type == 'transcript':
            data = self.Transcripts
        else:
            # Without this branch ``data`` stayed unbound and the method failed
            # further down with an unrelated UnboundLocalError.
            raise ValueError(
                "Unknown binned data type: %s (expected 'gene' or 'transcript')"
                % (str(type)))

        # GAF uses M_rCRS whereas mutations are often using M for the chromosome field.
        if chr == 'M' and ('M' not in data):
            if 'M_rCRS' not in data:
                # Both values must be inside the formatting tuple; the message
                # has two placeholders, so formatting with ``chr`` alone raised
                # TypeError instead of the intended exception.
                raise GafInvalidChromosomeValue(
                    "Unable to process mitochondria mutation with chr: %s   .... data keys are %s"
                    % (str(chr), str(data.keys())))

            # TODO: Verify that it is okay to do this.
            chr = 'M_rCRS'

        if chr not in data:
            # NOTE(review): if self.logger is a stdlib logging.Logger, ``warn``
            # is a deprecated alias of ``warning`` -- confirm and switch.
            self.logger.warn("Invalid chromosome value for Gaf search: %s" %
                             (str(chr)))
            return []

        records = []
        for b in region2bins(start, end):
            # A bin with no entries simply contributes nothing.
            records.extend(data[chr].get(b, []))

        return records
    def _get_binned_transcripts_given_index(self, chr, start, end, index_dict):
        """Collect the unique entries indexed under the bins covering a region.

        ``start`` and ``end`` are coerced to ``int`` and handed to
        ``region2bins``.  Each resulting bin is looked up in ``index_dict``
        under the key ``"<chr>_<bin>"``; bins without an entry are skipped.

        Returns:
            A set holding every entry found under the matching keys.
        """
        region_start = int(start)
        region_end = int(end)

        hits = []
        for bin_id in region2bins(region_start, region_end):
            lookup_key = chr + "_" + str(bin_id)
            try:
                hits.extend(index_dict[lookup_key])
            except KeyError:
                # Nothing was indexed for this bin.
                continue
        return set(hits)
    def _get_binned_transcripts_given_index(self, chr, start, end, index_dict):
        """Gather the distinct entries stored for the bins spanning a region.

        The integer-coerced ``start``/``end`` are passed to ``region2bins``;
        for each bin the key ``"<chr>_<bin>"`` is read from ``index_dict``.
        Keys that are missing from the index are ignored.

        Returns:
            A set of all entries found under the keys that were present.
        """
        first = int(start)
        last = int(end)

        found = []
        for bin_number in region2bins(first, last):
            index_key = chr + "_" + str(bin_number)
            try:
                found.extend(index_dict[index_key])
            except KeyError:
                # This bin has no entry in the index.
                continue
        return set(found)
Exemplo n.º 5
0
    def test_build_ensembl_transcripts_by_genomic_location_index(self):
        """Build a genomic-location index from ensembl test data and query it by bin."""
        protocol = "file"
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
        transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
        output_filename = "out/test_ensemble_gtf_for_gp.db.idx"

        # Remove any index left at the output path before rebuilding it.
        shutil.rmtree(output_filename, ignore_errors=True)

        factory = GenomeBuildFactory()
        factory.build_ensembl_transcript_index(
            [ensembl_input_gtf],
            [ensembl_input_fasta],
            transcript_index_filename,
            protocol=protocol
        )
        factory.build_ensembl_transcripts_by_genomic_location_index(
            transcript_index_filename,
            output_filename,
            protocol=protocol
        )

        # Now load the index and look something up.
        gp_index = Shove(protocol + "://" + output_filename)
        expected_transcript_id = "YAL067C"

        # NOTE(review): bins whose key is absent from the index are skipped, so
        # no assertion runs when none of the keys are present -- confirm that
        # this is intended.
        for bin_id in region2bins(1496172, 1496400):
            key = 'I_' + str(bin_id)
            if key not in gp_index.keys():
                continue
            self.assertTrue(gp_index[key] == expected_transcript_id)
Exemplo n.º 6
0
    def test_build_ensembl_transcripts_by_genomic_location_index(self):
        """Create the by-genomic-location index from ensembl test files and look up a region."""
        protocol = "file"
        gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
        transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
        output_filename = "out/test_ensemble_gtf_for_gp.db.idx"

        # Clear whatever exists at the output path; errors are ignored.
        shutil.rmtree(output_filename, ignore_errors=True)

        builder = GenomeBuildFactory()
        builder.build_ensembl_transcript_index(
            [gtf_path], [fasta_path], transcript_index_filename, protocol=protocol
        )
        builder.build_ensembl_transcripts_by_genomic_location_index(
            transcript_index_filename, output_filename, protocol=protocol
        )

        # Now load the index and look something up.
        gp_index = Shove(protocol + "://" + output_filename)
        gt_transcript_id = "YAL067C"

        # NOTE(review): keys missing from the index are skipped silently, so
        # the loop asserts nothing if no key is found -- confirm intended.
        for region_bin in region2bins(1496172, 1496400):
            key = 'I_' + str(region_bin)
            if key not in gp_index.keys():
                continue
            self.assertTrue(gp_index[key] == gt_transcript_id)