Exemplo n.º 1
0
def _record_with_genes_only(reference):
    """
    Build a record that only lists the genes known for a reference.

    Every gene row found for the reference gets a single transcript with
    empty mRNA and CDS position lists; the record sequence is left empty.

    :param reference: A database reference entry (its ``id`` is used to
        select the transcript rows).
    :return: The record with ``geneList`` filled in and an empty ``seq``.
    """
    rows = Transcript.query.with_entities(Transcript.gene).filter_by(
        reference_id=reference.id).all()
    record = _bare_record(reference)

    def _placeholder_gene(row):
        # One gene holding a single transcript without any positions.
        gene = Gene(row.gene)
        locus = Locus(gene.newLocusTag())
        locus.mRNA = PList()
        locus.CDS = PList()
        gene.transcriptList.append(locus)
        return gene

    record.geneList = [_placeholder_gene(row) for row in rows]
    record.seq = Seq('', generic_dna)
    return record
Exemplo n.º 2
0
def _get_mutalyzer_record(reference, db_transcripts):
    """
    Creates a Mutalyzer specific record from the transcript entries retrieved
    from the gbparser database.

    :param reference: A gbparser database reference entry.
    :param db_transcripts: A gbparser database list of transcript.
    :return: The Mutalyzer record, or None when the sequence file of the
        reference cannot be read (IOError).
    """
    record = _bare_record(reference)

    # Extracting the transcripts from the DB entries.
    transcripts = []
    for db_transcript in db_transcripts:
        transcript = {
            'gene': db_transcript.gene,
            'strand': db_transcript.strand,
            'transcript_start': db_transcript.transcript_start,
            'transcript_stop': db_transcript.transcript_stop,
            'transcript_product': db_transcript.transcript_product,
            'exons': [],
            'exons_start': db_transcript.exons_start,
            'exons_stop': db_transcript.exons_stop,
            'transcriptID': db_transcript.transcript_accession + '.' +
            db_transcript.transcript_version,
        }
        # Protein related entries are only present for coding transcripts.
        if db_transcript.protein_accession is not None \
                and db_transcript.protein_version is not None:
            transcript['cds_start'] = db_transcript.cds_start
            transcript['cds_stop'] = db_transcript.cds_stop
            transcript['protein_product'] = db_transcript.protein_product
            transcript['proteinID'] = '%s.%s' % (
                db_transcript.protein_accession,
                db_transcript.protein_version)
            transcript['linkMethod'] = 'ncbi'

        # Exon boundaries are stored as comma separated strings. Build real
        # lists (not ``map`` objects) so that the truthiness and ``len``
        # checks below also hold on Python 3, where ``map`` is lazy.
        starts = [int(pos) for pos in db_transcript.exons_start.split(',')] \
            if db_transcript.exons_start else None
        stops = [int(pos) for pos in db_transcript.exons_stop.split(',')] \
            if db_transcript.exons_stop else None
        # Exons are only used when both lists are present and pair up.
        if starts and stops and len(starts) == len(stops):
            transcript['exons'] = [{'start': start, 'stop': stop}
                                   for start, stop in zip(starts, stops)]
        transcripts.append(transcript)

    # Generating the actual record entries in the Mutalyzer format.
    gene_dict = {}
    for entry in transcripts:
        if entry['gene'] in gene_dict:
            gene = gene_dict[entry['gene']]
        else:
            gene = Gene(entry['gene'])

        if entry['strand'] == '+':
            gene.orientation = 1
        if entry['strand'] == '-':
            gene.orientation = -1

        transcript = Locus(gene.newLocusTag())

        transcript.mRNA = PList()
        transcript.mRNA.location = [
            entry['transcript_start'], entry['transcript_stop']
        ]

        transcript.transcriptID = entry['transcriptID']
        transcript.exon = PList()
        if entry['exons']:
            exon_list = []
            for exon in entry['exons']:
                exon_list.extend([exon['start'], exon['stop']])
            transcript.exon.positionList = exon_list
        else:
            # Without exon information the transcript boundaries are used.
            transcript.exon.positionList = transcript.mRNA.location

        # NOTE(review): mRNA.positionList is the very same list object as
        # exon.positionList (and, in the fallback above, mRNA.location), so
        # the in-place sort affects all of them. Kept as in the original.
        transcript.mRNA.positionList = transcript.exon.positionList
        transcript.mRNA.positionList.sort()

        if entry.get('proteinID'):
            transcript.CDS = PList()
            transcript.CDS.location = [entry['cds_start'], entry['cds_stop']]

            transcript.CDS.positionList = cds_position_list(
                transcript.mRNA.positionList, transcript.CDS.location)

            transcript.proteinID = entry['proteinID']

            transcript.transcriptProduct = entry['transcript_product']
            transcript.proteinProduct = entry['protein_product']
            transcript.linkMethod = 'ncbi'
            transcript.transcribe = True
            transcript.translate = True
        else:
            # Non-coding transcript: no CDS and nothing to translate.
            transcript.linkMethod = None
            transcript.transcribe = True
            transcript.translate = False
            transcript.locusTag = ''

        gene.transcriptList.append(transcript)
        gene_dict[gene.name] = gene

    record.geneList = list(gene_dict.values())

    # Get the sequence.
    seq_path = settings.SEQ_PATH + reference.checksum_sequence + '.sequence'
    try:
        seq = Seq(_get_sequence_mmap(seq_path, 1, reference.length + 1),
                  generic_dna)
    except IOError:
        # The sequence file is missing or unreadable: no record available.
        return None

    record.seq = seq
    return record
Exemplo n.º 3
0
    def create_record(self, filename):
        """
        Create a GenRecord.Record from a GenBank file

        The file is parsed with BioPython first; its features are then
        grouped per gene and converted to genes and transcripts on the
        record. Genes that end up without any transcript are dropped.

        @arg filename: The full path to the compressed GenBank file
        @type filename: unicode

        @return: A GenRecord.Record instance
        @rtype: object (record)
        """
        rna_types = ("mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA", "tmRNA")

        def new_locus(gene, locus_tag):
            """
            Create the Locus for a transcript, named after the numeric part
            of the locus_tag if there is one, or after the next free locus
            tag of the gene otherwise.
            """
            if not locus_tag:
                return Locus(gene.newLocusTag())
            # Note: We use the last three characters of the locus_tag as a
            # unique transcript version id. This is also used to for the
            # protein-transcript link table.
            # Normally, locus_tag ends with three digits, but for some (e.g.
            # mobA on NC_011228, a plasmid) it ends with two digits
            # prepended with an underscore. Or prepended with a letter. We
            # really want a number, so 'fix' this by only looking for a
            # numeric part.
            try:
                version = LOCUS_TAG_VERSION.findall(locus_tag)[0].zfill(3)
            except IndexError:
                version = '000'
            return Locus(version)

        def set_translation_table(transcript, cds_feature):
            """
            Copy the transl_table qualifier of the CDS feature (if any) to
            the transcript.
            """
            if "transl_table" in cds_feature.qualifiers:
                transcript.txTable = \
                    int(cds_feature.qualifiers["transl_table"][0])

        # first create an intermediate genbank record with BioPython
        file_handle = bz2.BZ2File(filename, "r")
        try:
            biorecord = SeqIO.read(codecs.getreader('utf-8')(file_handle),
                                   "genbank")
        finally:
            # Also release the file when parsing fails.
            file_handle.close()

        record = Record()
        record.seq = biorecord.seq

        # Note: The .source_* values may be different from the values we are
        #     working with, e.g. for UD slices where these values (taken from
        #     the genbank file) are from the original NC reference. We try to
        #     set the .id field to the working value in the caller.
        record.source_id = biorecord.id
        record.source_accession, record.source_version = biorecord.id.split(
            '.')[:2]
        record.source_gi = biorecord.annotations['gi']
        record.organism = biorecord.annotations['organism']

        # Todo: This will change once we support protein references
        if isinstance(biorecord.seq.alphabet, ProteinAlphabet):
            return record

        exonList = []
        geneDict = {}

        accInfo = biorecord.annotations['accessions']
        if len(accInfo) >= 3 and accInfo[1] == "REGION:":
            # Todo: This information is present in the genbank file if it is a
            #     UD sliced from a chromosome. We can get the same information
            #     for NM references from our mapping database and that way
            #     also provide chromosomal variant descriptions for those.
            region = accInfo[2]
            if "complement" in region:
                # E.g. "complement(1..2)": take the last number, without the
                # closing parenthesis.
                record.orientation = -1
                record.chromOffset = int(region.split('.')[2][:-1])
            else:
                record.chromOffset = int(region.split('.')[0])

        # Collect the RNA, CDS and exon features per gene.
        for feature in biorecord.features:
            qualifiers = feature.qualifiers
            if not qualifiers:
                continue

            if feature.type == "source":
                if "mol_type" in qualifiers:
                    if qualifiers["mol_type"][0] in ["mRNA",
                                                     "transcribed RNA"]:
                        record.molType = 'n'
                    else:
                        record.molType = 'g'
                if "organelle" in qualifiers:
                    record.organelle = qualifiers["organelle"][0]
                    if record.organelle == "mitochondrion":
                        record.molType = 'm'

                fakeGene = Locus("001")
                record.source.transcriptList.append(fakeGene)
                fakeGene.CDS = PList()
                fakeGene.CDS.location = self.__location2pos(feature.location)

            if "gene" not in qualifiers:
                continue

            if not unicode(feature.location.start).isdigit() or \
               not unicode(feature.location.end).isdigit():
                # Feature is not completely in reference. Either start
                # or end is not a Bio.SeqFeature.ExactPosition.
                continue

            geneName = qualifiers["gene"][0]
            if feature.type == "gene":
                if geneName not in geneDict:
                    myGene = Gene(geneName)
                    record.geneList.append(myGene)
                    if feature.strand:
                        myGene.orientation = feature.strand
                    myGene.location = self.__location2pos(feature.location)
                    geneDict[geneName] = tempGene(geneName)
            elif geneName not in geneDict:
                # We should have seen a gene entry for this gene by now.
                # Could be that it was skipped because it was not completely
                # in reference (see check above). In that case we just
                # ignore any of its features.
                continue

            if feature.type in rna_types:
                geneDict[geneName].rnaList.append(feature)
            if feature.type == "CDS":
                geneDict[geneName].cdsList.append(feature)
            if feature.type == "exon":
                exonLocation = self.__location2pos(feature.location)
                if exonLocation:
                    exonList.extend(exonLocation)

        if record.molType in ['g', 'm']:
            for geneName in geneDict:
                myGene = geneDict[geneName]
                self.link(myGene.rnaList, myGene.cdsList)

                # Transcripts based on an RNA feature (with linked CDS).
                for rna in myGene.rnaList:
                    if not rna.usable:
                        continue
                    myRealGene = record.findGene(rna.gene)
                    myTranscript = new_locus(myRealGene, rna.locus_tag)
                    myTranscript.mRNA = PList()
                    myTranscript.mRNA.positionList = rna.positionList
                    myTranscript.mRNA.location = rna.location
                    myTranscript.transcribe = True
                    myTranscript.transcriptID = rna.transcript_id
                    myTranscript.transcriptProduct = rna.product
                    myTranscript.locusTag = rna.locus_tag
                    if rna.link:
                        myTranscript.CDS = PList()
                        myTranscript.CDS.positionList = rna.link.positionList
                        myTranscript.CDS.location = rna.link.location
                        myTranscript.translate = True
                        myTranscript.proteinID = rna.link.protein_id
                        myTranscript.linkMethod = rna.linkMethod
                        myTranscript.proteinProduct = rna.link.product
                        # The translation table is a qualifier of the linked
                        # CDS feature, not of the RNA feature itself.
                        set_translation_table(myTranscript, rna.link)
                    myRealGene.transcriptList.append(myTranscript)

                # Transcripts based on a CDS feature that was not linked to
                # any RNA feature.
                for cds in myGene.cdsList:
                    if cds.linked or \
                       not (cds.usable or not geneDict[myGene.name].rnaList):
                        continue
                    myRealGene = record.findGene(cds.gene)
                    myTranscript = new_locus(myRealGene, cds.locus_tag)
                    myTranscript.CDS = PList()
                    myTranscript.CDS.positionList = cds.positionList
                    myTranscript.CDS.location = cds.location
                    myTranscript.proteinID = cds.protein_id
                    myTranscript.proteinProduct = cds.product
                    set_translation_table(myTranscript, cds)
                    myRealGene.transcriptList.append(myTranscript)
        elif geneDict:
            # Only the first gene is used here. Look up its collected
            # features by name, so that they belong to the same gene.
            myRealGene = record.geneList[0]
            myGene = geneDict[myRealGene.name]
            if myGene.cdsList:
                myCDS = myGene.cdsList[0]
                self.__tagByDict(myCDS, "protein_id")
                self.__tagByDict(myCDS, "product")
            else:
                myCDS = None
            myTranscript = Locus("001")
            myTranscript.exon = PList()
            if exonList:
                myTranscript.exon.positionList = exonList
            else:
                myTranscript.exon.location = myRealGene.location
            if myCDS:
                myTranscript.CDS = PList()
                myTranscript.CDS.location = \
                    self.__location2pos(myCDS.location)
            # The CDS location can only be inspected if a CDS was set above.
            if exonList or myRealGene.location or \
               (myCDS and myTranscript.CDS.location):
                myTranscript.transcriptID = biorecord.id
                if myCDS:
                    myTranscript.proteinID = myCDS.protein_id
                    myTranscript.proteinProduct = myCDS.product
                    myTranscript.linkMethod = "exhaustion"
                    myTranscript.transcribe = True
                    set_translation_table(myTranscript, myCDS)
                myRealGene.transcriptList.append(myTranscript)

        # Drop genes without transcripts. A new list is built, because
        # removing items from a list while iterating over it skips elements.
        record.geneList = [
            gene for gene in record.geneList if gene.transcriptList
        ]

        return record
Exemplo n.º 4
0
def _get_mutalyzer_record(reference, db_transcripts):
    """
    Creates a Mutalyzer specific record from the transcript entries retrieved
    from the gbparser database.

    Transcripts without a protein accession/version are added as
    non-translated transcripts without a CDS.

    :param reference: A gbparser database reference entry.
    :param db_transcripts: A gbparser database list of transcript.
    :return: The Mutalyzer record, or None when the sequence file of the
        reference cannot be read (IOError).
    """
    record = _bare_record(reference)

    # Extracting the transcripts from the DB entries.
    transcripts = []
    for transcript in db_transcripts:
        # The protein ID can only be composed for coding transcripts;
        # concatenating missing (None) values would raise a TypeError.
        if transcript.protein_accession is not None \
                and transcript.protein_version is not None:
            protein_id = '%s.%s' % (transcript.protein_accession,
                                    transcript.protein_version)
        else:
            protein_id = None

        my_transcript = {
            'gene': transcript.gene,
            'strand': transcript.strand,
            'transcript_start': transcript.transcript_start,
            'transcript_stop': transcript.transcript_stop,
            'cds_start': transcript.cds_start,
            'cds_stop': transcript.cds_stop,
            'exons': [],
            'exons_start': transcript.exons_start,
            'exons_stop': transcript.exons_stop,
            'transcriptID': transcript.transcript_accession + '.' +
            transcript.transcript_version,
            'proteinID': protein_id,
            'linkMethod': 'ncbi'
        }

        # Exon boundaries are stored as comma separated strings. Build real
        # lists (not ``map`` objects) so that the truthiness and ``len``
        # checks below also hold on Python 3, where ``map`` is lazy.
        starts = [int(pos) for pos in transcript.exons_start.split(',')] \
            if transcript.exons_start else None
        stops = [int(pos) for pos in transcript.exons_stop.split(',')] \
            if transcript.exons_stop else None
        # Exons are only used when both lists are present and pair up.
        if starts and stops and len(starts) == len(stops):
            my_transcript['exons'] = [{'start': start, 'stop': stop}
                                      for start, stop in zip(starts, stops)]
        transcripts.append(my_transcript)

    # Generating the actual record entries in the Mutalyzer format.
    gene_dict = {}
    for transcript in transcripts:
        if transcript['gene'] in gene_dict:
            gene = gene_dict[transcript['gene']]
        else:
            gene = Gene(transcript['gene'])

        if transcript['strand'] == '+':
            gene.orientation = 1
        if transcript['strand'] == '-':
            gene.orientation = -1

        my_transcript = Locus(gene.newLocusTag())

        my_transcript.mRNA = PList()
        my_transcript.mRNA.location = [
            transcript['transcript_start'], transcript['transcript_stop']
        ]

        my_transcript.exon = PList()
        if transcript['exons']:
            exon_list = []
            for exon in transcript['exons']:
                exon_list.extend([exon['start'], exon['stop']])
            my_transcript.exon.positionList = exon_list
        else:
            # Without exon information the transcript boundaries are used.
            my_transcript.exon.positionList = my_transcript.mRNA.location

        # NOTE(review): mRNA.positionList is the very same list object as
        # exon.positionList (and, in the fallback above, mRNA.location), so
        # the in-place sort affects all of them. Kept as in the original.
        my_transcript.mRNA.positionList = my_transcript.exon.positionList
        my_transcript.mRNA.positionList.sort()

        my_transcript.transcriptID = transcript['transcriptID']
        my_transcript.transcribe = True

        if transcript['proteinID'] is not None:
            my_transcript.CDS = PList()
            my_transcript.CDS.location = [
                transcript['cds_start'], transcript['cds_stop']
            ]
            my_transcript.CDS.positionList = cds_position_list(
                my_transcript.mRNA.positionList, my_transcript.CDS.location)
            my_transcript.proteinID = transcript['proteinID']
            my_transcript.linkMethod = 'ncbi'
            my_transcript.translate = True
        else:
            # Non-coding transcript: no CDS and nothing to translate.
            my_transcript.linkMethod = None
            my_transcript.translate = False

        gene.transcriptList.append(my_transcript)
        gene_dict[gene.name] = gene

    record.geneList = list(gene_dict.values())

    # Get the sequence.
    seq_path = settings.SEQ_PATH + reference.checksum_sequence + '.sequence'
    try:
        seq = Seq(_get_sequence_mmap(seq_path, 1, reference.length + 1),
                  generic_dna)
    except IOError:
        # The sequence file is missing or unreadable: no record available.
        return None

    record.seq = seq
    return record