# Example #1
0
def create_gene_utrs(gene_gff_list):
    """
    Derive 5' and 3' UTR tracks from the CDS and exon tracks of a gene

    Exon coordinates located before the first CDS coordinate become 5'UTR;
    exon coordinates located behind the last CDS coordinate become 3'UTR,
    unless the exons end exactly 3 positions behind the CDS (then only a
    stop codon follows the CDS and no 3'UTR is made).

    @type  gene_gff_list: list
    @param gene_gff_list: list with gff tuples of the (annotated) gene

    @rtype  utrs: list
    @return utrs: list with gff tuples of the UTR tracks (empty when the
                  gene has no CDS or exon coordinates, or no UTR positions)

    @attention: requires global variables GFF_CDS_FMETHOD, GFF_EXON_FMETHOD
    @attention: requires global variables GFF_UTR5_FSOURCE, GFF_UTR5_FMETHOD
    @attention: requires global variables GFF_UTR3_FSOURCE, GFF_UTR3_FMETHOD
    """
    # coordinate collections of the coding part and of all exons
    cds_positions = gffs2coordset(gene_gff_list, fmethod=[GFF_CDS_FMETHOD])
    exon_positions = gffs2coordset(gene_gff_list, fmethod=[GFF_EXON_FMETHOD])

    # both collections are needed to tell coding from non-coding positions
    if not (cds_positions and exon_positions):
        return []

    # 5'UTR: exonic positions upstream of the first coding position
    five_prime = [
        pos for pos in range(min(exon_positions), min(cds_positions))
        if pos in exon_positions
    ]

    # 3'UTR: exonic positions downstream of the last coding position;
    # skipped when the exons end exactly 3 positions behind the CDS
    # (only a stop codon)
    three_prime = []
    cds_last = max(cds_positions)
    exon_last = max(exon_positions)
    if cds_last + 3 != exon_last:
        three_prime = [
            pos for pos in range(cds_last + 1, exon_last + 1)
            if pos in exon_positions
        ]

    if not (five_prime or three_prime):
        return []

    # the first coding exon serves as the template for every UTR track
    coding_exons = filtergffs4fmethod(gene_gff_list, fmethod=GFF_CDS_FMETHOD)
    utr_tracks = []

    # 5'UTR tracks
    template = list(coding_exons[0])
    template[1] = GFF_UTR5_FSOURCE
    template[2] = GFF_UTR5_FMETHOD
    utr_tracks.extend(coordset2gfftracks(five_prime, template))

    # 3'UTR tracks
    template = list(coding_exons[0])
    template[1] = GFF_UTR3_FSOURCE
    template[2] = GFF_UTR3_FMETHOD
    utr_tracks.extend(coordset2gfftracks(three_prime, template))

    return utr_tracks
def correct_unigene_for_utrs(
    unigene_gff_list,
    start_codon_gff=(),
    stop_codon_gff=(),
    minimal_likely_tss_pssm_score=3.0,
    shift_tss_pssm_score_ratio=4.0,
    dnaseqfname=None,
    verbose=False,
):
    """
    Check if unigene contains evidence for non-coding UTRs and if so, correct

    Start/stop codon positions are taken from the external unigene
    annotation program when a DNA sequence file is given, otherwise from
    the given start/stop codon gff tuples. Unigene exon coordinates before
    the start codon / behind the stop codon are split off into separate
    5'UTR / 3'UTR tracks. When neither a sequence file nor a start or stop
    codon is given, the input list is returned unchanged.

    @type  unigene_gff_list: list
    @param unigene_gff_list: list with uncorrected unigene gff tuples

    @type  start_codon_gff: tuple
    @param start_codon_gff: tuple representing the (annotated) protein's start codon

    @type  stop_codon_gff: tuple
    @param stop_codon_gff: tuple representing the (annotated) protein's stop codon

    @type  dnaseqfname: string (or None)
    @param dnaseqfname: filename of DNA sequence corresponding to the unigene's GFF

    @type  minimal_likely_tss_pssm_score: float
    @param minimal_likely_tss_pssm_score: minimal (likely) PSSM score of the TSS

    @type  shift_tss_pssm_score_ratio: float
    @param shift_tss_pssm_score_ratio: shift TSS downstream when ratio is exceeded

    @type  verbose: Boolean
    @param verbose: print discrepancies to STDOUT (True) or not (False, default)

    @rtype:  list of (gff) tuples + typeofunigene string
    @return: list with corrected gff tuples + string (None when the
             unigene annotation program was not run)

    @attention: Global variable GFF_UGEXON_FMETHOD  is required for this function
    @attention: Global variable GFF_UG3UTREXON_FMETHOD  is required for this function
    @attention: Global variable GFF_UG5UTREXON_FMETHOD  is required for this function
    @attention: Global variables PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION and
                IC_TSS_PATTERN_OFFSET are required when dnaseqfname is given
    """
    # function-scope import; subprocess replaces the deprecated os.popen2
    import subprocess

    # unknown -> stays None unless the unigene annotation program is run
    typeofunigene = None

    if not (dnaseqfname or start_codon_gff or stop_codon_gff):
        ########################################################
        if verbose:
            print("NONE GIVEN seq/sta/end: %s" % (dnaseqfname,))
            print("gff ATG: %s" % (start_codon_gff,))
            print("gff TGA: %s" % (stop_codon_gff,))
        ########################################################
        # no anchors applied in terms of start/stop sites
        # TODO future update: find or predict the putative orf
        # of this unigene. That specific functionallity should
        # NOT be placed in this function!
        # for the time being, just return the input gff list.
        return unigene_gff_list, typeofunigene

    def annotated_position(fields, index):
        """Return fields[index] as int, or None when absent / not a number."""
        try:
            return int(fields[index])
        except (IndexError, ValueError):
            return None

    def contiguous_runs(coords):
        """Split coords into ascending lists of consecutive integers."""
        runs = []
        for coord in sorted(coords):
            if runs and coord == runs[-1][-1] + 1:
                runs[-1].append(coord)
            else:
                runs.append([coord])
        return runs

    def make_track(first, last, fmethod=None):
        """Copy the first input track with new coordinates (and fmethod)."""
        newgff = list(deepcopy(unigene_gff_list[0]))
        if fmethod is not None:
            newgff[2] = fmethod
        newgff[3] = first
        newgff[4] = last
        return tuple(newgff)

    # return list with corrected unigene tracks
    return_unigene_gff_list = []
    start_codon_pos = None
    stop_codon_pos = None
    # make sets of unigene coordinates
    unigene_coordinate_set = gffs2coordset(unigene_gff_list, fmethod=[GFF_UGEXON_FMETHOD])

    if dnaseqfname:
        unigeneexons = filtergffs4fmethod(unigene_gff_list, GFF_UGEXON_FMETHOD)
        unigeneexons.sort()
        # read the sequence; close the file handle even if parsing fails
        fastahandle = open(dnaseqfname)
        try:
            header, dnaseq, _ = parseSingleFasta(fastahandle.readlines())
        finally:
            fastahandle.close()
        # replace fasta header for correct recognition
        for i, exon in enumerate(unigeneexons):
            gff = list(exon)
            gff[0] = header
            # correct for negative coordinate. This can happen in case
            # the unigene sticks out of the genelocus
            if gff[3] <= 0:
                gff[3] = 1
            unigeneexons[i] = tuple(gff)

        # run unigeneannotation command; the exon gff is fed on STDIN and a
        # single tab-delimited line is read back. communicate() writes and
        # reads concurrently, so the pipes cannot deadlock.
        # NOTE(review): the command is still run through the shell (as
        # os.popen2 did); dnaseqfname must not contain shell metacharacters.
        command = "%s %s %s" % (PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION, dnaseqfname)
        process = subprocess.Popen(
            command,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        output = process.communicate(gffs2txt(unigeneexons))[0]
        ugannotation = output.strip().split("\t")
        typeofunigene = ugannotation[0]
        # abstract coordinates of start and stop codon
        # from unigene annotation (fields 5 and 6)
        start_codon_pos = annotated_position(ugannotation, 5)
        stop_codon_pos = annotated_position(ugannotation, 6)

        ################################################################
        if verbose:
            for track in unigene_gff_list:
                print(track)
            print("%s %s %s" % (ugannotation, start_codon_pos, stop_codon_pos))
            print("given ATG: %s" % (start_codon_gff,))
            print("given TGA: %s" % (stop_codon_gff,))
        ################################################################

        if start_codon_pos:
            # check if the PythonRegex obtained Methionine is the most
            # likely TSS. When a far better one is available -> shift
            # the TSS downstream (5p->3p) to this better TSS.
            startcodons = []
            # walk in-frame through the first unigene exon; gff positions
            # are 1-based, hence the -1 when slicing the sequence
            for gffpos in range(start_codon_pos, int(unigeneexons[0][4]), 3):
                if dnaseq[gffpos - 1:gffpos + 2].upper() == "ATG":
                    tssSta = gffpos - 1 - IC_TSS_PATTERN_OFFSET[0]
                    tssEnd = gffpos - 1 + 3 + IC_TSS_PATTERN_OFFSET[1]
                    startcodons.append((score_tss(dnaseq[tssSta:tssEnd]), gffpos))
            # check if there are >1 start codon posibilities
            if len(startcodons) > 1 and startcodons[0][0] < minimal_likely_tss_pssm_score:
                first_score = startcodons[0][0]
                for score, gffpos in startcodons[1:]:
                    if score < minimal_likely_tss_pssm_score:
                        continue
                    # a first score of exactly 0 makes the ratio infinite;
                    # treat that as "ratio exceeded" instead of crashing
                    # with a ZeroDivisionError
                    if first_score == 0 or abs(score / first_score) > shift_tss_pssm_score_ratio:
                        start_codon_pos = gffpos
                        # break out after first shift; this is now *THE* TSS
                        break
        elif start_codon_gff:
            # unigene is a fragment or other transcript without
            # likely ATG. Fortunately, ATG is applied from the given
            # gene structure. Take this one.
            start_codon_pos = int(start_codon_gff[3])
        # else: NO start_codon_pos available -> unigene fragment!
    else:
        # No dna sequence is applied to verify the ATG/TGA
        # positions of the unigene by unigene annotation.
        # Abstract coordinates of start and/or stop codons
        # from the given coordinates (from the gene's annotation)
        if start_codon_gff:
            start_codon_pos = int(start_codon_gff[3])
        if stop_codon_gff:
            stop_codon_pos = int(stop_codon_gff[4])

    # create an unigene stop codon track when in unigene_coordinate_set;
    # stop_codon_pos is the last nucleotide of the stop codon
    if stop_codon_pos and stop_codon_pos in unigene_coordinate_set:
        return_unigene_gff_list.append(
            make_track(stop_codon_pos - 2, stop_codon_pos, "UGstop")
        )

    # CORRECT the unigene_coordinate_set for 5p nucleotides
    ignore_5p_coords = []
    if (
        start_codon_pos is not None
        and start_codon_pos in unigene_coordinate_set
        and min(unigene_coordinate_set) < start_codon_pos
    ):
        if verbose:
            print("CREATE 5pUTR for ug: %s" % (typeofunigene,))
        # yes, there is a 5p unigene alignment part
        ignore_5p_coords = [
            coord for coord in unigene_coordinate_set if coord < start_codon_pos
        ]
        # remove from the unigene coord set
        for coord in ignore_5p_coords:
            unigene_coordinate_set.remove(coord)

    # CORRECT the unigene_coordinate_set for 3p nucleotides
    ignore_3p_coords = []
    if (
        stop_codon_pos is not None
        and stop_codon_pos in unigene_coordinate_set
        and max(unigene_coordinate_set) > stop_codon_pos
    ):
        if verbose:
            print("CREATE 3pUTR for ug: %s" % (typeofunigene,))
        # yes, there is a 3p unigene alignment part
        ignore_3p_coords = [
            coord for coord in unigene_coordinate_set if coord > stop_codon_pos
        ]
        # remove from the unigene coord set; the stop codon positions
        # themselves stay part of the unigene exon coordinates
        for coord in ignore_3p_coords:
            unigene_coordinate_set.remove(coord)

    has_utrs = bool(ignore_5p_coords or ignore_3p_coords)

    # make (new) UGExon tracks, corrected for UTRS, if needed
    if unigene_coordinate_set and not has_utrs:
        # no utrs available; just set the input to the output list
        return_unigene_gff_list.extend(unigene_gff_list)
    elif unigene_coordinate_set:
        # create new gff tracks for the remaining unigene exon coordinates
        for run in contiguous_runs(unigene_coordinate_set):
            return_unigene_gff_list.append(make_track(run[0], run[-1]))
        # UTR5UGExon tracks go, in ascending order, in front of the list
        return_unigene_gff_list[0:0] = [
            make_track(run[0], run[-1], GFF_UG5UTREXON_FMETHOD)
            for run in contiguous_runs(ignore_5p_coords)
        ]
        # UTR3UGExon tracks are appended at the end
        for run in contiguous_runs(ignore_3p_coords):
            return_unigene_gff_list.append(
                make_track(run[0], run[-1], GFF_UG3UTREXON_FMETHOD)
            )
    # else: hmm... not really expected. There are UniGene tracks,
    # but no UniGene exons are recognized. Probably a wrong setting
    # applied for GFF_UGEXON_FMETHOD (not identical to the naming in
    # the input gff).

    # order the unigene gff list (stop codon potentially on the front)
    return_unigene_gff_list = order_gff_list(return_unigene_gff_list)
    ################################################################
    if verbose and has_utrs:
        for track in return_unigene_gff_list:
            print(track)
    ################################################################

    # done! return the new list
    return return_unigene_gff_list, typeofunigene
# Example #3
0
def correct_unigene_for_utrs(unigene_gff_list,
                             start_codon_gff=(),
                             stop_codon_gff=(),
                             minimal_likely_tss_pssm_score=3.0,
                             shift_tss_pssm_score_ratio=4.0,
                             dnaseqfname=None,
                             verbose=False):
    """
    Check if unigene contains evidence for non-coding UTRs and if so, correct

    @type  unigene_gff_list: list
    @param unigene_gff_list: list with uncorrected unigene gff tuples

    @type  start_codon_gff: tuple
    @param start_codon_gff: tuple representing the (annotated) protein's start codon
    
    @type  stop_codon_gff: tuple
    @param stop_codon_gff: tuple representing the (annotated) protein's stop codon

    @type  dnaseqfname: string (or None)
    @param dnaseqfname: filename of DNA sequence corresponding to the unigene's GFF

    @type  minimal_likely_tss_pssm_score: float
    @param minimal_likely_tss_pssm_score: minimal (likely) PSSM score of the TSS

    @type  shift_tss_pssm_score_ratio: float
    @param shift_tss_pssm_score_ratio: shift TSS downstream when ratio is exceeded

    @type  verbose: Boolean
    @param verbose: print discrepancies to STDOUT (True) or not (False, default)

    @rtype:  list of (gff) tuples + typeofunigene string
    @return: list with corrected gff tuples + string

    @attention: Global variable GFF_UGEXON_FMETHOD  is required for this function
    @attention: Global variable GFF_UG3UTREXON_FMETHOD  is required for this function
    @attention: Global variable GFF_UG5UTREXON_FMETHOD  is required for this function
    """
    # return list with corrected unigene tracks
    return_unigene_gff_list = []
    start_codon_pos = None
    stop_codon_pos = None
    typeofunigene = None
    # make sets of unigene coordinates
    unigene_coordinate_set = gffs2coordset(unigene_gff_list,
                                           fmethod=[GFF_UGEXON_FMETHOD])

    if dnaseqfname:
        # print unigene structure annotation
        unigeneexons = filtergffs4fmethod(unigene_gff_list, GFF_UGEXON_FMETHOD)
        unigeneexons.sort()
        # replace fasta header for correct recognition
        #header,descr = parseSingleFastaHeaderFromFile(dnaseqfname)
        header, dnaseq, descr = parseSingleFasta(open(dnaseqfname).readlines())
        for i in range(0, len(unigeneexons)):
            gff = list(unigeneexons[i])
            gff[0] = header
            # correct for negative coordinate. This can happen in case
            # the unigene sticks out of the genelocus
            if gff[3] <= 0: gff[3] = 1
            unigeneexons[i] = tuple(gff)

        # run unigeneannotation command
        command = "%s %s %s" % (PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION,
                                dnaseqfname)
        ci, co = os.popen2(command)
        ci.write(gffs2txt(unigeneexons))
        ci.close()
        ugannotation = co.read().strip().split("\t")
        co.close()
        typeofunigene = ugannotation[0]
        # abstract coordinates of start and stop codon
        # from unigene annotation
        try:
            start_codon_pos = int(ugannotation[5])
        except:
            start_codon_pos = None
        try:
            stop_codon_pos = int(ugannotation[6])
        except:
            stop_codon_pos = None

        ################################################################
        if verbose:
            for track in unigene_gff_list:
                print track
            print ugannotation, start_codon_pos, stop_codon_pos
            print "given ATG:", start_codon_gff
            print "given TGA:", stop_codon_gff
        ################################################################

        if start_codon_pos:
            # check if the PythonRegex obtained Methionine is the most
            # likely TSS. When a far better one is available -> shift
            # the TSS downstream (5p->3p) to this better TSS.
            startcodons = []
            for gffpos in range(start_codon_pos, int(unigeneexons[0][4]), 3):
                if dnaseq[gffpos - 1:gffpos - 1 + 3].upper() == 'ATG':
                    tssSta = gffpos - 1 - IC_TSS_PATTERN_OFFSET[0]
                    tssEnd = gffpos - 1 + 3 + IC_TSS_PATTERN_OFFSET[1]
                    tssSeq = dnaseq[tssSta:tssEnd]
                    tssSco = score_tss(tssSeq)
                    #print 'ATG', gffpos, "%1.2f" % tssSco
                    startcodons.append((tssSco, gffpos))
            # check if there are >1 start codon posibilities
            if len(startcodons) > 1 and startcodons[0][0] <\
            minimal_likely_tss_pssm_score:
                for score, gffpos in startcodons[1:]:
                    if score >= minimal_likely_tss_pssm_score and\
                    abs( score / startcodons[0][0] ) > shift_tss_pssm_score_ratio:
                        start_codon_pos = gffpos
                        #print "TSS pos SHIFTED", startcodons[0][1], "->", gffpos
                        # break out after first shift; this is now *THE* TSS
                        break
        elif start_codon_gff:
            # unigene is a fragment or other transcript without
            # likely ATG. Fortunately, ATG is applied from the given
            # gene structure. Take this one.
            start_codon_pos = int(start_codon_gff[3])
        else:
            # NO start_codon_pos available -> unigene fragment!
            pass

    elif start_codon_gff or stop_codon_gff:
        typeofunigene = None  # unknown -> no unigeneannotation
        # No dna sequence is applied to verify the ATG/TGA
        # positions of the unigene by unigene annotation.
        # Abstract coordinates of start and/or stop codons
        # from the given coordinates (from the gene's annotation)
        if start_codon_gff:
            start_codon_pos = int(start_codon_gff[3])
        if stop_codon_gff:
            stop_codon_pos = int(stop_codon_gff[4])
    else:
        typeofunigene = None  # unknown -> no unigeneannotation
        ########################################################
        if verbose:
            print "NONE GIVEN seq/sta/end:", dnaseqfname
            print "gff ATG:", start_codon_gff
            print "gff TGA:", stop_codon_gff
        ########################################################
        # no anchors applied in terms of start/stop sites
        # TODO future update: find or predict the putative orf
        # of this unigene. That specific functionallity should
        # NOT be placed in this function!
        # for the time being, just return the input gff list.
        return unigene_gff_list, typeofunigene

    # create an unigene stop codon track when in unigene_coordinate_set
    if stop_codon_pos and stop_codon_pos in unigene_coordinate_set:
        # make a deepcopy of the first unigene exon track and make a list of it
        newgff = list(deepcopy(unigene_gff_list[0]))
        # update the coordinates
        newgff[2] = 'UGstop'
        newgff[3] = stop_codon_pos - 2
        newgff[4] = stop_codon_pos
        return_unigene_gff_list.append(tuple(newgff))

    # CORRECT the unigene_coordinate_set for 5p nucleotides
    ignore_5p_coords = []
    if start_codon_pos != None and start_codon_pos in\
    unigene_coordinate_set and min(unigene_coordinate_set) < start_codon_pos:
        if verbose: print "CREATE 5pUTR for ug:", typeofunigene
        # yes, there is a 5p unigene alignment part
        for coord in unigene_coordinate_set:
            if coord < start_codon_pos:
                # append to the ignore_5p_coords list
                ignore_5p_coords.append(coord)
        # remove from the unigene coord set
        for coord in ignore_5p_coords:
            unigene_coordinate_set.remove(coord)

    # CORRECT the unigene_coordinate_set for 3p nucleotides
    ignore_3p_coords = []
    if stop_codon_pos != None and stop_codon_pos in\
    unigene_coordinate_set and max(unigene_coordinate_set) > stop_codon_pos:
        if verbose: print "CREATE 3pUTR for ug:", typeofunigene
        # yes, there is a 3p unigene alignment part
        for coord in unigene_coordinate_set:
            if coord > stop_codon_pos:
                # append to the ignore_5p_coords list
                ignore_3p_coords.append(coord)
        # remove from the unigene coord set
        for coord in ignore_3p_coords:
            unigene_coordinate_set.remove(coord)
        #### remove the stop codon position too
        ###unigene_coordinate_set.remove(stop_codon_pos-2)
        ###unigene_coordinate_set.remove(stop_codon_pos-1)
        ###unigene_coordinate_set.remove(stop_codon_pos)

    # make (new) UGExon tracks, corrected for UTRS, if needed
    if not (ignore_5p_coords or ignore_3p_coords) and unigene_coordinate_set:
        # no utrs available; just set the input to the output list
        return_unigene_gff_list.extend(unigene_gff_list)

    elif (ignore_5p_coords or ignore_3p_coords) and unigene_coordinate_set:
        # create new gff tracks for unigene exons
        unigene_exon_coords = list(unigene_coordinate_set)
        unigene_exon_coords.sort()
        track_coords = [[unigene_exon_coords[0]]]
        for coord in unigene_exon_coords[1:]:
            if coord == max(track_coords[-1]) + 1:
                track_coords[-1].append(coord)
            else:
                track_coords.append([coord])
        for track in track_coords:
            # make a deepcopy of the first unigene exon track and make a list of it
            newgff = list(deepcopy(unigene_gff_list[0]))
            # update the coordinates
            newgff[3] = min(track)
            newgff[4] = max(track)
            # and append to the new return unigene gff list
            return_unigene_gff_list.append(tuple(newgff))

        # make UTR5UGExon track if it exists
        if ignore_5p_coords:
            ignore_5p_coords.sort()
            tracks = [[ignore_5p_coords[0]]]
            for coord in ignore_5p_coords[1:]:
                if coord == max(tracks[-1]) + 1:
                    tracks[-1].append(coord)
                else:
                    tracks.append([coord])
            # reverse tracks; if there are >1, inserting in the
            # return list will guarantee the correct order
            tracks.reverse()
            for track in tracks:
                # make a deepcopy of the first unigene exon track and make a list of it
                newgff = list(deepcopy(unigene_gff_list[0]))
                # update the coordinates
                newgff[2] = GFF_UG5UTREXON_FMETHOD
                newgff[3] = min(track)
                newgff[4] = max(track)
                # and insert as the first the new return unigene gff list
                return_unigene_gff_list.insert(0, tuple(newgff))

        # make UTR3UGExon track
        if ignore_3p_coords:
            ignore_3p_coords.sort()
            tracks = [[ignore_3p_coords[0]]]
            for coord in ignore_3p_coords[1:]:
                if coord == max(tracks[-1]) + 1:
                    tracks[-1].append(coord)
                else:
                    tracks.append([coord])
            for track in tracks:
                # make a deepcopy of the first unigene exon track and make a list of it
                newgff = list(deepcopy(unigene_gff_list[0]))
                # update the coordinates
                newgff[2] = GFF_UG3UTREXON_FMETHOD
                newgff[3] = min(track)
                newgff[4] = max(track)
                # and append to the new return unigene gff list
                return_unigene_gff_list.append(tuple(newgff))

    else:
        # hmm... not really expected. There are UniGene tracks,
        # but no UniGene exons are recognized. Probably a wrong setting
        # applied for GFF_UGEXON_FMETHOD (not identical to the naming in
        # the input gff.
        pass

    # order the unigene gff list (stop codon potentially on the front
    return_unigene_gff_list = order_gff_list(return_unigene_gff_list)
    ################################################################
    if verbose and (ignore_5p_coords or ignore_3p_coords):
        for track in return_unigene_gff_list:
            print track
    ################################################################

    # done! return the new list
    return return_unigene_gff_list, typeofunigene