def create_gene_utrs(gene_gff_list):
    """
    Derive 5' and 3' UTR tracks from the CDS and exon tracks of a gene

    @type  gene_gff_list: list
    @param gene_gff_list: list with gff tuples of the (annotated) gene

    @rtype:  list
    @return: list with gff tuples of the UTR tracks; an empty list when
             CDS or exon coordinates are missing or no UTR position exists

    @attention: requires global variables GFF_CDS_FMETHOD, GFF_EXON_FMETHOD
    @attention: requires global variables GFF_UTR5_FSOURCE, GFF_UTR5_FMETHOD
    @attention: requires global variables GFF_UTR3_FSOURCE, GFF_UTR3_FMETHOD
    """
    # coordinate collections of the coding part and of all exons
    cds_positions = gffs2coordset(gene_gff_list, fmethod=[GFF_CDS_FMETHOD])
    exon_positions = gffs2coordset(gene_gff_list, fmethod=[GFF_EXON_FMETHOD])

    # without both CDS and exon coordinates no UTR can be derived
    if not (cds_positions and exon_positions):
        return []

    cds_first, cds_last = min(cds_positions), max(cds_positions)
    exon_first, exon_last = min(exon_positions), max(exon_positions)

    # exon positions in front of the lowest CDS coordinate -> 5'UTR
    five_prime = [pos for pos in range(exon_first, cds_first)
                  if pos in exon_positions]

    # exon positions behind the highest CDS coordinate -> 3'UTR.
    # When the exons end exactly 3nt behind the CDS, those 3nt are
    # only the stop codon and no 3'UTR is created.
    three_prime = []
    if cds_last + 3 != exon_last:
        three_prime = [pos for pos in range(cds_last + 1, exon_last + 1)
                       if pos in exon_positions]

    if not (five_prime or three_prime):
        return []

    # the first coding exon serves as backbone for all UTR tracks
    backbone = filtergffs4fmethod(gene_gff_list, fmethod=GFF_CDS_FMETHOD)[0]

    utrs = []
    for positions, fsource, fmethod in (
            (five_prime, GFF_UTR5_FSOURCE, GFF_UTR5_FMETHOD),
            (three_prime, GFF_UTR3_FSOURCE, GFF_UTR3_FMETHOD)):
        template = list(backbone)
        template[1] = fsource
        template[2] = fmethod
        utrs.extend(coordset2gfftracks(positions, template))
    return utrs
def correct_unigene_for_utrs( unigene_gff_list, start_codon_gff=(), stop_codon_gff=(), minimal_likely_tss_pssm_score=3.0, shift_tss_pssm_score_ratio=4.0, dnaseqfname=None, verbose=False, ): """ Check if unigene contains evidence for non-coding UTRs and if so, correct @type unigene_gff_list: list @param unigene_gff_list: list with uncorrected unigene gff tuples @type start_codon_gff: tuple @param start_codon_gff: tuple representing the (annotated) protein's start codon @type stop_codon_gff: tuple @param stop_codon_gff: tuple representing the (annotated) protein's stop codon @type dnaseqfname: string (or None) @param dnaseqfname: filename of DNA sequence corresponding to the unigene's GFF @type minimal_likely_tss_pssm_score: float @param minimal_likely_tss_pssm_score: minimal (likely) PSSM score of the TSS @type shift_tss_pssm_score_ratio: float @param shift_tss_pssm_score_ratio: shift TSS downstream when ratio is exceeded @type verbose: Boolean @param verbose: print discrepancies to STDOUT (True) or not (False, default) @rtype: list of (gff) tuples + typeofunigene string @return: list with corrected gff tuples + string @attention: Global variable GFF_UGEXON_FMETHOD is required for this function @attention: Global variable GFF_UG3UTREXON_FMETHOD is required for this function @attention: Global variable GFF_UG5UTREXON_FMETHOD is required for this function """ # return list with corrected unigene tracks return_unigene_gff_list = [] start_codon_pos = None stop_codon_pos = None typeofunigene = None # make sets of unigene coordinates unigene_coordinate_set = gffs2coordset(unigene_gff_list, fmethod=[GFF_UGEXON_FMETHOD]) if dnaseqfname: # print unigene structure annotation unigeneexons = filtergffs4fmethod(unigene_gff_list, GFF_UGEXON_FMETHOD) unigeneexons.sort() # replace fasta header for correct recognition # header,descr = parseSingleFastaHeaderFromFile(dnaseqfname) header, dnaseq, descr = parseSingleFasta(open(dnaseqfname).readlines()) for i in range(0, 
len(unigeneexons)): gff = list(unigeneexons[i]) gff[0] = header # correct for negative coordinate. This can happen in case # the unigene sticks out of the genelocus if gff[3] <= 0: gff[3] = 1 unigeneexons[i] = tuple(gff) # run unigeneannotation command command = "%s %s %s" % (PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION, dnaseqfname) ci, co = os.popen2(command) ci.write(gffs2txt(unigeneexons)) ci.close() ugannotation = co.read().strip().split("\t") co.close() typeofunigene = ugannotation[0] # abstract coordinates of start and stop codon # from unigene annotation try: start_codon_pos = int(ugannotation[5]) except: start_codon_pos = None try: stop_codon_pos = int(ugannotation[6]) except: stop_codon_pos = None ################################################################ if verbose: for track in unigene_gff_list: print track print ugannotation, start_codon_pos, stop_codon_pos print "given ATG:", start_codon_gff print "given TGA:", stop_codon_gff ################################################################ if start_codon_pos: # check if the PythonRegex obtained Methionine is the most # likely TSS. When a far better one is available -> shift # the TSS downstream (5p->3p) to this better TSS. 
startcodons = [] for gffpos in range(start_codon_pos, int(unigeneexons[0][4]), 3): if dnaseq[gffpos - 1 : gffpos - 1 + 3].upper() == "ATG": tssSta = gffpos - 1 - IC_TSS_PATTERN_OFFSET[0] tssEnd = gffpos - 1 + 3 + IC_TSS_PATTERN_OFFSET[1] tssSeq = dnaseq[tssSta:tssEnd] tssSco = score_tss(tssSeq) # print 'ATG', gffpos, "%1.2f" % tssSco startcodons.append((tssSco, gffpos)) # check if there are >1 start codon posibilities if len(startcodons) > 1 and startcodons[0][0] < minimal_likely_tss_pssm_score: for score, gffpos in startcodons[1:]: if ( score >= minimal_likely_tss_pssm_score and abs(score / startcodons[0][0]) > shift_tss_pssm_score_ratio ): start_codon_pos = gffpos # print "TSS pos SHIFTED", startcodons[0][1], "->", gffpos # break out after first shift; this is now *THE* TSS break elif start_codon_gff: # unigene is a fragment or other transcript without # likely ATG. Fortunately, ATG is applied from the given # gene structure. Take this one. start_codon_pos = int(start_codon_gff[3]) else: # NO start_codon_pos available -> unigene fragment! pass elif start_codon_gff or stop_codon_gff: typeofunigene = None # unknown -> no unigeneannotation # No dna sequence is applied to verify the ATG/TGA # positions of the unigene by unigene annotation. # Abstract coordinates of start and/or stop codons # from the given coordinates (from the gene's annotation) if start_codon_gff: start_codon_pos = int(start_codon_gff[3]) if stop_codon_gff: stop_codon_pos = int(stop_codon_gff[4]) else: typeofunigene = None # unknown -> no unigeneannotation ######################################################## if verbose: print "NONE GIVEN seq/sta/end:", dnaseqfname print "gff ATG:", start_codon_gff print "gff TGA:", stop_codon_gff ######################################################## # no anchors applied in terms of start/stop sites # TODO future update: find or predict the putative orf # of this unigene. That specific functionallity should # NOT be placed in this function! 
# for the time being, just return the input gff list. return unigene_gff_list, typeofunigene # create an unigene stop codon track when in unigene_coordinate_set if stop_codon_pos and stop_codon_pos in unigene_coordinate_set: # make a deepcopy of the first unigene exon track and make a list of it newgff = list(deepcopy(unigene_gff_list[0])) # update the coordinates newgff[2] = "UGstop" newgff[3] = stop_codon_pos - 2 newgff[4] = stop_codon_pos return_unigene_gff_list.append(tuple(newgff)) # CORRECT the unigene_coordinate_set for 5p nucleotides ignore_5p_coords = [] if ( start_codon_pos != None and start_codon_pos in unigene_coordinate_set and min(unigene_coordinate_set) < start_codon_pos ): if verbose: print "CREATE 5pUTR for ug:", typeofunigene # yes, there is a 5p unigene alignment part for coord in unigene_coordinate_set: if coord < start_codon_pos: # append to the ignore_5p_coords list ignore_5p_coords.append(coord) # remove from the unigene coord set for coord in ignore_5p_coords: unigene_coordinate_set.remove(coord) # CORRECT the unigene_coordinate_set for 3p nucleotides ignore_3p_coords = [] if ( stop_codon_pos != None and stop_codon_pos in unigene_coordinate_set and max(unigene_coordinate_set) > stop_codon_pos ): if verbose: print "CREATE 3pUTR for ug:", typeofunigene # yes, there is a 3p unigene alignment part for coord in unigene_coordinate_set: if coord > stop_codon_pos: # append to the ignore_5p_coords list ignore_3p_coords.append(coord) # remove from the unigene coord set for coord in ignore_3p_coords: unigene_coordinate_set.remove(coord) #### remove the stop codon position too ###unigene_coordinate_set.remove(stop_codon_pos-2) ###unigene_coordinate_set.remove(stop_codon_pos-1) ###unigene_coordinate_set.remove(stop_codon_pos) # make (new) UGExon tracks, corrected for UTRS, if needed if not (ignore_5p_coords or ignore_3p_coords) and unigene_coordinate_set: # no utrs available; just set the input to the output list 
return_unigene_gff_list.extend(unigene_gff_list) elif (ignore_5p_coords or ignore_3p_coords) and unigene_coordinate_set: # create new gff tracks for unigene exons unigene_exon_coords = list(unigene_coordinate_set) unigene_exon_coords.sort() track_coords = [[unigene_exon_coords[0]]] for coord in unigene_exon_coords[1:]: if coord == max(track_coords[-1]) + 1: track_coords[-1].append(coord) else: track_coords.append([coord]) for track in track_coords: # make a deepcopy of the first unigene exon track and make a list of it newgff = list(deepcopy(unigene_gff_list[0])) # update the coordinates newgff[3] = min(track) newgff[4] = max(track) # and append to the new return unigene gff list return_unigene_gff_list.append(tuple(newgff)) # make UTR5UGExon track if it exists if ignore_5p_coords: ignore_5p_coords.sort() tracks = [[ignore_5p_coords[0]]] for coord in ignore_5p_coords[1:]: if coord == max(tracks[-1]) + 1: tracks[-1].append(coord) else: tracks.append([coord]) # reverse tracks; if there are >1, inserting in the # return list will guarantee the correct order tracks.reverse() for track in tracks: # make a deepcopy of the first unigene exon track and make a list of it newgff = list(deepcopy(unigene_gff_list[0])) # update the coordinates newgff[2] = GFF_UG5UTREXON_FMETHOD newgff[3] = min(track) newgff[4] = max(track) # and insert as the first the new return unigene gff list return_unigene_gff_list.insert(0, tuple(newgff)) # make UTR3UGExon track if ignore_3p_coords: ignore_3p_coords.sort() tracks = [[ignore_3p_coords[0]]] for coord in ignore_3p_coords[1:]: if coord == max(tracks[-1]) + 1: tracks[-1].append(coord) else: tracks.append([coord]) for track in tracks: # make a deepcopy of the first unigene exon track and make a list of it newgff = list(deepcopy(unigene_gff_list[0])) # update the coordinates newgff[2] = GFF_UG3UTREXON_FMETHOD newgff[3] = min(track) newgff[4] = max(track) # and append to the new return unigene gff list 
return_unigene_gff_list.append(tuple(newgff)) else: # hmm... not really expected. There are UniGene tracks, # but no UniGene exons are recognized. Probably a wrong setting # applied for GFF_UGEXON_FMETHOD (not identical to the naming in # the input gff. pass # order the unigene gff list (stop codon potentially on the front return_unigene_gff_list = order_gff_list(return_unigene_gff_list) ################################################################ if verbose and (ignore_5p_coords or ignore_3p_coords): for track in return_unigene_gff_list: print track ################################################################ # done! return the new list return return_unigene_gff_list, typeofunigene
def correct_unigene_for_utrs(unigene_gff_list, start_codon_gff=(), stop_codon_gff=(), minimal_likely_tss_pssm_score=3.0, shift_tss_pssm_score_ratio=4.0, dnaseqfname=None, verbose=False): """ Check if unigene contains evidence for non-coding UTRs and if so, correct @type unigene_gff_list: list @param unigene_gff_list: list with uncorrected unigene gff tuples @type start_codon_gff: tuple @param start_codon_gff: tuple representing the (annotated) protein's start codon @type stop_codon_gff: tuple @param stop_codon_gff: tuple representing the (annotated) protein's stop codon @type dnaseqfname: string (or None) @param dnaseqfname: filename of DNA sequence corresponding to the unigene's GFF @type minimal_likely_tss_pssm_score: float @param minimal_likely_tss_pssm_score: minimal (likely) PSSM score of the TSS @type shift_tss_pssm_score_ratio: float @param shift_tss_pssm_score_ratio: shift TSS downstream when ratio is exceeded @type verbose: Boolean @param verbose: print discrepancies to STDOUT (True) or not (False, default) @rtype: list of (gff) tuples + typeofunigene string @return: list with corrected gff tuples + string @attention: Global variable GFF_UGEXON_FMETHOD is required for this function @attention: Global variable GFF_UG3UTREXON_FMETHOD is required for this function @attention: Global variable GFF_UG5UTREXON_FMETHOD is required for this function """ # return list with corrected unigene tracks return_unigene_gff_list = [] start_codon_pos = None stop_codon_pos = None typeofunigene = None # make sets of unigene coordinates unigene_coordinate_set = gffs2coordset(unigene_gff_list, fmethod=[GFF_UGEXON_FMETHOD]) if dnaseqfname: # print unigene structure annotation unigeneexons = filtergffs4fmethod(unigene_gff_list, GFF_UGEXON_FMETHOD) unigeneexons.sort() # replace fasta header for correct recognition #header,descr = parseSingleFastaHeaderFromFile(dnaseqfname) header, dnaseq, descr = parseSingleFasta(open(dnaseqfname).readlines()) for i in range(0, len(unigeneexons)): 
gff = list(unigeneexons[i]) gff[0] = header # correct for negative coordinate. This can happen in case # the unigene sticks out of the genelocus if gff[3] <= 0: gff[3] = 1 unigeneexons[i] = tuple(gff) # run unigeneannotation command command = "%s %s %s" % (PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION, dnaseqfname) ci, co = os.popen2(command) ci.write(gffs2txt(unigeneexons)) ci.close() ugannotation = co.read().strip().split("\t") co.close() typeofunigene = ugannotation[0] # abstract coordinates of start and stop codon # from unigene annotation try: start_codon_pos = int(ugannotation[5]) except: start_codon_pos = None try: stop_codon_pos = int(ugannotation[6]) except: stop_codon_pos = None ################################################################ if verbose: for track in unigene_gff_list: print track print ugannotation, start_codon_pos, stop_codon_pos print "given ATG:", start_codon_gff print "given TGA:", stop_codon_gff ################################################################ if start_codon_pos: # check if the PythonRegex obtained Methionine is the most # likely TSS. When a far better one is available -> shift # the TSS downstream (5p->3p) to this better TSS. 
startcodons = [] for gffpos in range(start_codon_pos, int(unigeneexons[0][4]), 3): if dnaseq[gffpos - 1:gffpos - 1 + 3].upper() == 'ATG': tssSta = gffpos - 1 - IC_TSS_PATTERN_OFFSET[0] tssEnd = gffpos - 1 + 3 + IC_TSS_PATTERN_OFFSET[1] tssSeq = dnaseq[tssSta:tssEnd] tssSco = score_tss(tssSeq) #print 'ATG', gffpos, "%1.2f" % tssSco startcodons.append((tssSco, gffpos)) # check if there are >1 start codon posibilities if len(startcodons) > 1 and startcodons[0][0] <\ minimal_likely_tss_pssm_score: for score, gffpos in startcodons[1:]: if score >= minimal_likely_tss_pssm_score and\ abs( score / startcodons[0][0] ) > shift_tss_pssm_score_ratio: start_codon_pos = gffpos #print "TSS pos SHIFTED", startcodons[0][1], "->", gffpos # break out after first shift; this is now *THE* TSS break elif start_codon_gff: # unigene is a fragment or other transcript without # likely ATG. Fortunately, ATG is applied from the given # gene structure. Take this one. start_codon_pos = int(start_codon_gff[3]) else: # NO start_codon_pos available -> unigene fragment! pass elif start_codon_gff or stop_codon_gff: typeofunigene = None # unknown -> no unigeneannotation # No dna sequence is applied to verify the ATG/TGA # positions of the unigene by unigene annotation. # Abstract coordinates of start and/or stop codons # from the given coordinates (from the gene's annotation) if start_codon_gff: start_codon_pos = int(start_codon_gff[3]) if stop_codon_gff: stop_codon_pos = int(stop_codon_gff[4]) else: typeofunigene = None # unknown -> no unigeneannotation ######################################################## if verbose: print "NONE GIVEN seq/sta/end:", dnaseqfname print "gff ATG:", start_codon_gff print "gff TGA:", stop_codon_gff ######################################################## # no anchors applied in terms of start/stop sites # TODO future update: find or predict the putative orf # of this unigene. That specific functionallity should # NOT be placed in this function! 
# for the time being, just return the input gff list. return unigene_gff_list, typeofunigene # create an unigene stop codon track when in unigene_coordinate_set if stop_codon_pos and stop_codon_pos in unigene_coordinate_set: # make a deepcopy of the first unigene exon track and make a list of it newgff = list(deepcopy(unigene_gff_list[0])) # update the coordinates newgff[2] = 'UGstop' newgff[3] = stop_codon_pos - 2 newgff[4] = stop_codon_pos return_unigene_gff_list.append(tuple(newgff)) # CORRECT the unigene_coordinate_set for 5p nucleotides ignore_5p_coords = [] if start_codon_pos != None and start_codon_pos in\ unigene_coordinate_set and min(unigene_coordinate_set) < start_codon_pos: if verbose: print "CREATE 5pUTR for ug:", typeofunigene # yes, there is a 5p unigene alignment part for coord in unigene_coordinate_set: if coord < start_codon_pos: # append to the ignore_5p_coords list ignore_5p_coords.append(coord) # remove from the unigene coord set for coord in ignore_5p_coords: unigene_coordinate_set.remove(coord) # CORRECT the unigene_coordinate_set for 3p nucleotides ignore_3p_coords = [] if stop_codon_pos != None and stop_codon_pos in\ unigene_coordinate_set and max(unigene_coordinate_set) > stop_codon_pos: if verbose: print "CREATE 3pUTR for ug:", typeofunigene # yes, there is a 3p unigene alignment part for coord in unigene_coordinate_set: if coord > stop_codon_pos: # append to the ignore_5p_coords list ignore_3p_coords.append(coord) # remove from the unigene coord set for coord in ignore_3p_coords: unigene_coordinate_set.remove(coord) #### remove the stop codon position too ###unigene_coordinate_set.remove(stop_codon_pos-2) ###unigene_coordinate_set.remove(stop_codon_pos-1) ###unigene_coordinate_set.remove(stop_codon_pos) # make (new) UGExon tracks, corrected for UTRS, if needed if not (ignore_5p_coords or ignore_3p_coords) and unigene_coordinate_set: # no utrs available; just set the input to the output list 
return_unigene_gff_list.extend(unigene_gff_list) elif (ignore_5p_coords or ignore_3p_coords) and unigene_coordinate_set: # create new gff tracks for unigene exons unigene_exon_coords = list(unigene_coordinate_set) unigene_exon_coords.sort() track_coords = [[unigene_exon_coords[0]]] for coord in unigene_exon_coords[1:]: if coord == max(track_coords[-1]) + 1: track_coords[-1].append(coord) else: track_coords.append([coord]) for track in track_coords: # make a deepcopy of the first unigene exon track and make a list of it newgff = list(deepcopy(unigene_gff_list[0])) # update the coordinates newgff[3] = min(track) newgff[4] = max(track) # and append to the new return unigene gff list return_unigene_gff_list.append(tuple(newgff)) # make UTR5UGExon track if it exists if ignore_5p_coords: ignore_5p_coords.sort() tracks = [[ignore_5p_coords[0]]] for coord in ignore_5p_coords[1:]: if coord == max(tracks[-1]) + 1: tracks[-1].append(coord) else: tracks.append([coord]) # reverse tracks; if there are >1, inserting in the # return list will guarantee the correct order tracks.reverse() for track in tracks: # make a deepcopy of the first unigene exon track and make a list of it newgff = list(deepcopy(unigene_gff_list[0])) # update the coordinates newgff[2] = GFF_UG5UTREXON_FMETHOD newgff[3] = min(track) newgff[4] = max(track) # and insert as the first the new return unigene gff list return_unigene_gff_list.insert(0, tuple(newgff)) # make UTR3UGExon track if ignore_3p_coords: ignore_3p_coords.sort() tracks = [[ignore_3p_coords[0]]] for coord in ignore_3p_coords[1:]: if coord == max(tracks[-1]) + 1: tracks[-1].append(coord) else: tracks.append([coord]) for track in tracks: # make a deepcopy of the first unigene exon track and make a list of it newgff = list(deepcopy(unigene_gff_list[0])) # update the coordinates newgff[2] = GFF_UG3UTREXON_FMETHOD newgff[3] = min(track) newgff[4] = max(track) # and append to the new return unigene gff list 
return_unigene_gff_list.append(tuple(newgff)) else: # hmm... not really expected. There are UniGene tracks, # but no UniGene exons are recognized. Probably a wrong setting # applied for GFF_UGEXON_FMETHOD (not identical to the naming in # the input gff. pass # order the unigene gff list (stop codon potentially on the front return_unigene_gff_list = order_gff_list(return_unigene_gff_list) ################################################################ if verbose and (ignore_5p_coords or ignore_3p_coords): for track in return_unigene_gff_list: print track ################################################################ # done! return the new list return return_unigene_gff_list, typeofunigene