def assign_utrornongene3p_inwpcbgs(inwpcbgs,verbose=True): """ """ # return empty list when no inwpcbgs applied if not inwpcbgs: return [] # get most likely first & final inwpCBG pointer in inwpcbgs list posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs) # return variable list noncoding_inwpcbg_list = [] # get data of most likely first inwpCBG max_cntAnnot = max([inwp.count_orfs_labeled_as_annotated_exon() for inwp in inwpcbgs ]) finalInwpCBG = inwpcbgs[posFinal] final_cnt_is_final = finalInwpCBG.count_orfs_labeled_as_final_exon() final_identityscore = finalInwpCBG.get_identityscore() final_prjtls_aadif = finalInwpCBG.get_projected_tailing_stop_aa_difference() final_prjtls_nonad = finalInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference() # range of inwpCBGs which are checked for deletion range_3p_test = range(posFinal+1,len(inwpcbgs)) # detect UTR or nongene / noncoding inwpCBGS for pos in range(0,len(inwpcbgs)): if pos not in range_3p_test: continue # get this inwpCBG and get statistics inwpCBG = inwpcbgs[pos] cntFinal = inwpCBG.count_orfs_labeled_as_first_exon() # calculated differntly as for *the* firstCBG cntAnnot = float(inwpCBG.organism_set_size()) # break when to putatively first is reached if cntFinal == final_cnt_is_final: break # remove poorly covered inwpCBGs with low identityscore and not # having a likely stop codon if cntAnnot/max_cntAnnot < 0.80 and\ inwpCBG.get_identityscore() / final_identityscore <=\ NONCODINGNONGENE_3p_INWPCBG_MAX_IDENTITYRATIO and\ inwpCBG.organism_set_size() < final_cnt_is_final and\ final_prjtls_aadif < inwpCBG.get_projected_tailing_stop_aa_difference() and\ final_prjtls_nonad < inwpCBG.get_projected_tailing_stop_nonaligned_aa_difference(): # not contribution to the gene structure at all.... noncoding_inwpcbg_list.append(inwpCBG) continue # check relative position towards the current finalInwpCBG # position is measured in actual nt distance and tcode 'distance': # the lowest scoring TCODE window in between these inwpCBGs ntdistdict = finalInwpCBG.nt_spacing_between_codingblocks([inwpCBG]) tcodedistdict = finalInwpCBG.tcode_spacing_between_codingblocks([inwpCBG]) # remove highest & lowest distance and then do stats on remaining dists if len(ntdistdict) >= 3: _tmp = [ (v,k) for k,v in ntdistdict.iteritems() ] _tmp.sort() del( ntdistdict[_tmp[0][1]] ) del( ntdistdict[_tmp[-1][1]] ) if len(tcodedistdict) >= 3: _tmp = [ (v,k) for k,v in tcodedistdict.iteritems() ] _tmp.sort() del( tcodedistdict[_tmp[0][1]] ) del( tcodedistdict[_tmp[-1][1]] ) # do 3 checks. # 1) are average and maximum intergenic distances are bridged? # 2) is stop codon projection a deterioration? # 4) does tcodedistance suggest bridging of a non-coding stretch? check_1 = sum(ntdistdict.values())/len(ntdistdict) >=\ AVERAGE_INTERGENIC_MIN_NT_LENGTH and\ max(ntdistdict.values()) >= MAX_INTERGENIC_MIN_NT_LENGTH check_2 = final_prjtls_aadif <\ inwpCBG.get_projected_tailing_stop_aa_difference() and\ final_prjtls_nonad <\ inwpCBG.get_projected_tailing_stop_nonaligned_aa_difference() check_3 = len(tcodedistdict)>0 and\ sum(tcodedistdict.values())/len(tcodedistdict) <=\ TCODE_MAX_NONCODING if [ check_1, check_2, check_3 ].count(True) >= 2: # not contribution to the gene structure at all.... noncoding_inwpcbg_list.append(inwpCBG) continue # do is_coding() test iscoding = inwpCBG.is_coding() ######################################################################## if verbose: print pos, "3'UTR analyses:", inwpCBG, iscoding, print cntAnnot/max_cntAnnot ######################################################################## if not iscoding: # probably non-coding inwp CBG alignment block noncoding_inwpcbg_list.append(inwpCBG) continue # return the noncoding_inwpcbg_list return noncoding_inwpcbg_list
def assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,GTG,exclude_annotated=True,verbose=True): """ """ # return empty list when no inwpcbgs applied if not inwpcbgs: return [] # get target organism identifier target = inwpcbgs[0]._get_target_organism() # return list with inwpcbgs gtgdiscrepancy_inwpcbg_list = [] if exclude_annotated: # get most likely first & final inwpCBG pointer in inwpcbgs list posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs) range_5p_test = range(0,posFirst) range_3p_test = range(posFinal+1,len(inwpcbgs)) protected_target_orfid_list = [] for inwpCBG in inwpcbgs[posFirst:posFinal+1]: if inwpCBG.count_orfs_labeled_as_annotated_exon() > 0: protected_target_orfid_list.append( inwpCBG.get_orfs_of_graph(organism=target)[0].id ) else: range_5p_test = [] range_3p_test = [] protected_target_orfid_list = [] ############################################################################ if verbose and exclude_annotated: print "NOT-excluded:", range_5p_test, range_3p_test ############################################################################ # detect UTR or nongene / noncoding inwpCBGS for pos in range(0,len(inwpcbgs)): if exclude_annotated and pos in range_5p_test: pass elif exclude_annotated and pos in range_3p_test: pass elif exclude_annotated and inwpcbgs[pos].count_orfs_labeled_as_annotated_exon() == 0: # in the middle of the annotated geen structure, but not a single # Orf annotated as an exon. Asses for gtg difference too! pass elif exclude_annotated: continue else: pass # get this inwpCBG and thisInwpCBG = inwpcbgs[pos] # ignore if the target's Orf is belonging to a `protected` Orf if protected_target_orfid_list and\ thisInwpCBG.get_orfs_of_graph(organism=target)[0].id in\ protected_target_orfid_list: continue # ignore inwpCBGs which are very likely (poor quality) SignalP alignments cntSP = float(thisInwpCBG.count_orfs_with_signalpeptides()) if cntSP/(thisInwpCBG.count_genomic_informants()+1) > 0.66 and\ thisInwpCBG.get_signalp_score() > 0.75: continue # create its GeneTreeGraph gtg = pcg2gtg_by_identity(thisInwpCBG,target) # step 1. Do the gtg/GTG difference check difference = _relative_gtg_difference(gtg,GTG,target) if difference < NONGENE_GTG_MAX_DIFFERENCE: # step 2. Do the CEXPANDER check if thisInwpCBG.node_count() <= 2: gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG) ################################################################ if verbose: print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % ( difference,NONGENE_GTG_MAX_DIFFERENCE), print thisInwpCBG.get_organism_nodes(target)[0] ################################################################ elif thisInwpCBG.get_cexpander_uniformly_aligned_count() == 0: gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG) ################################################################ if verbose: print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % ( difference,NONGENE_GTG_MAX_DIFFERENCE), print thisInwpCBG.get_organism_nodes(target)[0] ################################################################ else: # cexpander check is succesfull, GTGdifference claims # the aligment is bogus. Do a more elaborate check on # some other variables of thisInwpCBG # calculate the difference between minsr & maxsr lengths node = thisInwpCBG.get_organism_nodes(target)[0] minsr = thisInwpCBG.minimal_spanning_range_sizes()[node] maxsr = thisInwpCBG.maximal_spanning_range_sizes()[node] msr_ratio = float(minsr)/float(maxsr) # calculate the ratio between average weights of gtg and GTG average_wt_gtg = _pairwise_gtg_average_weight(gtg,target) average_wt_GTG = _pairwise_gtg_average_weight(GTG,target) gtg_ratio = average_wt_gtg / average_wt_GTG if msr_ratio < NONGENE_GTG_MAX_MSR_RATIO and\ gtg_ratio < NONGENE_GTG_MAX_GTG_RATIO: gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG) ################################################################ if verbose: print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % ( difference,NONGENE_GTG_MAX_DIFFERENCE), print thisInwpCBG.get_organism_nodes(target)[0] ################################################################ else: pass else: pass # return the gtgdiscrepancy_inwpcbg_list return gtgdiscrepancy_inwpcbg_list
def assign_utrornongene5p_inwpcbgs(inwpcbgs,verbose=True): """ """ # return empty list when no inwpcbgs applied if not inwpcbgs: return [] # get most likely first & final inwpCBG pointer in inwpcbgs list posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs) # return variable list noncoding_inwpcbg_list = [] # get data of most likely first inwpCBG max_cntAnnot = max([inwp.count_orfs_labeled_as_annotated_exon() for inwp in inwpcbgs ]) firstInwpCBG = inwpcbgs[posFirst] first_cnt_is_first = firstInwpCBG.count_orfs_labeled_as_first_exon() first_identityscore = firstInwpCBG.get_identityscore() first_upstrTSScnt = [ pf.has_upstream_tss() for pf in firstInwpCBG.pacbps.values() ].count(True) if (max_cntAnnot-1) == 0: # avoid ZeroDivisionError first_upstrTSSratio = 0.0 else: first_upstrTSSratio = float(first_upstrTSScnt) / (max_cntAnnot-1) # range of inwpCBGs which are checked for deletion range_5p_test = range(0,posFirst) # detect UTR or nongene / noncoding inwpCBGS for pos in range(0,len(inwpcbgs)): if pos not in range_5p_test: continue # get this inwpCBG and get statistics inwpCBG = inwpcbgs[pos] cntFirst = inwpCBG.count_orfs_labeled_as_first_exon() # calculated differently as for *the* firstCBG cntAnnot = float(inwpCBG.organism_set_size()) # break when to putatively first is reached if cntFirst == first_cnt_is_first: break # do is_coding() test iscoding = inwpCBG.is_coding() # calculate cnt/ratio of upstrTSS sites this_upstrTSScnt = [ pf.has_upstream_tss() for pf in inwpCBG.pacbps.values() ].count(True) if (max_cntAnnot-1) == 0: this_upstrTSSratio = 0.0 else: this_upstrTSSratio = float(this_upstrTSScnt) / (max_cntAnnot-1.0) ######################################################################## if verbose: print pos, range_5p_test, "5'UTR analyses:", inwpCBG, iscoding, print "coverage:",cntAnnot/max_cntAnnot, print "upstrTSS cnt - ratio: %s - %1.2f" % ( this_upstrTSScnt, this_upstrTSSratio) ######################################################################## if not iscoding: # inwpCBGs most likely not coding alignments -> remove noncoding_inwpcbg_list.append(inwpCBG) continue # check relative position towards the current firstInwpCBG # position is measured in actual nt distance and tcode 'distance': # the lowest scoring TCODE window in between these inwpCBGs tcodedistdict = inwpCBG.tcode_spacing_between_codingblocks([firstInwpCBG]) if len(tcodedistdict) >= 3: _tmp = [ (v,k) for k,v in tcodedistdict.iteritems() ] _tmp.sort() del( tcodedistdict[_tmp[0][1]] ) del( tcodedistdict[_tmp[-1][1]] ) ######################################################################## if verbose and len(tcodedistdict) >= 1: print pos, sum(tcodedistdict.values())/len(tcodedistdict), print TCODE_MAX_NONCODING, firstInwpCBG ######################################################################## # continue when coverage is to high if cntAnnot/max_cntAnnot >= 0.40: continue if len(tcodedistdict)>0 and\ sum(tcodedistdict.values())/len(tcodedistdict) <= TCODE_MAX_NONCODING: noncoding_inwpcbg_list.append(inwpCBG) continue if this_upstrTSScnt == 0: # no upstream TSS sites at all -> remove! noncoding_inwpcbg_list.append(inwpCBG) continue # do a furter check for unlikely first inwpCBG blocks if inwpCBG.get_identityscore() / first_identityscore <=\ NONCODINGNONGENE_5p_INWPCBG_MAX_IDENTITYRATIO and\ inwpCBG.organism_set_size() < first_cnt_is_first and\ first_upstrTSSratio==0.0: noncoding_inwpcbg_list.append(inwpCBG) continue # do a furter check for unlikely first inwpCBG blocks if inwpCBG.get_identityscore() / first_identityscore <=\ NONCODINGNONGENE_5p_INWPCBG_MAX_IDENTITYRATIO and\ inwpCBG.organism_set_size() < first_cnt_is_first and\ (first_upstrTSSratio!=0.0 and (this_upstrTSSratio / first_upstrTSSratio) < 0.6): noncoding_inwpcbg_list.append(inwpCBG) continue # do a final check for unlikely first inwpCBG blocks # all parameters must be (slightly) poorer if inwpCBG.get_identityscore() < first_identityscore and\ inwpCBG.organism_set_size() < first_cnt_is_first and\ inwpCBG.get_average_upstream_methionine_pssm_score() <\ firstInwpCBG.get_average_upstream_methionine_pssm_score(): noncoding_inwpcbg_list.append(inwpCBG) continue # return the noncoding_inwpcbg_list return noncoding_inwpcbg_list
def assign_internal_nongene_alignments(inwpcbgs,GTG,exclude_annotated=False,verbose=True): """ TODO TODO: this function must be moved to another location. TODO TODO: better place in inwpCBGs/blocks filtering """ # return empty list when no inwpcbgs applied if not inwpcbgs: return [] # get target organism identifier target = inwpcbgs[0]._get_target_organism() # return list with inwpcbgs gtgdiscrepancy_inwpcbg_list = [] # get most likely first & final inwpCBG pointer in inwpcbgs list posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs) # check if posFirst,posFinal+1 isa non-empty range if not range(posFirst,posFinal+1): return [] # get info on the *best* covered inwpCBG best_nt_identity = max([ inwpcbgs[pos].get_nt_identity() for pos in range(posFirst,posFinal+1) ]) best_bitscore = max([ inwpcbgs[pos].get_bitscore() for pos in range(posFirst,posFinal+1) ]) best_annot_cnt = max([ inwpcbgs[pos].count_orfs_labeled_as_annotated_exon() for pos in range(posFirst,posFinal+1) ]) best_bits_per_aa = max([ float( inwpcbgs[pos].get_bitscore() ) / float( sum([pf.get_unextended_length() for pf in inwpcbgs[pos].pacbps.values() ]) ) for pos in range(posFirst,posFinal+1) ]) for pos in range(posFirst,posFinal+1): # get this inwpCBG and thisInwpCBG = inwpcbgs[pos] if pos > 0: prevInwpCBG = inwpcbgs[pos-1] else: prevInwpCBG = None if pos < len(inwpcbgs)-1: nextInwpCBG = inwpcbgs[pos+1] else: nextInwpCBG = None if prevInwpCBG and Set(prevInwpCBG.get_nodes()).intersection(thisInwpCBG.get_nodes()): continue if nextInwpCBG and Set(nextInwpCBG.get_nodes()).intersection(thisInwpCBG.get_nodes()): continue tot_length = sum([pf.get_unextended_length() for pf in thisInwpCBG.pacbps.values() ]) bits = thisInwpCBG.get_bitscore() bits_per_aa = float(bits)/float(tot_length) if bits_per_aa/best_bits_per_aa < 0.55 and\ float(thisInwpCBG.count_orfs_labeled_as_annotated_exon()) /\ float(best_annot_cnt) <= 0.50: minsr = thisInwpCBG.minimal_spanning_range(organism=target) print " __XX__", pos, thisInwpCBG, thisInwpCBG.get_nt_identity(), bits, float(bits)/float(tot_length), thisInwpCBG.count_orfs_labeled_as_annotated_exon() if prevInwpCBG and len(prevInwpCBG.maximal_spanning_range(organism=target).intersection(minsr)) == len(minsr): ################################################################ if verbose: print "PREV::", len(prevInwpCBG.maximal_spanning_range(organism=target).intersection(minsr)), len(minsr) ################################################################ gtgdiscrepancy_inwpcbg_list.append( thisInwpCBG ) continue if nextInwpCBG and len(nextInwpCBG.maximal_spanning_range(organism=target).intersection(minsr)) == len(minsr): ################################################################ if verbose: print "NEXT::", len(nextInwpCBG.maximal_spanning_range(organism=target).intersection(minsr)), len(minsr) ################################################################ gtgdiscrepancy_inwpcbg_list.append( thisInwpCBG ) continue # return list with conflicts return gtgdiscrepancy_inwpcbg_list
def assign_utrornongene3p_inwpcbgs(inwpcbgs, verbose=True): """ """ # return empty list when no inwpcbgs applied if not inwpcbgs: return [] # get most likely first & final inwpCBG pointer in inwpcbgs list posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs) # return variable list noncoding_inwpcbg_list = [] # get data of most likely first inwpCBG max_cntAnnot = max( [inwp.count_orfs_labeled_as_annotated_exon() for inwp in inwpcbgs]) finalInwpCBG = inwpcbgs[posFinal] final_cnt_is_final = finalInwpCBG.count_orfs_labeled_as_final_exon() final_identityscore = finalInwpCBG.get_identityscore() final_prjtls_aadif = finalInwpCBG.get_projected_tailing_stop_aa_difference( ) final_prjtls_nonad = finalInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference( ) # range of inwpCBGs which are checked for deletion range_3p_test = range(posFinal + 1, len(inwpcbgs)) # detect UTR or nongene / noncoding inwpCBGS for pos in range(0, len(inwpcbgs)): if pos not in range_3p_test: continue # get this inwpCBG and get statistics inwpCBG = inwpcbgs[pos] cntFinal = inwpCBG.count_orfs_labeled_as_first_exon() # calculated differntly as for *the* firstCBG cntAnnot = float(inwpCBG.organism_set_size()) # break when to putatively first is reached if cntFinal == final_cnt_is_final: break # remove poorly covered inwpCBGs with low identityscore and not # having a likely stop codon if cntAnnot/max_cntAnnot < 0.80 and\ inwpCBG.get_identityscore() / final_identityscore <=\ NONCODINGNONGENE_3p_INWPCBG_MAX_IDENTITYRATIO and\ inwpCBG.organism_set_size() < final_cnt_is_final and\ final_prjtls_aadif < inwpCBG.get_projected_tailing_stop_aa_difference() and\ final_prjtls_nonad < inwpCBG.get_projected_tailing_stop_nonaligned_aa_difference(): # not contribution to the gene structure at all.... noncoding_inwpcbg_list.append(inwpCBG) continue # check relative position towards the current finalInwpCBG # position is measured in actual nt distance and tcode 'distance': # the lowest scoring TCODE window in between these inwpCBGs ntdistdict = finalInwpCBG.nt_spacing_between_codingblocks([inwpCBG]) tcodedistdict = finalInwpCBG.tcode_spacing_between_codingblocks( [inwpCBG]) # remove highest & lowest distance and then do stats on remaining dists if len(ntdistdict) >= 3: _tmp = [(v, k) for k, v in ntdistdict.iteritems()] _tmp.sort() del (ntdistdict[_tmp[0][1]]) del (ntdistdict[_tmp[-1][1]]) if len(tcodedistdict) >= 3: _tmp = [(v, k) for k, v in tcodedistdict.iteritems()] _tmp.sort() del (tcodedistdict[_tmp[0][1]]) del (tcodedistdict[_tmp[-1][1]]) # do 3 checks. # 1) are average and maximum intergenic distances are bridged? # 2) is stop codon projection a deterioration? # 4) does tcodedistance suggest bridging of a non-coding stretch? check_1 = sum(ntdistdict.values())/len(ntdistdict) >=\ AVERAGE_INTERGENIC_MIN_NT_LENGTH and\ max(ntdistdict.values()) >= MAX_INTERGENIC_MIN_NT_LENGTH check_2 = final_prjtls_aadif <\ inwpCBG.get_projected_tailing_stop_aa_difference() and\ final_prjtls_nonad <\ inwpCBG.get_projected_tailing_stop_nonaligned_aa_difference() check_3 = len(tcodedistdict)>0 and\ sum(tcodedistdict.values())/len(tcodedistdict) <=\ TCODE_MAX_NONCODING if [check_1, check_2, check_3].count(True) >= 2: # not contribution to the gene structure at all.... noncoding_inwpcbg_list.append(inwpCBG) continue # do is_coding() test iscoding = inwpCBG.is_coding() ######################################################################## if verbose: print pos, "3'UTR analyses:", inwpCBG, iscoding, print cntAnnot / max_cntAnnot ######################################################################## if not iscoding: # probably non-coding inwp CBG alignment block noncoding_inwpcbg_list.append(inwpCBG) continue # return the noncoding_inwpcbg_list return noncoding_inwpcbg_list
def assign_utrornongene5p_inwpcbgs(inwpcbgs, verbose=True): """ """ # return empty list when no inwpcbgs applied if not inwpcbgs: return [] # get most likely first & final inwpCBG pointer in inwpcbgs list posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs) # return variable list noncoding_inwpcbg_list = [] # get data of most likely first inwpCBG max_cntAnnot = max( [inwp.count_orfs_labeled_as_annotated_exon() for inwp in inwpcbgs]) firstInwpCBG = inwpcbgs[posFirst] first_cnt_is_first = firstInwpCBG.count_orfs_labeled_as_first_exon() first_identityscore = firstInwpCBG.get_identityscore() first_upstrTSScnt = [ pf.has_upstream_tss() for pf in firstInwpCBG.pacbps.values() ].count(True) if (max_cntAnnot - 1) == 0: # avoid ZeroDivisionError first_upstrTSSratio = 0.0 else: first_upstrTSSratio = float(first_upstrTSScnt) / (max_cntAnnot - 1) # range of inwpCBGs which are checked for deletion range_5p_test = range(0, posFirst) # detect UTR or nongene / noncoding inwpCBGS for pos in range(0, len(inwpcbgs)): if pos not in range_5p_test: continue # get this inwpCBG and get statistics inwpCBG = inwpcbgs[pos] cntFirst = inwpCBG.count_orfs_labeled_as_first_exon() # calculated differently as for *the* firstCBG cntAnnot = float(inwpCBG.organism_set_size()) # break when to putatively first is reached if cntFirst == first_cnt_is_first: break # do is_coding() test iscoding = inwpCBG.is_coding() # calculate cnt/ratio of upstrTSS sites this_upstrTSScnt = [ pf.has_upstream_tss() for pf in inwpCBG.pacbps.values() ].count(True) if (max_cntAnnot - 1) == 0: this_upstrTSSratio = 0.0 else: this_upstrTSSratio = float(this_upstrTSScnt) / (max_cntAnnot - 1.0) ######################################################################## if verbose: print pos, range_5p_test, "5'UTR analyses:", inwpCBG, iscoding, print "coverage:", cntAnnot / max_cntAnnot, print "upstrTSS cnt - ratio: %s - %1.2f" % (this_upstrTSScnt, this_upstrTSSratio) ######################################################################## if not iscoding: # inwpCBGs most likely not coding alignments -> remove noncoding_inwpcbg_list.append(inwpCBG) continue # check relative position towards the current firstInwpCBG # position is measured in actual nt distance and tcode 'distance': # the lowest scoring TCODE window in between these inwpCBGs tcodedistdict = inwpCBG.tcode_spacing_between_codingblocks( [firstInwpCBG]) if len(tcodedistdict) >= 3: _tmp = [(v, k) for k, v in tcodedistdict.iteritems()] _tmp.sort() del (tcodedistdict[_tmp[0][1]]) del (tcodedistdict[_tmp[-1][1]]) ######################################################################## if verbose and len(tcodedistdict) >= 1: print pos, sum(tcodedistdict.values()) / len(tcodedistdict), print TCODE_MAX_NONCODING, firstInwpCBG ######################################################################## # continue when coverage is to high if cntAnnot / max_cntAnnot >= 0.40: continue if len(tcodedistdict)>0 and\ sum(tcodedistdict.values())/len(tcodedistdict) <= TCODE_MAX_NONCODING: noncoding_inwpcbg_list.append(inwpCBG) continue if this_upstrTSScnt == 0: # no upstream TSS sites at all -> remove! noncoding_inwpcbg_list.append(inwpCBG) continue # do a furter check for unlikely first inwpCBG blocks if inwpCBG.get_identityscore() / first_identityscore <=\ NONCODINGNONGENE_5p_INWPCBG_MAX_IDENTITYRATIO and\ inwpCBG.organism_set_size() < first_cnt_is_first and\ first_upstrTSSratio==0.0: noncoding_inwpcbg_list.append(inwpCBG) continue # do a furter check for unlikely first inwpCBG blocks if inwpCBG.get_identityscore() / first_identityscore <=\ NONCODINGNONGENE_5p_INWPCBG_MAX_IDENTITYRATIO and\ inwpCBG.organism_set_size() < first_cnt_is_first and\ (first_upstrTSSratio!=0.0 and (this_upstrTSSratio / first_upstrTSSratio) < 0.6): noncoding_inwpcbg_list.append(inwpCBG) continue # do a final check for unlikely first inwpCBG blocks # all parameters must be (slightly) poorer if inwpCBG.get_identityscore() < first_identityscore and\ inwpCBG.organism_set_size() < first_cnt_is_first and\ inwpCBG.get_average_upstream_methionine_pssm_score() <\ firstInwpCBG.get_average_upstream_methionine_pssm_score(): noncoding_inwpcbg_list.append(inwpCBG) continue # return the noncoding_inwpcbg_list return noncoding_inwpcbg_list
def assign_gtgdiscrepancy_inwpcbgs(inwpcbgs, GTG, exclude_annotated=True, verbose=True): """ """ # return empty list when no inwpcbgs applied if not inwpcbgs: return [] # get target organism identifier target = inwpcbgs[0]._get_target_organism() # return list with inwpcbgs gtgdiscrepancy_inwpcbg_list = [] if exclude_annotated: # get most likely first & final inwpCBG pointer in inwpcbgs list posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs) range_5p_test = range(0, posFirst) range_3p_test = range(posFinal + 1, len(inwpcbgs)) protected_target_orfid_list = [] for inwpCBG in inwpcbgs[posFirst:posFinal + 1]: if inwpCBG.count_orfs_labeled_as_annotated_exon() > 0: protected_target_orfid_list.append( inwpCBG.get_orfs_of_graph(organism=target)[0].id) else: range_5p_test = [] range_3p_test = [] protected_target_orfid_list = [] ############################################################################ if verbose and exclude_annotated: print "NOT-excluded:", range_5p_test, range_3p_test ############################################################################ # detect UTR or nongene / noncoding inwpCBGS for pos in range(0, len(inwpcbgs)): if exclude_annotated and pos in range_5p_test: pass elif exclude_annotated and pos in range_3p_test: pass elif exclude_annotated and inwpcbgs[ pos].count_orfs_labeled_as_annotated_exon() == 0: # in the middle of the annotated geen structure, but not a single # Orf annotated as an exon. Asses for gtg difference too! pass elif exclude_annotated: continue else: pass # get this inwpCBG and thisInwpCBG = inwpcbgs[pos] # ignore if the target's Orf is belonging to a `protected` Orf if protected_target_orfid_list and\ thisInwpCBG.get_orfs_of_graph(organism=target)[0].id in\ protected_target_orfid_list: continue # ignore inwpCBGs which are very likely (poor quality) SignalP alignments cntSP = float(thisInwpCBG.count_orfs_with_signalpeptides()) if cntSP/(thisInwpCBG.count_genomic_informants()+1) > 0.66 and\ thisInwpCBG.get_signalp_score() > 0.75: continue # create its GeneTreeGraph gtg = pcg2gtg_by_identity(thisInwpCBG, target) # step 1. Do the gtg/GTG difference check difference = _relative_gtg_difference(gtg, GTG, target) if difference < NONGENE_GTG_MAX_DIFFERENCE: # step 2. Do the CEXPANDER check if thisInwpCBG.node_count() <= 2: gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG) ################################################################ if verbose: print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % ( difference, NONGENE_GTG_MAX_DIFFERENCE), print thisInwpCBG.get_organism_nodes(target)[0] ################################################################ elif thisInwpCBG.get_cexpander_uniformly_aligned_count() == 0: gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG) ################################################################ if verbose: print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % ( difference, NONGENE_GTG_MAX_DIFFERENCE), print thisInwpCBG.get_organism_nodes(target)[0] ################################################################ else: # cexpander check is succesfull, GTGdifference claims # the aligment is bogus. Do a more elaborate check on # some other variables of thisInwpCBG # calculate the difference between minsr & maxsr lengths node = thisInwpCBG.get_organism_nodes(target)[0] minsr = thisInwpCBG.minimal_spanning_range_sizes()[node] maxsr = thisInwpCBG.maximal_spanning_range_sizes()[node] msr_ratio = float(minsr) / float(maxsr) # calculate the ratio between average weights of gtg and GTG average_wt_gtg = _pairwise_gtg_average_weight(gtg, target) average_wt_GTG = _pairwise_gtg_average_weight(GTG, target) gtg_ratio = average_wt_gtg / average_wt_GTG if msr_ratio < NONGENE_GTG_MAX_MSR_RATIO and\ gtg_ratio < NONGENE_GTG_MAX_GTG_RATIO: gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG) ################################################################ if verbose: print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % ( difference, NONGENE_GTG_MAX_DIFFERENCE), print thisInwpCBG.get_organism_nodes(target)[0] ################################################################ else: pass else: pass # return the gtgdiscrepancy_inwpcbg_list return gtgdiscrepancy_inwpcbg_list
def assign_internal_nongene_alignments(inwpcbgs, GTG, exclude_annotated=False, verbose=True): """ TODO TODO: this function must be moved to another location. TODO TODO: better place in inwpCBGs/blocks filtering """ # return empty list when no inwpcbgs applied if not inwpcbgs: return [] # get target organism identifier target = inwpcbgs[0]._get_target_organism() # return list with inwpcbgs gtgdiscrepancy_inwpcbg_list = [] # get most likely first & final inwpCBG pointer in inwpcbgs list posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs) # check if posFirst,posFinal+1 isa non-empty range if not range(posFirst, posFinal + 1): return [] # get info on the *best* covered inwpCBG best_nt_identity = max([ inwpcbgs[pos].get_nt_identity() for pos in range(posFirst, posFinal + 1) ]) best_bitscore = max([ inwpcbgs[pos].get_bitscore() for pos in range(posFirst, posFinal + 1) ]) best_annot_cnt = max([ inwpcbgs[pos].count_orfs_labeled_as_annotated_exon() for pos in range(posFirst, posFinal + 1) ]) best_bits_per_aa = max([ float(inwpcbgs[pos].get_bitscore()) / float( sum([ pf.get_unextended_length() for pf in inwpcbgs[pos].pacbps.values() ])) for pos in range(posFirst, posFinal + 1) ]) for pos in range(posFirst, posFinal + 1): # get this inwpCBG and thisInwpCBG = inwpcbgs[pos] if pos > 0: prevInwpCBG = inwpcbgs[pos - 1] else: prevInwpCBG = None if pos < len(inwpcbgs) - 1: nextInwpCBG = inwpcbgs[pos + 1] else: nextInwpCBG = None if prevInwpCBG and Set(prevInwpCBG.get_nodes()).intersection( thisInwpCBG.get_nodes()): continue if nextInwpCBG and Set(nextInwpCBG.get_nodes()).intersection( thisInwpCBG.get_nodes()): continue tot_length = sum( [pf.get_unextended_length() for pf in thisInwpCBG.pacbps.values()]) bits = thisInwpCBG.get_bitscore() bits_per_aa = float(bits) / float(tot_length) if bits_per_aa/best_bits_per_aa < 0.55 and\ float(thisInwpCBG.count_orfs_labeled_as_annotated_exon()) /\ float(best_annot_cnt) <= 0.50: minsr = thisInwpCBG.minimal_spanning_range(organism=target) print " __XX__", pos, thisInwpCBG, thisInwpCBG.get_nt_identity( ), bits, float(bits) / float( tot_length), thisInwpCBG.count_orfs_labeled_as_annotated_exon( ) if prevInwpCBG and len( prevInwpCBG.maximal_spanning_range( organism=target).intersection(minsr)) == len(minsr): ################################################################ if verbose: print "PREV::", len( prevInwpCBG.maximal_spanning_range( organism=target).intersection(minsr)), len(minsr) ################################################################ gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG) continue if nextInwpCBG and len( nextInwpCBG.maximal_spanning_range( organism=target).intersection(minsr)) == len(minsr): ################################################################ if verbose: print "NEXT::", len( nextInwpCBG.maximal_spanning_range( organism=target).intersection(minsr)), len(minsr) ################################################################ gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG) continue # return list with conflicts return gtgdiscrepancy_inwpcbg_list