def predict_longest_ORFs(seq, min_aa_length):
    """
    Collect candidate ORFs in each of the three forward reading frames.

    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF span, counted in codons, for an ORF to be
                      reported (for complete ORFs the count includes the
                      stop codon)

    Start/stop positions come from ORFscores.find_start_stop_codons(seq) and
    are used here as per-frame codon indices, i.e.
    nucleotide position = index * 3 + frame.

    Returns: dict of <frame> --> list of (flag, <0-based start>, <1-based end>)
    where flag is one of
        'dumb-complete' --- a start codon through the next stop codon
        'dumb-5partial' --- frame has stops but no start; spans from the
                            beginning of the frame through the first stop
        'dumb-3partial' --- a start with no stop after it; spans to the end
                            of the last complete codon in the frame

    NOTE that if the seq is reverse complemented, the handler function needs
    to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}
    # len(seq) == 3 * n + m
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())

        # 1-based end of the last complete codon in this frame.  Frames that
        # start beyond the m leftover bases have one codon fewer, hence the
        # "frame - 3" (using 0 here would yield an out-of-frame end).
        frame_end = n * 3 + (frame if frame <= m else frame - 3)

        if not stops:
            # no stop, so just output first (start, last)
            # NOTE(review): "n - start + 1" looks like it over-counts the
            # remaining codons by at least one -- confirm this is intended.
            if starts and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, frame_end))
        elif not starts:
            # 5' partial: everything up to and including the first stop
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
        else:
            # has at least one start and one stop
            i, j = 0, 0
            while j < len(stops) and i < len(starts):
                # a stop upstream of starts[i] gives a non-positive span and
                # is skipped for any positive min_aa_length
                if stops[j] - starts[i] + 1 >= min_aa_length:
                    result[frame].append(
                        ('dumb-complete',
                         starts[i] * 3 + frame,
                         stops[j] * 3 + 3 + frame))
                j += 1  # move stop one step down
                # advance to the first start downstream of the stop just used
                while i < len(starts) and starts[i] < stops[j - 1]:
                    i += 1
            # check the very last possible ORF: a start left over here lies
            # past every stop (the loop can only have ended with j at the end)
            if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[i] * 3 + frame, frame_end))
    return result
def predict_longest_ORFs(seq, min_aa_length):
    """
    Collect candidate ORFs in each of the three forward reading frames.

    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF span, counted in codons, for an ORF to be
                      reported (for complete ORFs the count includes the
                      stop codon)

    Start/stop positions come from ORFscores.find_start_stop_codons(seq) and
    are used here as per-frame codon indices, i.e.
    nucleotide position = index * 3 + frame.

    Returns: dict of <frame> --> list of (flag, <0-based start>, <1-based end>)
    where flag is one of
        'dumb-complete' --- a start codon through the next stop codon
        'dumb-5partial' --- frame has stops but no start; spans from the
                            beginning of the frame through the first stop
        'dumb-3partial' --- a start with no stop after it; spans to the end
                            of the last complete codon in the frame

    NOTE that if the seq is reverse complemented, the handler function needs
    to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}
    # len(seq) == 3 * n + m
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())

        # 1-based end of the last complete codon in this frame.  Frames that
        # start beyond the m leftover bases have one codon fewer, hence the
        # "frame - 3" (using 0 here would yield an out-of-frame end).
        frame_end = n * 3 + (frame if frame <= m else frame - 3)

        if not stops:
            # no stop, so just output first (start, last)
            # NOTE(review): "n - start + 1" looks like it over-counts the
            # remaining codons by at least one -- confirm this is intended.
            if starts and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, frame_end))
        elif not starts:
            # 5' partial: everything up to and including the first stop
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
        else:
            # has at least one start and one stop
            i, j = 0, 0
            while j < len(stops) and i < len(starts):
                # a stop upstream of starts[i] gives a non-positive span and
                # is skipped for any positive min_aa_length
                if stops[j] - starts[i] + 1 >= min_aa_length:
                    result[frame].append(
                        ('dumb-complete',
                         starts[i] * 3 + frame,
                         stops[j] * 3 + 3 + frame))
                j += 1  # move stop one step down
                # advance to the first start downstream of the stop just used
                while i < len(starts) and starts[i] < stops[j - 1]:
                    i += 1
            # check the very last possible ORF: a start left over here lies
            # past every stop (the loop can only have ended with j at the end)
            if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[i] * 3 + frame, frame_end))
    return result
def predict_longest_ORFs(seq, min_aa_length, use_firstORF=False):
    """
    Pick a single best ORF from the three forward reading frames of seq.

    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF span, counted in codons, for an ORF to be
                      considered (for complete ORFs the count includes the
                      stop codon)
    use_firstORF --- if True, return the ORF with the smallest start
                     coordinate (ties broken by greater length) instead of
                     the longest ORF

    Start/stop positions come from ORFscores.find_start_stop_codons(seq) and
    are used here as per-frame codon indices, i.e.
    nucleotide position = index * 3 + frame.

    Returns: None if no frame yields a qualifying ORF; otherwise a dict with
    one entry, <frame> --> [(flag, <0-based start>, <1-based end>)]
    where flag is one of
        'dumb-complete' --- a start codon through the next stop codon
        'dumb-5partial' --- frame has stops but no start; spans from the
                            beginning of the frame through the first stop
        'dumb-3partial' --- a start with no stop after it; spans to the end
                            of the last complete codon in the frame

    NOTE that if the seq is reverse complemented, the handler function needs
    to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}
    # len(seq) == 3 * n + m
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())

        # 1-based end of the last complete codon in this frame; frames that
        # start beyond the m leftover bases have one codon fewer.
        frame_end = n * 3 + (frame if frame <= m else frame - 3)

        if not stops:
            # no stop, so just output first (start, last)
            # NOTE(review): "n - start + 1" looks like it over-counts the
            # remaining codons by at least one -- confirm this is intended.
            if starts and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, frame_end))
        elif not starts:
            # 5' partial: everything up to and including the first stop
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
        else:
            # has at least one start and one stop
            i, j = 0, 0
            # skip every stop upstream of the first start, so that
            # afterwards stops[j-1] < starts[0] < stops[j] (when j is valid).
            # The previous test compared against stops[j-1] with j == 0,
            # i.e. the *last* stop, and never established this.
            while j < len(stops) and stops[j] < starts[0]:
                j += 1
            while j < len(stops) and i < len(starts):
                if stops[j] - starts[i] + 1 >= min_aa_length:
                    result[frame].append(
                        ('dumb-complete',
                         starts[i] * 3 + frame,
                         stops[j] * 3 + 3 + frame))
                j += 1  # move stop one step down
                # advance to the first start downstream of the stop just used
                while i < len(starts) and starts[i] < stops[j - 1]:
                    i += 1
            # check the very last possible ORF: a start left over here lies
            # past every stop (the loop can only have ended with j at the end)
            if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[i] * 3 + frame, frame_end))

    # now pick the single best ORF across all frames
    if not any(result.values()):  # no ORF found
        return None

    best_frame = best_flag = best_s = best_e = None
    best_len = 0
    # visit frames in ascending order so ties resolve deterministically
    for _frame in sorted(result):
        for flag, s, e in result[_frame]:
            _len = e - s
            if use_firstORF:
                # earliest start wins; equal starts fall back to length
                better = (best_s is None or s < best_s or
                          (s == best_s and _len > best_len))
            else:
                # strictly longer wins, so the first of equal-length ORFs stays
                better = _len > best_len
            if better:
                best_frame, best_flag, best_s, best_e, best_len = \
                    _frame, flag, s, e, _len
    return {best_frame: [(best_flag, best_s, best_e)]}
def predict_longest_ORFs(seq, min_aa_length, use_firstORF=False):
    """
    Pick a single best ORF from the three forward reading frames of seq.

    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF span, counted in codons, for an ORF to be
                      considered (for complete ORFs the count includes the
                      stop codon)
    use_firstORF --- if True, return the ORF with the smallest start
                     coordinate (ties broken by greater length) instead of
                     the longest ORF

    Start/stop positions come from ORFscores.find_start_stop_codons(seq) and
    are used here as per-frame codon indices, i.e.
    nucleotide position = index * 3 + frame.

    Returns: None if no frame yields a qualifying ORF; otherwise a dict with
    one entry, <frame> --> [(flag, <0-based start>, <1-based end>)]
    where flag is one of
        'dumb-complete' --- a start codon through the next stop codon
        'dumb-5partial' --- frame has stops but no start; spans from the
                            beginning of the frame through the first stop
        'dumb-3partial' --- a start with no stop after it; spans to the end
                            of the last complete codon in the frame

    NOTE that if the seq is reverse complemented, the handler function needs
    to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}
    # len(seq) == 3 * n + m
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())

        # 1-based end of the last complete codon in this frame; frames that
        # start beyond the m leftover bases have one codon fewer.
        frame_end = n * 3 + (frame if frame <= m else frame - 3)

        if not stops:
            # no stop, so just output first (start, last)
            # NOTE(review): "n - start + 1" looks like it over-counts the
            # remaining codons by at least one -- confirm this is intended.
            if starts and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, frame_end))
        elif not starts:
            # 5' partial: everything up to and including the first stop
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
        else:
            # has at least one start and one stop
            i, j = 0, 0
            # skip every stop upstream of the first start, so that
            # afterwards stops[j-1] < starts[0] < stops[j] (when j is valid).
            # The previous test compared against stops[j-1] with j == 0,
            # i.e. the *last* stop, and never established this.
            while j < len(stops) and stops[j] < starts[0]:
                j += 1
            while j < len(stops) and i < len(starts):
                if stops[j] - starts[i] + 1 >= min_aa_length:
                    result[frame].append(
                        ('dumb-complete',
                         starts[i] * 3 + frame,
                         stops[j] * 3 + 3 + frame))
                j += 1  # move stop one step down
                # advance to the first start downstream of the stop just used
                while i < len(starts) and starts[i] < stops[j - 1]:
                    i += 1
            # check the very last possible ORF: a start left over here lies
            # past every stop (the loop can only have ended with j at the end)
            if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[i] * 3 + frame, frame_end))

    # now pick the single best ORF across all frames
    if not any(result.values()):  # no ORF found
        return None

    best_frame = best_flag = best_s = best_e = None
    best_len = 0
    # visit frames in ascending order so ties resolve deterministically
    for _frame in sorted(result):
        for flag, s, e in result[_frame]:
            _len = e - s
            if use_firstORF:
                # earliest start wins; equal starts fall back to length
                better = (best_s is None or s < best_s or
                          (s == best_s and _len > best_len))
            else:
                # strictly longer wins, so the first of equal-length ORFs stays
                better = _len > best_len
            if better:
                best_frame, best_flag, best_s, best_e, best_len = \
                    _frame, flag, s, e, _len
    return {best_frame: [(best_flag, best_s, best_e)]}