Exemplo n.º 1
0
def predict_longest_ORFs(seq, min_aa_length):
    """
    Predict, for each of the three forward frames, the ORFs of <seq> that
    pass the <min_aa_length> threshold.

    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF length, compared against codon counts

    Start/stop positions come from ORFscores.find_start_stop_codons(seq) and
    are used here as per-frame codon indices (nucleotide position is
    index * 3 + frame).

    Returns: dict of <frame> --> list of (flag, <0-based start>, <1-based end>)
    where flag is one of
      'dumb-complete' --- from a start codon through the next in-frame stop
      'dumb-3partial' --- start codon with no stop after it; runs to the end
                          of the last complete codon of that frame
      'dumb-5partial' --- frame has no start codon; runs from the frame
                          offset through the first stop codon
    NOTE that if the seq is reverse complemented, the handler function needs
    to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}

    # n: whole codons in frame 0, m: leftover bases after them.
    # Floor division keeps n an int on both Python 2 and Python 3.
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())

        # 1-based end of the last complete codon readable in this frame.
        # If the frame offset exceeds the leftover bases, the trailing codon
        # is truncated, so step back one codon to stay in frame.
        frame_end = n * 3 + (frame if frame <= m else frame - 3)

        if not stops:  # no stop, so just output first (start, last)
            if starts and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, frame_end))
            continue

        if not starts:  # 5' partial
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
            continue

        # has at least one start and one stop
        i, j = 0, 0
        while j < len(stops) and i < len(starts):
            # A stop upstream of the current start cannot close it; skip it
            # explicitly rather than relying on the length being negative.
            if (stops[j] > starts[i]
                    and stops[j] - starts[i] + 1 >= min_aa_length):
                result[frame].append(
                    ('dumb-complete', starts[i] * 3 + frame,
                     stops[j] * 3 + 3 + frame))
            j += 1  # move stop one step down
            # jump to the first start located after the stop just consumed
            while i < len(starts) and starts[i] < stops[j - 1]:
                i += 1

        # A start is left over only when every stop has been consumed, i.e.
        # it lies after the last stop: report it as a 3' partial ORF.
        if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
            result[frame].append(
                ('dumb-3partial', starts[i] * 3 + frame, frame_end))
    return result
Exemplo n.º 2
0
def predict_longest_ORFs(seq, min_aa_length):
    """
    Predict, for each of the three forward frames, the ORFs of <seq> that
    pass the <min_aa_length> threshold.

    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF length, compared against codon counts

    Start/stop positions come from ORFscores.find_start_stop_codons(seq) and
    are used here as per-frame codon indices (nucleotide position is
    index * 3 + frame).

    Returns: dict of <frame> --> list of (flag, <0-based start>, <1-based end>)
    where flag is one of
      'dumb-complete' --- from a start codon through the next in-frame stop
      'dumb-3partial' --- start codon with no stop after it; runs to the end
                          of the last complete codon of that frame
      'dumb-5partial' --- frame has no start codon; runs from the frame
                          offset through the first stop codon
    NOTE that if the seq is reverse complemented, the handler function needs
    to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}

    # n: whole codons in frame 0, m: leftover bases after them.
    # Floor division keeps n an int on both Python 2 and Python 3.
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())

        # 1-based end of the last complete codon readable in this frame.
        # If the frame offset exceeds the leftover bases, the trailing codon
        # is truncated, so step back one codon to stay in frame.
        frame_end = n * 3 + (frame if frame <= m else frame - 3)

        if not stops:  # no stop, so just output first (start, last)
            if starts and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, frame_end))
            continue

        if not starts:  # 5' partial
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
            continue

        # has at least one start and one stop
        i, j = 0, 0
        while j < len(stops) and i < len(starts):
            # A stop upstream of the current start cannot close it; skip it
            # explicitly rather than relying on the length being negative.
            if (stops[j] > starts[i]
                    and stops[j] - starts[i] + 1 >= min_aa_length):
                result[frame].append(
                    ('dumb-complete', starts[i] * 3 + frame,
                     stops[j] * 3 + 3 + frame))
            j += 1  # move stop one step down
            # jump to the first start located after the stop just consumed
            while i < len(starts) and starts[i] < stops[j - 1]:
                i += 1

        # A start is left over only when every stop has been consumed, i.e.
        # it lies after the last stop: report it as a 3' partial ORF.
        if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
            result[frame].append(
                ('dumb-3partial', starts[i] * 3 + frame, frame_end))
    return result
Exemplo n.º 3
0
def predict_longest_ORFs(seq, min_aa_length, use_firstORF=False):
    """
    Pick a single ORF of <seq> among all ORFs, over the three forward frames,
    that pass the <min_aa_length> threshold.

    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF length, compared against codon counts
    use_firstORF --- if False (default) return the longest ORF; if True
                     return the ORF with the most upstream start (the longer
                     one on equal starts)

    Start/stop positions come from ORFscores.find_start_stop_codons(seq) and
    are used here as per-frame codon indices (nucleotide position is
    index * 3 + frame).

    Returns: None if no ORF passes the threshold, otherwise a dict
    <frame> --> [(flag, <0-based start>, <1-based end>)] holding only the
    chosen ORF, where flag is one of
      'dumb-complete' --- from a start codon through the next in-frame stop
      'dumb-3partial' --- start codon with no stop after it; runs to the end
                          of the last complete codon of that frame
      'dumb-5partial' --- frame has no start codon; runs from the frame
                          offset through the first stop codon
    NOTE that if the seq is reverse complemented, the handler function needs
    to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}

    # n: whole codons in frame 0, m: leftover bases after them.
    # Floor division keeps n an int on both Python 2 and Python 3.
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())

        # 1-based end of the last complete codon readable in this frame.
        # If the frame offset exceeds the leftover bases, the trailing codon
        # is truncated, so step back one codon to stay in frame.
        frame_end = n * 3 + (frame if frame <= m else frame - 3)

        if not stops:  # no stop, so just output first (start, last)
            if starts and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, frame_end))
            continue

        if not starts:  # 5' partial
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
            continue

        # has at least one start and one stop
        i, j = 0, 0
        while j < len(stops) and i < len(starts):
            # A stop upstream of the current start cannot close it. Checking
            # this here covers both the stops that precede the first start
            # and runs of consecutive stops with no start in between.
            if (stops[j] > starts[i]
                    and stops[j] - starts[i] + 1 >= min_aa_length):
                result[frame].append(
                    ('dumb-complete', starts[i] * 3 + frame,
                     stops[j] * 3 + 3 + frame))
            j += 1  # move stop one step down
            # jump to the first start located after the stop just consumed
            while i < len(starts) and starts[i] < stops[j - 1]:
                i += 1

        # A start is left over only when every stop has been consumed, i.e.
        # it lies after the last stop: report it as a 3' partial ORF.
        if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
            result[frame].append(
                ('dumb-3partial', starts[i] * 3 + frame, frame_end))

    # Flatten in frame order (0, 1, 2) so that ties are always resolved in
    # favour of the first candidate seen.
    candidates = [(frame, flag, s, e)
                  for frame in range(3)
                  for (flag, s, e) in result[frame]]
    if not candidates:  # no ORF found
        return None

    if use_firstORF:
        # most upstream start wins; on equal starts prefer the longer ORF
        best = min(candidates, key=lambda c: (c[2], c[2] - c[3]))
    else:
        # longest ORF among all frames
        best = max(candidates, key=lambda c: c[3] - c[2])

    best_frame, best_flag, best_s, best_e = best
    return {best_frame: [(best_flag, best_s, best_e)]}
Exemplo n.º 4
0
def predict_longest_ORFs(seq, min_aa_length, use_firstORF=False):
    """
    Pick a single ORF of <seq> among all ORFs, over the three forward frames,
    that pass the <min_aa_length> threshold.

    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF length, compared against codon counts
    use_firstORF --- if False (default) return the longest ORF; if True
                     return the ORF with the most upstream start (the longer
                     one on equal starts)

    Start/stop positions come from ORFscores.find_start_stop_codons(seq) and
    are used here as per-frame codon indices (nucleotide position is
    index * 3 + frame).

    Returns: None if no ORF passes the threshold, otherwise a dict
    <frame> --> [(flag, <0-based start>, <1-based end>)] holding only the
    chosen ORF, where flag is one of
      'dumb-complete' --- from a start codon through the next in-frame stop
      'dumb-3partial' --- start codon with no stop after it; runs to the end
                          of the last complete codon of that frame
      'dumb-5partial' --- frame has no start codon; runs from the frame
                          offset through the first stop codon
    NOTE that if the seq is reverse complemented, the handler function needs
    to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}

    # n: whole codons in frame 0, m: leftover bases after them.
    # Floor division keeps n an int on both Python 2 and Python 3.
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())

        # 1-based end of the last complete codon readable in this frame.
        # If the frame offset exceeds the leftover bases, the trailing codon
        # is truncated, so step back one codon to stay in frame.
        frame_end = n * 3 + (frame if frame <= m else frame - 3)

        if not stops:  # no stop, so just output first (start, last)
            if starts and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, frame_end))
            continue

        if not starts:  # 5' partial
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
            continue

        # has at least one start and one stop
        i, j = 0, 0
        while j < len(stops) and i < len(starts):
            # A stop upstream of the current start cannot close it. Checking
            # this here covers both the stops that precede the first start
            # and runs of consecutive stops with no start in between.
            if (stops[j] > starts[i]
                    and stops[j] - starts[i] + 1 >= min_aa_length):
                result[frame].append(
                    ('dumb-complete', starts[i] * 3 + frame,
                     stops[j] * 3 + 3 + frame))
            j += 1  # move stop one step down
            # jump to the first start located after the stop just consumed
            while i < len(starts) and starts[i] < stops[j - 1]:
                i += 1

        # A start is left over only when every stop has been consumed, i.e.
        # it lies after the last stop: report it as a 3' partial ORF.
        if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
            result[frame].append(
                ('dumb-3partial', starts[i] * 3 + frame, frame_end))

    # Flatten in frame order (0, 1, 2) so that ties are always resolved in
    # favour of the first candidate seen.
    candidates = [(frame, flag, s, e)
                  for frame in range(3)
                  for (flag, s, e) in result[frame]]
    if not candidates:  # no ORF found
        return None

    if use_firstORF:
        # most upstream start wins; on equal starts prefer the longer ORF
        best = min(candidates, key=lambda c: (c[2], c[2] - c[3]))
    else:
        # longest ORF among all frames
        best = max(candidates, key=lambda c: c[3] - c[2])

    best_frame, best_flag, best_s, best_e = best
    return {best_frame: [(best_flag, best_s, best_e)]}