示例#1
0
def orf_finder(sequence,startphase,stopphase,strand,expected_aa_len,length_variance,
               search_coords = None, is_start = False, is_stop = False, hmm_profile = None,evalue='0.05'):
    """Find putative exons matching an expected length and intron phase profile that contain an ORF.

    The search region is scanned for a left boundary ('AG', or 'ATG' when
    is_start) followed by a fixed number of bases and a right boundary
    ('G[TC]', or a stop codon when is_stop). Every candidate whose in-frame
    translation contains no '*' is kept. Candidates may overlap each other.

    Parameters:
        sequence: nucleotide sequence to search (sliced and passed to genome.Sequence).
        startphase: phase at the exon start; (3 - startphase) % 3 bases are
            skipped before translating.
        stopphase: number of extra bases expected after the last full codon.
        strand: '+' or '-'. For '-', the reverse complement of the search
            region is scanned and results are mapped back to forward coords.
        expected_aa_len: expected exon length in amino acids.
        length_variance: exon lengths of expected_aa_len +/- 0..length_variance
            codons are tried (0, +1, -1, +2, -2, ...).
        search_coords: optional (begin, end) 0-based slice bounds restricting
            the search to sequence[begin:end].
        is_start: the exon begins with the start codon ATG (included in the
            returned coords) instead of following an AG splice site.
        is_stop: the exon ends with a stop codon (included in the returned
            coords) instead of preceding a GT/GC splice site.
        hmm_profile: if given and any candidate was found, the candidates are
            handed to hmmsearch(...) and its return value is returned instead.
        evalue: forwarded to hmmsearch(...).

    Returns:
        A list of [start, stop] pairs in 1-based inclusive coordinates
        relative to `sequence` (consistent with gff and blast coords), or
        whatever hmmsearch(...) returns when hmm_profile is set.
    """
    search_seq = sequence
    if search_coords:
        search_seq = sequence[search_coords[0]:search_coords[1]]
    if strand == '+':
        search_seq = genome.Sequence(search_seq)
    elif strand == '-':
        search_seq = genome.Sequence(search_seq).reverse_compliment()

    phase_offset = (3 - startphase) % 3
    match_len = expected_aa_len * 3 + phase_offset + stopphase
    left_site, right_site = 'AG', 'G[TC]'
    if is_start:
        # the ATG is part of the exon, so it is not counted in the interior
        left_site = 'ATG'
        match_len -= 3
    if is_stop:
        # likewise the stop codon is matched by the right boundary pattern
        right_site = "T(AG|GA|AA)"
        match_len -= 3
    # both possible left boundaries are literals, so the pattern length is
    # also the matched length
    left_len = len(left_site)

    exon_coords = []
    for variance in range(length_variance + 1):
        for direction in (1, -1):
            if variance == 0 and direction == -1: # prevents double returns of 0 variance
                continue
            inner_len = match_len + variance * 3 * direction
            # need at least one full codon after the phase bases; this also
            # rejects negative lengths, which would not form a valid '{n}'
            if inner_len - phase_offset <= 2:
                continue
            # The pattern sits inside a zero-width lookahead so that
            # overlapping candidates are all reported; a consuming pattern
            # would make finditer skip every candidate that starts inside a
            # previous match.
            pattern = re.compile('(?=' + left_site + '.{' + str(inner_len) + '}' + right_site + ')')
            for match in pattern.finditer(search_seq):
                start = match.start() + left_len
                stop = start + inner_len
                if "*" in genome.Sequence(search_seq[start + phase_offset:stop]).translate():
                    continue
                #NB: returns 1-based coords consistent with gff and blast coords
                exon_start, exon_stop = start + 1, stop
                if is_start:
                    exon_start -= 3
                if is_stop:
                    exon_stop += 3
                if strand == '-':
                    # map from reverse-complement coords back to the forward strand
                    exon_start, exon_stop = (len(search_seq) - exon_stop + 1,
                                             len(search_seq) - exon_start + 1)
                if search_coords:
                    exon_start += search_coords[0]
                    exon_stop += search_coords[0]
                exon_coords.append([exon_start, exon_stop])
    if hmm_profile and exon_coords:
        exon_coords = hmmsearch(hmm_profile,exon_coords,sequence,strand,startphase,evalue=evalue)
    return exon_coords
示例#2
0
def hmmsearch(hmm_profile,exon_coords,sequence,strand,startphase, evalue= "0.05"):
    """Return the first candidate exon that hits an HMM profile, or [] if none does.

    Each candidate is cut out of `sequence`, reverse-complemented for the
    minus strand, translated after skipping the phase bases, and written to a
    temporary FASTA file as ">coords<index>". The external `hmmsearch`
    program is then run on that file.

    Parameters:
        hmm_profile: path to the HMM profile file passed to hmmsearch.
        exon_coords: list of [start, stop] pairs, 1-based inclusive, on `sequence`.
        sequence: nucleotide sequence the coords refer to.
        strand: '+' or '-'.
        startphase: phase at the exon start; (3 - startphase) % 3 bases are
            skipped before translating.
        evalue: E-value threshold handed to `hmmsearch -E`.

    Returns:
        The [start, stop] pair of the first ">>" hit block in the hmmsearch
        output, or an empty list when there is no hit.

    Raises:
        subprocess.CalledProcessError: if hmmsearch exits with a non-zero status.
    """
    phase_offset = (3 - startphase) % 3
    # the context manager guarantees the temporary file is closed (and hence
    # removed) even if hmmsearch fails
    with tempfile.NamedTemporaryFile('w') as orf_file:
        for coords_index, coords in enumerate(exon_coords):
            nuc_seq = genome.Sequence(sequence[coords[0] - 1:coords[1]])
            if strand == '-':
                nuc_seq = nuc_seq.reverse_compliment()
            pep_seq = genome.Sequence(nuc_seq[phase_offset:]).translate()
            orf_file.write(">coords" + str(coords_index) + '\n' +
                           pep_seq + '\n' )
        # hmmsearch reads the file by name, so the data must be on disk first
        orf_file.flush()
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact; universal_newlines gives text output on both
        # Python 2 and 3.
        hmmout = subprocess.check_output(
            ['hmmsearch', '--max', '-E', str(evalue), hmm_profile, orf_file.name],
            universal_newlines = True)
    for line in hmmout.splitlines():
        if line[:2] == ">>":
            # hit blocks look like ">> coords<index>  ..."; recover the index
            hit_name = line[2:].split()[0]
            return exon_coords[int(hit_name[len("coords"):])]
    return []
示例#3
0
#sets up default program paths, overwritten by any program paths passed with -f or --program_filepaths
hmmsearch = args.hmmsearch_filepath
target_nucdb = genome.Genome(args.target_nucdb)

#Gets ORFs from the genome and hmmers them

if args.frames_in:
    frames_file = args.frames_in
else:
    if args.frames_out:
        frame_fasta = open(args.frames_out, 'w')
    else:
        frame_fasta = tempfile.NamedTemporaryFile('w')
    for seq_id in target_nucdb.genome_sequence:
        frameonef = genome.Sequence(target_nucdb.genome_sequence[seq_id])
        frameoner = genome.Sequence(
            target_nucdb.genome_sequence[seq_id]).reverse_compliment()
        frames = [
            frameonef,
            genome.Sequence(frameonef[1:]),
            genome.Sequence(frameonef[2:]), frameoner,
            genome.Sequence(frameoner[1:]),
            genome.Sequence(frameoner[2:])
        ]
        fasta_list = []
        for frame_num in (0, 1, 2, 3, 4, 5):
            frame = frames[frame_num]
            if frame_num < 3:
                frame_offset = frame_num
            else:
示例#4
0
def exon_finder(tstart,tend,strand,qstart,qend,qlen,qstartphase,qendphase,seqdict,seqname,
                max_offset = 30, is_start = False, is_stop = False, nevermind_atg = False):
    """Find the boundaries of an exon with an ORF around a hit, based on the requested parameters.

    An "ideal" start and end are computed by extending the hit by the
    unaligned query residues (3 nt each) and the phase bases. Positions
    around each ideal boundary are then tried in 3-nt steps, looking for one
    that gives a suitable translation and is flanked by the expected splice
    site (or is a start/stop codon when is_start/is_stop applies).

    It is expected that tstart < tend, even for minus strand features, so
    these might need to be reversed if coming from say tblastn. Coords are
    expected as 1-based (as output from blast) and are converted internally
    to 0-based. It is also expected that the hit itself doesn't contain stop
    codons.

    Parameters:
        tstart, tend: hit coordinates on the target sequence (1-based).
        strand: "+" or "-".
        qstart, qend: hit coordinates on the query.
        qlen: query length.
        qstartphase, qendphase: phases at the query start and end.
        seqdict, seqname: the target sequence is read as seqdict[seqname].
        max_offset: how far (in nt) from the ideal boundary to search.
        is_start: the exon carries the start codon.
        is_stop: the exon carries the stop codon.
        nevermind_atg: with is_start, accept a start without requiring ATG.

    Returns:
        [start, end, pseudo] where pseudo is False, or "P" if either
        boundary fell back to the pseudo_start / pseudo_end value.
    """
    start = None
    end = None
    pseudo = False
    # Offsets are expressed on the target's forward strand, so on the minus
    # strand the query start/end roles are swapped.
    if strand == "+":
        phasestart_offset = (3 - qstartphase) % 3
        phasestop_offset = qendphase
        start_match_offset, stop_match_offset = 3 * (qstart - 1),3 * (qlen - qend)
    elif strand == '-':
        phasestart_offset = qendphase
        phasestop_offset = (3 - qstartphase) % 3
        # NOTE(review): the plus strand uses 3 * (qstart - 1) for the residues
        # before the hit; 3 * qstart here looks one codon too long - confirm.
        start_match_offset, stop_match_offset = 3 * (qlen - qend), 3 * qstart
    # NOTE(review): any other strand value leaves the offsets undefined and
    # the next line raises NameError.
    ideal_start = tstart - phasestart_offset - start_match_offset
    ideal_end = tend + stop_match_offset + phasestop_offset
    # fallback boundaries: the hit itself extended only by the phase bases
    pseudo_start = tstart - phasestart_offset
    pseudo_end = tend + phasestop_offset
    # first boundary seen next to a "GC" dinucleotide, used only if no
    # preferred boundary is found
    gc_start, gc_end = None, None
    # --- search for the start (left boundary on the forward strand) ---
    for offset in range(0,max_offset + 3,3):
        if start:
            break
        # direction 1 tries the lower coordinate first, then -1 the higher
        # one; at offset 0 both test the same position
        for direction in [1,-1]:
            test_start = ideal_start - offset * direction
            test_seq = genome.Sequence(seqdict[seqname][test_start-1 + phasestart_offset:tend])
            if strand == "-":
                test_seq = test_seq.reverse_compliment()
            # nothing to translate for this candidate
            if not test_seq.translate():
                continue
            # minus-strand stop exon: the stop codon sits at the left boundary
            elif is_stop and strand == "-":
                if ideal_start - 1 < test_start < pseudo_start:
                    pseudo_start = test_start
                lastcodon = seqdict[seqname][test_start - 1:test_start + 2]
                # TTA/TCA/CTA are the reverse complements of TAA/TGA/TAG
                if lastcodon.upper() in ['TTA','TCA','CTA'] and test_seq.translate().count('*') == 1:
                    start = test_start
                    break
            elif not "*" in test_seq.translate():
                # remember a stop-free candidate between ideal_start and the
                # current fallback as the new fallback start
                if ideal_start - 1 < test_start < pseudo_start:
                    pseudo_start = test_start
                if is_start and strand == '+':
                    if nevermind_atg:
                        start = test_start
                        break
                    else:
                        firstcodon = seqdict[seqname][test_start - 1:test_start + 2]
                        if firstcodon.upper() == "ATG":
                            start = test_start
                            break
                else:
                    # the two bases immediately before the candidate start
                    splicesite = seqdict[seqname][test_start-3:test_start-1]
                    # "AG" on the plus strand; "AC" (reverse complement of
                    # "GT") on the minus strand
                    if (strand == '+' and splicesite.upper() == "AG") or (strand == '-' and splicesite.upper() == "AC"):
                        start = test_start
                        break
                    elif strand == '-' and splicesite.upper() == "GC" and not gc_start:
                        gc_start = test_start
    if not start:
        if gc_start:
            start = gc_start
        else:
            # no acceptable boundary found: flag the result and fall back
            pseudo = "P"
            start = pseudo_start
    # --- search for the end (right boundary on the forward strand) ---
    for offset in range(0,max_offset + 3,3):
        if end:
            break
        # direction -1 tries the higher coordinate first, then 1 the lower one
        for direction in [-1,1]:
            test_end = ideal_end - offset * direction
            # candidate leaves fewer than 3 nt after start; stop trying this offset
            if test_end - start < 3:
                break
            test_seq = genome.Sequence(seqdict[seqname][start - 1 + phasestart_offset:test_end - phasestop_offset])
            if strand == "-":
                test_seq = test_seq.reverse_compliment()
            if not test_seq.translate():
                continue
            # plus-strand stop exon: the stop codon sits at the right boundary
            elif is_stop and strand == "+":
                if ideal_end + 1 > test_end > pseudo_end:
                    pseudo_end = test_end
                lastcodon = seqdict[seqname][test_end - 3:test_end]
                if lastcodon.upper() in ['TAA','TGA','TAG'] and test_seq.translate().count('*') == 1:
                    end = test_end
                    break
            # stop-free, or (for a stop exon) stop-free apart from the last residue
            elif not "*" in test_seq.translate() or (is_stop and not "*" in test_seq.translate()[:-1]):
                if ideal_end + 1 > test_end > pseudo_end:
                    pseudo_end = test_end
                if is_start and strand == '-':
                    if nevermind_atg:
                        end = test_end
                        break
                    else:
                        firstcodon = seqdict[seqname][test_end - 3:test_end]
                        # "CAT" is the reverse complement of "ATG"
                        if firstcodon.upper() == "CAT":
                            end = test_end
                            break
                else:
                    # the two bases immediately after the candidate end
                    splicesite = seqdict[seqname][test_end:test_end + 2]
                    # "GT" on the plus strand; "CT" (reverse complement of
                    # "AG") on the minus strand
                    if (strand == '+' and splicesite.upper() == "GT") or (strand == '-' and splicesite.upper() == "CT"):
                        end = test_end
                        break
                    elif strand == "+" and splicesite.upper() == "GC" and not gc_end:
                        gc_end = test_end
    if not end:
        if gc_end:
            end = gc_end
        else:
            pseudo = "P"
            end = pseudo_end
    return [start,end,pseudo]