def orf_finder(sequence,startphase,stopphase,strand,expected_aa_len,length_variance, search_coords = None, is_start = False, is_stop = False, hmm_profile = None,evalue='0.05'):
    """Find putative exons of roughly the expected length that hold an open reading frame.

    The (sub)sequence is scanned for a left boundary ('AG', or 'ATG' when
    ``is_start``) and a right boundary ('GT'/'GC', or a stop codon 'TAG'/'TGA'/
    'TAA' when ``is_stop``) separated by the expected number of bases, give or
    take up to ``length_variance`` codons.  Candidates whose in-frame
    translation contains '*' are discarded.

    Args:
        sequence: nucleotide sequence to search.
        startphase: phase at the exon start; ``(3 - startphase) % 3`` bases are
            skipped before translation begins.
        stopphase: number of extra bases expected after the last full codon.
        strand: '+' searches the sequence as given, '-' searches its reverse
            complement; reported coordinates always refer to ``sequence``.
        expected_aa_len: expected exon length in amino acids.
        length_variance: maximum deviation, in codons, tried in each direction.
        search_coords: optional ``(from, to)`` 0-based slice bounds restricting
            the search; results are shifted back by ``search_coords[0]``.
        is_start: the exon must begin with ATG (included in the coordinates).
        is_stop: the exon must end with a stop codon (included in the
            coordinates, excluded from the stop-free translation check).
        hmm_profile: optional profile path; when given and at least one
            candidate was found, the candidates are handed to ``hmmsearch`` and
            its return value is returned instead of the candidate list.
        evalue: e-value threshold forwarded to ``hmmsearch``.

    Returns:
        A list of ``[start, stop]`` pairs (1-based, inclusive), ordered by
        length deviation (0, +1, -1, +2, ...) and then by position; or the
        result of ``hmmsearch`` when ``hmm_profile`` is used.
    """
    search_seq = sequence
    if search_coords:
        search_seq = sequence[search_coords[0]:search_coords[1]]
    if strand == '+':
        search_seq = genome.Sequence(search_seq)
    elif strand == '-':
        search_seq = genome.Sequence(search_seq).reverse_compliment()

    phase_offset = (3 - startphase) % 3
    match_len = expected_aa_len * 3 + phase_offset + stopphase
    left_pattern, right_pattern = 'AG', 'G[TC]'
    if is_start:
        # The ATG is part of the exon, so it is taken out of the gap length.
        left_pattern = 'ATG'
        match_len = match_len - 3
    if is_stop:
        # Likewise the stop codon replaces the last three bases of the gap.
        right_pattern = "T(AG|GA|AA)"
        match_len = match_len - 3
    # Both left patterns are literal text, so their length equals the number
    # of bases they consume.
    left_len = len(left_pattern)

    # (start, stop) 0-based half-open bounds of the region between boundaries.
    candidates = []
    for variance in range(length_variance + 1):
        for direction in (1, -1):
            if variance == 0 and direction == -1:
                # prevents double returns of 0 variance
                continue
            inner_len = match_len + variance * 3 * direction
            if inner_len < 0:
                # A negative repeat count is not a valid quantifier and would
                # be interpreted as literal text by the regex engine.
                continue
            # Zero-width lookahead so that overlapping candidates (a second
            # boundary pair starting inside an earlier match) are all found;
            # a plain finditer only yields non-overlapping matches.
            pattern = re.compile(
                '(?=' + left_pattern + '.{' + str(inner_len) + '}' + right_pattern + ')'
            )
            for match in pattern.finditer(search_seq):
                inner_start = match.start() + left_len
                candidates.append((inner_start, inner_start + inner_len))

    exon_coords = []
    seq_len = len(search_seq)
    for inner_start, inner_stop in candidates:
        orf_start = inner_start + phase_offset
        if inner_stop - orf_start <= 2:
            # Less than one full codon to translate.
            continue
        if "*" in genome.Sequence(search_seq[orf_start:inner_stop]).translate():
            continue
        # NB: returns 1-based coords consistent with gff and blast coords
        first, last = inner_start + 1, inner_stop
        if is_start:
            first = first - 3
        if is_stop:
            last = last + 3
        if strand == '-':
            # Map reverse-complement coordinates back onto the forward strand.
            first, last = seq_len - last + 1, seq_len - first + 1
        if search_coords:
            first, last = first + search_coords[0], last + search_coords[0]
        exon_coords.append([first, last])

    if hmm_profile and len(exon_coords) > 0:
        exon_coords = hmmsearch(hmm_profile,exon_coords,sequence,strand,startphase,evalue=evalue)
    return exon_coords
def hmmsearch(hmm_profile,exon_coords,sequence,strand,startphase, evalue= "0.05"):
    """Return the first candidate exon that the external ``hmmsearch`` reports as a hit.

    Each candidate is cut out of ``sequence``, reverse-complemented for the
    minus strand, translated from the first in-phase base, and written to a
    temporary FASTA file under the name ``coords<index>``.  ``hmmsearch --max``
    is then run against ``hmm_profile`` and the first ``>>`` line of its output
    decides which candidate is returned.

    Args:
        hmm_profile: path to the HMM profile file.
        exon_coords: list of ``[start, stop]`` pairs (1-based, inclusive).
        sequence: nucleotide sequence the coordinates refer to.
        strand: '+' or '-'.
        startphase: phase at the exon start; ``(3 - startphase) % 3`` bases are
            skipped before translation.
        evalue: value passed to the ``-E`` option.

    Returns:
        The matching entry of ``exon_coords`` (a single ``[start, stop]``
        pair), or ``[]`` when the output contains no ``>>`` line.

    Raises:
        subprocess.CalledProcessError: if ``hmmsearch`` exits non-zero.
    """
    phase_offset = (3 - startphase) % 3
    # The context manager guarantees the temporary file is closed (and thereby
    # removed) even if writing or the subprocess call fails.
    with tempfile.NamedTemporaryFile('w') as orf_file:
        for coords_index, coords in enumerate(exon_coords):
            nuc_seq = genome.Sequence(sequence[coords[0] - 1:coords[1]])
            if strand == '-':
                nuc_seq = nuc_seq.reverse_compliment()
            pep_seq = genome.Sequence(nuc_seq[phase_offset:]).translate()
            orf_file.write(">coords" + str(coords_index) + '\n' + pep_seq + '\n')
        # hmmsearch reads the file by name, so the data must be on disk first.
        orf_file.flush()
        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed through verbatim.  universal_newlines
        # makes the output text rather than bytes.
        hmmout = subprocess.check_output(
            ['hmmsearch', '--max', '-E', str(evalue), hmm_profile, orf_file.name],
            universal_newlines=True
        )

    for line in hmmout.splitlines():
        if line[:2] == ">>":
            # Line looks like ">> coords<index> ..."; recover the index.
            hit_name = line[2:].split()[0]
            return exon_coords[int(hit_name[len("coords"):])]
    return []
#sets up default program paths, overwritten by any program paths passes with -f or --program_filepaths hmmsearch = args.hmmsearch_filepath target_nucdb = genome.Genome(args.target_nucdb) #Gets ORFs from the genome and hmmers them if args.frames_in: frames_file = args.frames_in else: if args.frames_out: frame_fasta = open(args.frames_out, 'w') else: frame_fasta = tempfile.NamedTemporaryFile('w') for seq_id in target_nucdb.genome_sequence: frameonef = genome.Sequence(target_nucdb.genome_sequence[seq_id]) frameoner = genome.Sequence( target_nucdb.genome_sequence[seq_id]).reverse_compliment() frames = [ frameonef, genome.Sequence(frameonef[1:]), genome.Sequence(frameonef[2:]), frameoner, genome.Sequence(frameoner[1:]), genome.Sequence(frameoner[2:]) ] fasta_list = [] for frame_num in (0, 1, 2, 3, 4, 5): frame = frames[frame_num] if frame_num < 3: frame_offset = frame_num else:
def exon_finder(tstart,tend,strand,qstart,qend,qlen,qstartphase,qendphase,seqdict,seqname, max_offset = 30, is_start = False, is_stop = False, nevermind_atg = False):
    """Find exon boundaries with an intact ORF around a hit.

    It is expected that tstart < tend, even for minus strand features, so these
    might need to be reversed if coming from say tblastn. Coordinates are
    expected as 1-based (as output from blast) and are converted internally to
    0-based when slicing. It is also expected that the hit itself doesn't
    contain stop codons.

    The start and then the end are searched independently: candidates are tried
    at the "ideal" position (the hit extended by the unaligned part of the
    query and the phase offset) and then at increasing multiples of 3 bases on
    either side of it, up to ``max_offset``. A candidate is accepted when the
    resulting sequence translates cleanly and the flanking splice site, start
    codon or stop codon fits the strand and exon type.

    Args:
        tstart, tend: hit coordinates on the target sequence.
        strand: '+' or '-'.
        qstart, qend: hit coordinates on the query; qlen: query length.
        qstartphase, qendphase: phases of the query exon at its start and end.
        seqdict: mapping of sequence name to sequence; seqname: key to use.
        max_offset: largest distance (in bases) from the ideal position to try.
        is_start: the exon holds the start codon (ATG, or CAT on '-').
        is_stop: the exon holds the stop codon.
        nevermind_atg: with ``is_start``, accept a boundary without checking
            for the start codon.

    Returns:
        ``[start, end, pseudo]`` where ``pseudo`` is ``False``, or ``"P"`` when
        either boundary had to fall back to the pseudo position.
    """
    start = None
    end = None
    pseudo = False
    # Offsets are mirrored between strands because tstart is always the
    # lower coordinate, which is the exon's 3' end on the minus strand.
    if strand == "+":
        phasestart_offset = (3 - qstartphase) % 3
        phasestop_offset = qendphase
        start_match_offset, stop_match_offset = 3 * (qstart - 1),3 * (qlen - qend)
    elif strand == '-':
        phasestart_offset = qendphase
        phasestop_offset = (3 - qstartphase) % 3
        # NOTE(review): uses 3 * qstart here but 3 * (qstart - 1) on the plus
        # strand -- possible off-by-one codon; confirm which is intended.
        start_match_offset, stop_match_offset = 3 * (qlen - qend), 3 * qstart
    # "ideal" = hit extended to cover the whole query exon; "pseudo" = hit
    # extended by the phase offset only (fallback when nothing better fits).
    ideal_start = tstart - phasestart_offset - start_match_offset
    ideal_end = tend + stop_match_offset + phasestop_offset
    pseudo_start = tstart - phasestart_offset
    pseudo_end = tend + phasestop_offset
    # First candidates flanked by "GC", kept as a second-choice boundary.
    gc_start, gc_end = None, None

    # --- search for the start (lower) coordinate ---
    for offset in range(0,max_offset + 3,3):
        if start:
            break
        # direction 1 tries ideal_start - offset, direction -1 ideal_start + offset
        for direction in [1,-1]:
            test_start = ideal_start - offset * direction
            test_seq = genome.Sequence(seqdict[seqname][test_start-1 + phasestart_offset:tend])
            if strand == "-":
                test_seq = test_seq.reverse_compliment()
            if not test_seq.translate():
                continue
            elif is_stop and strand == "-":
                # On the minus strand the stop codon sits at the lower
                # coordinate; the listed codons are the reverse complements
                # of TAA, TGA and TAG.
                if ideal_start - 1 < test_start < pseudo_start:
                    pseudo_start = test_start
                lastcodon = seqdict[seqname][test_start - 1:test_start + 2]
                if lastcodon.upper() in ['TTA','TCA','CTA'] and test_seq.translate().count('*') == 1:
                    start = test_start
                    break
            elif not "*" in test_seq.translate():
                # Remember the lowest stop-free candidate that is not below
                # ideal_start as the pseudo fallback.
                if ideal_start - 1 < test_start < pseudo_start:
                    pseudo_start = test_start
                if is_start and strand == '+':
                    if nevermind_atg:
                        start = test_start
                        break
                    else:
                        firstcodon = seqdict[seqname][test_start - 1:test_start + 2]
                        if firstcodon.upper() == "ATG":
                            start = test_start
                            break
                # NOTE(review): this else is attached to the is_start test;
                # the nesting was reconstructed from whitespace-collapsed
                # source -- confirm.
                else:
                    # Two bases immediately before the candidate start.
                    splicesite = seqdict[seqname][test_start-3:test_start-1]
                    if (strand == '+' and splicesite.upper() == "AG") or (strand == '-' and splicesite.upper() == "AC"):
                        start = test_start
                        break
                    elif strand == '-' and splicesite.upper() == "GC" and not gc_start:
                        gc_start = test_start
    if not start:
        if gc_start:
            start = gc_start
        else:
            pseudo = "P"
            start = pseudo_start

    # --- search for the end (upper) coordinate, using the chosen start ---
    for offset in range(0,max_offset + 3,3):
        if end:
            break
        # direction -1 tries ideal_end + offset, direction 1 ideal_end - offset
        for direction in [-1,1]:
            test_end = ideal_end - offset * direction
            if test_end - start < 3:
                # Leaves only the inner loop; the outer loop moves on to the
                # next offset.
                break
            test_seq = genome.Sequence(seqdict[seqname][start - 1 + phasestart_offset:test_end - phasestop_offset])
            if strand == "-":
                test_seq = test_seq.reverse_compliment()
            if not test_seq.translate():
                continue
            elif is_stop and strand == "+":
                if ideal_end + 1 > test_end > pseudo_end:
                    pseudo_end = test_end
                lastcodon = seqdict[seqname][test_end - 3:test_end]
                if lastcodon.upper() in ['TAA','TGA','TAG'] and test_seq.translate().count('*') == 1:
                    end = test_end
                    break
            # Stop-free, or (for a stop exon) stop-free apart from the final
            # translated residue.
            elif not "*" in test_seq.translate() or (is_stop and not "*" in test_seq.translate()[:-1]):
                if ideal_end + 1 > test_end > pseudo_end:
                    pseudo_end = test_end
                if is_start and strand == '-':
                    if nevermind_atg:
                        end = test_end
                        break
                    else:
                        # "CAT" is the reverse complement of ATG.
                        firstcodon = seqdict[seqname][test_end - 3:test_end]
                        if firstcodon.upper() == "CAT":
                            end = test_end
                            break
                else:
                    # Two bases immediately after the candidate end.
                    splicesite = seqdict[seqname][test_end:test_end + 2]
                    if (strand == '+' and splicesite.upper() == "GT") or (strand == '-' and splicesite.upper() == "CT"):
                        end = test_end
                        break
                    elif strand == "+" and splicesite.upper() == "GC" and not gc_end:
                        gc_end = test_end
    if not end:
        if gc_end:
            end = gc_end
        else:
            pseudo = "P"
            end = pseudo_end
    return [start,end,pseudo]