def trim_ambiguous_nucleotides(sequence):
    """Return *sequence* with every match of ``ambiguous_pattern`` removed.

    ``ambiguous_pattern`` is a compiled regex defined elsewhere in this
    module; each match is replaced by the empty string.
    """
    return ambiguous_pattern.sub("", sequence)


def longest_ORF(sequence):
    """Return the longest ``orf_pattern`` match in *sequence*, ends trimmed.

    All matches of ``orf_pattern`` (a compiled regex defined elsewhere in
    this module) are collected and the longest one is returned with its
    first and last characters removed.  Returns ``""`` when there is no
    match at all.
    """
    open_reading_frames = orf_pattern.findall(sequence)
    if not open_reading_frames:
        return ""
    # trim M and X
    # NOTE(review): presumably the leading start residue and the trailing
    # stop marker matched by orf_pattern -- confirm against its definition.
    return max(open_reading_frames, key=len)[1:-1]


if __name__ == "__main__":
    # The docstring must go in as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would replace the
    # program name in the usage line with the whole docstring.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_file", type=str, help="Target fasta file")
    args = parser.parse_args()

    for seq in split_fasta(args.input_file):
        data = trim_ambiguous_nucleotides(seq.data.lower())
        header = seq.header

        # keep track of longest ORF for each of 3 forward frames
        peptides = [
            longest_ORF(translation)
            for translation in forward_frame_translate(data)
        ]

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(header)
        print(max(peptides, key=len))
""" Given a fasta reference, only print requested sequences (search by identifiers) """ import argparse import fastaparse parser = argparse.ArgumentParser() parser.add_argument("reference_file",type=str,help="Reference fasta database") parser.add_argument("id_file",type=str,help="File containing references to sequences") args = parser.parse_args() sequences = fastaparse.split_fasta(args.reference_file) results = list() for seq_id in open(args.id_file): for seq in sequences: if seq_id.strip() in seq.header: results.append(seq.fasta()) for seq in results: print seq