def hmmsearch_print_then_parse(gbkf, hmmf):
    """
    Write the CDS proteins to a temporary fasta file, run hmmsearch on it, and print a summary.

    Every CDS feature of every record yielded by genbank_seqio(gbkf) is written as a fasta entry, using
    the feature's 'translation' qualifier when present and otherwise translating the extracted feature.
    hmmsearch is then run with hmmf as the profile file and the temporary file as the sequence
    database, and its text output is parsed with SearchIO. The temporary file is removed before
    returning. If hmmsearch cannot be started or exits with a non-zero status, an error is written to
    stderr and the program exits with -1.

    :param gbkf: the GenBank file, handed to genbank_seqio
    :param hmmf: the HMM file, handed to hmmsearch as its profile argument
    :return: nothing; the result and hit counts are printed to stdout
    """
    import os  # function-scope import: only needed to remove the temporary file

    # delete=False so the file survives close() and hmmsearch can open it by name;
    # the finally clause below is responsible for removing it.
    aaout = NamedTemporaryFile(mode='w+t', delete=False)
    try:
        sys.stderr.write(f"Writing the amino acids to {aaout.name}\n")
        with aaout:
            for seq in genbank_seqio(gbkf):
                for feat in seq.features:
                    if feat.type != 'CDS':
                        continue
                    if 'translation' in feat.qualifiers:
                        aa = feat.qualifiers['translation'][0]
                    else:
                        aa = str(feat.extract(seq).translate().seq)
                    myid = feature_id(seq, feat)
                    aaout.write(f">{myid}\n{aa}\n")

        sys.stderr.write("Searching\n")
        try:
            # check=True turns a non-zero exit status into CalledProcessError;
            # OSError covers a missing or non-executable hmmsearch binary.
            search = subprocess.run(
                ["hmmsearch", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5', '--noali', hmmf, aaout.name],
                stdout=subprocess.PIPE,
                check=True,
            )
        except (OSError, subprocess.CalledProcessError) as e:
            sys.stderr.write(f"Error running hmmsearch:\n{e}\n")
            sys.exit(-1)
    finally:
        # Runs on success, on sys.exit() and on any exception, so the file never leaks.
        os.remove(aaout.name)

    sys.stderr.write("Parsing\n")
    results = SearchIO.parse(StringIO(search.stdout.decode()), 'hmmer3-text')

    allhits = {}
    hitcount = 0
    rescount = 0
    for res in results:
        allhits[res.id] = {}
        rescount += 1
        for hit in res:
            allhits[res.id][hit.id] = hit.evalue
            hitcount += 1

    print(
        f"Using hmmsearch and tempfiles there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )
def stream_hmmsearch(gbkf, hmmf):
    """
    NOTE: THIS DOES NOT WORK!!

    You can't stream against hmmsearch as it can't rewind the sequences. You either need to use
    hmmscan (slow) or a temp file (fast!)

    For each record yielded by genbank_seqio(gbkf), the CDS proteins are formatted as fasta and piped
    to the stdin of a separate hmmsearch process (sequence file argument '-'). The text output is
    parsed with SearchIO and the counts are accumulated across all records.

    :param gbkf: the GenBank file, handed to genbank_seqio
    :param hmmf: the HMM file, handed to hmmsearch as its profile argument
    :return: nothing; the result and hit counts are printed to stdout
    """
    allhits = {}
    hitcount = 0
    rescount = 0
    for seq in genbank_seqio(gbkf):
        prots = []
        for feat in seq.features:
            if feat.type != 'CDS':
                continue
            if 'translation' in feat.qualifiers:
                aa = feat.qualifiers['translation'][0]
            else:
                aa = str(feat.extract(seq).translate().seq)
            myid = feature_id(seq, feat)
            prots.append(f">{myid}\n{aa}")

        if not prots:
            # No CDS in this record: don't start hmmsearch on an empty stdin.
            continue

        search = subprocess.Popen(
            ["hmmsearch", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5', '--noali', hmmf, '-'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        hmmresult = search.communicate(input="\n".join(prots).encode())[0]
        if search.returncode != 0:
            # Report the failure instead of hiding it; whatever output was
            # produced is still parsed, as before.
            sys.stderr.write(f"hmmsearch exited with status {search.returncode}\n")

        results = SearchIO.parse(StringIO(hmmresult.decode()), 'hmmer3-text')
        for res in results:
            # One search runs per record, so the same result id can come back
            # several times: keep the hits already stored for it.
            hits_for_res = allhits.setdefault(res.id, {})
            rescount += 1
            for hit in res:
                hits_for_res[hit.id] = hit.evalue
                hitcount += 1

    print(
        f"Using hmmsearch and streaming all at once there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )
def run_hmmscan_aao(gbkf, hmmf):
    """
    Run hmmscan once per record, streaming all of that record's CDS proteins at once.

    For each record yielded by genbank_seqio(gbkf), the CDS proteins are formatted as fasta (using the
    'translation' qualifier when present, otherwise translating the extracted feature) and piped to
    the stdin of an hmmscan process (sequence file argument '-'). The text output is parsed with
    SearchIO and the counts are accumulated across all records.

    :param gbkf: the GenBank file, handed to genbank_seqio
    :param hmmf: the HMM file, handed to hmmscan as its profile database argument
    :return: nothing; the result and hit counts are printed to stdout
    """
    allhits = {}
    hitcount = 0
    rescount = 0
    for seq in genbank_seqio(gbkf):
        prots = []
        for feat in seq.features:
            if feat.type != 'CDS':
                continue
            if 'translation' in feat.qualifiers:
                aa = feat.qualifiers['translation'][0]
            else:
                aa = str(feat.extract(seq).translate().seq)
            myid = feature_id(seq, feat)
            prots.append(f">{myid}\n{aa}")

        if not prots:
            # No CDS in this record: don't start hmmscan on an empty stdin.
            continue

        search = subprocess.Popen(
            ["hmmscan", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5', '--noali', hmmf, '-'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        hmmresult = search.communicate(input="\n".join(prots).encode())[0]
        if search.returncode != 0:
            # Report the failure instead of hiding it; whatever output was
            # produced is still parsed, as before.
            sys.stderr.write(f"hmmscan exited with status {search.returncode}\n")

        results = SearchIO.parse(StringIO(hmmresult.decode()), 'hmmer3-text')
        for res in results:
            # Keep any hits already stored under this id rather than resetting them.
            hits_for_res = allhits.setdefault(res.id, {})
            rescount += 1
            for hit in res:
                hits_for_res[hit.id] = hit.evalue
                hitcount += 1

    print(
        f"Using hmmscan and streaming all at once there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )