def process_nog_hits_file(hits_file, query_fasta, og2level, skip_queries=None,
                          translate=False, cpu=1, excluded_taxa=None,
                          base_tempdir=None):
    """Refine raw NOG hits against per-OG FASTA files, yielding refined hits.

    Reads `hits_file` (tab-separated, '#' lines are comments), keeps the first
    non-error hit per query sequence, builds one refinement command per query,
    and runs them through a multiprocessing pool, yielding each result from
    `search.refine_hit` as it completes.

    Args:
        hits_file: path to the (possibly gzipped) hits table; column 0 is the
            query name, column 1 the hit OG name ('-'/'ERROR' rows are skipped).
        query_fasta: FASTA file with the query sequences.
        og2level: mapping from cleaned OG name to its taxonomic level
            (used to locate the target FASTA under get_fasta_path()).
        skip_queries: optional iterable of query names to skip.
        translate: passed to seqio.iter_fasta_seqs (presumably translates
            nucleotide input to protein — confirm with seqio docs).
        cpu: number of worker processes.
        excluded_taxa: forwarded unchanged to search.refine_hit.
        base_tempdir: parent directory for the scratch temp dir.

    Yields:
        Whatever `search.refine_hit` returns for each command.
    """
    sequences = {name: seq for name, seq in
                 seqio.iter_fasta_seqs(query_fasta, translate=translate)}
    cmds = []
    visited_queries = set()
    if skip_queries:
        visited_queries.update(skip_queries)
    tempdir = mkdtemp(prefix='emappertmp_phmmer_', dir=base_tempdir)
    try:
        for line in gopen(hits_file):
            if line.startswith('#'):
                continue
            # list() is required: on Python 3, map() returns an iterator,
            # which does not support indexing (fields[0] would raise TypeError).
            fields = list(map(str.strip, line.split('\t')))
            seqname = fields[0]
            if fields[1] == '-' or fields[1] == 'ERROR':
                continue
            if seqname in visited_queries:
                continue
            hitname = cleanup_og_name(fields[1])
            level = og2level[hitname]
            seq = sequences[seqname]
            visited_queries.add(seqname)
            target_fasta = os.path.join(get_fasta_path(), level, "%s.fa" % hitname)
            cmds.append([seqname, seq, target_fasta, excluded_taxa, tempdir])
        if cmds:
            pool = multiprocessing.Pool(cpu)
            try:
                for r in pool.imap(search.refine_hit, cmds):
                    yield r
            finally:
                # Ensure workers die even if the consumer abandons the generator
                # or refine_hit raises.
                pool.terminate()
    finally:
        # Always remove the scratch dir, even on error or early generator close;
        # the original only cleaned up on the happy path.
        shutil.rmtree(tempdir)
def process_nog_hits_file(hits_file, query_fasta, og2level, skip_queries=None,
                          translate=False, cpu=1, excluded_taxa=None):
    """Refine raw NOG hits against per-OG FASTA files, yielding refined hits.

    Reads `hits_file` (tab-separated, '#' lines are comments), keeps the first
    non-error hit per query sequence, builds one refinement command per query,
    and runs them through a multiprocessing pool, yielding each result from
    `search.refine_hit` as it completes. Uses the module-level TEMPDIR and
    FASTA_PATH globals for scratch space and target FASTA lookup.

    Args:
        hits_file: path to the (possibly gzipped) hits table; column 0 is the
            query name, column 1 the hit OG name ('-'/'ERROR' rows are skipped).
        query_fasta: FASTA file with the query sequences.
        og2level: mapping from cleaned OG name to its taxonomic level
            (used to locate the target FASTA under FASTA_PATH).
        skip_queries: optional iterable of query names to skip.
        translate: passed to seqio.iter_fasta_seqs (presumably translates
            nucleotide input to protein — confirm with seqio docs).
        cpu: number of worker processes.
        excluded_taxa: forwarded unchanged to search.refine_hit.

    Yields:
        Whatever `search.refine_hit` returns for each command.
    """
    sequences = {name: seq for name, seq in
                 seqio.iter_fasta_seqs(query_fasta, translate=translate)}
    cmds = []
    visited_queries = set()
    if skip_queries:
        visited_queries.update(skip_queries)
    tempdir = mkdtemp(prefix='emappertmp_phmmer_', dir=TEMPDIR)
    try:
        for line in gopen(hits_file):
            if line.startswith('#'):
                continue
            # list() is required: on Python 3, map() returns an iterator,
            # which does not support indexing (fields[0] would raise TypeError).
            fields = list(map(str.strip, line.split('\t')))
            seqname = fields[0]
            if fields[1] == '-' or fields[1] == 'ERROR':
                continue
            if seqname in visited_queries:
                continue
            hitname = cleanup_og_name(fields[1])
            level = og2level[hitname]
            seq = sequences[seqname]
            visited_queries.add(seqname)
            target_fasta = os.path.join(FASTA_PATH, level, "%s.fa" % hitname)
            cmds.append([seqname, seq, target_fasta, excluded_taxa, tempdir])
        if cmds:
            pool = multiprocessing.Pool(cpu)
            try:
                for r in pool.imap(search.refine_hit, cmds):
                    yield r
            finally:
                # Ensure workers die even if the consumer abandons the generator
                # or refine_hit raises.
                pool.terminate()
    finally:
        # Always remove the scratch dir, even on error or early generator close;
        # the original only cleaned up on the happy path.
        shutil.rmtree(tempdir)