Пример #1
0
def process_nog_hits_file(hits_file,
                          query_fasta,
                          og2level,
                          skip_queries=None,
                          translate=False,
                          cpu=1,
                          excluded_taxa=None,
                          base_tempdir=None):
    """Refine the first usable hit of every query listed in a hits file.

    This is a generator: nothing is read or executed until it is iterated.

    Args:
        hits_file: Path handed to ``gopen``. Lines are tab-separated; the
            first column is the query name and the second the hit name.
            Lines starting with ``#``, blank lines, and lines whose hit
            column is ``-`` or ``ERROR`` are ignored.
        query_fasta: FASTA file read through ``seqio.iter_fasta_seqs`` to
            obtain the sequence of each query.
        og2level: Mapping from cleaned hit name (``cleanup_og_name``) to
            the level used as sub-directory of ``get_fasta_path()``.
        skip_queries: Optional iterable of query names that must not be
            processed.
        translate: Forwarded to ``seqio.iter_fasta_seqs``.
        cpu: Number of worker processes in the pool.
        excluded_taxa: Passed through unchanged to ``search.refine_hit``.
        base_tempdir: Directory in which the scratch directory is created
            (``None`` lets ``mkdtemp`` pick its default location).

    Yields:
        Whatever ``search.refine_hit`` returns for each task, in the order
        the queries first appear in ``hits_file``.

    Raises:
        KeyError: If a hit name is missing from ``og2level`` or a query
            name is missing from ``query_fasta``.
    """
    sequences = {
        name: seq
        for name, seq in seqio.iter_fasta_seqs(query_fasta,
                                               translate=translate)
    }
    visited_queries = set()
    if skip_queries:
        visited_queries.update(skip_queries)

    tempdir = mkdtemp(prefix='emappertmp_phmmer_', dir=base_tempdir)
    # Everything after the scratch directory exists is guarded so that it is
    # removed even if parsing fails, a worker raises, or the consumer stops
    # iterating early (generator close).
    try:
        cmds = []
        for line in gopen(hits_file):
            if line.startswith('#') or not line.strip():
                continue

            # A real list is required: on Python 3 ``map`` returns an
            # iterator that cannot be indexed.
            fields = [field.strip() for field in line.split('\t')]
            seqname = fields[0]

            if fields[1] == '-' or fields[1] == 'ERROR':
                continue

            # Only the first usable hit of each query is refined.
            if seqname in visited_queries:
                continue

            hitname = cleanup_og_name(fields[1])
            level = og2level[hitname]

            seq = sequences[seqname]
            visited_queries.add(seqname)
            target_fasta = os.path.join(get_fasta_path(), level,
                                        "%s.fa" % hitname)
            cmds.append([seqname, seq, target_fasta, excluded_taxa, tempdir])

        if cmds:
            pool = multiprocessing.Pool(cpu)
            try:
                for r in pool.imap(search.refine_hit, cmds):
                    yield r
            finally:
                # Stop the workers and wait for them to exit before the
                # scratch directory they were given is deleted.
                pool.terminate()
                pool.join()
    finally:
        shutil.rmtree(tempdir)
Пример #2
0
def process_nog_hits_file(hits_file, query_fasta, og2level, skip_queries=None,
                          translate=False, cpu=1, excluded_taxa=None):
    """Refine the first usable hit of every query listed in a hits file.

    This is a generator: nothing is read or executed until it is iterated.

    Args:
        hits_file: Path handed to ``gopen``. Lines are tab-separated; the
            first column is the query name and the second the hit name.
            Lines starting with ``#``, blank lines, and lines whose hit
            column is ``-`` or ``ERROR`` are ignored.
        query_fasta: FASTA file read through ``seqio.iter_fasta_seqs`` to
            obtain the sequence of each query.
        og2level: Mapping from cleaned hit name (``cleanup_og_name``) to
            the level used as sub-directory of ``FASTA_PATH``.
        skip_queries: Optional iterable of query names that must not be
            processed.
        translate: Forwarded to ``seqio.iter_fasta_seqs``.
        cpu: Number of worker processes in the pool.
        excluded_taxa: Passed through unchanged to ``search.refine_hit``.

    Yields:
        Whatever ``search.refine_hit`` returns for each task, in the order
        the queries first appear in ``hits_file``.

    Raises:
        KeyError: If a hit name is missing from ``og2level`` or a query
            name is missing from ``query_fasta``.
    """
    sequences = {name: seq for name, seq in seqio.iter_fasta_seqs(
        query_fasta, translate=translate)}
    visited_queries = set()
    if skip_queries:
        visited_queries.update(skip_queries)

    # The scratch directory is created under the module-level TEMPDIR.
    tempdir = mkdtemp(prefix='emappertmp_phmmer_', dir=TEMPDIR)
    # Everything after the scratch directory exists is guarded so that it is
    # removed even if parsing fails, a worker raises, or the consumer stops
    # iterating early (generator close).
    try:
        cmds = []
        for line in gopen(hits_file):
            if line.startswith('#') or not line.strip():
                continue

            # A real list is required: on Python 3 ``map`` returns an
            # iterator that cannot be indexed.
            fields = [field.strip() for field in line.split('\t')]
            seqname = fields[0]

            if fields[1] == '-' or fields[1] == 'ERROR':
                continue

            # Only the first usable hit of each query is refined.
            if seqname in visited_queries:
                continue

            hitname = cleanup_og_name(fields[1])
            level = og2level[hitname]

            seq = sequences[seqname]
            visited_queries.add(seqname)
            target_fasta = os.path.join(FASTA_PATH, level, "%s.fa" % hitname)
            cmds.append([seqname, seq, target_fasta, excluded_taxa, tempdir])

        if cmds:
            pool = multiprocessing.Pool(cpu)
            try:
                for r in pool.imap(search.refine_hit, cmds):
                    yield r
            finally:
                # Stop the workers and wait for them to exit before the
                # scratch directory they were given is deleted.
                pool.terminate()
                pool.join()
    finally:
        shutil.rmtree(tempdir)