Example #1
def run_centrifuge(Bdb, prod_dir, cent_dir, **kwargs):
    t = kwargs.get('processors', '6')

    cmds = []
    for genome in Bdb['genome'].unique():
        genes = "{0}{1}.fna".format(prod_dir, genome)
        cent = "{0}{1}".format(cent_dir, genome)
        if not (os.path.exists("{0}_hits.tsv".format(cent)) and
                os.path.exists("{0}_report.tsv".format(cent))):
            cmds.append(gen_centrifuge_cmd(genes, cent, **kwargs))

    if len(cmds) >= 1:
        logging.info('Running Centrifuge')
        for cmd in cmds:
            logging.debug(' '.join(cmd))

        if 'wd' in kwargs:
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False
        drep.thread_cmds(cmds, shell=False, logdir=logdir, t=int(t))
        #drep.d_cluster.thread_mash_cmds_status(cmds,t=int(t))

    else:
        logging.info('Past centrifuge runs found- will not re-run')
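A minimal usage sketch for run_centrifuge, with the function and its helpers in scope. The inputs are assumptions inferred from the code: Bdb needs a 'genome' column, prod_dir must already hold one <genome>.fna gene file per genome, and both directory arguments need trailing slashes because they are concatenated directly with the genome name.

import pandas as pd

# Hypothetical inputs; gene files like prodigal/genomeA.fna must already exist
Bdb = pd.DataFrame({'genome': ['genomeA', 'genomeB']})
run_centrifuge(Bdb, 'prodigal/', 'centrifuge/', processors='6')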
Example #2
def fastani_one_vs_many(one, many, genome_rep_file, outdir, **kwargs):
    p = kwargs.get('processors', 6)
    code = drep.d_cluster.utils._randomString(stringLength=10)
    tmp_dir = kwargs.get('tmp_dir')
    logdir = kwargs.get('logdir')
    exe_loc = kwargs.get('current_exe')
    redo = kwargs.get('redo', False)

    # Gen command
    out_base = os.path.join(outdir, 'fastANI_out_{0}'.format(code))
    cmd = [
        exe_loc, '-q', one, '--rl', genome_rep_file, '-o', out_base,
        '--matrix', '-t', str(p), '--minFraction', str(0)
    ]
    logging.debug(' '.join(cmd) + ' ' + code)

    # Run command
    drep.thread_cmds([cmd], shell=False, logdir=logdir, t=1)

    # Load results
    fdb = load_fastani(out_base)

    return fdb
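A usage sketch under assumed inputs: 'one' is a query FASTA path, genome_rep_file is a text file listing one reference genome path per line (it is passed to fastANI as --rl), and current_exe must point at a fastANI binary. Note that the 'many' and 'tmp_dir' arguments are accepted but never used in the body above.

# Hypothetical paths
fdb = fastani_one_vs_many('genomes/query.fasta', None, 'reference_list.txt',
                          'fastani_out/', current_exe='/usr/local/bin/fastANI',
                          processors=6, logdir=False)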
Example #3
def run_prodigal(bdb, out_dir, **kwargs):
    t = kwargs.get('processors', '6')
    loc = shutil.which('prodigal')
    if loc is None:
        logging.error("Cannot locate the program {0} - make sure it's in the system path"
                      .format('prodigal'))
        sys.exit()

    cmds = []
    for genome in bdb['location'].unique():
        fna = "{0}{1}{2}".format(out_dir, os.path.basename(genome), '.fna')
        faa = "{0}{1}{2}".format(out_dir, os.path.basename(genome), '.faa')
        if not (os.path.exists(fna) and os.path.exists(faa)):
            cmds.append([
                'prodigal', '-i', genome, '-d', fna, '-a', faa, '-m', '-p',
                'meta'
            ])

    if len(cmds) > 0:
        if 'wd' in kwargs:
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False
        drep.thread_cmds(cmds, shell=False, logdir=logdir, t=int(t))

    else:
        logging.info("Past prodigal runs found- will not re-run")
Example #4
File: d_cluster.py  Project: pythseq/drep
def run_pairwise_ANIn(genome_list, ANIn_folder, **kwargs):
    '''
    Given a list of genomes and an output folder, compare all genomes using ANIn

    Args:
        genome_list: list of locations of genome files
        ANIn_folder: folder to store the output of comparison

    Keyword arguments:
        processors: threads to use
        debug: if True, save extra output
        wd: needed if debug is True
    '''
    p = kwargs.get('processors', 6)
    genomes = genome_list

    # Make folder
    if not os.path.exists(ANIn_folder):
        os.makedirs(ANIn_folder)

    # Gen commands
    cmds = []
    files = []
    for g1 in genomes:
        # Make it so each reference is its own folder, to spread out .delta files
        cur_folder = os.path.join(ANIn_folder, _get_genome_name_from_fasta(g1))
        if not os.path.exists(cur_folder):
            os.makedirs(cur_folder)

        for g2 in genomes:
            file_name = "{0}{1}_vs_{2}".format(ANIn_folder, \
                        _get_genome_name_from_fasta(g1),\
                        _get_genome_name_from_fasta(g2))
            files.append(file_name)

            # If the file doesn't already exist, add it to what needs to be run
            if not os.path.isfile(file_name + '.delta'):
                cmds.append(gen_nucmer_cmd(file_name, g1, g2))

    # Run commands
    if len(cmds) > 0:
        for c in cmds:
            logging.debug(' '.join(c))

        if ('wd' in kwargs) and (kwargs.get('debug', False)):
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False
        drep.thread_cmds(cmds, shell=False, logdir=logdir, t=int(p))

    # Make dictionary of genome lengths
    org_lengths = {}
    for genome in genomes:
        org_lengths[_get_genome_name_from_fasta(genome)] = \
            drep.d_filter.calc_fasta_length(genome)

    deltafiles = ["{0}.delta".format(file) for file in files]
    df = process_deltafiles(deltafiles, org_lengths, **kwargs)

    return df
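A usage sketch following the docstring; nucmer must be available for the generated commands to run, and ANIn_folder needs a trailing slash because it is concatenated directly into the comparison file names. The paths are hypothetical.

# Hypothetical genome FASTA paths
genomes = ['/data/genomeA.fasta', '/data/genomeB.fasta']
df = run_pairwise_ANIn(genomes, 'ANIn_out/', processors=6)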
Example #5
def run_prodigal(genome_list, out_dir, **kwargs):
    '''
    Run prodigal on a set of genomes, store the output in the out_dir

    Args:
        genome_list: list of genomes to run prodigal on
        out_dir: output directory to store prodigal output

    Keyword Args:
        processors: number of processors to multithread with
        exe_loc: location of the prodigal executable (will try to find it with shutil if not provided)
        debug: log all of the commands
        wd: if you want to log commands, you also need the wd

    '''
    # Get set up
    t = kwargs.get('processors', '6')
    loc = kwargs.get('exe_loc', None)
    if loc is None:
        loc = drep.get_exe('prodigal')

    # Make sure it's a list
    assert isinstance(genome_list, list)

    # Make list of commands
    cmds = []
    for genome in genome_list:
        fna = "{0}{1}".format(os.path.join(out_dir, os.path.basename(genome)),
                              '.fna')
        faa = "{0}{1}".format(os.path.join(out_dir, os.path.basename(genome)),
                              '.faa')
        if not (os.path.exists(fna) and os.path.exists(faa)):
            cmds.append([
                loc, '-i', genome, '-d', fna, '-a', faa, '-m', '-p', 'meta'
            ])

    # Run commands
    if len(cmds) > 0:
        if ('wd' in kwargs) and kwargs.get('debug', False):
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False

        drep.thread_cmds(cmds, shell=False, logdir=logdir, t=int(t))

    else:
        logging.info("Past prodigal runs found- will not re-run")
Example #6
def run_pairwise_fastANI(genome_list, outdir, **kwargs):
    p = kwargs.get('processors', 6)
    code = drep.d_cluster.utils._randomString(stringLength=10)

    # Make folders
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    tmp_dir = os.path.join(outdir, 'tmp/')
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Make genome list
    glist = os.path.join(tmp_dir, 'genomeList')
    glist = _make_glist(genome_list, glist)

    # Gen command
    exe_loc = drep.get_exe('fastANI')
    out_base = os.path.join(outdir, 'fastANI_out_{0}'.format(code))
    cmd = [
        exe_loc, '--ql', glist, '--rl', glist, '-o', out_base, '--matrix',
        '-t', str(p), '--minFraction', str(0)
    ]
    logging.debug(' '.join(cmd) + ' ' + code)

    # Run command
    if ('wd' in kwargs) and (kwargs.get('debug', False)):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False
    drep.thread_cmds([cmd], shell=False, logdir=logdir, t=1)

    # Load results
    fdb = load_fastani(out_base)

    # fix missing ones
    try:
        fdb = _fix_fastani(fdb)
        return fdb

    # handle broken self
    except Exception:
        logging.error(
            "CRITICAL ERROR WITH SECONDARY CLUSTERING CODE {0}; SKIPPING".
            format(code))
        return pd.DataFrame()
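A usage sketch with hypothetical paths; run_pairwise_fastANI locates fastANI via drep.get_exe, so the binary must be discoverable on the system.

fdb = run_pairwise_fastANI(['/data/genomeA.fasta', '/data/genomeB.fasta'],
                           'fastani_out', processors=6)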
Example #7
def run_mash_on_genome_chunks(genome_chunks, mash_exe, sketch_folder,
                              MASH_folder, logdir, **kwargs):
    dry = kwargs.get('dry', False)
    p = kwargs.get('processors', 6)
    MASH_s = kwargs.get('MASH_sketch', 1000)
    multi_round = kwargs.get('multiround_primary_clustering', True)

    # Step 1) Create Mash sketches
    cmds = []
    for GC in genome_chunks:
        cmds += GC.gen_sketch_cmds(mash_exe, MASH_s)
    if (not dry) & (len(cmds) > 0):
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Step 2) Combine MASH sketches within chunks
    cmds = [GC.gen_paste_cmd(mash_exe) for GC in genome_chunks]
    if (not dry) & (len(cmds) > 0):
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Merge the pasted chunks and make a new genomeChunk if that's what you want
    if (not multi_round) and (len(genome_chunks) > 1):
        cmd, new_gc = drep.d_cluster.utils.merge_genome_chunks(
            mash_exe, genome_chunks, sketch_folder, MASH_folder)
        genome_chunks = [new_gc]
        drep.run_cmd(cmd, dry, shell=False, logdir=logdir)

    # Step 3) Run Mash on each chunk
    cmds = [GC.gen_dist_cmd(mash_exe, MASH_folder, p) for GC in genome_chunks]
    for j, cmd in enumerate(cmds):
        if not dry:
            if len(cmds) > 1:
                logging.info(f"  Comparing group {j+1} of {len(cmds)}")
            drep.run_cmd(cmd, dry, shell=True, logdir=logdir)

    # Step 4) Load the Mash tables of each chunk
    for GC in genome_chunks:
        GC.load_mash_table()

    return genome_chunks
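A heavily hedged usage sketch: the GenomeChunk interface (gen_sketch_cmds, gen_paste_cmd, gen_dist_cmd, load_mash_table) is inferred from the calls above, and chunk construction is not shown in this excerpt, so genome_chunks is a placeholder.

import drep

# genome_chunks: pre-built GenomeChunk-like objects (constructor not shown here)
mash_exe = drep.get_exe('mash')
genome_chunks = run_mash_on_genome_chunks(
    genome_chunks, mash_exe, 'MASH/sketches/', 'MASH/', logdir=False,
    processors=6, MASH_sketch=1000, multiround_primary_clustering=True)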
Example #8
def run_pairwise_goANI(bdb, goANI_folder, prod_folder, **kwargs):
    '''
    Run pairwise goANI on a list of Genomes

    Args:
        bdb: DataFrame with ['genome', 'location']
        goANI_folder: folder to store goANI output
        prod_folder: folder containing prodigal output from genomes (will run if needed)

    Keyword arguments:
        debug: log all of the commands
        wd: if you want to log commands, you also need the wd
        processors: threads to use

    Returns:
        DataFrame: Ndb for goANI
    '''
    p = kwargs.get('processors', 6)
    nsimscan_exe = drep.get_exe('nsimscan')
    genomes = bdb['location'].tolist()

    # Make folders
    if not os.path.exists(goANI_folder):
        os.makedirs(goANI_folder)
    if not os.path.exists(prod_folder):
        os.makedirs(prod_folder)

    # Run prodigal
    logging.debug("Running prodigal...")
    drep.d_filter.run_prodigal(bdb['location'].tolist(), prod_folder, **kwargs)

    # Gen goANI commands
    logging.debug("Running goANI...")
    cmds = []
    files = []
    for i, g1 in enumerate(genomes):
        # Make it so each reference is its own folder, to spread out .delta files
        cur_folder = os.path.join(
            goANI_folder, drep.d_cluster.utils._get_genome_name_from_fasta(g1))
        if not os.path.exists(cur_folder):
            os.makedirs(cur_folder)

        for j, g2 in enumerate(genomes):
            if i != j:
                name1 = drep.d_cluster.utils._get_genome_name_from_fasta(g1)
                name2 = drep.d_cluster.utils._get_genome_name_from_fasta(g2)
                file_name = "{0}/{1}_vs_{2}.sim".format(
                    cur_folder, name1, name2)
                files.append(file_name)

                # If the file doesn't already exist, add it to what needs to be run
                if not os.path.isfile(file_name):
                    fna1 = "{0}.fna".format(os.path.join(prod_folder, name1))
                    fna2 = "{0}.fna".format(os.path.join(prod_folder, name2))
                    cmds.append(
                        drep.d_cluster.utils.gen_goANI_cmd(
                            file_name, fna1, fna2, nsimscan_exe))

    # Run commands
    if len(cmds) > 0:
        logging.debug('Running goANI commands: {0}'.format('\n'.join(
            [' '.join(x) for x in cmds])))
        if ('wd' in kwargs) and kwargs.get('debug', False):
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    else:
        logging.debug("goANI already run- will not re-run")

    # Parse output
    df = drep.d_cluster.utils.process_goani_files(files)

    # Add self-comparisons if there is only one genome
    if len(genomes) == 1:
        Table = {
            'querry': [],
            'reference': [],
            'ani': [],
            'alignment_coverage': []
        }
        for g in genomes:
            Table['reference'].append(
                drep.d_cluster.utils._get_genome_name_from_fasta(g))
            Table['querry'].append(
                drep.d_cluster.utils._get_genome_name_from_fasta(g))
            Table['ani'].append(1)
            Table['alignment_coverage'].append(1)
        d = pd.DataFrame(Table)
        df = pd.concat([df, d], ignore_index=True)

    return df
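A usage sketch per the docstring; prodigal and nsimscan must both be discoverable via drep.get_exe, and the table values are hypothetical.

import pandas as pd

# bdb needs 'genome' (display name) and 'location' (FASTA path) columns
bdb = pd.DataFrame({'genome': ['genomeA', 'genomeB'],
                    'location': ['/data/genomeA.fasta', '/data/genomeB.fasta']})
Ndb = run_pairwise_goANI(bdb, 'goANI_out/', 'prodigal_out/', processors=6)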
Example #9
File: d_cluster.py  Project: pythseq/drep
def all_vs_all_MASH(Bdb, data_folder, **kwargs):
    """
    Run MASH pairwise within all samples in Bdb

    Args:
        Bdb: dataframe with genome, location
        data_folder: location to store output files

    Keyword Args:
        MASH_sketch: size of mash sketches
        dry: don't actually run anything
        processors: number of processors to multithread with
        mash_exe: location of the mash executable (will try to find it with shutil if not provided)
        groupSize: max number of mash sketches to hold in each folder
        debug: if True, log all of the commands
        wd: if you want to log commands, you also need the wd
    """

    MASH_s = kwargs.get('MASH_sketch', 1000)
    dry = kwargs.get('dry', False)
    # overwrite = kwargs.get('overwrite', False)
    mash_exe = kwargs.get('mash_exe', None)
    p = kwargs.get('processors', 6)
    groupSize = kwargs.get('groupSize', 1000)

    # set up logdir
    if ('wd' in kwargs) and kwargs.get('debug', False):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Find mash
    if mash_exe is None:
        mash_exe = drep.get_exe('mash')

    # Set up folders
    MASH_folder = os.path.join(data_folder, 'MASH_files/')
    if not os.path.exists(MASH_folder):
        os.makedirs(MASH_folder)

    sketch_folder = os.path.join(MASH_folder, 'sketches/')
    if not os.path.exists(sketch_folder):
        os.makedirs(sketch_folder)

    # Make chunks
    l2g = Bdb.set_index('location')['genome'].to_dict()
    locations = list(Bdb['location'].unique())
    chunks = [
        locations[x:x + groupSize] for x in range(0, len(locations), groupSize)
    ]

    # Make the MASH sketches
    cmds = []
    chunk_folders = []
    for i, chunk in enumerate(chunks):
        chunk_folder = os.path.join(sketch_folder, "chunk_{0}".format(i))
        chunk_folders.append(chunk_folder)
        if not os.path.exists(chunk_folder):
            os.makedirs(chunk_folder)
        for fasta in chunk:
            genome = l2g[fasta]
            file = os.path.join(chunk_folder, genome)
            if not os.path.isfile(file + '.msh'):
                cmd = [mash_exe, 'sketch', fasta, '-s', str(MASH_s), '-o', file]
                cmds.append(cmd)

    if not dry:
        if len(cmds) > 0:
            drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches within chunk
    cmds = []
    alls = []
    for chunk_folder in chunk_folders:
        all_file = os.path.join(chunk_folder, 'chunk_all.msh')
        cmd = [mash_exe, 'paste', all_file] \
                + glob.glob(os.path.join(chunk_folder, '*'))
        cmds.append(cmd)
        alls.append(all_file)
    if not dry:
        if len(cmds) > 0:
            drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches of all chunks
    all_file = os.path.join(MASH_folder, 'ALL.msh')
    cmd = [mash_exe, 'paste', all_file] + alls
    drep.run_cmd(cmd, dry, shell=False, logdir=logdir)

    # Calculate distances
    cmd = [
        mash_exe, 'dist', '-p', str(p), all_file, all_file, '>',
        MASH_folder + 'MASH_table.tsv'
    ]
    cmd = ' '.join(cmd)
    drep.run_cmd(cmd, dry, shell=True, logdir=logdir)

    # Make Mdb based on all genomes in the MASH folder
    file = MASH_folder + 'MASH_table.tsv'

    iniCols = ['genome1', 'genome2', 'dist', 'p', 'kmers']
    uCols = ['genome1', 'genome2', 'dist']
    dTypes = {'genome1': 'category', 'genome2': 'category', 'dist': np.float32}
    Mdb = pd.read_csv(file,
                      names=iniCols,
                      usecols=uCols,
                      dtype=dTypes,
                      sep='\t')
    Mdb['genome1'] = Mdb['genome1'].apply(_get_genome_name_from_fasta)
    Mdb['genome2'] = Mdb['genome2'].apply(_get_genome_name_from_fasta)
    Mdb['similarity'] = 1 - Mdb['dist']

    # Filter out those genomes that are in the MASH folder but shouldn't be in Mdb
    genomes = Bdb['genome'].unique()
    Mdb = Mdb[Mdb['genome1'].isin(genomes)]
    Mdb = Mdb[Mdb['genome2'].isin(genomes)]

    # Reorder categories to be correct
    for g in ['genome1', 'genome2']:
        Mdb[g] = Mdb[g].cat.remove_unused_categories()
        Mdb[g] = Mdb[g].cat.reorder_categories(sorted(Mdb[g].unique()),
                                               ordered=True)

    return Mdb
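A usage sketch per the docstring; mash must be discoverable unless mash_exe is supplied, and the inputs are hypothetical.

import pandas as pd

# Bdb needs 'genome' and 'location' columns, per the docstring
Bdb = pd.DataFrame({'genome': ['genomeA', 'genomeB'],
                    'location': ['/data/genomeA.fasta', '/data/genomeB.fasta']})
Mdb = all_vs_all_MASH(Bdb, 'data/', processors=6, MASH_sketch=1000)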