Exemplo n.º 1
0
def prepare_mash(data_folder, **kwargs):
    """
    Set up the folder structure and locate the executable needed for a MASH run.

    Args:
        data_folder: base directory in which the MASH output folders are made

    Keyword Args:
        v2: suffix appended to the created folder names (default '')
        exe_loc: location of mash executable (found with drep.get_exe if absent)
        debug: if True (together with 'wd'), log commands to the work directory
        wd: work directory object providing get_dir('cmd_logs')

    Returns:
        tuple: (logdir, MASH_folder, sketch_folder, mash_exe); logdir is False
        when command logging was not requested
    """
    append = kwargs.get('v2', '')

    # set up logdir; commands are only logged when both a work directory
    # and debug mode are supplied
    if ('wd' in kwargs) and kwargs.get('debug', False):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Find mash executable
    mash_exe = kwargs.get('exe_loc', None)
    if mash_exe is None:
        mash_exe = drep.get_exe('mash')

    # Make a folder to hold this information
    MASH_folder = os.path.join(data_folder, 'MASH_files{0}/'.format(append))
    if not os.path.exists(MASH_folder):
        os.makedirs(MASH_folder)

    # Make a folder in there to store sketches
    sketch_folder = os.path.join(MASH_folder, 'sketches{0}/'.format(append))
    if not os.path.exists(sketch_folder):
        os.makedirs(sketch_folder)

    return logdir, MASH_folder, sketch_folder, mash_exe
Exemplo n.º 2
0
def run_prodigal(genome_list, out_dir, **kwargs):
    '''
    Run prodigal on a set of genomes, store the output in the out_dir

    Args:
        genome_list: list of genomes to run prodigal on
        out_dir: output directory to store prodigal output

    Keyword Args:
        processors: number of processors to multithread with
        exe_loc: location of prodigal executable (will try and find with shutil if not provided)
        debug: log all of the commands
        wd: if you want to log commands, you also need the wd

    '''
    # Get set up
    t = kwargs.get('processors', '6')
    loc = kwargs.get('exe_loc', None)
    if loc is None:
        loc = drep.get_exe('prodigal')

    # Make sure it's a list
    assert isinstance(genome_list, list)

    # Make list of commands, skipping genomes whose output already exists
    cmds = []
    for genome in genome_list:
        base = os.path.join(out_dir, os.path.basename(genome))
        fna = base + '.fna'
        faa = base + '.faa'
        if not (os.path.exists(fna) and os.path.exists(faa)):
            # BUGFIX: previously the literal string 'prodigal' was used here,
            # ignoring the located/provided executable in `loc`
            cmds.append([
                loc, '-i', genome, '-d', fna, '-a', faa, '-m', '-p',
                'meta'
            ])

    # Run commands
    if len(cmds) > 0:
        # only log commands when a work directory and debug mode are supplied
        if ('wd' in kwargs) and kwargs.get('debug', False):
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False

        drep.thread_cmds(cmds, shell=False, logdir=logdir, t=int(t))

    else:
        logging.info("Past prodigal runs found- will not re-run")
Exemplo n.º 3
0
def run_pairwise_fastANI(genome_list, outdir, **kwargs):
    '''
    Run fastANI on all pairs of genomes in genome_list and load the results.

    Args:
        genome_list: list of genome file locations
        outdir: directory to store fastANI output

    Keyword Args:
        processors: number of threads to give fastANI (default 6)
        debug: log all of the commands
        wd: if you want to log commands, you also need the wd

    Returns:
        DataFrame: pairwise fastANI results (empty DataFrame on failure)
    '''
    p = kwargs.get('processors', 6)
    # Random code tags this run's output so parallel runs don't collide
    code = drep.d_cluster.utils._randomString(stringLength=10)

    # Make folders
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    tmp_dir = os.path.join(outdir, 'tmp/')
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Make genome list
    glist = os.path.join(tmp_dir, 'genomeList')
    glist = _make_glist(genome_list, glist)

    # Gen command
    exe_loc = drep.get_exe('fastANI')
    out_base = os.path.join(outdir, 'fastANI_out_{0}'.format(code))
    cmd = [
        exe_loc, '--ql', glist, '--rl', glist, '-o', out_base, '--matrix',
        '-t',
        str(p), '--minFraction',
        str(0)
    ]
    logging.debug(' '.join(cmd) + ' ' + code)

    # Run command
    if ('wd' in kwargs) and (kwargs.get('debug', False)):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False
    drep.thread_cmds([cmd], shell=False, logdir=logdir, t=1)

    # Load results
    fdb = load_fastani(out_base)

    # fix missing ones
    try:
        fdb = _fix_fastani(fdb)
        return fdb

    # handle broken self-comparisons; BUGFIX: was a bare `except:`, which
    # would also swallow KeyboardInterrupt/SystemExit
    except Exception:
        logging.error(
            "CRITICAL ERROR WITH SECONDARY CLUSTERING CODE {0}; SKIPPING".
            format(code))
        return pd.DataFrame()
Exemplo n.º 4
0
def run_pairwise_goANI(bdb, goANI_folder, prod_folder, **kwargs):
    '''
    Run pairwise goANI on a list of Genomes

    Args:
        bdb: DataFrame with ['genome', 'location']
        goANI_folder: folder to store gANI output
        prod_folder: folder containing prodigal output from genomes (will run if needed)

    Keyword arguments:
        debug: log all of the commands
        wd: if you want to log commands, you also need the wd
        processors: threads to use

    Returns:
        DataFrame: Ndb for gANI
    '''
    p = kwargs.get('processors', 6)
    nsimscan_exe = drep.get_exe('nsimscan')
    genomes = bdb['location'].tolist()

    # Make folders
    if not os.path.exists(goANI_folder):
        os.makedirs(goANI_folder)
    if not os.path.exists(prod_folder):
        os.makedirs(prod_folder)

    # Run prodigal (skips genomes whose gene predictions already exist)
    logging.debug("Running prodigal...")
    drep.d_filter.run_prodigal(bdb['location'].tolist(), prod_folder, **kwargs)

    # Gen gANI commands
    logging.debug("Running goANI...")
    cmds = []
    files = []
    for i, g1 in enumerate(genomes):
        # Give each reference its own folder to spread out .delta files
        cur_folder = os.path.join(
            goANI_folder, drep.d_cluster.utils._get_genome_name_from_fasta(g1))
        if not os.path.exists(cur_folder):
            os.makedirs(cur_folder)

        for j, g2 in enumerate(genomes):
            if i != j:
                name1 = drep.d_cluster.utils._get_genome_name_from_fasta(g1)
                name2 = drep.d_cluster.utils._get_genome_name_from_fasta(g2)
                file_name = "{0}/{1}_vs_{2}.sim".format(
                    cur_folder, name1, name2)
                files.append(file_name)

                # If the file doesn't already exist, add it to what needs to be run
                if not os.path.isfile(file_name):
                    fna1 = "{0}.fna".format(os.path.join(prod_folder, name1))
                    fna2 = "{0}.fna".format(os.path.join(prod_folder, name2))
                    cmds.append(
                        drep.d_cluster.utils.gen_goANI_cmd(
                            file_name, fna1, fna2, nsimscan_exe))

    # Run commands
    if len(cmds) > 0:
        logging.debug('Running goANI commands: {0}'.format('\n'.join(
            [' '.join(x) for x in cmds])))
        # only log commands when a work directory and debug mode are supplied
        if ('wd' in kwargs) and kwargs.get('debug', False):
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    else:
        logging.debug("goANI already run- will not re-run")

    # Parse output
    df = drep.d_cluster.utils.process_goani_files(files)

    # Add self-comparisons if there is only one genome (the pairwise loop
    # above produces nothing for a single genome)
    if len(genomes) == 1:
        Table = {
            'querry': [],
            'reference': [],
            'ani': [],
            'alignment_coverage': []
        }
        for g in genomes:
            Table['reference'].append(
                drep.d_cluster.utils._get_genome_name_from_fasta(g))
            Table['querry'].append(
                drep.d_cluster.utils._get_genome_name_from_fasta(g))
            Table['ani'].append(1)
            Table['alignment_coverage'].append(1)
        d = pd.DataFrame(Table)
        df = pd.concat([df, d], ignore_index=True)

    return df
Exemplo n.º 5
0
def all_vs_all_MASH(Bdb, data_folder, **kwargs):
    """
    Run MASH pairwise within all samples in Bdb

    Args:
        Bdb: dataframe with genome, location
        data_folder: location to store output files

    Keyword Args:
        MASH_sketch: size of mash sketches
        dry: dont actually run anything
        processors: number of processors to multithread with
        mash_exe: location of mash executable (will try and find with shutil if not provided)
        exe_loc: alternate name for mash_exe; takes priority when both given
        groupSize: max number of mash sketches to hold in each folder
        debug: if True, log all of the commands
        wd: if you want to log commands, you also need the wd

    Returns:
        DataFrame: Mdb with pairwise genome distances and similarities
    """

    MASH_s = kwargs.get('MASH_sketch', 1000)
    dry = kwargs.get('dry', False)
    p = kwargs.get('processors', 6)
    groupSize = kwargs.get('groupSize', 1000)

    # set up logdir; only log commands when a work directory and debug
    # mode are both supplied
    if ('wd' in kwargs) and kwargs.get('debug', False):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Find mash. BUGFIX: previously the documented 'mash_exe' keyword was
    # read and then unconditionally clobbered by 'exe_loc'; now 'exe_loc'
    # takes priority but 'mash_exe' is honored as a fallback.
    mash_exe = kwargs.get('exe_loc', kwargs.get('mash_exe', None))
    if mash_exe is None:
        mash_exe = drep.get_exe('mash')

    # Set up folders
    MASH_folder = os.path.join(data_folder, 'MASH_files/')
    if not os.path.exists(MASH_folder):
        os.makedirs(MASH_folder)

    sketch_folder = os.path.join(MASH_folder, 'sketches/')
    if not os.path.exists(sketch_folder):
        os.makedirs(sketch_folder)

    # Make chunks of at most groupSize genomes each, so no single folder
    # holds too many sketch files
    l2g = Bdb.set_index('location')['genome'].to_dict()
    locations = list(Bdb['location'].unique())
    chunks = [
        locations[x:x + groupSize] for x in range(0, len(locations), groupSize)
    ]

    # Make the MASH sketches (skipping any that already exist on disk)
    cmds = []
    chunk_folders = []
    for i, chunk in enumerate(chunks):
        chunk_folder = os.path.join(sketch_folder, "chunk_{0}".format(i))
        chunk_folders.append(chunk_folder)
        if not os.path.exists(chunk_folder):
            os.makedirs(chunk_folder)
        for fasta in chunk:
            genome = l2g[fasta]
            sketch_base = os.path.join(chunk_folder, genome)
            if not os.path.isfile(sketch_base + '.msh'):
                cmd = [
                    mash_exe, 'sketch', fasta, '-s',
                    str(MASH_s), '-o', sketch_base
                ]
                cmds.append(cmd)

    if not dry:
        if len(cmds) > 0:
            drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches within chunk
    cmds = []
    alls = []
    for chunk_folder in chunk_folders:
        all_file = os.path.join(chunk_folder, 'chunk_all.msh')
        cmd = [mash_exe, 'paste', all_file] \
                + glob.glob(os.path.join(chunk_folder, '*'))
        cmds.append(cmd)
        alls.append(all_file)
    if not dry:
        if len(cmds) > 0:
            drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches of all chunks
    all_file = os.path.join(MASH_folder, 'ALL.msh')
    cmd = [mash_exe, 'paste', all_file] + alls
    drep.run_cmd(cmd, dry, shell=False, logdir=logdir)

    # Calculate distances; shell=True is required for the '>' redirection
    table_loc = MASH_folder + 'MASH_table.tsv'
    cmd = [
        mash_exe, 'dist', '-p',
        str(p), all_file, all_file, '>', table_loc
    ]
    cmd = ' '.join(cmd)
    drep.run_cmd(cmd, dry, shell=True, logdir=logdir)

    # Make Mdb based on all genomes in the MASH folder
    iniCols = ['genome1', 'genome2', 'dist', 'p', 'kmers']
    uCols = ['genome1', 'genome2', 'dist']
    dTypes = {'genome1': 'category', 'genome2': 'category', 'dist': np.float32}
    Mdb = pd.read_csv(table_loc,
                      names=iniCols,
                      usecols=uCols,
                      dtype=dTypes,
                      sep='\t')
    Mdb['genome1'] = Mdb['genome1'].apply(_get_genome_name_from_fasta)
    Mdb['genome2'] = Mdb['genome2'].apply(_get_genome_name_from_fasta)
    Mdb['similarity'] = 1 - Mdb['dist']

    # Filter out those genomes that are in the MASH folder but shouldn't be in Mdb
    genomes = Bdb['genome'].unique()
    Mdb = Mdb[Mdb['genome1'].isin(genomes)]
    Mdb = Mdb[Mdb['genome2'].isin(genomes)]

    # Reorder categories to be correct (sorted, ordered categoricals)
    for g in ['genome1', 'genome2']:
        Mdb[g] = Mdb[g].cat.remove_unused_categories()
        Mdb[g] = Mdb[g].cat.reorder_categories(sorted((Mdb[g].unique())),
                                               ordered=True)

    return Mdb