def run_checkM(genome_folder, checkm_outf, **kwargs):
    '''
    Run checkM on a folder of genomes and return the resulting quality table

    checkM is run twice: once with the chosen workflow, and once with
    "checkm qa ... -o 2" to produce the table that is loaded and returned.

    Args:
        genome_folder: location of folder to run checkM on (the commands are
            built with "-g -x faa")
        checkm_outf: location of folder to store checkM output. 'lineage.ms',
            'Bacteria.ms' and 'Chdb.tsv' are appended to this string by plain
            concatenation, so it needs to end with a path separator

    Keyword args:
        processors: number of threads (default '6')
        checkM_method: 'taxonomy_wf' builds the taxonomy command; any other
            value is passed through as the workflow name (default 'lineage_wf')
        wd: if present, commands are logged to wd.get_dir('cmd_logs')

    Returns:
        DataFrame loaded from <checkm_outf>Chdb.tsv

    Exits via sys.exit() if checkm can't be located, isn't working, or the
    final table can't be loaded
    '''
    import drep.d_bonus as dBonus

    t = str(kwargs.get('processors', '6'))

    # Find checkm exe
    loc, works = dBonus.find_program('checkm')
    if loc is None:
        logging.error('Cannot locate the program {0}- make sure its in the system path'\
            .format('checkm'))
        sys.exit()
    if works == False:
        logging.error('Program {0} is not working!! Im going to crash now'\
            .format('checkm'))
        sys.exit()
    check_exe = loc

    checkm_method = kwargs.get('checkM_method', 'lineage_wf')

    # Both commands are logged to the same place, so look it up once
    if 'wd' in kwargs:
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Run checkM initial
    results_loc = checkm_outf + '/results.tsv'
    if checkm_method == 'taxonomy_wf':
        cmd = [check_exe, checkm_method, 'domain', 'Bacteria', genome_folder, checkm_outf,
               '-f', results_loc, '--tab_table', '-t', t, '-g', '-x', 'faa']
    else:
        cmd = [check_exe, checkm_method, genome_folder, checkm_outf,
               '-f', results_loc, '--tab_table', '-t', t, '--pplacer_threads', t,
               '-g', '-x', 'faa']

    logging.debug("Running CheckM with command: {0}".format(cmd))
    drep.run_cmd(cmd, shell=False, logdir=logdir)

    # Run checkM again for the better table
    if checkm_method == 'taxonomy_wf':
        lineage = checkm_outf + 'Bacteria.ms'
    else:
        lineage = checkm_outf + 'lineage.ms'
    desired_file = checkm_outf + 'Chdb.tsv'
    cmd = [check_exe, 'qa', lineage, checkm_outf, '-f', desired_file, '-t',
           t, '--tab_table', '-o', '2']

    logging.debug("Running CheckM with command: {0}".format(cmd))
    drep.run_cmd(cmd, shell=False, logdir=logdir)

    # Load table and return it
    try:
        chdb = pd.read_table(desired_file, sep='\t')
    except Exception:
        # Exception (not a bare except) so Ctrl-C is not reported as a checkM failure
        logging.error("!!! checkM failed !!!\nIf using pyenv, make sure both python2 and " +\
            "python3 are available (for example: pyenv global 3.5.1 2.7.9)")
        sys.exit()

    return chdb
def unit_tests_6(self):
    '''
    Test drep call commands

    Runs one "mash dist" command through drep.run_cmd in shell mode and
    checks how many entries end up in the command log folder.
    '''
    work_dir = WorkDirectory(self.working_wd_loc)
    mash_dir = work_dir.get_dir('MASH')
    logs_dir = work_dir.get_dir('cmd_logs')

    # The same sketch file is compared against itself; output is redirected
    # by the shell into the MASH folder
    sketch_loc = mash_dir + 'ALL.msh'
    table_loc = mash_dir + 'MASH_table.tsv'
    command = ' '.join(('mash', 'dist', sketch_loc, sketch_loc, '>', table_loc))

    drep.run_cmd(command, shell=True, logdir=logs_dir)

    logged = glob.glob(logs_dir + '*')
    assert len(logged) == 3
def run_mash_on_genome_chunks(genome_chunks, mash_exe, sketch_folder, MASH_folder, logdir, **kwargs):
    """
    Sketch, paste and compare each genome chunk with Mash, then load the tables

    Args:
        genome_chunks: list of chunk objects; each one is asked for its own
            sketch / paste / dist commands and to load its own Mash table
        mash_exe: mash executable handed to the command generators
        sketch_folder: passed to merge_genome_chunks when chunks are merged
        MASH_folder: passed to the dist command generator and to merge_genome_chunks
        logdir: forwarded to drep.thread_cmds / drep.run_cmd

    Keyword Args:
        dry: if truthy, commands are generated but not run (default False)
        processors: number of threads (default 6)
        MASH_sketch: sketch size handed to gen_sketch_cmds (default 1000)
        multiround_primary_clustering: if falsy and there is more than one
            chunk, the chunks are merged into a single chunk (default True)

    Returns:
        the list of genome chunks (a single merged chunk if merging happened)
    """
    dry = kwargs.get('dry', False)
    threads = kwargs.get('processors', 6)
    sketch_size = kwargs.get('MASH_sketch', 1000)
    multi_round = kwargs.get('multiround_primary_clustering', True)
    live = not dry

    # Step 1) Create Mash sketches
    sketch_cmds = []
    for chunk in genome_chunks:
        sketch_cmds.extend(chunk.gen_sketch_cmds(mash_exe, sketch_size))
    if live and sketch_cmds:
        drep.thread_cmds(sketch_cmds, logdir=logdir, t=int(threads))

    # Step 2) Combine MASH sketches within chunks
    paste_cmds = [chunk.gen_paste_cmd(mash_exe) for chunk in genome_chunks]
    if live and paste_cmds:
        drep.thread_cmds(paste_cmds, logdir=logdir, t=int(threads))

    # Merge the pasted chunks into one new chunk when multi-round is switched off
    chunk_count = len(genome_chunks)
    if chunk_count > 1 and not multi_round:
        merge_cmd, merged_chunk = drep.d_cluster.utils.merge_genome_chunks(
            mash_exe, genome_chunks, sketch_folder, MASH_folder)
        genome_chunks = [merged_chunk]
        drep.run_cmd(merge_cmd, dry, shell=False, logdir=logdir)

    # Step 3) Run Mash on each chunk
    dist_cmds = [chunk.gen_dist_cmd(mash_exe, MASH_folder, threads) for chunk in genome_chunks]
    total = len(dist_cmds)
    for number, dist_cmd in enumerate(dist_cmds, start=1):
        if dry:
            continue
        if total > 1:
            logging.info(f" Comparing group {number} of {total}")
        drep.run_cmd(dist_cmd, dry, shell=True, logdir=logdir)

    # Step 4) Load the Mash tables of each chunk
    for chunk in genome_chunks:
        chunk.load_mash_table()

    return genome_chunks
def run_checkM(genome_folder, checkm_outf, **kwargs):
    '''
    Run checkM

    WARNING- this will result in wrong genome length and genome N50 estimate, due to it being run on prodigal output

    checkM is run twice: once with the chosen workflow, and once with
    "checkm qa ... -o 2" to produce the table that is loaded and returned.

    Args:
        genome_folder: location of folder to run checkM on - should be full of files ending in .faa (result of prodigal)
        checkm_outf: location of folder to store checkM output. 'lineage.ms',
            'Bacteria.ms' and 'Chdb.tsv' are appended to this string by plain
            concatenation, so it needs to end with a path separator

    Keyword args:
        processors: number of threads (default '6')
        checkM_method: either lineage_wf or taxonomy_wf (default 'lineage_wf')
        debug: log all of the commands (only has an effect together with wd)
        wd: if you want to log commands, you also need the wd
        set_recursion: if not 0, set the python recursion limit to this value;
            accepts an int or a numeric string (default '0')

    Returns:
        DataFrame loaded from <checkm_outf>Chdb.tsv

    Exits via sys.exit() if checkm can't be located, isn't working, or the
    final table can't be loaded
    '''
    # Find checkm exe
    loc, works = drep.d_bonus.find_program('checkm')
    if loc is None:
        logging.error('Cannot locate the program {0}- make sure its in the system path'\
            .format('checkm'))
        sys.exit()
    if works == False:
        logging.error('Program {0} is not working!! Im going to crash now'\
            .format('checkm'))
        sys.exit()
    check_exe = loc

    # Get set up
    t = str(kwargs.get('processors', '6'))
    checkm_method = kwargs.get('checkM_method', 'lineage_wf')

    # Set recursion
    # Compare numerically: an integer 0 is not equal to the string '0', and
    # sys.setrecursionlimit(0) raises ValueError
    R = kwargs.get('set_recursion', '0')
    if int(R) != 0:
        logging.warning('Setting Maximum Recursion depth to {0}'.format(R))
        sys.setrecursionlimit(int(R))

    # Commands are only logged when both wd and debug are given; both checkM
    # calls log to the same place, so look it up once
    if ('wd' in kwargs) and (kwargs.get('debug', False) == True):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Run checkM initial
    results_loc = checkm_outf + '/results.tsv'
    if checkm_method == 'taxonomy_wf':
        cmd = [check_exe, checkm_method, 'domain', 'Bacteria', genome_folder, checkm_outf,
               '-f', results_loc, '--tab_table', '-t', t, '-g', '-x', 'faa']
    else:
        cmd = [check_exe, checkm_method, genome_folder, checkm_outf,
               '-f', results_loc, '--tab_table', '-t', t, '--pplacer_threads', t,
               '-g', '-x', 'faa']

    logging.debug("Running CheckM with command: {0}".format(cmd))
    drep.run_cmd(cmd, shell=False, logdir=logdir)

    # Run checkM again for the better table
    if checkm_method == 'taxonomy_wf':
        lineage = checkm_outf + 'Bacteria.ms'
    else:
        lineage = checkm_outf + 'lineage.ms'
    desired_file = checkm_outf + 'Chdb.tsv'
    cmd = [check_exe, 'qa', lineage, checkm_outf, '-f', desired_file, '-t',
           t, '--tab_table', '-o', '2']

    logging.debug("Running CheckM with command: {0}".format(cmd))
    drep.run_cmd(cmd, shell=False, logdir=logdir)

    # Load table
    try:
        chdb = pd.read_table(desired_file, sep='\t')
    except Exception:
        # Exception (not a bare except) so Ctrl-C is not reported as a checkM failure
        logging.error(
            "!!! checkM failed !!!\nYou can run again with the --debug option to see what went wrong (command logs will be created in the log folder)"
        )
        sys.exit()

    # Return table
    return chdb
def run_checkM(genome_folder_whole, checkm_outf_whole, **kwargs):
    '''
    Run checkM in groups and return one combined table

    WARNING- this will result in wrong genome length and genome N50 estimate, due to it being run on prodigal output

    For every (genome_folder, checkm_outf) pair produced by
    _iterate_checkm_groups, checkM is run twice: once with the chosen
    workflow, and once with "checkm qa ... -o 2" to produce the table that is
    loaded. The per-group tables are concatenated at the end.

    Args:
        genome_folder_whole: location of folder to run checkM on - should be full of files ending in .faa (result of prodigal)
        checkm_outf_whole: location of folder to store checkM output

    Keyword args:
        processors: number of threads (default '6')
        checkM_method: either lineage_wf or taxonomy_wf (default 'lineage_wf')
        checkm_group_size: handed to _iterate_checkm_groups (default 1000)
        debug: log all of the commands (only has an effect together with wd)
        wd: if you want to log commands, you also need the wd
        set_recursion: if not 0, set the python recursion limit to this value;
            accepts an int or a numeric string (default '0')

    Returns:
        DataFrame with the rows of every group's Chdb.tsv, index reset

    Exits via sys.exit() if a group's table can't be loaded
    '''
    # Get set up
    check_exe = _checkm_get_exe()
    t = str(kwargs.get('processors', '6'))
    checkm_method = kwargs.get('checkM_method', 'lineage_wf')
    checkm_group_size = kwargs.get('checkm_group_size', 1000)

    # Set recursion
    # Compare numerically: an integer 0 is not equal to the string '0', and
    # sys.setrecursionlimit(0) raises ValueError
    R = kwargs.get('set_recursion', '0')
    if int(R) != 0:
        logging.warning('Setting Maximum Recursion depth to {0}'.format(R))
        sys.setrecursionlimit(int(R))

    # Commands are only logged when both wd and debug are given; this does
    # not change between groups, so look it up once
    if ('wd' in kwargs) and (kwargs.get('debug', False) == True):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Establish groups
    dbs = []
    for genome_folder, checkm_outf in _iterate_checkm_groups(
            genome_folder_whole, checkm_outf_whole, checkm_group_size):

        # Run checkM initial
        results_loc = checkm_outf + '/results.tsv'
        if checkm_method == 'taxonomy_wf':
            cmd = [check_exe, checkm_method, 'domain', 'Bacteria', genome_folder, checkm_outf,
                   '-f', results_loc, '--tab_table', '-t', t, '-g', '-x', 'faa']
        else:
            cmd = [check_exe, checkm_method, genome_folder, checkm_outf,
                   '-f', results_loc, '--tab_table', '-t', t, '--pplacer_threads', t,
                   '-g', '-x', 'faa']

        logging.debug("Running CheckM with command: {0}".format(' '.join(cmd)))
        drep.run_cmd(cmd, shell=False, logdir=logdir)

        # Run checkM again for the better table
        # NOTE: file names are appended by plain concatenation, so checkm_outf
        # needs to end with a path separator
        if checkm_method == 'taxonomy_wf':
            lineage = checkm_outf + 'Bacteria.ms'
        else:
            lineage = checkm_outf + 'lineage.ms'
        desired_file = checkm_outf + 'Chdb.tsv'
        cmd = [check_exe, 'qa', lineage, checkm_outf, '-f', desired_file, '-t',
               t, '--tab_table', '-o', '2']

        logging.debug("Running CheckM with command: {0}".format(' '.join(cmd)))
        drep.run_cmd(cmd, shell=False, logdir=logdir)

        # Load table
        try:
            chdb = pd.read_table(desired_file, sep='\t')
        except Exception:
            # Exception (not a bare except) so Ctrl-C is not reported as a checkM failure
            logging.error("!!! checkM failed !!!\nSee https://drep.readthedocs.io/en/latest/advanced_use.html#troubleshooting-checkm for help troubleshooting")
            sys.exit()

        dbs.append(chdb)

    # Return table
    return pd.concat(dbs).reset_index(drop=True)
def all_vs_all_MASH(Bdb, data_folder, **kwargs):
    """
    Run MASH pairwise within all samples in Bdb

    Sketches are made per genome, pasted together per chunk, pasted again
    into one ALL.msh, and compared all-vs-all with "mash dist".

    Args:
        Bdb: dataframe with genome, location
        data_folder: location to store output files

    Keyword Args:
        MASH_sketch: size of mash sketches (default 1000)
        dry: dont actually run anything (default False)
        processors: number of processors to multithread with (default 6)
        exe_loc: location of mash executable; takes precedence over mash_exe
        mash_exe: location of mash executable (looked up with drep.get_exe if
            neither exe_loc nor mash_exe is provided)
        groupSize: max number of mash sketches to hold in each folder (default 1000)
        debug: if True, log all of the commands
        wd: if you want to log commands, you also need the wd

    Returns:
        Mdb: dataframe with genome1, genome2, dist, similarity, restricted to
        genomes present in Bdb
    """
    MASH_s = kwargs.get('MASH_sketch', 1000)
    dry = kwargs.get('dry', False)
    p = kwargs.get('processors', 6)
    groupSize = kwargs.get('groupSize', 1000)

    # set up logdir
    if ('wd' in kwargs) and (kwargs.get('debug', False) == True):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Find mash. 'exe_loc' keeps priority so existing callers see no change;
    # the documented 'mash_exe' keyword is honoured as a fallback instead of
    # being read and then discarded.
    mash_exe = kwargs.get('exe_loc', None)
    if mash_exe is None:
        mash_exe = kwargs.get('mash_exe', None)
    if mash_exe is None:
        mash_exe = drep.get_exe('mash')

    # Set up folders
    MASH_folder = os.path.join(data_folder, 'MASH_files/')
    os.makedirs(MASH_folder, exist_ok=True)

    sketch_folder = os.path.join(MASH_folder, 'sketches/')
    os.makedirs(sketch_folder, exist_ok=True)

    # Make chunks
    l2g = Bdb.set_index('location')['genome'].to_dict()
    locations = list(Bdb['location'].unique())
    chunks = [
        locations[x:x + groupSize] for x in range(0, len(locations), groupSize)
    ]

    # Make the MASH sketches
    cmds = []
    chunk_folders = []
    for i, chunk in enumerate(chunks):
        chunk_folder = os.path.join(sketch_folder, "chunk_{0}".format(i))
        chunk_folders.append(chunk_folder)
        os.makedirs(chunk_folder, exist_ok=True)
        for fasta in chunk:
            genome = l2g[fasta]
            sketch_base = os.path.join(chunk_folder, genome)
            # Sketches left over from an earlier run are reused
            if not os.path.isfile(sketch_base + '.msh'):
                cmds.append([
                    mash_exe, 'sketch', fasta, '-s',
                    str(MASH_s), '-o', sketch_base
                ])

    if cmds and not dry:
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches within chunk
    cmds = []
    alls = []
    for chunk_folder in chunk_folders:
        all_file = os.path.join(chunk_folder, 'chunk_all.msh')
        # A chunk_all.msh left over from an earlier run lives in this same
        # folder; keep it out of the inputs so it is not pasted into itself
        sketches = [
            f for f in glob.glob(os.path.join(chunk_folder, '*'))
            if os.path.basename(f) != 'chunk_all.msh'
        ]
        cmds.append([mash_exe, 'paste', all_file] + sketches)
        alls.append(all_file)

    if cmds and not dry:
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches of all chunks
    all_file = os.path.join(MASH_folder, 'ALL.msh')
    cmd = [mash_exe, 'paste', all_file] + alls
    drep.run_cmd(cmd, dry, shell=False, logdir=logdir)

    # Calculate distances (run through the shell because of the redirect)
    table_loc = MASH_folder + 'MASH_table.tsv'
    cmd = [
        mash_exe, 'dist', '-p',
        str(p), all_file, all_file, '>', table_loc
    ]
    cmd = ' '.join(cmd)
    drep.run_cmd(cmd, dry, shell=True, logdir=logdir)

    # Make Mdb based on all genomes in the MASH folder
    iniCols = ['genome1', 'genome2', 'dist', 'p', 'kmers']
    uCols = ['genome1', 'genome2', 'dist']
    dTypes = {'genome1': 'category', 'genome2': 'category', 'dist': np.float32}
    Mdb = pd.read_csv(table_loc,
                      names=iniCols,
                      usecols=uCols,
                      dtype=dTypes,
                      sep='\t')
    Mdb['genome1'] = Mdb['genome1'].apply(_get_genome_name_from_fasta)
    Mdb['genome2'] = Mdb['genome2'].apply(_get_genome_name_from_fasta)
    Mdb['similarity'] = 1 - Mdb['dist']

    # Filter out those genomes that are in the MASH folder but shouldn't be in Mdb
    # (.copy() so the column assignments below act on an independent frame)
    genomes = Bdb['genome'].unique()
    keep = Mdb['genome1'].isin(genomes) & Mdb['genome2'].isin(genomes)
    Mdb = Mdb[keep].copy()

    # Reorder categories to be correct
    for g in ['genome1', 'genome2']:
        Mdb[g] = Mdb[g].cat.remove_unused_categories()
        Mdb[g] = Mdb[g].cat.reorder_categories(sorted((Mdb[g].unique())),
                                               ordered=True)

    return Mdb