def tearDown(self):
    remove_files(set(self.files_to_remove))
    # remove directories last, so we don't get errors
    # trying to remove files which may be in the directories
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def test_mothur_supported_version(self):
    """mothur is in path and version is supported """
    acceptable_version = (1, 25, 0)
    self.assertTrue(
        which("mothur"),
        "mothur not found. This may or may not be a problem depending on " +
        "which components of QIIME you plan to use.")
    # mothur creates a log file in cwd, so create a tmp and cd there first
    log_file = join(get_qiime_temp_dir(), "mothur.log")
    command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
    stdout, stderr, exit_status = qiime_system_call(command)
    # remove log file
    remove_files([log_file], error_on_missing=False)
    version_string = stdout.strip().split(" ")[1].strip("v.")
    try:
        version = tuple(map(int, version_string.split(".")))
        pass_test = version == acceptable_version
    except ValueError:
        pass_test = False
        version_string = stdout
    self.assertTrue(
        pass_test,
        "Unsupported mothur version. %s is required, but running %s."
        % (".".join(map(str, acceptable_version)), version_string))
def tearDown(self):
    if self._files_to_remove:
        remove_files(self._files_to_remove)
    if exists(self.output_dir):
        rmtree(self.output_dir)
    if exists(self.input_dir):
        rmtree(self.input_dir)
def test_plot_heatmap(self):
    plot_heatmap(self.otu_table,
                 self.otu_table.ids(axis='observation'),
                 self.otu_table.ids(),
                 filename=self.tmp_heatmap_fpath)
    self.assertEqual(exists(self.tmp_heatmap_fpath), True)
    remove_files(set([self.tmp_heatmap_fpath]))
def tearDown(self): """Removes temporary directories and files.""" remove_files(self.files_to_remove) # Remove directories last, so we don't get errors trying to remove # files which may be in the directories. for d in self.dirs_to_remove: if exists(d): rmtree(d)
def tearDown(self): """ """ disable_timeout() remove_files(self.files_to_remove, error_on_missing=False) # remove directories last, so we don't get errors # trying to remove files which may be in the directories for d in self.dirs_to_remove: if exists(d): rmtree(d)
def tearDown(self): """Clean up tmp files.""" remove_files(self.files_to_remove, False) if self.tmpdir: rmtree(self.tmpdir) # clean up the file from init_flowgram_file if (hasattr(self, "tmp_filename") and exists(self.tmp_filename)): remove(self.tmp_filename)
def tearDown(self): """remove all the files after completing tests """ self.mapping_fp.close() self.fasta_file_no_consensus.close() self.fasta_file_for_consensus_tie_G_C.close() self.fasta_file_for_consensus_unequal_length.close() remove_files([self.mapping_fp_name, self.fasta_file_no_consensus_name, self.fasta_file_for_consensus_tie_G_C_name, self.fasta_file_for_consensus_unequal_length_name, self.fwd_read_fh_name, self.rev_read_fh_name])
def swarm_denovo_cluster(seq_path, d=1, threads=1, HALT_EXEC=False):
    """ Function  : launch the Swarm de novo OTU picker

        Parameters: seq_path, filepath to reads
                    d, resolution
                    threads, number of threads to use

        Return    : clusters, list of lists
    """
    # Check sequence file exists
    if not exists(seq_path):
        raise ValueError("%s does not exist" % seq_path)

    # Instantiate the object
    swarm = Swarm(HALT_EXEC=HALT_EXEC)

    # Set the resolution
    if d > 0:
        swarm.Parameters['-d'].on(d)
    else:
        raise ValueError("Resolution -d must be a positive integer.")

    # Set the number of threads
    if threads > 0:
        swarm.Parameters['-t'].on(threads)
    else:
        raise ValueError("Number of threads must be a positive integer.")

    # create temporary file for Swarm OTU-map
    f, tmp_swarm_otumap = mkstemp(prefix='temp_otumap_',
                                  suffix='.swarm')
    close(f)

    swarm.Parameters['-o'].on(tmp_swarm_otumap)

    # Remove this file later, the final OTU-map
    # is output by swarm_breaker.py and returned
    # as a list of lists (clusters)
    swarm.files_to_remove.append(tmp_swarm_otumap)

    # Launch Swarm
    # set the data string to include the read filepath
    # (to be passed as final arguments in the swarm command)
    clusters = swarm(seq_path)

    remove_files(swarm.files_to_remove, error_on_missing=False)

    # Return clusters
    return clusters
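# Hedged usage sketch for swarm_denovo_cluster (not part of the original
# module): the FASTA path and thread count below are assumptions chosen for
# illustration only.
def _example_swarm_usage(seq_path='seqs.fna'):
    # cluster the reads at the default Swarm resolution using 4 threads
    clusters = swarm_denovo_cluster(seq_path, d=1, threads=4)
    # each element of `clusters` is a list of sequence identifiers
    for otu_index, seq_ids in enumerate(clusters):
        print("OTU %d: %d reads" % (otu_index, len(seq_ids)))
    return clusters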
def tearDown(self): """ """ disable_timeout() # reset sys.stderr sys.stderr = self.saved_stderr remove_files(self.files_to_remove) # remove directories last, so we don't get errors # trying to remove files which may be in the directories for d in self.dirs_to_remove: if exists(d): rmtree(d)
def remove_intermediate_files(self):
    """Remove all intermediate files."""

    # tmp files are written in the current dir,
    # app controller always jumps into dir specified via exec_dir
    # Note: blast intermediates are not removed
    exec_dir = str(self.Parameters['--exec_dir'].Value)
    inp_file_name = str(self.Parameters['--query_NAST'].Value)

    exec_dir = exec_dir.rstrip('"')
    exec_dir = exec_dir.lstrip('"')

    inp_file_name = inp_file_name.rstrip('"')
    inp_file_name = inp_file_name.lstrip('"')

    tmp_suffixes = [".CPS", ".CPS.CPC", ".CPS_RENAST", ".CPS_RENAST.cidx",
                    ".CPS.CPC.wTaxons", ".cidx"]
    cs_tmp_files = [
        exec_dir + '/' + inp_file_name + x for x in tmp_suffixes]
    remove_files(cs_tmp_files, error_on_missing=False)

    db_param = self.Parameters['--db_NAST']
    if db_param.isOn():
        nast_db_name = str(db_param.Value)
        nast_db_name = nast_db_name.rstrip('"')
        nast_db_name = nast_db_name.lstrip('"')

        # Better not to remove this file, since other ChimeraSlayer
        # instances running on the same ref set might use this file.
        # It should rather be deleted in the calling function.
        # remove_files([nast_db_name + ".cidx"],
        #              error_on_missing=False)

    fasta_param = self.Parameters['--db_FASTA']
    if fasta_param.isOn():
        fasta_name = str(fasta_param.Value)
        fasta_name = fasta_name.rstrip('"')
        fasta_name = fasta_name.lstrip('"')

        blast_db_files = [
            fasta_name + x for x in [".nsq", ".nin", ".nhr", ".cidx"]]
        remove_files(blast_db_files, error_on_missing=False)
def test_seq_path(self):
    """ Swarm should raise a ValueError if the sequences filepath
        does not exist
    """
    f, tmp_file = mkstemp(prefix='temp_reads_', suffix='.fasta')
    close(f)
    remove_files([tmp_file])

    self.assertRaises(ValueError,
                      swarm_denovo_cluster,
                      seq_path=tmp_file,
                      d=1,
                      threads=1)
def tearDown(self): """Clean up tmp files.""" # turn off the alarm signal.alarm(0) remove_files(self.files_to_remove, False) if self.server_socket: self.server_socket.close() # give clients time to clean up sleep(1) if exists(self.tmp_dir): try: rmdir(self.tmp_dir) except OSError: # give clients some more time, fail if still error sleep(5) rmdir(self.tmp_dir)
def test_remove_files(self):
    # create list of temp file paths
    test_fds = [NamedTemporaryFile(delete=False) for i in range(5)]
    test_filepaths = [element.name for element in test_fds]

    # should work just fine
    remove_files(test_filepaths)

    # check that an error is raised on trying to remove the files again...
    self.assertRaises(OSError, remove_files, test_filepaths)

    # touch one of the filepaths so it exists
    extra_file = NamedTemporaryFile(delete=False).name
    test_filepaths.append(extra_file)

    # no error is raised on trying to remove the files
    # (although 5 don't exist)...
    remove_files(test_filepaths, error_on_missing=False)
    # ... and the existing file was removed
    self.assertFalse(exists(extra_file))

    # try to remove them with remove_files and verify that an OSError is
    # raised
    self.assertRaises(OSError, remove_files, test_filepaths)

    # now get no error when error_on_missing=False
    remove_files(test_filepaths, error_on_missing=False)
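# Illustrative sketch (not the shipped implementation) of the semantics the
# test above exercises: every existing path is unlinked, and missing paths
# raise an OSError unless error_on_missing=False.
def _remove_files_sketch(filepaths, error_on_missing=True):
    from os import remove as _remove
    from os.path import exists as _exists
    missing = [fp for fp in filepaths if not _exists(fp)]
    for fp in filepaths:
        if _exists(fp):
            _remove(fp)
    if missing and error_on_missing:
        raise OSError("Some filepaths were not accessible: %s"
                      % ', '.join(missing))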
def test_compute_seqs_per_file(self):
    """compute_seqs_per_file functions as expected
    """
    fd, temp_fasta_fp = mkstemp(prefix='QiimeScriptUtilTests',
                                suffix='.fasta')
    close(fd)
    temp_fasta = ['>seq', 'AAACCCCAAATTGG'] * 25
    open(temp_fasta_fp, 'w').write('\n'.join(temp_fasta))

    actual_25 = self.pw._compute_seqs_per_file(temp_fasta_fp, 25)
    actual_2 = self.pw._compute_seqs_per_file(temp_fasta_fp, 2)
    actual_10 = self.pw._compute_seqs_per_file(temp_fasta_fp, 10)
    actual_5 = self.pw._compute_seqs_per_file(temp_fasta_fp, 5)
    actual_40 = self.pw._compute_seqs_per_file(temp_fasta_fp, 40)

    remove_files([temp_fasta_fp])

    self.assertEqual(actual_25, 1)
    self.assertEqual(actual_2, 13)
    self.assertEqual(actual_10, 3)
    self.assertEqual(actual_5, 5)
    self.assertEqual(actual_40, 1)
def test_build_blast_db_from_fasta_path_aln(self):
    """build_blast_db_from_fasta_path works with alignment as input
    """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
    self.assertEqual(blast_db, self.in_aln1_fp)
    expected_db_files = set([blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """
    Attempts to select true barcodes from set of barcodes
    i.e. removes barcodes that might be artifacts
    due to sequencing errors.
    Uses uclust to remove barcodes that are more similar than the threshold.

    Parameters
    ----------
    rand_bcs: list
    unique_threshold: float

    Returns
    ----------
    unique_rand_bcs: set
        set of unique random barcodes.
    """
    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(dir=temp_dir,
                                            prefix='tmp', suffix='.fas')
    rand_bcs = set(rand_bcs)

    with open(fasta_tempfile_name, 'w') as fasta_tempfile:
        for rand_bc in rand_bcs:
            fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))

    _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
        fasta_tempfile_name,
        original_fasta_path=None,
        percent_ID=unique_threshold,
        save_uc_files=False,
        output_dir=temp_dir)

    unique_rand_bcs = set(unique_rand_bcs)
    remove_files([fasta_tempfile_name])
    return unique_rand_bcs
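# Hedged usage sketch for select_unique_rand_bcs (not part of the original
# module): the barcode list and the 0.86 identity threshold are assumptions
# chosen for illustration only.
def _example_select_unique_rand_bcs():
    rand_bcs = ['ACGTACGT', 'ACGTACGA', 'TTTTGGGG']
    # barcodes that cluster together at or above the threshold are collapsed
    unique_bcs = select_unique_rand_bcs(rand_bcs, unique_threshold=0.86)
    return unique_bcs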
def test_build_blast_db_from_fasta_file(self):
    """build_blast_db_from_fasta_file works with open files as input
    """
    blast_db, db_files = \
        build_blast_db_from_fasta_file(open(self.in_aln1_fp),
                                       output_dir='/tmp/')
    self.assertTrue(blast_db.startswith('/tmp/BLAST_temp_db'))
    self.assertTrue(blast_db.endswith('.fasta'))
    expected_db_files = set([blast_db] +
                            [blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_build_blast_db_from_fasta_path(self):
    """build_blast_db_from_fasta_path convenience function works as expected
    """
    blast_db, db_files = \
        build_blast_db_from_fasta_path(self.in_seqs1_fp)
    self.assertEqual(blast_db, self.in_seqs1_fp)
    expected_db_files = set([self.in_seqs1_fp + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_build_blast_db_from_seqs(self):
    """build_blast_db_from_seqs convenience function works as expected
    """
    blast_db, db_files = build_blast_db_from_seqs(self.in_seqs1,
                                                  output_dir='/tmp')
    self.assertTrue(blast_db.startswith('/tmp/Blast_tmp_db'))
    self.assertTrue(blast_db.endswith('.fasta'))
    expected_db_files = set([blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def get_chimeras_from_Nast_aligned(seqs_fp, ref_db_aligned_fp=None,
                                   ref_db_fasta_fp=None,
                                   HALT_EXEC=False, min_div_ratio=None,
                                   keep_intermediates=False):
    """remove chimeras from seqs_fp using chimeraSlayer.

    seqs_fp: a filepath with the seqs to check in the file

    ref_db_aligned_fp: fp to (pynast) aligned reference sequences

    ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed
                     on the fly if not provided.

    HALT_EXEC: stop execution if true

    min_div_ratio: passed to ChimeraSlayer App
    """
    files_to_remove = []
    # might come in as FilePath object with quotes
    seqs_fp = str(seqs_fp)
    seqs_fp = seqs_fp.rstrip('"')
    seqs_fp = seqs_fp.lstrip('"')

    seqs_dir, new_seqs_fp = split(seqs_fp)

    # if fp is in current dir, we fake a dir change
    if seqs_dir == "":
        seqs_dir = "./"

    # Chimera Slayer puts some temp files in the current dir and some in the
    # dir of the input file; use exec_dir to change to the dir of the input
    # file, so all tmp files end up in one place
    params = {'--query_NAST': new_seqs_fp,
              '--exec_dir': seqs_dir}

    if ref_db_aligned_fp is None and ref_db_fasta_fp is None:
        # use default db, whose relative position to the
        # ChimeraSlayer binary is hardcoded
        pass
    else:
        if not ref_db_fasta_fp:
            # make degapped reference file
            ref_db_fasta_fp = write_degapped_fasta_to_file(parse_fasta(
                open(ref_db_aligned_fp)))
            files_to_remove.append(ref_db_fasta_fp)
        # use user db
        params.update({'--db_NAST': abspath(ref_db_aligned_fp),
                       '--db_FASTA': abspath(ref_db_fasta_fp)})

    if min_div_ratio is not None:
        params.update({'-R': min_div_ratio})

    app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
    app_results = app()

    # this is a FilePath object in case of success.
    # How can we test for failure here?
    # if not exists(app_results['CPS']):
    #     raise ApplicationError, "ChimeraSlayer failed. No output file."

    chimeras = parse_CPS_file((app_results['CPS']))
    if not keep_intermediates:
        app.remove_intermediate_files()
        remove_files(files_to_remove)

    return chimeras
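# Hedged usage sketch: screening a NAST-aligned FASTA against a custom
# aligned reference set.  The filepaths below are assumptions for
# illustration only.
def _example_chimera_check():
    chimeras = get_chimeras_from_Nast_aligned(
        'rep_set_aligned.fasta',
        ref_db_aligned_fp='core_set_aligned.fasta')
    # each entry describes one putative chimera reported by ChimeraSlayer
    return chimeras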
def tearDown(self):
    remove_files(self.files_to_remove)
    for folder in self.folders_to_remove:
        shutil.rmtree(folder)
def get_consensus_seqs_lookup(random_bc_lookup,
                              random_bc_reads,
                              random_bcs,
                              min_difference_in_bcs,
                              min_reads_per_random_bc,
                              output_dir,
                              min_difference_in_clusters,
                              max_cluster_ratio,
                              min_consensus):
    """
    Generates LEA-seq consensus sequence
    For each sample id, for each random barcode, a consensus sequence is
    created according to the LEA-seq algorithm.

    Parameters
    ----------
    random_bc_lookup: defaultdict
        contains sample ID -> random barcode -> list of seqs
    random_bc_reads: defaultdict
        contains sample ID -> random barcode -> number of reads
    random_bcs: list
        list of random barcodes
    min_difference_in_bcs: float
        threshold for selecting unique barcodes
    min_reads_per_random_bc: int
        minimum number of reads per random bc, for it not to be discarded
    output_dir: dirpath
        output directory path
    min_difference_in_clusters: float
        percent identity threshold for cluster formation
    max_cluster_ratio: float
        cluster_ratio below which you need to find the consensus sequence

    Returns
    ----------
    consensus_seq_lookup: defaultdict
        contains sample ID -> random barcode -> consensus_seq
    """
    consensus_seq_lookup = defaultdict(lambda: defaultdict(str))
    # defaultdict that stores the LEA-seq consensus sequence:
    # for each sample id, for each random barcode,
    # the consensus sequence is stored

    random_bc_keep = {}
    # to keep the random bcs that are selected
    # during the pruning step (select_unique_rand_bcs)

    for sample_id in random_bc_lookup:
        random_bc_keep[sample_id] = select_unique_rand_bcs(
            random_bcs[sample_id],
            min_difference_in_bcs)
        # removes barcodes that might be artifacts
        # due to sequencing error

        for random_bc in random_bc_lookup[sample_id]:
            if random_bc in random_bc_keep[sample_id] and random_bc_reads[
                    sample_id][random_bc] >= min_reads_per_random_bc:
                fwd_fd, fwd_fasta_tempfile_name = mkstemp(
                    dir=output_dir, prefix='fwd', suffix='.fas')
                rev_fd, rev_fasta_tempfile_name = mkstemp(
                    dir=output_dir, prefix='rev', suffix='.fas')
                close(fwd_fd)
                close(rev_fd)

                # create fasta files for all fwd and rev seqs
                # for that sample id and random bc.
                fwd_fasta_tempfile = open(fwd_fasta_tempfile_name, 'w')
                rev_fasta_tempfile = open(rev_fasta_tempfile_name, 'w')
                max_freq = 0
                for seq_index, fwd_rev in enumerate(
                        random_bc_lookup[sample_id][random_bc]):
                    fwd_seq, rev_seq = fwd_rev
                    fwd_line = ">{}{}|{}\n{}\n".format(
                        seq_index, random_bc,
                        random_bc_lookup[sample_id][random_bc][fwd_rev],
                        fwd_seq)
                    rev_line = ">{}{}|{}\n{}\n".format(
                        seq_index, random_bc,
                        random_bc_lookup[sample_id][random_bc][fwd_rev],
                        rev_seq)
                    fwd_fasta_tempfile.write(fwd_line)
                    rev_fasta_tempfile.write(rev_line)
                    if random_bc_lookup[sample_id][
                            random_bc][fwd_rev] > max_freq:
                        max_freq = random_bc_lookup[
                            sample_id][random_bc][fwd_rev]
                        majority_seq = fwd_seq + "^" + rev_seq
                        # select majority sequence for the sample_id,
                        # and for that particular random_bc

                fwd_fasta_tempfile.close()
                rev_fasta_tempfile.close()
                fwd_cluster_ratio = get_cluster_ratio(
                    fwd_fasta_tempfile_name,
                    min_difference_in_clusters)
                rev_cluster_ratio = get_cluster_ratio(
                    rev_fasta_tempfile_name,
                    min_difference_in_clusters)

                # If the cluster ratio exists, and
                # if it is below the threshold (max_cluster_ratio),
                # set the consensus seq as the majority seq;
                # otherwise call the get_consensus function
                if fwd_cluster_ratio == 0 or rev_cluster_ratio == 0:
                    consensus_seq = "No consensus"
                elif (fwd_cluster_ratio > max_cluster_ratio and
                        rev_cluster_ratio > max_cluster_ratio):
                    consensus_seq = majority_seq
                else:
                    fwd_fasta_tempfile = open(fwd_fasta_tempfile_name, 'r')
                    rev_fasta_tempfile = open(rev_fasta_tempfile_name, 'r')
                    fwd_consensus = get_consensus(fwd_fasta_tempfile,
                                                  min_consensus)
                    rev_consensus = get_consensus(rev_fasta_tempfile,
                                                  min_consensus)
                    fwd_fasta_tempfile.close()
                    rev_fasta_tempfile.close()
                    consensus_seq = fwd_consensus + "^" + rev_consensus

                consensus_seq_lookup[sample_id][random_bc] = consensus_seq
                files_to_be_removed = list()
                files_to_be_removed.append(fwd_fasta_tempfile_name)
                files_to_be_removed.append(rev_fasta_tempfile_name)
                remove_files(files_to_be_removed)

    # return the entire defaultdict 'consensus_seq_lookup',
    # which has the consensus sequence for each sample id
    # and for each random barcode.
    return consensus_seq_lookup
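# Hedged usage sketch: building the nested defaultdicts that
# get_consensus_seqs_lookup expects.  The sample, barcode, read pair, and
# threshold values below are assumptions for illustration only.
def _example_consensus_lookup(output_dir='/tmp'):
    random_bc_lookup = defaultdict(
        lambda: defaultdict(lambda: defaultdict(int)))
    random_bc_reads = defaultdict(lambda: defaultdict(int))
    random_bcs = defaultdict(list)

    sample_id, bc = 'Sample1', 'ACGTACGT'
    # three identical read pairs tagged with the same random barcode
    random_bc_lookup[sample_id][bc][('AAAT', 'TTTA')] += 3
    random_bc_reads[sample_id][bc] += 3
    random_bcs[sample_id].append(bc)

    return get_consensus_seqs_lookup(
        random_bc_lookup, random_bc_reads, random_bcs,
        min_difference_in_bcs=0.86, min_reads_per_random_bc=1,
        output_dir=output_dir, min_difference_in_clusters=0.98,
        max_cluster_ratio=2.5, min_consensus=6.6)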
def get_cluster_ratio(fasta_seqs, min_difference_in_clusters):
    """
    Uses uclust to calculate cluster ratio
    cluster_ratio = num_of_seq_in_cluster_with_max_seq /
                    num_of_seq_in_cluster_with_second_highest_seq

    Parameters
    ----------
    fasta_seqs: list
        list of fasta sequences
    min_difference_in_clusters: float
        percent identity threshold for cluster formation

    Returns
    ----------
    cluster_ratio: float
        cluster ratio of the sequences using uclust
        cluster_ratio = num_of_seq_in_cluster_with_max_seq /
                        num_of_seq_in_cluster_with_second_highest_seq
    """
    cluster_percent_id = min_difference_in_clusters
    temp_dir = get_qiime_temp_dir()
    fd_uc, uclust_tempfile_name = mkstemp(dir=temp_dir, suffix='.uc')
    close(fd_uc)
    fd_fas, fasta_tempfile_name = mkstemp(dir=temp_dir, suffix='.uc')
    close(fd_fas)
    with open(fasta_tempfile_name, 'w') as fasta_tempfile:
        fasta_tempfile.write(fasta_seqs)
    count = 0
    command = "uclust --usersort --input {} --uc {} --id 0.98".format(
        fasta_tempfile_name, uclust_tempfile_name)
    # In the function, I am calling uclust a large number of times.
    # Initially I was using from bfillings.get_clusters_from_fasta_filepath
    # but due to issue (biocore/bfillingss#31), I have temporarily
    # reverted to qiime_system_call.
    count_lookup = defaultdict(int)

    qiime_system_call(command)
    uclust_tempfile = open(uclust_tempfile_name, 'r')
    for line in uclust_tempfile:
        if search(r'^C', line):
            pieces = line.split('\t')
            count_lookup[pieces[1]] += int(pieces[2])
            count += 1
    uclust_tempfile.close()
    files_to_be_removed = list()
    files_to_be_removed.append(uclust_tempfile_name)
    remove_files(files_to_be_removed)

    sorted_counts_in_clusters = sorted(
        count_lookup.iteritems(),
        key=lambda x: x[1], reverse=True)
    try:
        max_cluster_count = \
            float(str(sorted_counts_in_clusters[0][1]))
        second_cluster_count = \
            float(str(sorted_counts_in_clusters[1][1]))
        return max_cluster_count / second_cluster_count
    except IndexError:
        return 1
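# Hedged usage sketch: the fasta_seqs argument is written verbatim to a
# temporary file, so a string of FASTA-formatted text is passed here.  The
# toy sequences and threshold are assumptions for illustration only.
def _example_cluster_ratio():
    fasta_seqs = ">r1\nACGTACGTACGT\n>r2\nACGTACGTACGT\n>r3\nTTTTTTTTTTTT\n"
    # ratio of the largest cluster size to the second largest;
    # returns 1 when fewer than two clusters are formed
    return get_cluster_ratio(fasta_seqs, min_difference_in_clusters=0.98)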
def tearDown(self):
    remove_files(self.files_to_remove)
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout,
                                        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
            representative set from step 4 as the reference set.
    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref',
                                             'sortmerna']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(input_fp,
                                                 step1_dir,
                                                 reference_otu_picking_method,
                                                 refseqs_fp,
                                                 parallel,
                                                 params,
                                                 logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as \
            step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # if the number of failure sequences is greater than the threshold,
    # continue to steps 2, 3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
            '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')\n\n"' % (
                         abspath(step1_failures_fasta_fp),
                         abspath(step2_input_fasta_fp),
                         percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                        step3_dir,
                                        reference_otu_picking_method,
                                        step2_repset_fasta_fp,
                                        parallel,
                                        params,
                                        logger)
        commands.append([
            ('Pick reference OTUs using de novo rep set', step3_cmd)])

    index_links.append(
        ('Final map of OTU identifier to sequence identifiers '
         '(i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp, step3_failures_list_fp,
                 step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script interface
        # would append the newly created maps to the map that was previously
        # created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script interface
        # would append the newly created maps to the map that was previously
        # created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt'
                          % (failures_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifiers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (
                     abspath(otu_fp),
                     abspath(otu_no_singletons_fp),
                     min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus '
         'input reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n'
                 % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and
    # write those corresponding to non-singleton otus to the final
    # representative set file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 2 and step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n'
                     % (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n'
                     % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences'
         % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'including OTU taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'sequences that fail to align with PyNAST and including OTU '
             'taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'including OTU taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'sequences that fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
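# Hedged usage sketch for the workflow above.  The serial command handler,
# empty params dict, and default subsample fraction shown here are
# assumptions for illustration; the pick_open_reference_otus.py script is
# the supported entry point.
def _example_open_reference_run(seqs_fp, refseqs_fp, out_dir):
    from qiime.util import load_qiime_config
    from qiime.workflow.util import call_commands_serially, no_status_updates

    pick_subsampled_open_reference_otus(
        input_fp=seqs_fp,
        refseqs_fp=refseqs_fp,
        output_dir=out_dir,
        percent_subsample=0.001,
        new_ref_set_id='New',
        command_handler=call_commands_serially,
        params={},
        qiime_config=load_qiime_config(),
        status_update_callback=no_status_updates)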
def align_and_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):

    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)

    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
        (pynast_dir, input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
        (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
        (filtered_aln_fp, tree_fp, params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return failures_fp
def cleanUp(self):
    """ Remove temporary blast database files, if applicable
    """
    remove_files(self._db_files_to_remove, error_on_missing=False)
def tearDown(self):
    remove_files(self.files_to_remove, error_on_missing=False)
def pick_subsampled_open_reference_otus(input_fp, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, run_assign_tax=True, run_align_and_tree=True, prefilter_percent_id=None, min_otu_size=2, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, suppress_index_page=False, denovo_otu_picking_method='uclust', reference_otu_picking_method='uclust_ref', status_update_callback=print_to_stdout, minimum_failure_threshold=100000): """ Run the data preparation steps of Qiime The steps performed by this function are: - Pick reference OTUs against refseqs_fp - Subsample the failures to n sequences. - Pick OTUs de novo on the n failures. - Pick representative sequences for the resulting OTUs. - Pick reference OTUs on all failures using the representative set from step 4 as the reference set. """ # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust'] allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref', 'sortmerna'] assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\ "Unknown de novo OTU picking method: %s. Known methods are: %s"\ % (denovo_otu_picking_method, ','.join(allowed_denovo_otu_picking_methods)) assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\ "Unknown reference OTU picking method: %s. Known methods are: %s"\ % (reference_otu_picking_method, ','.join(allowed_reference_otu_picking_methods)) # Prepare some variables for the later steps index_links = [] input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) create_dir(output_dir) commands = [] if logger is None: log_fp = generate_log_fp(output_dir) logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config) close_logger_on_success = True index_links.append( ('Run summary data', log_fp, _index_headers['run_summary'])) else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp]) # if the user has not passed a different reference collection for the pre-filter, # used the main refseqs_fp. 
this is useful if the user wants to provide a smaller # reference collection, or to use the input reference collection when running in # iterative mode (rather than an iteration's new refseqs) if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp # Step 1: Closed-reference OTU picking on the input file (if not already # complete) if step1_otu_map_fp and step1_failures_fasta_fp: step1_dir = '%s/step1_otus' % output_dir create_dir(step1_dir) logger.write("Using pre-existing reference otu map and failures.\n\n") else: if prefilter_percent_id is not None: prefilter_dir = '%s/prefilter_otus/' % output_dir prefilter_failures_list_fp = '%s/%s_failures.txt' % \ (prefilter_dir, input_basename) prefilter_pick_otu_cmd = pick_reference_otus( input_fp, prefilter_dir, reference_otu_picking_method, prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id) commands.append( [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)]) prefiltered_input_fp = '%s/prefiltered_%s%s' %\ (prefilter_dir, input_basename, input_ext) filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\ (input_fp, prefiltered_input_fp, prefilter_failures_list_fp) commands.append( [('Filter prefilter failures from input', filter_fasta_cmd)]) index_links.append( ('Pre-filtered sequence identifiers ' '(failed to hit reference at %1.1f%% identity)' % (float(prefilter_percent_id)*100), prefilter_failures_list_fp, _index_headers['sequences'])) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] input_fp = prefiltered_input_fp input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) if getsize(prefiltered_input_fp) == 0: raise ValueError( "All sequences were discarded by the prefilter. " "Are the input sequences in the same orientation " "in your input file and reference file (you can " "add 'pick_otus:enable_rev_strand_match True' to " "your parameters file if not)? 
Are you using the " "correct reference file?") # Build the OTU picking command step1_dir = \ '%s/step1_otus' % output_dir step1_otu_map_fp = \ '%s/%s_otus.txt' % (step1_dir, input_basename) step1_pick_otu_cmd = pick_reference_otus( input_fp, step1_dir, reference_otu_picking_method, refseqs_fp, parallel, params, logger) commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)]) # Build the failures fasta file step1_failures_list_fp = '%s/%s_failures.txt' % \ (step1_dir, input_basename) step1_failures_fasta_fp = \ '%s/failures.fasta' % step1_dir step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (input_fp, step1_failures_list_fp, step1_failures_fasta_fp) commands.append([('Generate full failures fasta file', step1_filter_fasta_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] step1_repset_fasta_fp = \ '%s/step1_rep_set.fna' % step1_dir step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step1_otu_map_fp, step1_repset_fasta_fp, input_fp) commands.append([('Pick rep set', step1_pick_rep_set_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # name the final otu map merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir # count number of sequences in step 1 failures fasta file with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f: num_failure_seqs, mean, std = count_seqs_from_file(step1_failures_fasta_f) # number of failures sequences is greater than the threshold, # continue to step 2,3 and 4 run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold if run_step_2_and_3: # Subsample the failures fasta file to retain (roughly) the # percent_subsample step2_dir = '%s/step2_otus/' % output_dir create_dir(step2_dir) step2_input_fasta_fp = \ '%s/subsampled_failures.fasta' % step2_dir subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp, percent_subsample) logger.write('# Subsample the failures fasta file using API \n' + 'python -c "import qiime; qiime.util.subsample_fasta' + '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp), abspath( step2_input_fasta_fp), percent_subsample)) # Prep the OTU picking command for the subsampled failures step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir, new_ref_set_id, denovo_otu_picking_method, params, logger) step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir commands.append([('Pick de novo OTUs for new clusters', step2_cmd)]) # Prep the rep set picking command for the subsampled failures step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp) commands.append( [('Pick representative set for subsampled failures', step2_rep_set_cmd)]) step3_dir = '%s/step3_otus/' % output_dir step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir # remove the indexed reference database from the dictionary of # parameters as it must be forced to build a new database # using the step2_repset_fasta_fp if reference_otu_picking_method == 'sortmerna': if 'sortmerna_db' in params['pick_otus']: del params['pick_otus']['sortmerna_db'] step3_cmd = pick_reference_otus( step1_failures_fasta_fp, step3_dir, reference_otu_picking_method, step2_repset_fasta_fp, parallel, 
params, logger) commands.append([ ('Pick reference OTUs using de novo rep set', step3_cmd)]) index_links.append( ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")', merged_otu_map_fp, _index_headers['otu_maps'])) if not suppress_step4: step4_dir = '%s/step4_otus/' % output_dir if run_step_2_and_3: step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (step1_failures_fasta_fp, step3_failures_list_fp, step3_failures_fasta_fp) commands.append([('Create fasta file of step3 failures', step3_filter_fasta_cmd)]) failures_fp = step3_failures_fasta_fp failures_otus_fp = 'failures_failures_otus.txt' failures_step = 'step3' else: failures_fp = step1_failures_fasta_fp failures_otus_fp = 'failures_otus.txt' failures_step = 'step1' step3_otu_map_fp = "" step4_cmd = pick_denovo_otus(failures_fp, step4_dir, '.'.join([new_ref_set_id, 'CleanUp']), denovo_otu_picking_method, params, logger) step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp) commands.append([('Pick de novo OTUs on %s failures' % failures_step, step4_cmd)]) # Merge the otu maps, note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created cat_otu_tables_cmd = 'cat %s %s %s > %s' %\ (step1_otu_map_fp, step3_otu_map_fp, step4_otu_map_fp, merged_otu_map_fp) commands.append([('Merge OTU maps', cat_otu_tables_cmd)]) step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp) commands.append( [('Pick representative set for subsampled failures', step4_rep_set_cmd)]) else: # Merge the otu maps, note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created if run_step_2_and_3: failures_fp = step3_failures_list_fp else: failures_fp = step1_failures_list_fp step3_otu_map_fp = "" cat_otu_tables_cmd = 'cat %s %s > %s' %\ (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp) commands.append([('Merge OTU maps', cat_otu_tables_cmd)]) # Move the step 3 failures file to the top-level directory commands.append([('Move final failures file to top-level directory', 'mv %s %s/final_failures.txt' % (failures_fp, output_dir))]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] otu_fp = merged_otu_map_fp # Filter singletons from the otu map otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir, min_otu_size) otus_to_keep = filter_otus_from_otu_map( otu_fp, otu_no_singletons_fp, min_otu_size) index_links.append(('Final map of OTU identifier to sequence identifers excluding ' 'OTUs with fewer than %d sequences' % min_otu_size, otu_no_singletons_fp, _index_headers['otu_maps'])) logger.write('# Filter singletons from the otu map using API \n' + 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' + '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp), abspath( otu_no_singletons_fp), min_otu_size)) # make the final representative seqs file and a new refseqs file that # could be used in subsequent otu picking runs. # this is clunky. first, we need to do this without singletons to match # the otu map without singletons. next, there is a difference in what # we need the reference set to be and what we need the repseqs to be. 
# the reference set needs to be a superset of the input reference set # to this set. the repset needs to be only the sequences that were observed # in this data set, and we want reps for the step1 reference otus to be # reads from this run so we don't hit issues building a tree using # sequences of very different lengths. so... final_repset_fp = '%s/rep_set.fna' % output_dir index_links.append( ('OTU representative sequences', final_repset_fp, _index_headers['sequences'])) final_repset_f = open(final_repset_fp, 'w') new_refseqs_fp = '%s/new_refseqs.fna' % output_dir index_links.append( ('New reference sequences (i.e., OTU representative sequences plus input ' 'reference sequences)', new_refseqs_fp, _index_headers['sequences'])) # write non-singleton otus representative sequences from step1 to the # final rep set file for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) logger.write('# Write non-singleton otus representative sequences ' + 'from step1 to the final rep set file: %s\n\n' % final_repset_fp) # copy the full input refseqs file to the new refseqs_fp copyfile(refseqs_fp, new_refseqs_fp) new_refseqs_f = open(new_refseqs_fp, 'a') new_refseqs_f.write('\n') logger.write('# Copy the full input refseqs file to the new refseq file\n' + 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp)) # iterate over all representative sequences from step2 and step4 and write # those corresponding to non-singleton otus to the final representative set # file and the new reference sequences file. if run_step_2_and_3: for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq)) final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) if not suppress_step4: for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq)) final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) new_refseqs_f.close() final_repset_f.close() # steps 1-4 executed if run_step_2_and_3: logger.write('# Write non-singleton otus representative sequences from ' + 'step 2 and step 4 to the final representative set and the new reference' + ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp)) # only steps 1 and 4 executed else: logger.write('# Write non-singleton otus representative sequences from ' + 'step 4 to the final representative set and the new reference' + ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp)) # Prep the make_otu_table.py command otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size) make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\ (otu_no_singletons_fp, otu_table_fp) commands.append([("Make the otu table", make_otu_table_cmd)]) index_links.append( ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size, otu_table_fp, _index_headers['otu_tables'])) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # initialize output file names - these differ based on what combination of # taxonomy assignment and alignment/tree building is happening. 
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",
                              add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
% pynast_failure_filtered_otu_table_fp) else: # remove files from partially completed runs remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False) pynast_failures_fp = align_and_tree( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Build OTU table without PyNAST failures table = load_table(align_and_tree_input_otu_table) filtered_otu_table = filter_otus_from_otu_table(table, get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')), 0, inf, 0, inf, negate_ids_to_keep=True) write_biom_table(filtered_otu_table, pynast_failure_filtered_otu_table_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if close_logger_on_success: logger.close() if not suppress_index_page: index_fp = '%s/index.html' % output_dir generate_index_page(index_links, index_fp)
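# --- Illustrative sketch (not part of the original workflow code) ---
# The helper below shows how the final outputs written above (e.g. the
# min-abundance-filtered BIOM table otu_table_mc<N>.biom) could be sanity
# checked once the workflow finishes. It reuses load_table, which this module
# already uses; the helper's name and its report format are assumptions added
# for illustration only, not part of the QIIME API.
def _example_inspect_final_outputs(output_dir, min_otu_size=2):
    """Hypothetical helper: report the dimensions of the final OTU table."""
    final_table = load_table('%s/otu_table_mc%d.biom' % (output_dir,
                                                         min_otu_size))
    # biom Table.shape is (n_observations, n_samples)
    n_otus, n_samples = final_table.shape
    return '%d OTUs x %d samples' % (n_otus, n_samples)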
def iterative_pick_subsampled_open_reference_otus( input_fps, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, prefilter_percent_id=None, min_otu_size=2, run_assign_tax=True, run_align_and_tree=True, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, denovo_otu_picking_method='uclust', reference_otu_picking_method='uclust_ref', status_update_callback=print_to_stdout, minimum_failure_threshold=100000): """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs and handle processing of the results. """ create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False # if the user has not passed a different reference collection for the pre-filter, # used the input refseqs_fp for all iterations. we want to pre-filter all data against # the input data as lower percent identity searches with uclust can be slow, so we # want the reference collection to stay at a reasonable size. if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp otu_table_fps = [] repset_fasta_fps = [] for i, input_fp in enumerate(input_fps): iteration_output_dir = '%s/%d/' % (output_dir, i) if iteration_output_exists(iteration_output_dir, min_otu_size): # if the output from an iteration already exists, skip that # iteration (useful for continuing failed runs) log_input_md5s(logger, [input_fp, refseqs_fp]) logger.write('Iteration %d (input file: %s) output data already exists. ' 'Skipping and moving to next.\n\n' % (i, input_fp)) else: pick_subsampled_open_reference_otus(input_fp=input_fp, refseqs_fp=refseqs_fp, output_dir=iteration_output_dir, percent_subsample=percent_subsample, new_ref_set_id='.'.join( [new_ref_set_id, str(i)]), command_handler=command_handler, params=params, qiime_config=qiime_config, run_assign_tax=False, run_align_and_tree=False, prefilter_refseqs_fp=prefilter_refseqs_fp, prefilter_percent_id=prefilter_percent_id, min_otu_size=min_otu_size, step1_otu_map_fp=step1_otu_map_fp, step1_failures_fasta_fp=step1_failures_fasta_fp, parallel=parallel, suppress_step4=suppress_step4, logger=logger, suppress_md5=suppress_md5, suppress_index_page=True, denovo_otu_picking_method=denovo_otu_picking_method, reference_otu_picking_method=reference_otu_picking_method, status_update_callback=status_update_callback, minimum_failure_threshold=minimum_failure_threshold) # perform post-iteration file shuffling whether the previous iteration's # data previously existed or was just computed. # step1 otu map and failures can only be used for the first iteration # as subsequent iterations need to use updated refseqs files step1_otu_map_fp = step1_failures_fasta_fp = None new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir refseqs_fp = new_refseqs_fp otu_table_fps.append( '%s/otu_table_mc%d.biom' % (iteration_output_dir, min_otu_size)) repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir) # Merge OTU tables - check for existence first as this step has historically # been a frequent failure, so is sometimes run manually in failed runs. 
otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size) if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0): merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\ (','.join(otu_table_fps), otu_table_fp) commands.append([("Merge OTU tables", merge_cmd)]) # Build master rep set final_repset_fp = '%s/rep_set.fna' % output_dir final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # initialize output file names - these differ based on what combination of # taxonomy assignment and alignment/tree building is happening. if run_assign_tax and run_align_and_tree: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) align_and_tree_input_otu_table = otu_table_w_tax_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size) elif run_assign_tax: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) elif run_align_and_tree: align_and_tree_input_otu_table = otu_table_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir, min_otu_size) if run_assign_tax: if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0: logger.write( "Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp) else: # remove files from partially completed runs remove_files([otu_table_w_tax_fp], error_on_missing=False) taxonomy_fp = assign_tax( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Add taxa to otu table add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\ (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp) commands.append([("Add taxa to OTU table", add_metadata_cmd)]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if run_align_and_tree: if exists(pynast_failure_filtered_otu_table_fp) and\ getsize(pynast_failure_filtered_otu_table_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." % pynast_failure_filtered_otu_table_fp) else: # remove files from partially completed runs remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False) pynast_failures_fp = align_and_tree( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Build OTU table without PyNAST failures table = load_table(align_and_tree_input_otu_table) filtered_otu_table = filter_otus_from_otu_table(table, get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')), 0, inf, 0, inf, negate_ids_to_keep=True) write_biom_table(filtered_otu_table, pynast_failure_filtered_otu_table_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] logger.close()
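# --- Illustrative sketch (not part of the original workflow code) ---
# A minimal, hypothetical invocation of the iterative workflow defined above.
# All file paths are placeholders, and params / qiime_config / command_handler
# are assumed to have been constructed elsewhere (for example from a QIIME
# parameters file and the standard serial command handler); only the required
# arguments from the signature above are passed, with assumed example values
# for percent_subsample and new_ref_set_id.
def _example_iterative_run(params, qiime_config, command_handler):
    """Hypothetical driver showing the required arguments only."""
    iterative_pick_subsampled_open_reference_otus(
        input_fps=['/path/to/seqs1.fna', '/path/to/seqs2.fna'],  # placeholder inputs
        refseqs_fp='/path/to/refseqs.fna',                       # placeholder reference set
        output_dir='/path/to/otus/',                             # placeholder output dir
        percent_subsample=0.001,                                 # assumed subsampling fraction
        new_ref_set_id='NewRef',                                 # assumed prefix for new OTU ids
        command_handler=command_handler,
        params=params,
        qiime_config=qiime_config)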
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    input_seqs_fp: filepath of input fasta file.
    output_dir: output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates: Suppress retention of .uc and log files.
    suppress_usearch61_ref: Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo: Suppress usearch61 de novo chimera detection.
    split_by_sampleid: Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera. Increasing
     this value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_xn: Weight of "no" vote. Increasing this value tends to reduce
     the number of false positives (and also sensitivity).
    usearch61_dn: Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs: Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv: Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2%
     similar to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    threads: Specify number of threads used per core per CPU
    HALT_EXEC: application controller option to halt execution and print
     command
    """

    """ Need to cluster sequences de novo first to get 1. abundance
    information and 2. consensus sequence for each cluster. Using
    dereplication followed by clustering does not appear to automatically
    update complete cluster size, will directly cluster raw seqs with the
    small_mem clustering option.
This means without additional parsing steps to recalculate actual cluster sizes, the sizeorder option can't be used for de novo clustering and downstream chimera detection.""" files_to_remove = [] # Get absolute paths to avoid issues with calling usearch input_seqs_fp = abspath(input_seqs_fp) output_dir = abspath(output_dir) if reference_seqs_fp: reference_seqs_fp = abspath(reference_seqs_fp) log_fp = join(output_dir, "identify_chimeric_seqs.log") chimeras_fp = join(output_dir, "chimeras.txt") non_chimeras_fp = join(output_dir, "non_chimeras.txt") non_chimeras = [] chimeras = [] log_lines = {'denovo_chimeras': 0, 'denovo_non_chimeras': 0, 'ref_chimeras': 0, 'ref_non_chimeras': 0} if split_by_sampleid: if verbose: print "Splitting fasta according to SampleID..." with open(input_seqs_fp, 'U') as full_seqs: sep_fastas =split_sequence_file_on_sample_ids_to_files( full_seqs, 'fasta', output_dir) if suppress_usearch61_intermediates: files_to_remove += sep_fastas for curr_fasta in sep_fastas: curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\ identify_chimeras_usearch61(curr_fasta, output_dir, reference_seqs_fp, suppress_usearch61_intermediates, suppress_usearch61_ref, suppress_usearch61_denovo, non_chimeras_retention, usearch61_minh, usearch61_xn, usearch61_dn, usearch61_mindiffs, usearch61_mindiv, usearch61_abundance_skew, percent_id_usearch61, minlen, word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC, log_lines, verbose, threads) chimeras += curr_chimeras non_chimeras += curr_non_chimeras else: chimeras, non_chimeras, files_to_remove, log_lines =\ identify_chimeras_usearch61(input_seqs_fp, output_dir, reference_seqs_fp, suppress_usearch61_intermediates, suppress_usearch61_ref, suppress_usearch61_denovo, non_chimeras_retention, usearch61_minh, usearch61_xn, usearch61_dn, usearch61_mindiffs, usearch61_mindiv, usearch61_abundance_skew, percent_id_usearch61, minlen, word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC, log_lines, verbose, threads) # write log, non chimeras, chimeras. write_usearch61_log(log_fp, input_seqs_fp, output_dir, reference_seqs_fp, suppress_usearch61_intermediates, suppress_usearch61_ref, suppress_usearch61_denovo, split_by_sampleid, non_chimeras_retention, usearch61_minh, usearch61_xn, usearch61_dn, usearch61_mindiffs, usearch61_mindiv, usearch61_abundance_skew, percent_id_usearch61, minlen, word_length, max_accepts, max_rejects, HALT_EXEC, log_lines) chimeras_f = open(chimeras_fp, "w") non_chimeras_f = open(non_chimeras_fp, "w") for curr_chimera in chimeras: chimeras_f.write("%s\n" % curr_chimera) for curr_non_chimera in non_chimeras: non_chimeras_f.write("%s\n" % curr_non_chimera) chimeras_f.close() non_chimeras_f.close() remove_files(files_to_remove)
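# --- Illustrative sketch (not part of the original code) ---
# Hypothetical call to the convenience function defined above, running
# combined de novo and reference-based chimera detection with the documented
# defaults. All paths are placeholders.
def _example_usearch61_chimera_check():
    """Hypothetical driver for usearch61_chimera_check."""
    usearch61_chimera_check(
        input_seqs_fp='/path/to/seqs.fna',            # placeholder input fasta
        output_dir='/path/to/usearch61_chimeras/',    # placeholder output directory
        reference_seqs_fp='/path/to/reference.fna',   # enables reference-based detection
        split_by_sampleid=False,
        non_chimeras_retention='union')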
def tearDown(self): if self._files_to_remove: remove_files(self._files_to_remove) if isdir('/tmp/truncate_fasta_qual_test/'): rmtree('/tmp/truncate_fasta_qual_test/')
def tearDown(self): remove_files(self.files_to_remove) if self._dirs_to_remove: for i in self._dirs_to_remove: rmtree(i)
def tearDown(self): remove_files(self.files_to_remove) rmtree(self.working_dir)
def tearDown(self): """cleanup temporary files and dirs """ remove_files(set(self.files_to_remove), error_on_missing=False)
def tearDown(self): for dir in self.dirs_to_remove: if exists(dir): rmdir(dir) remove_files(self.files_to_remove)
def remove_artifacts_seqs(seqs_fp, ref_fp, output_fp, ref_db_fp=None, negate=False, threads=1): """Remove artifacts from FASTA file using SortMeRNA. Parameters ---------- seqs_fp: string file path to FASTA input sequence file ref_fp: tuple file path(s) to FASTA database file output_fp: string file path to store output results ref_db_fp: string or tuple, optional file path(s) to indexed FASTA database negate: boolean, optional if True, discard all input sequences aligning to reference database threads: integer, optional number of threads to use for SortMeRNA """ working_dir = join(dirname(output_fp), "working_dir") if not exists(working_dir): makedirs(working_dir) aligned_seq_ids = set() files_to_remove = [] for i, db in enumerate(ref_fp): # create working directory for each # reference database db_dir_base = splitext(basename(db))[0] db_dir = join(working_dir, db_dir_base) if not exists(db_dir): makedirs(db_dir) if ref_db_fp: sortmerna_db = ref_db_fp[i] else: # build index sortmerna_db, files_to_remove = \ build_database_sortmerna( fasta_path=db, max_pos=10000, output_dir=db_dir) # run SortMeRNA app_result = sortmerna_map(seq_path=seqs_fp, output_dir=db_dir, refseqs_fp=db, sortmerna_db=sortmerna_db, threads=threads, best=1) # Print SortMeRNA errors stderr_fp = app_result['StdErr'].name if stat(stderr_fp).st_size != 0: with open(stderr_fp, 'U') as stderr_f: for line in stderr_f: print line raise ValueError("Could not run SortMeRNA.") for line in app_result['BlastAlignments']: line = line.strip().split('\t') if line[1] == '*': continue else: aligned_seq_ids.add(line[0]) # remove indexed database files remove_files(files_to_remove, error_on_missing=False) if negate: def op(x): return x not in aligned_seq_ids else: def op(x): return x in aligned_seq_ids # if negate = False, only output sequences # matching to at least one of the databases with open(seqs_fp, 'U') as seqs_f: with open(output_fp, 'w') as out_f: for label, seq in parse_fasta(seqs_f): label = label.split()[0] if op(label): out_f.write(">%s\n%s\n" % (label, seq))
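# --- Illustrative sketch (not part of the original code) ---
# Hypothetical call to the artifact filter defined above: screen a FASTA file
# against one SortMeRNA reference database and keep only the sequences that
# align to it. Paths are placeholders; note that ref_fp must be an iterable of
# database paths, even for a single database.
def _example_remove_artifacts():
    """Hypothetical driver for remove_artifacts_seqs."""
    remove_artifacts_seqs(
        seqs_fp='/path/to/seqs.fna',                  # placeholder input fasta
        ref_fp=('/path/to/reference_db.fna',),        # tuple with one reference database
        output_fp='/path/to/seqs.no_artifacts.fna',   # placeholder output path
        negate=False,                                 # keep sequences matching the reference
        threads=1)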
def tearDown(self): remove_files(self.files_to_remove) rmtree(self.root_dir)