def tearDown(self):
    remove_files(set(self.files_to_remove))
    # remove directories last, so we don't get errors
    # trying to remove files which may be in the directories
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def test_plot_heatmap(self):
    plot_heatmap(self.otu_table,
                 self.otu_table.ObservationIds,
                 self.otu_table.SampleIds,
                 filename=self.tmp_heatmap_fpath)
    self.assertTrue(exists(self.tmp_heatmap_fpath))
    remove_files(set([self.tmp_heatmap_fpath]))
def test_mothur_supported_version(self):
    """mothur is in path and version is supported """
    acceptable_version = (1, 25, 0)
    self.assertTrue(which('mothur'),
                    "mothur not found. This may or may not be a problem "
                    "depending on which components of QIIME you plan to use.")
    # mothur creates a log file in cwd, so create a tmp and cd there first
    log_file = join(get_qiime_temp_dir(), 'mothur.log')
    command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
    stdout, stderr, exit_status = qiime_system_call(command)

    # remove log file
    remove_files([log_file], error_on_missing=False)

    version_string = stdout.strip().split(' ')[1].strip('v.')
    try:
        version = tuple(map(int, version_string.split('.')))
        pass_test = version == acceptable_version
    except ValueError:
        pass_test = False
        version_string = stdout
    self.assertTrue(pass_test,
                    "Unsupported mothur version. %s is required, but running %s."
                    % ('.'.join(map(str, acceptable_version)), version_string))
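# Worked example of the parsing in the test above: the grep leaves a banner
# line like "mothur v.1.25.0" on stdout (exact format assumed from the
# '^mothur v' pattern), so:
#
#   "mothur v.1.25.0".strip().split(' ')[1]   # -> 'v.1.25.0'
#   'v.1.25.0'.strip('v.')                    # -> '1.25.0'
#   tuple(map(int, '1.25.0'.split('.')))      # -> (1, 25, 0)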
def tearDown(self):
    if self._files_to_remove:
        remove_files(self._files_to_remove)
    if exists(self.output_dir):
        rmtree(self.output_dir)
    if exists(self.input_dir):
        rmtree(self.input_dir)
def tearDown(self): """ """ disable_timeout() remove_files(self.files_to_remove, error_on_missing=False) # remove directories last, so we don't get errors # trying to remove files which may be in the directories for d in self.dirs_to_remove: if exists(d): rmtree(d)
def tearDown(self): """Removes temporary directories and files.""" remove_files(self.files_to_remove) # Remove directories last, so we don't get errors trying to remove # files which may be in the directories. for d in self.dirs_to_remove: if exists(d): rmtree(d)
def tearDown(self): """Clean up tmp files.""" remove_files(self.files_to_remove, False) if self.tmpdir: rmtree(self.tmpdir) # clean up the file from init_flowgram_file if (hasattr(self, "tmp_filename") and exists(self.tmp_filename)): remove(self.tmp_filename)
def tearDown(self): """ """ disable_timeout() # reset sys.stderr sys.stderr = self.saved_stderr remove_files(self.files_to_remove) # remove directories last, so we don't get errors # trying to remove files which may be in the directories for d in self.dirs_to_remove: if exists(d): rmtree(d)
def tearDown(self):
    disable_timeout()
    # reset sys.stderr
    sys.stderr = self.saved_stderr
    remove_files(self.files_to_remove)
    # remove directories last, so we don't get errors
    # trying to remove files which may be in the directories
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def remove_intermediate_files(self): """Remove all intermediate files.""" # tmp files are written in the current dir, # app controller always jumps into dir specified via exec_dir # Note: blast intermediates are not removed exec_dir = str(self.Parameters['--exec_dir'].Value) inp_file_name = str(self.Parameters['--query_NAST'].Value) exec_dir = exec_dir.rstrip('"') exec_dir = exec_dir.lstrip('"') inp_file_name = inp_file_name.rstrip('"') inp_file_name = inp_file_name.lstrip('"') tmp_suffixes = [".CPS", ".CPS.CPC", ".CPS_RENAST", ".CPS_RENAST.cidx", ".CPS.CPC.wTaxons", ".cidx"] cs_tmp_files = [ exec_dir + '/' + inp_file_name + x for x in tmp_suffixes] remove_files(cs_tmp_files, error_on_missing=False) db_param = self.Parameters['--db_NAST'] if db_param.isOn(): nast_db_name = str(db_param.Value) nast_db_name = nast_db_name.rstrip('"') nast_db_name = nast_db_name.lstrip('"') # Better do not remove this file since other ChimeraSlayer # instances running on the same ref set might use this file # Should be rather deleted in the calling function # remove_files([nast_db_name + ".cidx"], # error_on_missing=False) fasta_param = self.Parameters['--db_FASTA'] if fasta_param.isOn(): fasta_name = str(fasta_param.Value) fasta_name = fasta_name.rstrip('"') fasta_name = fasta_name.lstrip('"') blast_db_files = [ fasta_name + x for x in [ ".nsq", ".nin", ".nhr", ".cidx"]] remove_files(blast_db_files, error_on_missing=False)
def tearDown(self): """Clean up tmp files.""" # turn off the alarm signal.alarm(0) remove_files(self.files_to_remove, False) if self.server_socket: self.server_socket.close() # give clients time to clean up sleep(1) if exists(self.tmp_dir): try: rmdir(self.tmp_dir) except OSError: # give clients some more time, fail if still error sleep(5) rmdir(self.tmp_dir)
def test_remove_files(self):
    """remove_files functions as expected """
    # create list of temp file paths
    test_fds = [NamedTemporaryFile(delete=False) for i in range(5)]
    test_filepaths = [element.name for element in test_fds]

    # should work just fine
    remove_files(test_filepaths)

    # check that an error is raised on trying to remove the files again...
    self.assertRaises(OSError, remove_files, test_filepaths)

    # touch one of the filepaths so it exists
    extra_file = NamedTemporaryFile(delete=False).name
    test_filepaths.append(extra_file)

    # no error is raised on trying to remove the files
    # (although five of them don't exist)...
    remove_files(test_filepaths, error_on_missing=False)
    # ... and the existing file was removed
    self.assertFalse(exists(extra_file))

    # try to remove them again and verify that an OSError is raised
    self.assertRaises(OSError, remove_files, test_filepaths)

    # now get no error when error_on_missing=False
    remove_files(test_filepaths, error_on_missing=False)
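# The test above pins down the remove_files contract: remove every path that
# exists, and either raise OSError naming the missing paths (the default,
# error_on_missing=True) or skip them silently. A minimal sketch of an
# implementation satisfying that contract (hypothetical, not necessarily the
# actual QIIME/PyCogent source):
def remove_files(list_of_filepaths, error_on_missing=True):
    """Remove each filepath in list_of_filepaths."""
    from os import remove
    missing = []
    for fp in list_of_filepaths:
        try:
            remove(fp)
        except OSError:
            # collect paths that could not be removed
            missing.append(fp)
    if error_on_missing and missing:
        raise OSError("Some filepaths were not accessible: %s"
                      % '\t'.join(missing))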
def test_build_blast_db_from_fasta_path_aln(self):
    """build_blast_db_from_fasta_path works with alignment as input """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
    self.assertEqual(blast_db, self.in_aln1_fp)
    expected_db_files = set([blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_compute_seqs_per_file(self):
    """_compute_seqs_per_file functions as expected """
    fd, temp_fasta_fp = mkstemp(prefix='QiimeScriptUtilTests',
                                suffix='.fasta')
    close(fd)
    temp_fasta = ['>seq', 'AAACCCCAAATTGG'] * 25
    with open(temp_fasta_fp, 'w') as f:
        f.write('\n'.join(temp_fasta))

    actual_25 = self.pw._compute_seqs_per_file(temp_fasta_fp, 25)
    actual_2 = self.pw._compute_seqs_per_file(temp_fasta_fp, 2)
    actual_10 = self.pw._compute_seqs_per_file(temp_fasta_fp, 10)
    actual_5 = self.pw._compute_seqs_per_file(temp_fasta_fp, 5)
    actual_40 = self.pw._compute_seqs_per_file(temp_fasta_fp, 40)

    remove_files([temp_fasta_fp])

    self.assertEqual(actual_25, 1)
    self.assertEqual(actual_2, 13)
    self.assertEqual(actual_10, 3)
    self.assertEqual(actual_5, 5)
    self.assertEqual(actual_40, 1)
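# The expected values above are just the ceiling division of the sequence
# count by the requested number of jobs: with 25 sequences, ceil(25/2) = 13,
# ceil(25/10) = 3, ceil(25/5) = 5, and ceil(25/25) = ceil(25/40) = 1. A
# minimal sketch of such a helper (hypothetical, not the QIIME source):
def compute_seqs_per_file(fasta_fp, num_jobs_to_start):
    """Return the number of sequences to write per split file."""
    # count the sequences in the input fasta
    with open(fasta_fp) as f:
        num_seqs = sum(1 for line in f if line.startswith('>'))
    # ceiling division without floats
    return (num_seqs + num_jobs_to_start - 1) // num_jobs_to_start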
def test_build_blast_db_from_seqs(self):
    """build_blast_db_from_seqs convenience function works as expected """
    blast_db, db_files = build_blast_db_from_seqs(self.in_seqs1,
                                                  output_dir='/tmp')
    self.assertTrue(blast_db.startswith('/tmp/Blast_tmp_db'))
    self.assertTrue(blast_db.endswith('.fasta'))
    expected_db_files = set([blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_build_blast_db_from_fasta_path(self):
    """build_blast_db_from_fasta_path convenience function works as expected """
    blast_db, db_files = \
        build_blast_db_from_fasta_path(self.in_seqs1_fp)
    self.assertEqual(blast_db, self.in_seqs1_fp)
    expected_db_files = set([self.in_seqs1_fp + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_build_blast_db_from_fasta_file(self):
    """build_blast_db_from_fasta_file works with open files as input """
    blast_db, db_files = \
        build_blast_db_from_fasta_file(open(self.in_aln1_fp),
                                       output_dir='/tmp/')
    self.assertTrue(blast_db.startswith('/tmp/BLAST_temp_db'))
    self.assertTrue(blast_db.endswith('.fasta'))
    expected_db_files = set([blast_db] +
                            [blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    input_seqs_fp: filepath of input fasta file.
    output_dir: output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates: Suppress retention of .uc and log files.
    suppress_usearch61_ref: Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo: Suppress usearch61 de novo chimera detection.
    split_by_sampleid: Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn: Weight of "no" vote. Increasing this value tends to reduce
     the number of false positives (and also sensitivity).
    usearch61_dn: Pseudo-count prior for "no" votes (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs: Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv: Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a
     percentage, so the default is 0.8%, which allows chimeras that are up
     to 99.2% similar to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61; increasing allows more
     sensitivity at a cost of speed
    threads: Specify number of threads used per core per CPU
    HALT_EXEC: application controller option to halt execution and print
     the command
    """
    # Need to cluster sequences de novo first to get 1) abundance information
    # and 2) a consensus sequence for each cluster. Using dereplication
    # followed by clustering does not appear to automatically update complete
    # cluster size, so we directly cluster the raw seqs with the small_mem
    # clustering option. This means that, without additional parsing steps to
    # recalculate actual cluster sizes, the sizeorder option can't be used
    # for de novo clustering and downstream chimera detection.

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)

    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    log_lines = {'denovo_chimeras': 0,
                 'denovo_non_chimeras': 0,
                 'ref_chimeras': 0,
                 'ref_non_chimeras': 0}

    if split_by_sampleid:
        if verbose:
            print "Splitting fasta according to SampleID..."

        full_seqs = open(input_seqs_fp, "U")
        sep_fastas = split_fasta_on_sample_ids_to_files(
            parse_fasta(full_seqs), output_dir)
        full_seqs.close()

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
                identify_chimeras_usearch61(
                    curr_fasta, output_dir, reference_seqs_fp,
                    suppress_usearch61_intermediates, suppress_usearch61_ref,
                    suppress_usearch61_denovo, non_chimeras_retention,
                    usearch61_minh, usearch61_xn, usearch61_dn,
                    usearch61_mindiffs, usearch61_mindiv,
                    usearch61_abundance_skew, percent_id_usearch61, minlen,
                    word_length, max_accepts, max_rejects, files_to_remove,
                    HALT_EXEC, log_lines, verbose, threads)
            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras
    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
            identify_chimeras_usearch61(
                input_seqs_fp, output_dir, reference_seqs_fp,
                suppress_usearch61_intermediates, suppress_usearch61_ref,
                suppress_usearch61_denovo, non_chimeras_retention,
                usearch61_minh, usearch61_xn, usearch61_dn,
                usearch61_mindiffs, usearch61_mindiv,
                usearch61_abundance_skew, percent_id_usearch61, minlen,
                word_length, max_accepts, max_rejects, files_to_remove,
                HALT_EXEC, log_lines, verbose, threads)

    # write log, non chimeras, chimeras.
    write_usearch61_log(log_fp, input_seqs_fp, output_dir,
                        reference_seqs_fp, suppress_usearch61_intermediates,
                        suppress_usearch61_ref, suppress_usearch61_denovo,
                        split_by_sampleid, non_chimeras_retention,
                        usearch61_minh, usearch61_xn, usearch61_dn,
                        usearch61_mindiffs, usearch61_mindiv,
                        usearch61_abundance_skew, percent_id_usearch61,
                        minlen, word_length, max_accepts, max_rejects,
                        HALT_EXEC, log_lines)

    chimeras_f = open(chimeras_fp, "w")
    non_chimeras_f = open(non_chimeras_fp, "w")
    for curr_chimera in chimeras:
        chimeras_f.write("%s\n" % curr_chimera)
    for curr_non_chimera in non_chimeras:
        non_chimeras_f.write("%s\n" % curr_non_chimera)
    chimeras_f.close()
    non_chimeras_f.close()

    remove_files(files_to_remove)
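# A minimal usage sketch for the convenience function above (the file paths
# are hypothetical): run both de novo and reference chimera detection and
# keep only the intersection of the two methods' non-chimeras, discarding
# the .uc and log intermediates.
usearch61_chimera_check('seqs.fna',
                        output_dir='usearch61_chimera_check/',
                        reference_seqs_fp='gold_reference.fasta',
                        suppress_usearch61_intermediates=True,
                        non_chimeras_retention='intersection')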
def tearDown(self):
    remove_files(self.files_to_remove)
def tearDown(self):
    for dir in self.dirs_to_remove:
        if exists(dir):
            rmdir(dir)
    remove_files(self.files_to_remove)
def tearDown(self):
    remove_files(self.files_to_remove)
    for folder in self.folders_to_remove:
        shutil.rmtree(folder)
def align_and_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)

    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        # Grab the alignment parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
        (pynast_dir, input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
        (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
        (filtered_aln_fp, tree_fp, params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return failures_fp
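# align_and_tree does not run anything itself: it builds `commands`, a list
# of command groups where each group is a list of (description, shell_command)
# pairs, and hands them to `command_handler`. A minimal sketch of a handler
# with a compatible signature (hypothetical; QIIME ships its own serial and
# parallel handlers, so this is illustrative only):
def minimal_command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=True):
    from subprocess import call
    for command_group in commands:
        for description, cmd in command_group:
            # report progress and record the command in the workflow log
            status_update_callback('%s: %s' % (description, cmd))
            logger.write('# %s\n%s\n\n' % (description, cmd))
            if call(cmd, shell=True) != 0:
                logger.write('Command failed: %s\n' % cmd)
    if close_logger_on_success:
        logger.close()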
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple
        inputs and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. we want to
    # pre-filter all data against the input data as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already '
                         'exists. Skipping and moving to next.\n\n'
                         % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback)

        # perform post-iteration file shuffling whether the previous
        # iteration's data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append('%s/otu_table_mc%d.biom'
                             % (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has
    # historically been a frequent failure, so is sometimes run manually
    # in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps,
                                            final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = ('biom add-metadata -i %s '
                                '--observation-metadata-fp %s -o %s '
                                '--sc-separated taxonomy '
                                '--observation-header OTUID,taxonomy'
                                % (tax_input_otu_table_fp, taxonomy_fp,
                                   otu_table_w_tax_fp))
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
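# A minimal invocation sketch for the iterative workflow above. The input
# paths are hypothetical, `params` and `qiime_config` are assumed to have
# been loaded already (e.g., from a parameters file and the qiime_config),
# and `call_commands_serially` stands in for whichever command handler is in
# use. Each input file is processed in turn, with the new reference set from
# iteration i feeding iteration i + 1.
iterative_pick_subsampled_open_reference_otus(
    input_fps=['seqs_run1.fna', 'seqs_run2.fna'],
    refseqs_fp='refseqs.fna',
    output_dir='iterative_otus/',
    percent_subsample=0.001,
    new_ref_set_id='NewRefSet',
    command_handler=call_commands_serially,
    params=params,
    qiime_config=qiime_config)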
def tearDown(self):
    remove_files(self.files_to_remove)
    if self._dirs_to_remove:
        for i in self._dirs_to_remove:
            rmtree(i)
def tearDown(self):
    remove_files(set(self._paths_to_clean_up), error_on_missing=False)
def main():
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose
    check_options(option_parser, options)
    start_time = time()
    option_lines = format_options_as_lines(options)

    if DEBUG:
        print FORMAT_BAR
        print "Running with options:"
        for line in sorted(option_lines):
            print line
        print FORMAT_BAR

    # because the blast app controller uses absolute paths, make sure subject
    # db path is fully specified
    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)

    if not options.no_format_db:
        # initialize object
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, subj_file = split(inpath)
        fdb = FormatDb(WorkingDir=subject_dir)
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')
        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')
        # Set input database
        fdb.Parameters['-i'].on(subject_db)
        formatdb_cmd = fdb.BaseCommand

        if DEBUG:
            print "Formatting db with command: %s" % formatdb_cmd

        app_result = fdb(subject_db)
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time

        if DEBUG:
            print "Formatting subject db took: %2.f seconds" % db_format_time
            print "formatdb log file written to: %s" % app_result['log']
            print FORMAT_BAR
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"

        # Check that user-supplied subjectdb is valid
        db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"]
        formatdb_filepaths = [subject_db + ext for ext in db_ext]

        if DEBUG:
            print "Checking that pre-existing formatdb files exist and can be read."
            print "Files to be checked:"
            for fp in formatdb_filepaths:
                print fp
            print FORMAT_BAR

        try:
            for db_f in formatdb_filepaths:
                open(db_f, "U").close()
        except IOError:
            if DEBUG:
                print "Cannot open user-supplied database file:", db_f
            option_parser.error(
                """Problem with -d and --no_format_db option combination:
Cannot open the following user-supplied database file: %s.
Consider running without --no_format_db to let formatdb
generate these required files""" % db_f)

        if DEBUG:
            print "OK: BLAST Database files exist and can be read."
            print FORMAT_BAR

    # Perform BLAST search
    blast_results, hit_ids, removed_hit_ids = find_homologs(
        options.querydb, subject_db, options.e_value, options.max_hits,
        options.working_dir, options.blastmatroot, options.wordsize,
        options.percent_aligned, DEBUG=DEBUG)

    blast_time = (time() - start_time) - db_format_time

    if DEBUG:
        print "BLAST search took: %2.f minute(s)" % (blast_time / 60.0)
        print FORMAT_BAR

    # Create output folder
    outputdir = options.outputdir
    try:
        makedirs(outputdir)
    except OSError:
        pass

    # Record raw blast results
    raw_blast_results_path = join(outputdir, "raw_blast_results.txt")
    f = open(raw_blast_results_path, 'w')
    f.writelines(blast_results)
    f.close()

    # Record excluded seqs
    excluded_seqs_path = join(outputdir, "matching.fna")
    ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "")

    # Record included (screened) seqs
    included_seqs_path = join(outputdir, "non-matching.fna")
    all_ids = ids_from_fasta_lines(open(options.querydb))
    included_ids = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,
                                      option_lines, formatdb_cmd,
                                      blast_results, options, all_ids,
                                      hit_ids, removed_hit_ids,
                                      included_ids, DEBUG)

    log_path = join(outputdir, "sequence_exclusion.log")
    if DEBUG:
        print "Writing summary to: %s" % log_path

    f = open(log_path, 'w')
    f.writelines(log_lines)
    f.close()

    if not options.no_clean:
        if DEBUG:
            print FORMAT_BAR
            print "| Cleanup |"
            print FORMAT_BAR

        if not options.no_format_db:
            if options.verbose:
                print "Cleaning up formatdb files:", formatdb_filepaths
            remove_files(formatdb_filepaths)
        else:
            if options.verbose:
                print "Formatdb not run...nothing to clean"
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple
        inputs and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. we want to
    # pre-filter all data against the input data as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already '
                         'exists. Skipping and moving to next.\n\n'
                         % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback)

        # perform post-iteration file shuffling whether the previous
        # iteration's data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append('%s/otu_table_mc%d.biom'
                             % (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has
    # historically been a frequent failure, so is sometimes run manually
    # in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps,
                                            final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = ('biom add-metadata -i %s '
                                '--observation-metadata-fp %s -o %s '
                                '--sc-separated taxonomy '
                                '--observation-header OTUID,taxonomy'
                                % (tax_input_otu_table_fp, taxonomy_fp,
                                   otu_table_w_tax_fp))
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp.
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the representative set
            from the previous step as the reference set.
    """
    # for now only allowing uclust and usearch61 for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = '%s/step1_otus' % output_dir
        step1_otu_map_fp = '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    logger.write('# Subsample the failures fasta file using API \n'
                 'python -c "import qiime; qiime.util.subsample_fasta'
                 '(\'%s\', \'%s\', \'%f\')"\n\n'
                 % (abspath(step1_failures_fasta_fp),
                    abspath(step2_input_fasta_fp),
                    percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append(
        [('Pick representative set for subsampled failures',
          step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp, step3_dir, reference_otu_picking_method,
        step2_repset_fasta_fp, parallel, params, logger)
    commands.append([('Pick reference OTUs using de novo rep set',
                      step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    index_links.append(
        ('Final map of OTU identifier to sequence identifiers '
         '(i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp, step3_failures_list_fp,
             step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps; note that we are explicitly using the '>'
        # operator, otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that
        # was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step4_rep_set_cmd)])
    else:
        # Merge the otu maps; note that we are explicitly using the '>'
        # operator, otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that
        # was previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt'
                          % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)
    index_links.append(
        ('Final map of OTU identifier to sequence identifiers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n'
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map'
                 '(\'%s\', \'%s\', \'%d\')"\n\n'
                 % (abspath(otu_fp),
                    abspath(otu_no_singletons_fp),
                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')

    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus '
         'input reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))

    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences '
                 'from step1 to the final rep set file: %s\n\n'
                 % final_repset_fp)

    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n'
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))

    # iterate over all representative sequences from step2 and step4 and
    # write those corresponding to non-singleton otus to the final
    # representative set file and the new reference sequences file.
    for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write('# Write non-singleton otus representative sequences from '
                 'step 2 and step 4 to the final representative set and the '
                 'new reference set (%s and %s respectively)\n\n'
                 % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences'
         % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'including OTU taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'sequences that fail to align with PyNAST and including OTU '
             'taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'including OTU taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'sequences that fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = ('biom add-metadata -i %s '
                                '--observation-metadata-fp %s -o %s '
                                '--sc-separated taxonomy '
                                '--observation-header OTUID,taxonomy'
                                % (tax_input_otu_table_fp, taxonomy_fp,
                                   otu_table_w_tax_fp))
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
def get_clusters_from_fasta_filepath(fasta_filepath,
                                     original_fasta_path,
                                     percent_ID=0.97,
                                     max_accepts=1,
                                     max_rejects=8,
                                     stepwords=8,
                                     word_length=8,
                                     optimal=False,
                                     exact=False,
                                     suppress_sort=False,
                                     output_dir=None,
                                     enable_rev_strand_matching=False,
                                     subject_fasta_filepath=None,
                                     suppress_new_clusters=False,
                                     return_cluster_maps=False,
                                     stable_sort=False,
                                     save_uc_files=True,
                                     HALT_EXEC=False):
    """ Main convenience wrapper for using uclust to generate cluster files

    A source fasta file is required for the fasta_filepath. This will be
    sorted to be in order of longest to shortest length sequences. Following
    this, the sorted fasta file is used to generate a cluster file in the
    uclust (.uc) format. Finally the .uc file is parsed and returned as a
    list of lists, where each sublist is a cluster of sequences. If an
    output_dir is specified, the intermediate files will be preserved,
    otherwise all files created are temporary and will be deleted at the
    end of this function.

    The percent_ID parameter specifies the percent identity for a cluster,
    i.e., if 99% were the parameter, all sequences that were 99% identical
    would be grouped as a cluster.
    """

    # Create readable intermediate filenames if they are to be kept
    fasta_output_filepath = None
    uc_output_filepath = None
    cd_hit_filepath = None

    if output_dir and not output_dir.endswith('/'):
        output_dir += '/'

    if save_uc_files:
        uc_save_filepath = get_output_filepaths(output_dir,
                                                original_fasta_path)
    else:
        uc_save_filepath = None

    sorted_fasta_filepath = ""
    uc_filepath = ""
    clstr_filepath = ""

    # Error check in case any app controller fails
    files_to_remove = []
    try:
        if not suppress_sort:
            # Sort fasta input file from largest to smallest sequence
            sort_fasta = uclust_fasta_sort_from_filepath(
                fasta_filepath, output_filepath=fasta_output_filepath)
            # Get sorted fasta name from application wrapper
            sorted_fasta_filepath = sort_fasta['Output'].name
            files_to_remove.append(sorted_fasta_filepath)
        else:
            sort_fasta = None
            sorted_fasta_filepath = fasta_filepath

        # Generate uclust cluster file (.uc format)
        uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
            sorted_fasta_filepath,
            uc_save_filepath,
            percent_ID=percent_ID,
            max_accepts=max_accepts,
            max_rejects=max_rejects,
            stepwords=stepwords,
            word_length=word_length,
            optimal=optimal,
            exact=exact,
            suppress_sort=suppress_sort,
            enable_rev_strand_matching=enable_rev_strand_matching,
            subject_fasta_filepath=subject_fasta_filepath,
            suppress_new_clusters=suppress_new_clusters,
            stable_sort=stable_sort,
            HALT_EXEC=HALT_EXEC)
        # Clustering succeeded, so remove the sorted fasta tmp file
        remove_files(files_to_remove)
    except ApplicationError:
        remove_files(files_to_remove)
        raise ApplicationError('Error running uclust. Possible causes are '
                               'unsupported version (current supported '
                               'version is v1.2.22) is installed or '
                               'improperly formatted input file was provided')
    except ApplicationNotFoundError:
        remove_files(files_to_remove)
        raise ApplicationNotFoundError('uclust not found, is it properly '
                                       'installed?')

    # Get list of lists for each cluster
    clusters, failures, seeds = \
        clusters_from_uc_file(uclust_cluster['ClusterFile'])

    # Remove temp files unless user specifies output filepath
    if not save_uc_files:
        uclust_cluster.cleanUp()

    if return_cluster_maps:
        return clusters, failures, seeds
    else:
        return clusters.values(), failures, seeds
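# A usage sketch for the wrapper above (input path hypothetical): cluster
# the reads at 97% identity, keep no intermediate .uc files, and get back
# the cluster map plus uclust's failures and seeds.
clusters, failures, seeds = get_clusters_from_fasta_filepath(
    'seqs.fna', 'seqs.fna',
    percent_ID=0.97,
    return_cluster_maps=True,
    save_uc_files=False)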
def test_plot_heatmap(self):
    plot_heatmap(self.otu_table,
                 self.otu_table.observation_ids,
                 self.otu_table.sample_ids,
                 filename=self.tmp_heatmap_fpath)
    self.assertEqual(exists(self.tmp_heatmap_fpath), True)
    remove_files(set([self.tmp_heatmap_fpath]))
def main():
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose
    check_options(option_parser, options)

    start_time = time()
    option_lines = format_options_as_lines(options)

    if DEBUG:
        print FORMAT_BAR
        print "Running with options:"
        for line in sorted(option_lines):
            print line
        print FORMAT_BAR

    # because the blast app controller uses absolute paths, make sure subject
    # db path is fully specified
    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)

    if not options.no_format_db:
        # initialize object
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, subj_file = split(inpath)
        fdb = FormatDb(WorkingDir=subject_dir)
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')
        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')
        # Set input database
        fdb.Parameters['-i'].on(subject_db)
        formatdb_cmd = fdb.BaseCommand

        if DEBUG:
            print "Formatting db with command: %s" % formatdb_cmd

        app_result = fdb(subject_db)
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time
        if DEBUG:
            print "Formatting subject db took: %2.f seconds" % db_format_time
            print "formatdb log file written to: %s" % app_result['log']
            print FORMAT_BAR
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"

        # Check that the user-supplied subjectdb is valid
        db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"]
        formatdb_filepaths = [subject_db + ext for ext in db_ext]

        if DEBUG:
            print "Checking that pre-existing formatdb files exist and can be read."
            print "Files to be checked:"
            for fp in formatdb_filepaths:
                print fp
            print FORMAT_BAR

        try:
            formatdb_files = [open(db_f, "U") for db_f in formatdb_filepaths]
            [f.close() for f in formatdb_files]
        except IOError:
            if DEBUG:
                print "Cannot open user-supplied database file:", db_f
            option_parser.error(
                """Problem with -d and --no_format_db option combination:
                Cannot open the following user-supplied database file: %s.
                Consider running without --no_format_db to let formatdb
                generate these required files""" % db_f)

        if DEBUG:
            print "OK: BLAST Database files exist and can be read."
            print FORMAT_BAR

    # Perform BLAST search
    blast_results, hit_ids, removed_hit_ids = find_homologs(
        options.querydb, subject_db, options.e_value, options.max_hits,
        options.working_dir, options.blastmatroot, options.wordsize,
        options.percent_aligned, DEBUG=DEBUG)

    blast_time = (time() - start_time) - db_format_time
    if DEBUG:
        print "BLAST search took: %2.f minute(s)" % (blast_time / 60.0)
        print FORMAT_BAR

    # Create output folder
    outputdir = options.outputdir
    try:
        makedirs(outputdir)
    except OSError:
        pass

    # Record raw blast results
    raw_blast_results_path = join(outputdir, "raw_blast_results.txt")
    f = open(raw_blast_results_path, 'w')
    f.writelines(blast_results)
    f.close()

    # Record excluded seqs
    excluded_seqs_path = join(outputdir, "matching.fna")
    ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "")

    # Record included (screened) seqs
    included_seqs_path = join(outputdir, "non-matching.fna")
    all_ids = ids_from_fasta_lines(open(options.querydb))
    included_ids = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,
                                      option_lines, formatdb_cmd,
                                      blast_results, options, all_ids,
                                      hit_ids, removed_hit_ids, included_ids,
                                      DEBUG)

    log_path = join(outputdir, "sequence_exclusion.log")
    if DEBUG:
        print "Writing summary to: %s" % log_path

    f = open(log_path, 'w')
    f.writelines(log_lines)
    f.close()

    if not options.no_clean:
        if DEBUG:
            print FORMAT_BAR
            print "| Cleanup |"
            print FORMAT_BAR

        if not options.no_format_db:
            if options.verbose:
                print "Cleaning up formatdb files:", formatdb_filepaths
            remove_files(formatdb_filepaths)
        else:
            if options.verbose:
                print "Formatdb not run...nothing to clean"
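The included/excluded split in main() is plain set arithmetic over fasta identifiers. A minimal sketch of the same idea, assuming 'query.fna' is a hypothetical query fasta and the hit IDs came back from the BLAST step; ids_from_fasta is a hypothetical stand-in for ids_from_fasta_lines:

    # A minimal sketch of the screening split; 'query.fna' and the hit IDs
    # are hypothetical, and ids_from_fasta stands in for ids_from_fasta_lines.
    def ids_from_fasta(lines):
        # collect the identifier token from each '>' header line
        return [line[1:].split()[0] for line in lines if line.startswith('>')]

    all_ids = set(ids_from_fasta(open('query.fna')))
    hit_ids = set(['seq1', 'seq7'])        # hypothetical BLAST hits
    included_ids = all_ids - hit_ids       # sequences that passed the screen
    print "excluded %d, kept %d" % (len(hit_ids), len(included_ids))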
def tearDown(self):
    if self._files_to_remove:
        remove_files(self._files_to_remove)
    if isdir('/tmp/truncate_fasta_qual_test/'):
        rmtree('/tmp/truncate_fasta_qual_test/')
def cleanUp(self):
    """ Remove temporary blast database files, if applicable """
    remove_files(self._db_files_to_remove, error_on_missing=False)
def tearDown(self):
    """ """
    remove_files(self.files_to_remove)
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of QIIME

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp.
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the representative set
            from step 4 as the reference set.
    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,
                       [input_fp,
                        refseqs_fp,
                        step1_otu_map_fp,
                        step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = '%s/step1_otus' % output_dir
        step1_otu_map_fp = '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)
    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')"\n\n' %
                 (abspath(step1_failures_fasta_fp),
                  abspath(step2_input_fasta_fp),
                  percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                    step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp,
                                    parallel,
                                    params,
                                    logger)
    commands.append([('Pick reference OTUs using de novo rep set',
                      step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    index_links.append(
        ('Final map of OTU identifier to sequence identifiers '
         '(i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp, step3_failures_list_fp,
             step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps; note that we are explicitly using the '>'
        # operator, otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that was
        # previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps; note that we are explicitly using the '>'
        # operator, otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that was
        # previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)
    index_links.append(
        ('Final map of OTU identifier to sequence identifiers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' %
                 (abspath(otu_fp),
                  abspath(otu_no_singletons_fp),
                  min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus '
         'input reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n'
                 % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and
    # write those corresponding to non-singleton otus to the final
    # representative set file and the new reference sequences file.
    for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write('# Write non-singleton otus representative sequences from ' +
                 'step 2 and step 4 to the final representative set and the '
                 'new reference set (%s and %s respectively)\n\n'
                 % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences'
         % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'including OTU taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'sequences that fail to align with PyNAST and including OTU '
             'taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'including OTU taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'sequences that fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf,
                negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
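A hedged invocation sketch for the workflow above, assuming a QIIME 1.x install where call_commands_serially and print_to_stdout are the stock command handler and status callback; every path, the params dict, and the reference set ID below are hypothetical placeholders:

    # A minimal invocation sketch, assuming a QIIME 1.x environment;
    # all paths and the params dict are hypothetical.
    from qiime.util import load_qiime_config
    from qiime.workflow.util import call_commands_serially, print_to_stdout

    pick_subsampled_open_reference_otus(
        input_fp='seqs.fna',               # demultiplexed input sequences
        refseqs_fp='refseqs.fna',          # reference OTU collection
        output_dir='or_otus/',
        percent_subsample=0.001,           # fraction of failures to subsample
        new_ref_set_id='NewRef0',          # prefix for de novo OTU identifiers
        command_handler=call_commands_serially,
        params={},                         # per-script parameter overrides
        qiime_config=load_qiime_config(),
        status_update_callback=print_to_stdout)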
def test_plot_heatmap(self):
    plot_heatmap(self.otu_table,
                 self.otu_table.ids(axis='observation'),
                 self.otu_table.ids(),
                 filename=self.tmp_heatmap_fpath)
    self.assertEqual(exists(self.tmp_heatmap_fpath), True)
    remove_files(set([self.tmp_heatmap_fpath]))
def get_chimeras_from_Nast_aligned(seqs_fp,
                                   ref_db_aligned_fp=None,
                                   ref_db_fasta_fp=None,
                                   HALT_EXEC=False,
                                   min_div_ratio=None,
                                   keep_intermediates=False):
    """remove chimeras from seqs_fp using ChimeraSlayer.

    seqs_fp: a filepath with the seqs to check in the file

    ref_db_aligned_fp: fp to (pynast) aligned reference sequences

    ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed
        on the fly if not provided.

    HALT_EXEC: stop execution if true

    min_div_ratio: passed to the ChimeraSlayer app
    """
    files_to_remove = []

    # might come in as FilePath object with quotes
    seqs_fp = str(seqs_fp)
    seqs_fp = seqs_fp.rstrip('"')
    seqs_fp = seqs_fp.lstrip('"')

    seqs_dir, new_seqs_fp = split(seqs_fp)

    # if fp is in current dir, we fake a dir change
    if seqs_dir == "":
        seqs_dir = "./"

    # ChimeraSlayer puts some temp files in the current dir and some in the
    # dir of the input file; use exec_dir to change to the dir of the input
    # file, so all tmp files end up in one place
    params = {'--query_NAST': new_seqs_fp,
              '--exec_dir': seqs_dir}

    if ref_db_aligned_fp is None and ref_db_fasta_fp is None:
        # use default db, whose relative position to the
        # ChimeraSlayer binary is hardcoded
        pass
    else:
        if not ref_db_fasta_fp:
            # make degapped reference file
            ref_db_fasta_fp = write_degapped_fasta_to_file(
                parse_fasta(open(ref_db_aligned_fp)))
            files_to_remove.append(ref_db_fasta_fp)
        # use user db
        params.update({'--db_NAST': abspath(ref_db_aligned_fp),
                       '--db_FASTA': abspath(ref_db_fasta_fp)})

    if min_div_ratio is not None:
        params.update({'-R': min_div_ratio})

    app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
    app_results = app()

    # this is a FilePath object in case of success.
    # How can we test for failure here?
    # if not exists(app_results['CPS']):
    #     raise ApplicationError, "ChimeraSlayer failed. No output file."

    chimeras = parse_CPS_file(app_results['CPS'])
    if not keep_intermediates:
        app.remove_intermediate_files()
        remove_files(files_to_remove)

    return chimeras
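A hedged usage sketch for the chimera check above. It assumes ChimeraSlayer is installed; both file names are hypothetical, and the shape of each entry in the returned list is an assumption (the function simply returns whatever parse_CPS_file yields):

    # A minimal usage sketch, assuming ChimeraSlayer is on the PATH;
    # 'aligned_seqs.fna' and 'ref_aligned.fasta' are hypothetical
    # PyNAST-aligned inputs.
    chimeras = get_chimeras_from_Nast_aligned(
        'aligned_seqs.fna',                     # aligned query sequences
        ref_db_aligned_fp='ref_aligned.fasta')  # aligned reference db; the
                                                # degapped fasta is built on
                                                # the fly and cleaned up

    # assumed per-entry shape: (sequence id, parent info from the CPS file)
    for entry in chimeras:
        print "chimeric candidate:", entry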