def pick_reference_otus(input_fp,
                        output_dir,
                        otu_picking_method,
                        refseqs_fp,
                        parallel,
                        params,
                        logger,
                        similarity_override=None):
    """Build the command line for a reference-based OTU picking run.

    input_fp: path passed to the OTU picker via -i
    output_dir: path passed to the OTU picker via -o
    otu_picking_method: name of the OTU picking method; used to select the
     parallel script name, or passed via -m to pick_otus.py
    refseqs_fp: path passed to the OTU picker via -r
    parallel: if True and otu_picking_method is 'uclust_ref' or
     'sortmerna', the method-specific parallel script is used; otherwise
     the command is built around pick_otus.py
    params: mapping of section name (e.g. 'pick_otus', 'parallel') to a
     mapping of option name to value; it is deep-copied, so the caller's
     object is never modified
    logger: object with a write(str) method that receives status messages
    similarity_override: if not None, replaces (or sets) the
     pick_otus:similarity value for this command

    Returns the command as a single string.

    Raises WorkflowError if params contains pick_otus:refseqs_fp, since the
    reference sequences may only be supplied through refseqs_fp.
    """
    # Work on a private copy: the sections are edited below.
    params_copy = deepcopy(params)
    if 'pick_otus' in params_copy and 'refseqs_fp' in params_copy['pick_otus']:
        raise WorkflowError(
            "Cannot pass pick_otus:refseqs_fp in parameters file. This can only be"
            " passed on the command line or through the API.")

    if similarity_override is not None:
        logger.write('Similiarity of %1.3f being used for pre-filtering.\n' %
                     similarity_override)
        if 'pick_otus' in params_copy:
            params_copy['pick_otus']['similarity'] = str(similarity_override)
        else:
            params_copy['pick_otus'] = {'similarity': str(similarity_override)}

    if parallel and otu_picking_method in ('uclust_ref', 'sortmerna'):
        # Grab the parallel-specific parameters
        parallel_params_str = _get_section_params_str(params_copy, 'parallel')

        # Grab the OTU picker parameters. A missing 'pick_otus' section is
        # tolerated here and simply contributes no options (previously the
        # section was indexed unguarded right after a try/except that was
        # meant to tolerate its absence, so this case raised KeyError).
        try:
            pick_otus_params = params_copy['pick_otus']
        except KeyError:
            pick_otus_params_str = ''
        else:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            if 'otu_picking_method' in pick_otus_params:
                del pick_otus_params['otu_picking_method']
            pick_otus_params_str = get_params_str(pick_otus_params)

        params_str = '%s %s' % (parallel_params_str, pick_otus_params_str)
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' % (
            otu_picking_script, input_fp, output_dir, refseqs_fp, params_str)
    else:
        params_str = _get_section_params_str(params_copy, 'pick_otus')
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write(
            "Forcing --suppress_new_clusters as this is reference-based OTU picking.\n\n"
        )
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' % (
            input_fp, output_dir, refseqs_fp, otu_picking_method, params_str)

    return pick_otus_cmd


def _get_section_params_str(params, section):
    """Return get_params_str(params[section]), or '' if section is absent."""
    try:
        section_params = params[section]
    except KeyError:
        return ''
    return get_params_str(section_params)
def generate_most_wanted_list(
        output_dir,
        otu_table_fps,
        rep_set_fp,
        gg_fp,
        nt_fp,
        mapping_fp,
        mapping_category,
        top_n,
        min_abundance,
        max_abundance,
        min_categories,
        num_categories_to_plot,
        max_gg_similarity,
        max_nt_similarity,
        e_value,
        word_size,
        merged_otu_table_fp,
        suppress_taxonomic_output,
        jobs_to_start,
        command_handler,
        status_update_callback,
        force):
    """Run the most wanted OTUs workflow and write its report files.

    The filtering commands built by _get_most_wanted_filtering_commands are
    executed through command_handler; the resulting BLAST results, candidate
    sequences and OTU table are then read back in and summarized.

    The following files are written under output_dir:
     most_wanted_otus.txt (TSV table)
     most_wanted_otus.fasta (OTU sequences)
     most_wanted_otus.html (HTML report linking to the two files above)
     img/ (directory handed to _format_top_n_results_table for its output)

    output_dir: directory to create and write all results to
    otu_table_fps, rep_set_fp, gg_fp, nt_fp, mapping_fp, min_abundance,
     max_abundance, min_categories, max_gg_similarity, e_value, word_size,
     merged_otu_table_fp, jobs_to_start: forwarded unchanged to
     _get_most_wanted_filtering_commands
    mapping_category: forwarded to both the filtering commands and the
     results table formatter
    top_n: number of OTUs to keep from the BLAST results (logged with %d)
    max_nt_similarity: forwarded to _get_top_n_blast_results
    num_categories_to_plot, suppress_taxonomic_output: forwarded to
     _format_top_n_results_table
    command_handler: callable invoked as command_handler(commands,
     status_update_callback, logger, close_logger_on_success=False)
    status_update_callback: passed through to command_handler
    force: if True, a failure to create output_dir is ignored

    Returns None.

    Raises WorkflowError if output_dir cannot be created and force is False.
    """
    try:
        makedirs(output_dir)
    except OSError:
        # Any OSError from makedirs is treated as "directory already exists".
        if not force:
            raise WorkflowError(
                "Output directory '%s' already exists. Please "
                "choose a different directory, or force overwrite with -f."
                % output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    commands, blast_results_fp, rep_set_cands_failures_fp, \
        master_otu_table_ms_fp = _get_most_wanted_filtering_commands(
            output_dir, otu_table_fps, rep_set_fp, gg_fp, nt_fp, mapping_fp,
            mapping_category, min_abundance, max_abundance, min_categories,
            max_gg_similarity, e_value, word_size, merged_otu_table_fp,
            jobs_to_start)

    # Execute the commands, but keep the logger open because
    # we're going to write additional status updates as we process the data.
    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    # NOTE(review): the input handles below are closed as soon as each parse
    # helper returns; this assumes the helpers consume the handle fully
    # before returning (they are not lazy) -- confirm against the helpers.

    # We'll sort the BLAST results by percent identity (ascending) and pick the
    # top n.
    logger.write("Reading in BLAST results, sorting by percent identity, "
                 "and picking the top %d OTUs.\n\n" % top_n)
    with open(blast_results_fp, 'U') as blast_results_f:
        top_n_mw = _get_top_n_blast_results(blast_results_f, top_n,
                                            max_nt_similarity)

    # Read in our filtered down candidate seqs file and latest filtered and
    # collapsed OTU table. We'll need to compute some stats on these to include
    # in our report.
    logger.write("Reading in filtered candidate sequences and latest filtered "
                 "and collapsed OTU table.\n\n")
    with open(rep_set_cands_failures_fp, 'U') as rep_set_f:
        mw_seqs = _get_rep_set_lookup(rep_set_f)
    with open(master_otu_table_ms_fp, 'U') as master_otu_table_f:
        master_otu_table_ms = parse_biom_table(master_otu_table_f)

    # Write results out to tsv and HTML table.
    logger.write("Writing most wanted OTUs results to TSV and HTML "
                 "tables.\n\n")
    output_img_dir = join(output_dir, 'img')
    try:
        makedirs(output_img_dir)
    except OSError:
        # It already exists, which is okay since we already know we are in
        # 'force' mode from above.
        pass

    tsv_lines, html_table_lines, mw_fasta_lines, plot_fps, plot_data_fps = \
        _format_top_n_results_table(
            top_n_mw, mw_seqs, master_otu_table_ms, output_img_dir,
            mapping_category, suppress_taxonomic_output,
            num_categories_to_plot)

    # The relative names are kept separately because the HTML report links to
    # the TSV and FASTA files relative to its own location in output_dir.
    mw_tsv_rel_fp = 'most_wanted_otus.txt'
    with open(join(output_dir, mw_tsv_rel_fp), 'w') as mw_tsv_f:
        mw_tsv_f.write(tsv_lines)

    mw_fasta_rel_fp = 'most_wanted_otus.fasta'
    with open(join(output_dir, mw_fasta_rel_fp), 'w') as mw_fasta_f:
        mw_fasta_f.write(mw_fasta_lines)

    html_dl_links = (
        '<a href="%s" target="_blank">Download table in tab-'
        'separated value (TSV) format</a><br /><a href="%s" '
        'target="_blank">Download OTU sequence data in FASTA format</a>' %
        (mw_tsv_rel_fp, mw_fasta_rel_fp))
    # The download links are shown both above and below the table.
    html_lines = '%s<div>%s<br /><br />%s<br />%s</div>%s' % (
        html_header, html_dl_links, html_table_lines, html_dl_links,
        html_footer)

    with open(join(output_dir, 'most_wanted_otus.html'), 'w') as mw_html_f:
        mw_html_f.write(html_lines)

    logger.close()