def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple
        inputs and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # If the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. We want to
    # pre-filter all data against the input data, as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already '
                         'exists. Skipping and moving to next.\n\n'
                         % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback)

        # Perform post-iteration file shuffling whether the previous
        # iteration's data previously existed or was just computed.
        # The step1 otu map and failures can only be used for the first
        # iteration, as subsequent iterations need to use updated refseqs
        # files.
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' % (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first, as this step has
    # historically been a frequent point of failure and is therefore
    # sometimes run manually when continuing failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
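# Example usage of the iterative workflow above -- a minimal sketch, not a
# prescribed invocation. All file paths here are hypothetical placeholders;
# call_commands_serially and parse_qiime_parameters are the same helpers
# used as defaults elsewhere in this module.
#
#     iterative_pick_subsampled_open_reference_otus(
#         input_fps=['run1_seqs.fna', 'run2_seqs.fna'],  # processed in order
#         refseqs_fp='ref_seqs.fna',
#         output_dir='iterative_otus/',
#         percent_subsample=0.001,
#         new_ref_set_id='NewRef',
#         command_handler=call_commands_serially,
#         params=parse_qiime_parameters([]),
#         qiime_config=qiime_config)  # e.g., from qiime.util.load_qiime_config()
#
# Each iteration's new_refseqs.fna becomes the reference set for the next
# iteration, which is why step1_otu_map_fp/step1_failures_fasta_fp can only
# apply to the first iteration.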
def run_core_diversity_analyses(
        biom_fp,
        mapping_fp,
        sampling_depth,
        output_dir,
        qiime_config,
        command_handler=call_commands_serially,
        tree_fp=None,
        params=None,
        categories=None,
        arare_min_rare_depth=10,
        arare_num_steps=10,
        parallel=False,
        suppress_taxa_summary=False,
        suppress_beta_diversity=False,
        suppress_alpha_diversity=False,
        suppress_group_significance=False,
        status_update_callback=print_to_stdout):
    """ """
    if categories is not None:
        # Validate categories provided by the user
        mapping_data, mapping_comments = \
            parse_mapping_file_to_dict(open(mapping_fp, "U"))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)"
                    % (c, ", ".join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. Categories "
                    "analyzed here require at least two values." % c)
    else:
        categories = []
    comma_separated_categories = ",".join(categories)

    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
    create_dir(output_dir)
    index_fp = "%s/index.html" % output_dir
    index_links = []
    commands = []

    # begin logging
    old_log_fps = glob(join(output_dir, "log_20*txt"))
    log_fp = generate_log_fp(output_dir)
    index_links.append(
        ("Master run log", log_fp, _index_headers["run_summary"]))
    for old_log_fp in old_log_fps:
        index_links.append(
            ("Previous run log", old_log_fp, _index_headers["run_summary"]))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params["biom-summarize-table"])
    except KeyError:
        params_str = ""
    biom_table_stats_output_fp = "%s/biom_table_summary.txt" % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
            "biom summarize-table -i %s -o %s --suppress-md5 %s" \
            % (biom_fp, biom_table_stats_output_fp, params_str)
        commands.append(
            [("Generate BIOM table summary", biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n"
                     % biom_table_stats_output_fp)
    index_links.append(("BIOM table statistics",
                        biom_table_stats_output_fp,
                        _index_headers["run_summary"]))

    # Filter samples with fewer observations than the requested
    # sampling_depth. Since these get filtered for some analyses (e.g.,
    # beta diversity after even sampling), it's useful to filter them here
    # so they're filtered from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = \
            "filter_samples_from_otu_table.py -i %s -o %s -n %d" \
            % (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append(
            [("Filter low sequence count samples from table "
              "(minimum sequence count: %d)" % sampling_depth,
              filter_samples_cmd)])
    else:
        logger.write(
            "Skipping filter_samples_from_otu_table.py as %s exists.\n\n"
            % filtered_biom_fp)
    biom_fp = filtered_biom_fp

    # rarefy the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = \
            "single_rarefaction.py -i %s -o %s -d %d" \
            % (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append(
            [("Rarefy the OTU table to %d sequences/sample" % sampling_depth,
              single_rarefaction_cmd)])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n"
                     % rarefied_biom_fp)

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = "%s/bdiv_even%d/" % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the
        # user can select which will be generated.
        existing_dm_fps = glob("%s/*_dm.txt" % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling_depth=None here as we rarefy the
                # BIOM table above and pass that in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping beta_diversity_through_plots.py as %s exist(s).\n\n"
                % ", ".join(existing_dm_fps))
            # str.strip removes *characters*, not a suffix, and would mangle
            # metric names such as 'manhattan'; remove the '_dm.txt' suffix
            # explicitly instead.
            even_dm_fps = [(split(fp)[1][:-len("_dm.txt")], fp)
                           for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params["make_distance_boxplots"])
        except KeyError:
            params_str = ""

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = \
                    "%s/%s_boxplots/" % (bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = \
                    "%s/%s_Distances.pdf" % (boxplots_output_dir, category)
                stats_output_fp = \
                    "%s/%s_Stats.txt" % (boxplots_output_dir, category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        "make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s" \
                        % (dm_fp, category, boxplots_output_dir, mapping_fp,
                           params_str)
                    commands.append(
                        [("Boxplots (%s)" % category, boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s "
                        "exists.\n\n" % (category, plot_output_fp))
                index_links.append(
                    ("Distance boxplots (%s)" % bdiv_metric,
                     plot_output_fp,
                     _index_headers["beta_diversity_even"] % sampling_depth))
                index_links.append(
                    ("Distance boxplots statistics (%s)" % bdiv_metric,
                     stats_output_fp,
                     _index_headers["beta_diversity_even"] % sampling_depth))

            index_links.append(
                ("PCoA plot (%s)" % bdiv_metric,
                 "%s/%s_emperor_pcoa_plot/index.html"
                 % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers["beta_diversity_even"] % sampling_depth))
            index_links.append(
                ("Distance matrix (%s)" % bdiv_metric,
                 "%s/%s_dm.txt" % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers["beta_diversity_even"] % sampling_depth))
            index_links.append(
                ("Principal coordinate matrix (%s)" % bdiv_metric,
                 "%s/%s_pc.txt" % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers["beta_diversity_even"] % sampling_depth))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = "%s/arare_max%d/" % (output_dir,
                                                     sampling_depth)
        rarefaction_plots_output_fp = \
            "%s/alpha_rarefaction_plots/rarefaction_plots.html" \
            % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n"
                         % rarefaction_plots_output_fp)

        index_links.append(
            ("Alpha rarefaction plots",
             rarefaction_plots_output_fp,
             _index_headers["alpha_diversity"]))

        collated_alpha_diversity_fps = \
            glob("%s/alpha_div_collated/*txt" % arare_full_output_dir)
        try:
            params_str = get_params_str(params["compare_alpha_diversity"])
        except KeyError:
            params_str = ""

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = \
                    splitext(split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = \
                    "%s/compare_%s" % (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = \
                        "compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s" \
                        % (collated_alpha_diversity_fp,
                           mapping_fp,
                           comma_separated_categories,
                           compare_alpha_output_dir,
                           params_str)
                    commands.append(
                        [("Compare alpha diversity (%s)" % alpha_metric,
                          compare_alpha_cmd)])
                    for category in categories:
                        alpha_comparison_stat_fp = \
                            "%s/%s_stats.txt" \
                            % (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = \
                            "%s/%s_boxplots.pdf" \
                            % (compare_alpha_output_dir, category)
                        index_links.append(
                            ("Alpha diversity statistics (%s, %s)"
                             % (category, alpha_metric),
                             alpha_comparison_stat_fp,
                             _index_headers["alpha_diversity"]))
                        index_links.append(
                            ("Alpha diversity boxplots (%s, %s)"
                             % (category, alpha_metric),
                             alpha_comparison_boxplot_fp,
                             _index_headers["alpha_diversity"]))
                else:
                    logger.write("Skipping compare_alpha_diversity.py"
                                 " for %s as %s exists.\n\n"
                                 % (alpha_metric, compare_alpha_output_dir))
        else:
            logger.write("Skipping compare_alpha_diversity.py as"
                         " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = "%s/taxa_plots/" % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = \
            glob(join(taxa_plots_output_dir, "taxa_summary_plots", "*.html"))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py as %s exist(s).\n\n"
                % ", ".join(existing_taxa_plot_html_fps))

        index_links.append(
            ("Taxa summary bar plots",
             "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
             _index_headers["taxa_summary"]))
        index_links.append(
            ("Taxa summary area plots",
             "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
             _index_headers["taxa_summary"]))

        for category in categories:
            taxa_plots_output_dir = \
                "%s/taxa_plots_%s/" % (output_dir, category)
            # need to check for existence of any html files, since the user
            # can select only certain ones to be generated
            existing_taxa_plot_html_fps = \
                glob("%s/taxa_summary_plots/*.html" % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s "
                    "exist(s).\n\n"
                    % (category, ", ".join(existing_taxa_plot_html_fps)))

            index_links.append(
                ("Taxa summary bar plots",
                 "%s/taxa_summary_plots/bar_charts.html"
                 % taxa_plots_output_dir,
                 _index_headers["taxa_summary_categorical"] % category))
            index_links.append(
                ("Taxa summary area plots",
                 "%s/taxa_summary_plots/area_charts.html"
                 % taxa_plots_output_dir,
                 _index_headers["taxa_summary_categorical"] % category))

    if not suppress_group_significance:
        params_str = get_params_str(params["group_significance"])
        # group significance tests, aka category significance
        for category in categories:
            group_significance_fp = \
                "%s/group_significance_%s.txt" % (output_dir, category)
            if not exists(group_significance_fp):
                # Build the group (i.e., category) significance command
                group_significance_cmd = \
                    "group_significance.py -i %s -m %s -c %s -o %s %s" \
                    % (rarefied_biom_fp, mapping_fp, category,
                       group_significance_fp, params_str)
                commands.append(
                    [("Group significance (%s)" % category,
                      group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n"
                    % (category, group_significance_fp))

            index_links.append(
                ("Category significance (%s)" % category,
                 group_significance_fp,
                 _index_headers["group_significance"]))

    filtered_biom_gzip_fp = "%s.gz" % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append(
            [("Compress the filtered BIOM table",
              "gzip %s" % filtered_biom_fp)])
    else:
        logger.write(
            "Skipping compression of filtered BIOM table as %s exists.\n\n"
            % filtered_biom_gzip_fp)
    index_links.append(
        ("Filtered BIOM table (minimum sequence count: %d)" % sampling_depth,
         filtered_biom_gzip_fp,
         _index_headers["run_summary"]))

    rarefied_biom_gzip_fp = "%s.gz" % rarefied_biom_fp
    if not exists(rarefied_biom_gzip_fp):
        commands.append(
            [("Compress the rarefied BIOM table",
              "gzip %s" % rarefied_biom_fp)])
    else:
        logger.write(
            "Skipping compression of rarefied BIOM table as %s exists.\n\n"
            % rarefied_biom_gzip_fp)
    index_links.append(
        ("Rarefied BIOM table (sampling depth: %d)" % sampling_depth,
         rarefied_biom_gzip_fp,
         _index_headers["run_summary"]))

    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
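# A minimal usage sketch for run_core_diversity_analyses. The BIOM table,
# mapping file, and category names below are hypothetical; in practice,
# sampling_depth should be chosen after inspecting the 'biom
# summarize-table' output.
#
#     run_core_diversity_analyses(
#         biom_fp='otu_table.biom',
#         mapping_fp='map.txt',
#         sampling_depth=1000,  # samples below this count are dropped
#         output_dir='core_diversity/',
#         qiime_config=qiime_config,
#         categories=['Treatment', 'BodySite'],  # mapping file columns
#         tree_fp='rep_set.tre',  # needed for phylogenetic metrics
#         parallel=False)
#
# Because every step checks for existing output before queuing its command,
# re-running with the same output_dir resumes a partially completed run.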
def run_pick_closed_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        taxonomy_fp,
        command_handler,
        params,
        qiime_config,
        parallel=False,
        logger=None,
        suppress_md5=False,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of QIIME

        The steps performed by this function are:
          1) Pick OTUs;
          2) Build an OTU table with optional pre-defined taxonomy.
    """
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast', 'uclust_ref', 'usearch61_ref']

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and (otu_picking_method == 'blast' or
                     otu_picking_method == 'uclust_ref' or
                     otu_picking_method == 'usearch61_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take an --otu_picking_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/%s -i %s -o %s -r %s -T %s' %\
            (python_exe_fp,
             script_dir,
             otu_picking_script,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is "
                     "closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (python_exe_fp,
             script_dir,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = '%s %s/make_otu_table.py -i %s %s -o %s %s' %\
        (python_exe_fp,
         script_dir,
         otu_fp,
         taxonomy_str,
         otu_table_fp,
         params_str)

    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
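# A minimal usage sketch for closed-reference OTU picking. The paths are
# hypothetical placeholders; taxonomy_fp may be None to build the OTU table
# without pre-defined taxonomy.
#
#     run_pick_closed_reference_otus(
#         input_fp='seqs.fna',
#         refseqs_fp='ref_seqs.fna',
#         output_dir='closed_ref_otus/',
#         taxonomy_fp='ref_taxonomy.txt',  # or None
#         command_handler=call_commands_serially,
#         params=parse_qiime_parameters([]),
#         qiime_config=qiime_config,
#         parallel=True)  # dispatches to parallel_pick_otus_uclust_ref.py
#                         # for the default uclust_ref method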
def run_ampliconnoise(mapping_fp,
                      output_dir,
                      command_handler,
                      params,
                      qiime_config,
                      logger=None,
                      status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228,
                      chimera_beta=0.6200,
                      sff_txt_fp=None,
                      numnodes=2,
                      suppress_perseus=True,
                      output_filepath=None,
                      platform='flx',
                      seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline

        The steps performed by this function are:
        1. Split input sff.txt file into one file per sample
        2. Run scripts required for PyroNoise
        3. Run scripts required for SeqNoise
        4. Run scripts required for Perseus (chimera removal)
        5. Merge output files into one file similar to the output of
           split_libraries.py

        output_filepath should be absolute
        seqnoise_resolution should be a string
        The environment variable PYRO_LOOKUP_FILE must be set correctly.
        Thus be careful passing command handlers that don't spawn child
        processes, as they may not inherit the correct environment variable
        setting.
    """
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no default '
                               'for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no default '
                               'for platform ' + platform)

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index('SampleID')])
        bc_seqs.append(map_data[i][headers.index('BarcodeSequence')])
        # the primer is not stripped here; that happens later in the
        # pipeline (see the Parse.pl truncate step below)
        primer = (map_data[i][headers.index('LinkerPrimerSequence')])
        for char, bases in IUPAC_DNA_ambiguities.items():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir, 'map.csv'), 'w')
    for i in range(len(sample_names)):
        fh.write(sample_names[i] + ',' + bc_seqs[i] + '\n')
    fh.close()

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to the output file
    post_pyro_tail = '_' + truncate_len
    if suppress_perseus:
        fasta_result_names = [sample_name + post_pyro_tail + '_seqnoise_cd.fa'
                              for sample_name in sample_names]
    else:
        fasta_result_names = [sample_name + '_Good.fa'
                              for sample_name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
                      cmd)])

    cmd = 'SplitKeys.pl ' + one_primer + ' map.csv < ' +\
        os.path.join(called_dir, sff_txt_fp) +\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):

        # Build the per-sample flowgram cleaning command
        if platform == 'flx':
            cmd = 'Clean360.pl ' + one_primer + ' ' + sample_name + ' < ' +\
                sample_name + '.raw'
            commands.append([('clean flows ' + sample_name, cmd)])

            # these run through the whole sff file once per sample, I think
            # cmd = "FlowsFA.pl " + primer_seqs[i] + ' ' + sample_name + ' < ' +\
            #     os.path.join(called_dir, sff_txt_fp)
            # commands.append([('extract flows ' + sample_name, cmd)])
        elif platform == 'titanium':
            cmd = 'CleanMinMax.pl ' + one_primer + ' ' + sample_name + ' < ' +\
                sample_name + '.raw'
            commands.append([('clean flows ' + sample_name, cmd)])

            # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' ' + sample_name + ' < ' +\
            #     os.path.join(called_dir, sff_txt_fp)
            # commands.append([('extract flows ' + sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = "mpirun -np " + str(numnodes) + " PyroDist -in " +\
            sample_name + ".dat -out " + sample_name +\
            " > " + sample_name + ".pdout"
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + ".fdist -out " + sample_name +\
            " > " + sample_name + ".fcout"
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = "mpirun -np " + str(numnodes) + " PyroNoise -din " +\
            sample_name + ".dat -out " +\
            sample_name + "_pyronoise " + "-lin " +\
            sample_name + ".list -s 60.0 -c 0.01 > " +\
            sample_name + "_pyronoise.pnout"
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl ' + bc_seqs[i] + one_primer + ' ' + truncate_len +\
            ' < ' + sample_name + '_pyronoise_cd.fa' + ' > ' +\
            sample_name + '_' + truncate_len + '.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = "mpirun -np " + str(numnodes) + " SeqDist -in " +\
            sample_name + post_pyro_tail +\
            ".fa > " + sample_name + post_pyro_tail + ".seqdist"
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + post_pyro_tail +\
            ".seqdist -out " + sample_name + post_pyro_tail + "fcl > " +\
            sample_name + post_pyro_tail + ".fcout"
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0
        # -c 0.08 > PC.354_pyronoise_cd.snout
        cmd = "mpirun -np " + str(numnodes) + " SeqNoise -in " +\
            sample_name + post_pyro_tail +\
            ".fa -din " + sample_name + post_pyro_tail + ".seqdist -out " +\
            sample_name + post_pyro_tail +\
            "_seqnoise -lin " + sample_name + post_pyro_tail + 'fcl.list -min ' +\
            sample_name + '_pyronoise' +\
            '.mapping -s ' + seqnoise_resolution + ' -c 0.08 > ' +\
            sample_name + post_pyro_tail + '.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if not suppress_perseus:

            cmd = 'Perseus -sin ' + sample_name + post_pyro_tail +\
                '_seqnoise_cd.fa > ' +\
                sample_name + '.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl ' + sample_name + '.per ' +\
                str(chimera_alpha) + ' ' + str(chimera_beta) +\
                ' > ' + sample_name + '.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl ' + sample_name + post_pyro_tail +\
                '_seqnoise_cd.fa ' +\
                sample_name + '.class 0.5 > ' + sample_name + '_Chi.fa 2> ' +\
                sample_name + '_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = '%s %s/unweight_fasta.py -i %s -o %s -l %s' %\
            (python_exe_fp, script_dir, fasta_result_names[i],
             sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = 'cat ' +\
        ' '.join([sample_name + '_unw.fna'
                  for sample_name in sample_names]) +\
        ' > ' + output_filepath  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
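# A minimal usage sketch for the AmpliconNoise pipeline. The paths are
# hypothetical; note that output_filepath must be absolute and the
# PYRO_LOOKUP_FILE environment variable must be set before running (see the
# docstring above).
#
#     run_ampliconnoise(
#         mapping_fp='map.txt',
#         output_dir='/home/user/anoise_out/',
#         command_handler=call_commands_serially,
#         params=parse_qiime_parameters([]),
#         qiime_config=qiime_config,
#         sff_txt_fp='run.sff.txt',
#         platform='titanium',  # selects truncate_len/seqnoise defaults
#         suppress_perseus=False,  # also run Perseus chimera removal
#         output_filepath='/home/user/anoise_out/seqs.fna')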
def pick_subsampled_open_reference_otus(input_fp, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, run_assign_tax=True, run_align_and_tree=True, prefilter_percent_id=None, min_otu_size=2, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, suppress_index_page=False, denovo_otu_picking_method='uclust', reference_otu_picking_method='uclust_ref', status_update_callback=print_to_stdout, minimum_failure_threshold=100000): """ Run the data preparation steps of Qiime The steps performed by this function are: - Pick reference OTUs against refseqs_fp - Subsample the failures to n sequences. - Pick OTUs de novo on the n failures. - Pick representative sequences for the resulting OTUs. - Pick reference OTUs on all failures using the representative set from step 4 as the reference set. """ # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust'] allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref', 'sortmerna'] assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\ "Unknown de novo OTU picking method: %s. Known methods are: %s"\ % (denovo_otu_picking_method, ','.join(allowed_denovo_otu_picking_methods)) assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\ "Unknown reference OTU picking method: %s. Known methods are: %s"\ % (reference_otu_picking_method, ','.join(allowed_reference_otu_picking_methods)) # Prepare some variables for the later steps index_links = [] input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) create_dir(output_dir) commands = [] if logger is None: log_fp = generate_log_fp(output_dir) logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config) close_logger_on_success = True index_links.append( ('Run summary data', log_fp, _index_headers['run_summary'])) else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp]) # if the user has not passed a different reference collection for the pre-filter, # used the main refseqs_fp. 
this is useful if the user wants to provide a smaller # reference collection, or to use the input reference collection when running in # iterative mode (rather than an iteration's new refseqs) if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp # Step 1: Closed-reference OTU picking on the input file (if not already # complete) if step1_otu_map_fp and step1_failures_fasta_fp: step1_dir = '%s/step1_otus' % output_dir create_dir(step1_dir) logger.write("Using pre-existing reference otu map and failures.\n\n") else: if prefilter_percent_id is not None: prefilter_dir = '%s/prefilter_otus/' % output_dir prefilter_failures_list_fp = '%s/%s_failures.txt' % \ (prefilter_dir, input_basename) prefilter_pick_otu_cmd = pick_reference_otus( input_fp, prefilter_dir, reference_otu_picking_method, prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id) commands.append( [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)]) prefiltered_input_fp = '%s/prefiltered_%s%s' %\ (prefilter_dir, input_basename, input_ext) filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\ (input_fp, prefiltered_input_fp, prefilter_failures_list_fp) commands.append( [('Filter prefilter failures from input', filter_fasta_cmd)]) index_links.append( ('Pre-filtered sequence identifiers ' '(failed to hit reference at %1.1f%% identity)' % (float(prefilter_percent_id)*100), prefilter_failures_list_fp, _index_headers['sequences'])) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] input_fp = prefiltered_input_fp input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) if getsize(prefiltered_input_fp) == 0: raise ValueError( "All sequences were discarded by the prefilter. " "Are the input sequences in the same orientation " "in your input file and reference file (you can " "add 'pick_otus:enable_rev_strand_match True' to " "your parameters file if not)? 
Are you using the " "correct reference file?") # Build the OTU picking command step1_dir = \ '%s/step1_otus' % output_dir step1_otu_map_fp = \ '%s/%s_otus.txt' % (step1_dir, input_basename) step1_pick_otu_cmd = pick_reference_otus( input_fp, step1_dir, reference_otu_picking_method, refseqs_fp, parallel, params, logger) commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)]) # Build the failures fasta file step1_failures_list_fp = '%s/%s_failures.txt' % \ (step1_dir, input_basename) step1_failures_fasta_fp = \ '%s/failures.fasta' % step1_dir step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (input_fp, step1_failures_list_fp, step1_failures_fasta_fp) commands.append([('Generate full failures fasta file', step1_filter_fasta_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] step1_repset_fasta_fp = \ '%s/step1_rep_set.fna' % step1_dir step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step1_otu_map_fp, step1_repset_fasta_fp, input_fp) commands.append([('Pick rep set', step1_pick_rep_set_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # name the final otu map merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir # count number of sequences in step 1 failures fasta file with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f: num_failure_seqs, mean, std = count_seqs_from_file(step1_failures_fasta_f) # number of failures sequences is greater than the threshold, # continue to step 2,3 and 4 run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold if run_step_2_and_3: # Subsample the failures fasta file to retain (roughly) the # percent_subsample step2_dir = '%s/step2_otus/' % output_dir create_dir(step2_dir) step2_input_fasta_fp = \ '%s/subsampled_failures.fasta' % step2_dir subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp, percent_subsample) logger.write('# Subsample the failures fasta file using API \n' + 'python -c "import qiime; qiime.util.subsample_fasta' + '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp), abspath( step2_input_fasta_fp), percent_subsample)) # Prep the OTU picking command for the subsampled failures step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir, new_ref_set_id, denovo_otu_picking_method, params, logger) step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir commands.append([('Pick de novo OTUs for new clusters', step2_cmd)]) # Prep the rep set picking command for the subsampled failures step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp) commands.append( [('Pick representative set for subsampled failures', step2_rep_set_cmd)]) step3_dir = '%s/step3_otus/' % output_dir step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir # remove the indexed reference database from the dictionary of # parameters as it must be forced to build a new database # using the step2_repset_fasta_fp if reference_otu_picking_method == 'sortmerna': if 'sortmerna_db' in params['pick_otus']: del params['pick_otus']['sortmerna_db'] step3_cmd = pick_reference_otus( step1_failures_fasta_fp, step3_dir, reference_otu_picking_method, step2_repset_fasta_fp, parallel, 
params, logger) commands.append([ ('Pick reference OTUs using de novo rep set', step3_cmd)]) index_links.append( ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")', merged_otu_map_fp, _index_headers['otu_maps'])) if not suppress_step4: step4_dir = '%s/step4_otus/' % output_dir if run_step_2_and_3: step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (step1_failures_fasta_fp, step3_failures_list_fp, step3_failures_fasta_fp) commands.append([('Create fasta file of step3 failures', step3_filter_fasta_cmd)]) failures_fp = step3_failures_fasta_fp failures_otus_fp = 'failures_failures_otus.txt' failures_step = 'step3' else: failures_fp = step1_failures_fasta_fp failures_otus_fp = 'failures_otus.txt' failures_step = 'step1' step3_otu_map_fp = "" step4_cmd = pick_denovo_otus(failures_fp, step4_dir, '.'.join([new_ref_set_id, 'CleanUp']), denovo_otu_picking_method, params, logger) step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp) commands.append([('Pick de novo OTUs on %s failures' % failures_step, step4_cmd)]) # Merge the otu maps, note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created cat_otu_tables_cmd = 'cat %s %s %s > %s' %\ (step1_otu_map_fp, step3_otu_map_fp, step4_otu_map_fp, merged_otu_map_fp) commands.append([('Merge OTU maps', cat_otu_tables_cmd)]) step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp) commands.append( [('Pick representative set for subsampled failures', step4_rep_set_cmd)]) else: # Merge the otu maps, note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created if run_step_2_and_3: failures_fp = step3_failures_list_fp else: failures_fp = step1_failures_list_fp step3_otu_map_fp = "" cat_otu_tables_cmd = 'cat %s %s > %s' %\ (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp) commands.append([('Merge OTU maps', cat_otu_tables_cmd)]) # Move the step 3 failures file to the top-level directory commands.append([('Move final failures file to top-level directory', 'mv %s %s/final_failures.txt' % (failures_fp, output_dir))]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] otu_fp = merged_otu_map_fp # Filter singletons from the otu map otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir, min_otu_size) otus_to_keep = filter_otus_from_otu_map( otu_fp, otu_no_singletons_fp, min_otu_size) index_links.append(('Final map of OTU identifier to sequence identifers excluding ' 'OTUs with fewer than %d sequences' % min_otu_size, otu_no_singletons_fp, _index_headers['otu_maps'])) logger.write('# Filter singletons from the otu map using API \n' + 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' + '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp), abspath( otu_no_singletons_fp), min_otu_size)) # make the final representative seqs file and a new refseqs file that # could be used in subsequent otu picking runs. # this is clunky. first, we need to do this without singletons to match # the otu map without singletons. next, there is a difference in what # we need the reference set to be and what we need the repseqs to be. 
# the reference set needs to be a superset of the input reference set # to this set. the repset needs to be only the sequences that were observed # in this data set, and we want reps for the step1 reference otus to be # reads from this run so we don't hit issues building a tree using # sequences of very different lengths. so... final_repset_fp = '%s/rep_set.fna' % output_dir index_links.append( ('OTU representative sequences', final_repset_fp, _index_headers['sequences'])) final_repset_f = open(final_repset_fp, 'w') new_refseqs_fp = '%s/new_refseqs.fna' % output_dir index_links.append( ('New reference sequences (i.e., OTU representative sequences plus input ' 'reference sequences)', new_refseqs_fp, _index_headers['sequences'])) # write non-singleton otus representative sequences from step1 to the # final rep set file for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) logger.write('# Write non-singleton otus representative sequences ' + 'from step1 to the final rep set file: %s\n\n' % final_repset_fp) # copy the full input refseqs file to the new refseqs_fp copyfile(refseqs_fp, new_refseqs_fp) new_refseqs_f = open(new_refseqs_fp, 'a') new_refseqs_f.write('\n') logger.write('# Copy the full input refseqs file to the new refseq file\n' + 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp)) # iterate over all representative sequences from step2 and step4 and write # those corresponding to non-singleton otus to the final representative set # file and the new reference sequences file. if run_step_2_and_3: for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq)) final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) if not suppress_step4: for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq)) final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) new_refseqs_f.close() final_repset_f.close() # steps 1-4 executed if run_step_2_and_3: logger.write('# Write non-singleton otus representative sequences from ' + 'step 2 and step 4 to the final representative set and the new reference' + ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp)) # only steps 1 and 4 executed else: logger.write('# Write non-singleton otus representative sequences from ' + 'step 4 to the final representative set and the new reference' + ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp)) # Prep the make_otu_table.py command otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size) make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\ (otu_no_singletons_fp, otu_table_fp) commands.append([("Make the otu table", make_otu_table_cmd)]) index_links.append( ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size, otu_table_fp, _index_headers['otu_tables'])) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # initialize output file names - these differ based on what combination of # taxonomy assignment and alignment/tree building is happening. 
if run_assign_tax and run_align_and_tree: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) align_and_tree_input_otu_table = otu_table_w_tax_fp index_links.append( ('OTU table exluding OTUs with fewer than %d sequences and including OTU ' 'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp, _index_headers['otu_tables'])) pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size) index_links.append( ('OTU table exluding OTUs with fewer than %d sequences and sequences that ' 'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size, pynast_failure_filtered_otu_table_fp, _index_headers['otu_tables'])) elif run_assign_tax: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) index_links.append( ('OTU table exluding OTUs with fewer than %d sequences and including OTU ' 'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp, _index_headers['otu_tables'])) elif run_align_and_tree: align_and_tree_input_otu_table = otu_table_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir, min_otu_size) index_links.append( ('OTU table exluding OTUs with fewer than %d sequences and sequences that ' 'fail to align with PyNAST' % min_otu_size, pynast_failure_filtered_otu_table_fp, _index_headers['otu_tables'])) if run_assign_tax: if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0: logger.write( "Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp) else: # remove files from partially completed runs remove_files([otu_table_w_tax_fp], error_on_missing=False) taxonomy_fp = assign_tax( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) index_links.append( ('OTU taxonomic assignments', taxonomy_fp, _index_headers['taxa_assignments'])) # Add taxa to otu table add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\ (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp) commands.append([("Add taxa to OTU table", add_metadata_cmd)]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if run_align_and_tree: rep_set_tree_fp = join(output_dir, 'rep_set.tre') index_links.append( ('OTU phylogenetic tree', rep_set_tree_fp, _index_headers['trees'])) if exists(pynast_failure_filtered_otu_table_fp) and\ getsize(pynast_failure_filtered_otu_table_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." 
% pynast_failure_filtered_otu_table_fp) else: # remove files from partially completed runs remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False) pynast_failures_fp = align_and_tree( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Build OTU table without PyNAST failures table = load_table(align_and_tree_input_otu_table) filtered_otu_table = filter_otus_from_otu_table(table, get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')), 0, inf, 0, inf, negate_ids_to_keep=True) write_biom_table(filtered_otu_table, pynast_failure_filtered_otu_table_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if close_logger_on_success: logger.close() if not suppress_index_page: index_fp = '%s/index.html' % output_dir generate_index_page(index_links, index_fp)
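# Usage sketch (commented out; not part of the original workflow code): one way
# the single-run open-reference workflow above might be invoked directly. The
# empty parameter set via parse_qiime_parameters([]) and the serial command
# handler mirror defaults used elsewhere in this module; all paths are
# hypothetical placeholders, and load_qiime_config is assumed to be importable
# from qiime.util.
#
#   pick_subsampled_open_reference_otus(input_fp='seqs.fna',
#                                       refseqs_fp='refseqs.fna',
#                                       output_dir='open_ref_otus/',
#                                       percent_subsample=0.001,
#                                       new_ref_set_id='NewRef',
#                                       command_handler=call_commands_serially,
#                                       params=parse_qiime_parameters([]),
#                                       qiime_config=load_qiime_config())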
def run_jackknifed_beta_diversity(otu_table_fp, tree_fp, seqs_per_sample, output_dir, command_handler, params, qiime_config, mapping_fp, parallel=False, logger=None, suppress_md5=False, status_update_callback=print_to_stdout, master_tree=None): """ Run the data preparation steps of Qiime The steps performed by this function are: 1) Compute beta diversity distance matrix from otu table (and tree, if applicable); 2) Build rarefied OTU tables; 3) Build UPGMA tree from full distance matrix; 4) Compute distance matrices for rarefied OTU tables; 5) Build UPGMA trees from rarefied OTU table distance matrices; 5.5) Build a consensus tree from the rarefied UPGMA trees; 6) Compare rarefied OTU table distance matrix UPGMA trees to the full UPGMA tree and write support file and newick tree with support values as node labels. master_tree can be 'full' or 'consensus', default 'full' """ # Prepare some variables for the later steps if master_tree is None: master_tree = 'full' otu_table_dir, otu_table_filename = split(otu_table_fp) otu_table_basename, otu_table_ext = splitext(otu_table_filename) create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp]) try: beta_diversity_metrics = params['beta_diversity']['metrics'].split(',') except KeyError: beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac'] # Prep the beta-diversity command try: params_str = get_params_str(params['beta_diversity']) except KeyError: params_str = '' if tree_fp: params_str = '%s -t %s' % (params_str, tree_fp) # Build the beta-diversity command beta_div_cmd = 'beta_diversity.py -i %s -o %s %s' %\ (otu_table_fp, output_dir, params_str) commands.append([ ('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics), beta_div_cmd) ]) # Prep rarefaction command rarefaction_dir = '%s/rarefaction/' % output_dir create_dir(rarefaction_dir) try: params_str = get_params_str(params['multiple_rarefactions_even_depth']) except KeyError: params_str = '' # Build the rarefaction command rarefaction_cmd = \ 'multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\ (otu_table_fp, seqs_per_sample, rarefaction_dir, params_str) commands.append([('Rarefaction', rarefaction_cmd)]) # Begin iterating over beta diversity distance metrics, if more than one # was provided for beta_diversity_metric in beta_diversity_metrics: metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric) distance_matrix_fp = '%s/%s_%s.txt' % \ (output_dir, beta_diversity_metric, otu_table_basename) # Prep the hierarchical clustering command (for full distance matrix) full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir, otu_table_basename) try: params_str = get_params_str(params['upgma_cluster']) except KeyError: params_str = '' # Build the hierarchical clustering command (for full distance matrix) hierarchical_cluster_cmd = 'upgma_cluster.py -i %s -o %s %s' %\ (distance_matrix_fp, full_tree_fp, params_str) commands.append([ ('UPGMA on full distance matrix: %s' % beta_diversity_metric, hierarchical_cluster_cmd) ]) # Prep the beta diversity command (for rarefied OTU tables) dm_dir = '%s/rare_dm/' % metric_output_dir create_dir(dm_dir) # the metrics parameter needs to be ignored as we need to run # beta_diversity one metric at a time to keep the per-metric # output files in separate directories try: d =
params['beta_diversity'].copy() del d['metrics'] except KeyError: d = {} params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric if tree_fp: params_str = '%s -t %s' % (params_str, tree_fp) if parallel: params_str += ' %s' % get_params_str(params['parallel']) # Build the parallel beta diversity command (for rarefied OTU # tables) beta_div_rarefied_cmd = \ 'parallel_beta_diversity.py -T -i %s -o %s %s' %\ (rarefaction_dir, dm_dir, params_str) else: # Build the serial beta diversity command (for rarefied OTU tables) beta_div_rarefied_cmd = \ 'beta_diversity.py -i %s -o %s %s' %\ (rarefaction_dir, dm_dir, params_str) commands.append([('Beta diversity on rarefied OTU tables (%s)' % beta_diversity_metric, beta_div_rarefied_cmd)]) # Prep the hierarchical clustering command (for rarefied # distance matrices) upgma_dir = '%s/rare_upgma/' % metric_output_dir create_dir(upgma_dir) try: params_str = get_params_str(params['upgma_cluster']) except KeyError: params_str = '' # Build the hierarchical clustering command (for rarefied # distance matrices) hierarchical_cluster_cmd =\ 'upgma_cluster.py -i %s -o %s %s' % (dm_dir, upgma_dir, params_str) commands.append([ ('UPGMA on rarefied distance matrix (%s)' % beta_diversity_metric, hierarchical_cluster_cmd) ]) # Build the consensus tree command consensus_tree_cmd =\ 'consensus_tree.py -i %s -o %s %s' %\ (upgma_dir, metric_output_dir + "/rare_upgma_consensus.tre", params_str) commands.append([('Consensus on rarefied distance matrices (%s)' % beta_diversity_metric, consensus_tree_cmd)]) # Prep the tree compare command tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir create_dir(tree_compare_dir) try: params_str = get_params_str(params['tree_compare']) except KeyError: params_str = '' # Build the tree compare command if master_tree == "full": master_tree_fp = full_tree_fp elif master_tree == "consensus": master_tree_fp = metric_output_dir + "/rare_upgma_consensus.tre" else: raise RuntimeError('master tree method "%s" not found' % (master_tree,)) tree_compare_cmd = 'tree_compare.py -s %s -m %s -o %s %s' %\ (upgma_dir, master_tree_fp, tree_compare_dir, params_str) commands.append([('Tree compare (%s)' % beta_diversity_metric, tree_compare_cmd)]) # Prep the PCoA command pcoa_dir = '%s/pcoa/' % metric_output_dir create_dir(pcoa_dir) try: params_str = get_params_str(params['principal_coordinates']) except KeyError: params_str = '' # Build the PCoA command pcoa_cmd = 'principal_coordinates.py -i %s -o %s %s' %\ (dm_dir, pcoa_dir, params_str) commands.append([('Principal coordinates (%s)' % beta_diversity_metric, pcoa_cmd)]) # Prep the emperor plots command emperor_dir = '%s/emperor_pcoa_plots/' % metric_output_dir create_dir(emperor_dir) try: params_str = get_params_str(params['make_emperor']) except KeyError: params_str = '' emperor_cmd = 'make_emperor.py -i %s -o %s -m %s %s' %\ (pcoa_dir, emperor_dir, mapping_fp, params_str) commands.append([('Emperor plots (%s)' % beta_diversity_metric, emperor_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success)
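# Usage sketch (commented out): a minimal jackknifed beta diversity run at a
# hypothetical even sampling depth of 100 seqs/sample, building support values
# against the consensus tree rather than the full-table UPGMA tree. Paths are
# placeholders and load_qiime_config is assumed to come from qiime.util.
#
#   run_jackknifed_beta_diversity('otu_table.biom', 'rep_set.tre',
#                                 seqs_per_sample=100,
#                                 output_dir='jackknifed_bdiv/',
#                                 command_handler=call_commands_serially,
#                                 params=parse_qiime_parameters([]),
#                                 qiime_config=load_qiime_config(),
#                                 mapping_fp='map.txt',
#                                 master_tree='consensus')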
def iterative_pick_subsampled_open_reference_otus( input_fps, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, prefilter_percent_id=None, min_otu_size=2, run_assign_tax=True, run_align_and_tree=True, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, denovo_otu_picking_method='uclust', reference_otu_picking_method='uclust_ref', status_update_callback=print_to_stdout, minimum_failure_threshold=100000): """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs and handle processing of the results. """ create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False # if the user has not passed a different reference collection for the pre-filter, # use the input refseqs_fp for all iterations. we want to pre-filter all data against # the input reference set as lower percent identity searches with uclust can be slow, so we # want the reference collection to stay at a reasonable size. if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp otu_table_fps = [] repset_fasta_fps = [] for i, input_fp in enumerate(input_fps): iteration_output_dir = '%s/%d/' % (output_dir, i) if iteration_output_exists(iteration_output_dir, min_otu_size): # if the output from an iteration already exists, skip that # iteration (useful for continuing failed runs) log_input_md5s(logger, [input_fp, refseqs_fp]) logger.write( 'Iteration %d (input file: %s) output data already exists. ' 'Skipping and moving to next.\n\n' % (i, input_fp)) else: pick_subsampled_open_reference_otus( input_fp=input_fp, refseqs_fp=refseqs_fp, output_dir=iteration_output_dir, percent_subsample=percent_subsample, new_ref_set_id='.'.join([new_ref_set_id, str(i)]), command_handler=command_handler, params=params, qiime_config=qiime_config, run_assign_tax=False, run_align_and_tree=False, prefilter_refseqs_fp=prefilter_refseqs_fp, prefilter_percent_id=prefilter_percent_id, min_otu_size=min_otu_size, step1_otu_map_fp=step1_otu_map_fp, step1_failures_fasta_fp=step1_failures_fasta_fp, parallel=parallel, suppress_step4=suppress_step4, logger=logger, suppress_md5=suppress_md5, suppress_index_page=True, denovo_otu_picking_method=denovo_otu_picking_method, reference_otu_picking_method=reference_otu_picking_method, status_update_callback=status_update_callback, minimum_failure_threshold=minimum_failure_threshold) # perform post-iteration file shuffling whether the previous iteration's # data previously existed or was just computed. # step1 otu map and failures can only be used for the first iteration # as subsequent iterations need to use updated refseqs files step1_otu_map_fp = step1_failures_fasta_fp = None new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir refseqs_fp = new_refseqs_fp otu_table_fps.append('%s/otu_table_mc%d.biom' % (iteration_output_dir, min_otu_size)) repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir) # Merge OTU tables - check for existence first as this step has historically # been a frequent failure, so is sometimes run manually in failed runs.
otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size) if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0): merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\ (','.join(otu_table_fps), otu_table_fp) commands.append([("Merge OTU tables", merge_cmd)]) # Build master rep set final_repset_fp = '%s/rep_set.fna' % output_dir final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # initialize output file names - these differ based on what combination of # taxonomy assignment and alignment/tree building is happening. if run_assign_tax and run_align_and_tree: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) align_and_tree_input_otu_table = otu_table_w_tax_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size) elif run_assign_tax: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) elif run_align_and_tree: align_and_tree_input_otu_table = otu_table_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir, min_otu_size) if run_assign_tax: if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp) else: # remove files from partially completed runs remove_files([otu_table_w_tax_fp], error_on_missing=False) taxonomy_fp = assign_tax( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Add taxa to otu table add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\ (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp) commands.append([("Add taxa to OTU table", add_metadata_cmd)]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if run_align_and_tree: if exists(pynast_failure_filtered_otu_table_fp) and\ getsize(pynast_failure_filtered_otu_table_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." % pynast_failure_filtered_otu_table_fp) else: # remove files from partially completed runs remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False) pynast_failures_fp = align_and_tree( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Build OTU table without PyNAST failures table = load_table(align_and_tree_input_otu_table) filtered_otu_table = filter_otus_from_otu_table( table, get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')), 0, inf, 0, inf, negate_ids_to_keep=True) write_biom_table(filtered_otu_table, pynast_failure_filtered_otu_table_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] logger.close()
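# Usage sketch (commented out): the iterative workflow takes a list of input
# fasta files; each iteration's new_refseqs.fna becomes the reference set for
# the next iteration, and the per-iteration OTU tables are merged at the end.
# Paths are placeholders and load_qiime_config is assumed to come from
# qiime.util.
#
#   iterative_pick_subsampled_open_reference_otus(
#       input_fps=['run1_seqs.fna', 'run2_seqs.fna'],
#       refseqs_fp='refseqs.fna',
#       output_dir='iterative_open_ref/',
#       percent_subsample=0.001,
#       new_ref_set_id='NewRef',
#       command_handler=call_commands_serially,
#       params=parse_qiime_parameters([]),
#       qiime_config=load_qiime_config())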
def run_alpha_rarefaction(otu_table_fp, mapping_fp, output_dir, command_handler, params, qiime_config, tree_fp=None, num_steps=10, parallel=False, logger=None, min_rare_depth=10, max_rare_depth=None, suppress_md5=False, status_update_callback=print_to_stdout, plot_stderr_and_stddev=False, retain_intermediate_files=True): """ Run the data preparation steps of Qiime The steps performed by this function are: 1) Generate rarefied OTU tables; 2) Compute alpha diversity metrics for each rarefied OTU table; 3) Collate alpha diversity results; 4) Generate alpha rarefaction plots. """ # Prepare some variables for the later steps otu_table_dir, otu_table_filename = split(otu_table_fp) otu_table_basename, otu_table_ext = splitext(otu_table_filename) create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp]) if max_rare_depth is None: min_count, max_count, median_count, mean_count, counts_per_sample =\ compute_counts_per_sample_stats( load_table(otu_table_fp)) max_rare_depth = median_count step = int((max_rare_depth - min_rare_depth) / num_steps) or 1 max_rare_depth = int(max_rare_depth) rarefaction_dir = '%s/rarefaction/' % output_dir create_dir(rarefaction_dir) try: params_str = get_params_str(params['multiple_rarefactions']) except KeyError: params_str = '' if parallel: params_str += ' %s' % get_params_str(params['parallel']) # Build the rarefaction command rarefaction_cmd = \ 'parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\ (otu_table_fp, min_rare_depth, max_rare_depth, step, rarefaction_dir, params_str) else: # Build the rarefaction command rarefaction_cmd = \ 'multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\ (otu_table_fp, min_rare_depth, max_rare_depth, step, rarefaction_dir, params_str) commands.append([('Alpha rarefaction', rarefaction_cmd)]) # Prep the alpha diversity command alpha_diversity_dir = '%s/alpha_div/' % output_dir create_dir(alpha_diversity_dir) try: params_str = get_params_str(params['alpha_diversity']) except KeyError: params_str = '' if tree_fp: params_str += ' -t %s' % tree_fp if parallel: params_str += ' %s' % get_params_str(params['parallel']) # Build the alpha diversity command alpha_diversity_cmd = \ "parallel_alpha_diversity.py -T -i %s -o %s %s" %\ (rarefaction_dir, alpha_diversity_dir, params_str) else: # Build the alpha diversity command alpha_diversity_cmd = \ "alpha_diversity.py -i %s -o %s %s" %\ (rarefaction_dir, alpha_diversity_dir, params_str) commands.append([('Alpha diversity on rarefied OTU tables', alpha_diversity_cmd)]) # Prep the alpha diversity collation command alpha_collated_dir = '%s/alpha_div_collated/' % output_dir create_dir(alpha_collated_dir) try: params_str = get_params_str(params['collate_alpha']) except KeyError: params_str = '' # Build the alpha diversity collation command alpha_collated_cmd = 'collate_alpha.py -i %s -o %s %s' %\ (alpha_diversity_dir, alpha_collated_dir, params_str) commands.append([('Collate alpha', alpha_collated_cmd)]) if not retain_intermediate_files: commands.append([ ('Removing intermediate files', 'rm -r %s %s' % (rarefaction_dir, alpha_diversity_dir)) ]) else: commands.append([('Skipping removal of intermediate files.', '')]) # Prep the make rarefaction plot command(s) try: params_str = get_params_str(params['make_rarefaction_plots']) except 
KeyError: params_str = '' if 'std_type' in params.get('make_rarefaction_plots', {}) or not plot_stderr_and_stddev: rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir create_dir(rarefaction_plot_dir) # Build the make rarefaction plot command(s) # for metric in alpha_diversity_metrics: make_rarefaction_plot_cmd =\ 'make_rarefaction_plots.py -i %s -m %s -o %s %s' %\ (alpha_collated_dir, mapping_fp, rarefaction_plot_dir, params_str) commands.append([('Rarefaction plot: All metrics', make_rarefaction_plot_cmd)]) else: rarefaction_plot_dir_stddev = '%s/alpha_rarefaction_plots_stddev/' % output_dir rarefaction_plot_dir_stderr = '%s/alpha_rarefaction_plots_stderr/' % output_dir create_dir(rarefaction_plot_dir_stddev) create_dir(rarefaction_plot_dir_stderr) # Build the make rarefaction plot command(s) # for metric in alpha_diversity_metrics: make_rarefaction_plot_cmd =\ 'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\ (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stddev, params_str) commands.append([('Rarefaction plot: All metrics', make_rarefaction_plot_cmd)]) make_rarefaction_plot_cmd =\ 'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\ (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stderr, params_str) commands.append([('Rarefaction plot: All metrics', make_rarefaction_plot_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success)
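# Usage sketch (commented out): rarefy from a depth of 10 up to the median
# per-sample count (max_rare_depth=None) in 10 steps, then collate and plot all
# alpha diversity metrics. Paths are placeholders and load_qiime_config is
# assumed to come from qiime.util.
#
#   run_alpha_rarefaction('otu_table.biom', 'map.txt',
#                         output_dir='alpha_rarefaction/',
#                         command_handler=call_commands_serially,
#                         params=parse_qiime_parameters([]),
#                         qiime_config=load_qiime_config(),
#                         tree_fp='rep_set.tre',
#                         num_steps=10,
#                         min_rare_depth=10)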
def run_beta_diversity_through_plots(otu_table_fp, mapping_fp, output_dir, command_handler, params, qiime_config, color_by_interesting_fields_only=True, sampling_depth=None, tree_fp=None, parallel=False, logger=None, suppress_emperor_plots=False, suppress_md5=False, status_update_callback=print_to_stdout): """ Compute beta diversity distance matrices, run PCoA, and generate emperor plots The steps performed by this function are: 1) Compute a beta diversity distance matrix for each metric 2) Perform a principal coordinates analysis on the result of step 1 3) Generate an emperor plot for each result of step 2 """ # Prepare some variables for the later steps otu_table_dir, otu_table_filename = split(otu_table_fp) otu_table_basename, otu_table_ext = splitext(otu_table_filename) create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp]) mapping_data, mapping_header, mapping_comments =\ parse_mapping_file(open(mapping_fp, 'U')) # Get the interesting mapping fields to color by -- if none are # interesting, take all of them. Interesting is defined as those # which have greater than one value and fewer values than the number # of samples if color_by_interesting_fields_only: mapping_fields =\ get_interesting_mapping_fields(mapping_data, mapping_header) or\ mapping_header else: mapping_fields = mapping_header mapping_fields = ','.join(mapping_fields) if sampling_depth: # Sample the OTU table at even depth even_sampled_otu_table_fp = '%s/%s_even%d%s' %\ (output_dir, otu_table_basename, sampling_depth, otu_table_ext) single_rarefaction_cmd = \ 'single_rarefaction.py -i %s -o %s -d %d' %\ (otu_table_fp, even_sampled_otu_table_fp, sampling_depth) commands.append([ ('Sample OTU table at %d seqs/sample' % sampling_depth, single_rarefaction_cmd) ]) otu_table_fp = even_sampled_otu_table_fp otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp) otu_table_basename, otu_table_ext = splitext(otu_table_filename) try: beta_diversity_metrics = params['beta_diversity']['metrics'].split(',') except KeyError: beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac'] dm_fps = [] for beta_diversity_metric in beta_diversity_metrics: # Prep the beta-diversity command try: bdiv_params_copy = params['beta_diversity'].copy() except KeyError: bdiv_params_copy = {} try: del bdiv_params_copy['metrics'] except KeyError: pass params_str = get_params_str(bdiv_params_copy) if tree_fp: params_str = '%s -t %s ' % (params_str, tree_fp) # Build the beta-diversity command if parallel: # Grab the parallel-specific parameters try: params_str += get_params_str(params['parallel']) except KeyError: pass beta_div_cmd = 'parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\ (otu_table_fp, output_dir, beta_diversity_metric, params_str) commands.append([('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)]) else: beta_div_cmd = 'beta_diversity.py -i %s -o %s --metrics %s %s' %\ (otu_table_fp, output_dir, beta_diversity_metric, params_str) commands.append([('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)]) orig_beta_div_fp = '%s/%s_%s.txt' % \ (output_dir, beta_diversity_metric, otu_table_basename) beta_div_fp = '%s/%s_dm.txt' % \ (output_dir, beta_diversity_metric) commands.append([ ('Rename distance matrix (%s)' % beta_diversity_metric, 'mv %s %s' %
(orig_beta_div_fp, beta_div_fp)) ]) dm_fps.append((beta_diversity_metric, beta_div_fp)) # Prep the principal coordinates command pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric) try: params_str = get_params_str(params['principal_coordinates']) except KeyError: params_str = '' # Build the principal coordinates command pc_cmd = 'principal_coordinates.py -i %s -o %s %s' %\ (beta_div_fp, pc_fp, params_str) commands.append([('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)]) # Generate emperor plots if not suppress_emperor_plots: # Prep the emperor plots command emperor_dir = '%s/%s_emperor_pcoa_plot/' % (output_dir, beta_diversity_metric) create_dir(emperor_dir) try: params_str = get_params_str(params['make_emperor']) except KeyError: params_str = '' # Build the emperor plots command emperor_command = \ 'make_emperor.py -i %s -o %s -m %s %s' % (pc_fp, emperor_dir, mapping_fp, params_str) commands.append([ ('Make emperor plots (%s)' % beta_diversity_metric, emperor_command) ]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success) return dm_fps
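# Usage sketch (commented out): this variant returns a list of
# (metric, distance_matrix_fp) tuples, which run_core_diversity_analyses below
# consumes (e.g., to drive make_distance_boxplots.py). Paths are placeholders
# and load_qiime_config is assumed to come from qiime.util.
#
#   dm_fps = run_beta_diversity_through_plots(
#       'otu_table.biom', 'map.txt',
#       output_dir='bdiv_even100/',
#       command_handler=call_commands_serially,
#       params=parse_qiime_parameters([]),
#       qiime_config=load_qiime_config(),
#       sampling_depth=100,
#       tree_fp='rep_set.tre')
#   # dm_fps would then look like:
#   # [('weighted_unifrac', 'bdiv_even100/weighted_unifrac_dm.txt'), ...]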
def run_beta_diversity_through_plots(otu_table_fp, mapping_fp, output_dir, command_handler, params, qiime_config, color_by_interesting_fields_only=True, sampling_depth=None, histogram_categories=None, tree_fp=None, parallel=False, logger=None, suppress_3d_plots=False, suppress_2d_plots=False, suppress_md5=False, status_update_callback=print_to_stdout): """ Run the data preparation steps of Qiime The steps performed by this function are: 1) Compute a beta diversity distance matrix; 2) Perform a principal coordinates analysis on the result of Step 1; 3) Generate a 3D prefs file for optimized coloring of continuous variables; 4) Generate a 3D plot for all mapping fields with colors optimized for continuous data; 5) Generate a 3D plot for all mapping fields with colors optimized for discrete data. """ # Prepare some variables for the later steps otu_table_dir, otu_table_filename = split(otu_table_fp) otu_table_basename, otu_table_ext = splitext(otu_table_filename) create_dir(output_dir) commands = [] python_exe_fp = qiime_config['python_exe_fp'] script_dir = get_qiime_scripts_dir() if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp]) mapping_data, mapping_header, mapping_comments =\ parse_mapping_file(open(mapping_fp,'U')) if histogram_categories: invalid_categories = set(histogram_categories) - set(mapping_header) if invalid_categories: raise ValueError( "Invalid histogram categories - these must exactly match " "mapping file column headers: %s" % (' '.join(invalid_categories))) # Get the interesting mapping fields to color by -- if none are # interesting, take all of them.
Interesting is defined as those # which have greater than one value and fewer values than the number # of samples if color_by_interesting_fields_only: mapping_fields =\ get_interesting_mapping_fields(mapping_data, mapping_header) or\ mapping_header else: mapping_fields = mapping_header mapping_fields = ','.join(mapping_fields) if sampling_depth: # Sample the OTU table at even depth even_sampled_otu_table_fp = '%s/%s_even%d%s' %\ (output_dir, otu_table_basename, sampling_depth, otu_table_ext) single_rarefaction_cmd = \ '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\ (python_exe_fp, script_dir, otu_table_fp, even_sampled_otu_table_fp, sampling_depth) commands.append([ ('Sample OTU table at %d seqs/sample' % sampling_depth, single_rarefaction_cmd) ]) otu_table_fp = even_sampled_otu_table_fp otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp) otu_table_basename, otu_table_ext = splitext(otu_table_filename) try: beta_diversity_metrics = params['beta_diversity']['metrics'].split(',') except KeyError: beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac'] # Prep the 3d prefs file generator command prefs_fp = '%s/prefs.txt' % output_dir try: params_str = get_params_str(params['make_prefs_file']) except KeyError: params_str = '' if 'mapping_headers_to_use' not in params.get('make_prefs_file', {}): params_str = '%s --mapping_headers_to_use %s' \ % (params_str, mapping_fields) # Build the 3d prefs file generator command prefs_cmd = \ '%s %s/make_prefs_file.py -m %s -o %s %s' %\ (python_exe_fp, script_dir, mapping_fp, prefs_fp, params_str) commands.append([('Build prefs file', prefs_cmd)]) dm_fps = [] for beta_diversity_metric in beta_diversity_metrics: # Prep the beta-diversity command try: bdiv_params_copy = params['beta_diversity'].copy() except KeyError: bdiv_params_copy = {} try: del bdiv_params_copy['metrics'] except KeyError: pass params_str = get_params_str(bdiv_params_copy) if tree_fp: params_str = '%s -t %s ' % (params_str, tree_fp) # Build the beta-diversity command if parallel: # Grab the parallel-specific parameters try: params_str += get_params_str(params['parallel']) except KeyError: pass beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\ (python_exe_fp, script_dir, otu_table_fp, output_dir, beta_diversity_metric, params_str) commands.append(\ [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)]) else: beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\ (python_exe_fp, script_dir, otu_table_fp, output_dir, beta_diversity_metric, params_str) commands.append(\ [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)]) orig_beta_div_fp = '%s/%s_%s.txt' % \ (output_dir, beta_diversity_metric, otu_table_basename) beta_div_fp = '%s/%s_dm.txt' % \ (output_dir, beta_diversity_metric) commands.append([ ('Rename distance matrix (%s)' % beta_diversity_metric, 'mv %s %s' % (orig_beta_div_fp, beta_div_fp)) ]) dm_fps.append((beta_diversity_metric, beta_div_fp)) # Prep the principal coordinates command pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric) try: params_str = get_params_str(params['principal_coordinates']) except KeyError: params_str = '' # Build the principal coordinates command pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\ (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str) commands.append(\ [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)]) # Generate 3d plots if not suppress_3d_plots: # Prep the continuous-coloring 3d plots command
continuous_3d_dir = '%s/%s_3d_continuous/' %\ (output_dir, beta_diversity_metric) create_dir(continuous_3d_dir) try: params_str = get_params_str(params['make_3d_plots']) except KeyError: params_str = '' # Build the continuous-coloring 3d plots command continuous_3d_command = \ '%s %s/make_3d_plots.py -p %s -i %s -o %s -m %s %s' %\ (python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_3d_dir, mapping_fp, params_str) # Prep the discrete-coloring 3d plots command discrete_3d_dir = '%s/%s_3d_discrete/' %\ (output_dir, beta_diversity_metric) create_dir(discrete_3d_dir) try: params_str = get_params_str(params['make_3d_plots']) except KeyError: params_str = '' # Build the discrete-coloring 3d plots command discrete_3d_command = \ '%s %s/make_3d_plots.py -b "%s" -i %s -o %s -m %s %s' %\ (python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_3d_dir, mapping_fp, params_str) commands.append([\ ('Make 3D plots (continuous coloring, %s)' %\ beta_diversity_metric,continuous_3d_command),\ ('Make 3D plots (discrete coloring, %s)' %\ beta_diversity_metric,discrete_3d_command,)]) # Generate 2d plots if not suppress_2d_plots: # Prep the continuous-coloring 2d plots command continuous_2d_dir = '%s/%s_2d_continuous/' %\ (output_dir, beta_diversity_metric) create_dir(continuous_2d_dir) try: params_str = get_params_str(params['make_2d_plots']) except KeyError: params_str = '' # Build the continuous-coloring 2d plots command continuous_2d_command = \ '%s %s/make_2d_plots.py -p %s -i %s -o %s -m %s %s' %\ (python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_2d_dir, mapping_fp, params_str) # Prep the discrete-coloring 2d plots command discrete_2d_dir = '%s/%s_2d_discrete/' %\ (output_dir, beta_diversity_metric) create_dir(discrete_2d_dir) try: params_str = get_params_str(params['make_2d_plots']) except KeyError: params_str = '' # Build the discrete-coloring 2d plots command discrete_2d_command = \ '%s %s/make_2d_plots.py -b "%s" -i %s -o %s -m %s %s' %\ (python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_2d_dir, mapping_fp, params_str) commands.append([\ ('Make 2D plots (continuous coloring, %s)' %\ beta_diversity_metric,continuous_2d_command),\ ('Make 2D plots (discrete coloring, %s)' %\ beta_diversity_metric,discrete_2d_command,)]) if histogram_categories: # Prep the make_distance_histograms command histograms_dir = '%s/%s_histograms/' %\ (output_dir, beta_diversity_metric) create_dir(histograms_dir) try: params_str = get_params_str(params['make_distance_histograms']) except KeyError: params_str = '' # Build the make_distance_histograms command distance_histograms_command = \ '%s %s/make_distance_histograms.py -d %s -o %s -m %s -f "%s" %s' %\ (python_exe_fp, script_dir, beta_div_fp, histograms_dir, mapping_fp, ','.join(histogram_categories), params_str) commands.append([\ ('Make Distance Histograms (%s)' %\ beta_diversity_metric,distance_histograms_command)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success) return dm_fps
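# Note: the variant above is the older, pre-Emperor form of
# run_beta_diversity_through_plots (the Emperor-based form appears earlier in
# this module); it renders PCoA results with make_3d_plots.py/make_2d_plots.py
# and can add distance histograms. Usage sketch (commented out; 'Treatment' is
# a hypothetical mapping category and load_qiime_config is assumed):
#
#   dm_fps = run_beta_diversity_through_plots(
#       'otu_table.biom', 'map.txt',
#       output_dir='bdiv/',
#       command_handler=call_commands_serially,
#       params=parse_qiime_parameters([]),
#       qiime_config=load_qiime_config(),
#       histogram_categories=['Treatment'],
#       tree_fp='rep_set.tre')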
def run_pick_closed_reference_otus( input_fp, refseqs_fp, output_dir, taxonomy_fp, command_handler, params, qiime_config, assign_taxonomy=False, parallel=False, logger=None, suppress_md5=False, status_update_callback=print_to_stdout): """ Run the data preparation steps of Qiime The steps performed by this function are: 1) Pick OTUs; 2) If assign_taxonomy is True, choose a representative sequence for each OTU and assign taxonomy using a classifier. 3) Build an OTU table with optional predefined taxonomy (if assign_taxonomy=False) or taxonomic assignments from step 2 (if assign_taxonomy=True). """ # confirm that a valid otu picking method was supplied before doing # any work reference_otu_picking_methods = ['blast', 'uclust_ref', 'usearch61_ref', 'usearch_ref', 'sortmerna'] try: otu_picking_method = params['pick_otus']['otu_picking_method'] except KeyError: otu_picking_method = 'uclust_ref' assert otu_picking_method in reference_otu_picking_methods,\ "Invalid OTU picking method supplied: %s. Valid choices are: %s"\ % (otu_picking_method, ' '.join(reference_otu_picking_methods)) # Prepare some variables for the later steps input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp]) # Prep the OTU picking command pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method) otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename) if parallel and (otu_picking_method == 'blast' or otu_picking_method == 'uclust_ref' or otu_picking_method == 'usearch61_ref' or otu_picking_method == 'sortmerna'): # Grab the parallel-specific parameters try: params_str = get_params_str(params['parallel']) except KeyError: params_str = '' # Grab the OTU picker parameters try: # Want to find a cleaner strategy for this: the parallel script # is method-specific, so doesn't take a --otu_picking_method # option. This works for now though. d = params['pick_otus'].copy() if 'otu_picking_method' in d: del d['otu_picking_method'] params_str += ' %s' % get_params_str(d) except KeyError: pass otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method # Build the OTU picking command pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\ (otu_picking_script, input_fp, pick_otu_dir, refseqs_fp, params_str) else: try: params_str = get_params_str(params['pick_otus']) except KeyError: params_str = '' # Since this is reference-based OTU picking we always want to # suppress new clusters -- force it here. params_str += ' --suppress_new_clusters' logger.write( "Forcing --suppress_new_clusters as this is " "closed-reference OTU picking.\n\n") # Build the OTU picking command pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\ (input_fp, pick_otu_dir, refseqs_fp, otu_picking_method, params_str) commands.append([('Pick OTUs', pick_otus_cmd)]) # Assign taxonomy using a taxonomy classifier, if requested by the user. # (Alternatively predefined taxonomic assignments will be used, if provided.)
if assign_taxonomy: # Prep the representative set picking command rep_set_dir = '%s/rep_set/' % output_dir create_dir(rep_set_dir) rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename) rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename) try: params_str = get_params_str(params['pick_rep_set']) except KeyError: params_str = '' # Build the representative set picking command pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\ (otu_fp, input_fp, rep_set_log_fp, rep_set_fp, params_str) commands.append([('Pick representative set', pick_rep_set_cmd)]) # Prep the taxonomy assignment command try: assignment_method = params['assign_taxonomy']['assignment_method'] except KeyError: assignment_method = 'uclust' assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\ (output_dir, assignment_method) taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \ (assign_taxonomy_dir, input_basename) if parallel and (assignment_method == 'rdp' or assignment_method == 'blast' or assignment_method == 'uclust'): # Grab the parallel-specific parameters try: params_str = get_params_str(params['parallel']) except KeyError: params_str = '' # Grab the taxonomy assignment parameters try: # Want to find a cleaner strategy for this: the parallel script # is method-specific, so doesn't take a --assignment_method # option. This works for now though. d = params['assign_taxonomy'].copy() if 'assignment_method' in d: del d['assignment_method'] params_str += ' %s' % get_params_str(d) except KeyError: pass # Build the parallel taxonomy assignment command assign_taxonomy_cmd = \ 'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\ (assignment_method, rep_set_fp, assign_taxonomy_dir, params_str) else: try: params_str = get_params_str(params['assign_taxonomy']) except KeyError: params_str = '' # Build the taxonomy assignment command assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\ (assign_taxonomy_dir, rep_set_fp, params_str) commands.append([('Assign taxonomy', assign_taxonomy_cmd)]) # Prep the OTU table building command otu_table_fp = '%s/otu_table.biom' % output_dir try: params_str = get_params_str(params['make_otu_table']) except KeyError: params_str = '' # If assign_taxonomy is True, this will be the path to the taxonomic # assignment results. If assign_taxonomy is False this will be either # the precomputed taxonomic assignments that the user passed in, # or None. if taxonomy_fp: taxonomy_str = '-t %s' % taxonomy_fp else: taxonomy_str = '' # Build the OTU table building command make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' %\ (otu_fp, taxonomy_str, otu_table_fp, params_str) commands.append([('Make OTU table', make_otu_table_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success)
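# Usage sketch (commented out): closed-reference OTU picking against a
# reference collection with a precomputed reference taxonomy; with
# assign_taxonomy left at its default of False, taxonomy_fp is passed straight
# through to make_otu_table.py. Paths are placeholders and load_qiime_config is
# assumed to come from qiime.util.
#
#   run_pick_closed_reference_otus('seqs.fna', 'refseqs.fna',
#                                  output_dir='closed_ref_otus/',
#                                  taxonomy_fp='ref_taxonomy.txt',
#                                  command_handler=call_commands_serially,
#                                  params=parse_qiime_parameters([]),
#                                  qiime_config=load_qiime_config())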
def run_core_diversity_analyses( biom_fp, mapping_fp, sampling_depth, output_dir, qiime_config, command_handler=call_commands_serially, tree_fp=None, params=None, categories=None, arare_min_rare_depth=10, arare_num_steps=10, parallel=False, suppress_taxa_summary=False, suppress_beta_diversity=False, suppress_alpha_diversity=False, suppress_otu_category_significance=False, status_update_callback=print_to_stdout): """ Run a suite of core diversity analyses (beta diversity, alpha rarefaction, taxa summaries, and OTU category significance) on an input BIOM table. """ if categories is not None: # Validate categories provided by the user mapping_data, mapping_comments = \ parse_mapping_file_to_dict(open(mapping_fp,'U')) metadata_map = MetadataMap(mapping_data, mapping_comments) for c in categories: if c not in metadata_map.CategoryNames: raise ValueError(("Category '%s' is not a column header " "in your mapping file. " "Categories are case and white space sensitive. Valid " "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))) if metadata_map.hasSingleCategoryValue(c): raise ValueError(("Category '%s' contains only one value. " "Categories analyzed here require at least two values." % c)) else: categories = [] # prep some variables if params is None: params = parse_qiime_parameters([]) create_dir(output_dir) index_fp = '%s/index.html' % output_dir index_links = [] commands = [] # begin logging old_log_fps = glob(join(output_dir,'log_20*txt')) log_fp = generate_log_fp(output_dir) index_links.append(('Master run log',log_fp,_index_headers['run_summary'])) for old_log_fp in old_log_fps: index_links.append(('Previous run log',old_log_fp,_index_headers['run_summary'])) logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config) input_fps = [biom_fp,mapping_fp] if tree_fp is not None: input_fps.append(tree_fp) log_input_md5s(logger,input_fps) # run 'biom summarize-table' on input BIOM table try: params_str = get_params_str(params['biom-summarize-table']) except KeyError: params_str = '' biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir if not exists(biom_table_stats_output_fp): biom_table_summary_cmd = \ "biom summarize-table -i %s -o %s --suppress-md5 %s" % \ (biom_fp, biom_table_stats_output_fp,params_str) commands.append([('Generate BIOM table summary', biom_table_summary_cmd)]) else: logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" \ % biom_table_stats_output_fp) index_links.append(('BIOM table statistics', biom_table_stats_output_fp, _index_headers['run_summary'])) # filter samples with fewer observations than the requested sampling_depth. # since these get filtered for some analyses (eg beta diversity after # even sampling) it's useful to filter them here so they're filtered # from all analyses. filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth) if not exists(filtered_biom_fp): filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\ (biom_fp,filtered_biom_fp,sampling_depth) commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth, filter_samples_cmd)]) else: logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" \ % filtered_biom_fp) biom_fp = filtered_biom_fp # run initial commands and reset the command list if len(commands) > 0: command_handler(commands, status_update_callback, logger, close_logger_on_success=False) commands = [] if not suppress_beta_diversity: bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth) # Need to check for the existence of any distance matrices, since the user # can select which will be generated.
existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir) if len(existing_dm_fps) == 0: even_dm_fps = run_beta_diversity_through_plots( otu_table_fp=biom_fp, mapping_fp=mapping_fp, output_dir=bdiv_even_output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, sampling_depth=sampling_depth, tree_fp=tree_fp, parallel=parallel, logger=logger, suppress_md5=True, status_update_callback=status_update_callback) else: logger.write("Skipping beta_diversity_through_plots.py as %s exist(s).\n\n" \ % ', '.join(existing_dm_fps)) even_dm_fps = [(split(fp)[1][:-len('_dm.txt')], fp) for fp in existing_dm_fps] # slice off the exact '_dm.txt' suffix (str.strip would drop arbitrary trailing characters) # Get make_distance_boxplots parameters try: params_str = get_params_str(params['make_distance_boxplots']) except KeyError: params_str = '' for bdiv_metric, dm_fp in even_dm_fps: for category in categories: boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric) plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir,category) stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir,category) if not exists(plot_output_fp): boxplots_cmd = \ 'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\ (dm_fp, category, boxplots_output_dir, mapping_fp, params_str) commands.append([('Boxplots (%s)' % category, boxplots_cmd)]) else: logger.write("Skipping make_distance_boxplots.py for %s as %s exists.\n\n" \ % (category, plot_output_fp)) index_links.append(('Distance boxplots (%s)' % bdiv_metric, plot_output_fp, _index_headers['beta_diversity_even'] % sampling_depth)) index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric, stats_output_fp, _index_headers['beta_diversity_even'] % sampling_depth)) index_links.append(('PCoA plot (%s)' % bdiv_metric, '%s/%s_emperor_pcoa_plot/index.html' % \ (bdiv_even_output_dir,bdiv_metric), _index_headers['beta_diversity_even'] % sampling_depth)) index_links.append(('Distance matrix (%s)' % bdiv_metric, '%s/%s_dm.txt' % \ (bdiv_even_output_dir,bdiv_metric), _index_headers['beta_diversity_even'] % sampling_depth)) index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric, '%s/%s_pc.txt' % \ (bdiv_even_output_dir,bdiv_metric), _index_headers['beta_diversity_even'] % sampling_depth)) if not suppress_alpha_diversity: ## Alpha rarefaction workflow arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth) rarefaction_plots_output_fp = \ '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir if not exists(rarefaction_plots_output_fp): run_alpha_rarefaction( otu_table_fp=biom_fp, mapping_fp=mapping_fp, output_dir=arare_full_output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, tree_fp=tree_fp, num_steps=arare_num_steps, parallel=parallel, logger=logger, min_rare_depth=arare_min_rare_depth, max_rare_depth=sampling_depth, suppress_md5=True, status_update_callback=status_update_callback) else: logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" \ % rarefaction_plots_output_fp) index_links.append(('Alpha rarefaction plots', rarefaction_plots_output_fp, _index_headers['alpha_diversity'])) collated_alpha_diversity_fps = \ glob('%s/alpha_div_collated/*txt' % arare_full_output_dir) try: params_str = get_params_str(params['compare_alpha_diversity']) except KeyError: params_str = '' for category in categories: for collated_alpha_diversity_fp in collated_alpha_diversity_fps: alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0] alpha_comparison_output_fp = '%s/%s_%s.txt' % \
(arare_full_output_dir,category,alpha_metric) if not exists(alpha_comparison_output_fp): compare_alpha_cmd = \ 'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\ (collated_alpha_diversity_fp, mapping_fp, category, alpha_comparison_output_fp, params_str) commands.append([('Compare alpha diversity (%s, %s)' %\ (category,alpha_metric), compare_alpha_cmd)]) else: logger.write("Skipping compare_alpha_diversity.py for %s as %s exists.\n\n" \ % (category, alpha_comparison_output_fp)) index_links.append( ('Alpha diversity statistics (%s, %s)' % (category,alpha_metric), alpha_comparison_output_fp, _index_headers['alpha_diversity'])) if not suppress_taxa_summary: taxa_plots_output_dir = '%s/taxa_plots/' % output_dir # need to check for existence of any html files, since the user can # select only certain ones to be generated existing_taxa_plot_html_fps = glob(join(output_dir,'taxa_summary_plots','*.html')) if len(existing_taxa_plot_html_fps) == 0: run_summarize_taxa_through_plots( otu_table_fp=biom_fp, mapping_fp=mapping_fp, output_dir=taxa_plots_output_dir, mapping_cat=None, sort=True, command_handler=command_handler, params=params, qiime_config=qiime_config, logger=logger, suppress_md5=True, status_update_callback=status_update_callback) else: logger.write("Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n" \ % ', '.join(existing_taxa_plot_html_fps)) index_links.append(('Taxa summary bar plots', '%s/taxa_summary_plots/bar_charts.html'\ % taxa_plots_output_dir, _index_headers['taxa_summary'])) index_links.append(('Taxa summary area plots', '%s/taxa_summary_plots/area_charts.html'\ % taxa_plots_output_dir, _index_headers['taxa_summary'])) for category in categories: taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,category) # need to check for existence of any html files, since the user can # select only certain ones to be generated existing_taxa_plot_html_fps = glob('%s/taxa_summary_plots/*.html' % taxa_plots_output_dir) if len(existing_taxa_plot_html_fps) == 0: run_summarize_taxa_through_plots( otu_table_fp=biom_fp, mapping_fp=mapping_fp, output_dir=taxa_plots_output_dir, mapping_cat=category, sort=True, command_handler=command_handler, params=params, qiime_config=qiime_config, logger=logger, suppress_md5=True, status_update_callback=status_update_callback) else: logger.write("Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n" \ % (category, ', '.join(existing_taxa_plot_html_fps))) index_links.append(('Taxa summary bar plots', '%s/taxa_summary_plots/bar_charts.html'\ % taxa_plots_output_dir, _index_headers['taxa_summary_categorical'] % category)) index_links.append(('Taxa summary area plots', '%s/taxa_summary_plots/area_charts.html'\ % taxa_plots_output_dir, _index_headers['taxa_summary_categorical'] % category)) if not suppress_otu_category_significance: try: params_str = get_params_str(params['otu_category_significance']) except KeyError: params_str = '' # OTU category significance for category in categories: category_signifance_fp = \ '%s/category_significance_%s.txt' % (output_dir, category) if not exists(category_signifance_fp): # Build the OTU cateogry significance command category_significance_cmd = \ 'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\ (biom_fp, mapping_fp, category, category_signifance_fp, params_str) commands.append([('OTU category significance (%s)' % category, category_significance_cmd)]) else: logger.write("Skipping otu_category_significance.py for %s as %s exists.\n\n" \ % (category, 
category_signifance_fp)) index_links.append(('Category significance (%s)' % category, category_signifance_fp, _index_headers['otu_category_sig'])) filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp if not exists(filtered_biom_gzip_fp): commands.append([('Compress the filtered BIOM table','gzip %s' % filtered_biom_fp)]) index_links.append(('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth, filtered_biom_gzip_fp, _index_headers['run_summary'])) else: logger.write("Skipping compressing of filtered BIOM table as %s exists.\n\n" \ % filtered_biom_gzip_fp) if len(commands) > 0: command_handler(commands, status_update_callback, logger) else: logger.close() generate_index_page(index_links,index_fp)
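# --- Illustrative sketch (not part of the workflow API) ---
# index_links, as built above, is a flat list of
# (link text, target filepath, section header) tuples; generate_index_page()
# groups the links by their section header when it renders index.html.
# A minimal, self-contained restatement of that grouping (link texts and
# paths below are hypothetical):
def _example_group_index_links():
    """Group (text, fp, header) tuples by header, as the index page does."""
    from collections import defaultdict
    index_links = [
        ('Master run log', 'log_20140101.txt', 'Run summary data'),
        ('BIOM table statistics', 'biom_table_summary.txt', 'Run summary data'),
        ('Alpha rarefaction plots', 'arare_max100/rarefaction_plots.html',
         'Alpha diversity results'),
    ]
    grouped = defaultdict(list)
    for text, fp, header in index_links:
        grouped[header].append((text, fp))
    return dict(grouped)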
def run_summarize_taxa_through_plots(otu_table_fp, mapping_fp, output_dir,
                                     mapping_cat, sort, command_handler,
                                     params, qiime_config, logger=None,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation for summarizing taxonomies and generating plots

        The steps performed by this function are:
          1) Summarize OTUs by category
          2) Summarize taxonomy
          3) Plot the taxonomy summary
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp])

    # if the mapping category was not passed via the command line,
    # check whether it is passed in the params file
    if not mapping_cat:
        try:
            mapping_cat = params['collapse_samples']['collapse_fields']
        except KeyError:
            mapping_cat = None

    try:
        params_str = get_params_str(params['collapse_samples'])
        # Need to remove the mapping category option, since it is defined
        # above. Using this approach since we don't want to change the
        # params dict.
        split_params = params_str.split('--')
        updated_params_str = []
        for i in split_params:
            if not i.startswith('collapse_fields'):
                updated_params_str.append(i)
        params_str = '--'.join(updated_params_str)
    except KeyError:
        params_str = ''

    if mapping_cat:
        base_filename = mapping_cat.replace(' ', '-').replace(',', '')
        output_biom_fp = join(output_dir, '%s_otu_table.biom' % base_filename)
        output_map_fp = join(output_dir, '%s_map.txt' % base_filename)
        # Build the collapse samples command
        collapse_samples_cmd = \
            "collapse_samples.py -m %s -b %s --output_biom_fp %s --output_mapping_fp %s --collapse_fields '%s' %s" %\
            (mapping_fp, otu_table_fp, output_biom_fp, output_map_fp,
             mapping_cat, params_str)
        commands.append([('Collapse samples in OTU table by categories',
                          collapse_samples_cmd)])
        otu_table_fp = output_biom_fp

    # Build the sort OTU table command
    if sort:
        # Prep the sort_otu_table command
        try:
            params_str = get_params_str(params['sort_otu_table'])
        except KeyError:
            params_str = ''
        # define the output otu table
        sorted_fp = join(output_dir,
                         splitext(split(otu_table_fp)[-1])[0] + '_sorted.biom')
        if mapping_cat or params_str == '':
            # for this case we don't have a collapsed mapping file so must
            # handle separately
            sort_otu_table_cmd = \
                "sort_otu_table.py -i %s -o %s" % (otu_table_fp, sorted_fp)
        else:
            sort_otu_table_cmd = \
                "sort_otu_table.py -i %s -o %s -m %s %s" %\
                (otu_table_fp, sorted_fp, mapping_fp, params_str)
        commands.append([('Sort OTU Table', sort_otu_table_cmd)])
        # redefine otu_table_fp to point at the sorted table
        otu_table_fp = sorted_fp

    # Prep the summarize taxonomy command
    try:
        params_str = get_params_str(params['summarize_taxa'])
    except KeyError:
        params_str = ''

    try:
        sum_taxa_levels = params['summarize_taxa']['level']
    except KeyError:
        sum_taxa_levels = None

    # Build the summarize taxonomy command
    summarize_taxa_cmd = 'summarize_taxa.py -i %s -o %s %s' %\
        (otu_table_fp, output_dir, params_str)
    commands.append([('Summarize Taxonomy', summarize_taxa_cmd)])

    sum_taxa_fps = []
    basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
    if sum_taxa_levels:
        for i in sum_taxa_levels.split(','):
            sum_taxa_fps.append(basename + '_L%s.txt' % str(i))
    else:
        # these are the default levels from summarize_taxa.py, but we cannot
        # import the script to get these values
        for i in [2, 3, 4, 5, 6]:
            sum_taxa_fps.append(basename + '_L%s.txt' % str(i))

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    create_dir(taxa_summary_plots_dir)

    try:
        params_str = get_params_str(params['plot_taxa_summary'])
    except KeyError:
        params_str = ''
    # Build the plot taxa summary plot command(s)
    plot_taxa_summary_cmd =\
        'plot_taxa_summary.py -i %s -o %s %s' %\
        (','.join(sum_taxa_fps), taxa_summary_plots_dir, params_str)
    commands.append([('Plot Taxonomy Summary', plot_taxa_summary_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=close_logger_on_success)
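# --- Illustrative sketch (not part of the workflow API) ---
# sum_taxa_fps above is derived purely from the OTU table filename, the
# output directory, and the taxonomic levels, so the expected
# summarize_taxa.py output paths can be predicted before the command runs.
# The same naming scheme, self-contained (paths are hypothetical):
def _example_sum_taxa_fps(otu_table_fp='taxa_plots/otu_table_sorted.biom',
                          output_dir='taxa_plots', levels='2,3,4,5,6'):
    from os.path import join, split, splitext
    basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
    # e.g. ['taxa_plots/otu_table_sorted_L2.txt', ...,
    #       'taxa_plots/otu_table_sorted_L6.txt']
    return [basename + '_L%s.txt' % level for level in levels.split(',')]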
def run_ampliconnoise(mapping_fp,
                      output_dir,
                      command_handler,
                      params,
                      qiime_config,
                      logger=None,
                      status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228,
                      chimera_beta=0.6200,
                      sff_txt_fp=None,
                      numnodes=2,
                      suppress_perseus=True,
                      output_filepath=None,
                      platform="flx",
                      seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline

        The steps performed by this function are:
          1) Split the input sff.txt file into one file per sample
          2) Run the scripts required for PyroNoise
          3) Run the scripts required for SeqNoise
          4) Run the scripts required for Perseus (chimera removal)
          5) Merge the output files into one file similar to the output of
             split_libraries.py

        output_filepath should be absolute
        seqnoise_resolution should be a string
        The environment variable PYRO_LOOKUP_FILE must be set correctly.
        Thus be careful passing command handlers that don't spawn child
        processes, as they may not inherit the correct environment variable
        setting.
    """
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, "U"))
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == "flx":
            seqnoise_resolution = "30.0"
        elif platform == "titanium":
            seqnoise_resolution = "25.0"
        else:
            raise RuntimeError("seqnoise_resolution not set, and no "
                               "default for platform " + platform)

    if truncate_len is None:
        if platform == "flx":
            truncate_len = "220"
        elif platform == "titanium":
            truncate_len = "400"
        else:
            raise RuntimeError("truncate_len not set, and no "
                               "default for platform " + platform)

    # these are filenames minus the extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index("SampleID")])
        bc_seqs.append(map_data[i][headers.index("BarcodeSequence")])
        primer = map_data[i][headers.index("LinkerPrimerSequence")]
        for char, bases in DNASequence.iupac_degeneracies().iteritems():
            primer = primer.replace(char, "[" + "".join(bases) + "]")
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError("Error: only one primer per mapping file is supported.")
    one_primer = primer_seqs[0]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir, "map.csv"), "w")
    for i in range(len(sample_names)):
        fh.write(sample_names[i] + "," + bc_seqs[i] + "\n")
    fh.close()

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to the output file
    post_pyro_tail = "_" + truncate_len
    if suppress_perseus:
        fasta_result_names = [sample_name + post_pyro_tail + "_seqnoise_cd.fa"
                              for sample_name in sample_names]
    else:
        fasta_result_names = [sample_name + "_Good.fa"
                              for sample_name in sample_names]

    cmd = "cd " + output_dir  # see also os.chdir above
    commands.append([("change to output dir", cmd)])

    cmd = "echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt"
    commands.append([("confirm pyro lookup filepath environment variable",
                      cmd)])

    cmd = ("SplitKeys.pl " + one_primer + " map.csv < " +
           os.path.join(called_dir, sff_txt_fp) +
           " > splitkeys_log.txt 2> unassigned.fna")
    commands.append([("split sff.txt via barcodes (keys)", cmd)])

    for i, sample_name in enumerate(sample_names):
        # Build the flowgram cleaning command
        if platform == "flx":
            cmd = ("Clean360.pl " + one_primer + " " + sample_name + " < " +
                   sample_name + ".raw")
            commands.append([("clean flows " + sample_name, cmd)])
            # these run through the whole sff file once per sample, I think
            # cmd = "FlowsFA.pl " + primer_seqs[i] + ' ' + sample_name + ' < ' +\
            #     os.path.join(called_dir, sff_txt_fp)
            # commands.append([('extract flows ' + sample_name, cmd)])
        elif platform == "titanium":
            cmd = ("CleanMinMax.pl " + one_primer + " " + sample_name + " < " +
                   sample_name + ".raw")
            commands.append([("clean flows " + sample_name, cmd)])
            # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' ' + sample_name + ' < ' +\
            #     os.path.join(called_dir, sff_txt_fp)
            # commands.append([('extract flows ' + sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = ("mpirun -np " + str(numnodes) + " PyroDist -in " +
               sample_name + ".dat -out " + sample_name +
               " > " + sample_name + ".pdout")
        commands.append([("pyrodist " + sample_name, cmd)])

        cmd = ("FCluster -in " + sample_name + ".fdist -out " + sample_name +
               " > " + sample_name + ".fcout")
        commands.append([("fcluster pyrodist " + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = ("mpirun -np " + str(numnodes) + " PyroNoise -din " +
               sample_name + ".dat -out " + sample_name + "_pyronoise " +
               "-lin " + sample_name + ".list -s 60.0 -c 0.01 > " +
               sample_name + "_pyronoise.pnout")
        commands.append([("pyronoise " + sample_name, cmd)])

        cmd = ("Parse.pl " + bc_seqs[i] + one_primer + " " + truncate_len +
               " < " + sample_name + "_pyronoise_cd.fa" + " > " +
               sample_name + "_" + truncate_len + ".fa")
        commands.append([("truncate " + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = ("mpirun -np " + str(numnodes) + " SeqDist -in " +
               sample_name + post_pyro_tail + ".fa > " +
               sample_name + post_pyro_tail + ".seqdist")
        commands.append([("seqdist " + sample_name, cmd)])

        cmd = ("FCluster -in " + sample_name + post_pyro_tail +
               ".seqdist -out " + sample_name + post_pyro_tail +
               "fcl > " + sample_name + post_pyro_tail + ".fcout")
        commands.append([("fcluster seqdist " + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0
        # -c 0.08 > PC.354_pyronoise_cd.snout
        cmd = ("mpirun -np " + str(numnodes) + " SeqNoise -in " +
               sample_name + post_pyro_tail + ".fa -din " +
               sample_name + post_pyro_tail + ".seqdist -out " +
               sample_name + post_pyro_tail + "_seqnoise -lin " +
               sample_name + post_pyro_tail + "fcl.list -min " +
               sample_name + "_pyronoise" + ".mapping -s " +
               seqnoise_resolution + " -c 0.08 > " +
               sample_name + post_pyro_tail + ".snout")
        commands.append([("seqnoise " + sample_name, cmd)])

        if not suppress_perseus:
            cmd = ("Perseus -sin " + sample_name + post_pyro_tail +
                   "_seqnoise_cd.fa > " + sample_name + ".per")
            commands.append([("Perseus " + sample_name, cmd)])

            cmd = ("Class.pl " + sample_name + ".per " +
                   str(chimera_alpha) + " " + str(chimera_beta) +
                   " > " + sample_name + ".class")
            commands.append([("Class.pl " + sample_name, cmd)])

            cmd = ("FilterGoodClass.pl " + sample_name + post_pyro_tail +
                   "_seqnoise_cd.fa " + sample_name + ".class 0.5 > " +
                   sample_name + "_Chi.fa 2> " + sample_name + "_Good.fa")
            commands.append([("FilterGoodClass " + sample_name, cmd)])

        cmd = "unweight_fasta.py -i %s -o %s -l %s" %\
            (fasta_result_names[i], sample_name + "_unw.fna", sample_name)
        commands.append([("unweight fasta " + sample_name, cmd)])

    cmd = ("cat " +
           " ".join([sample_name + "_unw.fna" for sample_name in sample_names]) +
           " > " + output_filepath)  # this should be an abs filepath
    commands.append([("cat into one fasta file", cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=close_logger_on_success)
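# --- Illustrative sketch (not part of the workflow API) ---
# run_ampliconnoise() rewrites each IUPAC degenerate base in the primer as a
# character class (e.g. 'W' -> '[AT]') so SplitKeys.pl receives a plain
# regular expression. The same transformation with an inlined degeneracy
# table, so this example carries no DNASequence dependency (the table below
# is the standard IUPAC set, written out by hand):
def _example_expand_degenerate_primer(primer='ATTAGAWACCCBNGTAGTCC'):
    iupac = {'R': 'AG', 'Y': 'CT', 'M': 'AC', 'K': 'GT', 'W': 'AT',
             'S': 'CG', 'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG',
             'N': 'ACGT'}
    for char, bases in iupac.items():
        primer = primer.replace(char, '[' + bases + ']')
    # e.g. 'ATTAGAWACCCBNGTAGTCC' -> 'ATTAGA[AT]ACCC[CGT][ACGT]GTAGTCC'
    return primer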
def run_alpha_rarefaction(otu_table_fp, mapping_fp, output_dir,
                          command_handler, params, qiime_config,
                          tree_fp=None, num_steps=10, parallel=False,
                          logger=None, min_rare_depth=10, max_rare_depth=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """ Run the data preparation steps of QIIME

        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    if max_rare_depth is None:
        min_count, max_count, median_count, mean_count, counts_per_sample =\
            compute_counts_per_sample_stats(
                parse_biom_table(open(otu_table_fp, 'U')))
        max_rare_depth = median_count
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)

    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the rarefaction command
        rarefaction_cmd = \
            '%s %s/parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
            (python_exe_fp, script_dir, otu_table_fp, min_rare_depth,
             max_rare_depth, step, rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
            '%s %s/multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
            (python_exe_fp, script_dir, otu_table_fp, min_rare_depth,
             max_rare_depth, step, rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])

    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "%s %s/parallel_alpha_diversity.py -T -i %s -o %s %s" %\
            (python_exe_fp, script_dir, rarefaction_dir,
             alpha_diversity_dir, params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "%s %s/alpha_diversity.py -i %s -o %s %s" %\
            (python_exe_fp, script_dir, rarefaction_dir,
             alpha_diversity_dir, params_str)
    commands.append(
        [('Alpha diversity on rarefied OTU tables', alpha_diversity_cmd)])

    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = '%s %s/collate_alpha.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, alpha_diversity_dir,
         alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])

    if not retain_intermediate_files:
        commands.append([('Removing intermediate files',
                          'rm -r %s %s' % (rarefaction_dir,
                                           alpha_diversity_dir))])
    else:
        commands.append([('Skipping removal of intermediate files.', '')])

    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''

    # use .get() so a missing 'make_rarefaction_plots' section doesn't raise
    # a KeyError here
    if 'std_type' in params.get('make_rarefaction_plots', {}) or \
       not plot_stderr_and_stddev:
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)
        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])
    else:
        rarefaction_plot_dir_stddev = \
            '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = \
            '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)
        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir_stddev, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir_stderr, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=close_logger_on_success)
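# --- Illustrative sketch (not part of the workflow API) ---
# The rarefaction depths generated above run from min_rare_depth to
# max_rare_depth in num_steps increments, with the step size floored at 1
# (the `or 1` guard). Worked example with made-up depths:
def _example_rarefaction_depths(min_rare_depth=10, max_rare_depth=146,
                                num_steps=10):
    # int((146 - 10) / 10) -> 13, so the depths are 10, 23, 36, ..., 140
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    return range(min_rare_depth, int(max_rare_depth) + 1, step)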
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout,
                                        minimum_failure_threshold=100000):
    """ Run the data preparation steps of QIIME

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the representative set
            from step 4 as the reference set.
    """
    # for now only allowing uclust/usearch61/sumaclust for de novo OTU
    # picking and uclust_ref/usearch61_ref/sortmerna for reference-based
    # OTU picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref',
                                             'sortmerna']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,
                       [input_fp, refseqs_fp, step1_otu_map_fp,
                        step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = '%s/step1_otus' % output_dir
        step1_otu_map_fp = '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(input_fp,
                                                 step1_dir,
                                                 reference_otu_picking_method,
                                                 refseqs_fp,
                                                 parallel,
                                                 params,
                                                 logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count the number of sequences in the step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # if the number of failure sequences is greater than the threshold,
    # continue to steps 2, 3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:
        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using the API\n'
                     'python -c "import qiime; qiime.util.subsample_fasta'
                     '(\'%s\', \'%s\', \'%f\')"\n\n' %
                     (abspath(step1_failures_fasta_fp),
                      abspath(step2_input_fasta_fp),
                      percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                        step3_dir,
                                        reference_otu_picking_method,
                                        step2_repset_fasta_fp,
                                        parallel,
                                        params,
                                        logger)
        commands.append([('Pick reference OTUs using de novo rep set',
                          step3_cmd)])

    index_links.append(
        ('Final map of OTU identifier to sequence identifiers (i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp, step3_failures_list_fp,
                 step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])
            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps. Note that we are explicitly using the '>'
        # operator, otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that was
        # previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step4_rep_set_cmd)])
    else:
        # Merge the otu maps. Note that we are explicitly using the '>'
        # operator, otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that was
        # previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([
            ('Move final failures file to top-level directory',
             'mv %s %s/final_failures.txt' % (failures_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifiers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using the API\n'
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map'
                 '(\'%s\', \'%s\', \'%d\')"\n\n' %
                 (abspath(otu_fp), abspath(otu_no_singletons_fp),
                  min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus '
         'input reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write representative sequences of non-singleton otus from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write representative sequences of non-singleton otus '
                 'from step1 to the final rep set file: %s\n\n'
                 % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseqs file\n'
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and
    # write those corresponding to non-singleton otus to the final
    # representative set file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write representative sequences of non-singleton otus '
                     'from step 2 and step 4 to the final representative set '
                     'and the new reference set (%s and %s respectively)\n\n'
                     % (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write representative sequences of non-singleton otus '
                     'from step 4 to the final representative set and the '
                     'new reference set (%s and %s respectively)\n\n'
                     % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'including OTU taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'sequences that fail to align with PyNAST and including OTU '
             'taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'including OTU taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and '
             'sequences that fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
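# --- Illustrative sketch (not part of the workflow API) ---
# filter_otus_from_otu_map(), used above, drops OTUs whose sequence count
# falls below min_otu_size before the OTU table is built. A minimal,
# dependency-free version of the same filter over an in-memory OTU map
# (OTU id -> list of sequence ids):
def _example_filter_otu_map(otu_map, min_otu_size=2):
    # e.g. {'otu1': ['s1', 's2'], 'otu2': ['s3']} keeps only otu1 when
    # min_otu_size is 2, which is how singletons are discarded.
    return dict((otu_id, seq_ids)
                for otu_id, seq_ids in otu_map.items()
                if len(seq_ids) >= min_otu_size)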
def run_beta_diversity_through_plots(otu_table_fp, mapping_fp, output_dir,
                                     command_handler, params, qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None, tree_fp=None,
                                     parallel=False, logger=None,
                                     suppress_emperor_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Compute beta diversity distance matrices, run PCoA, and generate emperor plots

        The steps performed by this function are:
          1) Compute a beta diversity distance matrix for each metric
          2) Perform a principal coordinates analysis on the result of step 1
          3) Generate an emperor plot for each result of step 2
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    mapping_data, mapping_header, mapping_comments =\
        parse_mapping_file(open(mapping_fp, 'U'))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
            get_interesting_mapping_fields(mapping_data, mapping_header) or\
            mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
            (output_dir, otu_table_basename,
             sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
            '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
            (python_exe_fp, script_dir, otu_table_fp,
             even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)])
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:
        # Prep the beta-diversity command
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass
        params_str = get_params_str(bdiv_params_copy)
        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
                (python_exe_fp, script_dir, otu_table_fp,
                 output_dir, beta_diversity_metric, params_str)
            commands.append(
                [('Beta Diversity (%s)' % beta_diversity_metric,
                  beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
                (python_exe_fp, script_dir, otu_table_fp,
                 output_dir, beta_diversity_metric, params_str)
            commands.append(
                [('Beta Diversity (%s)' % beta_diversity_metric,
                  beta_div_cmd)])

        orig_beta_div_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
            (output_dir, beta_diversity_metric)
        commands.append([('Rename distance matrix (%s)' % beta_diversity_metric,
                          'mv %s %s' % (orig_beta_div_fp, beta_div_fp))])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(
            [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])

        # Generate emperor plots
        if not suppress_emperor_plots:
            # Prep the emperor plots command
            emperor_dir = '%s/%s_emperor_pcoa_plot/' % (output_dir,
                                                        beta_diversity_metric)
            create_dir(emperor_dir)
            try:
                params_str = get_params_str(params['make_emperor'])
            except KeyError:
                params_str = ''
            # Build the emperor plots command
            emperor_command = \
                'make_emperor.py -i %s -o %s -m %s %s' % (pc_fp,
                                                          emperor_dir,
                                                          mapping_fp,
                                                          params_str)
            commands.append([('Make emperor plots (%s)' % beta_diversity_metric,
                              emperor_command)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
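# --- Illustrative sketch (not part of the workflow API) ---
# get_interesting_mapping_fields(), used above, treats a metadata column as
# interesting when it has more than one value but fewer values than there
# are samples (i.e. it groups samples without being unique per sample).
# A dependency-free restatement of that heuristic:
def _example_interesting_fields(mapping_data, mapping_header):
    # mapping_data: list of rows; mapping_header: list of column names
    n_samples = len(mapping_data)
    interesting = []
    for i, field in enumerate(mapping_header):
        values = set(row[i] for row in mapping_data)
        if 1 < len(values) < n_samples:
            interesting.append(field)
    return interesting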
def run_pick_closed_reference_otus(input_fp, refseqs_fp, output_dir,
                                   taxonomy_fp, command_handler, params,
                                   qiime_config, assign_taxonomy=False,
                                   parallel=False, logger=None,
                                   suppress_md5=False,
                                   status_update_callback=print_to_stdout):
    """ Run the data preparation steps of QIIME

        The steps performed by this function are:
          1) Pick OTUs;
          2) If assign_taxonomy is True, choose a representative sequence
             for each OTU and assign taxonomy using a classifier.
          3) Build an OTU table with optional predefined taxonomy (if
             assign_taxonomy=False) or taxonomic assignments from step 2
             (if assign_taxonomy=True).
    """
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast', 'uclust_ref', 'usearch61_ref',
                                     'usearch_ref', 'sortmerna']

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and (otu_picking_method == 'blast' or
                     otu_picking_method == 'uclust_ref' or
                     otu_picking_method == 'usearch61_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take an --otu_picking_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
            (otu_picking_script, input_fp, pick_otu_dir,
             refseqs_fp, params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is "
                     "closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (input_fp, pick_otu_dir, refseqs_fp,
             otu_picking_method, params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Assign taxonomy using a taxonomy classifier, if requested by the user.
    # (Alternatively, predefined taxonomic assignments will be used, if
    # provided.)
    if assign_taxonomy:
        # Prep the representative set picking command
        rep_set_dir = '%s/rep_set/' % output_dir
        create_dir(rep_set_dir)
        rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
        rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)
        try:
            params_str = get_params_str(params['pick_rep_set'])
        except KeyError:
            params_str = ''
        # Build the representative set picking command
        pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
            (otu_fp, input_fp, rep_set_log_fp, rep_set_fp, params_str)
        commands.append([('Pick representative set', pick_rep_set_cmd)])

        # Prep the taxonomy assignment command
        try:
            assignment_method = params['assign_taxonomy']['assignment_method']
        except KeyError:
            assignment_method = 'uclust'
        assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
            (output_dir, assignment_method)
        taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
            (assign_taxonomy_dir, input_basename)
        if parallel and (assignment_method == 'rdp' or
                         assignment_method == 'blast' or
                         assignment_method == 'uclust'):
            # Grab the parallel-specific parameters
            try:
                params_str = get_params_str(params['parallel'])
            except KeyError:
                params_str = ''

            # Grab the taxonomy assignment parameters
            try:
                # Want to find a cleaner strategy for this: the parallel
                # script is method-specific, so doesn't take an
                # --assignment_method option. This works for now though.
                d = params['assign_taxonomy'].copy()
                if 'assignment_method' in d:
                    del d['assignment_method']
                params_str += ' %s' % get_params_str(d)
            except KeyError:
                pass

            # Build the parallel taxonomy assignment command
            assign_taxonomy_cmd = \
                'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
                (assignment_method, rep_set_fp, assign_taxonomy_dir,
                 params_str)
        else:
            try:
                params_str = get_params_str(params['assign_taxonomy'])
            except KeyError:
                params_str = ''
            # Build the taxonomy assignment command
            assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
                (assign_taxonomy_dir, rep_set_fp, params_str)

        commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # If assign_taxonomy is True, this will be the path to the taxonomic
    # assignment results. If assign_taxonomy is False this will be either
    # the precomputed taxonomic assignments that the user passed in,
    # or None.
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' %\
        (otu_fp, taxonomy_str, otu_table_fp, params_str)
    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=close_logger_on_success)
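# --- Illustrative sketch (not part of the workflow API) ---
# get_params_str(), used throughout this module, flattens one script's
# parameter dict into a command-line option string. A simplified,
# hypothetical stand-in showing the intent (the real function also has to
# handle flag-style options that take no value):
def _example_params_str(param_dict):
    # {'similarity': '0.97', 'enable_rev_strand_match': 'True'}
    #   -> '--enable_rev_strand_match True --similarity 0.97'
    return ' '.join('--%s %s' % (k, v)
                    for k, v in sorted(param_dict.items()))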
def run_jackknifed_beta_diversity(otu_table_fp, tree_fp, seqs_per_sample,
                                  output_dir, command_handler, params,
                                  qiime_config, mapping_fp, parallel=False,
                                  logger=None, suppress_md5=False,
                                  status_update_callback=print_to_stdout,
                                  master_tree=None):
    """ Run the data preparation steps of QIIME

        The steps performed by this function are:
          1) Compute a beta diversity distance matrix from the OTU table
             (and tree, if applicable);
          2) Build rarefied OTU tables;
          3) Build a UPGMA tree from the full distance matrix;
          4) Compute distance matrices for the rarefied OTU tables;
          5) Build UPGMA trees from the rarefied OTU table distance matrices;
          6) Build a consensus tree from the rarefied UPGMA trees;
          7) Compare the rarefied UPGMA trees to the full UPGMA tree, and
             write a support file and a newick tree with support values as
             node labels.

        master_tree can be 'full' or 'consensus' (default: 'full')
    """
    # Prepare some variables for the later steps
    if master_tree is None:
        master_tree = 'full'
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the beta-diversity command
    try:
        params_str = get_params_str(params['beta_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str = '%s -t %s' % (params_str, tree_fp)
    # Build the beta-diversity command
    beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str)
    commands.append(
        [('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics),
          beta_div_cmd)])

    # Prep the rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions_even_depth'])
    except KeyError:
        params_str = ''
    # Build the rarefaction command
    rarefaction_cmd = \
        '%s %s/multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\
        (python_exe_fp, script_dir, otu_table_fp, seqs_per_sample,
         rarefaction_dir, params_str)
    commands.append([('Rarefaction', rarefaction_cmd)])

    # Begin iterating over beta diversity distance metrics, if more than one
    # was provided
    for beta_diversity_metric in beta_diversity_metrics:
        metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric)
        distance_matrix_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)

        # Prep the hierarchical clustering command (for the full distance
        # matrix)
        full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir,
                                            otu_table_basename)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for the full distance
        # matrix)
        hierarchical_cluster_cmd = '%s %s/upgma_cluster.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, distance_matrix_fp,
             full_tree_fp, params_str)
        commands.append(
            [('UPGMA on full distance matrix: %s' % beta_diversity_metric,
              hierarchical_cluster_cmd)])

        # Prep the beta diversity command (for rarefied OTU tables)
        dm_dir = '%s/rare_dm/' % metric_output_dir
        create_dir(dm_dir)
        # the metrics parameter needs to be removed as we need to run
        # beta_diversity one metric at a time to keep the per-metric
        # output files in separate directories
        try:
            d = params['beta_diversity'].copy()
            del d['metrics']
        except KeyError:
            d = {}
        params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric
        if tree_fp:
            params_str = '%s -t %s' % (params_str, tree_fp)
        if parallel:
            params_str += ' %s' % get_params_str(params['parallel'])
            # Build the parallel beta diversity command (for rarefied OTU
            # tables)
            beta_div_rarefied_cmd = \
                '%s %s/parallel_beta_diversity.py -T -i %s -o %s %s' %\
                (python_exe_fp, script_dir, rarefaction_dir, dm_dir,
                 params_str)
        else:
            # Build the serial beta diversity command (for rarefied OTU
            # tables)
            beta_div_rarefied_cmd = \
                '%s %s/beta_diversity.py -i %s -o %s %s' %\
                (python_exe_fp, script_dir, rarefaction_dir, dm_dir,
                 params_str)
        commands.append(
            [('Beta diversity on rarefied OTU tables (%s)' % beta_diversity_metric,
              beta_div_rarefied_cmd)])

        # Prep the hierarchical clustering command (for rarefied
        # distance matrices)
        upgma_dir = '%s/rare_upgma/' % metric_output_dir
        create_dir(upgma_dir)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for rarefied
        # distance matrices)
        hierarchical_cluster_cmd =\
            '%s %s/upgma_cluster.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, dm_dir, upgma_dir, params_str)
        commands.append(
            [('UPGMA on rarefied distance matrix (%s)' % beta_diversity_metric,
              hierarchical_cluster_cmd)])

        # Build the consensus tree command
        consensus_tree_cmd =\
            '%s %s/consensus_tree.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, upgma_dir,
             metric_output_dir + "/rare_upgma_consensus.tre", params_str)
        commands.append(
            [('consensus on rarefied distance matrices (%s)' % beta_diversity_metric,
              consensus_tree_cmd)])

        # Prep the tree compare command
        tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir
        create_dir(tree_compare_dir)
        try:
            params_str = get_params_str(params['tree_compare'])
        except KeyError:
            params_str = ''

        # Build the tree compare command
        if master_tree == "full":
            master_tree_fp = full_tree_fp
        elif master_tree == "consensus":
            master_tree_fp = metric_output_dir + "/rare_upgma_consensus.tre"
        else:
            raise RuntimeError('master tree method "%s" not found' %
                               (master_tree,))
        tree_compare_cmd = '%s %s/tree_compare.py -s %s -m %s -o %s %s' %\
            (python_exe_fp, script_dir, upgma_dir, master_tree_fp,
             tree_compare_dir, params_str)
        commands.append(
            [('Tree compare (%s)' % beta_diversity_metric, tree_compare_cmd)])

        # Prep the PCoA command
        pcoa_dir = '%s/pcoa/' % metric_output_dir
        create_dir(pcoa_dir)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the PCoA command
        pcoa_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, dm_dir, pcoa_dir, params_str)
        commands.append(
            [('Principal coordinates (%s)' % beta_diversity_metric, pcoa_cmd)])

        # Prep the emperor plots command
        emperor_dir = '%s/emperor_pcoa_plots/' % metric_output_dir
        create_dir(emperor_dir)
        try:
            params_str = get_params_str(params['make_emperor'])
        except KeyError:
            params_str = ''
        emperor_cmd = 'make_emperor.py -i %s -o %s -m %s %s' %\
            (pcoa_dir, emperor_dir, mapping_fp, params_str)
        commands.append(
            [('emperor plots (%s)' % beta_diversity_metric, emperor_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=close_logger_on_success)
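# --- Illustrative sketch (not part of the workflow API) ---
# tree_compare.py, invoked above, reports for each node of the master UPGMA
# tree how often the same sample partition appears in the rarefied-table
# UPGMA trees; those frequencies become the jackknife support values. The
# same bookkeeping over precomputed partitions (frozensets of sample ids):
def _example_jackknife_support(master_partitions, rarefied_partition_sets):
    # master_partitions: iterable of frozensets, one per internal node
    # rarefied_partition_sets: list of sets of frozensets, one per tree
    support = {}
    for partition in master_partitions:
        hits = sum(1 for tree_partitions in rarefied_partition_sets
                   if partition in tree_partitions)
        support[partition] = hits / float(len(rarefied_partition_sets))
    return support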
def run_core_diversity_analyses(biom_fp, mapping_fp, sampling_depth, output_dir, qiime_config, command_handler=call_commands_serially, tree_fp=None, params=None, categories=None, arare_min_rare_depth=10, arare_num_steps=10, parallel=False, suppress_taxa_summary=False, suppress_beta_diversity=False, suppress_alpha_diversity=False, suppress_group_significance=False, status_update_callback=print_to_stdout): """ """ if categories is not None: # Validate categories provided by the users mapping_data, mapping_comments = \ parse_mapping_file_to_dict(open(mapping_fp, 'U')) metadata_map = MetadataMap(mapping_data, mapping_comments) for c in categories: if c not in metadata_map.CategoryNames: raise ValueError( "Category '%s' is not a column header " "in your mapping file. " "Categories are case and white space sensitive. Valid " "choices are: (%s)" % (c, ', '.join(metadata_map.CategoryNames))) if metadata_map.hasSingleCategoryValue(c): raise ValueError( "Category '%s' contains only one value. " "Categories analyzed here require at least two values." % c) else: categories = [] comma_separated_categories = ','.join(categories) # prep some variables if params is None: params = parse_qiime_parameters([]) create_dir(output_dir) index_fp = '%s/index.html' % output_dir index_links = [] commands = [] # begin logging old_log_fps = glob(join(output_dir, 'log_20*txt')) log_fp = generate_log_fp(output_dir) index_links.append( ('Master run log', log_fp, _index_headers['run_summary'])) for old_log_fp in old_log_fps: index_links.append( ('Previous run log', old_log_fp, _index_headers['run_summary'])) logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config) input_fps = [biom_fp, mapping_fp] if tree_fp is not None: input_fps.append(tree_fp) log_input_md5s(logger, input_fps) # run 'biom summarize-table' on input BIOM table try: params_str = get_params_str(params['biom-summarize-table']) except KeyError: params_str = '' biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir if not exists(biom_table_stats_output_fp): biom_table_summary_cmd = \ "biom summarize-table -i %s -o %s %s" % \ (biom_fp, biom_table_stats_output_fp, params_str) commands.append([('Generate BIOM table summary', biom_table_summary_cmd)]) else: logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" % biom_table_stats_output_fp) index_links.append(('BIOM table statistics', biom_table_stats_output_fp, _index_headers['run_summary'])) # filter samples with fewer observations than the requested sampling_depth. # since these get filtered for some analyses (eg beta diversity after # even sampling) it's useful to filter them here so they're filtered # from all analyses. 
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
            (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append([(
            'Filter low sequence count samples from table (minimum sequence count: %d)'
            % sampling_depth,
            filter_samples_cmd)])
    else:
        logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n"
                     % filtered_biom_fp)
    biom_fp = filtered_biom_fp

    # rarefy the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" %\
            (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([
            ('Rarefy the OTU table to %d sequences/sample' % sampling_depth,
             single_rarefaction_cmd)])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n"
                     % rarefied_biom_fp)

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling_depth=None here as
                # we rarefy the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping beta_diversity_through_plots.py as %s exist(s).\n\n"
                % ', '.join(existing_dm_fps))
            # slice off the '_dm.txt' suffix to recover the metric name
            # (str.strip removes a character set, not a suffix, so it is
            # not safe here)
            even_dm_fps = [(split(fp)[1][:-len('_dm.txt')], fp)
                           for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (
                    bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir,
                                                          category)
                stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir,
                                                       category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                        (dm_fp, category, boxplots_output_dir, mapping_fp,
                         params_str)
                    commands.append([('Boxplots (%s)' % category,
                                      boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n"
                        % (category, plot_output_fp))
                index_links.append(
                    ('Distance boxplots (%s)' % bdiv_metric,
                     plot_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(
                    ('Distance boxplots statistics (%s)' % bdiv_metric,
                     stats_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('PCoA plot (%s)' % bdiv_metric,
                 '%s/%s_emperor_pcoa_plot/index.html' % (bdiv_even_output_dir,
                                                         bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Distance matrix (%s)' % bdiv_metric,
                 '%s/%s_dm.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Principal coordinate matrix (%s)' % bdiv_metric,
                 '%s/%s_pc.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir, sampling_depth)
        rarefaction_plots_output_fp = \
            '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n"
                         % rarefaction_plots_output_fp)

        index_links.append(
            ('Alpha rarefaction plots',
             rarefaction_plots_output_fp,
             _index_headers['alpha_diversity']))

        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(
                    split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = '%s/compare_%s' % \
                    (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = \
                        'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                        (collated_alpha_diversity_fp,
                         mapping_fp,
                         comma_separated_categories,
                         compare_alpha_output_dir,
                         params_str)
                    commands.append([
                        ('Compare alpha diversity (%s)' % alpha_metric,
                         compare_alpha_cmd)])
                    for category in categories:
                        alpha_comparison_stat_fp = '%s/%s_stats.txt' % \
                            (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = '%s/%s_boxplots.pdf' % \
                            (compare_alpha_output_dir, category)
                        index_links.append(
                            ('Alpha diversity statistics (%s, %s)'
                             % (category, alpha_metric),
                             alpha_comparison_stat_fp,
                             _index_headers['alpha_diversity']))
                        index_links.append(
                            ('Alpha diversity boxplots (%s, %s)'
                             % (category, alpha_metric),
                             alpha_comparison_boxplot_fp,
                             _index_headers['alpha_diversity']))
                else:
                    logger.write("Skipping compare_alpha_diversity.py"
                                 " for %s as %s exists.\n\n"
                                 % (alpha_metric, compare_alpha_output_dir))
        else:
            logger.write("Skipping compare_alpha_diversity.py as"
                         " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(
            join(taxa_plots_output_dir, 'taxa_summary_plots', '*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py as %s exist(s).\n\n"
                % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(
            ('Taxa summary bar plots',
             '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        index_links.append(
            ('Taxa summary area plots',
             '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))

        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir, category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob('%s/taxa_summary_plots/*.html'
                                               % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(
                ('Taxa summary bar plots',
                 '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))
            index_links.append(
                ('Taxa summary area plots',
                 '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))

    if not suppress_group_significance:
        # group significance tests, aka category significance
        try:
            params_str = get_params_str(params['group_significance'])
        except KeyError:
            # no group_significance section in the params file
            params_str = ''
        for category in categories:
            group_significance_fp = \
                '%s/group_significance_%s.txt' % (output_dir, category)
            if not exists(group_significance_fp):
                # Build the OTU category significance command
                group_significance_cmd = \
                    'group_significance.py -i %s -m %s -c %s -o %s %s' %\
                    (rarefied_biom_fp, mapping_fp, category,
                     group_significance_fp, params_str)
                commands.append([('Group significance (%s)' % category,
                                  group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n"
                    % (category, group_significance_fp))

            index_links.append(
                ('Category significance (%s)' % category,
                 group_significance_fp,
                 _index_headers['group_significance']))

    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table',
                          'gzip %s' % filtered_biom_fp)])
    else:
        logger.write(
            "Skipping compression of the filtered BIOM table as %s exists.\n\n"
            % filtered_biom_gzip_fp)
    index_links.append(
        ('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
         filtered_biom_gzip_fp,
         _index_headers['run_summary']))

    rarefied_biom_gzip_fp = '%s.gz' % rarefied_biom_fp
    if not exists(rarefied_biom_gzip_fp):
        commands.append([('Compress the rarefied BIOM table',
                          'gzip %s' % rarefied_biom_fp)])
    else:
        logger.write(
            "Skipping compression of the rarefied BIOM table as %s exists.\n\n"
            % rarefied_biom_gzip_fp)
    index_links.append(
        ('Rarefied BIOM table (sampling depth: %d)' % sampling_depth,
         rarefied_biom_gzip_fp,
         _index_headers['run_summary']))

    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
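
# Usage sketch for run_core_diversity_analyses, with hypothetical paths and
# category name; call_commands_serially, parse_qiime_parameters and
# load_qiime_config are assumed to be imported at the top of this module.
# Kept commented out so nothing runs on import.
#
#     run_core_diversity_analyses(
#         biom_fp='otu_table.biom',
#         mapping_fp='map.txt',
#         sampling_depth=1000,
#         output_dir='core_diversity_out',
#         qiime_config=load_qiime_config(),
#         command_handler=call_commands_serially,
#         params=parse_qiime_parameters([]),
#         categories=['Treatment'],  # must match mapping file column headers
#         tree_fp='rep_set.tre')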
def run_summarize_taxa_through_plots(otu_table_fp, mapping_fp, output_dir, mapping_cat, sort, command_handler, params, qiime_config, logger=None, suppress_md5=False, status_update_callback=print_to_stdout): """ Run the data preparation for summarizing taxonomies and generating plots The steps performed by this function are: 1) Summarize OTU by Category 2) Summarize Taxonomy 3) Plot Taxonomy Summary """ # Prepare some variables for the later steps otu_table_dir, otu_table_filename = split(otu_table_fp) otu_table_basename, otu_table_ext = splitext(otu_table_filename) create_dir(output_dir) commands = [] python_exe_fp = qiime_config['python_exe_fp'] script_dir = get_qiime_scripts_dir() if logger == None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger,[otu_table_fp,mapping_fp]) # if mapping category not passed via command-line, # check if it is passed in params file if not mapping_cat: try: mapping_cat=params['summarize_otu_by_cat']['mapping_category'] except: mapping_cat=None try: params_str = get_params_str(params['summarize_otu_by_cat']) # Need to remove the mapping category option, since it is defined above. # Using this method since we don't want to change the params dict split_params=params_str.split('--') updated_params_str=[] for i in split_params: if not i.startswith('mapping_category'): updated_params_str.append(i) params_str='--'.join(updated_params_str) except: params_str = '' if mapping_cat: output_fp=join(output_dir,'%s_otu_table.biom' % (mapping_cat.replace(' ','-'))) # Build the summarize otu by category command summarize_otu_by_cat_cmd = \ "%s %s/summarize_otu_by_cat.py -m %s -i %s -o %s -c '%s' %s" %\ (python_exe_fp, script_dir, mapping_fp, otu_table_fp, output_fp, mapping_cat, params_str) commands.append(\ [('Summarize OTU table by Category',summarize_otu_by_cat_cmd)]) otu_table_fp=output_fp # Build the sort OTU table command if sort: # Prep the sort_otu_table command try: params_str = get_params_str(params['sort_otu_table']) except: params_str = '' # define output otu table sorted_fp=join(output_dir, splitext(split(otu_table_fp)[-1])[0]+'_sorted.biom') if mapping_cat or params_str=='': # for this case we don't have a collapsed mapping file so must # handle separately sort_otu_table_cmd = \ "%s %s/sort_otu_table.py -i %s -o %s" %\ (python_exe_fp, script_dir, otu_table_fp, sorted_fp) else: sort_otu_table_cmd = \ "%s %s/sort_otu_table.py -i %s -o %s -m %s %s" %\ (python_exe_fp, script_dir, otu_table_fp, sorted_fp, mapping_fp, params_str) commands.append([('Sort OTU Table',sort_otu_table_cmd)]) # redefine otu_table_fp to use otu_table_fp=sorted_fp # Prep the summarize taxonomy command try: params_str = get_params_str(params['summarize_taxa']) except: params_str = '' try: sum_taxa_levels=params['summarize_taxa']['level'] except: sum_taxa_levels=None # Build the summarize taxonomy command summarize_taxa_cmd = '%s %s/summarize_taxa.py -i %s -o %s %s' %\ (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str) commands.append([('Summarize Taxonomy',summarize_taxa_cmd)]) sum_taxa_fps=[] if sum_taxa_levels: basename=join(output_dir,splitext(split(otu_table_fp)[-1])[0]) for i in sum_taxa_levels.split(','): sum_taxa_fps.append(basename+'_L%s.txt' % (str(i))) else: basename=join(output_dir,splitext(split(otu_table_fp)[-1])[0]) # this is the default levels from summarize_taxa, but cannot import # script to get these 
values
        for i in [2, 3, 4, 5, 6]:
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    create_dir(taxa_summary_plots_dir)
    try:
        params_str = get_params_str(params['plot_taxa_summary'])
    except KeyError:
        params_str = ''
    # Build the plot taxa summary plot command(s)
    plot_taxa_summary_cmd = \
        '%s %s/plot_taxa_summary.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, ','.join(sum_taxa_fps),
         taxa_summary_plots_dir, params_str)
    commands.append(
        [('Plot Taxonomy Summary', plot_taxa_summary_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
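
# Usage sketch for run_summarize_taxa_through_plots, with hypothetical paths.
# Passing mapping_cat=None (as run_core_diversity_analyses does above)
# summarizes taxa per sample; passing a mapping column name first collapses
# samples by that category via summarize_otu_by_cat.py. Kept commented out so
# nothing runs on import.
#
#     run_summarize_taxa_through_plots(
#         otu_table_fp='otu_table.biom',
#         mapping_fp='map.txt',
#         output_dir='taxa_plots',
#         mapping_cat=None,
#         sort=True,
#         command_handler=call_commands_serially,
#         params=parse_qiime_parameters([]),
#         qiime_config=load_qiime_config())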
def run_pick_de_novo_otus(input_fp, output_dir, command_handler, params, qiime_config, parallel=False, logger=None, suppress_md5=False, status_update_callback=print_to_stdout): """ Run the data preparation steps of Qiime The steps performed by this function are: 1) Pick OTUs; 2) Pick a representative set; 3) Align the representative set; 4) Assign taxonomy; 5) Filter the alignment prior to tree building - remove positions which are all gaps, and specified as 0 in the lanemask 6) Build a phylogenetic tree; 7) Build an OTU table. """ # Prepare some variables for the later steps input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) create_dir(output_dir) commands = [] python_exe_fp = qiime_config['python_exe_fp'] script_dir = get_qiime_scripts_dir() cluster_failures = False if logger == None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger,[input_fp]) # Prep the OTU picking command try: otu_picking_method = params['pick_otus']['otu_picking_method'] except KeyError: otu_picking_method = 'uclust' pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method) otu_fp = '%s/%s_otus.txt' % (pick_otu_dir,input_basename) if parallel and (otu_picking_method == 'blast' or otu_picking_method == 'uclust_ref'): # Grab the parallel-specific parameters try: params_str = get_params_str(params['parallel']) except KeyError: params_str = '' # Grab the OTU picker parameters try: # Want to find a cleaner strategy for this: the parallel script # is method-specific, so doesn't take a --otu_picking_method # option. This works for now though. d = params['pick_otus'].copy() del d['otu_picking_method'] except KeyError: pass if otu_picking_method == 'uclust_ref': try: suppress_new_clusters = d['suppress_new_clusters'] del d['suppress_new_clusters'] cluster_failures = False except KeyError: cluster_failures = True failure_otu_picking_method = 'uclust' params_str += ' %s' % get_params_str(d) otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method # Build the OTU picking command pick_otus_cmd = '%s %s/%s -i %s -o %s -T %s' % (python_exe_fp, script_dir, otu_picking_script, input_fp, pick_otu_dir, params_str) else: try: params_str = get_params_str(params['pick_otus']) except KeyError: params_str = '' # Build the OTU picking command pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s %s' %\ (python_exe_fp, script_dir, input_fp, pick_otu_dir, params_str) commands.append([('Pick OTUs', pick_otus_cmd)]) if cluster_failures: reference_otu_fp = otu_fp clustered_failures_dir = '%s/failure_otus/' % pick_otu_dir try: d = params['pick_otus'].copy() del d['otu_picking_method'] except KeyError: pass if 'uclust_otu_id_prefix' not in d: d['uclust_otu_id_prefix'] = 'DeNovoOTU' params_str = ' %s' % get_params_str(d) failures_list_fp = '%s/%s_failures.txt' % \ (pick_otu_dir,input_basename) failures_fasta_fp = '%s/%s_failures.fasta' % \ (pick_otu_dir,input_basename) filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (input_fp,failures_list_fp,failures_fasta_fp) commands.append([('Generate failures fasta file', filter_fasta_cmd)]) # Prep the OTU picking command for failure_otu_fp = '%s/%s_failures_otus.txt' % (clustered_failures_dir,input_basename) # Build the OTU picking command pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -m %s %s' %\ (python_exe_fp, script_dir, failures_fasta_fp, clustered_failures_dir, 
failure_otu_picking_method, params_str) commands.append([('Pick de novo OTUs for new clusters', pick_otus_cmd)]) merged_otu_map_fp = '%s/merged_otu_map.txt' % clustered_failures_dir cat_otu_tables_cmd = 'cat %s %s >> %s' %\ (reference_otu_fp,failure_otu_fp,merged_otu_map_fp) commands.append([('Merge OTU maps',cat_otu_tables_cmd)]) otu_fp = merged_otu_map_fp # Prep the representative set picking command rep_set_dir = '%s/rep_set/' % output_dir create_dir(rep_set_dir) rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir,input_basename) rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir,input_basename) try: params_str = get_params_str(params['pick_rep_set']) except KeyError: params_str = '' # Build the representative set picking command pick_rep_set_cmd = '%s %s/pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\ (python_exe_fp, script_dir, otu_fp, input_fp, rep_set_log_fp,\ rep_set_fp, params_str) commands.append([('Pick representative set', pick_rep_set_cmd)]) # Prep the taxonomy assignment command try: assignment_method = params['assign_taxonomy']['assignment_method'] except KeyError: assignment_method = 'uclust' assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\ (output_dir,assignment_method) taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \ (assign_taxonomy_dir,input_basename) if parallel and (assignment_method == 'rdp' or assignment_method == 'blast' or assignment_method == 'uclust'): # Grab the parallel-specific parameters try: params_str = get_params_str(params['parallel']) except KeyError: params_str = '' # Grab the taxonomy assignment parameters try: # Want to find a cleaner strategy for this: the parallel script # is method-specific, so doesn't take a --assignment_method # option. This works for now though. d = params['assign_taxonomy'].copy() if 'assignment_method' in d: del d['assignment_method'] params_str += ' %s' % get_params_str(d) except KeyError: pass # Build the parallel taxonomy assignment command assign_taxonomy_cmd = \ '%s %s/parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\ (python_exe_fp, script_dir, assignment_method, rep_set_fp,\ assign_taxonomy_dir, params_str) else: try: params_str = get_params_str(params['assign_taxonomy']) except KeyError: params_str = '' # Build the taxonomy assignment command assign_taxonomy_cmd = '%s %s/assign_taxonomy.py -o %s -i %s %s' %\ (python_exe_fp, script_dir, assign_taxonomy_dir,\ rep_set_fp, params_str) commands.append([('Assign taxonomy',assign_taxonomy_cmd)]) # Prep the OTU table building command otu_table_fp = '%s/otu_table.biom' % output_dir try: params_str = get_params_str(params['make_otu_table']) except KeyError: params_str = '' # Build the OTU table building command make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\ (python_exe_fp, script_dir, otu_fp, taxonomy_fp, otu_table_fp, params_str) commands.append([('Make OTU table', make_otu_table_cmd)]) if cluster_failures: reference_otu_table_fp = '%s/reference_only_otu_table.biom' % output_dir # Build the OTU table building command make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\ (python_exe_fp, script_dir, reference_otu_fp, taxonomy_fp, reference_otu_table_fp, params_str) commands.append([('Make reference-only OTU table', make_otu_table_cmd)]) # Prep the pynast alignment command try: alignment_method = params['align_seqs']['alignment_method'] except KeyError: alignment_method = 'pynast' pynast_dir = '%s/%s_aligned_seqs' % (output_dir,alignment_method) aln_fp = '%s/%s_rep_set_aligned.fasta' % (pynast_dir,input_basename) if parallel and 
alignment_method == 'pynast': # Grab the parallel-specific parameters try: params_str = get_params_str(params['parallel']) except KeyError: params_str = '' # Grab the alignment parameters # Want to find a cleaner strategy for this: the parallel script # is method-specific, so doesn't take a --alignment_method # option. This works for now though. try: d = params['align_seqs'].copy() except KeyError: d = {} try: del d['alignment_method'] except KeyError: pass params_str += ' %s' % get_params_str(d) # Build the parallel pynast alignment command align_seqs_cmd = '%s %s/parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\ (python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str) else: try: params_str = get_params_str(params['align_seqs']) except KeyError: params_str = '' # Build the pynast alignment command align_seqs_cmd = '%s %s/align_seqs.py -i %s -o %s %s' %\ (python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str) commands.append([('Align sequences', align_seqs_cmd)]) # Prep the alignment filtering command filtered_aln_fp = '%s/%s_rep_set_aligned_pfiltered.fasta' %\ (pynast_dir,input_basename) try: params_str = get_params_str(params['filter_alignment']) except KeyError: params_str = '' # Build the alignment filtering command filter_alignment_cmd = '%s %s/filter_alignment.py -o %s -i %s %s' %\ (python_exe_fp, script_dir, pynast_dir, aln_fp, params_str) commands.append([('Filter alignment', filter_alignment_cmd)]) # Prep the tree building command tree_fp = '%s/rep_set.tre' % output_dir try: params_str = get_params_str(params['make_phylogeny']) except KeyError: params_str = '' # Build the tree building command make_phylogeny_cmd = '%s %s/make_phylogeny.py -i %s -o %s %s' %\ (python_exe_fp, script_dir, filtered_aln_fp, tree_fp,\ params_str) commands.append([('Build phylogenetic tree', make_phylogeny_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success) return abspath(tree_fp), abspath(otu_table_fp)
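
# Usage sketch for run_pick_de_novo_otus, with hypothetical paths. The
# function returns the absolute paths of the tree and OTU table it builds,
# so downstream steps can be chained off the return value. Kept commented out
# so nothing runs on import.
#
#     tree_fp, otu_table_fp = run_pick_de_novo_otus(
#         input_fp='seqs.fna',
#         output_dir='de_novo_otus',
#         command_handler=call_commands_serially,
#         params=parse_qiime_parameters([]),
#         qiime_config=load_qiime_config(),
#         parallel=False)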
def pick_subsampled_open_reference_otus( input_fp, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, run_assign_tax=True, run_align_and_tree=True, prefilter_percent_id=0.60, min_otu_size=2, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, denovo_otu_picking_method="uclust", reference_otu_picking_method="uclust_ref", status_update_callback=print_to_stdout, ): """ Run the data preparation steps of Qiime The steps performed by this function are: - Pick reference OTUs against refseqs_fp - Subsample the failures to n sequences. - Pick OTUs de novo on the n failures. - Pick representative sequences for the resulting OTUs. - Pick reference OTUs on all failures using the representative set from step 4 as the reference set. """ # for now only allowing uclust for otu picking allowed_denovo_otu_picking_methods = ["uclust", "usearch61"] allowed_reference_otu_picking_methods = ["uclust_ref", "usearch61_ref"] assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods, ( "Unknown de novo OTU picking method: %s. Known methods are: %s" % (denovo_otu_picking_method, ",".join(allowed_denovo_otu_picking_methods)) ) assert reference_otu_picking_method in allowed_reference_otu_picking_methods, ( "Unknown reference OTU picking method: %s. Known methods are: %s" % (reference_otu_picking_method, ",".join(allowed_reference_otu_picking_methods)) ) # Prepare some variables for the later steps input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp]) # if the user has not passed a different reference collection for the pre-filter, # used the main refseqs_fp. 
this is useful if the user wants to provide a smaller # reference collection, or to use the input reference collection when running in # iterative mode (rather than an iteration's new refseqs) if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp # Step 1: Closed-reference OTU picking on the input file (if not already # complete) if step1_otu_map_fp and step1_failures_fasta_fp: step1_dir = "%s/step1_otus" % output_dir create_dir(step1_dir) logger.write("Using pre-existing reference otu map and failures.\n\n") else: if prefilter_percent_id is not None: prefilter_dir = "%s/prefilter_otus/" % output_dir prefilter_failures_list_fp = "%s/%s_failures.txt" % (prefilter_dir, input_basename) prefilter_pick_otu_cmd = pick_reference_otus( input_fp, prefilter_dir, reference_otu_picking_method, prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id, ) commands.append([("Pick Reference OTUs (prefilter)", prefilter_pick_otu_cmd)]) prefiltered_input_fp = "%s/prefiltered_%s%s" % (prefilter_dir, input_basename, input_ext) filter_fasta_cmd = "filter_fasta.py -f %s -o %s -s %s -n" % ( input_fp, prefiltered_input_fp, prefilter_failures_list_fp, ) commands.append([("Filter prefilter failures from input", filter_fasta_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] input_fp = prefiltered_input_fp input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) if getsize(prefiltered_input_fp) == 0: raise ValueError( "All sequences were discarded by the prefilter. " "Are the input sequences in the same orientation " "in your input file and reference file (you can " "add 'pick_otus:enable_rev_strand_match True' to " "your parameters file if not)? Are you using the " "correct reference file?" 
) # Build the OTU picking command step1_dir = "%s/step1_otus" % output_dir step1_otu_map_fp = "%s/%s_otus.txt" % (step1_dir, input_basename) step1_pick_otu_cmd = pick_reference_otus( input_fp, step1_dir, reference_otu_picking_method, refseqs_fp, parallel, params, logger ) commands.append([("Pick Reference OTUs", step1_pick_otu_cmd)]) # Build the failures fasta file step1_failures_list_fp = "%s/%s_failures.txt" % (step1_dir, input_basename) step1_failures_fasta_fp = "%s/failures.fasta" % step1_dir step1_filter_fasta_cmd = "filter_fasta.py -f %s -s %s -o %s" % ( input_fp, step1_failures_list_fp, step1_failures_fasta_fp, ) commands.append([("Generate full failures fasta file", step1_filter_fasta_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] step1_repset_fasta_fp = "%s/step1_rep_set.fna" % step1_dir step1_pick_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % (step1_otu_map_fp, step1_repset_fasta_fp, input_fp) commands.append([("Pick rep set", step1_pick_rep_set_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # Subsample the failures fasta file to retain (roughly) the # percent_subsample step2_input_fasta_fp = "%s/subsampled_failures.fasta" % step1_dir subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp, percent_subsample) logger.write( "# Subsample the failures fasta file using API \n" + 'python -c "import qiime; qiime.util.subsample_fasta' + "('%s', '%s', '%f')\n\n\"" % (abspath(step1_failures_fasta_fp), abspath(step2_input_fasta_fp), percent_subsample) ) # Prep the OTU picking command for the subsampled failures step2_dir = "%s/step2_otus/" % output_dir step2_cmd = pick_denovo_otus( step2_input_fasta_fp, step2_dir, new_ref_set_id, denovo_otu_picking_method, params, logger ) step2_otu_map_fp = "%s/subsampled_failures_otus.txt" % step2_dir commands.append([("Pick de novo OTUs for new clusters", step2_cmd)]) # Prep the rep set picking command for the subsampled failures step2_repset_fasta_fp = "%s/step2_rep_set.fna" % step2_dir step2_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % ( step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp, ) commands.append([("Pick representative set for subsampled failures", step2_rep_set_cmd)]) step3_dir = "%s/step3_otus/" % output_dir step3_otu_map_fp = "%s/failures_otus.txt" % step3_dir step3_failures_list_fp = "%s/failures_failures.txt" % step3_dir step3_cmd = pick_reference_otus( step1_failures_fasta_fp, step3_dir, reference_otu_picking_method, step2_repset_fasta_fp, parallel, params, logger, ) commands.append([("Pick reference OTUs using de novo rep set", step3_cmd)]) # name the final otu map merged_otu_map_fp = "%s/final_otu_map.txt" % output_dir if not suppress_step4: step3_failures_fasta_fp = "%s/failures_failures.fasta" % step3_dir step3_filter_fasta_cmd = "filter_fasta.py -f %s -s %s -o %s" % ( step1_failures_fasta_fp, step3_failures_list_fp, step3_failures_fasta_fp, ) commands.append([("Create fasta file of step3 failures", step3_filter_fasta_cmd)]) step4_dir = "%s/step4_otus/" % output_dir step4_cmd = pick_denovo_otus( step3_failures_fasta_fp, step4_dir, ".".join([new_ref_set_id, "CleanUp"]), denovo_otu_picking_method, params, logger, ) step4_otu_map_fp = "%s/failures_failures_otus.txt" % step4_dir commands.append([("Pick de novo OTUs on step3 failures", step4_cmd)]) # Merge the otu maps, 
note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created cat_otu_tables_cmd = "cat %s %s %s > %s" % ( step1_otu_map_fp, step3_otu_map_fp, step4_otu_map_fp, merged_otu_map_fp, ) commands.append([("Merge OTU maps", cat_otu_tables_cmd)]) step4_repset_fasta_fp = "%s/step4_rep_set.fna" % step4_dir step4_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % ( step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp, ) commands.append([("Pick representative set for subsampled failures", step4_rep_set_cmd)]) else: # Merge the otu maps, note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created cat_otu_tables_cmd = "cat %s %s > %s" % (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp) commands.append([("Merge OTU maps", cat_otu_tables_cmd)]) # Move the step 3 failures file to the top-level directory commands.append( [ ( "Move final failures file to top-level directory", "mv %s %s/final_failures.txt" % (step3_failures_list_fp, output_dir), ) ] ) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] otu_fp = merged_otu_map_fp # Filter singletons from the otu map otu_no_singletons_fp = "%s/final_otu_map_mc%d.txt" % (output_dir, min_otu_size) otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp, min_otu_size) logger.write( "# Filter singletons from the otu map using API \n" + 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' + "('%s', '%s', '%d')\"\n\n" % (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size) ) # make the final representative seqs file and a new refseqs file that # could be used in subsequent otu picking runs. # this is clunky. first, we need to do this without singletons to match # the otu map without singletons. next, there is a difference in what # we need the reference set to be and what we need the repseqs to be. # the reference set needs to be a superset of the input reference set # to this set. the repset needs to be only the sequences that were observed # in this data set, and we want reps for the step1 reference otus to be # reads from this run so we don't hit issues building a tree using # sequences of very different lengths. so... final_repset_fp = "%s/rep_set.fna" % output_dir final_repset_f = open(final_repset_fp, "w") new_refseqs_fp = "%s/new_refseqs.fna" % output_dir # write non-singleton otus representative sequences from step1 to the # final rep set file for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, "U")): if otu_id.split()[0] in otus_to_keep: final_repset_f.write(">%s\n%s\n" % (otu_id, seq)) logger.write( "# Write non-singleton otus representative sequences " + "from step1 to the final rep set file: %s\n\n" % final_repset_fp ) # copy the full input refseqs file to the new refseqs_fp copy(refseqs_fp, new_refseqs_fp) new_refseqs_f = open(new_refseqs_fp, "a") new_refseqs_f.write("\n") logger.write( "# Copy the full input refseqs file to the new refseq file\n" + "cp %s %s\n\n" % (refseqs_fp, new_refseqs_fp) ) # iterate over all representative sequences from step2 and step4 and write # those corresponding to non-singleton otus to the final representative set # file and the new reference sequences file. 
for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, "U")): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write(">%s\n%s\n" % (otu_id, seq)) final_repset_f.write(">%s\n%s\n" % (otu_id, seq)) if not suppress_step4: for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp, "U")): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write(">%s\n%s\n" % (otu_id, seq)) final_repset_f.write(">%s\n%s\n" % (otu_id, seq)) new_refseqs_f.close() final_repset_f.close() logger.write( "# Write non-singleton otus representative sequences from " + "step 2 and step 4 to the final representative set and the new reference" + " set (%s and %s respectively)\n\n" % (final_repset_fp, new_refseqs_fp) ) # Prep the make_otu_table.py command otu_table_fp = "%s/otu_table_mc%d.biom" % (output_dir, min_otu_size) make_otu_table_cmd = "make_otu_table.py -i %s -o %s" % (otu_no_singletons_fp, otu_table_fp) commands.append([("Make the otu table", make_otu_table_cmd)]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # initialize output file names - these differ based on what combination of # taxonomy assignment and alignment/tree building is happening. if run_assign_tax and run_align_and_tree: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = "%s/otu_table_mc%d_w_tax.biom" % (output_dir, min_otu_size) align_and_tree_input_otu_table = otu_table_w_tax_fp pynast_failure_filtered_otu_table_fp = "%s/otu_table_mc%d_w_tax_no_pynast_failures.biom" % ( output_dir, min_otu_size, ) elif run_assign_tax: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = "%s/otu_table_mc%d_w_tax.biom" % (output_dir, min_otu_size) elif run_align_and_tree: align_and_tree_input_otu_table = otu_table_fp pynast_failure_filtered_otu_table_fp = "%s/otu_table_mc%d_no_pynast_failures.biom" % (output_dir, min_otu_size) if run_assign_tax: if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp) else: # remove files from partially completed runs remove_files([otu_table_w_tax_fp], error_on_missing=False) taxonomy_fp = assign_tax( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback, ) # Add taxa to otu table add_metadata_cmd = ( "biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy" % (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp) ) commands.append([("Add taxa to OTU table", add_metadata_cmd)]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if run_align_and_tree: if exists(pynast_failure_filtered_otu_table_fp) and getsize(pynast_failure_filtered_otu_table_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." 
% pynast_failure_filtered_otu_table_fp) else: # remove files from partially completed runs remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False) pynast_failures_fp = align_and_tree( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback, ) # Build OTU table without PyNAST failures filtered_otu_table = filter_otus_from_otu_table( parse_biom_table(open(align_and_tree_input_otu_table, "U")), get_seq_ids_from_fasta_file(open(pynast_failures_fp, "U")), 0, inf, 0, inf, negate_ids_to_keep=True, ) otu_table_f = open(pynast_failure_filtered_otu_table_fp, "w") otu_table_f.write(format_biom_table(filtered_otu_table)) otu_table_f.close() command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if close_logger_on_success: logger.close()
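
# Usage sketch for pick_subsampled_open_reference_otus, with hypothetical
# paths. percent_subsample is the fraction of step 1 failures clustered de
# novo in step 2, and new_ref_set_id prefixes the OTU ids minted for new
# reference sequences. Kept commented out so nothing runs on import.
#
#     pick_subsampled_open_reference_otus(
#         input_fp='seqs.fna',
#         refseqs_fp='reference_otus.fasta',
#         output_dir='open_ref_otus',
#         percent_subsample=0.001,
#         new_ref_set_id='NewRef0',
#         command_handler=call_commands_serially,
#         params=parse_qiime_parameters([]),
#         qiime_config=load_qiime_config())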
def run_ampliconnoise(mapping_fp,
                      output_dir,
                      command_handler,
                      params,
                      qiime_config,
                      logger=None,
                      status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228,
                      chimera_beta=0.6200,
                      sff_txt_fp=None,
                      numnodes=2,
                      suppress_perseus=True,
                      output_filepath=None,
                      platform='flx',
                      seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline

        The steps performed by this function are:
        1. Split input sff.txt file into one file per sample

        2. Run scripts required for PyroNoise

        3. Run scripts required for SeqNoise

        4. Run scripts required for Perseus (chimera removal)

        5. Merge output files into one file similar to the output of
           split_libraries.py

        output_filepath should be absolute
        seqnoise_resolution should be a string
        environment variable PYRO_LOOKUP_FILE must be set correctly. Thus
        be careful passing command handlers that don't spawn child processes,
        as they may not inherit the correct environment variable setting
    """
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no '
                               'default for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no '
                               'default for platform ' + platform)

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index('SampleID')])
        bc_seqs.append(map_data[i][headers.index('BarcodeSequence')])
        # the primer is not removed here; that happens later in the pipeline
        primer = (map_data[i][headers.index('LinkerPrimerSequence')])
        for char, bases in IUPAC_DNA_ambiguities.items():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir, 'map.csv'), 'w')
    for i in range(len(sample_names)):
        fh.write(sample_names[i] + ',' + bc_seqs[i] + '\n')
    fh.close()

    # these are the fasta results, e.g.
PC.636_Good.fa # later we merge them and copy to output file post_pyro_tail = '_'+truncate_len if suppress_perseus == True: fasta_result_names = [sample_name + post_pyro_tail+'_seqnoise_cd.fa' for sample_name in sample_names] else: fasta_result_names = [sample_name + '_Good.fa' \ for sample_name in sample_names] cmd = 'cd '+output_dir # see also os.chdir above commands.append([('change to output dir', cmd)]) cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt' commands.append([('confirm pyro lookup filepath environment variable', cmd)]) cmd = 'SplitKeys.pl '+one_primer+' map.csv < '+\ os.path.join(called_dir,sff_txt_fp)+\ ' > splitkeys_log.txt 2> unassigned.fna' commands.append([('split sff.txt via barcodes (keys)', cmd)]) for i, sample_name in enumerate(sample_names): # Build the summarize taxonomy command if platform == 'flx': cmd = 'Clean360.pl '+one_primer+' '+sample_name+' < '+\ sample_name+'.raw' commands.append([('clean flows '+sample_name, cmd)]) # these run through the whole sff file once per sample, I think # cmd = "FlowsFA.pl " + primer_seqs[i] + ' '+sample_name +' < '+\ # os.path.join(called_dir,sff_txt_fp) # commands.append([('extract flows '+sample_name, cmd)]) elif platform == 'titanium': cmd = 'CleanMinMax.pl '+one_primer+' '+sample_name+' < '+\ sample_name+'.raw' commands.append([('clean flows '+sample_name, cmd)]) # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' '+sample_name +' < '+\ # os.path.join(called_dir,sff_txt_fp) # commands.append([('extract flows '+sample_name, cmd)]) else: raise RuntimeError("platform " + platform + " not supported") cmd = "mpirun -np "+str(numnodes)+" PyroDist -in "+\ sample_name+".dat -out "+sample_name+ " > "+sample_name+".pdout" commands.append([('pyrodist '+sample_name, cmd)]) cmd = "FCluster -in "+sample_name+".fdist -out "+sample_name+\ " > "+sample_name+".fcout" commands.append([('fcluster pyrodist '+sample_name, cmd)]) # e.g.: # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout cmd = "mpirun -np "+str(numnodes)+" PyroNoise -din "+\ sample_name+".dat -out "+\ sample_name+"_pyronoise "+"-lin "+\ sample_name+".list -s 60.0 -c 0.01 > "+\ sample_name+"_pyronoise.pnout" commands.append([('pyronoise '+sample_name, cmd)]) cmd = 'Parse.pl '+bc_seqs[i]+one_primer+' '+truncate_len+' < '+\ sample_name+'_pyronoise_cd.fa'+' > '+ sample_name+'_'+\ truncate_len+'.fa' commands.append([('truncate '+sample_name, cmd)]) # now start with post_pyro_tail cmd = "mpirun -np "+str(numnodes)+" SeqDist -in "+\ sample_name+post_pyro_tail+\ ".fa > "+sample_name+post_pyro_tail+".seqdist" commands.append([('seqdist '+sample_name, cmd)]) cmd = "FCluster -in "+sample_name+post_pyro_tail+".seqdist -out "+\ sample_name+post_pyro_tail+"fcl > "+\ sample_name+post_pyro_tail+".fcout" commands.append([('fcluster seqdist '+sample_name, cmd)]) # e.g.: # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 > # PC.354_pyronoise_cd.snout cmd = "mpirun -np "+str(numnodes)+" SeqNoise -in "+\ sample_name+post_pyro_tail+\ ".fa -din "+sample_name+post_pyro_tail+".seqdist -out "+\ sample_name+post_pyro_tail+\ "_seqnoise -lin "+sample_name+post_pyro_tail+'fcl.list -min '+\ sample_name+'_pyronoise'+\ '.mapping -s '+seqnoise_resolution+' -c 0.08 > '+\ sample_name+post_pyro_tail+'.snout' commands.append([('seqnoise '+sample_name, cmd)]) if suppress_perseus == False: cmd = 
'Perseus -sin '+sample_name+post_pyro_tail+\ '_seqnoise_cd.fa > ' +\ sample_name+'.per' commands.append([('Perseus '+sample_name, cmd)]) cmd = 'Class.pl '+sample_name+'.per '+\ str(chimera_alpha) + ' '+ str(chimera_beta)+\ ' > '+sample_name+'.class' commands.append([('Class.pl '+sample_name, cmd)]) cmd = 'FilterGoodClass.pl '+sample_name+post_pyro_tail+\ '_seqnoise_cd.fa '+\ sample_name+'.class 0.5 > '+sample_name+'_Chi.fa 2> '+\ sample_name+'_Good.fa' commands.append([('FilterGoodClass '+sample_name, cmd)]) cmd = '%s %s/unweight_fasta.py -i %s -o %s -l %s' %\ (python_exe_fp, script_dir, fasta_result_names[i], sample_name+'_unw.fna', sample_name) commands.append([('unweight fasta '+sample_name, cmd)]) cmd = 'cat ' +\ ' '.join([sample_name+'_unw.fna' for sample_name in sample_names]) +\ ' > ' + output_filepath # this should be an abs filepath commands.append([('cat into one fasta file', cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success)
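
# Usage sketch for run_ampliconnoise, with hypothetical paths. As the
# docstring notes, PYRO_LOOKUP_FILE must be set in the environment before the
# generated commands execute, and output_filepath must be absolute. Kept
# commented out so nothing runs on import.
#
#     import os
#     assert 'PYRO_LOOKUP_FILE' in os.environ
#     run_ampliconnoise(
#         mapping_fp='map.txt',
#         output_dir='ampliconnoise_out',
#         command_handler=call_commands_serially,
#         params=parse_qiime_parameters([]),
#         qiime_config=load_qiime_config(),
#         sff_txt_fp='run1.sff.txt',
#         output_filepath=abspath('ampliconnoise_out/seqs.fna'))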
def pick_subsampled_open_reference_otus(input_fp,
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of QIIME

        The steps performed by this function are:
         1) Pick reference OTUs against refseqs_fp.
         2) Subsample the failures to roughly percent_subsample of the full
            failure set.
         3) Pick OTUs de novo on the subsampled failures.
         4) Pick representative sequences for the resulting OTUs.
         5) Pick reference OTUs on all failures using the representative set
            from step 4 as the reference set.
    """
    # only uclust and usearch61 are currently supported for OTU picking
    allowed_denovo_otu_picking_methods = ['uclust','usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref','usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
     "Unknown de novo OTU picking method: %s. Known methods are: %s"\
     % (denovo_otu_picking_method,
        ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
     "Unknown reference OTU picking method: %s. Known methods are: %s"\
     % (reference_otu_picking_method,
        ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,[input_fp,
                               refseqs_fp,
                               step1_otu_map_fp,
                               step1_failures_fasta_fp])
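    # Note (added; inferred from the prefilter commands below): reads that
    # fail to match any sequence in prefilter_refseqs_fp at at least
    # prefilter_percent_id identity (default 0.60) are discarded before OTU
    # picking, on the assumption that they are likely non-target sequence.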
    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not
    ## already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = '%s/step1_otus' % output_dir
        step1_otu_map_fp = '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    step1_repset_fasta_fp = '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)
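    # Note (added; inferred from subsample_fasta): each failure sequence is
    # retained independently with probability percent_subsample, so the
    # subsampled file holds roughly percent_subsample * (number of failures)
    # sequences; e.g., percent_subsample=0.1 on 100,000 failures yields
    # ~10,000 reads, with the exact count varying between runs.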
    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    ## Pick reference OTUs on all failures, using the step 2 de novo
    ## representative set as the reference collection
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
     step1_failures_fasta_fp,
     step3_dir,
     reference_otu_picking_method,
     step2_repset_fasta_fp,
     parallel,
     params,
     logger)
    commands.append([
     ('Pick reference OTUs using de novo rep set',step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,
          step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
         (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
         (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %\
                           (step3_failures_list_fp,output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' %\
     (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    ## Make the final representative seqs file and a new refseqs file that
    ## can be used in subsequent OTU picking runs. This is clunky for two
    ## reasons. First, both files must exclude singletons, to match the
    ## filtered OTU map. Second, the two files have different requirements:
    ## the reference set must be a superset of the input reference set,
    ## while the rep set should contain only the sequences observed in this
    ## data set, and we want the reps for the step 1 reference OTUs to be
    ## reads from this run so we don't hit issues building a tree from
    ## sequences of very different lengths. So...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir

    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))

    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
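    # Note (added; rationale inferred): the bare newline written above guards
    # against an input refseqs file that lacks a trailing newline, so the
    # first appended FASTA record does not run onto the last line of the
    # copied reference set.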
    # iterate over all representative sequences from step2 and step4 and
    # write those corresponding to non-singleton otus to the final
    # representative set file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' %\
          (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' %\
          (output_dir,min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)

            taxonomy_fp = assign_tax(
             repset_fasta_fp=final_repset_fp,
             output_dir=output_dir,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             parallel=parallel,
             logger=logger,
             status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
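    # Note (added; rationale inferred from the filtering call below): the
    # run_align_and_tree block drops OTUs whose representative sequences
    # failed PyNAST alignment from the OTU table. Those OTUs are absent from
    # the tree, so leaving them in the table would break downstream
    # phylogenetic analyses such as UniFrac.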
    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
             repset_fasta_fp=final_repset_fp,
             output_dir=output_dir,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             parallel=parallel,
             logger=logger,
             status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
             parse_biom_table(open(align_and_tree_input_otu_table,'U')),
             get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
             0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
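
# Example (added sketch, not part of the original module): a minimal
# invocation of pick_subsampled_open_reference_otus. All file paths are
# hypothetical, and the import locations are assumptions about the module
# layout; call_commands_serially and parse_qiime_parameters are the same
# helpers used by run_core_diversity_analyses below.
#
#   from qiime.util import load_qiime_config
#   from qiime.parse import parse_qiime_parameters
#
#   pick_subsampled_open_reference_otus(
#       input_fp='seqs.fna',              # demultiplexed input reads
#       refseqs_fp='refseqs.fasta',       # reference collection
#       output_dir='open_ref_otus/',
#       percent_subsample=0.001,          # cluster ~0.1% of failures de novo
#       new_ref_set_id='NewRef',
#       command_handler=call_commands_serially,
#       params=parse_qiime_parameters([]),
#       qiime_config=load_qiime_config())
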
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout):
    """ Run the core QIIME diversity analyses (taxa summaries, alpha and
        beta diversity, and OTU category significance) on a BIOM table and
        build an index page linking the results.
    """
    if categories != None:
        # Validate categories provided by the user
        mapping_data, mapping_comments = \
         parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, ("Category '%s' is not a column header "
                 "in your mapping file. "
                 "Categories are case and white space sensitive. Valid "
                 "choices are: (%s)" %\
                 (c,', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, ("Category '%s' contains only one value. "
                 "Categories analyzed here require at least two values." % c)
    else:
        categories = []

    # prep some variables
    if params == None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,_index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp != None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)

    # run print_biom_table_summary.py on input BIOM table
    try:
        params_str = get_params_str(params['print_biom_table_summary'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    print_biom_table_summary_cmd = \
     "print_biom_table_summary.py -i %s -o %s --suppress_md5 %s" % \
     (biom_fp, biom_table_stats_output_fp, params_str)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))
    commands.append([('Generate BIOM table summary',
                      print_biom_table_summary_cmd)])

    # filter samples with fewer observations than the requested
    # sampling_depth. since these get filtered for some analyses (eg beta
    # diversity after even sampling) it's useful to filter them here so
    # they're filtered from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
     (biom_fp,filtered_biom_fp,sampling_depth)
    commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
                      filter_samples_cmd)])
    biom_fp = filtered_biom_fp

    # run initial commands and reset the command list
    command_handler(commands,
                    status_update_callback,
                    logger,
                    close_logger_on_success=False)
    commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
        even_dm_fps = run_beta_diversity_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=bdiv_even_output_dir,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         sampling_depth=sampling_depth,
         # force suppression of distance histograms - boxplots work better
         # in this context, and are created below.
         histogram_categories=[],
         tree_fp=tree_fp,
         parallel=parallel,
         logger=logger,
         suppress_md5=True,
         status_update_callback=status_update_callback)

        for bdiv_metric, dm_fp in even_dm_fps:
            boxplots_output_dir = '%s/%s_boxplots/' %\
             (bdiv_even_output_dir,bdiv_metric)
            for category in categories:
                try:
                    params_str = get_params_str(params['make_distance_boxplots'])
                except KeyError:
                    params_str = ''
                boxplots_cmd = \
                 'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                 (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
                commands.append([('Boxplots (%s)' % category,
                                  boxplots_cmd)])
                index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                    '%s/%s_Distances.pdf' % \
                                     (boxplots_output_dir,category),
                                    _index_headers['beta_diversity_even'] %\
                                     sampling_depth))
                index_links.append(('Distance boxplots statistics (%s)' %\
                                     bdiv_metric,
                                    '%s/%s_Stats.txt' % \
                                     (boxplots_output_dir,category),
                                    _index_headers['beta_diversity_even'] %\
                                     sampling_depth))

            index_links.append(('3D plot (%s, continuous coloring)' %\
                                 bdiv_metric,
                                '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] %\
                                 sampling_depth))
            index_links.append(('3D plot (%s, discrete coloring)' %\
                                 bdiv_metric,
                                '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] %\
                                 sampling_depth))
            index_links.append(('2D plot (%s, continuous coloring)' %\
                                 bdiv_metric,
                                '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] %\
                                 sampling_depth))
            index_links.append(('2D plot (%s, discrete coloring)' %\
                                 bdiv_metric,
                                '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] %\
                                 sampling_depth))
            index_links.append(('Distance matrix (%s)' % bdiv_metric,
                                '%s/%s_dm.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] %\
                                 sampling_depth))
            index_links.append(('Principal coordinate matrix (%s)' %\
                                 bdiv_metric,
                                '%s/%s_pc.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] %\
                                 sampling_depth))

    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
        run_alpha_rarefaction(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=arare_full_output_dir,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         tree_fp=tree_fp,
         num_steps=arare_num_steps,
         parallel=parallel,
         logger=logger,
         min_rare_depth=arare_min_rare_depth,
         max_rare_depth=sampling_depth,
         suppress_md5=True,
         status_update_callback=status_update_callback)

        index_links.append(('Alpha rarefaction plots',
                            '%s/alpha_rarefaction_plots/rarefaction_plots.html'\
                              % arare_full_output_dir,
                            _index_headers['alpha_diversity']))

        collated_alpha_diversity_fps = \
         glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
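        # Note (added; assumption about compare_alpha_diversity.py): the
        # commands built below test, for each alpha diversity metric, whether
        # the metric differs between the groups defined by each mapping
        # category; -n 999 is taken to set the number of Monte Carlo
        # permutations used by the nonparametric tests.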
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                 (arare_full_output_dir,category,alpha_metric)
                compare_alpha_cmd = \
                 'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                 (collated_alpha_diversity_fp, mapping_fp, category,
                  alpha_comparison_output_fp, params_str)
                commands.append([('Compare alpha diversity (%s, %s)' %\
                                   (category,alpha_metric),
                                  compare_alpha_cmd)])
                index_links.append(
                 ('Alpha diversity statistics (%s, %s)' %\
                   (category,alpha_metric),
                  alpha_comparison_output_fp,
                  _index_headers['alpha_diversity']))

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        run_summarize_taxa_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=taxa_plots_output_dir,
         mapping_cat=None,
         sort=True,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         logger=logger,
         suppress_md5=True,
         status_update_callback=status_update_callback)

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))

        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,category)
            run_summarize_taxa_through_plots(
             otu_table_fp=biom_fp,
             mapping_fp=mapping_fp,
             output_dir=taxa_plots_output_dir,
             mapping_cat=category,
             sort=True,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             logger=logger,
             suppress_md5=True,
             status_update_callback=status_update_callback)

            index_links.append(('Taxa summary bar plots',
                                '%s/taxa_summary_plots/bar_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] %\
                                 category))
            index_links.append(('Taxa summary area plots',
                                '%s/taxa_summary_plots/area_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] %\
                                 category))

    if not suppress_otu_category_significance:
        # OTU category significance
        for category in categories:
            category_significance_fp = \
             '%s/category_significance_%s.txt' % (output_dir, category)
            try:
                params_str = get_params_str(params['otu_category_significance'])
            except KeyError:
                params_str = ''
            # Build the OTU category significance command
            category_significance_cmd = \
             'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
             (biom_fp, mapping_fp, category,
              category_significance_fp, params_str)
            commands.append([('OTU category significance (%s)' % category,
                              category_significance_cmd)])

            index_links.append(('Category significance (%s)' % category,
                                category_significance_fp,
                                _index_headers['otu_category_sig']))

    commands.append([('Compress the filtered BIOM table',
                      'gzip %s' % filtered_biom_fp)])
    index_links.append(('Filtered BIOM table (minimum sequence count: %d)' %\
                         sampling_depth,
                        '%s.gz' % filtered_biom_fp,
                        _index_headers['run_summary']))

    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links,index_fp)
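
# Example (added sketch, not part of the original module): a minimal
# invocation of run_core_diversity_analyses. File paths and the category
# name are hypothetical; load_qiime_config is the standard QIIME helper
# (import location is an assumption about the module layout).
#
#   from qiime.util import load_qiime_config
#
#   run_core_diversity_analyses(
#       biom_fp='otu_table.biom',
#       mapping_fp='map.txt',
#       sampling_depth=1000,        # even sampling depth for beta diversity
#       output_dir='core_div/',
#       qiime_config=load_qiime_config(),
#       tree_fp='rep_set.tre',      # required for phylogenetic metrics
#       categories=['Treatment'])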