def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = parse_sample_id_map(open(opts.sample_id_map_fp, 'U'))

    results = compare_taxa_summaries(
        parse_taxa_summary_table(open(opts.taxa_summary_fps[0], 'U')),
        parse_taxa_summary_table(open(opts.taxa_summary_fps[1], 'U')),
        opts.comparison_mode,
        correlation_type=opts.correlation_type,
        tail_type=opts.tail_type,
        num_permutations=opts.num_permutations,
        confidence_level=opts.confidence_level,
        perform_detailed_comparisons=opts.perform_detailed_comparisons,
        sample_id_map=sample_id_map,
        expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their filenames
    # on the original input filenames. If the filenames are the same, append
    # a number to each filename.
    same_filenames = False
    if basename(opts.taxa_summary_fps[0]) == \
       basename(opts.taxa_summary_fps[1]):
        same_filenames = True

    for orig_ts_fp, filled_ts_lines, file_num in zip(opts.taxa_summary_fps,
                                                     results[:2],
                                                     range(0, 2)):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)

        filled_ts_f = open(join(opts.output_dir, filled_ts_fp), 'w')
        filled_ts_f.write(filled_ts_lines)
        filled_ts_f.close()

    # Write the overall comparison result.
    overall_comp_f = open(join(opts.output_dir, 'overall_comparison.txt'), 'w')
    overall_comp_f.write(results[2])
    overall_comp_f.close()

    # Write the correlation vector containing the pairwise sample comparisons.
    if opts.perform_detailed_comparisons:
        corr_vec_f = open(join(opts.output_dir, 'detailed_comparisons.txt'),
                          'w')
        corr_vec_f.write(results[3])
        corr_vec_f.close()
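# A minimal usage sketch (hypothetical file paths). The flags match the
# invocation of this script used elsewhere in this codebase, e.g.:
#
#     compare_taxa_summaries.py -i ts1.txt,ts2.txt -m paired -o ts_compare_out
#
# which writes the sorted/filled taxa summaries, overall_comparison.txt, and
# (when detailed comparisons are requested) detailed_comparisons.txt under
# the output directory.

if __name__ == "__main__":
    main()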
def _generate_taxa_processing_commands(assigned_taxonomy_dir, input_fasta_fp,
                                       clean_otu_table_fp, run_id):
    """Build command strings for adding taxonomy metadata and summarizing taxa.

    These commands are used with every method.
    """
    taxa_assignments_fp = join(assigned_taxonomy_dir,
            splitext(basename(input_fasta_fp))[0] + '_tax_assignments.txt')
    otu_table_w_taxa_fp = join(assigned_taxonomy_dir,
            add_filename_suffix(clean_otu_table_fp, '_w_taxa'))

    add_md_command = [('Adding metadata (%s)' % run_id,
            'biom add-metadata -i %s -o %s '
            '--observation-metadata-fp %s --sc-separated taxonomy '
            '--observation-header OTUID,taxonomy' %
            (clean_otu_table_fp, otu_table_w_taxa_fp, taxa_assignments_fp))]

    summarize_taxa_command = [('Summarizing taxa (%s)' % run_id,
            'summarize_taxa.py -i %s -o %s' %
            (otu_table_w_taxa_fp, assigned_taxonomy_dir))]

    return add_md_command, summarize_taxa_command
def _generate_taxa_processing_commands(assigned_taxonomy_dir, input_fasta_fp,
                                       clean_otu_table_fp, run_id):
    """Build command strings for adding and summarizing taxa.

    These commands are used with every method.
    """
    taxa_assignments_fp = join(
        assigned_taxonomy_dir,
        splitext(basename(input_fasta_fp))[0] + '_tax_assignments.txt')
    otu_table_w_taxa_fp = join(
        assigned_taxonomy_dir,
        add_filename_suffix(clean_otu_table_fp, '_w_taxa'))

    add_taxa_command = [
        ('Adding taxa (%s)' % run_id,
         'add_taxa.py -i %s -o %s -t %s' %
         (clean_otu_table_fp, otu_table_w_taxa_fp, taxa_assignments_fp))
    ]
    summarize_taxa_command = [
        ('Summarizing taxa (%s)' % run_id,
         'summarize_taxa.py -i %s -o %s' %
         (otu_table_w_taxa_fp, assigned_taxonomy_dir))
    ]

    return add_taxa_command, summarize_taxa_command
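# A minimal wiring sketch (hypothetical paths, run ID, and handler). Each
# returned value is a one-element list of (label, command-string) tuples,
# the same shape the command handlers used elsewhere in this codebase
# (e.g. call_commands_serially) consume.
def _example_taxa_processing_wiring(command_handler, logger,
                                    status_update_callback):
    add_taxa_command, summarize_taxa_command = \
        _generate_taxa_processing_commands(
            'out/rdp_assigned_taxonomy',  # hypothetical output dir
            'seqs.fna',                   # hypothetical input FASTA
            'out/clean_otu_table.biom',   # hypothetical OTU table
            'run1')                       # hypothetical run ID

    commands = []
    commands.append(add_taxa_command)
    commands.append(summarize_taxa_command)
    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)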
def _build_simulated_data_commands(analysis_type, out_dir, even_otu_table_fp,
                                   map_fp, tree_fp, workflow):
    cmds = []
    data_type_dir = join(out_dir, 'simulated')
    create_dir(data_type_dir)

    num_samps = get_num_samples_in_table(even_otu_table_fp)

    for category in workflow['categories']:
        category_dir = join(data_type_dir, category[0])
        create_dir(category_dir)

        for trial_num in range(workflow['num_sim_data_trials']):
            trial_num_dir = join(category_dir, '%d' % trial_num)
            create_dir(trial_num_dir)

            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)
                create_dir(samp_size_dir)

                # Lots of duplicate code between these two blocks... need to
                # refactor and test.
                if samp_size <= num_samps:
                    simsam_rep_num = 1

                    subset_otu_table_fp = join(samp_size_dir,
                                               basename(even_otu_table_fp))
                    subset_map_fp = join(samp_size_dir, basename(map_fp))

                    if not has_results(samp_size_dir,
                            required_files=[basename(subset_otu_table_fp),
                                            basename(subset_map_fp)]):
                        run_command('choose_data_subset.py -t %s -i %s -m %s -c %s -n %d -o %s' %
                                    (analysis_type, even_otu_table_fp, map_fp,
                                     category[0], samp_size, samp_size_dir))
                    assert get_num_samples_in_table(
                            subset_otu_table_fp) == samp_size
                    assert get_num_samples_in_map(subset_map_fp) == samp_size

                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)

                        simsam_map_fp = join(dissim_dir,
                                add_filename_suffix(subset_map_fp,
                                        '_n%d_d%r' % (simsam_rep_num, d)))
                        simsam_otu_table_fp = join(dissim_dir,
                                add_filename_suffix(subset_otu_table_fp,
                                        '_n%d_d%r' % (simsam_rep_num, d)))

                        # Check for simulated table/map and various distance
                        # matrices / coordinates files.
                        required_simsam_files = [
                            basename(simsam_map_fp),
                            basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(dissim_dir,
                                required_files=required_simsam_files)

                        has_metric_files = True
                        for metric in workflow['metrics']:
                            required_metric_files = ['dm.txt', 'map.txt',
                                                     'pc.txt']
                            if analysis_type == 'gradient':
                                required_metric_files.append(
                                        '%s_dm.txt' % category[0])

                            metric_dir = join(dissim_dir, metric[0])
                            has_metric_files = has_results(metric_dir,
                                    required_metric_files)
                            if not has_metric_files:
                                break

                        if not (has_simsam_files and has_metric_files):
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d -m %s' %
                                   (subset_otu_table_fp, tree_fp, dissim_dir,
                                    d, simsam_rep_num, subset_map_fp)]

                            for metric in workflow['metrics']:
                                metric_dir = join(dissim_dir, metric[0])
                                create_dir(metric_dir)

                                if analysis_type == 'gradient':
                                    cmd.append('distance_matrix_from_mapping.py -i %s -c %s -o %s' %
                                               (simsam_map_fp, category[0],
                                                join(metric_dir, '%s_dm.txt' %
                                                     category[0])))

                                cmd.append('beta_diversity.py -i %s -o %s -m %s -t %s' %
                                           (simsam_otu_table_fp, metric_dir,
                                            metric[0], tree_fp))
                                cmd.append('mv %s %s' %
                                           (join(metric_dir, '%s_%s.txt' %
                                                 (metric[0], splitext(basename(
                                                  simsam_otu_table_fp))[0])),
                                            join(metric_dir, 'dm.txt')))
                                cmd.append('cp %s %s' %
                                           (simsam_map_fp,
                                            join(metric_dir, 'map.txt')))
                                cmd.append('principal_coordinates.py -i %s -o %s' %
                                           (join(metric_dir, 'dm.txt'),
                                            join(metric_dir, 'pc.txt')))

                            cmds.append(' && '.join(cmd))
                else:
                    # We need to simulate more samples than we originally
                    # have.
                    simsam_rep_num = get_simsam_rep_num(samp_size, num_samps)

                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)

                        simsam_map_fp = join(dissim_dir,
                                add_filename_suffix(map_fp,
                                        '_n%d_d%r' % (simsam_rep_num, d)))
                        simsam_otu_table_fp = join(dissim_dir,
                                add_filename_suffix(even_otu_table_fp,
                                        '_n%d_d%r' % (simsam_rep_num, d)))

                        required_simsam_files = [
                            basename(simsam_map_fp),
                            basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(dissim_dir,
                                required_files=required_simsam_files)

                        required_subset_files = [
                            basename(simsam_map_fp),
                            basename(simsam_otu_table_fp)]
                        has_subset_files = has_results(
                                join(dissim_dir, 'subset'),
                                required_files=required_subset_files)

                        has_metric_files = True
                        for metric in workflow['metrics']:
                            required_metric_files = ['dm.txt', 'map.txt',
                                                     'pc.txt']
                            if analysis_type == 'gradient':
                                required_metric_files.append(
                                        '%s_dm.txt' % category[0])

                            metric_dir = join(dissim_dir, metric[0])
                            has_metric_files = has_results(metric_dir,
                                    required_metric_files)
                            if not has_metric_files:
                                break

                        if not (has_simsam_files and has_subset_files and
                                has_metric_files):
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d -m %s' %
                                   (even_otu_table_fp, tree_fp, dissim_dir, d,
                                    simsam_rep_num, map_fp)]

                            subset_dir = join(dissim_dir, 'subset')
                            cmd.append('choose_data_subset.py -t %s -i %s -m %s -c %s -n %d -o %s' %
                                       (analysis_type, simsam_otu_table_fp,
                                        simsam_map_fp, category[0], samp_size,
                                        subset_dir))
                            subset_otu_table_fp = join(subset_dir,
                                    basename(simsam_otu_table_fp))
                            subset_map_fp = join(subset_dir,
                                    basename(simsam_map_fp))

                            for metric in workflow['metrics']:
                                metric_dir = join(dissim_dir, metric[0])
                                create_dir(metric_dir)

                                if analysis_type == 'gradient':
                                    cmd.append('distance_matrix_from_mapping.py -i %s -c %s -o %s' %
                                               (subset_map_fp, category[0],
                                                join(metric_dir, '%s_dm.txt' %
                                                     category[0])))

                                cmd.append('beta_diversity.py -i %s -o %s -m %s -t %s' %
                                           (subset_otu_table_fp, metric_dir,
                                            metric[0], tree_fp))
                                cmd.append('mv %s %s' %
                                           (join(metric_dir, '%s_%s.txt' %
                                                 (metric[0], splitext(basename(
                                                  subset_otu_table_fp))[0])),
                                            join(metric_dir, 'dm.txt')))
                                cmd.append('cp %s %s' %
                                           (subset_map_fp,
                                            join(metric_dir, 'map.txt')))
                                cmd.append('principal_coordinates.py -i %s -o %s' %
                                           (join(metric_dir, 'dm.txt'),
                                            join(metric_dir, 'pc.txt')))

                            cmds.append(' && '.join(cmd))

    return cmds
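# get_simsam_rep_num is called above but not defined in this module. A
# plausible implementation (an assumption, not necessarily the author's):
# simsam.py's -n option creates that many replicates of each input sample,
# so we want the smallest replicate count whose output holds at least
# samp_size samples.
def _example_get_simsam_rep_num(samp_size, num_samps):
    rep_num = 1
    while num_samps * rep_num < samp_size:
        rep_num += 1
    return rep_num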
def create_personal_results(
    output_dir,
    mapping_fp,
    coord_fp,
    collated_dir,
    otu_table_fp,
    prefs_fp,
    personal_id_column,
    personal_ids=None,
    column_title="Self",
    individual_titles=None,
    category_to_split="BodySite",
    time_series_category="WeeksSinceStart",
    rarefaction_depth=10000,
    alpha=0.05,
    rep_set_fp=None,
    body_site_rarefied_otu_table_dir=None,
    retain_raw_data=False,
    suppress_alpha_rarefaction=False,
    suppress_beta_diversity=False,
    suppress_taxa_summary_plots=False,
    suppress_alpha_diversity_boxplots=False,
    suppress_otu_category_significance=False,
    command_handler=call_commands_serially,
    status_update_callback=no_status_updates,
):
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, "support_files")
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), "my_microbes", "support_files"),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, "U"))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
                         "column header." % category_to_split)
    header = header[:-1] + [column_title] + [header[-1]]

    # Column that differentiates between body sites within a single
    # individual. It is used to create the vectors in make_3d_plots.py, and
    # its data are created by concatenating the two columns when writing the
    # mapping file.
    site_id_category = "%s&&%s" % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError(
                    "'%s' is not a personal ID in the mapping "
                    "file column '%s'." % (pid, personal_id_column)
                )

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    otu_table_title = splitext(basename(otu_table_fp))

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(
            output_dir,
            add_filename_suffix(otu_table_fp, "_even%d" % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = "Rarefying OTU table"
            cmd = "single_rarefaction.py -i %s -o %s -d %s" % (
                otu_table_fp, rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, "per_body_site_otu_tables")
            cmd_title = "Splitting rarefied OTU table by body site"
            cmd = "split_otu_table.py -i %s -m %s -f %s -o %s" % (
                rarefied_otu_table_fp,
                mapping_fp,
                category_to_split,
                per_body_site_dir,
            )
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        # Files to clean up on a per-individual basis.
        personal_raw_data_files = []
        personal_raw_data_dirs = []

        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        "mapping_file.txt")
        html_fp = join(output_dir, person_of_interest, "index.html")

        personal_mapping_data = create_personal_mapping_file(
            mapping_data,
            person_of_interest,
            personal_id_index,
            bodysite_index,
            individual_titles,
        )

        personal_mapping_f = open(personal_mapping_file_fp, "w")
        personal_mapping_f.write(
            format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        personal_raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set(
            [e[column_title_index] for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ""
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     "adiv_boxplots")
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                collated_dir,
                personal_mapping_file_fp,
                category_to_split,
                column_title,
                rarefaction_depth,
                adiv_boxplots_dir,
            )

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   "alpha_rarefaction")
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = "Creating rarefaction plots (%s)" % person_of_interest
            cmd = "make_rarefaction_plots.py -i %s -m %s -p %s -o %s" % (
                collated_dir,
                personal_mapping_file_fp,
                prefs_fp,
                rarefaction_dir,
            )
            commands.append([(cmd_title, cmd)])

            personal_raw_data_dirs.append(join(rarefaction_dir,
                                               "average_plots"))
            personal_raw_data_dirs.append(join(rarefaction_dir,
                                               "average_tables"))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, "beta_diversity")
            pcoa_time_series_dir = join(output_dir, person_of_interest,
                                        "beta_diversity_time_series")
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = ("Creating beta diversity time series plots (%s)" %
                         person_of_interest)
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=" % (
                personal_mapping_file_fp,
                prefs_fp,
                coord_fp,
                pcoa_time_series_dir,
            ) + "'%s' --add_vectors='%s,%s'" % (time_series_category,
                                                site_id_category,
                                                time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = ("Creating beta diversity plots (%s)" %
                         person_of_interest)
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s" % (
                personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        taxa_summary_plots_html = ""
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest,
                                  "time_series")
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            files_to_remove, dirs_to_remove = _generate_taxa_summary_plots(
                otu_table_fp,
                personal_mapping_file_fp,
                person_of_interest,
                column_title,
                column_title_values,
                category_to_split,
                cat_values,
                time_series_category,
                area_plots_dir,
                command_handler,
                status_update_callback,
                logger,
            )

            personal_raw_data_files.extend(files_to_remove)
            personal_raw_data_dirs.extend(dirs_to_remove)

            taxa_summary_plots_html = create_taxa_summary_plots_html(
                output_dir, person_of_interest, cat_values)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ""
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   "otu_category_significance")
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need
            # to parse these later on.
            commands = []
            valid_body_sites = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(
                    per_body_site_dir,
                    add_filename_suffix(rarefied_otu_table_fp,
                                        "_%s" % cat_value))

                if exists(body_site_otu_table_fp):
                    # Make sure we have at least one sample for Self,
                    # otherwise otu_category_significance.py crashes with a
                    # division by zero error.
                    with open(body_site_otu_table_fp, "U") as body_site_otu_table_f, open(
                        personal_mapping_file_fp, "U"
                    ) as personal_mapping_file_f:
                        personal_sample_count = _count_per_individual_samples(
                            body_site_otu_table_f,
                            personal_mapping_file_f,
                            personal_id_column,
                            person_of_interest,
                        )

                    if personal_sample_count < 1:
                        continue
                    else:
                        valid_body_sites.append(cat_value)

                    otu_cat_output_fp = join(
                        otu_cat_sig_dir, "otu_cat_sig_%s.txt" % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' %
                                 (cat_value, person_of_interest))
                    cmd = ("otu_category_significance.py -i %s -m %s -c %s "
                           "-o %s" % (
                               body_site_otu_table_fp,
                               personal_mapping_file_fp,
                               column_title,
                               otu_cat_output_fp,
                           ))
                    commands.append([(cmd_title, cmd)])
                    personal_raw_data_files.append(otu_cat_output_fp)

                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            # Hack to allow print-only mode.
            if command_handler is not print_commands and not valid_body_sites:
                raise ValueError(
                    "None of the body sites for personal ID '%s' "
                    "could be processed because there were no "
                    "matching samples in the rarefied OTU table." %
                    person_of_interest
                )

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                create_otu_category_significance_html_tables(
                    otu_cat_sig_output_fps,
                    alpha,
                    otu_cat_sig_dir,
                    individual_titles,
                    rep_set_fp=rep_set_fp,
                )

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [
                join(rel_otu_cat_sig_dir, html_filename)
                for html_filename in otu_cat_sig_html_filenames
            ]

            otu_category_significance_html = \
                create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(
            person_of_interest,
            html_fp,
            taxa_summary_plots_html=taxa_summary_plots_html,
            alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
            otu_category_significance_html=otu_category_significance_html,
        )

        # Clean up the unnecessary raw data files and directories for the
        # current individual. glob will only grab paths that exist.
        if not retain_raw_data:
            clean_up_raw_data_files(personal_raw_data_files,
                                    personal_raw_data_dirs)

    # Clean up any remaining raw data files that weren't created on a
    # per-individual basis.
    if not retain_raw_data:
        clean_up_raw_data_files(raw_data_files, raw_data_dirs)

    logger.close()

    return output_directories
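# A minimal usage sketch (all paths hypothetical). Only the required
# positional arguments are supplied, so the keyword defaults above apply
# (column_title="Self", category_to_split="BodySite",
# rarefaction_depth=10000, etc.).
def _example_create_personal_results():
    return create_personal_results(
        "personal_results_out",       # output_dir
        "mapping.txt",                # mapping_fp
        "unweighted_unifrac_pc.txt",  # coord_fp (principal coordinates)
        "alpha_div_collated",         # collated_dir
        "otu_table.biom",             # otu_table_fp
        "prefs.txt",                  # prefs_fp
        "PersonalID",                 # personal_id_column
    )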
def _generate_taxa_summary_plots(
    otu_table_fp,
    personal_map_fp,
    personal_id,
    personal_cat,
    personal_cat_values,
    body_site_cat,
    body_site_cat_values,
    time_series_cat,
    output_dir,
    command_handler,
    status_update_callback,
    logger,
):
    files_to_remove = []
    dirs_to_remove = []

    ## Split OTU table into self/other per-body-site tables
    commands = []
    cmd_title = "Splitting OTU table into self/other (%s)" % personal_id
    cmd = "split_otu_table.py -i %s -m %s -f %s -o %s" % (
        otu_table_fp, personal_map_fp, personal_cat, output_dir)
    commands.append([(cmd_title, cmd)])
    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    # Prefix to be used for taxa summary dirs. Will be
    # <taxa_summary_dir_prefix>_<self|other>_<body site>/.
    ts_dir_prefix = "taxa_summaries"

    # Create taxa summaries for self and other, per body site.
    for personal_cat_value in personal_cat_values:
        personal_cat_biom_fp = join(
            output_dir,
            add_filename_suffix(otu_table_fp, "_%s" % personal_cat_value))
        personal_cat_map_fp = join(output_dir,
                                   "mapping_%s.txt" % personal_cat_value)
        files_to_remove.append(personal_cat_biom_fp)
        files_to_remove.append(personal_cat_map_fp)

        body_site_dir = join(output_dir, personal_cat_value)

        commands = []
        cmd_title = 'Splitting "%s" OTU table by body site (%s)' % (
            personal_cat_value, personal_id)
        cmd = "split_otu_table.py -i %s -m %s -f %s -o %s" % (
            personal_cat_biom_fp,
            personal_map_fp,
            body_site_cat,
            body_site_dir,
        )
        commands.append([(cmd_title, cmd)])
        dirs_to_remove.append(body_site_dir)
        command_handler(commands, status_update_callback, logger,
                        close_logger_on_success=False)

        commands = []
        for body_site_cat_value in body_site_cat_values:
            body_site_otu_table_fp = join(
                body_site_dir,
                add_filename_suffix(personal_cat_biom_fp,
                                    "_%s" % body_site_cat_value))

            # We won't always get an OTU table if the mapping file category
            # contains samples that aren't in the OTU table (e.g. the 'na'
            # state for body site).
            if exists(body_site_otu_table_fp):
                ts_dir = join(output_dir, "%s_%s_%s" % (
                    ts_dir_prefix, personal_cat_value, body_site_cat_value))
                create_dir(ts_dir)
                dirs_to_remove.append(ts_dir)

                # Summarize.
                summarized_otu_table_fp = join(
                    ts_dir, "%s_otu_table.biom" % time_series_cat)
                cmd_title = ("Summarizing OTU table by category (%s)" %
                             personal_id)
                cmd = ("summarize_otu_by_cat.py -i %s -c %s -o %s "
                       "-m %s " % (
                           personal_map_fp,
                           body_site_otu_table_fp,
                           summarized_otu_table_fp,
                           time_series_cat,
                       ))
                commands.append([(cmd_title, cmd)])

                # Sort.
                sorted_otu_table_fp = join(
                    ts_dir, "%s_otu_table_sorted.biom" % time_series_cat)
                cmd_title = "Sorting OTU table (%s)" % personal_id
                cmd = "sort_otu_table.py -i %s -o %s" % (
                    summarized_otu_table_fp, sorted_otu_table_fp)
                commands.append([(cmd_title, cmd)])

                # Summarize taxa.
                cmd_title = "Summarizing taxa (%s)" % personal_id
                cmd = "summarize_taxa.py -i %s -o %s" % (sorted_otu_table_fp,
                                                         ts_dir)
                commands.append([(cmd_title, cmd)])

                create_comparative_taxa_plots_html(
                    body_site_cat_value,
                    join(output_dir,
                         "%s_comparative.html" % body_site_cat_value)
                )

        command_handler(commands, status_update_callback, logger,
                        close_logger_on_success=False)

    # Make each corresponding taxa summary compatible so that coloring matches
    # between them. We want to be able to compare self versus other at each
    # body site.
    commands = []
    valid_body_sites = []
    for body_site_cat_value in body_site_cat_values:
        personal_cat_vals = list(personal_cat_values)

        ts_dir = join(output_dir, "%s_%s_%s" % (
            ts_dir_prefix, personal_cat_vals[0], body_site_cat_value))
        if not exists(ts_dir):
            continue

        # Check that we have 2+ weeks (samples were previously collapsed into
        # weeks for self and other). If we don't have 2+ weeks,
        # plot_taxa_summary.py will fail, so we'll skip this body site.
        weeks_otu_table_fp = join(
            ts_dir, "%s_otu_table_sorted.biom" % time_series_cat)
        with open(weeks_otu_table_fp, "U") as weeks_otu_table_f:
            if _count_num_samples(weeks_otu_table_f) < 2:
                continue

        ts_fps1 = sorted(glob(join(
            ts_dir, "%s_otu_table_sorted_L*.txt" % time_series_cat)))

        ts_dir = join(output_dir, "%s_%s_%s" % (
            ts_dir_prefix, personal_cat_vals[1], body_site_cat_value))
        if not exists(ts_dir):
            continue

        weeks_otu_table_fp = join(
            ts_dir, "%s_otu_table_sorted.biom" % time_series_cat)
        with open(weeks_otu_table_fp, "U") as weeks_otu_table_f:
            if _count_num_samples(weeks_otu_table_f) < 2:
                continue

        ts_fps2 = sorted(glob(join(
            ts_dir, "%s_otu_table_sorted_L*.txt" % time_series_cat)))

        if len(ts_fps1) != len(ts_fps2):
            raise ValueError("There are not an equal number of taxa summaries "
                             "to compare between self and other.")

        compatible_ts_dir = join(output_dir,
                                 "compatible_ts_%s" % body_site_cat_value)
        dirs_to_remove.append(compatible_ts_dir)

        compatible_ts_fps = defaultdict(list)
        for ts_fp1, ts_fp2 in zip(ts_fps1, ts_fps2):
            if basename(ts_fp1) != basename(ts_fp2):
                raise ValueError("Could not find matching taxa summaries "
                                 "between self and other to compare.")

            # Make taxa summaries compatible.
            cmd_title = "Making compatible taxa summaries (%s)" % personal_id
            cmd = "compare_taxa_summaries.py -i %s,%s -o %s -m paired -n 0" % (
                ts_fp1, ts_fp2, compatible_ts_dir)
            commands.append([(cmd_title, cmd)])

            compatible_ts_fps[personal_cat_vals[0]].append(
                join(compatible_ts_dir,
                     add_filename_suffix(ts_fp1, "_sorted_and_filled_0"))
            )
            compatible_ts_fps[personal_cat_vals[1]].append(
                join(compatible_ts_dir,
                     add_filename_suffix(ts_fp2, "_sorted_and_filled_1"))
            )

        for personal_cat_value in personal_cat_values:
            # Plot taxa summaries.
            ts_fps = ",".join(sorted(compatible_ts_fps[personal_cat_value]))
            ts_plots_dir = join(
                output_dir,
                "taxa_plots_%s_%s" % (personal_cat_value,
                                      body_site_cat_value),
                "taxa_summary_plots"
            )
            cmd_title = "Plot taxa summaries (%s)" % personal_id
            cmd = "plot_taxa_summary.py -i %s -o %s -a numeric" % (
                ts_fps, ts_plots_dir)
            commands.append([(cmd_title, cmd)])

        # If we've gotten this far, we'll be able to process this body site
        # (i.e. there are enough weeks).
        valid_body_sites.append(body_site_cat_value)

    # Hack to allow print-only mode.
    if command_handler is not print_commands and not valid_body_sites:
        raise ValueError(
            "None of the body sites for personal ID '%s' could "
            "be processed because there were not enough weeks "
            "to create taxa summary plots." % personal_id
        )

    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    return files_to_remove, dirs_to_remove
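# _count_num_samples is called above but not defined in this module. A
# plausible implementation (an assumption): parse the BIOM table from the
# open file handle and count its sample IDs, using the same biom-format 1.x
# API (parse_biom_table, SampleIds) that appears elsewhere in this codebase.
def _example_count_num_samples(otu_table_f):
    return len(parse_biom_table(otu_table_f).SampleIds)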
def create_personal_results(output_dir, mapping_fp, coord_fp, collated_dir,
                            otu_table_fp, prefs_fp, personal_id_column,
                            personal_ids=None, column_title='Self',
                            individual_titles=None,
                            category_to_split='BodySite',
                            time_series_category='WeeksSinceStart',
                            rarefaction_depth=10000, alpha=0.05,
                            rep_set_fp=None, parameter_fp=None,
                            body_site_rarefied_otu_table_dir=None,
                            retain_raw_data=False,
                            suppress_alpha_rarefaction=False,
                            suppress_beta_diversity=False,
                            suppress_taxa_summary_plots=False,
                            suppress_alpha_diversity_boxplots=False,
                            suppress_otu_category_significance=False,
                            command_handler=call_commands_serially,
                            status_update_callback=no_status_updates):
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, 'support_files')
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), 'my_microbes', 'support_files'),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, 'U'))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
                         "column header." % category_to_split)
    header = header[:-1] + [column_title] + [header[-1]]

    # Column that differentiates between body sites within a single
    # individual. It is used to create the vectors in make_3d_plots.py, and
    # its data are created by concatenating the two columns when writing the
    # mapping file.
    site_id_category = '%s&&%s' % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError("'%s' is not a personal ID in the mapping "
                                 "file column '%s'." %
                                 (pid, personal_id_column))

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    otu_table_title = splitext(basename(otu_table_fp))

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                add_filename_suffix(otu_table_fp,
                                    '_even%d' % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = 'Rarefying OTU table'
            cmd = 'single_rarefaction.py -i %s -o %s -d %s' % (otu_table_fp,
                    rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, 'per_body_site_otu_tables')
            cmd_title = 'Splitting rarefied OTU table by body site'
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (
                    rarefied_otu_table_fp, mapping_fp, category_to_split,
                    per_body_site_dir)
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        'mapping_file.txt')
        html_fp = join(output_dir, person_of_interest, 'index.html')

        personal_mapping_data = create_personal_mapping_file(mapping_data,
                person_of_interest, personal_id_index, bodysite_index,
                individual_titles)

        personal_mapping_f = open(personal_mapping_file_fp, 'w')
        personal_mapping_f.write(
            format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index]
                                   for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ''
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     'adiv_boxplots')
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                collated_dir, personal_mapping_file_fp, category_to_split,
                column_title, rarefaction_depth, adiv_boxplots_dir)

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   'alpha_rarefaction')
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = 'Creating rarefaction plots (%s)' % person_of_interest
            cmd = 'make_rarefaction_plots.py -i %s -m %s -p %s -o %s' % (
                    collated_dir, personal_mapping_file_fp, prefs_fp,
                    rarefaction_dir)
            commands.append([(cmd_title, cmd)])

            raw_data_dirs.append(join(rarefaction_dir, 'average_plots'))
            raw_data_dirs.append(join(rarefaction_dir, 'average_tables'))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, 'beta_diversity')
            pcoa_time_series_dir = join(output_dir, person_of_interest,
                                        'beta_diversity_time_series')
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = 'Creating beta diversity time series plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=' % (
                    personal_mapping_file_fp, prefs_fp, coord_fp,
                    pcoa_time_series_dir) + \
                  '\'%s\' --add_vectors=\'%s,%s\'' % (time_series_category,
                          site_id_category, time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = 'Creating beta diversity plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s' % (
                    personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest,
                                  'time_series')
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            ## Split OTU table into self/other per-body-site tables
            commands = []
            cmd_title = 'Splitting OTU table into self/other (%s)' % \
                        person_of_interest
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (otu_table_fp,
                    personal_mapping_file_fp, column_title, area_plots_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            for column_title_value in column_title_values:
                biom_fp = join(area_plots_dir,
                        add_filename_suffix(otu_table_fp,
                                            '_%s' % column_title_value))
                column_title_map_fp = join(area_plots_dir,
                        'mapping_%s.txt' % column_title_value)
                raw_data_files.append(biom_fp)
                raw_data_files.append(column_title_map_fp)

                body_site_dir = join(area_plots_dir, column_title_value)

                commands = []
                cmd_title = 'Splitting "%s" OTU table by body site (%s)' % \
                            (column_title_value, person_of_interest)
                cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (biom_fp,
                        personal_mapping_file_fp, category_to_split,
                        body_site_dir)
                commands.append([(cmd_title, cmd)])
                raw_data_dirs.append(body_site_dir)

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

                commands = []
                for cat_value in cat_values:
                    body_site_otu_table_fp = join(body_site_dir,
                            add_filename_suffix(biom_fp, '_%s' % cat_value))

                    # We won't always get an OTU table if the mapping file
                    # category contains samples that aren't in the OTU table
                    # (e.g. the 'na' state for body site).
                    if exists(body_site_otu_table_fp):
                        plots = join(area_plots_dir, 'taxa_plots_%s_%s' % (
                                column_title_value, cat_value))

                        cmd_title = 'Creating taxa summary plots (%s)' % \
                                    person_of_interest
                        cmd = ('summarize_taxa_through_plots.py -i %s '
                               '-o %s -c %s -m %s -s' %
                               (body_site_otu_table_fp, plots,
                                time_series_category,
                                personal_mapping_file_fp))
                        if parameter_fp is not None:
                            cmd += ' -p %s' % parameter_fp
                        commands.append([(cmd_title, cmd)])

                        raw_data_files.append(join(plots, '*.biom'))
                        raw_data_files.append(join(plots, '*.txt'))

                        create_comparative_taxa_plots_html(cat_value,
                                join(area_plots_dir,
                                     '%s_comparative.html' % cat_value))

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ''
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   'otu_category_significance')
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need
            # to parse these later on.
            commands = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(per_body_site_dir,
                        add_filename_suffix(rarefied_otu_table_fp,
                                            '_%s' % cat_value))

                if exists(body_site_otu_table_fp):
                    otu_cat_output_fp = join(otu_cat_sig_dir,
                            'otu_cat_sig_%s.txt' % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' %
                                 (cat_value, person_of_interest))
                    cmd = ('otu_category_significance.py -i %s -m %s -c %s '
                           '-o %s' % (body_site_otu_table_fp,
                                      personal_mapping_file_fp, column_title,
                                      otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])
                    raw_data_files.append(otu_cat_output_fp)

                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                format_otu_category_significance_tables_as_html(
                    otu_cat_sig_output_fps, alpha, otu_cat_sig_dir,
                    individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [join(rel_otu_cat_sig_dir, html_filename)
                                    for html_filename
                                    in otu_cat_sig_html_filenames]

            otu_category_significance_html = \
                create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(person_of_interest, html_fp,
                alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
                otu_category_significance_html=otu_category_significance_html)

    logger.close()

    # Clean up the unnecessary raw data files and directories. glob will only
    # grab paths that exist.
    if not retain_raw_data:
        for raw_data_fp_glob in raw_data_files:
            remove_files(glob(raw_data_fp_glob))

        for raw_data_dir_glob in raw_data_dirs:
            for dir_to_remove in glob(raw_data_dir_glob):
                rmtree(dir_to_remove)

    return output_directories
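# The cleanup loops above are what the newer version of this workflow (see
# create_personal_results earlier in this section) factors out into
# clean_up_raw_data_files. A sketch consistent with both versions; globbing
# keeps this safe because glob only returns paths that exist.
def _example_clean_up_raw_data_files(raw_data_files, raw_data_dirs):
    for raw_data_fp_glob in raw_data_files:
        remove_files(glob(raw_data_fp_glob))

    for raw_data_dir_glob in raw_data_dirs:
        for dir_to_remove in glob(raw_data_dir_glob):
            rmtree(dir_to_remove)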
def generate_most_wanted_list(output_dir, otu_table_fp, rep_set_fp, gg_fp,
                              nt_fp, mapping_fp, mapping_category, top_n,
                              min_abundance, max_abundance, min_categories,
                              max_gg_similarity, e_value, word_size,
                              jobs_to_start, command_handler,
                              status_update_callback, force):
    try:
        makedirs(output_dir)
    except OSError:
        if not force:
            raise WorkflowError("Output directory '%s' already exists. Please "
                                "choose a different directory, or force "
                                "overwrite with -f." % output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))
    commands = []

    # First filter to keep only new (non-GG) OTUs.
    novel_otu_table_fp = join(output_dir,
                              add_filename_suffix(otu_table_fp, '_novel'))
    commands.append([('Filtering out all GG reference OTUs',
            'filter_otus_from_otu_table.py -i %s -o %s -e %s' %
            (otu_table_fp, novel_otu_table_fp, gg_fp))])

    # Next filter to keep only abundant otus in the specified range (looking
    # only at extremely abundant OTUs has the problem of yielding too many
    # that are similar to stuff in the nt database).
    novel_abund_otu_table_fp = join(output_dir,
            add_filename_suffix(novel_otu_table_fp,
                                '_min%d_max%d' % (min_abundance,
                                                  max_abundance)))
    commands.append([('Filtering out all OTUs that do not fall within the '
            'specified abundance threshold',
            'filter_otus_from_otu_table.py -i %s -o %s -n %d -x %d' %
            (novel_otu_table_fp, novel_abund_otu_table_fp, min_abundance,
             max_abundance))])

    # Next, collapse by mapping_category.
    otu_table_by_samp_type_fp = join(output_dir,
            add_filename_suffix(novel_abund_otu_table_fp,
                                '_%s' % mapping_category))
    commands.append([('Collapsing OTU table by %s' % mapping_category,
            'summarize_otu_by_cat.py -c %s -o %s -m %s -i %s' %
            (novel_abund_otu_table_fp, otu_table_by_samp_type_fp,
             mapping_category, mapping_fp))])

    # Filter to contain only otus in the specified minimum number of sample
    # types.
    otu_table_by_samp_type_ms_fp = join(output_dir, add_filename_suffix(
            otu_table_by_samp_type_fp, '_ms%d' % min_categories))
    commands.append([('Filtering OTU table to include only OTUs that appear '
            'in at least %d sample groups' % min_categories,
            'filter_otus_from_otu_table.py -i %s -o %s -s %d' %
            (otu_table_by_samp_type_fp, otu_table_by_samp_type_ms_fp,
             min_categories))])

    # Now that we have a filtered down OTU table of good candidate OTUs,
    # filter the corresponding representative set to include only these
    # candidate sequences.
    candidate_rep_set_fp = join(output_dir, add_filename_suffix(
            rep_set_fp, '_most_wanted_candidates'))
    commands.append([('Filtering representative set to include only the '
            'latest candidate OTUs',
            'filter_fasta.py -f %s -o %s -b %s' %
            (rep_set_fp, candidate_rep_set_fp, otu_table_by_samp_type_ms_fp))])

    # Find the otus that don't hit GG at a certain maximum similarity
    # threshold.
    uclust_output_dir = join(output_dir, 'most_wanted_candidates_%s_%s' %
                             (basename(gg_fp), str(max_gg_similarity)))
    commands.append([('Running uclust to get list of sequences that don\'t '
            'hit the maximum GG similarity threshold',
            'parallel_pick_otus_uclust_ref.py -i %s -o %s -r %s -s %s -O %d' %
            (candidate_rep_set_fp, uclust_output_dir, gg_fp,
             str(max_gg_similarity), jobs_to_start))])

    # Filter the candidate sequences to only include the failures from uclust.
    cand_gg_dis_rep_set_fp = join(output_dir,
            add_filename_suffix(candidate_rep_set_fp, '_failures'))
    commands.append([('Filtering candidate sequences to only include uclust '
            'failures',
            'filter_fasta.py -f %s -s %s -o %s' %
            (candidate_rep_set_fp,
             join(uclust_output_dir,
                  splitext(basename(candidate_rep_set_fp))[0] +
                  '_failures.txt'),
             cand_gg_dis_rep_set_fp))])

    # BLAST the failures against nt.
    blast_output_dir = join(output_dir, 'blast_output')
    commands.append([('BLASTing candidate sequences against nt database',
            'parallel_blast.py -i %s -o %s -r %s -D -e %f -w %d -O %d' %
            (cand_gg_dis_rep_set_fp, blast_output_dir, nt_fp, e_value,
             word_size, jobs_to_start))])

    # Execute the commands we have so far, but keep the logger open because
    # we're going to write additional status updates as we process the data.
    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    # We'll sort the BLAST results by percent identity (ascending) and pick
    # the top n.
    logger.write("Reading in BLAST results, sorting by percent identity, "
                 "and picking the top %d OTUs.\n\n" % top_n)
    blast_results = open(join(blast_output_dir,
            splitext(basename(cand_gg_dis_rep_set_fp))[0] +
            '_blast_out.txt'), 'U')
    top_n_mw = []
    for line in blast_results:
        # Skip headers.
        line = line.strip()
        if line and not line.startswith('#'):
            line = line.split('\t')
            top_n_mw.append((line[0], line[1], float(line[2])))
    top_n_mw = sorted(top_n_mw, key=itemgetter(2))[:top_n]

    # Read in our filtered down candidate seqs file and latest filtered and
    # collapsed OTU table. We'll need to compute some stats on these to
    # include in our report.
    logger.write("Reading in candidate sequences and latest filtered and "
                 "collapsed OTU table.\n\n")
    mw_seqs = {}
    for seq_id, seq in MinimalFastaParser(open(cand_gg_dis_rep_set_fp, 'U')):
        seq_id = seq_id.strip().split()[0]
        mw_seqs[seq_id] = seq
    otu_table_by_samp_type_ms = parse_biom_table(
        open(otu_table_by_samp_type_ms_fp, 'U'))

    # Write results out to tsv and HTML table.
    logger.write("Writing most wanted OTUs results to TSV and HTML "
                 "tables.\n\n")
    mw_tsv_f = open(join(output_dir,
                         'top_%d_most_wanted_otus.txt' % top_n), 'w')
    mw_html_f = open(join(output_dir,
                          'top_%d_most_wanted_otus.html' % top_n), 'w')

    tsv_header = 'OTU ID\tSequence\tGreengenes taxonomy\t' + \
                 'NCBI nt closest match\tNCBI nt % identity'
    mw_tsv_f.write(tsv_header + '\n')

    tsv_header += '\tAbundance by %s' % mapping_category
    html_header = ''
    for col in tsv_header.split('\t'):
        html_header += '<th>%s</th>' % col
    mw_html_f.write('<table><tr>' + html_header + '</tr>')

    for otu_id, subject_id, percent_identity in top_n_mw:
        # Grab all necessary information to be included in our report.
        seq = mw_seqs[otu_id]
        tax = otu_table_by_samp_type_ms.ObservationMetadata[
            otu_table_by_samp_type_ms.getObservationIndex(otu_id)]['taxonomy']
        gb_id = subject_id.split('|')[3]
        ncbi_link = 'http://www.ncbi.nlm.nih.gov/nuccore/%s' % gb_id

        # Compute the abundance of each most wanted OTU in each sample
        # grouping and create a pie chart to go in the HTML table.
        samp_types = otu_table_by_samp_type_ms.SampleIds
        counts = otu_table_by_samp_type_ms.observationData(otu_id)
        if len(counts) != len(samp_types):
            raise WorkflowError("The number of observation counts does not "
                                "match the number of samples in the OTU "
                                "table.")

        # Piechart code modified from matplotlib example:
        # http://matplotlib.sourceforge.net/examples/pylab_examples/
        # pie_demo.html
        figure(figsize=(6, 6))
        ax = axes([0.1, 0.1, 0.8, 0.8])

        # Will auto-normalize the counts.
        pie(counts, labels=samp_types, autopct='%1.1f%%', shadow=True)

        output_img_dir = join(output_dir, 'img')
        try:
            makedirs(output_img_dir)
        except OSError:
            # It already exists, which is okay since we already know we are
            # in 'force' mode from above.
            pass

        # We need a relative path to the image.
        pie_chart_fp = join('img', 'abundance_by_%s_%s.png' %
                            (mapping_category, otu_id))
        savefig(join(output_dir, pie_chart_fp))

        mw_tsv_f.write('%s\t%s\t%s\t%s\t%s\n' %
                       (otu_id, seq, tax, gb_id, percent_identity))
        mw_html_f.write('<tr><td>%s</td><td>%s</td><td>%s</td>'
                        '<td><a href="%s" target="_blank">%s</a></td>'
                        '<td>%s</td><td><img src="%s" /></td></tr>' %
                        (otu_id, seq, tax, ncbi_link, gb_id,
                         percent_identity, pie_chart_fp))

    mw_html_f.write('</table>')
    mw_tsv_f.close()
    mw_html_f.close()
    logger.close()
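# A minimal usage sketch (hypothetical paths and thresholds), wired to the
# serial command handler and no-op status callback used elsewhere in this
# codebase.
def _example_generate_most_wanted_list():
    generate_most_wanted_list(
        output_dir='most_wanted_out',
        otu_table_fp='otu_table.biom',
        rep_set_fp='rep_set.fna',
        gg_fp='gg_97_otus.fasta',   # Greengenes reference sequences
        nt_fp='nt',                 # BLAST nt database
        mapping_fp='mapping.txt',
        mapping_category='SampleType',
        top_n=100,
        min_abundance=10,
        max_abundance=10000,
        min_categories=2,
        max_gg_similarity=0.70,
        e_value=1e-10,
        word_size=28,
        jobs_to_start=4,
        command_handler=call_commands_serially,
        status_update_callback=no_status_updates,
        force=False)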
def _get_most_wanted_filtering_commands(output_dir, otu_table_fps, rep_set_fp,
                                        gg_fp, nt_fp, mapping_fp,
                                        mapping_category, min_abundance,
                                        max_abundance, min_categories,
                                        max_gg_similarity, e_value, word_size,
                                        merged_otu_table_fp, jobs_to_start):
    commands = []
    otu_tables_to_merge = []

    if merged_otu_table_fp is None:
        for otu_table_fp in otu_table_fps:
            # First filter to keep only new (non-GG) OTUs.
            novel_otu_table_fp = join(
                output_dir, add_filename_suffix(otu_table_fp, '_novel'))
            commands.append([
                ('Filtering out all GG reference OTUs',
                 'filter_otus_from_otu_table.py -i %s -o %s -e %s' %
                 (otu_table_fp, novel_otu_table_fp, gg_fp))
            ])

            # Next filter to keep only abundant otus in the specified range
            # (looking only at extremely abundant OTUs has the problem of
            # yielding too many that are similar to stuff in the nt database).
            novel_abund_otu_table_fp = join(
                output_dir,
                add_filename_suffix(
                    novel_otu_table_fp,
                    '_min%d_max%d' % (min_abundance, max_abundance)))
            commands.append([
                ('Filtering out all OTUs that do not fall within the '
                 'specified abundance threshold',
                 'filter_otus_from_otu_table.py -i %s -o %s -n %d -x %d' %
                 (novel_otu_table_fp, novel_abund_otu_table_fp,
                  min_abundance, max_abundance))
            ])

            # Remove samples from the table that aren't in the mapping file.
            novel_abund_filtered_otu_table_fp = join(
                output_dir,
                add_filename_suffix(novel_abund_otu_table_fp,
                                    '_known_samples'))
            commands.append([
                ('Filtering out samples that are not in the mapping file',
                 'filter_samples_from_otu_table.py -i %s -o %s '
                 '--sample_id_fp %s' %
                 (novel_abund_otu_table_fp,
                  novel_abund_filtered_otu_table_fp, mapping_fp))
            ])

            # Next, collapse by mapping_category.
            otu_table_by_samp_type_fp = join(
                output_dir,
                add_filename_suffix(novel_abund_filtered_otu_table_fp,
                                    '_%s' % mapping_category))
            commands.append([
                ('Collapsing OTU table by %s' % mapping_category,
                 'summarize_otu_by_cat.py -c %s -o %s -m %s -i %s' %
                 (novel_abund_filtered_otu_table_fp,
                  otu_table_by_samp_type_fp, mapping_category, mapping_fp))
            ])

            otu_tables_to_merge.append(otu_table_by_samp_type_fp)

        # Merge all collapsed OTU tables.
        master_otu_table_fp = join(
            output_dir, 'master_otu_table_novel_min%d_max%d_%s.biom' %
            (min_abundance, max_abundance, mapping_category))
        commands.append([
            ('Merging collapsed OTU tables',
             'merge_otu_tables.py -i %s -o %s' %
             (','.join(otu_tables_to_merge), master_otu_table_fp))
        ])
    else:
        master_otu_table_fp = merged_otu_table_fp

    # Filter to contain only otus in the specified minimum number of sample
    # types.
    master_otu_table_ms_fp = join(
        output_dir,
        add_filename_suffix(master_otu_table_fp, '_ms%d' % min_categories))
    commands.append([
        ('Filtering OTU table to include only OTUs that appear in at least '
         '%d sample groups' % min_categories,
         'filter_otus_from_otu_table.py -i %s -o %s -s %d' %
         (master_otu_table_fp, master_otu_table_ms_fp, min_categories))
    ])

    # Now that we have a filtered down OTU table of good candidate OTUs,
    # filter the corresponding representative set to include only these
    # candidate sequences.
    rep_set_cands_fp = join(output_dir,
                            add_filename_suffix(rep_set_fp, '_candidates'))
    commands.append([
        ('Filtering representative set to include only the latest candidate '
         'OTUs',
         'filter_fasta.py -f %s -o %s -b %s' %
         (rep_set_fp, rep_set_cands_fp, master_otu_table_ms_fp))
    ])

    # Find the otus that don't hit GG at a certain maximum similarity
    # threshold.
    uclust_output_dir = join(
        output_dir, 'most_wanted_candidates_%s_%s' % (basename(gg_fp),
                                                      str(max_gg_similarity)))
    commands.append([
        ('Running uclust to get list of sequences that don\'t hit the '
         'maximum GG similarity threshold',
         'parallel_pick_otus_uclust_ref.py -i %s -o %s -r %s -s %s -O %d' %
         (rep_set_cands_fp, uclust_output_dir, gg_fp, str(max_gg_similarity),
          jobs_to_start))
    ])

    # Filter the rep set to only include the failures from uclust.
    rep_set_cands_failures_fp = join(
        output_dir, add_filename_suffix(rep_set_cands_fp, '_failures'))
    commands.append([
        ('Filtering candidate sequences to only include uclust failures',
         'filter_fasta.py -f %s -s %s -o %s' %
         (rep_set_cands_fp,
          join(uclust_output_dir,
               splitext(basename(rep_set_cands_fp))[0] + '_failures.txt'),
          rep_set_cands_failures_fp))
    ])

    # BLAST the failures against nt.
    blast_output_dir = join(output_dir, 'blast_output')
    commands.append([
        ('BLASTing filtered candidate sequences against nt database',
         'parallel_blast.py -i %s -o %s -r %s -D -e %f -w %d -O %d' %
         (rep_set_cands_failures_fp, blast_output_dir, nt_fp, e_value,
          word_size, jobs_to_start))
    ])

    blast_results_fp = join(
        blast_output_dir,
        splitext(basename(rep_set_cands_failures_fp))[0] + '_blast_out.txt')

    return (commands, blast_results_fp, rep_set_cands_failures_fp,
            master_otu_table_ms_fp)
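# A wiring sketch (hypothetical inputs): the returned commands are run through
# a command handler, and the returned paths locate the BLAST results, the
# uclust-failure rep set, and the final filtered/collapsed OTU table for
# downstream reporting (compare generate_most_wanted_list above).
def _example_run_most_wanted_filtering(command_handler, logger,
                                       status_update_callback):
    commands, blast_results_fp, rep_set_cands_failures_fp, \
        master_otu_table_ms_fp = _get_most_wanted_filtering_commands(
            'most_wanted_out', ['otu_table.biom'], 'rep_set.fna',
            'gg_97_otus.fasta', 'nt', 'mapping.txt', 'SampleType',
            10, 10000, 2, 0.70, 1e-10, 28, None, 4)

    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    return (blast_results_fp, rep_set_cands_failures_fp,
            master_otu_table_ms_fp)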