def run_make_otu_heatmap_html(otu_table_fp,mapping_fp,output_dir, params, qiime_config, command_handler,tree_fp, status_update_callback=print_to_stdout):
    """Generate an interactive OTU heatmap by shelling out to make_otu_heatmap_html.py.

    Builds the command line from the inputs plus any user-supplied
    'make_otu_heatmap_html' parameters, then delegates execution to
    command_handler. Always returns True.
    """
    interpreter = qiime_config['python_exe_fp']
    scripts_dir = get_qiime_scripts_dir()
    run_log = WorkflowLogger(generate_log_fp(output_dir),
                             params=params,
                             qiime_config=qiime_config)

    # Pull optional user-defined parameters for this script, if any were given.
    try:
        extra_args = get_params_str(params['make_otu_heatmap_html'])
    except KeyError:
        extra_args = ''

    # Assemble the full shell command for the heatmap script.
    heatmap_cmd = '%s %s/make_otu_heatmap_html.py -i %s -m %s -t %s -o %s %s' %\
        (interpreter, scripts_dir, otu_table_fp, mapping_fp, tree_fp,
         output_dir, extra_args)

    # Hand the single-command batch off to the command handler.
    command_handler([[('OTU Heatmap', heatmap_cmd)]],
                    status_update_callback,
                    run_log)
    return True
def rarefy_otu_table(data_access, otu_table_fname, otu_table_file_dir, otu_table_file_dir_db, otutable_rarefied_at, meta_id, otu_table_filepath, otu_table_filepath_db, zip_fpath): """ Rarefy the OTU table is specified by user """ otu_table_basename, otu_table_ext = os.path.splitext(otu_table_fname) python_exe_fp = qiime_config['python_exe_fp'] commands=[] command_handler=call_commands_serially status_update_callback=no_status_updates logger = WorkflowLogger(generate_log_fp('/tmp/'), params=dict(''), qiime_config=qiime_config) # get the date to put in the db run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S") # Sample the OTU table at even depth new_fname='%s_even%d%s' % (otu_table_basename, otutable_rarefied_at, otu_table_ext) even_sampled_otu_table_fp = os.path.join(otu_table_file_dir, new_fname) single_rarefaction_cmd = \ '%s %s/single_rarefaction.py -i %s -o %s -d %d' % \ (python_exe_fp, script_dir, otu_table_filepath, even_sampled_otu_table_fp, otutable_rarefied_at) commands.append([('Sample OTU table at %d seqs/sample' % \ otutable_rarefied_at, single_rarefaction_cmd)]) otu_table_filepath=even_sampled_otu_table_fp otu_table_filepath_db=os.path.join(otu_table_file_dir_db, new_fname) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger) # Insert the rarefied OTU table filepath to the DB valid=data_access.addMetaAnalysisFiles(True, int(meta_id), otu_table_filepath_db, 'OTUTABLE', run_date, 'OTU_TABLE') if not valid: raise ValueError, 'There was an issue uploading the filepaths to the DB!' # zip the rarefied OTU table cmd_call='cd %s; zip %s %s' % (otu_table_file_dir, zip_fpath, otu_table_filepath.split('/')[-1]) system(cmd_call) return
def _start_logging(self, params, args, argv, logger): if logger == None: self.logger = WorkflowLogger(generate_log_fp(params['master_script_log_dir']), params={}, qiime_config=qiime_config) close_logger_on_success = True else: self.logger = logger close_logger_on_success = False self.logger.write('Command:\n') self.logger.write(' '.join(argv)) self.logger.write('\n\n') log_input_md5s(self.logger, [params[p] for p in self._input_file_parameter_ids]) return close_logger_on_success
def pick_nested_reference_otus(input_fasta_fp, input_tree_fp, output_dir, run_id, similarity_thresholds, command_handler, status_update_callback=print_to_stdout):
    """Build nested reference OTU collections at decreasing similarity thresholds.

    For each threshold (processed from highest to lowest), picks OTUs from the
    previous level's representative sequences, renames the representatives so
    OTU ids are reference sequence ids, and (optionally) filters the input
    tree down to the surviving representatives.

    similarity_thresholds is sorted in place (descending) as a side effect.
    """
    # Prepare some variables for the later steps
    create_dir(output_dir)
    otu_dir = join(output_dir,'otus')
    create_dir(otu_dir)
    rep_set_dir = join(output_dir,'rep_set')
    create_dir(rep_set_dir)
    # currently not doing anything with taxonomies
    if input_tree_fp:
        tree_dir = join(output_dir,'trees')
        create_dir(tree_dir)
    commands = []
    files_to_remove = []
    logger = WorkflowLogger(generate_log_fp(output_dir))

    # Process thresholds from most to least similar so each level nests
    # inside the previous one.
    similarity_thresholds.sort()
    similarity_thresholds.reverse()

    current_inseqs_fp = input_fasta_fp
    current_tree_fp = input_tree_fp
    for similarity_threshold in similarity_thresholds:
        current_inseqs_basename = splitext(split(current_inseqs_fp)[1])[0]

        # pick otus command
        otu_fp = '%s/%d_otu_map.txt' % (otu_dir,similarity_threshold)
        clusters_fp = '%s/%d_clusters.uc' % (otu_dir,similarity_threshold)
        temp_otu_fp = '%s/%s_otus.txt' % (otu_dir, current_inseqs_basename)
        temp_log_fp = '%s/%s_otus.log' % (otu_dir, current_inseqs_basename)
        temp_clusters_fp = '%s/%s_clusters.uc' % (otu_dir, current_inseqs_basename)
        # BUG FIX: similarity_threshold/100 is integer floor division under
        # Python 2 (e.g. 97/100 == 0), which made '-s %1.2f' always emit
        # '-s 0.00'. Divide by 100.0 to get the intended fraction.
        pick_otus_cmd = \
         'pick_otus.py -m uclust -DBz -i %s -s %1.2f -o %s' % (
           current_inseqs_fp,
           similarity_threshold/100.0,
           otu_dir)
        commands.append([('Pick OTUs (%d)' % similarity_threshold,
                          pick_otus_cmd)])
        commands.append([('Rename OTU file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_otu_fp,otu_fp))])
        commands.append([('Rename uc file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_clusters_fp,clusters_fp))])
        files_to_remove.append(temp_log_fp)

        # rep set picking
        temp_rep_set_fp = get_tmp_filename(prefix='NestedReference',
                                           suffix='.fasta')
        pick_rep_set_cmd = \
         'pick_rep_set.py -m first -i %s -o %s -f %s' % (
           otu_fp,
           temp_rep_set_fp,
           current_inseqs_fp)
        commands.append([('Pick Rep Set (%d)' % similarity_threshold,
                          pick_rep_set_cmd)])
        command_handler(commands, status_update_callback, logger,
                        close_logger_on_success=False)
        commands = []

        # rename representative sequences so OTU ids are reference seq ids
        rep_set_fp = '%s/%d_otus_%s.fasta' % (
          rep_set_dir,
          similarity_threshold,
          run_id)
        logger.write('Renaming OTU representative sequences so OTU ids are reference sequence ids.')
        rep_set_f = open(rep_set_fp,'w')
        for e in rename_rep_seqs(open(temp_rep_set_fp,'U')):
            rep_set_f.write('>%s\n%s\n' % e)
        rep_set_f.close()
        files_to_remove.append(temp_rep_set_fp)

        # filter the tree, if provided
        if current_tree_fp != None:
            tree_fp = '%s/%d_otus_%s.tre' % (
              tree_dir,
              similarity_threshold,
              run_id)
            tree_cmd = 'filter_tree.py -i %s -f %s -o %s' %\
             (current_tree_fp,rep_set_fp,tree_fp)
            commands.append([('Filter tree (%d)' % similarity_threshold,tree_cmd)])
            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
            # prep for the next iteration
            current_tree_fp = tree_fp

        # prep for the next iteration: this level's rep set is the next
        # level's input sequences
        remove_files(files_to_remove)
        commands = []
        files_to_remove = []
        current_inseqs_fp = rep_set_fp

    logger.close()
# Module-level configuration and command-line interface description for the
# script-interface test runner.
qiime_config = load_qiime_config()

script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
script_info['script_usage'] = []
script_info['script_usage'].append(("Run a subset of the interface tests in verbose mode","Run interface tests for the add_taxa.py and make_otu_table.py scripts. This illustrates how to run from the qiime_test_dir directory.","%prog -i $PWD/ -l $HOME/qime_script_tests.log -t add_taxa,make_otu_table -v"))
script_info['script_usage'].append(("Run all of the interface tests","Run all script interface tests. This illustrates how to run from the qiime_test_dir directory.","%prog -i $PWD/ -l $HOME/all_qime_script_tests.log"))
script_info['output_description']= ""
script_info['required_options'] = []

# Default failure-log path: a timestamped file in the QIIME temp directory.
log_fp_prefix = 'script_test_log'
log_fp_suffix = 'txt'
default_log_fp = generate_log_fp(get_qiime_temp_dir(), basefile_name=log_fp_prefix, suffix=log_fp_suffix, timestamp_pattern='%Y%m%d%H%M%S')
# Human-readable form of the default shown in --help (TIMESTAMP placeholder
# instead of the concrete timestamp baked into default_log_fp).
default_log_fp_help_str = join(get_qiime_temp_dir(), '%s_TIMESTAMP.%s' % (log_fp_prefix,log_fp_suffix))

# NOTE(review): the optional_options list below is not closed in this chunk;
# the closing ']' presumably follows outside this view — confirm against the
# full file.
script_info['optional_options'] = [\
 make_option('-t','--tests', help='comma-separated list of the tests to run [default: all]'),
 make_option('-w','--working_dir',default=get_qiime_temp_dir(), help='directory where the tests should be run [default: %default]', type='existing_dirpath'),
 make_option('-q','--qiime_scripts_dir',default=qiime_config['qiime_scripts_dir'], help='directory containing scripts to test [default: %default]', type='existing_dirpath'),
 make_option('-l','--failure_log_fp',type="new_filepath",default=default_log_fp, help='log file to store record of failures [default: %s]' % default_log_fp_help_str)
def run_core_diversity_analyses( biom_fp, mapping_fp, sampling_depth, output_dir, qiime_config, command_handler=call_commands_serially, tree_fp=None, params=None, categories=None, arare_min_rare_depth=10, arare_num_steps=10, parallel=False, status_update_callback=print_to_stdout): """ """ if categories != None: # Validate categories provided by the users mapping_data, mapping_comments = \ parse_mapping_file_to_dict(open(mapping_fp,'U')) metadata_map = MetadataMap(mapping_data, mapping_comments) for c in categories: if c not in metadata_map.CategoryNames: raise ValueError, ("Category '%s' is not a column header " "in your mapping file. " "Categories are case and white space sensitive. Valid " "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames))) if metadata_map.hasSingleCategoryValue(c): raise ValueError, ("Category '%s' contains only one value. " "Categories analyzed here require at least two values." % c) else: categories= [] # prep some variables if params == None: params = parse_qiime_parameters([]) create_dir(output_dir) index_fp = '%s/index.html' % output_dir index_links = [] commands = [] python_exe_fp = qiime_config['python_exe_fp'] script_dir = get_qiime_scripts_dir() # begin logging log_fp = generate_log_fp(output_dir) index_links.append(('Master run log',log_fp,'Log files')) logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config) input_fps = [biom_fp,mapping_fp] if tree_fp != None: input_fps.append(tree_fp) log_input_md5s(logger,input_fps) bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth) even_dm_fps = run_beta_diversity_through_plots( otu_table_fp=biom_fp, mapping_fp=mapping_fp, output_dir=bdiv_even_output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, sampling_depth=sampling_depth, # force suppression of distance histograms - boxplots work better # in this context, and are created below. 
histogram_categories=[], tree_fp=tree_fp, parallel=parallel, logger=logger, status_update_callback=status_update_callback) for bdiv_metric, dm_fp in even_dm_fps: for category in categories: boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric) try: params_str = get_params_str(params['make_distance_boxplots']) except KeyError: params_str = '' boxplots_cmd = \ 'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\ (dm_fp, category, boxplots_output_dir, mapping_fp, params_str) commands.append([('Boxplots (%s)' % category, boxplots_cmd)]) index_links.append(('Distance boxplots (%s)' % bdiv_metric, '%s/%s_Distances.pdf' % \ (boxplots_output_dir,category), 'Beta diversity results (even sampling: %d)' % sampling_depth)) index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric, '%s/%s_Stats.txt' % \ (boxplots_output_dir,category), 'Beta diversity results (even sampling: %d)' % sampling_depth)) index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric, '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \ (bdiv_even_output_dir,bdiv_metric,bdiv_metric), 'Beta diversity results (even sampling: %d)' % sampling_depth)) index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric, '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \ (bdiv_even_output_dir,bdiv_metric,bdiv_metric), 'Beta diversity results (even sampling: %d)' % sampling_depth)) index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric, '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \ (bdiv_even_output_dir,bdiv_metric,bdiv_metric), 'Beta diversity results (even sampling: %d)' % sampling_depth)) index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric, '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \ (bdiv_even_output_dir,bdiv_metric,bdiv_metric), 'Beta diversity results (even sampling: %d)' % sampling_depth)) index_links.append(('Distance matrix (%s)' % bdiv_metric, '%s/%s_dm.txt' % \ (bdiv_even_output_dir,bdiv_metric), 'Beta 
diversity results (even sampling: %d)' % sampling_depth)) index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric, '%s/%s_pc.txt' % \ (bdiv_even_output_dir,bdiv_metric), 'Beta diversity results (even sampling: %d)' % sampling_depth)) ## Alpha rarefaction workflow arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth) run_qiime_alpha_rarefaction( otu_table_fp=biom_fp, mapping_fp=mapping_fp, output_dir=arare_full_output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, tree_fp=tree_fp, num_steps=arare_num_steps, parallel=parallel, logger=logger, min_rare_depth=arare_min_rare_depth, max_rare_depth=sampling_depth, status_update_callback=status_update_callback) index_links.append(('Alpha rarefaction plots', '%s/alpha_rarefaction_plots/rarefaction_plots.html'\ % arare_full_output_dir, "Alpha rarefaction results")) collated_alpha_diversity_fps = \ glob('%s/alpha_div_collated/*txt' % arare_full_output_dir) try: params_str = get_params_str(params['compare_alpha_diversity']) except KeyError: params_str = '' for c in categories: for collated_alpha_diversity_fp in collated_alpha_diversity_fps: alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0] alpha_comparison_output_fp = '%s/%s_%s.txt' % \ (arare_full_output_dir,c,alpha_metric) compare_alpha_cmd = \ 'compare_alpha_diversity.py -i %s -m %s -c %s -d %s -o %s -n 999 %s' %\ (collated_alpha_diversity_fp, mapping_fp, c, sampling_depth, alpha_comparison_output_fp, params_str) commands.append([('Compare alpha diversity (%s, %s)' %\ (category,alpha_metric), compare_alpha_cmd)]) index_links.append( ('Alpha diversity statistics (%s, %s)' % (category,alpha_metric), alpha_comparison_output_fp, "Alpha rarefaction results")) taxa_plots_output_dir = '%s/taxa_plots/' % output_dir run_summarize_taxa_through_plots( otu_table_fp=biom_fp, mapping_fp=mapping_fp, output_dir=taxa_plots_output_dir, mapping_cat=None, sort=True, command_handler=command_handler, params=params, 
qiime_config=qiime_config, logger=logger, status_update_callback=status_update_callback) index_links.append(('Taxa summary bar plots', '%s/taxa_summary_plots/bar_charts.html'\ % taxa_plots_output_dir, "Taxonomic summary results")) index_links.append(('Taxa summary area plots', '%s/taxa_summary_plots/area_charts.html'\ % taxa_plots_output_dir, "Taxonomic summary results")) for c in categories: taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,c) run_summarize_taxa_through_plots( otu_table_fp=biom_fp, mapping_fp=mapping_fp, output_dir=taxa_plots_output_dir, mapping_cat=c, sort=True, command_handler=command_handler, params=params, qiime_config=qiime_config, logger=logger, status_update_callback=status_update_callback) index_links.append(('Taxa summary bar plots', '%s/taxa_summary_plots/bar_charts.html'\ % taxa_plots_output_dir, "Taxonomic summary results (by %s)" % c)) index_links.append(('Taxa summary area plots', '%s/taxa_summary_plots/area_charts.html'\ % taxa_plots_output_dir, "Taxonomic summary results (by %s)" % c)) # OTU category significance for category in categories: category_signifance_fp = \ '%s/category_significance_%s.txt' % (output_dir, category) try: params_str = get_params_str(params['otu_category_significance']) except KeyError: params_str = '' # Build the OTU cateogry significance command category_significance_cmd = \ 'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\ (biom_fp, mapping_fp, category, category_signifance_fp, params_str) commands.append([('OTU category significance (%s)' % category, category_significance_cmd)]) index_links.append(('Category significance (%s)' % category, category_signifance_fp, "Category results")) command_handler(commands, status_update_callback, logger) generate_index_page(index_links,index_fp)
def pick_subsampled_open_referenence_otus(input_fp, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, run_tax_align_tree=True, prefilter_percent_id=0.60, min_otu_size=2, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Pick reference OTUs against refseqs_fp
          2) Subsample the failures to n sequences.
          3) Pick OTUs de novo on the n failures.
          4) Pick representative sequences for the resulting OTUs.
          5) Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

        Writes the merged OTU map, a singleton-filtered OTU map, a final
        rep set, a new reference collection (new_refseqs.fna) for use in
        subsequent iterations, and a BIOM OTU table (optionally with
        taxonomy and with PyNAST failures removed) under output_dir.
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        # No external logger supplied: own one and close it on success.
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(logger,[input_fp,refseqs_fp,step1_otu_map_fp,step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
       prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        # Caller supplied both step-1 outputs; reuse them (useful for
        # restarting failed runs).
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            # Low-percent-id closed-reference pass to discard sequences that
            # don't look like the reference at all before the real step 1.
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            # NOTE(review): prefilter_otu_map_fp is computed but never used below.
            prefilter_otu_map_fp = \
             '%s/%s_otus.txt' % (prefilter_dir,input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input', filter_fasta_cmd)])

            # The prefiltered sequences become the working input from here on.
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',step2_rep_set_cmd)])

    ## Step 3: closed-reference pick of ALL step-1 failures against the
    ## de novo rep set from step 2.
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
     step1_failures_fasta_fp,
     step3_dir,
     reference_otu_picking_method,
     step2_repset_fasta_fp,
     parallel,
     params,
     logger)
    commands.append([
     ('Pick reference OTUs using de novo rep set',step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        ## Step 4: de novo clustering of the step-3 failures ("clean up").
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
         (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',step4_rep_set_cmd)])
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
         (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp,output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,otu_no_singletons_fp,min_otu_size)

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
                   repset_fasta_fp=final_repset_fp,
                   output_dir=output_dir,
                   command_handler=command_handler,
                   params=params,
                   qiime_config=qiime_config,
                   parallel=parallel,
                   logger=logger,
                   status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
         (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table",add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        filtered_otu_table = filter_otus_from_otu_table(
              parse_biom_table(open(otu_table_w_tax_fp,'U')),
              get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
              0,inf,0,inf,negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp,'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    # Final (possibly empty) dispatch: closes the logger if this call owns it.
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def iterative_pick_subsampled_open_referenence_otus(
                              input_fps,
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              run_tax_align_tree=True,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_referenence_otus workflow on multiple inputs
        and handle processing of the results.

        Each input file is processed in its own numbered subdirectory of
        output_dir; each iteration's new_refseqs.fna becomes the reference
        collection for the next. After all iterations, the per-iteration OTU
        tables are merged, a master rep set is built, and (optionally) taxonomy
        assignment / alignment / tree building are run on the merged results.
    """
    create_dir(output_dir)
    commands = []
    if logger == None:
        # No external logger supplied: own one and close it at the end.
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp == None:
       prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_referenence_otus(
                             input_fp=input_fp,
                             refseqs_fp=refseqs_fp,
                             output_dir=iteration_output_dir,
                             percent_subsample=percent_subsample,
                             new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                             command_handler=command_handler,
                             params=params,
                             qiime_config=qiime_config,
                             run_tax_align_tree=False,
                             prefilter_refseqs_fp=prefilter_refseqs_fp,
                             prefilter_percent_id=prefilter_percent_id,
                             min_otu_size=min_otu_size,
                             step1_otu_map_fp=step1_otu_map_fp,
                             step1_failures_fasta_fp=step1_failures_fasta_fp,
                             parallel=parallel,
                             suppress_step4=suppress_step4,
                             logger=logger,
                             status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' % (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        final_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        if exists(final_otu_table_fp) and getsize(final_otu_table_fp) > 0:
            # NOTE(review): this message prints otu_table_fp rather than
            # final_otu_table_fp (the file actually checked) — confirm intent.
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp, final_otu_table_fp],
                         error_on_missing=False)

            taxonomy_fp, pynast_failures_fp = tax_align_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
             (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_taxa_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(otu_table_w_tax_fp, 'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                  0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(final_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
def assign_tax(repset_fasta_fp,
               output_dir,
               command_handler,
               params,
               qiime_config,
               parallel=False,
               logger=None,
               status_update_callback=print_to_stdout):
    """Assign taxonomy to a representative sequence set.

    Builds the (possibly parallel) taxonomy-assignment command for the
    method configured in params['assign_taxonomy']['assignment_method']
    (default 'rdp') and executes it via command_handler.

    Parameters
    ----------
    repset_fasta_fp : filepath to the representative set fasta file
    output_dir : directory in which the '<method>_assigned_taxonomy'
        output directory is created
    command_handler : callable used to execute the generated command
    params : dict of parsed QIIME parameters
    qiime_config : dict of QIIME configuration values
    parallel : if True and the method is 'rdp' or 'blast', use the
        parallel assignment script
    logger : optional WorkflowLogger; if None, one is created here and
        closed when the command handler succeeds
    status_update_callback : callable receiving status messages

    Returns
    -------
    Filepath where the taxonomy assignments will be written.
    """
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []

    # Create a logger only if the caller didn't supply one; in that case
    # we own it and ask the command handler to close it on success.
    # (Bug fix: was `logger == None`; identity comparison is correct here.)
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir, assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
     (assign_taxonomy_dir, input_basename)

    # Only the rdp and blast methods have parallel wrapper scripts.
    if parallel and assignment_method in ('rdp', 'blast'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (assignment_method, repset_fasta_fp, assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the (serial) taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
         (assign_taxonomy_dir, repset_fasta_fp, params_str)

    # Remove output from any previous (possibly partial) run so the
    # assignment script starts from a clean directory.
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)

    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return taxonomy_fp
def create_personal_results(output_dir, mapping_fp, coord_fp, collated_dir,
                            otu_table_fp, prefs_fp, personal_id_column,
                            personal_ids=None, column_title='Self',
                            individual_titles=None,
                            category_to_split='BodySite',
                            time_series_category='WeeksSinceStart',
                            rarefaction_depth=10000, alpha=0.05,
                            rep_set_fp=None, parameter_fp=None,
                            body_site_rarefied_otu_table_dir=None,
                            retain_raw_data=False,
                            suppress_alpha_rarefaction=False,
                            suppress_beta_diversity=False,
                            suppress_taxa_summary_plots=False,
                            suppress_alpha_diversity_boxplots=False,
                            suppress_otu_category_significance=False,
                            command_handler=call_commands_serially,
                            status_update_callback=no_status_updates):
    """Generate personalized microbiome result pages, one per personal ID.

    For each personal ID found in (or restricted by) the mapping file,
    creates a per-person output directory containing (depending on the
    suppress_* flags) alpha diversity boxplots, alpha rarefaction plots,
    beta diversity (time series) plots, self-vs-other taxa summary plots,
    OTU category significance tables, and an index.html tying them
    together. Commands are shelled out to QIIME scripts via
    command_handler.

    Returns the list of output directories that were created. Raises
    ValueError when required columns/IDs are missing from the mapping
    file. Intermediate ("raw") files and directories are deleted at the
    end unless retain_raw_data is True.
    """
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, 'support_files')
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), 'my_microbes', 'support_files'),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, 'U'))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
                         "column header." % category_to_split)

    # Insert the self/other column just before the last column
    # (conventionally the Description column in QIIME mapping files).
    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = '%s&&%s' % (personal_id_column, category_to_split)
    header.insert(len(header)-1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids == None:
        personal_ids = all_personal_ids
    else:
        # Validate that every requested ID actually occurs in the mapping.
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError("'%s' is not a personal ID in the mapping "
                                 "file column '%s'." %
                                 (pid, personal_id_column))

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    # NOTE(review): otu_table_title is assigned but never used in this
    # function — candidate for removal.
    otu_table_title = splitext(basename(otu_table_fp))

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                add_filename_suffix(otu_table_fp,
                                    '_even%d' % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = 'Rarefying OTU table'
            cmd = 'single_rarefaction.py -i %s -o %s -d %s' % (otu_table_fp,
                    rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, 'per_body_site_otu_tables')

            cmd_title = 'Splitting rarefied OTU table by body site'
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (
                    rarefied_otu_table_fp, mapping_fp, category_to_split,
                    per_body_site_dir)
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            # Caller supplied pre-rarefied, pre-split tables; reuse them.
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        'mapping_file.txt')
        html_fp = join(output_dir, person_of_interest, 'index.html')

        # Write a per-person mapping file with the self/other column filled in.
        personal_mapping_data = create_personal_mapping_file(mapping_data,
                person_of_interest, personal_id_index, bodysite_index,
                individual_titles)
        personal_mapping_f = open(personal_mapping_file_fp, 'w')
        personal_mapping_f.write(
                format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        raw_data_files.append(personal_mapping_file_fp)

        # Distinct values of the self/other column and of the body-site
        # category for this individual, used to drive the loops below.
        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index]
                                   for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ''
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     'adiv_boxplots')
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                    collated_dir, personal_mapping_file_fp,
                    category_to_split, column_title, rarefaction_depth,
                    adiv_boxplots_dir)

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                    create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   'alpha_rarefaction')
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = 'Creating rarefaction plots (%s)' % person_of_interest
            cmd = 'make_rarefaction_plots.py -i %s -m %s -p %s -o %s' % (
                    collated_dir, personal_mapping_file_fp, prefs_fp,
                    rarefaction_dir)
            commands.append([(cmd_title, cmd)])

            raw_data_dirs.append(join(rarefaction_dir, 'average_plots'))
            raw_data_dirs.append(join(rarefaction_dir, 'average_tables'))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, 'beta_diversity')
            pcoa_time_series_dir = join(output_dir, person_of_interest,
                                        'beta_diversity_time_series')
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = 'Creating beta diversity time series plots (%s)' % \
                        person_of_interest
            # --custom_axes / --add_vectors use the concatenated
            # personal-ID&&body-site category created above.
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=' % (
                    personal_mapping_file_fp, prefs_fp, coord_fp,
                    pcoa_time_series_dir) +\
                  '\'%s\' --add_vectors=\'%s,%s\'' % (time_series_category,
                    site_id_category, time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = 'Creating beta diversity plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s' % (
                    personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest,
                                  'time_series')
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            ## Split OTU table into self/other per-body-site tables
            commands = []
            cmd_title = 'Splitting OTU table into self/other (%s)' % \
                        person_of_interest
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (otu_table_fp,
                    personal_mapping_file_fp, column_title, area_plots_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            for column_title_value in column_title_values:
                # Paths produced by the self/other split above.
                biom_fp = join(area_plots_dir,
                        add_filename_suffix(otu_table_fp,
                                            '_%s' % column_title_value))
                column_title_map_fp = join(area_plots_dir,
                        'mapping_%s.txt' % column_title_value)
                raw_data_files.append(biom_fp)
                raw_data_files.append(column_title_map_fp)

                body_site_dir = join(area_plots_dir, column_title_value)

                commands = []
                cmd_title = 'Splitting "%s" OTU table by body site (%s)' % \
                            (column_title_value, person_of_interest)
                cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (biom_fp,
                        personal_mapping_file_fp, category_to_split,
                        body_site_dir)
                commands.append([(cmd_title, cmd)])
                raw_data_dirs.append(body_site_dir)

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

                commands = []
                for cat_value in cat_values:
                    body_site_otu_table_fp = join(body_site_dir,
                            add_filename_suffix(biom_fp, '_%s' % cat_value))

                    # We won't always get an OTU table if the mapping file
                    # category contains samples that aren't in the OTU table
                    # (e.g. the 'na' state for body site).
                    if exists(body_site_otu_table_fp):
                        plots = join(area_plots_dir, 'taxa_plots_%s_%s' % (
                                column_title_value, cat_value))

                        cmd_title = 'Creating taxa summary plots (%s)' % \
                                    person_of_interest
                        cmd = ('summarize_taxa_through_plots.py -i %s '
                               '-o %s -c %s -m %s -s' %
                               (body_site_otu_table_fp, plots,
                                time_series_category,
                                personal_mapping_file_fp))
                        if parameter_fp is not None:
                            cmd += ' -p %s' % parameter_fp
                        commands.append([(cmd_title, cmd)])

                        raw_data_files.append(join(plots, '*.biom'))
                        raw_data_files.append(join(plots, '*.txt'))

                        create_comparative_taxa_plots_html(cat_value,
                                join(area_plots_dir,
                                     '%s_comparative.html' % cat_value))

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ''
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   'otu_category_significance')
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(per_body_site_dir,
                        add_filename_suffix(rarefied_otu_table_fp,
                                            '_%s' % cat_value))

                if exists(body_site_otu_table_fp):
                    otu_cat_output_fp = join(otu_cat_sig_dir,
                            'otu_cat_sig_%s.txt' % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' % (
                                 cat_value, person_of_interest))
                    cmd = ('otu_category_significance.py -i %s -m %s -c %s '
                           '-o %s' % (body_site_otu_table_fp,
                                      personal_mapping_file_fp,
                                      column_title, otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])
                    raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                    format_otu_category_significance_tables_as_html(
                            otu_cat_sig_output_fps, alpha, otu_cat_sig_dir,
                            individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [join(rel_otu_cat_sig_dir, html_filename)
                    for html_filename in otu_cat_sig_html_filenames]

            otu_category_significance_html = \
                    create_otu_category_significance_html(
                            otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(person_of_interest, html_fp,
                alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
                otu_category_significance_html=otu_category_significance_html)

    logger.close()

    # Clean up the unnecessary raw data files and directories. glob will only
    # grab paths that exist.
    if not retain_raw_data:
        for raw_data_fp_glob in raw_data_files:
            remove_files(glob(raw_data_fp_glob))

        for raw_data_dir_glob in raw_data_dirs:
            for dir_to_remove in glob(raw_data_dir_glob):
                rmtree(dir_to_remove)

    return output_directories
def generate_most_wanted_list(output_dir, otu_table_fp, rep_set_fp, gg_fp,
                              nt_fp, mapping_fp, mapping_category, top_n,
                              min_abundance, max_abundance, min_categories,
                              max_gg_similarity, e_value, word_size,
                              jobs_to_start, command_handler,
                              status_update_callback, force):
    """Build a "most wanted OTUs" report (TSV + HTML with pie charts).

    Pipeline: filter out Greengenes reference OTUs, keep OTUs in the
    [min_abundance, max_abundance] range, collapse by mapping_category,
    keep OTUs present in >= min_categories groups, pull the matching
    rep-set sequences, keep only those that fail to hit Greengenes at
    max_gg_similarity (uclust), BLAST the failures against nt, then
    report the top_n hits with the LOWEST percent identity to nt (the
    most "novel" sequences), with a per-OTU abundance pie chart.

    Raises WorkflowError if output_dir exists (unless force) or if the
    collapsed OTU table is inconsistent. Writes all outputs under
    output_dir.
    """
    try:
        makedirs(output_dir)
    except OSError:
        if not force:
            raise WorkflowError("Output directory '%s' already exists. "
                    "Please choose a different directory, or force overwrite "
                    "with -f." % output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))
    commands = []

    # First filter to keep only new (non-GG) OTUs.
    novel_otu_table_fp = join(output_dir,
                              add_filename_suffix(otu_table_fp, '_novel'))
    commands.append([('Filtering out all GG reference OTUs',
            'filter_otus_from_otu_table.py -i %s -o %s -e %s' %
            (otu_table_fp, novel_otu_table_fp, gg_fp))])

    # Next filter to keep only abundant otus in the specified range (looking
    # only at extremely abundant OTUs has the problem of yielding too many
    # that are similar to stuff in the nt database).
    novel_abund_otu_table_fp = join(output_dir,
            add_filename_suffix(novel_otu_table_fp,
                                '_min%d_max%d' % (min_abundance,
                                                  max_abundance)))
    commands.append([('Filtering out all OTUs that do not fall within the '
            'specified abundance threshold',
            'filter_otus_from_otu_table.py -i %s -o %s -n %d -x %d' %
            (novel_otu_table_fp, novel_abund_otu_table_fp, min_abundance,
             max_abundance))])

    # Next, collapse by mapping_category.
    otu_table_by_samp_type_fp = join(output_dir,
            add_filename_suffix(novel_abund_otu_table_fp,
                                '_%s' % mapping_category))
    commands.append([('Collapsing OTU table by %s' % mapping_category,
            'summarize_otu_by_cat.py -c %s -o %s -m %s -i %s' %
            (novel_abund_otu_table_fp, otu_table_by_samp_type_fp,
             mapping_category, mapping_fp))])

    # Filter to contain only otus in the specified minimum number of sample
    # types.
    otu_table_by_samp_type_ms_fp = join(output_dir, add_filename_suffix(
            otu_table_by_samp_type_fp, '_ms%d' % min_categories))
    commands.append([('Filtering OTU table to include only OTUs that appear '
            'in at least %d sample groups' % min_categories,
            'filter_otus_from_otu_table.py -i %s -o %s -s %d' %
            (otu_table_by_samp_type_fp, otu_table_by_samp_type_ms_fp,
             min_categories))])

    # Now that we have a filtered down OTU table of good candidate OTUs, filter
    # the corresponding representative set to include only these candidate
    # sequences.
    candidate_rep_set_fp = join(output_dir, add_filename_suffix(
            rep_set_fp, '_most_wanted_candidates'))
    commands.append([('Filtering representative set to include only the '
            'latest candidate OTUs',
            'filter_fasta.py -f %s -o %s -b %s' %
            (rep_set_fp, candidate_rep_set_fp, otu_table_by_samp_type_ms_fp))])

    # Find the otus that don't hit GG at a certain maximum similarity
    # threshold.
    uclust_output_dir = join(output_dir, 'most_wanted_candidates_%s_%s' %
            (basename(gg_fp), str(max_gg_similarity)))
    commands.append([('Running uclust to get list of sequences that don\'t '
            'hit the maximum GG similarity threshold',
            'parallel_pick_otus_uclust_ref.py -i %s -o %s -r %s -s %s -O %d' %
            (candidate_rep_set_fp, uclust_output_dir, gg_fp,
             str(max_gg_similarity), jobs_to_start))])

    # Filter the candidate sequences to only include the failures from uclust.
    cand_gg_dis_rep_set_fp = join(output_dir,
            add_filename_suffix(candidate_rep_set_fp, '_failures'))
    commands.append([('Filtering candidate sequences to only include uclust '
            'failures',
            'filter_fasta.py -f %s -s %s -o %s' %
            (candidate_rep_set_fp, join(uclust_output_dir,
             splitext(basename(candidate_rep_set_fp))[0] + '_failures.txt'),
             cand_gg_dis_rep_set_fp))])

    # BLAST the failures against nt.
    blast_output_dir = join(output_dir, 'blast_output')
    commands.append([('BLASTing candidate sequences against nt database',
            'parallel_blast.py -i %s -o %s -r %s -D -e %f -w %d -O %d' %
            (cand_gg_dis_rep_set_fp, blast_output_dir, nt_fp, e_value,
             word_size, jobs_to_start))])

    # Execute the commands we have so far, but keep the logger open because
    # we're going to write additional status updates as we process the data.
    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    # We'll sort the BLAST results by percent identity (ascending) and pick the
    # top n.
    logger.write("Reading in BLAST results, sorting by percent identity, "
                 "and picking the top %d OTUs.\n\n" % top_n)

    # NOTE(review): this file handle is never explicitly closed.
    blast_results = open(join(blast_output_dir,
            splitext(basename(cand_gg_dis_rep_set_fp))[0] +
            '_blast_out.txt'), 'U')
    top_n_mw = []
    for line in blast_results:
        # Skip headers.
        line = line.strip()
        if line and not line.startswith('#'):
            # Tabular BLAST: col 0 = query id, col 1 = subject id,
            # col 2 = percent identity.
            line = line.split('\t')
            top_n_mw.append((line[0], line[1], float(line[2])))
    # Ascending percent identity, i.e. the worst nt matches come first.
    top_n_mw = sorted(top_n_mw, key=itemgetter(2))[:top_n]

    # Read in our filtered down candidate seqs file and latest filtered and
    # collapsed OTU table. We'll need to compute some stats on these to include
    # in our report.
    logger.write("Reading in candidate sequences and latest filtered and "
                 "collapsed OTU table.\n\n")
    mw_seqs = {}
    for seq_id, seq in MinimalFastaParser(open(cand_gg_dis_rep_set_fp, 'U')):
        # Keep only the first whitespace-delimited token as the sequence ID.
        seq_id = seq_id.strip().split()[0]
        mw_seqs[seq_id] = seq
    otu_table_by_samp_type_ms = parse_biom_table(
            open(otu_table_by_samp_type_ms_fp, 'U'))

    # Write results out to tsv and HTML table.
    logger.write("Writing most wanted OTUs results to TSV and HTML "
                 "tables.\n\n")
    mw_tsv_f = open(join(output_dir,
                         'top_%d_most_wanted_otus.txt' % top_n), 'w')
    mw_html_f = open(join(output_dir,
                          'top_%d_most_wanted_otus.html' % top_n), 'w')

    tsv_header = 'OTU ID\tSequence\tGreengenes taxonomy\t' + \
                 'NCBI nt closest match\tNCBI nt % identity'
    mw_tsv_f.write(tsv_header + '\n')

    # The HTML table gets one extra column (the abundance pie chart) that
    # the TSV deliberately omits.
    tsv_header += '\tAbundance by %s' % mapping_category
    html_header = ''
    for col in tsv_header.split('\t'):
        html_header += '<th>%s</th>' % col
    mw_html_f.write('<table><tr>' + html_header + '</tr>')

    for otu_id, subject_id, percent_identity in top_n_mw:
        # Grab all necessary information to be included in our report.
        seq = mw_seqs[otu_id]
        tax = otu_table_by_samp_type_ms.ObservationMetadata[
                otu_table_by_samp_type_ms.getObservationIndex(otu_id)][
                'taxonomy']
        # GI-style subject ids look like 'gi|<gi>|gb|<accession>|...';
        # field 3 is the GenBank accession.
        gb_id = subject_id.split('|')[3]
        ncbi_link = 'http://www.ncbi.nlm.nih.gov/nuccore/%s' % gb_id

        # Compute the abundance of each most wanted OTU in each sample
        # grouping and create a pie chart to go in the HTML table.
        samp_types = otu_table_by_samp_type_ms.SampleIds
        counts = otu_table_by_samp_type_ms.observationData(otu_id)
        if len(counts) != len(samp_types):
            raise WorkflowError("The number of observation counts does not "
                                "match the number of samples in the OTU "
                                "table.")

        # Piechart code modified from matplotlib example:
        # http://matplotlib.sourceforge.net/examples/pylab_examples/
        # pie_demo.html
        figure(figsize=(6,6))
        ax = axes([0.1, 0.1, 0.8, 0.8])
        # Will auto-normalize the counts.
        pie(counts, labels=samp_types, autopct='%1.1f%%', shadow=True)

        output_img_dir = join(output_dir, 'img')
        try:
            makedirs(output_img_dir)
        except OSError:
            # It already exists, which is okay since we already know we are in
            # 'force' mode from above.
            pass
        # We need a relative path to the image.
        pie_chart_fp = join('img',
                'abundance_by_%s_%s.png' % (mapping_category, otu_id))
        savefig(join(output_dir, pie_chart_fp))

        mw_tsv_f.write('%s\t%s\t%s\t%s\t%s\n' %
                (otu_id, seq, tax, gb_id, percent_identity))
        mw_html_f.write('<tr><td>%s</td><td>%s</td><td>%s</td>'
                '<td><a href="%s" target="_blank">%s</a></td><td>%s</td><td>'
                '<img src="%s" /></td></tr>' % (otu_id, seq, tax, ncbi_link,
                gb_id, percent_identity, pie_chart_fp))
    mw_html_f.write('</table>')
    mw_tsv_f.close()
    mw_html_f.close()
    logger.close()
def assign_taxonomy_multiple_times(input_dirs, output_dir,
        assignment_methods, reference_seqs_fp, input_fasta_filename,
        clean_otu_table_filename, id_to_taxonomy_fp=None, confidences=None,
        e_values=None, command_handler=call_commands_serially,
        rdp_max_memory=None, status_update_callback=print_to_stdout,
        force=False, read_1_seqs_fp=None, read_2_seqs_fp=None):
    """ Performs sanity checks on passed arguments and directories. Builds
        commands for each method and sends them off to be executed.

        For every input dataset directory and every requested assignment
        method ('rdp', 'blast', 'mothur', 'rtax'), generates the
        method-specific commands, runs them one at a time through
        command_handler, measures wall-clock time per command, and writes
        a per-(dataset, method, parameter) timing summary to the workflow
        log. Raises WorkflowError on missing/invalid arguments, a
        pre-existing output_dir without force, or an unknown method.
    """
    ## Check if temp output directory exists
    try:
        makedirs(output_dir)
    except OSError:
        if not force:
            raise WorkflowError("Output directory '%s' already exists. "
                    "Please choose a different directory, or force overwrite "
                    "with -f." % output_dir)

    ## Check for inputs that are universally required
    if assignment_methods is None:
        raise WorkflowError("You must specify at least one method:"
                            "'rdp', 'blast', 'mothur', or 'rtax'.")
    if input_fasta_filename is None:
        raise WorkflowError("You must provide an input fasta filename.")
    if clean_otu_table_filename is None:
        raise WorkflowError("You must provide a clean otu table filename.")
    if id_to_taxonomy_fp is None:
        raise WorkflowError("You must provide an ID to taxonomy map "
                            "filename.")

    logger = WorkflowLogger(generate_log_fp(output_dir))
    # Accumulates (dataset, "(method, param)", seconds) tuples for the
    # timing summary written at the end.
    time_results = []

    for input_dir in input_dirs:
        ## Make sure the input dataset directory exists.
        if not isdir(input_dir):
            raise WorkflowError("The input directory '%s' does not exist." %
                                input_dir)

        input_dir_name = split(normpath(input_dir))[1]
        output_dataset_dir = join(output_dir, input_dir_name)
        input_fasta_fp = join(input_dir, input_fasta_filename)
        clean_otu_table_fp = join(input_dir, clean_otu_table_filename)

        logger.write("\nCreating output subdirectory '%s' if it doesn't "
                     "already exist.\n" % output_dataset_dir)
        try:
            makedirs(output_dataset_dir)
        except OSError:
            # It already exists, which is okay since we already know we are in
            # 'force' mode from above.
            pass

        for method in assignment_methods:
            ## Method is RDP
            if method == 'rdp':
                ## Check for execution parameters required by RDP method
                if confidences is None:
                    raise WorkflowError("You must specify at least one "
                                        "confidence level.")
                ## Generate command for RDP
                commands = _generate_rdp_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp, id_to_taxonomy_fp,
                        clean_otu_table_fp, confidences,
                        rdp_max_memory=rdp_max_memory)
            ## Method is BLAST
            elif method == 'blast':
                ## Check for execution parameters required by BLAST method
                if e_values is None:
                    raise WorkflowError("You must specify at least one "
                                        "E value.")
                ## Generate command for BLAST
                commands = _generate_blast_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp, id_to_taxonomy_fp,
                        clean_otu_table_fp, e_values)
            ## Method is Mothur
            elif method == 'mothur':
                ## Check for execution parameters required by Mothur method
                if confidences is None:
                    raise WorkflowError("You must specify at least one "
                                        "confidence level.")
                ## Generate command for mothur
                commands = _generate_mothur_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp, id_to_taxonomy_fp,
                        clean_otu_table_fp, confidences)
            ## Method is RTAX
            elif method == 'rtax':
                ## Check for execution parameters required by RTAX method
                if read_1_seqs_fp is None:
                    raise WorkflowError("You must specify a file containing "
                                        "the first read from pair-end "
                                        "sequencing.")
                ## Generate command for rtax
                commands = _generate_rtax_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp, id_to_taxonomy_fp,
                        clean_otu_table_fp, read_1_seqs_fp,
                        read_2_seqs_fp=read_2_seqs_fp)
            ## Unsupported method
            else:
                raise WorkflowError("Unrecognized or unsupported taxonomy "
                                    "assignment method '%s'." % method)

            # send command for current method to command handler
            for command in commands:
                # call_commands_serially needs a list of commands so here's a
                # length one commmand list.
                c = list()
                c.append(command)

                # Time each command individually so assignment runtimes can
                # be compared across methods/parameters.
                start = time()
                command_handler(c, status_update_callback, logger,
                                close_logger_on_success=False)
                end = time()

                # Recover the dataset name from the command line: take the
                # argument following '-i' and use its parent directory name.
                input_file = command[0][1].split()[
                        command[0][1].split().index('-i') + 1].split('/')[-2]

                # Only assignment commands (titles starting with
                # 'Assigning ...') are timed; helper commands are skipped.
                if 'Assigning' in command[0][0]:
                    time_results.append((input_file,
                            ' '.join(command[0][0].split()[2:]),
                            end - start))

    # removes and writes out the title we initialized with earlier
    logger.write('\n\nAssignment times (seconds):\n')
    for t in time_results:
        # write out each time result as (method, params)\ttime (seconds)
        # First clean up the output: t[1] looks like "(method, param)".
        method, param = t[1].split(', ')
        method = method.lstrip('(')
        param = param.rstrip(')')
        logger.write('%s\t%s\t%s\t%s\n' % (t[0], method, param, str(t[2])))
    logger.close()
def main():
    """Script entry point: run summarize_taxa_through_plots.py on a
    meta-analysis OTU table, zip the results, and register the generated
    taxa-summary HTML pages in the database.

    Python 2 script (print statements, `raise Exc, msg`). Side effects:
    writes under opts.fs_fp, shells out via os.system, and calls
    data_access.addMetaAnalysisFiles.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    #get all the options
    cd_dir=path.join(opts.fs_fp,'sumtaxa')
    # Unique per-run directory suffix.
    tmp_prefix=get_tmp_filename('',suffix='').strip()
    output_dir=path.join(opts.fs_fp,'sumtaxa','sum_taxa_'+tmp_prefix)
    # Web-visible counterpart of output_dir (stored in the DB below).
    web_fp=path.join(opts.web_fp,'sumtaxa','sum_taxa_'+tmp_prefix)
    otu_table_fp=opts.otu_table_fp
    mapping_file_fp=opts.mapping_file_fp
    file_name_prefix=opts.fname_prefix
    user_id=int(opts.user_id)
    meta_id=int(opts.meta_id)
    bdiv_rarefied_at=int(opts.bdiv_rarefied_at)
    jobs_to_start=opts.jobs_to_start.split(',')
    tree_fp=opts.tree_fp
    command_handler=call_commands_serially
    status_update_callback=no_status_updates
    zip_fpath=opts.zip_fpath
    zip_fpath_db=opts.zip_fpath_db
    run_date=opts.run_date
    force=True

    # get database connection
    # NOTE(review): on ImportError data_access is never bound, so the DB
    # upload at the end would raise NameError — confirm whether that path
    # is reachable in deployment.
    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig
        import cx_Oracle
        data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        print "NOT IMPORTING QIIMEDATAACCESS"
        pass

    # parse params
    try:
        parameter_f = open(opts.params_path)
    except IOError:
        raise IOError,\
         "Can't open parameters file (%s). Does it exist? Do you have read access?"\
         % opts.params_path
    params=parse_qiime_parameters(parameter_f)

    # write output directory
    try:
        makedirs(output_dir)
    except OSError:
        if force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            print "Output directory already exists. Please choose "+\
                "a different directory, or force overwrite with -f."
            exit(1)

    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)

    #start preparing the script call
    sum_taxa_cmd='%s %s/summarize_taxa_through_plots.py -i %s -m %s -o %s -p %s -s -f' %\
        (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp, output_dir,\
         opts.params_path)

    # One web-facing HTML path per configured chart type; these get
    # registered in the DB after the run.
    chart_types=params['plot_taxa_summary']['chart_type'].split(',')
    html_fpaths=[]
    for ctype in chart_types:
        html_fpaths.append((path.join(web_fp,'taxa_summary_plots',
                            '%s_charts.html' % (ctype)), 'SUMTAXA'))

    commands.append([('Summarize Taxonomy',sum_taxa_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger)

    #zip the files produced
    cmd_call='cd %s; zip -r %s %s' % (output_dir, zip_fpath, './*')
    system(cmd_call)

    #add html links to DB for easy display
    for i in html_fpaths:
        valid=data_access.addMetaAnalysisFiles(True,int(meta_id),i[0],
                'SUMTAXA',run_date,i[1].upper())
        if not valid:
            raise ValueError, 'There was an issue uploading the filepaths to the DB!'
def main():
    """Script entry point: run alpha_rarefaction.py (serially or in
    parallel per the 'serial_or_parallel' parameter) on a meta-analysis
    OTU table, zip the output, and register the rarefaction-plots HTML
    link in the database.

    Python 2 script (print statements, `raise Exc, msg`). Side effects:
    writes under opts.fs_fp, shells out via os.system, and calls
    data_access.addMetaAnalysisFiles.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    #get all the options
    cd_dir=path.join(opts.fs_fp,'arare')
    # Unique per-run directory suffix.
    tmp_prefix=get_tmp_filename('',suffix='').strip()
    output_dir=path.join(opts.fs_fp,'arare','arare_'+tmp_prefix)
    # Web-visible counterpart of output_dir (stored in the DB below).
    web_fp=path.join(opts.web_fp,'arare','arare_'+tmp_prefix)
    otu_table_fp=opts.otu_table_fp
    mapping_file_fp=opts.mapping_file_fp
    file_name_prefix=opts.fname_prefix
    user_id=int(opts.user_id)
    meta_id=int(opts.meta_id)
    bdiv_rarefied_at=int(opts.bdiv_rarefied_at)
    jobs_to_start=opts.jobs_to_start
    tree_fp=opts.tree_fp
    command_handler=call_commands_serially
    status_update_callback=no_status_updates
    zip_fpath=opts.zip_fpath
    zip_fpath_db=opts.zip_fpath_db
    run_date=opts.run_date
    force=True

    # get database connection
    # NOTE(review): on ImportError data_access is never bound, so the DB
    # upload at the end would raise NameError — confirm whether that path
    # is reachable in deployment.
    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig
        import cx_Oracle
        data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        print "NOT IMPORTING QIIMEDATAACCESS"
        pass

    # parse params
    try:
        parameter_f = open(opts.params_path)
    except IOError:
        raise IOError,\
         "Can't open parameters file (%s). Does it exist? Do you have read access?"\
         % opts.params_path
    params=parse_qiime_parameters(parameter_f)

    # write output directory
    try:
        makedirs(output_dir)
    except OSError:
        if force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            print "Output directory already exists. Please choose "+\
                "a different directory, or force overwrite with -f."
            exit(1)

    commands=[]
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)

    # determine whether to run alpha-diversity in serial or parallel
    serial_or_parallel = params['serial_or_parallel']['method']
    if serial_or_parallel=='Serial':
        arare_cmd='%s %s/alpha_rarefaction.py -i %s -m %s -o %s -t %s -p %s -f' %\
            (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp, \
             output_dir,tree_fp,opts.params_path)
    else:
        # Parallel run: -a enables parallel mode, -O 50 starts 50 jobs.
        arare_cmd='%s %s/alpha_rarefaction.py -i %s -m %s -o %s -t %s -a -O 50 -p %s -f' %\
            (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp, \
             output_dir,tree_fp,opts.params_path)

    commands.append([('Alpha-Rarefaction',arare_cmd)])

    command_handler(commands, status_update_callback, logger)

    #zip the distance matrices
    cmd_call='cd %s; zip -r %s %s' % (cd_dir,zip_fpath,'arare_'+tmp_prefix)
    system(cmd_call)

    #convert link into web-link
    web_link=path.join(web_fp, 'alpha_rarefaction_plots',
                       'rarefaction_plots.html')

    #add the distance matrices
    valid=data_access.addMetaAnalysisFiles(True, int(meta_id), web_link,
                                           'ARARE', run_date, 'ARARE')
    if not valid:
        raise ValueError, 'There was an issue uploading the filepaths to the DB!'
def create_personal_results(
    output_dir,
    mapping_fp,
    coord_fp,
    collated_dir,
    otu_table_fp,
    prefs_fp,
    personal_id_column,
    personal_ids=None,
    column_title="Self",
    individual_titles=None,
    category_to_split="BodySite",
    time_series_category="WeeksSinceStart",
    rarefaction_depth=10000,
    alpha=0.05,
    rep_set_fp=None,
    body_site_rarefied_otu_table_dir=None,
    retain_raw_data=False,
    suppress_alpha_rarefaction=False,
    suppress_beta_diversity=False,
    suppress_taxa_summary_plots=False,
    suppress_alpha_diversity_boxplots=False,
    suppress_otu_category_significance=False,
    command_handler=call_commands_serially,
    status_update_callback=no_status_updates,
):
    """Generate personalized microbiome result pages, one per individual.

    For each personal ID found in (or restricted by) ``personal_ids``, this
    builds a per-individual output directory under ``output_dir`` containing
    (unless suppressed) alpha diversity boxplots, alpha rarefaction plots,
    beta diversity 3D plots (plain and time-series), time-series taxa
    summary plots, and per-body-site OTU category significance tables, plus
    an index.html tying them together.

    Returns the list of result directories that were created.

    Raises ValueError if ``personal_id_column``, ``category_to_split`` or
    ``time_series_category`` is not a mapping file header, if a requested
    personal ID is unknown, or if no body site for an individual has
    matching samples in the rarefied OTU table.
    """
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, "support_files")
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), "my_microbes", "support_files"), support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    # Parse the mapping file once; per-individual mapping files are derived
    # from this in the loop below.
    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, "U"))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column " "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file " "column header." % category_to_split)

    # Insert the Self/Other column just before the final (Description) column.
    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = "%s&&%s" % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids == None:
        # Default: process every individual present in the mapping file.
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError(
                    "'%s' is not a personal ID in the mapping " "file column '%s'." % (pid, personal_id_column)
                )

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column " "header." % time_series_category)

    # NOTE(review): otu_table_title appears unused in this function (and
    # splitext returns a (root, ext) tuple, not a title string) — confirm
    # whether it can be removed.
    otu_table_title = splitext(basename(otu_table_fp))

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir, add_filename_suffix(otu_table_fp, "_even%d" % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = "Rarefying OTU table"
            cmd = "single_rarefaction.py -i %s -o %s -d %s" % (otu_table_fp, rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, "per_body_site_otu_tables")

            cmd_title = "Splitting rarefied OTU table by body site"
            cmd = "split_otu_table.py -i %s -m %s -f %s -o %s" % (
                rarefied_otu_table_fp,
                mapping_fp,
                category_to_split,
                per_body_site_dir,
            )
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        else:
            # Reuse pre-rarefied, pre-split tables supplied by the caller.
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        # Files to clean up on a per-individual basis.
        personal_raw_data_files = []
        personal_raw_data_dirs = []

        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest, "mapping_file.txt")
        html_fp = join(output_dir, person_of_interest, "index.html")

        # Build a mapping file tagged Self/Other relative to this individual.
        personal_mapping_data = create_personal_mapping_file(
            mapping_data, person_of_interest, personal_id_index, bodysite_index, individual_titles
        )

        personal_mapping_f = open(personal_mapping_file_fp, "w")
        personal_mapping_f.write(format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        personal_raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index] for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ""
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest, "adiv_boxplots")
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" % person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                collated_dir,
                personal_mapping_file_fp,
                category_to_split,
                column_title,
                rarefaction_depth,
                adiv_boxplots_dir,
            )

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename) for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest, "alpha_rarefaction")
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = "Creating rarefaction plots (%s)" % person_of_interest
            cmd = "make_rarefaction_plots.py -i %s -m %s -p %s -o %s" % (
                collated_dir,
                personal_mapping_file_fp,
                prefs_fp,
                rarefaction_dir,
            )
            commands.append([(cmd_title, cmd)])

            # Intermediate plot/table dirs are raw data to clean up later.
            personal_raw_data_dirs.append(join(rarefaction_dir, "average_plots"))
            personal_raw_data_dirs.append(join(rarefaction_dir, "average_tables"))

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, "beta_diversity")
            pcoa_time_series_dir = join(output_dir, person_of_interest, "beta_diversity_time_series")
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = "Creating beta diversity time series plots (%s)" % person_of_interest
            # --custom_axes/--add_vectors use the concatenated personal-id &&
            # body-site category built above (site_id_category).
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=" % (
                personal_mapping_file_fp,
                prefs_fp,
                coord_fp,
                pcoa_time_series_dir,
            ) + "'%s' --add_vectors='%s,%s'" % (time_series_category, site_id_category, time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = "Creating beta diversity plots (%s)" % person_of_interest
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s" % (personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

        ## Time series taxa summary plots steps
        taxa_summary_plots_html = ""
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest, "time_series")
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            files_to_remove, dirs_to_remove = _generate_taxa_summary_plots(
                otu_table_fp,
                personal_mapping_file_fp,
                person_of_interest,
                column_title,
                column_title_values,
                category_to_split,
                cat_values,
                time_series_category,
                area_plots_dir,
                command_handler,
                status_update_callback,
                logger,
            )

            personal_raw_data_files.extend(files_to_remove)
            personal_raw_data_dirs.extend(dirs_to_remove)

            taxa_summary_plots_html = create_taxa_summary_plots_html(output_dir, person_of_interest, cat_values)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ""
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest, "otu_category_significance")
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            valid_body_sites = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(
                    per_body_site_dir, add_filename_suffix(rarefied_otu_table_fp, "_%s" % cat_value)
                )

                if exists(body_site_otu_table_fp):
                    # Make sure we have at least one sample for Self, otherwise
                    # otu_category_significance.py crashes with a division by
                    # zero error.
                    with open(body_site_otu_table_fp, "U") as body_site_otu_table_f, open(
                        personal_mapping_file_fp, "U"
                    ) as personal_mapping_file_f:
                        personal_sample_count = _count_per_individual_samples(
                            body_site_otu_table_f, personal_mapping_file_f, personal_id_column, person_of_interest
                        )

                        if personal_sample_count < 1:
                            continue
                        else:
                            valid_body_sites.append(cat_value)

                    otu_cat_output_fp = join(otu_cat_sig_dir, "otu_cat_sig_%s.txt" % cat_value)

                    cmd_title = "Testing for significant differences in " 'OTU abundances in "%s" body site (%s)' % (
                        cat_value,
                        person_of_interest,
                    )
                    cmd = "otu_category_significance.py -i %s -m %s -c %s " "-o %s" % (
                        body_site_otu_table_fp,
                        personal_mapping_file_fp,
                        column_title,
                        otu_cat_output_fp,
                    )
                    commands.append([(cmd_title, cmd)])
                    personal_raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            # Hack to allow print-only mode.
            if command_handler is not print_commands and not valid_body_sites:
                raise ValueError(
                    "None of the body sites for personal ID '%s' "
                    "could be processed because there were no "
                    "matching samples in the rarefied OTU table." % person_of_interest
                )

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = create_otu_category_significance_html_tables(
                otu_cat_sig_output_fps, alpha, otu_cat_sig_dir, individual_titles, rep_set_fp=rep_set_fp
            )

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [
                join(rel_otu_cat_sig_dir, html_filename) for html_filename in otu_cat_sig_html_filenames
            ]

            otu_category_significance_html = create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(
            person_of_interest,
            html_fp,
            taxa_summary_plots_html=taxa_summary_plots_html,
            alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
            otu_category_significance_html=otu_category_significance_html,
        )

        # Clean up the unnecessary raw data files and directories for the
        # current individual. glob will only grab paths that exist.
        if not retain_raw_data:
            clean_up_raw_data_files(personal_raw_data_files, personal_raw_data_dirs)

    # Clean up any remaining raw data files that weren't created on a
    # per-individual basis.
    if not retain_raw_data:
        clean_up_raw_data_files(raw_data_files, raw_data_dirs)

    logger.close()

    return output_directories
def tax_align_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
    """Assign taxonomy to, align, and build a tree from a representative set.

    Builds and runs (via ``command_handler``) assign_taxonomy, pynast
    alignment, alignment filtering, and make_phylogeny commands, using the
    parallel script variants when ``parallel`` is True and the method
    supports it.

    Returns (taxonomy_fp, failures_fp): the paths where the taxonomy
    assignments and PyNAST alignment failures will be written.

    NOTE(review): an identical ``tax_align_tree`` is defined again later in
    this module and shadows this definition at import time — confirm one of
    the two copies can be removed.
    """
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    # Only close the logger on success if we created it here; a
    # caller-supplied logger remains the caller's responsibility.
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        # Default taxonomy assigner when none is configured.
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and (assignment_method == 'rdp' or assignment_method == 'blast'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            # NOTE(review): unlike the align_seqs handling below, this del
            # is unguarded — if params['assign_taxonomy'] exists without an
            # 'assignment_method' key, the KeyError silently skips the
            # params_str addition and drops the remaining parameters.
            d = params['assign_taxonomy'].copy()
            del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (assignment_method, repset_fasta_fp,assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
         (assign_taxonomy_dir,repset_fasta_fp, params_str)
    # Remove stale output from a previous run so the script can recreate it.
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    ## Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)
    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    ## Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
     (pynast_dir,input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
     (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    ## Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
     (filtered_aln_fp, tree_fp,params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    # Remove a stale tree from a previous run before rebuilding it.
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp, failures_fp
def tax_align_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
    """Assign taxonomy to, align, and build a tree from a representative set.

    Builds and runs (via ``command_handler``) the following QIIME commands:
      1. assign_taxonomy.py, or the method-specific parallel variant when
         ``parallel`` is True and the method is 'rdp' or 'blast'
      2. align_seqs.py / parallel_align_seqs_pynast.py (PyNAST alignment)
      3. filter_alignment.py
      4. make_phylogeny.py

    repset_fasta_fp: representative sequence fasta file
    output_dir: directory under which all per-step output is created
    command_handler: callable that executes the accumulated command list
    params: dict of per-script parameter dicts (parsed QIIME params file)
    qiime_config: QIIME configuration dict (used by the WorkflowLogger)
    parallel: if True, use the parallel scripts where available
    logger: existing WorkflowLogger to reuse; when None a new one is
     created here and closed on success
    status_update_callback: progress-reporting callback

    Returns (taxonomy_fp, failures_fp): the paths where the taxonomy
    assignments and the PyNAST alignment failures will be written.
    """
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    # Only close the logger on success if we created it here; a
    # caller-supplied logger remains the caller's responsibility.
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        # Default taxonomy assigner when none is configured.
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and (assignment_method == 'rdp' or assignment_method == 'blast'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        try:
            # The parallel script is method-specific, so it doesn't take a
            # --assignment_method option: strip that key (when present) and
            # forward the remaining assign_taxonomy parameters. The deletion
            # is guarded (mirroring the align_seqs handling below) so a
            # params dict without 'assignment_method' no longer silently
            # drops its other parameters via the KeyError handler.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            # No 'assign_taxonomy' section in params at all.
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (assignment_method, repset_fasta_fp,assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the serial taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
         (assign_taxonomy_dir,repset_fasta_fp, params_str)
    # Remove stale output from a previous run so the script can recreate it.
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy',assign_taxonomy_cmd)])

    ## Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir,alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir,input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir,input_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)
    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # The parallel script is method-specific, so it doesn't take a
            # --alignment_method option; strip it before forwarding.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    ## Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
     (pynast_dir,input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
     (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    ## Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
     (filtered_aln_fp, tree_fp,params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    # Remove a stale tree from a previous run before rebuilding it.
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp, failures_fp
def pick_subsampled_open_referenence_otus(input_fp,
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              run_tax_align_tree=True,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
            representative set from step 4 as the reference set.

        NOTE(review): the function name contains a typo ('referenence');
        kept as-is because external callers reference this name.
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    # NOTE(review): python_exe_fp and script_dir appear unused in this
    # function — confirm whether they can be removed.
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    # Only close the logger on success if we created it here.
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(logger,
                   [input_fp,
                    refseqs_fp,
                    step1_otu_map_fp,
                    step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            # Pre-filter: discard reads that don't hit the reference at a
            # low percent identity before the real closed-reference pick.
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            # NOTE(review): prefilter_otu_map_fp is computed but unused in
            # this view — confirm whether it is needed elsewhere.
            prefilter_otu_map_fp = \
             '%s/%s_otus.txt' % (prefilter_dir,input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            # -n negates: keep only reads NOT in the prefilter failures list.
            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # From here on, the prefiltered file is the working input.
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    # Step 3: closed-reference pick of ALL step1 failures against the new
    # de novo rep set from step 2.
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                    step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp,
                                    parallel,
                                    params,
                                    logger)
    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        # Step 4: de novo clustering of the reads that still failed in step 3.
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
         (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
         (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir, min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp, min_otu_size)

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
                   repset_fasta_fp=final_repset_fp,
                   output_dir=output_dir,
                   command_handler=command_handler,
                   params=params,
                   qiime_config=qiime_config,
                   parallel=parallel,
                   logger=logger,
                   status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
         (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table", add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        filtered_otu_table = filter_otus_from_otu_table(
            parse_biom_table(open(otu_table_w_tax_fp, 'U')),
            get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
            0, inf, 0, inf, negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp, 'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        # NOTE(review): commands is empty here, so this call performs no
        # work — presumably retained for symmetry; confirm it can be dropped.
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    # Final (empty) handler call: its purpose is to close the logger when we
    # own it (close_logger_on_success was set above).
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def iterative_pick_subsampled_open_referenence_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_tax_align_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_referenence_otus workflow on multiple inputs
        and handle processing of the results.

        Each iteration writes its output to <output_dir>/<i>/; the new reference
        sequences produced by one iteration become the reference collection for
        the next. After all iterations, the per-iteration OTU tables are merged
        and a master rep set is built; optionally taxonomy assignment,
        alignment, and tree building are run on the merged results.

        input_fps: list of input sequence filepaths, one per iteration
        refseqs_fp: initial reference sequence collection
        logger: if None, a new WorkflowLogger is created and closed on
            completion; if a logger is passed in, it is left open for the caller
    """
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. we want to
    # pre-filter all data against the input data as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already '
                         'exists. Skipping and moving to next.\n\n'
                         % (i, input_fp))
        else:
            pick_subsampled_open_referenence_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_tax_align_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the previous
        ## iteration's data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom'
                             % (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has
    # historically been a frequent failure, so is sometimes run manually
    # in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        final_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' \
            % (output_dir, min_otu_size)
        if exists(final_otu_table_fp) and getsize(final_otu_table_fp) > 0:
            # BUGFIX: previously interpolated otu_table_fp here, which named
            # the wrong file in the log message
            logger.write("Final output file exists (%s). Will not rebuild."
                         % final_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp, final_otu_table_fp],
                         error_on_missing=False)

            taxonomy_fp, pynast_failures_fp = tax_align_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
                (otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_taxa_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(otu_table_w_tax_fp, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(final_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # BUGFIX: close_logger_on_success was computed but never used; the logger
    # was closed unconditionally, which clobbered a caller-supplied logger.
    # Only close the logger if this function created it.
    if close_logger_on_success:
        logger.close()
def run_other_qiime_analysis(data_access, fs_fp, web_fp, otu_table_filepath, map_filepath, file_name_prefix, user_id, meta_id, params_path, rarefied_at, jobs_to_start, tree_fp, zip_fpath, zip_fpath_db): # get the date to put in the db run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S") # Prepare the params for submitting new jobs to the torque-poller params=[] params.append('fs_fp=%s' % fs_fp) params.append('web_fp=%s' % web_fp) params.append('otu_table_fp=%s' % otu_table_filepath) params.append('mapping_file_fp=%s' % map_filepath) params.append('fname_prefix=%s' % file_name_prefix) params.append('user_id=%s' % user_id) params.append('meta_id=%s' % meta_id) params.append('params_path=%s' % params_path) params.append('bdiv_rarefied_at=%s' % rarefied_at) params.append('jobs_to_start=%s' % jobs_to_start) params.append('tree_fp=%s' % tree_fp) params.append('run_date=%s' % run_date) params.append('zip_fpath=%s' % zip_fpath) params.append('zip_fpath_db=%s' % zip_fpath_db) job_input='!!'.join(params) # Determine which meta-analyses the user selected analyses_to_start=jobs_to_start.split(',') # Prepare TopiaryExplorer job if 'showTE' in analyses_to_start: tree_fpath=path.abspath('%s/software/gg_otus_4feb2011/trees/gg_97_otus_4feb2011.tre' % (os.environ['HOME'])) python_exe_fp = qiime_config['python_exe_fp'] commands=[] command_handler=call_commands_serially status_update_callback=no_status_updates logger = WorkflowLogger(generate_log_fp('/tmp/'), params=dict(''), qiime_config=qiime_config) #define topiary explorer fpaths jnlp_fname=path.splitext(path.split(otu_table_filepath)[-1])[0]+'.jnlp' tep_fname=path.splitext(path.split(otu_table_filepath)[-1])[0] + '.tep' jnlp_filepath_web=path.join(web_fp, 'topiaryexplorer_files', jnlp_fname) jnlp_filepath_web_tep=path.join(web_fp,'topiaryexplorer_files', tep_fname) # define the hard-link for the JNLP if ServerConfig.home=='/home/wwwdevuser/': host_name='http://webdev.microbio.me/qiime' else: 
host_name='http://www.microbio.me/qiime' jnlp_filepath_web_tep_url=path.join(host_name, jnlp_filepath_web_tep) output_dir=os.path.join(fs_fp, 'topiaryexplorer_files') #build command make_tep_cmd='%s %s/make_tep.py -i %s -m %s -t %s -o %s -u %s -w' %\ (python_exe_fp, script_dir, otu_table_filepath, map_filepath, tree_fpath, output_dir, jnlp_filepath_web_tep_url) commands.append([('Make TopiaryExplorer jnlp', make_tep_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger) #zip Topiary Explorer jnlp file cmd_call='cd %s; zip %s %s' % (output_dir,zip_fpath,jnlp_fname) system(cmd_call) #zip Topiary Explorer project file cmd_call='cd %s; zip %s %s' % (output_dir,zip_fpath,tep_fname) system(cmd_call) valid=data_access.addMetaAnalysisFiles(True, int(meta_id), jnlp_filepath_web, 'OTUTABLE', run_date, 'TOPIARYEXPLORER') if not valid: raise ValueError, 'There was an issue uploading the filepaths to the DB!' # Generate and Submit Beta-Diversity Job if 'bdiv' in analyses_to_start: job_type='betaDiversityThroughPlots' # Submit the Beta Diversity jobs try: # Attempt the submission submitQiimeJob(meta_id, user_id, job_type, job_input, data_access) except Exception, e: raise ValueError,e
def main():
    """ Script entry point: run beta-diversity-through-plots for a web
        meta-analysis job, zip the outputs, and register the resulting HTML
        filepaths with the database.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # get all the options
    tmp_prefix = get_tmp_filename('', suffix='').strip()
    output_dir = path.join(opts.fs_fp, 'bdiv', tmp_prefix)
    web_fp = path.join(opts.web_fp, 'bdiv', tmp_prefix)
    otu_table_fp = opts.otu_table_fp
    mapping_file_fp = opts.mapping_file_fp
    file_name_prefix = opts.fname_prefix
    user_id = int(opts.user_id)
    meta_id = int(opts.meta_id)
    bdiv_rarefied_at = int(opts.bdiv_rarefied_at)
    jobs_to_start = opts.jobs_to_start.split(',')
    tree_fp = opts.tree_fp
    command_handler = call_commands_serially
    status_update_callback = no_status_updates
    zip_fpath = opts.zip_fpath
    zip_fpath_db = opts.zip_fpath_db
    run_date = opts.run_date
    # removed unused local `force=True`

    # Connect to the database for adding fpaths
    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig
        import cx_Oracle
        data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        # NOTE(review): if this import fails, data_access is never bound and
        # the DB-upload loop at the end of this function will raise NameError.
        # Left as-is to preserve behavior; confirm whether this fallback is
        # ever exercised in production.
        print("NOT IMPORTING QIIMEDATAACCESS")

    # open and get params
    try:
        parameter_f = open(opts.params_path)
    except IOError:
        raise IOError("Can't open parameters file (%s). Does it exist? "
                      "Do you have read access?" % opts.params_path)
    params = parse_qiime_parameters(parameter_f)

    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)

    # get the beta_diversity metrics, so we can determine the filepaths based
    # on these
    beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')

    # determine if beta-diversity should be run in serial or parallel
    serial_or_parallel = params['serial_or_parallel']['method']

    # Build the beta_diversity_through_plots.py call. The four historical
    # copy-pasted variants differed only in the presence of '-c <fields>'
    # (distance histograms) and '-a -O 50' (parallel); build the identical
    # command from one base string plus conditional suffixes.
    beta_div_cmd = '%s %s/beta_diversity_through_plots.py -i %s -m %s -o %s -t %s -p %s' %\
        (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp,
         output_dir, tree_fp, opts.params_path)
    if 'disthist_bdiv_plots' in jobs_to_start:
        beta_div_cmd += ' -c %s' % params['make_distance_histograms']['fields']
    if serial_or_parallel != 'Serial':
        beta_div_cmd += ' -a -O 50'
    beta_div_cmd += ' -f'

    # add in optional parameters depending on whether they are supplied
    if bdiv_rarefied_at:
        beta_div_cmd += " -e %s" % (str(bdiv_rarefied_at))

    html_fpaths = []
    # add 3d plots params
    if '3d_bdiv_plots' not in jobs_to_start:
        beta_div_cmd += " --suppress_3d_plots"
    else:
        for met in beta_diversity_metrics:
            html_fpaths.append((path.join(web_fp, '%s_3d_discrete' % (met),
                                          '%s_pc_3D_PCoA_plots.html' % (met)),
                                '3D_DISCRETE_PLOT'))
            html_fpaths.append((path.join(web_fp, '%s_3d_continuous' % (met),
                                          '%s_pc_3D_PCoA_plots.html' % (met)),
                                '3D_CONTINUOUS_PLOT'))

    # add 2d plots params
    if '2d_bdiv_plots' not in jobs_to_start:
        beta_div_cmd += " --suppress_2d_plots"
    else:
        for met in beta_diversity_metrics:
            html_fpaths.append((path.join(web_fp, '%s_2d_discrete' % (met),
                                          '%s_pc_2D_PCoA_plots.html' % (met)),
                                '2D_DISCRETE_PLOT'))
            html_fpaths.append((path.join(web_fp, '%s_2d_continuous' % (met),
                                          '%s_pc_2D_PCoA_plots.html' % (met)),
                                '2D_CONTINUOUS_PLOT'))

    # add distance histograms params
    if 'disthist_bdiv_plots' in jobs_to_start:
        for met in beta_diversity_metrics:
            html_fpaths.append((path.join(web_fp, '%s_histograms' % (met),
                                          '%s_dm_distance_histograms.html'
                                          % (met)),
                                'DISTANCE_HISTOGRAM'))

    commands.append([('Beta Diversity Through Plots', beta_div_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger)

    # zip the files produced
    # SECURITY NOTE: shell string built by interpolation; paths are
    # server-generated here, but this should move to subprocess with an
    # argument list if any component can be user-influenced.
    cmd_call = 'cd %s; zip -r %s %s' % (output_dir, zip_fpath, './*')
    system(cmd_call)

    # add html links to DB for easy display
    for i in html_fpaths:
        valid = data_access.addMetaAnalysisFiles(True, int(meta_id), i[0],
                                                 'BDIV', run_date,
                                                 i[1].upper())
        if not valid:
            raise ValueError('There was an issue uploading the filepaths '
                             'to the DB!')