def main():
    '''Command-line entry point: parse the arguments and run the place
    seqs pipeline.

    Intermediate files are written to the directory given by the
    intermediate option when it is set (the directory is created first).
    Otherwise they are written to a TemporaryDirectory.'''

    args = parser.parse_args()

    # Check that input filenames exist.
    check_files_exist([args.study_fasta, args.ref_msa, args.tree])

    # Options that do not depend on where intermediate files are written.
    shared_opts = dict(study_fasta=args.study_fasta,
                       ref_msa=args.ref_msa,
                       tree=args.tree,
                       out_tree=args.out_tree,
                       threads=args.threads,
                       papara_output=args.papara_output,
                       chunk_size=args.chunk_size,
                       print_cmds=args.print_cmds)

    # No intermediate directory requested: work in a temporary one.
    if not args.intermediate:
        with TemporaryDirectory() as scratch_dir:
            place_seqs_pipeline(out_dir=scratch_dir, **shared_opts)
        return

    # Intermediate directory requested: create it and output there.
    make_output_dir(args.intermediate)
    place_seqs_pipeline(out_dir=args.intermediate, **shared_opts)
def main():
    '''Command-line entry point: parse the arguments and run the place
    seqs pipeline against the reference directory.

    Intermediate files go to the directory given by the intermediate
    option when it is set (created first), and to a TemporaryDirectory
    otherwise.'''

    args = parser.parse_args()

    def run_pipeline(work_dir):
        # Single place where the pipeline is invoked; only the directory
        # for intermediate files differs between the two cases below.
        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_dir=args.ref_dir,
                            out_tree=args.out_tree,
                            threads=args.processes,
                            out_dir=work_dir,
                            chunk_size=args.chunk_size,
                            print_cmds=args.print_cmds)

    if args.intermediate:
        # Intermediate output directory set, so create it and output there.
        make_output_dir(args.intermediate)
        run_pipeline(args.intermediate)
    else:
        # Otherwise use a temporary directory for the intermediate files.
        with TemporaryDirectory() as scratch_dir:
            run_pipeline(scratch_dir)
def test_run_place_seqs_pipeline(self):
    '''Run the full place seqs pipeline once on the test inputs.

    The resulting treefile is not compared against an expected one: as
    for EPA-NG, slight differences are expected depending on different
    versions.'''

    with TemporaryDirectory() as work_dir:

        # The output tree is written alongside the intermediate files.
        tree_outfile = path.join(work_dir, "out.tre")

        place_seqs_pipeline(study_fasta=test_study_seqs,
                            ref_dir=test_ref_dir,
                            out_tree=tree_outfile,
                            threads=1,
                            out_dir=work_dir,
                            chunk_size=5000,
                            print_cmds=False)
def full_pipeline(study_fasta,
                  input_table,
                  output_folder,
                  threads,
                  ref_msa,
                  tree,
                  hmm,
                  in_traits,
                  custom_trait_tables,
                  marker_gene_table,
                  pathway_map,
                  no_pathways,
                  regroup_map,
                  no_regroup,
                  stratified,
                  alignment_tool,
                  max_nsti,
                  min_reads,
                  min_samples,
                  hsp_method,
                  calculate_NSTI,
                  confidence,
                  seed,
                  no_gap_fill,
                  per_sequence_contrib,
                  no_descrip,
                  verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    Steps, in order: place study sequences on the reference tree, run
    hidden-state prediction for every requested function table plus the
    marker table, run the metagenome pipeline for every function, and
    (unless no_pathways is set) infer pathway abundances and coverages.

    Returns a tuple (func_output, pathway_outfiles):
      - func_output maps each function id to a single value: the stratified
        result of metagenome_pipeline_steps when stratified is set, and
        the unstratified result otherwise.
      - pathway_outfiles is None when no_pathways is set. Otherwise it is a
        dict with keys "unstrat_abun", "unstrat_cov", "strat_abun" and
        "strat_cov" holding the written file paths; the two stratified
        entries are None when the corresponding table was not returned.

    Exits through sys.exit if output_folder already exists or if a
    category in in_traits is not one of the default categories.'''

    # Check that input files exist.
    check_files_exist([study_fasta, input_table])

    if path.exists(output_folder):
        sys.exit("Stopping - output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Copy the defaults so that adding the marker entry below does not
        # modify the shared default_tables mapping.
        func_tables = dict(default_tables)

    else:

        # Descriptions are never added when custom tables are used.
        no_descrip = True

        funcs = []
        func_tables = {}

        for table_i, custom in enumerate(custom_trait_tables.split(",")):

            # Function id is the table's filename without its extension.
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

            # The first custom table is the one used for pathway inference.
            if table_i == 0:
                rxn_func = func_id

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}
    ci_setting = bool(confidence) and hsp_method in discrete_set

    gap_fill_opt = not no_gap_fill

    if verbose:
        print("Placing sequences onto reference tree.")

    # Define folders for intermediate files.
    intermediate_dir = path.join(output_folder, "intermediate")
    place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
    make_output_dir(intermediate_dir)
    make_output_dir(place_seqs_intermediate)

    place_seqs_pipeline(study_fasta=study_fasta,
                        ref_msa=ref_msa,
                        tree=tree,
                        hmm=hmm,
                        out_tree=out_tree,
                        alignment_tool=alignment_tool,
                        threads=threads,
                        out_dir=place_seqs_intermediate,
                        chunk_size=5000,
                        print_cmds=verbose)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        count_outfile = hsp_pipeline_steps(func=func,
                                           calculate_NSTI=calculate_NSTI,
                                           out_tree=out_tree,
                                           func_table_in=func_tables[func],
                                           hsp_method=hsp_method,
                                           ci_setting=ci_setting,
                                           threads=threads,
                                           seed=seed,
                                           output_folder=output_folder,
                                           verbose=verbose)

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

    marker_infile = predicted_funcs["marker"]

    # Initialize dictionary of function names to metagenome output to
    # return. Each value is a single item: the stratified output when
    # stratified is set and the unstratified output otherwise.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker table is only an input to this step.
        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func)

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        # Only functions with an entry in default_map get a mapfile.
        func_map = None
        if func in default_map:
            func_map = default_map[func]

        func_strat_out, func_unstrat_out = metagenome_pipeline_steps(
            input_table=input_table,
            func_infile=func_infile,
            marker_infile=marker_infile,
            func_output_dir=func_output_dir,
            no_descrip=no_descrip,
            max_nsti=max_nsti,
            min_reads=min_reads,
            min_samples=min_samples,
            stratified=stratified,
            threads=threads,
            func_map=func_map,
            verbose=verbose)

        if stratified:
            func_output[func] = func_strat_out
        else:
            func_output[func] = func_unstrat_out

    pathway_outfiles = None

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not no_pathways:

        pathways_intermediate = path.join(intermediate_dir, "pathways")
        make_output_dir(pathways_intermediate)

        if verbose:
            print("Inferring pathways from predicted " + rxn_func)

        predicted_rxn = func_output[rxn_func]

        # Set regrouping mapfile to be empty if no_regroup set.
        if no_regroup:
            regroup_map = None

        unstrat_abun, unstrat_cov, strat_abun, strat_cov = \
            run_minpath_pipeline(inputfile=predicted_rxn,
                                 mapfile=pathway_map,
                                 regroup_mapfile=regroup_map,
                                 proc=threads,
                                 out_dir=pathways_intermediate,
                                 gap_fill=gap_fill_opt,
                                 per_sequence_contrib=per_sequence_contrib,
                                 print_cmds=verbose)

        pathways_out = path.join(output_folder, "pathways_out")

        # Turn the index into a regular "pathway" column before writing.
        unstrat_abun.index.name = 'pathway'
        unstrat_cov.index.name = 'pathway'
        unstrat_abun.reset_index(inplace=True)
        unstrat_cov.reset_index(inplace=True)

        pathway_outfiles = {}

        if not no_descrip:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        if verbose:
            print("Writing predicted pathway abundances and coverages to " +
                  pathways_out)

        make_output_dir(pathways_out)

        unstrat_abun_outfile = path.join(pathways_out,
                                         "path_abun_unstrat.tsv")
        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile, sep="\t",
                            index=False)
        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile, sep="\t",
                           index=False)

        pathway_outfiles["unstrat_abun"] = unstrat_abun_outfile
        pathway_outfiles["unstrat_cov"] = unstrat_cov_outfile

        strat_abun_outfile = None
        strat_cov_outfile = None

        # Write stratified output only if something besides None was
        # returned.
        if strat_abun is not None:

            if not no_descrip:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun_outfile = path.join(pathways_out,
                                           "path_abun_strat.tsv")
            strat_abun.to_csv(path_or_buf=strat_abun_outfile, sep="\t",
                              index=False)

        if strat_cov is not None:

            if not no_descrip:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")
            strat_cov.to_csv(path_or_buf=strat_cov_outfile, sep="\t",
                             index=False)

        pathway_outfiles["strat_abun"] = strat_abun_outfile
        pathway_outfiles["strat_cov"] = strat_cov_outfile

    return (func_output, pathway_outfiles)
def main():
    '''Command-line entry point for the full PICRUSt2 pipeline.

    Steps, in order: place the study sequences on the reference tree, run
    hidden-state prediction for every requested function table plus the
    marker table, run the metagenome pipeline for each function and write
    its tables, and (unless --no_pathways is set) infer pathway abundances
    and coverages and write them to the "pathways_out" folder. All output
    goes under the folder given by the output option. The elapsed time is
    printed at the end.

    Exits through sys.exit if a requested category is not one of the
    default categories.'''

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    if args.custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Copy the defaults so that adding the marker entry below does not
        # modify the shared default_tables mapping.
        func_tables = dict(default_tables)

    else:

        funcs = []
        func_tables = {}

        for table_i, custom in enumerate(
                args.custom_trait_tables.split(",")):

            # Function id is the table's filename without its extension.
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

            # The first custom table is the one used for pathway inference.
            if table_i == 0:
                rxn_func = func_id

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}
    ci_setting = bool(args.confidence) and args.hsp_method in discrete_set

    gap_fill_opt = not args.no_gap_fill

    # Intermediate placement files are only needed while placing.
    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # Only output NSTI in the marker table.
        nsti_setting = bool(func == "marker" and args.calculate_NSTI)

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        # Add "_nsti" to filename if NSTI values are output.
        if nsti_setting:
            count_outfile = path.join(args.output,
                                      func + "_nsti_predicted.tsv")
        else:
            count_outfile = path.join(args.output, func + "_predicted.tsv")

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile, index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile, index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker table is only an input to this step.
        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Infer metagenome abundances per-sample: get predicted functions
        # stratified and unstratified by genomes. No scratch directory is
        # needed here since the call is given func_output_dir as out_dir.
        strat_pred, unstrat_pred = run_metagenome_pipeline(
            input_biom=args.input,
            function=func_infile,
            marker=marker_infile,
            out_dir=func_output_dir,
            max_nsti=args.max_nsti,
            min_reads=args.min_reads,
            min_samples=args.min_samples,
            strat_out=args.stratified,
            proc=args.threads,
            output_normfile=True)

        print("Writing metagenome output files for " + func + " to: " +
              func_output_dir)

        # Generate output table filepaths and write out pandas dataframe.
        unstrat_outfile = path.join(func_output_dir,
                                    "pred_metagenome_unstrat.tsv")

        # Turn the index into a regular "function" column before writing.
        unstrat_pred.index.name = "function"
        unstrat_pred.reset_index(inplace=True)

        # Descriptions are only added for the default function tables.
        if args.custom_trait_tables is None:
            unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                           mapfile=default_map[func],
                                           in_df=True)

        unstrat_pred.to_csv(path_or_buf=unstrat_outfile, sep="\t",
                            index=False)

        # Write out stratified table only if that option was specified.
        if args.stratified:
            strat_outfile = path.join(func_output_dir,
                                      "pred_metagenome_strat.tsv")
            strat_pred.reset_index(inplace=True)

            if args.custom_trait_tables is None:
                strat_pred = add_descrip_col(inputfile=strat_pred,
                                             mapfile=default_map[func],
                                             in_df=True)

            strat_pred.to_csv(path_or_buf=strat_outfile, sep="\t",
                              index=False)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        # Input is the table written above for the reaction function.
        if args.stratified:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_strat.tsv")
        else:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_unstrat.tsv")

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = \
                run_minpath_pipeline(
                    inputfile=in_metagenome,
                    mapfile=default_pathway_map,
                    regroup_mapfile=default_regroup_map,
                    proc=args.threads,
                    out_dir=temp_dir,
                    gap_fill=gap_fill_opt,
                    per_sequence_contrib=args.per_sequence_contrib,
                    print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        # Write output files.
        unstrat_abun_outfile = path.join(pathways_out,
                                         "path_abun_unstrat.tsv")
        unstrat_abun.reset_index(inplace=True)

        if args.custom_trait_tables is None:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile, sep="\t",
                            index=False)

        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")
        unstrat_cov.reset_index(inplace=True)

        if args.custom_trait_tables is None:
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile, sep="\t",
                           index=False)

        # Write stratified output only if something besides None was
        # returned.
        if strat_abun is not None:
            strat_abun_outfile = path.join(pathways_out,
                                           "path_abun_strat.tsv")

            if args.custom_trait_tables is None:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun.to_csv(path_or_buf=strat_abun_outfile, sep="\t",
                              index=False)

        if strat_cov is not None:
            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")

            if args.custom_trait_tables is None:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov.to_csv(path_or_buf=strat_cov_outfile, sep="\t",
                             index=False)

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in %.2f seconds." % elapsed_time)