Exemplo n.º 1
0
def main():
    '''Command-line entry point: parse the options and run the sequence
    placement pipeline.

    Intermediate files are written to args.intermediate when that option is
    set (the directory is made first with make_output_dir); otherwise they
    are written to a directory obtained from a TemporaryDirectory context.'''

    args = parser.parse_args()

    # Stop before doing any work if one of the input files is missing.
    check_files_exist([args.study_fasta, args.ref_msa, args.tree])

    def run_placement(work_dir):
        # Shared call for both cases below - only the folder receiving the
        # intermediate files differs between them.
        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=args.out_tree,
                            threads=args.threads,
                            papara_output=args.papara_output,
                            out_dir=work_dir,
                            chunk_size=args.chunk_size,
                            print_cmds=args.print_cmds)

    # No intermediate directory requested: work inside a temporary one.
    if not args.intermediate:
        with TemporaryDirectory() as scratch_dir:
            run_placement(scratch_dir)
        return

    # Intermediate directory requested: create it and output there.
    make_output_dir(args.intermediate)
    run_placement(args.intermediate)
Exemplo n.º 2
0
def main():
    '''Command-line entry point: parse the options and run the sequence
    placement pipeline against a reference directory (args.ref_dir).

    Intermediate files are written to args.intermediate when that option is
    set (the directory is made first with make_output_dir); otherwise they
    are written to a directory obtained from a TemporaryDirectory context.'''

    args = parser.parse_args()

    def run_placement(work_dir):
        # Shared call for both cases below - only the folder receiving the
        # intermediate files differs between them.
        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_dir=args.ref_dir,
                            out_tree=args.out_tree,
                            threads=args.processes,
                            out_dir=work_dir,
                            chunk_size=args.chunk_size,
                            print_cmds=args.print_cmds)

    # No intermediate directory requested: work inside a temporary one.
    if not args.intermediate:
        with TemporaryDirectory() as scratch_dir:
            run_placement(scratch_dir)
        return

    # Intermediate directory requested: create it and output there.
    make_output_dir(args.intermediate)
    run_placement(args.intermediate)
Exemplo n.º 3
0
    def test_run_place_seqs_pipeline(self):
        '''Smoke test of the full place seqs pipeline: the run only has to
        complete. As with EPA-NG, the output tree is not compared against
        an expected treefile because small differences are expected
        between versions.'''

        # The output tree and all intermediate files share one scratch
        # folder.
        with TemporaryDirectory() as scratch_dir:
            place_seqs_pipeline(study_fasta=test_study_seqs,
                                ref_dir=test_ref_dir,
                                out_tree=path.join(scratch_dir, "out.tre"),
                                threads=1,
                                out_dir=scratch_dir,
                                chunk_size=5000,
                                print_cmds=False)
Exemplo n.º 4
0
def full_pipeline(study_fasta, input_table, output_folder, threads, ref_msa,
                  tree, hmm, in_traits, custom_trait_tables, marker_gene_table,
                  pathway_map, no_pathways, regroup_map, no_regroup,
                  stratified, alignment_tool, max_nsti, min_reads, min_samples,
                  hsp_method, calculate_NSTI, confidence, seed, no_gap_fill,
                  per_sequence_contrib, no_descrip, verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    Steps, in order: place study sequences onto the reference tree, run
    hsp_pipeline_steps for every requested function plus "marker", run
    metagenome_pipeline_steps for every function except "marker", and
    (unless no_pathways is set) run run_minpath_pipeline and write the
    pathway tables under <output_folder>/pathways_out.

    Returns a tuple (func_output, pathway_outfiles):
      - func_output maps each function id to the stratified output of
        metagenome_pipeline_steps if stratified is set, otherwise to the
        unstratified output.
      - pathway_outfiles is None when no_pathways is set. Otherwise it is a
        dict with keys "unstrat_abun", "unstrat_cov", "strat_abun" and
        "strat_cov" giving the written file paths (the two stratified
        entries are None when no stratified table was returned).

    Calls sys.exit if output_folder already exists or if a category in
    in_traits is not one of the default categories.'''

    # Check that input files exist.
    check_files_exist([study_fasta, input_table])

    if path.exists(output_folder):
        sys.exit("Stopping - output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the "marker" entry added below must not leak into
        # the shared module-level default_tables mapping.
        func_tables = dict(default_tables)

    else:

        # Descriptions are not added when custom trait tables are used.
        no_descrip = True

        funcs = []
        func_tables = {}

        # Each custom table is identified by its filename without extension.
        for custom in custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table listed is the one used for pathway
        # inference (split() always yields at least one element).
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}

    ci_setting = bool(confidence and hsp_method in discrete_set)

    gap_fill_opt = not no_gap_fill

    if verbose:
        print("Placing sequences onto reference tree.")

    # Define folders for intermediate files.
    intermediate_dir = path.join(output_folder, "intermediate")
    place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
    make_output_dir(intermediate_dir)
    make_output_dir(place_seqs_intermediate)

    place_seqs_pipeline(study_fasta=study_fasta,
                        ref_msa=ref_msa,
                        tree=tree,
                        hmm=hmm,
                        out_tree=out_tree,
                        alignment_tool=alignment_tool,
                        threads=threads,
                        out_dir=place_seqs_intermediate,
                        chunk_size=5000,
                        print_cmds=verbose)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        count_outfile = hsp_pipeline_steps(func=func,
                                           calculate_NSTI=calculate_NSTI,
                                           out_tree=out_tree,
                                           func_table_in=func_tables[func],
                                           hsp_method=hsp_method,
                                           ci_setting=ci_setting,
                                           threads=threads,
                                           seed=seed,
                                           output_folder=output_folder,
                                           verbose=verbose)

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

    marker_infile = predicted_funcs["marker"]

    # Initialize dictionary of function names to outputs to return. Each
    # value is a single entry: the stratified output when stratified is set,
    # otherwise the unstratified output.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker predictions are only an input to this step.
        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func)

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        # Functions without an entry in default_map get no mapfile.
        func_map = None

        if func in default_map:
            func_map = default_map[func]

        func_strat_out, func_unstrat_out = metagenome_pipeline_steps(
            input_table=input_table,
            func_infile=func_infile,
            marker_infile=marker_infile,
            func_output_dir=func_output_dir,
            no_descrip=no_descrip,
            max_nsti=max_nsti,
            min_reads=min_reads,
            min_samples=min_samples,
            stratified=stratified,
            threads=threads,
            func_map=func_map,
            verbose=verbose)

        if stratified:
            func_output[func] = func_strat_out
        else:
            func_output[func] = func_unstrat_out

    pathway_outfiles = None

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not no_pathways:

        pathways_intermediate = path.join(intermediate_dir, "pathways")
        make_output_dir(pathways_intermediate)

        if verbose:
            print("Inferring pathways from predicted " + rxn_func)

        predicted_rxn = func_output[rxn_func]

        # Set regrouping mapfile to be empty if no_regroup set.
        if no_regroup:
            regroup_map = None

        unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
            inputfile=predicted_rxn,
            mapfile=pathway_map,
            regroup_mapfile=regroup_map,
            proc=threads,
            out_dir=pathways_intermediate,
            gap_fill=gap_fill_opt,
            per_sequence_contrib=per_sequence_contrib,
            print_cmds=verbose)

        pathways_out = path.join(output_folder, "pathways_out")

        # Turn the index into a regular "pathway" column so that the tables
        # can be written below with index=False.
        unstrat_abun.index.name = 'pathway'
        unstrat_cov.index.name = 'pathway'
        unstrat_abun.reset_index(inplace=True)
        unstrat_cov.reset_index(inplace=True)

        pathway_outfiles = {}

        if not no_descrip:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        if verbose:
            print("Writing predicted pathway abundances and coverages to " +
                  pathways_out)

        make_output_dir(pathways_out)

        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile,
                            sep="\t",
                            index=False)
        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile,
                           sep="\t",
                           index=False)

        pathway_outfiles["unstrat_abun"] = unstrat_abun_outfile
        pathway_outfiles["unstrat_cov"] = unstrat_cov_outfile

        # These stay None unless the corresponding stratified table exists.
        strat_abun_outfile = None
        strat_cov_outfile = None

        # Write stratified output only if something besides None was returned.
        if strat_abun is not None:

            if not no_descrip:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")
            strat_abun.to_csv(path_or_buf=strat_abun_outfile,
                              sep="\t",
                              index=False)

        if strat_cov is not None:

            if not no_descrip:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")
            strat_cov.to_csv(path_or_buf=strat_cov_outfile,
                             sep="\t",
                             index=False)

        pathway_outfiles["strat_abun"] = strat_abun_outfile
        pathway_outfiles["strat_cov"] = strat_cov_outfile

    return (func_output, pathway_outfiles)
Exemplo n.º 5
0
def main():
    '''Command-line entry point for the full PICRUSt2 pipeline.

    Steps, in order: place study sequences onto the reference tree
    (written to <output>/out.tre), run castor_hsp_workflow for every
    requested function plus "marker" and write the predictions, run
    run_metagenome_pipeline for every function except "marker" and write
    the tables, and (unless --no_pathways is set) run run_minpath_pipeline
    and write the pathway tables under <output>/pathways_out. The elapsed
    time is printed at the end.

    Calls sys.exit if a category in args.in_traits is not one of the
    default categories.'''

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    if args.custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the "marker" entry added below must not leak into
        # the shared module-level default_tables mapping.
        func_tables = dict(default_tables)

    else:
        funcs = []
        func_tables = {}

        # Each custom table is identified by its filename without extension.
        for custom in args.custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table listed is the one used for pathway
        # inference (split() always yields at least one element).
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}

    ci_setting = bool(args.confidence and args.hsp_method in discrete_set)

    gap_fill_opt = not args.no_gap_fill

    # Intermediate placement files go to a temporary folder; only the
    # output tree (out_tree, inside args.output) is used afterwards.
    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # Only output NSTI in 16S table.
        nsti_setting = bool(func == "marker" and args.calculate_NSTI)

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        # Add "_nsti" to filename if output.
        if nsti_setting:
            count_outfile = path.join(args.output,
                                      func + "_nsti_predicted.tsv")
        else:
            count_outfile = path.join(args.output, func + "_predicted.tsv")

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile,
                         index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile,
                            index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker predictions are only an input to this step.
        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Infer metagenome abundances per-sample. Pass arguments to key
        # function and get predicted functions stratified and unstratified
        # by genomes. No temporary folder is needed here: everything is
        # written to func_output_dir.
        strat_pred, unstrat_pred = run_metagenome_pipeline(
            input_biom=args.input,
            function=func_infile,
            marker=marker_infile,
            out_dir=func_output_dir,
            max_nsti=args.max_nsti,
            min_reads=args.min_reads,
            min_samples=args.min_samples,
            strat_out=args.stratified,
            proc=args.threads,
            output_normfile=True)

        print("Writing metagenome output files for " + func + " to: " +
              func_output_dir)

        # Generate output table filepaths and write out pandas dataframe.
        unstrat_outfile = path.join(func_output_dir,
                                    "pred_metagenome_unstrat.tsv")

        # Turn the index into a regular "function" column so that the table
        # can be written with index=False.
        unstrat_pred.index.name = "function"
        unstrat_pred.reset_index(inplace=True)

        # Descriptions are only added for the default trait tables.
        if args.custom_trait_tables is None:
            unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                           mapfile=default_map[func],
                                           in_df=True)

        unstrat_pred.to_csv(path_or_buf=unstrat_outfile,
                            sep="\t",
                            index=False)

        # Write out stratified table only if that option was specified.
        if args.stratified:
            strat_outfile = path.join(func_output_dir,
                                      "pred_metagenome_strat.tsv")
            strat_pred.reset_index(inplace=True)

            if args.custom_trait_tables is None:
                strat_pred = add_descrip_col(inputfile=strat_pred,
                                             mapfile=default_map[func],
                                             in_df=True)

            strat_pred.to_csv(path_or_buf=strat_outfile,
                              sep="\t",
                              index=False)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        # Use the metagenome table written above for the reaction function.
        if args.stratified:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_strat.tsv")
        else:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_unstrat.tsv")

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        # Intermediate pathway files go to a temporary folder; the returned
        # tables are written out after the folder has been left.
        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
                inputfile=in_metagenome,
                mapfile=default_pathway_map,
                regroup_mapfile=default_regroup_map,
                proc=args.threads,
                out_dir=temp_dir,
                gap_fill=gap_fill_opt,
                per_sequence_contrib=args.per_sequence_contrib,
                print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        # Write output files.
        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_abun.reset_index(inplace=True)

        if args.custom_trait_tables is None:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile,
                            sep="\t",
                            index=False)

        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")
        unstrat_cov.reset_index(inplace=True)

        if args.custom_trait_tables is None:
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile,
                           sep="\t",
                           index=False)

        # Write stratified output only if something besides None was returned.
        if strat_abun is not None:
            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")

            if args.custom_trait_tables is None:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun.to_csv(path_or_buf=strat_abun_outfile,
                              sep="\t",
                              index=False)

        if strat_cov is not None:
            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")

            if args.custom_trait_tables is None:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov.to_csv(path_or_buf=strat_cov_outfile,
                             sep="\t",
                             index=False)

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in " + "%.2f" % elapsed_time +
          " seconds.")