def test_mp_simple(self):
        '''Test that predictions from the "mp" method match the expected
        table.'''
        observed_pred, _unused_ci = castor_hsp_workflow(
            tree_path=in_tree1,
            trait_table_path=in_traits1,
            hsp_method="mp",
            ran_seed=10)

        # check_like=True makes the comparison ignore the order of the index
        # and columns.
        pd.testing.assert_frame_equal(observed_pred,
                                      hsp_mp_pred_in,
                                      check_like=True)
    def test_emp_prob_ci(self):
        '''Test that the confidence-interval table returned for the
        "emp_prob" method matches the expected table.'''
        workflow_kwargs = {"tree_path": in_tree1,
                           "trait_table_path": in_traits1,
                           "hsp_method": "emp_prob",
                           "ran_seed": 10,
                           "calc_ci": True}

        # Only the second returned table (the CIs) is checked here.
        _unused_pred, observed_ci = castor_hsp_workflow(**workflow_kwargs)

        # check_like=True makes the comparison ignore the order of the index
        # and columns.
        pd.testing.assert_frame_equal(observed_ci,
                                      hsp_emp_prob_pred_in_ci,
                                      check_like=True)
    def test_scp_simple(self):
        '''Test that predictions from the "scp" method have the expected
        dimensions and row/column names (values are not compared).'''

        predict_out, ci_out = castor_hsp_workflow(tree_path=in_tree1,
                                                 trait_table_path=in_traits1,
                                                 hsp_method="scp",
                                                 ran_seed=10)

        # Since values can differ depending on exact dependency versions, just
        # comparing dimension and names. All values are set to 0 on both sides
        # before comparing.
        predict_out[:] = 0

        # Zero a copy of the expected table rather than the table itself:
        # hsp_scp_pred_in is defined outside this test, so zeroing it in place
        # would leave it modified for any later use.
        expected_zeroed = hsp_scp_pred_in.copy()
        expected_zeroed[:] = 0

        pd.testing.assert_frame_equal(predict_out, expected_zeroed,
                                      check_like=True)
# Example #4
# 0
def main():
    '''Parse the command-line arguments, run hidden-state prediction on the
    selected trait table and write the predicted table (plus the confidence
    interval table, when enabled) as tab-delimited files.'''

    args = parser.parse_args()

    # One of a default table (--in_trait) or a custom table
    # (--observed_trait_table) is required.
    if not args.in_trait and not args.observed_trait_table:
        raise RuntimeError(
            "A default input trait table needs to be specified with the "
            "--in_trait option, or alternatively a custom table can be "
            "specified with the --observed_trait_table option")

    # The default table takes precedence when both were given.
    if args.in_trait:
        trait_table = default_tables[args.in_trait]
    else:
        trait_table = args.observed_trait_table

    # Check that input filenames exist.
    check_files_exist([args.tree, trait_table])

    # CIs are only enabled for these discrete trait prediction methods.
    ci_methods = {'emp_prob', 'mp'}
    ci_setting = bool(args.confidence and args.hsp_method in ci_methods)

    count_outfile = args.output_prefix + ".tsv"
    ci_outfile = args.output_prefix + "_ci.tsv"

    workflow_kwargs = {"tree_path": args.tree,
                       "trait_table_path": trait_table,
                       "hsp_method": args.hsp_method,
                       "chunk_size": args.chunk_size,
                       "calc_nsti": args.calculate_NSTI,
                       "calc_ci": ci_setting,
                       "check_input": args.check,
                       "num_proc": args.processes,
                       "ran_seed": args.seed}

    hsp_table, ci_table = castor_hsp_workflow(**workflow_kwargs)

    # The prediction table is always written; the CI table only if enabled.
    to_write = [(hsp_table, count_outfile)]
    if ci_setting:
        to_write.append((ci_table, ci_outfile))

    for table, outfile in to_write:
        make_output_dir_for_file(outfile)
        table.to_csv(path_or_buf=outfile,
                     index_label="sequence",
                     sep="\t")
# Example #5
# 0
    def test_mp_ci(self):
        '''Test that the confidence-interval table returned for the "mp"
        method matches the expected table.'''
        _unused_pred, observed_ci = castor_hsp_workflow(
            tree_path=in_tree1,
            trait_table_path=in_traits1,
            hsp_method="mp",
            ran_seed=10,
            calc_ci=True)

        # NOTE: the CI values themselves are compared here (the tables are not
        # zeroed before the comparison). check_like=True makes the comparison
        # ignore the order of the index and columns.
        pd.testing.assert_frame_equal(observed_ci,
                                      hsp_mp_pred_in_ci,
                                      check_like=True)
# Example #6
# 0
def hsp_pipeline_steps(func, calculate_NSTI, out_tree, func_table_in,
                       hsp_method, ci_setting, threads, seed, output_folder,
                       verbose):
    '''Run hidden-state prediction for a single function category and write
    the predicted table (and the CI table when ci_setting is true) to
    output_folder as tab-delimited files. Returns the path of the predicted
    table.

    Kept as a separate function for improved garbage collection (i.e. so
    that large objects no longer needed are removed from memory).'''

    # NSTI is only requested for the 16S ("marker") table.
    want_nsti = bool(func == "marker" and calculate_NSTI)

    if verbose:
        print("Running hidden-state prediction for " + func)

    predictions, intervals = castor_hsp_workflow(
        tree_path=out_tree,
        trait_table_path=func_table_in,
        hsp_method=hsp_method,
        calc_nsti=want_nsti,
        calc_ci=ci_setting,
        check_input=False,
        num_proc=threads,
        ran_seed=seed)

    # "_nsti" is part of the filename when NSTI values were requested.
    out_suffix = "_nsti_predicted.tsv" if want_nsti else "_predicted.tsv"
    count_outfile = path.join(output_folder, func + out_suffix)

    if verbose:
        print("Writing out predicted gene family abundances to " +
              count_outfile)

    predictions.to_csv(path_or_buf=count_outfile,
                       index_label="sequence",
                       sep="\t")

    # Nothing more to write unless CIs were requested.
    if not ci_setting:
        return count_outfile

    ci_outfile = path.join(output_folder, func + "_predicted_ci.tsv")

    if verbose:
        print("Writing out predicted gene family CIs to " + ci_outfile)

    intervals.to_csv(path_or_buf=ci_outfile,
                     index_label="sequence",
                     sep="\t")

    return count_outfile
def main():
    '''Parse the command-line arguments, run hidden-state prediction on the
    selected trait table and write the predicted table to args.output as a
    tab-delimited file. Confidence intervals are never calculated here.'''

    args = parser.parse_args()

    # One of a default table (--in_trait) or a custom table
    # (--observed_trait_table) is required.
    if not args.in_trait and not args.observed_trait_table:
        raise RuntimeError(
            "A default input trait table needs to be specified with the "
            "--in_trait option, or alternatively a custom table can be "
            "specified with the --observed_trait_table option")

    # The default table takes precedence when both were given.
    if args.in_trait:
        trait_table = default_tables[args.in_trait]
    else:
        trait_table = args.observed_trait_table

    # Check that input filenames exist.
    check_files_exist([args.tree, trait_table])

    # No longer support outputting CIs with this script, so calc_ci is always
    # False and the second returned table is ignored.
    hsp_table, _unused_ci = castor_hsp_workflow(
        tree_path=args.tree,
        trait_table_path=trait_table,
        hsp_method=args.hsp_method,
        chunk_size=args.chunk_size,
        calc_nsti=args.calculate_NSTI,
        calc_ci=False,
        check_input=args.check,
        num_proc=args.processes,
        ran_seed=args.seed,
        verbose=args.verbose)

    # Output the table to file; compression is inferred from the filename.
    make_output_dir_for_file(args.output)
    hsp_table.to_csv(path_or_buf=args.output,
                     index_label="sequence",
                     sep="\t",
                     compression="infer")
# Example #8
# 0
def main():
    '''Run the full pipeline from the command-line arguments.

    Steps, in order:
      1. Place the study sequences onto the reference tree (written to
         "out.tre" in the output folder).
      2. Run hidden-state prediction for every requested function category
         and for the marker gene table, writing one table per category.
      3. Run the metagenome pipeline for every function category and write
         the unstratified (and optionally stratified) tables.
      4. Unless --no_pathways is set, infer pathway abundances and coverages
         and write them to the "pathways_out" folder.

    Exits through sys.exit if a requested default category is not one of the
    allowed default categories.'''

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    # Description columns are only added when the default tables are used.
    use_default_tables = args.custom_trait_tables is None

    if use_default_tables:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy so that adding the "marker" entry below does not
        # modify the shared default_tables mapping.
        func_tables = dict(default_tables)

    else:
        funcs = []
        func_tables = {}

        # Each custom table is identified by its filename without extension.
        for custom in args.custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table is the one used for pathway inference.
        # (str.split always returns at least one element.)
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = set(['emp_prob', 'mp'])

    if args.confidence and args.hsp_method in discrete_set:
        ci_setting = True
    else:
        ci_setting = False

    gap_fill_opt = not args.no_gap_fill

    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # Only output NSTI in 16S table.
        nsti_setting = False
        if func == "marker" and args.calculate_NSTI:
            nsti_setting = True

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        count_outfile = path.join(args.output, func + "_predicted.tsv")

        # Add "_nsti" to filename if output.
        if nsti_setting:
            count_outfile = path.join(args.output,
                                      func + "_nsti_predicted.tsv")

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile,
                         index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile,
                            index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Pass arguments to key function and get predicted functions
        # stratified and unstratified by genomes.
        strat_pred, unstrat_pred = run_metagenome_pipeline(
            input_biom=args.input,
            function=func_infile,
            marker=marker_infile,
            out_dir=func_output_dir,
            max_nsti=args.max_nsti,
            min_reads=args.min_reads,
            min_samples=args.min_samples,
            strat_out=args.stratified,
            proc=args.threads,
            output_normfile=True)

        print("Writing metagenome output files for " + func + " to: " +
              func_output_dir)

        # Key into default_map for this function, or None for custom tables.
        descrip_key = func if use_default_tables else None

        # Generate output table filepaths and write out pandas dataframe.
        unstrat_outfile = path.join(func_output_dir,
                                    "pred_metagenome_unstrat.tsv")

        unstrat_pred.index.name = "function"
        unstrat_pred.reset_index(inplace=True)

        _write_table_with_descrip(unstrat_pred, unstrat_outfile, descrip_key)

        # Write out stratified table only if that option was specified.
        if args.stratified:
            strat_outfile = path.join(func_output_dir,
                                      "pred_metagenome_strat.tsv")
            strat_pred.reset_index(inplace=True)

            _write_table_with_descrip(strat_pred, strat_outfile, descrip_key)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        # Pathways are inferred from the stratified table when available.
        if args.stratified:
            metagenome_basename = "pred_metagenome_strat.tsv"
        else:
            metagenome_basename = "pred_metagenome_unstrat.tsv"

        in_metagenome = path.join(args.output,
                                  rxn_func + "_metagenome_out",
                                  metagenome_basename)

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
                inputfile=in_metagenome,
                mapfile=default_pathway_map,
                regroup_mapfile=default_regroup_map,
                proc=args.threads,
                out_dir=temp_dir,
                gap_fill=gap_fill_opt,
                per_sequence_contrib=args.per_sequence_contrib,
                print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        # Key into default_map for pathways, or None for custom tables.
        pathway_key = "METACYC" if use_default_tables else None

        # Write output files.
        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_abun.reset_index(inplace=True)
        _write_table_with_descrip(unstrat_abun, unstrat_abun_outfile,
                                  pathway_key)

        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")
        unstrat_cov.reset_index(inplace=True)
        _write_table_with_descrip(unstrat_cov, unstrat_cov_outfile,
                                  pathway_key)

        # Write stratified output only if something besides None was returned.
        if strat_abun is not None:
            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")
            _write_table_with_descrip(strat_abun, strat_abun_outfile,
                                      pathway_key)

        if strat_cov is not None:
            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")
            _write_table_with_descrip(strat_cov, strat_cov_outfile,
                                      pathway_key)

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in " + "%.2f" % elapsed_time +
          " seconds.")


def _write_table_with_descrip(table, outfile, descrip_key=None):
    '''Write table to outfile as a tab-delimited file without the index.

    When descrip_key is not None the table is first passed through
    add_descrip_col using the mapfile default_map[descrip_key].'''

    if descrip_key is not None:
        table = add_descrip_col(inputfile=table,
                                mapfile=default_map[descrip_key],
                                in_df=True)

    table.to_csv(path_or_buf=outfile,
                 sep="\t",
                 index=False)