Example #1
def run_epa_ng(tree: str,
               ref_msa_fastafile: str,
               study_msa_fastafile: str,
               model: str,
               out_dir: str,
               chunk_size=5000,
               threads=1,
               print_cmds=False):
    '''Run EPA-NG on specified tree, reference MSA, and study sequence MSA.
    Will output a .jplace file in out_dir.'''

    make_output_dir(out_dir)

    epa_ng_command = [
        "epa-ng", "--tree", tree, "--ref-msa", ref_msa_fastafile, "--query",
        study_msa_fastafile, "--chunk-size",
        str(chunk_size), "-T",
        str(threads), "-m", model, "-w", out_dir, "--filter-acc-lwr", "0.99",
        "--filter-max", "100"
    ]

    system_call_check(epa_ng_command, print_out=print_cmds)

    # Parse jplace file so that output is reproducible.
    jplace_orig = path.join(out_dir, "epa_result.jplace")
    jplace_parsed = path.join(out_dir, "epa_result_parsed.jplace")
    parse_jplace(jplace_orig, jplace_parsed)
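
# A hypothetical usage sketch for run_epa_ng (not part of the original
# source); all file paths and the model string below are placeholders.
run_epa_ng(tree="ref_tree.newick",
           ref_msa_fastafile="ref_seqs_hmmalign.fasta",
           study_msa_fastafile="study_seqs_hmmalign.fasta",
           model="GTR+G",
           out_dir="epa_out",
           chunk_size=5000,
           threads=4,
           print_cmds=True)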
Example #2
def castor_nsti(tree_path,
                known_tips):
    '''Will calculate the distance from each study sequence to the closest
    reference sequence. Takes in the path to the treefile and the known tips
    (i.e. the rownames in the trait table - the reference genome ids).'''
    castor_nsti_script = path.join(path.dirname(path.abspath(__file__)),
                                   'Rscripts', 'castor_nsti.R')

    # Create temporary directory for working in.
    with TemporaryDirectory() as temp_dir:

        # Output known tip names to temp file
        # (note this object is a numpy.ndarray)
        known_tips_out = path.join(temp_dir, "known_tips.txt")
        known_tips.tofile(known_tips_out, sep="\n")

        nsti_tmp_out = path.join(temp_dir, "nsti_out.txt")

        # Run Rscript.
        system_call_check(" ".join(["Rscript",
                                    castor_nsti_script,
                                    tree_path,
                                    known_tips_out,
                                    nsti_tmp_out]))

        # Read in calculated NSTI values.
        nsti_out = pd.read_csv(nsti_tmp_out, sep="\t", index_col="sequence")

    # Make sure that the table has the correct number of rows.
    if len(known_tips) != nsti_out.shape[0]:
        raise ValueError("Number of rows in returned NSTI table is incorrect.")

    return(nsti_out)
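
# A hypothetical usage sketch (not from the original source); the tree path
# and reference genome ids are placeholders. As noted in the comments above,
# known_tips is expected to be a numpy.ndarray of reference tip names.
import numpy as np

known_tips = np.array(["ref_genome_1", "ref_genome_2", "ref_genome_3"])
nsti_values = castor_nsti("placed_seqs.tre", known_tips)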
Example #3
def run_papara(tree: str, ref_msa: dict, study_fasta: str, out_dir: str,
               threads=1, print_cmds=False):
    '''Run PaPaRa to place study sequences into reference multiple-sequence
    alignment (MSA). Will return a dictionary of the output MSA (sequence ids
    as keys). Expects path to tree and study FASTA as strings. Expects
    reference MSA as a dictionary output by read_fasta. This MSA will be
    converted to phylip format before running PaPaRa.'''

    # Get absolute paths to input files.
    tree = path.abspath(tree)
    study_fasta = path.abspath(study_fasta)

    # Change working directory to out directory (but keep track of original).
    # This is necessary because PaPaRa outputs into the current working
    # directory.
    orig_wd = getcwd()
    chdir(out_dir)

    # Convert ref sequences from MSA FASTA to phylip.
    write_phylip(ref_msa, "ref_seqs.phylip")

    # Make call to papara to place sequences (outputs phylip format).
    system_call_check("papara -t " + tree + " -s ref_seqs.phylip " +
                      "-q " + study_fasta + " -j " + str(threads) +
                      " -n out", print_command=print_cmds,
                      print_stdout=print_cmds, print_stderr=print_cmds)

    # Change back to original working directory.
    chdir(orig_wd)

    # Read in papara phylip output and return.
    return(read_phylip(path.join(out_dir, "papara_alignment.out"),
                       check_input=True))
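
# A hypothetical usage sketch (not from the original source); the paths are
# placeholders, and the reference MSA is assumed to have been read in with
# read_fasta as the docstring above requires.
ref_msa = read_fasta("ref_msa.fasta")
papara_msa = run_papara(tree="ref_tree.newick",
                        ref_msa=ref_msa,
                        study_fasta="study_seqs.fasta",
                        out_dir="papara_out",
                        threads=2,
                        print_cmds=True)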
Example #4
    def test_full_pipeline_tsv(self):
        '''Test that full pipeline can be run without error with
        TSV sequence abundance table.'''

        with TemporaryDirectory() as temp_dir:

            out_tree = path.join(temp_dir, "out.tre")

            system_call_check("place_seqs.py -s " + test_study_seqs + " -r " +
                              test_ref_dir + " -o " + out_tree)

            traits_predict = path.join(temp_dir, "hsp_out.tsv.gz")
            marker_predict = path.join(temp_dir, "hsp_out_marker.tsv.gz")

            system_call_check("hsp.py -t " + out_tree +
                              " --observed_trait_table " + test_known_traits +
                              " -n -o " + traits_predict)

            system_call_check("hsp.py -t " + out_tree +
                              " --observed_trait_table " + test_known_marker +
                              " -n -o " + marker_predict)

            metagenome_out = path.join(temp_dir, "meta_out")

            system_call_check("metagenome_pipeline.py -i " +
                              test_seq_abun_tsv + " -f " + traits_predict +
                              " --strat_out " + " -m " + marker_predict +
                              " -o " + metagenome_out)

            metagenome_outfile = path.join(metagenome_out,
                                           "pred_metagenome_unstrat.tsv.gz")

            system_call_check("pathway_pipeline.py -i " + metagenome_outfile +
                              " -o " + temp_dir)
Example #5
    def test_picrust2_pipeline_script_per_seq_contrib_strat(self):
        '''Test that full pipeline can be run successfully with
        picrust2_pipeline.py with the --per_sequence_contrib and --stratified
        options.'''

        with TemporaryDirectory() as temp_dir:

            out_dir = path.join(temp_dir, "pipeline_out")

            system_call_check("picrust2_pipeline.py -s " + test_study_seqs +
                              " -i " + test_seq_abun_tsv +
                              " -o " + out_dir +
                              " -r " + test_ref_dir +
                              " -p 1" +
                              " --custom_trait_tables " + test_known_traits +
                              " --marker_gene_table " + test_known_marker +
                              " --reaction_func " + test_known_traits +
                              " --max_nsti 1.9" +
                              " --min_reads 2" +
                              " --min_samples 2" +
                              " --skip_minpath" +
                              " --no_gap_fill" +
                              " --coverage" +
                              " --remove_intermediate" +
                              " --stratified" +
                              " --per_sequence_contrib"
                              " --verbose")
Example #6
def minpath_wrapper(sample_id,
                    unstrat_input,
                    minpath_map,
                    minpath_outdir,
                    print_opt=False,
                    extra_str=""):
    '''Run MinPath based on gene abundances in a single sample. Will return
    a set of all pathways called as present.'''

    # Define MinPath input and output filenames.
    minpath_in = path.join(minpath_outdir,
                           str(sample_id) + extra_str + "_minpath_in.txt")

    minpath_report = path.join(
        minpath_outdir,
        str(sample_id) + extra_str + "_minpath_report.txt")

    minpath_details = path.join(
        minpath_outdir,
        str(sample_id) + extra_str + "_minpath_details.txt")

    minpath_mps = path.join(minpath_outdir,
                            str(sample_id) + extra_str + "_minpath.mps")

    id_minpath_fh = open(minpath_in, "w")

    # Loop over all reactions (which are the index labels in unstrat table
    # unless regrouped).
    for reaction_id in unstrat_input.index.values:
        # Get the abundance of this reaction in the sample and, if non-zero,
        # write the reaction id and count to the MinPath input file.
        reaction_count = unstrat_input.loc[reaction_id, sample_id]

        # If 0 then skip.
        if reaction_count == 0:
            continue

        id_minpath_fh.write(reaction_id + "\t" + str(reaction_count) + "\n")

    id_minpath_fh.close()

    # Run MinPath on this sample.
    path2minpath = path.join(path.dirname(path.abspath(__file__)), 'MinPath',
                             'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
                  minpath_map + " -report " + minpath_report +\
                  " -details " + minpath_details + " -mps " + minpath_mps

    system_call_check(minpath_cmd, print_out=print_opt)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Return list of which pathways are present.
    return (path_present)
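
# A hypothetical sketch of calling minpath_wrapper on a small unstratified
# reaction table (not from the original source); the reaction ids, counts,
# map file, and output directory below are placeholders. The wrapper expects
# minpath_outdir to already exist.
import pandas as pd

unstrat_example = pd.DataFrame({"sample1": [12.0, 0.0, 3.5]},
                               index=["RXN-A", "RXN-B", "RXN-C"])

present_pathways = minpath_wrapper(sample_id="sample1",
                                   unstrat_input=unstrat_example,
                                   minpath_map="metacyc_path2rxn.txt",
                                   minpath_outdir="minpath_working")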
Example #7
def castor_hsp_wrapper(tree_path,
                       trait_tab,
                       hsp_method,
                       calc_ci=False,
                       check_input=False,
                       ran_seed=None):
    '''Wrapper for making system calls to castor_hsp.py Rscript.'''

    castor_hsp_script = path.join(get_picrust_project_dir(), 'picrust2',
                                  'Rscripts', 'castor_hsp.R')

    # Need to format boolean setting as string for R to read in as argument.
    if calc_ci:
        calc_ci_setting = "TRUE"
    else:
        calc_ci_setting = "FALSE"

    if check_input:
        check_input_setting = "TRUE"
    else:
        check_input_setting = "FALSE"

    # Create temporary directory for writing output files of castor_hsp.R

    with TemporaryDirectory() as temp_dir:

        output_count_path = path.join(temp_dir, "predicted_counts.txt")
        output_ci_path = path.join(temp_dir, "predicted_ci.txt")

        hsp_cmd = " ".join([
            "Rscript", castor_hsp_script, tree_path, trait_tab, hsp_method,
            calc_ci_setting, check_input_setting, output_count_path,
            output_ci_path,
            str(ran_seed)
        ])

        # Run castor_hsp.R
        system_call_check(hsp_cmd)

        # Load the output into Table objects
        try:
            asr_table = pd.read_table(filepath_or_buffer=output_count_path,
                                      sep="\t",
                                      index_col="sequence")
        except IOError:
            raise ValueError("Cannot read in expected output file: " +
                             output_count_path)

        if calc_ci:
            asr_ci_table = pd.read_table(filepath_or_buffer=output_ci_path,
                                         sep="\t",
                                         index_col="sequence")
        else:
            asr_ci_table = None

    # Return list with predicted counts and CIs.
    return [asr_table, asr_ci_table]
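
# A hypothetical call to castor_hsp_wrapper (not from the original source);
# the tree and trait table paths are placeholders and "mp" is used as an
# example hsp_method value.
predicted_counts, predicted_ci = castor_hsp_wrapper(tree_path="placed_seqs.tre",
                                                    trait_tab="known_traits.tsv",
                                                    hsp_method="mp",
                                                    calc_ci=True,
                                                    check_input=True,
                                                    ran_seed=100)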
Example #8
    def test_picrust2_pipeline_script(self):
        '''Test that full pipeline can be run successfully with
        picrust2_pipeline.py'''

        with TemporaryDirectory() as temp_dir:

            system_call_check("picrust2_pipeline.py -s " + test_study_seqs +
                              " -i " + test_seq_abun_tsv + " -o " + temp_dir +
                              " -r " + test_msa + " -t " + test_tree +
                              " --custom_trait_tables " + test_known_traits +
                              " --marker_gene_table " + test_known_marker +
                              " -o " + temp_dir)
Example #9
def place_seqs_pipeline(study_fasta, ref_dir, out_tree, threads, out_dir,
                        chunk_size, print_cmds):
    '''Full pipeline for running sequence placement.'''

    # Identify reference files to use.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # Run hmmalign to place study sequences into reference MSA.
    out_stockholm = path.join(out_dir, "query_align.stockholm")

    system_call_check("hmmalign  --trim --dna --mapali " + ref_msa +
                      " --informat FASTA -o " + out_stockholm + " " + hmm +
                      " " + study_fasta,
                      print_out=print_cmds)

    hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

    # Specify split FASTA files to be created.
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

    ref_seqnames = set(list(read_fasta(ref_msa).keys()))

    study_seqnames = set(read_fasta(study_fasta).keys())

    ref_hmmalign_subset = {seq: hmmalign_out[seq] for seq in ref_seqnames}
    study_hmmalign_subset = {seq: hmmalign_out[seq] for seq in study_seqnames}

    write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
    write_fasta(study_hmmalign_subset, study_msa_fastafile)

    # Run EPA-NG to output .jplace file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    jplace_outfile = path.join(epa_out_dir, "epa_result_parsed.jplace")

    gappa_jplace_to_newick(jplace_file=jplace_outfile,
                           outfile=out_tree,
                           print_cmds=print_cmds)
Example #10
def gappa_jplace_to_newick(jplace_file: str, outfile: str, print_cmds=False):
    '''System call to gappa binary to convert jplace object to newick
    treefile (with specified filename).'''

    gappa_out_dir = path.dirname(jplace_file)

    # Run gappa to convert jplace to newick.
    system_call_check("gappa analyze graft --jplace-path " + jplace_file +
                      " --fully-resolve --out-dir " + gappa_out_dir,
                      print_out=print_cmds)

    # Expected name of output newick file.
    newick_file = jplace_file.replace(".jplace", ".newick")

    # Rename newick file to be specified outfile.
    system_call_check("mv " + newick_file + " " + outfile,
                      print_out=print_cmds)
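
# A hypothetical usage sketch (not from the original source); the filenames
# are placeholders matching the intermediate names used elsewhere in this
# code.
gappa_jplace_to_newick(jplace_file="epa_out/epa_result_parsed.jplace",
                       outfile="out.tre",
                       print_cmds=True)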
Example #11
def run_epa_ng(tree: str,
               ref_msa_fastafile: str,
               study_msa_fastafile: str,
               out_dir: str,
               chunk_size=5000,
               threads=1,
               print_cmds=False):
    '''Run EPA-NG on specified tree, reference MSA, and study sequence MSA.
    Will output a .jplace file in out_dir.'''

    make_output_dir(out_dir)

    system_call_check("epa-ng --tree " + tree + " --ref-msa " +
                      ref_msa_fastafile + " --query " + study_msa_fastafile +
                      " --chunk-size " + str(chunk_size) + " -T " +
                      str(threads) + " -w " + out_dir,
                      print_out=print_cmds)
Example #12
    def test_full_pipeline_biom(self):
        '''Test that full pipeline can be run without error with
        BIOM sequence abundance table.'''

        with TemporaryDirectory() as temp_dir:

            out_tree = path.join(temp_dir, "out.tre")

            system_call_check("place_seqs.py -s " + test_study_seqs + " -r " +\
                              test_msa + " -t " + test_tree + " -o " +\
                              out_tree)

            hsp_out_prefix = path.join(temp_dir, "hsp_out")
            hsp_out_prefix_marker = path.join(temp_dir, "hsp_out_marker")

            system_call_check("hsp.py -t " + out_tree +\
                " --observed_trait_table " + test_known_traits + " -n -c " +\
                "-o " + hsp_out_prefix)

            system_call_check("hsp.py -t " + out_tree +\
                " --observed_trait_table " + test_known_marker + " -n -c " +\
                "-o " + hsp_out_prefix_marker)

            traits_predict = hsp_out_prefix + ".tsv"

            marker_predict = hsp_out_prefix_marker + ".tsv"

            metagenome_out = path.join(temp_dir, "meta_out")

            system_call_check("metagenome_pipeline.py -i " + test_seq_abun_biom +\
                              " -f " + traits_predict + " -m " +
                              marker_predict + " -o " + metagenome_out)

            metagenome_outfile = path.join(metagenome_out,
                                           "pred_metagenome_strat.tsv")

            minpath_out = path.join(temp_dir, "minpath_out")

            system_call_check("run_minpath.py -i " + metagenome_outfile +\
                              " -m " + minpath_map + " -o " + minpath_out)
Example #13
def castor_hsp_loocv_wrapper(tree_path,
                             trait_table_path,
                             tips_path,
                             hsp_method,
                             expected_out_path,
                             predicted_out_path,
                             metrics_out_path,
                             num_cores=1):
    '''Runs the castor_hsp_loocv.R Rscript and writes out result tables'''
    castor_loocv_hsp_script_fp = path.join(get_picrust_project_dir(),
                                           'picrust2', 'Rscripts',
                                           'castor_hsp_loocv.R')

    loocv_cmd = " ".join([
        "Rscript", castor_loocv_hsp_script_fp, tree_path, trait_table_path,
        tips_path, hsp_method, expected_out_path, predicted_out_path,
        metrics_out_path,
        str(num_cores)
    ])

    # Run castor_hsp_loocv.R here
    system_call_check(loocv_cmd)
Example #14
def full_pipeline(study_fasta, input_table, output_folder, processes, ref_dir,
                  in_traits, custom_trait_tables, marker_gene_table,
                  pathway_map, rxn_func, no_pathways, regroup_map, no_regroup,
                  stratified, max_nsti, min_reads, min_samples, hsp_method,
                  min_align, skip_nsti, skip_minpath, no_gap_fill, coverage,
                  per_sequence_contrib, wide_table, skip_norm,
                  remove_intermediate, verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.'''

    # Throw warning if --per_sequence_contrib set but --stratified unset.
    if per_sequence_contrib and not stratified:
        print(
            "\nThe option --per_sequence_contrib was set, but not the option "
            "--stratified. This means that a stratified pathway table will "
            "be output only (i.e. a stratified metagenome table will NOT "
            "be output).\n",
            file=sys.stderr)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func + " is not " +
                         "one of the default categories.")

        func_tables = default_tables

    else:
        # Split paths to input custom trait tables and take the basename to be
        # the function id.
        funcs = []
        func_tables = {}

        for custom in custom_trait_tables.split(","):

            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

    # Add reaction function to be in set of gene families if it is not already
    # and as long as pathways are also to be predicted.
    if rxn_func not in funcs and not no_pathways:
        orig_rxn_func = rxn_func
        rxn_func = path.splitext(path.basename(rxn_func))[0]
        funcs.append(rxn_func)

        if rxn_func not in func_tables:
            func_tables[rxn_func] = orig_rxn_func

    if not skip_norm:
        # Append marker as well, since this also needs to be run.
        funcs.append("marker")
        func_tables["marker"] = marker_gene_table

    # Check that all input files exist.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)
    files2check = [study_fasta, input_table, ref_msa, tree, hmm, model] + list(
        func_tables.values())

    if not no_pathways:
        files2check.append(pathway_map)

        # Throw warning if default pathway mapfile used with non-default
        # reference files.
        if pathway_map == default_pathway_map and ref_dir != default_ref_dir:
            print(
                "Warning - non-default reference files specified with "
                "default pathway mapfile of prokaryote-specific MetaCyc "
                "pathways (--pathway_map option). This usage may be "
                "unintended.",
                file=sys.stderr)

        if not no_regroup:
            files2check.append(regroup_map)

    # This will throw an error if any input files are not found.
    check_files_exist(files2check)

    # Check that sequence names in FASTA overlap with input table.
    check_overlapping_seqs(study_fasta, input_table, verbose)

    if path.exists(output_folder):
        sys.exit("Stopping since output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    if verbose:
        print("Placing sequences onto reference tree", file=sys.stderr)

    # Define folders for intermediate files (unless --remove_intermediate set).
    if remove_intermediate:
        place_seqs_intermediate = ""
        pathways_intermediate = ""
    else:
        intermediate_dir = path.join(output_folder, "intermediate")
        make_output_dir(intermediate_dir)
        place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
        pathways_intermediate = path.join(intermediate_dir, "pathways")

    # Run place_seqs.py.
    place_seqs_cmd = [
        "place_seqs.py", "--study_fasta", study_fasta, "--ref_dir", ref_dir,
        "--out_tree", out_tree, "--processes",
        str(processes), "--intermediate", place_seqs_intermediate,
        "--min_align",
        str(min_align), "--chunk_size",
        str(5000)
    ]

    if verbose:
        place_seqs_cmd.append("--verbose")

    system_call_check(place_seqs_cmd,
                      print_command=verbose,
                      print_stdout=verbose,
                      print_stderr=True)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree,
              file=sys.stderr)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    if not skip_norm:
        # Make sure the marker database is first in the list, since it will be
        # run on a single core, which makes it easier to identify errors if
        # the program exits while working on this function type.
        funcs.insert(0, funcs.pop(funcs.index("marker")))

    for func in funcs:
        # Change output filename for NSTI and non-NSTI containing files.
        hsp_outfile = path.join(output_folder, func + "_predicted")

        if (func == "marker" and not skip_nsti) or (skip_norm
                                                    and not skip_nsti):
            hsp_outfile = hsp_outfile + "_and_nsti.tsv.gz"
        else:
            hsp_outfile = hsp_outfile + ".tsv.gz"

        # Keep track of output filename for next step of pipeline.
        predicted_funcs[func] = hsp_outfile

        # Run hsp.py for each function database.
        hsp_cmd = [
            "hsp.py", "--tree", out_tree, "--output", hsp_outfile,
            "--observed_trait_table", func_tables[func], "--hsp_method",
            hsp_method, "--seed", "100"
        ]

        # Add flags to command if specified.
        if (func == "marker" and not skip_nsti) or (skip_norm
                                                    and not skip_nsti):
            hsp_cmd.append("--calculate_NSTI")

        # Run marker on only 1 processor.
        if func == "marker":
            hsp_cmd += ["--processes", "1"]
        else:
            hsp_cmd += ["--processes", str(processes)]

        if verbose:
            hsp_cmd.append("--verbose")

        system_call_check(hsp_cmd,
                          print_command=verbose,
                          print_stdout=verbose,
                          print_stderr=True)

    # Now run metagenome pipeline commands.
    # Initialize dictionary of function names --> metagenome output files.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func, file=sys.stderr)

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        metagenome_pipeline_cmd = [
            "metagenome_pipeline.py", "--input", input_table, "--function",
            predicted_funcs[func], "--min_reads",
            str(min_reads), "--min_samples",
            str(min_samples), "--out_dir", func_output_dir
        ]

        # Initialize two-element list as value for each function.
        # First value will be unstratified output and second will be
        # stratified output.
        func_output[func] = [None, None]

        func_output[func][0] = path.join(func_output_dir,
                                         "pred_metagenome_unstrat.tsv.gz")

        if wide_table:
            metagenome_pipeline_cmd.append("--wide_table")

        if not skip_nsti:
            metagenome_pipeline_cmd += ["--max_nsti", str(max_nsti)]

        if skip_norm:
            metagenome_pipeline_cmd.append("--skip_norm")
        else:
            metagenome_pipeline_cmd += ["--marker", predicted_funcs["marker"]]

        if stratified:
            metagenome_pipeline_cmd.append("--strat_out")

            if wide_table:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_strat.tsv.gz")
            else:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_contrib.tsv.gz")

        system_call_check(metagenome_pipeline_cmd,
                          print_command=verbose,
                          print_stdout=verbose,
                          print_stderr=True)

    # Now infer pathway abundances and coverages unless --no_pathways set.
    pathway_outfiles = None

    if not no_pathways:

        path_output_dir = path.join(output_folder, "pathways_out")

        if verbose:
            print("Inferring pathways from predicted " + rxn_func,
                  file=sys.stderr)

        # Determine whether stratified or unstratified table should be input.
        if not stratified or per_sequence_contrib:
            rxn_input_metagenome = func_output[rxn_func][0]
        else:
            rxn_input_metagenome = func_output[rxn_func][1]

        pathway_pipeline_cmd = [
            "pathway_pipeline.py", "--input", rxn_input_metagenome,
            "--out_dir", path_output_dir, "--map", pathway_map,
            "--intermediate", pathways_intermediate, "--proc",
            str(processes)
        ]

        if no_gap_fill:
            pathway_pipeline_cmd.append("--no_gap_fill")

        if skip_minpath:
            pathway_pipeline_cmd.append("--skip_minpath")

        if coverage:
            pathway_pipeline_cmd.append("--coverage")

        if no_regroup:
            pathway_pipeline_cmd.append("--no_regroup")
        else:
            pathway_pipeline_cmd += ["--regroup_map", regroup_map]

        if wide_table:
            pathway_pipeline_cmd.append("--wide_table")

        if per_sequence_contrib:
            pathway_pipeline_cmd.append("--per_sequence_contrib")

            if skip_norm:
                norm_sequence_abun = input_table
            else:
                norm_sequence_abun = path.join(output_folder,
                                               rxn_func + "_metagenome_out",
                                               "seqtab_norm.tsv.gz")

            pathway_pipeline_cmd += ["--per_sequence_abun", norm_sequence_abun]

            pathway_pipeline_cmd += [
                "--per_sequence_function", predicted_funcs[rxn_func]
            ]

        if verbose:
            pathway_pipeline_cmd.append("--verbose")

        system_call_check(pathway_pipeline_cmd,
                          print_command=verbose,
                          print_stdout=False,
                          print_stderr=True)

        if verbose:
            print("Wrote predicted pathway abundances and coverages to " +
                  path_output_dir,
                  file=sys.stderr)

        # Keep track of output filenames if this function is being used in
        # a non-default way (e.g. with a QIIME2 plugin).
        pathway_outfiles = {}

        pathway_outfiles["unstrat_abun"] = path.join(
            path_output_dir, "path_abun_unstrat.tsv.gz")
        pathway_outfiles["unstrat_cov"] = path.join(path_output_dir,
                                                    "path_cov_unstrat.tsv.gz")

        pathway_outfiles["strat_abun"] = None
        pathway_outfiles["strat_cov"] = None

        if stratified or per_sequence_contrib:
            if wide_table:
                pathway_outfiles["strat_abun"] = path.join(
                    path_output_dir, "path_abun_strat.tsv.gz")

                if per_sequence_contrib:
                    pathway_outfiles["strat_cov"] = path.join(
                        path_output_dir, "path_cov_strat.tsv.gz")

            else:
                pathway_outfiles["strat_abun"] = path.join(
                    path_output_dir, "path_abun_contrib.tsv.gz")
                if per_sequence_contrib:
                    pathway_outfiles["strat_cov"] = path.join(
                        path_output_dir, "path_cov_contrib.tsv.gz")

    return (func_output, pathway_outfiles)
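
# A hypothetical call to full_pipeline to illustrate the argument names (not
# from the original source); every value below is a placeholder, except that
# default_ref_dir, default_pathway_map, and default_tables are the
# module-level defaults already referenced in the function body above.
func_out, path_out = full_pipeline(study_fasta="study_seqs.fasta",
                                   input_table="seqtab.biom",
                                   output_folder="picrust2_output",
                                   processes=4,
                                   ref_dir=default_ref_dir,
                                   in_traits="EC,KO",
                                   custom_trait_tables=None,
                                   marker_gene_table="16S_counts.txt.gz",
                                   pathway_map=default_pathway_map,
                                   rxn_func="EC",
                                   no_pathways=False,
                                   regroup_map="ec_to_metacyc_rxn.tsv",
                                   no_regroup=False,
                                   stratified=False,
                                   max_nsti=2.0,
                                   min_reads=1,
                                   min_samples=1,
                                   hsp_method="mp",
                                   min_align=0.8,
                                   skip_nsti=False,
                                   skip_minpath=False,
                                   no_gap_fill=False,
                                   coverage=False,
                                   per_sequence_contrib=False,
                                   wide_table=False,
                                   skip_norm=False,
                                   remove_intermediate=False,
                                   verbose=True)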
Example #15
def place_seqs_pipeline(study_fasta,
                        ref_dir,
                        out_tree,
                        threads,
                        out_dir,
                        min_align,
                        chunk_size,
                        verbose):
    '''Full pipeline for running sequence placement.'''

    # Throw error if there is a space in the study FASTA filepath.
    if " " in study_fasta:
        sys.exit("Stopping - remove the space from the input FASTA filepath.")

    # Identify reference files to use.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # Run hmmalign to place study sequences into reference MSA.
    out_stockholm = path.join(out_dir, "query_align.stockholm")

    system_call_check("hmmalign --trim --dna --mapali " +
                      ref_msa + " --informat FASTA -o " +
                      out_stockholm + " " + hmm + " " + study_fasta,
                      print_command=verbose, print_stdout=verbose,
                      print_stderr=verbose)

    hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

    # Specify split FASTA files to be created.
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

    ref_seqnames = set(list(read_fasta(ref_msa).keys()))
    study_seqs = read_fasta(study_fasta)

    ref_hmmalign_subset = {seq: hmmalign_out[seq] for seq in ref_seqnames}

    study_hmmalign_subset = check_alignments(raw_seqs=study_seqs,
                                             aligned_seqs=hmmalign_out,
                                             min_align=min_align,
                                             verbose=verbose)

    write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
    write_fasta(study_hmmalign_subset, study_msa_fastafile)

    # Run EPA-ng to place input sequences and output JPLACE file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=verbose)

    jplace_outfile = path.join(epa_out_dir, "epa_result_parsed.jplace")

    gappa_jplace_to_newick(jplace_file=jplace_outfile,
                           outfile=out_tree,
                           print_cmds=verbose)
Example #16
def place_seqs_pipeline(study_fasta, ref_msa, tree, hmm, out_tree,
                        alignment_tool, threads, out_dir, chunk_size,
                        print_cmds):
    '''Full pipeline for running sequence placement.'''

    if alignment_tool == "hmmalign":

        out_stockholm = path.join(out_dir, "query_align.stockholm")

        system_call_check("hmmalign  --trim --dna --mapali " + ref_msa +
                          " --informat FASTA -o " + out_stockholm + " " + hmm +
                          " " + study_fasta,
                          print_out=print_cmds)

        hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

        ref_seqnames = set(list(read_fasta(ref_msa).keys()))

        study_seqnames = set(read_fasta(study_fasta).keys())

        ref_hmmalign_subset = {seq: hmmalign_out[seq] for seq in ref_seqnames}
        study_hmmalign_subset = {
            seq: hmmalign_out[seq]
            for seq in study_seqnames
        }

        write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
        write_fasta(study_hmmalign_subset, study_msa_fastafile)

    elif alignment_tool == "papara":

        # Read in ref seqs FASTA as a dict.
        ref_msa = read_fasta(ref_msa)

        # Run PaPaRa to place study sequences and read in Phylip file.
        papara_out = run_papara(tree=tree,
                                ref_msa=ref_msa,
                                study_fasta=study_fasta,
                                out_dir=out_dir,
                                threads=threads,
                                print_cmds=print_cmds)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_papara.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_papara.fasta")

        # Split PaPaRa output into two FASTA files containing study and reference
        # sequences respectively.
        split_ref_study_papara(papara_out=papara_out,
                               ref_seqnames=set(list(ref_msa.keys())),
                               study_fasta=study_msa_fastafile,
                               ref_fasta=ref_msa_fastafile)

    # Run EPA-NG to output .jplace file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    jplace_outfile = path.join(epa_out_dir, "epa_result.jplace")

    gappa_jplace_to_newick(jplace_file=jplace_outfile,
                           outfile=out_tree,
                           print_cmds=print_cmds)
Example #17
def custom_tree_pipeline(table: biom.Table,
                         tree: skbio.TreeNode,
                         threads: int = 1,
                         hsp_method: str = "mp",
                         max_nsti: float = 2.0,
                         edge_exponent: float = 0.5,
                         skip_minpath: bool = False,
                         no_gap_fill: bool = False,
                         skip_norm: bool = False,
                         highly_verbose: bool = False) -> (biom.Table,
                                                           biom.Table,
                                                           biom.Table):

    # Run pipeline in temporary directory so that files are not saved locally.
    with TemporaryDirectory() as temp_dir:

        # Need to write out BIOM table and newick tree to be used in pipeline.

        # Write out biom table:
        biom_infile = path.join(temp_dir, "intable.biom")
        with biom.util.biom_open(biom_infile, 'w') as out_biom:
            table.to_hdf5(h5grp=out_biom,
                          generated_by="PICRUSt2 QIIME 2 Plugin")

        # Write out newick tree.
        newick_infile = path.join(temp_dir, "placed_seqs.tre")
        tree.write(newick_infile, format="newick")

        picrust2_out = path.join(temp_dir, "picrust2_out")

        print("Running the below commands:", file=sys.stderr)

        # Run hidden-state prediction step (on the 16S, EC, and KO tables
        # separately).
        hsp_out_16S = path.join(picrust2_out, "16S_predicted.tsv.gz")
        hsp_out_16S_cmd = "hsp.py -i 16S " + \
                          " -t " + newick_infile + \
                          " -p 1 " + \
                          " -n " + \
                          " -o " + hsp_out_16S + \
                          " -m " + hsp_method + \
                          " -e " + str(edge_exponent)

        hsp_out_EC = path.join(picrust2_out, "EC_predicted.tsv.gz")
        hsp_out_EC_cmd = "hsp.py -i EC " + \
                          " -t " + newick_infile + \
                          " -p " + str(threads) + \
                          " -n " + \
                          " -o " + hsp_out_EC + \
                          " -m " + hsp_method + \
                          " -e " + str(edge_exponent)

        hsp_out_KO = path.join(picrust2_out, "KO_predicted.tsv.gz")
        hsp_out_KO_cmd = "hsp.py -i KO " + \
                          " -t " + newick_infile + \
                          " -p " + str(threads) + \
                          " -n " + \
                          " -o " + hsp_out_KO + \
                          " -m " + hsp_method + \
                          " -e " + str(edge_exponent)

        if highly_verbose:
            hsp_out_16S_cmd += " --verbose"
            hsp_out_EC_cmd += " --verbose"
            hsp_out_KO_cmd += " --verbose"

        if not skip_norm:
            system_call_check(hsp_out_16S_cmd,
                              print_command=True,
                              print_stdout=highly_verbose,
                              print_stderr=True)


        system_call_check(hsp_out_EC_cmd,
                          print_command=True,
                          print_stdout=highly_verbose,
                          print_stderr=True)

        system_call_check(hsp_out_KO_cmd,
                          print_command=True,
                          print_stdout=highly_verbose,
                          print_stderr=True)

        # Run metagenome pipeline step.
        EC_metagenome_out = path.join(picrust2_out, "EC_metagenome_out")
        KO_metagenome_out = path.join(picrust2_out, "KO_metagenome_out")

        EC_metagenome_cmd = "metagenome_pipeline.py -i " + biom_infile + \
                            " -f " + hsp_out_EC + \
                            " -o " + EC_metagenome_out + \
                            " --max_nsti " + str(max_nsti)

        KO_metagenome_cmd = "metagenome_pipeline.py -i " + biom_infile + \
                            " -f " + hsp_out_KO + \
                            " -o " + KO_metagenome_out + \
                            " --max_nsti " + str(max_nsti)

        if skip_norm:
            EC_metagenome_cmd += " --skip_norm"
            KO_metagenome_cmd += " --skip_norm"
        else:
            EC_metagenome_cmd += " -m " + hsp_out_16S
            KO_metagenome_cmd += " -m " + hsp_out_16S

        system_call_check(EC_metagenome_cmd, print_command=True,
                          print_stdout=highly_verbose,
                          print_stderr=True)
        system_call_check(KO_metagenome_cmd, print_command=True,
                          print_stdout=highly_verbose,
                          print_stderr=True)

        EC_out = path.join(EC_metagenome_out, "pred_metagenome_unstrat.tsv.gz")
        KO_out = path.join(KO_metagenome_out, "pred_metagenome_unstrat.tsv.gz")

        # Run pathway inference step.
        pathways_out = path.join(picrust2_out, "pathways_out")
        pathabun_out = path.join(pathways_out, "path_abun_unstrat.tsv.gz")

        pathway_pipeline_cmd = "pathway_pipeline.py -i " + EC_out + \
                               " -o " + pathways_out + \
                               " -p " + str(threads)

        if skip_minpath:
            pathway_pipeline_cmd += " --skip_minpath"

        if no_gap_fill:
            pathway_pipeline_cmd += " --no_gap_fill"

        if highly_verbose:
            pathway_pipeline_cmd += " --verbose"

        system_call_check(pathway_pipeline_cmd, print_command=True,
                          print_stdout=highly_verbose,
                          print_stderr=True)

        # Read in output unstratified metagenome tables and return as BIOM
        # objects.
        ko_biom = biom.load_table(KO_out)
        ec_biom = biom.load_table(EC_out)
        pathabun_biom = biom.load_table(pathabun_out)

        return ko_biom, ec_biom, pathabun_biom
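
# A hypothetical sketch of calling custom_tree_pipeline directly with
# in-memory objects (not from the original source); the input file paths are
# placeholders.
import biom
import skbio

table = biom.load_table("feature_table.biom")
tree = skbio.TreeNode.read("placed_seqs.tre")

ko_table, ec_table, pathabun_table = custom_tree_pipeline(table=table,
                                                          tree=tree,
                                                          threads=4,
                                                          hsp_method="mp",
                                                          max_nsti=2.0)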
Example #18
def minpath_wrapper(sample_id, strat_input, minpath_map, out_dir,
                    print_opt=False):
    '''Read in sample_id, gene family table, and out_dir, and run MinPath
    based on the gene family abundances. Returns both unstratified and
    stratified pathway abundances as pandas Series in a list.'''

    # Get gene family abundances summed over all sequences for this sample.
    unstrat_input = strat_to_unstrat_counts(strat_input)

    # Define MinPath input and output filenames.
    minpath_in = path.join(out_dir, sample_id + "_minpath_in.txt")
    minpath_report = path.join(out_dir, sample_id + "_minpath_report.txt")
    minpath_details = path.join(out_dir, sample_id + "_minpath_details.txt")
    minpath_mps = path.join(out_dir, sample_id + "_minpath.mps")

    minpath_output = open(path.join(out_dir, sample_id + "_minpath_out.txt"),
                          "w")

    id_minpath_fh = open(minpath_in, "w")

    # Loop over all functions (which are the index labels in unstrat table).
    for func_id in unstrat_input.index.values:
        # Get the abundance of this function in the sample and, if non-zero,
        # write the function id and count to the MinPath input file.
        func_count = unstrat_input.loc[func_id, sample_id]

        # If 0 then skip.
        if func_count == 0:
            continue

        id_minpath_fh.write(func_id + "\t" + str(func_count) + "\n")

    id_minpath_fh.close()

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'MinPath',
                             'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
                  minpath_map + " -report " + minpath_report +\
                  " -details " + minpath_details + " -mps " + minpath_mps

    system_call_check(minpath_cmd, print_out=print_opt,
                      stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Now read in details file and take abundance of pathway to be
    # mean of top 1/2 most abundant gene families.
    # Abundances of 0 will be added in for gene families not found.
    gf_abundances, gf_ids = parse_minpath_details(minpath_details, path_present)

    # Initialize series and dataframe that will contain pathway abundances.
    unstrat_abun = pd.Series()
    strat_abun = pd.DataFrame(columns=["pathway", "sequence", sample_id])
    strat_abun = strat_abun.set_index(["pathway", "sequence"])

    # Loop through all pathways present and get mean of 1/2 most abundant.
    for pathway in gf_abundances.keys():

        # Like HUMAnN2, sort enzyme reactions, take second half, and get
        # their mean abundance.

        # First get indices of sorted list.
        sorted_index = list(np.argsort(gf_abundances[pathway]))
        sorted_gf_abundances = [gf_abundances[pathway][i] for i in sorted_index]
        sorted_gf_ids = [gf_ids[pathway][i] for i in sorted_index]

        # Take second half of gene family abundances and ids lists.
        half_i = int(len(sorted_gf_abundances) / 2)
        gf_abundances_subset = sorted_gf_abundances[half_i:]
        gf_ids_subset = sorted_gf_ids[half_i:]
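        # Illustrative (hypothetical) numbers: abundances [1, 4, 2, 8] sort to
        # [1, 2, 4, 8]; half_i is 2, the kept subset is [4, 8], and the
        # unstratified pathway abundance below would be (4 + 8) / 2 = 6.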

        # Take mean for unstratified pathway abundance.
        unstrat_abun[pathway] = sum(gf_abundances_subset)/len(gf_abundances_subset)

        # Get stratified pathway abundances by sequences.
        strat_path_abun = path_abun_by_seq(strat_input,
                                           gf_ids_subset,
                                           sum(gf_abundances_subset),
                                           unstrat_abun[pathway])
        # Remove rows that are all 0.
        strat_path_abun = strat_path_abun[strat_path_abun[sample_id] > 0]

        # Add pathway as new column.
        strat_path_abun["pathway"] = [pathway]*strat_path_abun.shape[0]

        strat_path_abun.set_index("pathway", append=True, inplace=True)

        # Changes levels of index labels.
        strat_path_abun = strat_path_abun.reorder_levels(["pathway",
                                                          "sequence"])

        strat_abun = pd.concat([strat_abun, strat_path_abun], levels=["pathway", "sequence"])

    # Return unstratified and stratified abundances.
    # Note that the stratified abundances are converted to a series.
    return([unstrat_abun, strat_abun[sample_id]])
Example #19
def castor_hsp_wrapper(tree_path,
                       trait_tab,
                       hsp_method,
                       calc_ci=False,
                       check_input=False,
                       ran_seed=None,
                       verbose=False):
    '''Wrapper for making system calls to castor_hsp.py Rscript.'''

    castor_hsp_script = path.join(path.dirname(path.abspath(__file__)),
                                  'Rscripts', 'castor_hsp.R')

    # Need to format boolean setting as string for R to read in as argument.
    if calc_ci:
        calc_ci_setting = "TRUE"
    else:
        calc_ci_setting = "FALSE"

    if check_input:
        check_input_setting = "TRUE"
    else:
        check_input_setting = "FALSE"

    # Create temporary directory for writing output files of castor_hsp.R
    with TemporaryDirectory() as temp_dir:

        output_count_path = path.join(temp_dir, "predicted_counts.txt")
        output_ci_path = path.join(temp_dir, "predicted_ci.txt")

        hsp_cmd = " ".join([
            "Rscript", castor_hsp_script, tree_path, trait_tab, hsp_method,
            calc_ci_setting, check_input_setting, output_count_path,
            output_ci_path,
            str(ran_seed)
        ])

        # Run castor_hsp.R
        system_call_check(hsp_cmd,
                          print_command=verbose,
                          print_stdout=verbose,
                          print_stderr=verbose)

        # Load the output into Table objects
        try:
            asr_table = pd.read_csv(filepath_or_buffer=output_count_path,
                                    sep="\t",
                                    dtype={'sequence': str})
            asr_table.set_index('sequence', drop=True, inplace=True)
        except IOError:
            raise ValueError("Cannot read in expected output file: " +
                             output_count_path)

        if calc_ci:
            asr_ci_table = pd.read_csv(filepath_or_buffer=output_ci_path,
                                       sep="\t",
                                       dtype={'sequence': str})
            asr_ci_table.set_index('sequence', drop=True, inplace=True)
        else:
            asr_ci_table = None

    # Return list with predicted counts and CIs.
    return [asr_table, asr_ci_table]
Example #20
def custom_tree_pipeline(
        table: biom.Table,
        tree: skbio.TreeNode,
        threads: int = 1,
        hsp_method: str = "mp",
        max_nsti: float = 2.0) -> (biom.Table, biom.Table, biom.Table):

    # Run pipeline in temporary directory so that files are not saved locally.
    with TemporaryDirectory() as temp_dir:

        # Need to write out BIOM table and newick tree to be used in pipeline.

        # Write out biom table:
        biom_infile = path.join(temp_dir, "intable.biom")
        with biom.util.biom_open(biom_infile, 'w') as out_biom:
            table.to_hdf5(h5grp=out_biom,
                          generated_by="PICRUSt2 QIIME2 Plugin")

        # Write out newick tree.
        newick_infile = path.join(temp_dir, "placed_seqs.tre")
        tree.write(newick_infile, format="newick")

        picrust2_out = path.join(temp_dir, "picrust2_out")

        print("Running the below commands:", file=sys.stderr)

        # Run hidden-state prediction step (on the 16S, EC, and KO tables
        # separately).
        hsp_out_16S = path.join(picrust2_out, "16S_predicted.tsv.gz")
        system_call_check("hsp.py -i 16S " + " -t " + newick_infile +
                          " -p 1 " + " -n " + "-o " + hsp_out_16S + " -m " +
                          hsp_method,
                          print_out=True)

        hsp_out_EC = path.join(picrust2_out, "EC_predicted.tsv.gz")
        system_call_check("hsp.py -i EC " + " -t " + newick_infile + " -p " +
                          str(threads) + " -o " + hsp_out_EC + " -m " +
                          hsp_method,
                          print_out=True)

        hsp_out_KO = path.join(picrust2_out, "KO_predicted.tsv.gz")
        system_call_check("hsp.py -i KO " + " -t " + newick_infile + " -p " +
                          str(threads) + " -o " + hsp_out_KO + " -m " +
                          hsp_method,
                          print_out=True)

        # Run metagenome pipeline step.
        EC_metagenome_out = path.join(picrust2_out, "EC_metagenome_out")
        system_call_check("metagenome_pipeline.py -i " + biom_infile + " -m " +
                          hsp_out_16S + " -f " + hsp_out_EC + " -o " +
                          EC_metagenome_out + " --max_nsti " + str(max_nsti),
                          print_out=True)

        KO_metagenome_out = path.join(picrust2_out, "KO_metagenome_out")
        system_call_check("metagenome_pipeline.py -i " + biom_infile + " -m " +
                          hsp_out_16S + " -f " + hsp_out_KO + " -o " +
                          KO_metagenome_out + " --max_nsti " + str(max_nsti),
                          print_out=True)

        EC_out = path.join(EC_metagenome_out, "pred_metagenome_unstrat.tsv.gz")
        KO_out = path.join(KO_metagenome_out, "pred_metagenome_unstrat.tsv.gz")

        # Run pathway inference step.
        pathways_out = path.join(picrust2_out, "pathways_out")
        pathabun_out = path.join(pathways_out, "path_abun_unstrat.tsv.gz")
        system_call_check("pathway_pipeline.py -i " + EC_out + " -o " +
                          pathways_out + " -p " + str(threads),
                          print_out=True)

        # Read in output unstratified metagenome tables and return as BIOM
        # objects.
        ko_biom = biom.load_table(KO_out)
        ec_biom = biom.load_table(EC_out)
        pathabun_biom = biom.load_table(pathabun_out)

        return ko_biom, ec_biom, pathabun_biom
Example #21
def minpath_wrapper(sample_id,
                    unstrat_input,
                    minpath_map,
                    out_dir,
                    print_opt=False,
                    extra_str=""):
    '''Run MinPath based on gene abundances in a single sample. Will return
    the abundances of gene families within each identified pathway.'''

    # Make output directory for MinPath intermediate files.
    make_output_dir(path.join(out_dir, "minpath_running"))

    # Define MinPath input and output filenames.
    minpath_in = path.join(out_dir, "minpath_running",
                           sample_id + extra_str + "_minpath_in.txt")

    minpath_report = path.join(out_dir, "minpath_running",
                               sample_id + extra_str + "_minpath_report.txt")

    minpath_details = path.join(out_dir, "minpath_running",
                                sample_id + extra_str + "_minpath_details.txt")

    minpath_mps = path.join(out_dir, "minpath_running",
                            sample_id + extra_str + "_minpath.mps")

    minpath_output = open(path.join(out_dir, sample_id + "_minpath_out.txt"),
                          "w")

    id_minpath_fh = open(minpath_in, "w")

    # Initialize dictionary for keeping track of reaction abundances.
    reaction_abun = defaultdict(int)

    # Loop over all reactions (which are the index labels in unstrat table
    # unless regrouped).
    for reaction_id in unstrat_input.index.values:
        # Get the abundance of this reaction in the sample and, if non-zero,
        # write the reaction id and count to the MinPath input file.
        reaction_count = unstrat_input.loc[reaction_id, sample_id]

        # If 0 then skip.
        if reaction_count == 0:
            continue

        id_minpath_fh.write(reaction_id + "\t" + str(reaction_count) + "\n")

        reaction_abun[reaction_id] = reaction_count

    id_minpath_fh.close()

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'picrust2', 'MinPath',
                             'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
                  minpath_map + " -report " + minpath_report +\
                  " -details " + minpath_details + " -mps " + minpath_mps

    system_call_check(minpath_cmd, print_out=print_opt, stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Return list of which pathways are present and the abundances of all gene
    # families.
    return (path_present, reaction_abun)