예제 #1
0
def castor_nsti(tree_path, known_tips):
    '''Calculate the distance from each study sequence to the closest
    reference sequence by running the castor_nsti.R Rscript.

    Parameters:
        tree_path: path to the treefile, passed straight to the Rscript.
        known_tips: numpy.ndarray of the known tip names (i.e. the rownames
            in the trait table - the reference genome ids).

    Returns:
        pandas DataFrame read from the Rscript's tab-delimited output file,
        indexed by its "sequence" column.
    '''
    castor_nsti_script = path.join(get_picrust_project_dir(), 'picrust2',
                                   'Rscripts', 'castor_nsti.R')

    # Work inside a temporary directory that is removed when the block exits.
    with TemporaryDirectory() as temp_dir:

        # Output known tip names to temp file, one per line
        # (note this object is a numpy.ndarray).
        known_tips_out = path.join(temp_dir, "known_tips.txt")
        known_tips.tofile(known_tips_out, sep="\n")

        nsti_tmp_out = path.join(temp_dir, "nsti_out.txt")

        # Run Rscript.
        system_call_check(" ".join([
            "Rscript", castor_nsti_script, tree_path, known_tips_out,
            nsti_tmp_out
        ]))

        # Read in calculated NSTI values. This has to happen before the
        # temporary directory (and the output file in it) is deleted.
        nsti_out = pd.read_table(nsti_tmp_out, sep="\t", index_col="sequence")

    # NOTE(review): a row-count check used to sit here, but it built a
    # ValueError without raising it, so it never had any effect. It also
    # compared the table against len(known_tips) (the reference tips),
    # whereas per the docstring the rows correspond to study sequences.
    # The dead check was dropped rather than turned into a real raise, which
    # would presumably reject valid output - TODO confirm the expected row
    # count and add a proper validation if one is wanted.
    return nsti_out
예제 #2
0
def castor_hsp_wrapper(tree_path,
                       trait_tab,
                       hsp_method,
                       calc_ci=False,
                       check_input=False,
                       ran_seed=None):
    '''Wrapper for making system calls to the castor_hsp.R Rscript.

    Parameters:
        tree_path: path to the tree file passed to the Rscript.
        trait_tab: trait table argument passed to the Rscript as-is.
        hsp_method: HSP method name passed to the Rscript as-is.
        calc_ci: if truthy, "TRUE" is passed to the Rscript and the
            confidence interval table is read in afterwards.
        check_input: if truthy, "TRUE" is passed to the Rscript.
        ran_seed: passed to the Rscript as str(ran_seed), so the default
            is sent as the literal string "None".

    Returns:
        List of [predicted counts DataFrame, CI DataFrame or None]. Both
        tables are indexed by their "sequence" column.

    Raises:
        ValueError: if the predicted counts file cannot be read.
    '''

    castor_hsp_script = path.join(get_picrust_project_dir(), 'picrust2',
                                  'Rscripts', 'castor_hsp.R')

    # Need to format boolean settings as strings for R to read in as
    # command-line arguments.
    calc_ci_setting = "TRUE" if calc_ci else "FALSE"
    check_input_setting = "TRUE" if check_input else "FALSE"

    # Create temporary directory for writing output files of castor_hsp.R
    with TemporaryDirectory() as temp_dir:

        output_count_path = path.join(temp_dir, "predicted_counts.txt")
        output_ci_path = path.join(temp_dir, "predicted_ci.txt")

        hsp_cmd = " ".join([
            "Rscript", castor_hsp_script, tree_path, trait_tab, hsp_method,
            calc_ci_setting, check_input_setting, output_count_path,
            output_ci_path,
            str(ran_seed)
        ])

        # Run castor_hsp.R
        system_call_check(hsp_cmd)

        # Load the output tables while the temporary files still exist.
        try:
            asr_table = pd.read_table(filepath_or_buffer=output_count_path,
                                      sep="\t",
                                      index_col="sequence")
        except IOError as err:
            # Report the file that actually failed to load (the counts
            # table, not the CI table) and keep the original cause attached.
            raise ValueError("Cannot read in expected output file " +
                             output_count_path) from err

        if calc_ci:
            asr_ci_table = pd.read_table(filepath_or_buffer=output_ci_path,
                                         sep="\t",
                                         index_col="sequence")
        else:
            asr_ci_table = None

    # Return list with predicted counts and CIs.
    return [asr_table, asr_ci_table]
예제 #3
0
def castor_hsp_loocv_wrapper(tree_path,
                             trait_table_path,
                             tips_path,
                             hsp_method,
                             expected_out_path,
                             predicted_out_path,
                             metrics_out_path,
                             num_cores=1):
    '''Run the castor_hsp_loocv.R Rscript through a system call.

    All arguments are forwarded to the Rscript in the order given here
    (num_cores is converted to a string first). The three *_out_path
    arguments are output locations handed to the Rscript, which is
    presumably what writes the result tables. Nothing is returned.'''
    rscript_fp = path.join(get_picrust_project_dir(), 'picrust2', 'Rscripts',
                           'castor_hsp_loocv.R')

    # Assemble the command piece by piece: interpreter and script first,
    # then the positional arguments in the order the Rscript receives them.
    cmd_parts = ["Rscript", rscript_fp]
    cmd_parts.extend([tree_path, trait_table_path, tips_path, hsp_method])
    cmd_parts.extend([expected_out_path, predicted_out_path,
                      metrics_out_path])
    cmd_parts.append(str(num_cores))

    # Run castor_hsp_loocv.R here
    system_call_check(" ".join(cmd_parts))
예제 #4
0
#!/usr/bin/env python

__copyright__ = "Copyright 2018, The PICRUSt Project"
__license__ = "GPL"
__version__ = "2.0.0-b.3"

import unittest
from os import path
from tempfile import TemporaryDirectory
from picrust2.util import get_picrust_project_dir, read_phylip, read_fasta
from picrust2.place_seqs import (place_seqs_pipeline, run_papara,
                                 split_ref_study_papara, run_epa_ng,
                                 gappa_jplace_to_newick)

# Set paths to test files. Everything below is resolved relative to the
# "tests" directory inside the PICRUSt project directory.
test_dir_path = path.join(get_picrust_project_dir(), "tests")

# FASTA file of study sequences in the place_seqs test data.
test_study_seqs = path.join(test_dir_path, "test_data", "place_seqs",
                            "study_seqs_test.fasta")

# Tree file (.tre) in the place_seqs test data.
test_tree = path.join(test_dir_path, "test_data", "place_seqs",
                      "img_centroid_16S_aligned_head30.tre")

# Alignment file (.fna) in the place_seqs test data; shares its basename
# with the tree file above.
test_msa = path.join(test_dir_path, "test_data", "place_seqs",
                     "img_centroid_16S_aligned_head30.fna")

# PaPaRa alignment output stored under place_seqs_output/place_seqs_working
# (presumably the expected result, going by the "exp_" prefix).
exp_papara_phylip = path.join(test_dir_path, "test_data", "place_seqs",
                              "place_seqs_output", "place_seqs_working",
                              "papara_alignment.out")

exp_study_fasta = path.join(test_dir_path, "test_data", "place_seqs",
예제 #5
0
파일: default.py 프로젝트: misazaa/picrust2
#!/usr/bin/env python

__copyright__ = "Copyright 2018, The PICRUSt Project"
__license__ = "GPL"
__version__ = "2.0.0-b.7"

from picrust2.util import get_picrust_project_dir
from os import path

# Default support files packaged with PICRUSt2. All paths below are built
# relative to the PICRUSt project directory.
project_dir = get_picrust_project_dir()

# Default reference FASTA file (prokaryotic).
default_fasta = path.join(project_dir, "default_files", "prokaryotic",
                          "reference.fna")

# Default reference tree file (prokaryotic).
default_tree = path.join(project_dir, "default_files", "prokaryotic",
                         "reference.tre")

# Default regrouping mapfile (EC level 4 to MetaCyc reactions, per filename).
default_regroup_map = path.join(project_dir, "default_files",
                                "pathway_mapfiles",
                                "ec_level4_to_metacyc_rxn.tsv")

# Default pathway mapfile (MetaCyc pathways to reactions, per filename).
default_pathway_map = path.join(project_dir, "default_files",
                                "pathway_mapfiles",
                                "metacyc_path2rxn_struc_filt_pro.txt")

# Initialize default trait table files for hsp.py.
prokaryotic_dir = path.join(project_dir, "default_files", "prokaryotic")

default_tables = {
    "16S": path.join(prokaryotic_dir, "16S.txt.gz"),
예제 #6
0
def minpath_wrapper(sample_id, strat_input, minpath_map, out_dir,
                    print_opt=False):
    '''Run MinPath on one sample's gene family abundances and compute the
    pathway abundances from the result.

    Parameters:
        sample_id: name of the sample; used as a column label and as the
            prefix of all MinPath filenames.
        strat_input: stratified gene family table, passed to
            strat_to_unstrat_counts and path_abun_by_seq.
        minpath_map: path given to MinPath's -map option.
        out_dir: directory where the MinPath input/output files are written.
        print_opt: forwarded to system_call_check as print_out.

    Returns:
        List of [unstratified abundances, stratified abundances], both as
        pandas Series. The stratified Series is the sample_id column of a
        table indexed by ("pathway", "sequence").
    '''

    # Get gene family abundances summed over all sequences for this sample.
    unstrat_input = strat_to_unstrat_counts(strat_input)

    # Define MinPath input and output filenames.
    minpath_in = path.join(out_dir, sample_id + "_minpath_in.txt")
    minpath_report = path.join(out_dir, sample_id + "_minpath_report.txt")
    minpath_details = path.join(out_dir, sample_id + "_minpath_details.txt")
    minpath_mps = path.join(out_dir, sample_id + "_minpath.mps")
    minpath_out = path.join(out_dir, sample_id + "_minpath_out.txt")

    # Write out every function (the index labels in the unstrat table) with a
    # non-zero count in this sample. The context manager guarantees that the
    # file is flushed and closed before MinPath reads it, even on error.
    with open(minpath_in, "w") as id_minpath_fh:
        for func_id in unstrat_input.index.values:
            func_count = unstrat_input.loc[func_id, sample_id]

            # If 0 then skip.
            if func_count == 0:
                continue

            id_minpath_fh.write(func_id + "\t" + str(func_count) + "\n")

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'MinPath',
                             'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
                  minpath_map + " -report " + minpath_report +\
                  " -details " + minpath_details + " -mps " + minpath_mps

    # MinPath's stdout is captured in a file; close it once the call is done
    # instead of leaking the handle.
    with open(minpath_out, "w") as minpath_output:
        system_call_check(minpath_cmd, print_out=print_opt,
                          stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Now read in details file and take abundance of pathway to be
    # mean of top 1/2 most abundant gene families.
    # Abundances of 0 will be added in for gene families not found.
    gf_abundances, gf_ids = parse_minpath_details(minpath_details,
                                                  path_present)

    # Collect per-pathway results and build the pandas objects once at the
    # end, rather than growing a Series/DataFrame inside the loop.
    pathway_names = []
    pathway_means = []
    strat_frames = []

    # Loop through all pathways present and get mean of 1/2 most abundant.
    for pathway in gf_abundances:

        # Like HUMAnN2, sort enzyme reactions, take second half, and get
        # their mean abundance.

        # First get indices of sorted list.
        sorted_index = list(np.argsort(gf_abundances[pathway]))
        sorted_gf_abundances = [gf_abundances[pathway][i]
                                for i in sorted_index]
        sorted_gf_ids = [gf_ids[pathway][i] for i in sorted_index]

        # Take second half of gene family abundances and ids lists.
        half_i = len(sorted_gf_abundances) // 2
        gf_abundances_subset = sorted_gf_abundances[half_i:]
        gf_ids_subset = sorted_gf_ids[half_i:]

        # Take mean for unstratified pathway abundance.
        subset_total = sum(gf_abundances_subset)
        pathway_mean = subset_total / len(gf_abundances_subset)
        pathway_names.append(pathway)
        pathway_means.append(pathway_mean)

        # Get stratified pathway abundances by sequences.
        strat_path_abun = path_abun_by_seq(strat_input,
                                           gf_ids_subset,
                                           subset_total,
                                           pathway_mean)

        # Remove rows with 0 abundance in this sample. The filtered result
        # must be assigned back (a bare indexing expression has no effect);
        # copy so that the column assignment below acts on its own frame.
        strat_path_abun = strat_path_abun.loc[
            strat_path_abun[sample_id] > 0].copy()

        # Add pathway as new column and make it part of the index.
        strat_path_abun["pathway"] = [pathway] * strat_path_abun.shape[0]
        strat_path_abun = strat_path_abun.set_index("pathway", append=True)

        # Change order of index levels to (pathway, sequence).
        strat_path_abun = strat_path_abun.reorder_levels(["pathway",
                                                          "sequence"])

        strat_frames.append(strat_path_abun)

    # Explicit dtype keeps the empty case a float Series on every pandas
    # version.
    unstrat_abun = pd.Series(pathway_means, index=pathway_names,
                             dtype="float64")

    if strat_frames:
        # Single concat; the index level names come from the frames
        # themselves (concat's "levels" argument only applies with "keys").
        strat_abun = pd.concat(strat_frames)
    else:
        # No pathways identified: return an empty, correctly indexed table.
        strat_abun = pd.DataFrame(columns=["pathway", "sequence", sample_id])
        strat_abun = strat_abun.set_index(["pathway", "sequence"])

    # Return unstratified and stratified abundances.
    # Note that the stratified abundances are converted to a series.
    return [unstrat_abun, strat_abun[sample_id]]
예제 #7
0
#!/usr/bin/env python

__copyright__ = "Copyright 2018, The PICRUSt Project"
__license__ = "GPL"
__version__ = "2.0.0-b.3"

import unittest
from os import path
from tempfile import TemporaryDirectory
from picrust2.util import get_picrust_project_dir, system_call_check

# Paths to input files. Everything below is resolved relative to the
# "tests" directory inside the PICRUSt project directory.
test_dir_path = path.join(get_picrust_project_dir(), "tests")

# FASTA file of study sequences in the place_seqs test data.
test_study_seqs = path.join(test_dir_path, "test_data", "place_seqs",
                            "study_seqs_test.fasta")

# Tree file (.tre) in the place_seqs test data.
test_tree = path.join(test_dir_path, "test_data", "place_seqs",
                      "img_centroid_16S_aligned_head30.tre")

# Alignment file (.fna) in the place_seqs test data; shares its basename
# with the tree file above.
test_msa = path.join(test_dir_path, "test_data", "place_seqs",
                     "img_centroid_16S_aligned_head30.fna")

# Known marker table (TSV) in the workflow test data.
test_known_marker = path.join(test_dir_path, "test_data", "workflow",
                              "workflow_known_marker.tsv")

# Known traits table (TSV) in the workflow test data.
test_known_traits = path.join(test_dir_path, "test_data", "workflow",
                              "workflow_known_traits.tsv")

# Sequence abundance table (TSV) in the workflow test data.
test_seq_abun_tsv = path.join(test_dir_path, "test_data", "workflow",
                             "workflow_seq_abun.tsv")
예제 #8
0
import unittest
from os import path
import pandas as pd
import hashlib
import gzip
from tempfile import TemporaryDirectory
from picrust2.util import (write_fasta, read_fasta, write_phylip, read_phylip,
                           three_df_index_overlap_sort, add_descrip_col,
                           get_picrust_project_dir,
                           convert_humann2_to_picrust2,
                           convert_picrust2_to_humann2,
                           convert_picrust2_to_humann2_merged)

from picrust2.default import default_map

# Directory holding the add_descriptions test data, inside the PICRUSt
# project's tests/test_data directory.
descrip_test_dir_path = path.join(get_picrust_project_dir(), "tests",
                                  "test_data", "add_descriptions")

# "output" subdirectory of the add_descriptions test data; the "*_exp" files
# below live here.
descrip_test_dir_out_path = path.join(descrip_test_dir_path, "output")

# Set paths to test input and output files for add_descriptions.py tests.
# "*_in" files are in the test data directory itself, "*_exp" files
# (presumably the expected results) are in its output subdirectory.
ec_unstrat_in = path.join(descrip_test_dir_path, "ec_unstrat_test.txt")
ec_unstrat_exp = path.join(descrip_test_dir_out_path, "ec_unstrat_exp.txt")

ec_strat_in = path.join(descrip_test_dir_path, "ec_strat_test.txt")
ec_strat_exp = path.join(descrip_test_dir_out_path, "ec_strat_exp.txt")

# Input-only case: no matching "*_exp" file is defined for it here.
ec_nomatch_in = path.join(descrip_test_dir_path, "ec_nomatch_test.txt")

metacyc_unstrat_in = path.join(descrip_test_dir_path,
                               "metacyc_unstrat_test.txt")
예제 #9
0
#!/usr/bin/env python

__copyright__ = "Copyright 2018, The PICRUSt Project"
__license__ = "GPL"
__version__ = "2.0.0-b.4"

import unittest
import pandas as pd
from os import path
from tempfile import TemporaryDirectory
from picrust2.run_minpath import (minpath_wrapper, run_minpath_pipeline,
                                  read_strat_genes)
from picrust2.util import get_picrust_project_dir

# Path to test directory ("tests" inside the PICRUSt project directory).
test_dir_path = path.join(get_picrust_project_dir(), "tests")

# Metagenome abundance table (TSV) in the run_minpath test data; the "in_"
# prefix suggests it is the test input.
in_metagenome_abun = path.join(test_dir_path, "test_data", "run_minpath",
                               "test_metagenome_out.tsv")

# Stratified pathway table in the run_minpath test data (presumably the
# expected output, going by the filename).
exp_minpath_out_strat = path.join(test_dir_path, "test_data", "run_minpath",
                                  "expected_out_strat_path.tsv")

# Unstratified pathway table in the run_minpath test data (presumably the
# expected output, going by the filename).
exp_minpath_out_unstrat = path.join(test_dir_path, "test_data", "run_minpath",
                                    "expected_out_unstrat_path.tsv")

# Mapfile located in the project's MinPath directory rather than under
# tests (EC numbers to MetaCyc pathways for prokaryotes, per filename).
map_ec2path_prokaryotic = path.join(get_picrust_project_dir(), "MinPath",
                                    "ec2metacyc_picrust_prokaryotic.txt")


class minpath_wrapper_tests(unittest.TestCase):
예제 #10
0
def minpath_wrapper(sample_id,
                    unstrat_input,
                    minpath_map,
                    out_dir,
                    print_opt=False,
                    extra_str=""):
    '''Run MinPath based on gene abundances in a single sample.

    Parameters:
        sample_id: name of the sample; used as the column label in
            unstrat_input and as the prefix of all MinPath filenames.
        unstrat_input: table whose index labels are reaction ids and which
            has a column named sample_id holding the abundances.
        minpath_map: path given to MinPath's -map option.
        out_dir: output directory; intermediate files are written to its
            "minpath_running" subdirectory.
        print_opt: forwarded to system_call_check as print_out.
        extra_str: string inserted after sample_id in the intermediate
            filenames (not in the name of the captured stdout file).

    Returns:
        Tuple of (pathways MinPath reports as present, defaultdict mapping
        each reaction id with non-zero abundance to that abundance).
    '''

    # Make output directory for MinPath intermediate files.
    make_output_dir(path.join(out_dir, "minpath_running"))

    # Define MinPath input and output filenames.
    minpath_in = path.join(out_dir, "minpath_running",
                           sample_id + extra_str + "_minpath_in.txt")

    minpath_report = path.join(out_dir, "minpath_running",
                               sample_id + extra_str + "_minpath_report.txt")

    minpath_details = path.join(out_dir, "minpath_running",
                                sample_id + extra_str + "_minpath_details.txt")

    minpath_mps = path.join(out_dir, "minpath_running",
                            sample_id + extra_str + "_minpath.mps")

    # File that captures MinPath's stdout. Note it is written directly to
    # out_dir and its name does not include extra_str.
    minpath_out = path.join(out_dir, sample_id + "_minpath_out.txt")

    # Initialize dictionary for keeping track of reaction abundances.
    reaction_abun = defaultdict(int)

    # Loop over all reactions (which are the index labels in unstrat table
    # unless regrouped) and write out those with non-zero abundance. The
    # context manager guarantees that the file is flushed and closed before
    # MinPath reads it, even if a write fails.
    with open(minpath_in, "w") as id_minpath_fh:
        for reaction_id in unstrat_input.index.values:
            reaction_count = unstrat_input.loc[reaction_id, sample_id]

            # If 0 then skip.
            if reaction_count == 0:
                continue

            id_minpath_fh.write(reaction_id + "\t" + str(reaction_count) +
                                "\n")

            reaction_abun[reaction_id] = reaction_count

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'picrust2', 'MinPath',
                             'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
                  minpath_map + " -report " + minpath_report +\
                  " -details " + minpath_details + " -mps " + minpath_mps

    # Close the stdout capture file once the call is done instead of leaking
    # the handle.
    with open(minpath_out, "w") as minpath_output:
        system_call_check(minpath_cmd, print_out=print_opt,
                          stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Return which pathways are present and the abundances of all reactions
    # with non-zero abundance.
    return (path_present, reaction_abun)