예제 #1
0
파일: _cli.py 프로젝트: kwcantrell/empress
def community_plot(
    tree: str,
    table: str,
    sample_metadata: str,
    output_dir: str,
    pcoa: str,
    feature_metadata: str,
    ignore_missing_samples: bool,
    filter_extra_samples: bool,
    filter_missing_features: bool,
    number_of_pcoa_features: int,
    shear_to_table: bool,
) -> None:
    """Build an Empress visualization from input files and save it.

    Parameters
    ----------
    tree, feature_metadata
        Passed, together with ``output_dir``, to
        ``check_and_process_files``, which yields the tree and the feature
        metadata handed to ``Empress``.
    table
        Passed to ``load_table``.
    sample_metadata
        Path read with ``pandas.read_csv`` as tab-separated text, using the
        first column as the index.
    output_dir
        Directory created with ``os.makedirs`` right before saving.
    pcoa
        Ordination file read with ``OrdinationResults.read`` and trimmed by
        ``prepare_pcoa``. ``None`` means no ordination is used.
    number_of_pcoa_features
        Forwarded to ``prepare_pcoa``.
    ignore_missing_samples, filter_extra_samples, filter_missing_features,
    shear_to_table
        Forwarded unchanged to ``Empress``.
    """
    tree_newick, fm = check_and_process_files(
        output_dir, tree, feature_metadata
    )
    feature_table = load_table(table)
    sample_md = pd.read_csv(sample_metadata, sep="\t", index_col=0)

    # The ordination is optional; only read and prepare it when supplied.
    ordination = None
    if pcoa is not None:
        ordination = prepare_pcoa(
            OrdinationResults.read(pcoa), number_of_pcoa_features
        )

    empress_options = {
        "table": feature_table,
        "sample_metadata": sample_md,
        "feature_metadata": fm,
        "ordination": ordination,
        "ignore_missing_samples": ignore_missing_samples,
        "filter_extra_samples": filter_extra_samples,
        "filter_missing_features": filter_missing_features,
        "shear_to_table": shear_to_table,
    }
    viz = Empress(tree_newick, **empress_options)

    os.makedirs(output_dir)
    save_viz(viz, output_dir, q2=False)
예제 #2
0
파일: parse.py 프로젝트: wasade/emperor
def parse_coords(lines):
    """Parse an ordination results file, falling back to the QIIME parser.

    The input is first handed to
    ``skbio.stats.ordination.OrdinationResults.read``. If that raises
    ``FileFormatError``, the input is rewound (when it supports ``seek``)
    and parsed with ``qiime_parse_coords`` instead.

    Returns (on the skbio path) a 4-tuple of:
    - the ordination's ``site_ids``
    - the ordination's ``site`` coordinates
    - the ordination's ``eigvals``
    - the ordination's ``proportion_explained``

    On the fallback path the return value is whatever
    ``qiime_parse_coords`` returns.
    """
    try:
        ordination = OrdinationResults.read(lines)
        parsed = (
            ordination.site_ids,
            ordination.site,
            ordination.eigvals,
            ordination.proportion_explained,
        )
        return parsed
    except FileFormatError:
        # Rewind so the fallback parser starts from the top of the input.
        try:
            lines.seek(0)
        except AttributeError:
            # No ``seek`` attribute: the input is presumably a plain list
            # of lines rather than a file-like object, so nothing to rewind.
            pass
        return qiime_parse_coords(lines)
 def test_io(self):
     # Smoke test of the public read/write API: serialize the fixture to an
     # in-memory buffer, read it back, and verify that the round-tripped
     # object matches the fixture and has exactly the expected type.
     buffer = StringIO()
     self.ordination_results.write(buffer)

     # Rewind so that reading starts at the beginning of what was written.
     buffer.seek(0)
     round_tripped = OrdinationResults.read(buffer)

     assert_ordination_results_equal(round_tripped,
                                     self.ordination_results)
     self.assertTrue(type(round_tripped) == OrdinationResults)
예제 #4
0
    def setUp(self):
        # Ordination fixture parsed from the in-memory PCOA_STRING text.
        self.ord_res = OrdinationResults.read(StringIO(PCOA_STRING))

        # Column names for the rows stored in ``self.data``.
        self.headers = ['SampleID', 'Treatment', 'DOB', 'Description']

        # One row per sample, in the same column order as ``self.headers``.
        # NOTE(review): the first description reads 'Ctrol_...' while the
        # other control rows read 'Control_...'; kept verbatim because other
        # tests may compare against this exact value.
        self.data = [
            ['PC.354', 'Control', '20061218', 'Ctrol_mouse_I.D._354'],
            ['PC.355', 'Control', '20061218', 'Control_mouse_I.D._355'],
            ['PC.356', 'Control', '20061126', 'Control_mouse_I.D._356'],
            ['PC.481', 'Control', '20070314', 'Control_mouse_I.D._481'],
            ['PC.593', 'Control', '20071210', 'Control_mouse_I.D._593'],
            ['PC.607', 'Fast', '20071112', 'Fasting_mouse_I.D._607'],
            ['PC.634', 'Fast', '20080116', 'Fasting_mouse_I.D._634'],
            ['PC.635', 'Fast', '20080116', 'Fasting_mouse_I.D._635'],
            ['PC.636', 'Fast', '20080116', 'Fasting_mouse_I.D._636'],
        ]
예제 #5
0
파일: parse.py 프로젝트: johnchase/qiime
def parse_coords(lines):
    """Read an ordination results file and unpack its main components.

    The input is parsed with
    ``skbio.stats.ordination.OrdinationResults.read``; see that reader for
    the expected file format.

    Returns a 4-tuple of:
    - the ordination's ``site_ids``
    - the ordination's ``site`` coordinates
    - the ordination's ``eigvals``
    - the ordination's ``proportion_explained``
    """
    ordination = OrdinationResults.read(lines)

    sample_labels = ordination.site_ids
    coordinates = ordination.site
    eigenvalues = ordination.eigvals
    pct_explained = ordination.proportion_explained
    return (sample_labels, coordinates, eigenvalues, pct_explained)
예제 #6
0
def parse_coords(lines):
    """Parse an skbio ordination results file into its four components.

    Parsing is delegated to
    ``skbio.stats.ordination.OrdinationResults.read``, which also defines
    the accepted file format.

    Returns, in this order:
    - ``site_ids`` of the parsed ordination
    - ``site`` (the coordinates) of the parsed ordination
    - ``eigvals`` of the parsed ordination
    - ``proportion_explained`` of the parsed ordination
    """
    results = OrdinationResults.read(lines)
    unpacked = (
        results.site_ids,
        results.site,
        results.eigvals,
        results.proportion_explained,
    )
    return unpacked
예제 #7
0
if __name__ == '__main__':
    # Parse the command line according to the module-level script_info.
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    # Categories arrive as a single comma-separated option value.
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    # NOTE(review): the 'U' open mode was removed in Python 3.11; confirm
    # the target interpreter before porting this script.
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.read(f)

    # Parse the mapping file
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    # Every requested category must be a column of the mapping file.
    for category in categories:
        if category not in metamap.keys():
            # Report the single offending category, not the whole list.
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % category)

    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            # For 'SampleID' the sort category stays None.
            sort_category = None
def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two ordination result files.

    Parameters
    ----------
    coords_f1, coords_f2
        Inputs accepted by ``OrdinationResults.read``; the two ordinations
        to compare.
    sample_id_map : optional
        If truthy, passed to ``map_sample_ids`` to translate the sample ids
        of both ordinations before they are matched.
    randomize : callable, optional
        If given, applied to the (reordered) second coordinate matrix before
        the analysis; the shuffled matrix is also returned.
    max_dimensions : int, optional
        If truthy, coordinates, eigenvalues and proportions explained are
        truncated to this many dimensions. Otherwise the shorter
        eigenvalue / proportion vectors are zero-padded to match the longer.
    get_eigenvalues, get_percent_variation_explained : callable
        Combine the two eigenvalue vectors / proportion-explained vectors
        into the single vectors stored on the returned results.

    Returns
    -------
    tuple
        ``(transformed_coords1, transformed_coords2, m_squared,
        randomized_coords2)`` where the first two are ``OrdinationResults``
        built from the output of ``procrustes``, ``m_squared`` is the third
        value returned by ``procrustes``, and ``randomized_coords2`` is an
        ``OrdinationResults`` holding the shuffled second matrix, or
        ``None`` when ``randomize`` is not given.

    Raises
    ------
    ValueError
        If the two ordinations share no sample ids.
    """
    # Parse the PCoA files
    ord_res_1 = OrdinationResults.read(coords_f1)
    ord_res_2 = OrdinationResults.read(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Only samples present in both ordinations can be compared. Fail fast
    # when there are none, before any reordering is attempted.
    order = list(set(sample_ids1) & set(sample_ids2))
    if not order:
        raise ValueError('No overlapping samples in the two files')

    # Put the rows of both coordinate matrices in the same shared order.
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # Zero-pad the shorter vectors so both ordinations describe the same
        # number of axes.
        if len(pct_var1) > len(pct_var2):
            pct_var2 = append(pct_var2, zeros(len(pct_var1) - len(pct_var2)))
            eigvals2 = append(eigvals2, zeros(len(eigvals1) - len(eigvals2)))
        elif len(pct_var1) < len(pct_var2):
            pct_var1 = append(pct_var1, zeros(len(pct_var2) - len(pct_var1)))
            eigvals1 = append(eigvals1, zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared =\
        procrustes(coords1, coords2)

    # Both returned ordinations carry the same combined eigenvalues and
    # proportions explained.
    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m1),
                                            site_ids=order)
    transformed_coords2 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m2),
                                            site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2,
            m_squared, randomized_coords2)
예제 #9
0
if __name__ == '__main__':
    # Parse the command line according to the module-level script_info.
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    # Categories arrive as a single comma-separated option value.
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    # NOTE(review): the 'U' open mode was removed in Python 3.11; confirm
    # the target interpreter before porting this script.
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.read(f)

    # Parse the mapping file
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    # Every requested category must be a column of the mapping file.
    for category in categories:
        if category not in metamap.keys():
            # Report the single offending category, not the whole list.
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % category)

    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            # For 'SampleID' the sort category stays None.
            sort_category = None
예제 #10
0
def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two ordination result files.

    Parameters
    ----------
    coords_f1, coords_f2
        Inputs accepted by ``OrdinationResults.read``; the two ordinations
        to compare.
    sample_id_map : optional
        If truthy, passed to ``map_sample_ids`` to translate the sample ids
        of both ordinations before they are matched.
    randomize : callable, optional
        If given, applied to the (reordered) second coordinate matrix before
        the analysis; the shuffled matrix is also returned.
    max_dimensions : int, optional
        If truthy, coordinates, eigenvalues and proportions explained are
        truncated to this many dimensions. Otherwise the shorter
        eigenvalue / proportion vectors are zero-padded to match the longer.
    get_eigenvalues, get_percent_variation_explained : callable
        Combine the two eigenvalue vectors / proportion-explained vectors
        into the single vectors stored on the returned results.

    Returns
    -------
    tuple
        ``(transformed_coords1, transformed_coords2, m_squared,
        randomized_coords2)`` where the first two are ``OrdinationResults``
        built from the output of ``procrustes``, ``m_squared`` is the third
        value returned by ``procrustes``, and ``randomized_coords2`` is an
        ``OrdinationResults`` holding the shuffled second matrix, or
        ``None`` when ``randomize`` is not given.

    Raises
    ------
    ValueError
        If the two ordinations share no sample ids.
    """
    # Parse the PCoA files
    ord_res_1 = OrdinationResults.read(coords_f1)
    ord_res_2 = OrdinationResults.read(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Only samples present in both ordinations can be compared. Fail fast
    # when there are none, before any reordering is attempted.
    order = list(set(sample_ids1) & set(sample_ids2))
    if not order:
        raise ValueError('No overlapping samples in the two files')

    # Put the rows of both coordinate matrices in the same shared order.
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # Zero-pad the shorter vectors so both ordinations describe the same
        # number of axes.
        if len(pct_var1) > len(pct_var2):
            pct_var2 = append(pct_var2, zeros(len(pct_var1) - len(pct_var2)))
            eigvals2 = append(eigvals2, zeros(len(eigvals1) - len(eigvals2)))
        elif len(pct_var1) < len(pct_var2):
            pct_var1 = append(pct_var1, zeros(len(pct_var2) - len(pct_var1)))
            eigvals1 = append(eigvals1, zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared =\
        procrustes(coords1, coords2)

    # Both returned ordinations carry the same combined eigenvalues and
    # proportions explained.
    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m1),
                                            site_ids=order)
    transformed_coords2 = OrdinationResults(eigvals=asarray(eigvals),
                                            proportion_explained=asarray(pct_var),
                                            site=asarray(transformed_coords_m2),
                                            site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2,
            m_squared, randomized_coords2)
예제 #11
0
def load_mp_data(use_artifact_api=True, is_empire=True):
    """Load the QIIME 2 moving pictures tutorial data for visualization.

    All files are looked up inside the directory named by the PREFIX_DIR
    global, so that directory must be reachable from wherever this function
    is called. If it (or a file inside it) cannot be accessed, loading will
    presumably fail.

    Parameters
    ----------
    use_artifact_api: bool, optional (default True)
        If True, the artifacts are loaded through the QIIME 2 Artifact API
        and the returned objects have the first type listed for each item
        below (before the | character).
        If False, the artifacts are extracted and read without QIIME 2's
        APIs, and the returned objects have the second listed type (after
        the | character).
    is_empire: bool, optional (default True)
        If True, an ordination is loaded and returned.
        If False, None is returned in place of the ordination.

    Returns
    -------
    (tree, table, md, fmd, pcoa)
        tree: qiime2.Artifact | skbio.tree.TreeNode
            Phylogenetic tree.
        table: qiime2.Artifact | biom.Table
            Feature table.
        md: qiime2.Metadata | pandas.DataFrame
            Sample metadata.
        fmd: qiime2.Metadata | pandas.DataFrame
            Feature metadata. (With use_artifact_api=True the taxonomy
            artifact is viewed as Metadata.)
        pcoa: qiime2.Artifact | skbio.OrdinationResults | None
            Ordination, or None when is_empire is False.
    """
    def _data_path(filename):
        # Resolve a data file name against the shared data directory.
        return os.path.join(PREFIX_DIR, filename)

    q2_tree_loc = _data_path("rooted-tree.qza")
    q2_table_loc = _data_path("table.qza")
    q2_pcoa_loc = _data_path("unweighted_unifrac_pcoa_results.qza")
    q2_tax_loc = _data_path("taxonomy.qza")
    md_loc = _data_path("sample_metadata.tsv")

    if use_artifact_api:
        from qiime2 import Artifact, Metadata

        tree = Artifact.load(q2_tree_loc)
        table = Artifact.load(q2_table_loc)
        if is_empire:
            pcoa = Artifact.load(q2_pcoa_loc)
        else:
            pcoa = None
        md = Metadata.load(md_loc)
        # The taxonomy QZA has to be converted to Metadata explicitly.
        fmd = Artifact.load(q2_tax_loc).view(Metadata)
        return tree, table, md, fmd, pcoa

    # Non-QIIME 2 path: extract each artifact's data file into a scratch
    # directory and read it with the corresponding library.
    import biom
    import pandas as pd
    from skbio.stats.ordination import OrdinationResults
    from skbio.tree import TreeNode

    with tempfile.TemporaryDirectory() as scratch_dir:
        tree = TreeNode.read(
            extract_q2_artifact_to_path(scratch_dir, q2_tree_loc, "tree.nwk")
        )
        table = biom.load_table(
            extract_q2_artifact_to_path(scratch_dir, q2_table_loc,
                                        "feature-table.biom")
        )
        pcoa = None
        if is_empire:
            pcoa = OrdinationResults.read(
                extract_q2_artifact_to_path(scratch_dir, q2_pcoa_loc,
                                            "ordination.txt")
            )
        fmd = pd.read_csv(
            extract_q2_artifact_to_path(scratch_dir, q2_tax_loc,
                                        "taxonomy.tsv"),
            sep="\t",
            index_col=0,
        )
        # skiprows=[1] drops the second line of the sample metadata file.
        md = pd.read_csv(md_loc, sep="\t", index_col=0, skiprows=[1])
    return tree, table, md, fmd, pcoa
예제 #12
0
        # Store the table loaded just before this point under 'Bray_Curtis'.
        distances[dataset_][(fold_, Nsamp_)]['Bray_Curtis'] = table_
        # Load the Robust Aitchison distance table for the same subset,
        # using the first column as the index.
        table_ = pd.read_table(os.path.join(subpath_, sub_set,
                                            'Robust_Aitchison_Distance.tsv'),
                               index_col=0,
                               low_memory=False)
        # Cast row and column labels to str, then reindex both axes to
        # index_me.
        table_.index = table_.index.astype(str)
        table_.columns = table_.columns.astype(str)
        table_ = table_.reindex(index=index_me, columns=index_me)
        distances[dataset_][(fold_, Nsamp_)]['Robust_Aitchison'] = table_

        # ordination type file
        in_ord = os.path.join(subpath_, sub_set, 'RPCA_Ordination.txt')
        # get loadings from ordination files
        # NOTE(review): the same ordination file is read twice below, once
        # for its samples and once for its features.
        ordinations[dataset_][(
            fold_,
            Nsamp_)]['RPCA_Samples'] = OrdinationResults.read(in_ord).samples
        ordinations[dataset_][(
            fold_,
            Nsamp_)]['RPCA_Features'] = OrdinationResults.read(in_ord).features

# permanova analysis
from skbio import DistanceMatrix
from skbio.stats.distance import permanova

both_perm_res = {}
perm_res = {}
perm_res_tmp = {}
for dataset_, subs in distances.items():
    perm_res[dataset_] = {}
    perm_res_tmp[dataset_] = {}
    for (fold_, Nsamp_), methods_ in subs.items():
    def get_pair_cmds(self, omics_pairs):
        """Assemble the command string for each entry of ``self.mmvec_res``.

        For every entry the method reads the ordination file, collects the
        output of ``get_pc_sb_correlations``, and concatenates biplot
        commands (one per highlight with edits, then an uncrowded and a
        crowded biplot), the xmmvec commands and the paired-heatmaps
        command. The resulting string is appended to ``self.cmds[pair]``.

        Returns the list of per-entry values obtained from
        ``get_pc_sb_correlations``.
        """
        pc_sb_correlations = []
        for key, paths in self.mmvec_res.items():
            pair, case, omic1, omic2, filt1, filt2, sams, mmvec = key
            ranks_fp, ordi_fp, meta_fp, omic1_common, omic2_common = paths

            # Replace the omics/filters with the ordered versions.
            ordered = get_order_omics(omic1, omic2, filt1, filt2, case,
                                      omics_pairs)
            omic1, omic2 = ordered[0], ordered[1]
            filt1, filt2 = ordered[2], ordered[3]
            omic_feature, omic_sample = ordered[4], ordered[5]
            omic_microbe, omic_metabolite = ordered[6], ordered[7]

            # get differentials
            forward_key = (pair, case, omic1, filt1, omic2, filt2)
            reverse_key = (pair, case, omic2, filt2, omic1, filt1)
            meta1, meta_pd1, diff_cols1 = self.metas[forward_key]
            meta2, meta_pd2, diff_cols2 = self.metas[reverse_key]

            # features are biplot, samples are dots
            ordi = OrdinationResults.read(ordi_fp)
            correlations, max_r = get_pc_sb_correlations(
                pair, case, ordi, omic1, omic2, filt1, filt2, diff_cols1,
                meta_pd1, diff_cols2, meta_pd2, meta_fp, omic1_common,
                omic2_common, ranks_fp)
            pc_sb_correlations.append(correlations)

            cmd = ''
            # Biplots for the highlighted feature subsets, if any are
            # configured for this pair.
            if pair in self.highlights:
                for highlight, regexes_list in self.highlights[pair].items():
                    n_edit, meta_edit, ordi_edit_fp = edit_ordi_qzv(
                        ordi, ordi_fp, highlight, regexes_list, meta1,
                        meta_pd1)
                    if not n_edit:
                        continue
                    qza, qzv = get_qzs(ordi_edit_fp)
                    cmd += get_biplot_commands(ordi_edit_fp, qza, qzv,
                                               omic_feature, omic_sample,
                                               meta_edit, meta2, n_edit,
                                               max_r)

            # Biplots for the unedited ordination: first with 15 features,
            # then "crowded" with every feature of the ordination.
            qza, qzv = get_qzs(ordi_fp)
            for crowded in (0, 1):
                if not crowded:
                    n_ordi_feats = 15
                else:
                    n_ordi_feats = ordi.features.shape[0]
                    qzv = qzv.replace('.qzv', '_crowded.qzv')
                cmd += get_biplot_commands(ordi_fp, qza, qzv,
                                           omic_feature, omic_sample, meta1,
                                           meta2, n_ordi_feats, max_r)
            cmd += get_xmmvec_commands(ordi_fp, omic1, omic2, meta1,
                                       meta2, self.xmmvecs, pair)

            # Paired heatmaps: features_names is always empty here, so the
            # "top N" output name is the one that is used.
            topn = 5
            features_names = []
            ranks_root = splitext(ranks_fp)[0]
            heat = ('%s_paired_heatmaps_custom.qzv' % ranks_root
                    if features_names else
                    '%s_paired_heatmaps_top%s.qzv' % (ranks_root, topn))
            cmd += get_paired_heatmaps_command(ranks_fp, omic1_common,
                                               omic2_common, meta1,
                                               features_names, topn, heat)
            self.cmds.setdefault(pair, []).append(cmd)
        return pc_sb_correlations