Exemplo n.º 1
0
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           sample_metadata: qiime2.Metadata,
                           where: str = None) -> skbio.DistanceMatrix:
    ids_to_keep = sample_metadata.ids(where=where)
    # NOTE: there is no guaranteed ordering to output distance matrix because
    # `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its iteration
    # order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")
Exemplo n.º 2
0
def filter_seqs(data: pd.Series,
                metadata: qiime2.Metadata,
                where: str = None,
                exclude_ids: bool = False) -> pd.Series:
    # Note, no need to check for missing feature IDs in the metadata, because
    # that is basically the point of this method.
    ids_to_keep = metadata.ids(where=where)
    if exclude_ids is True:
        ids_to_keep = set(data.index) - set(ids_to_keep)
    filtered = data[data.index.isin(ids_to_keep)]
    if filtered.empty is True:
        raise ValueError('All features were filtered out of the data.')
    return filtered
Exemplo n.º 3
0
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str=None,
                           exclude_ids: bool=False) -> skbio.DistanceMatrix:
    ids_to_keep = metadata.ids(where=where)
    if exclude_ids:
        ids_to_keep = set(distance_matrix.ids) - set(ids_to_keep)
    # NOTE: there is no guaranteed ordering to output distance matrix because
    # `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its iteration
    # order.
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")
Exemplo n.º 4
0
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode=None, metrics: set=None,
                      metadata: qiime2.Metadata=None, min_depth: int=1,
                      steps: int=10, iterations: int=10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))