def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           sample_metadata: qiime2.Metadata,
                           where: str = None) -> skbio.DistanceMatrix:
    """Restrict a distance matrix to the samples selected by the metadata.

    Parameters
    ----------
    distance_matrix
        Matrix to filter. Its ``ids`` attribute defines the order of the
        samples in the result.
    sample_metadata
        Metadata whose ``ids(where=...)`` result selects the samples to keep.
        IDs that are not present in the matrix are ignored.
    where
        Optional selection expression forwarded unchanged to
        ``sample_metadata.ids``.

    Returns
    -------
    skbio.DistanceMatrix
        The filtered matrix, with retained samples in the same relative order
        as in ``distance_matrix``.

    Raises
    ------
    ValueError
        If the filter raises ``DissimilarityMatrixError``, which is reported
        as every sample having been filtered out.
    """
    selected_ids = set(sample_metadata.ids(where=where))
    # `DistanceMatrix.filter` lays out its result in the iteration order of
    # the IDs it is given. Handing it a set made the output order arbitrary
    # (and different from run to run), so build the list from the matrix's
    # own IDs to get a deterministic result.
    ids_to_keep = [id_ for id_ in distance_matrix.ids if id_ in selected_ids]
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError as err:
        raise ValueError(
            "All samples were filtered out of the distance matrix.") from err
def filter_seqs(data: pd.Series, metadata: qiime2.Metadata,
                where: str = None, exclude_ids: bool = False) -> pd.Series:
    """Filter a series of features by the IDs selected from the metadata.

    Parameters
    ----------
    data
        Series indexed by feature ID.
    metadata
        Metadata whose ``ids(where=...)`` result selects feature IDs.
    where
        Optional selection expression forwarded unchanged to ``metadata.ids``.
    exclude_ids
        When truthy, drop the selected IDs instead of keeping them.

    Returns
    -------
    pd.Series
        The entries of ``data`` that survive the filter, in their original
        order.

    Raises
    ------
    ValueError
        If no entries remain after filtering.
    """
    # Note, no need to check for missing feature IDs in the metadata, because
    # that is basically the point of this method.
    selected = data.index.isin(metadata.ids(where=where))
    # Plain truthiness (rather than `is True`) so that flags such as
    # `numpy.bool_` or `1` are honoured, matching `filter_distance_matrix`.
    if exclude_ids:
        selected = ~selected
    filtered = data[selected]
    if filtered.empty:
        raise ValueError('All features were filtered out of the data.')
    return filtered
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str = None,
                           exclude_ids: bool = False) -> skbio.DistanceMatrix:
    """Filter the samples of a distance matrix by metadata-selected IDs.

    Parameters
    ----------
    distance_matrix
        Matrix to filter. Its ``ids`` attribute defines the order of the
        samples in the result.
    metadata
        Metadata whose ``ids(where=...)`` result selects sample IDs. IDs that
        are not present in the matrix are ignored.
    where
        Optional selection expression forwarded unchanged to ``metadata.ids``.
    exclude_ids
        When truthy, drop the selected samples instead of keeping them.

    Returns
    -------
    skbio.DistanceMatrix
        The filtered matrix, with retained samples in the same relative order
        as in ``distance_matrix``.

    Raises
    ------
    ValueError
        If the filter raises ``DissimilarityMatrixError``, which is reported
        as every sample having been filtered out.
    """
    selected_ids = set(metadata.ids(where=where))
    # `DistanceMatrix.filter` lays out its result in the iteration order of
    # the IDs it is given. Handing it a set made the output order arbitrary
    # (and different from run to run), so build the list from the matrix's
    # own IDs to get a deterministic result.
    if exclude_ids:
        ids_to_keep = [id_ for id_ in distance_matrix.ids
                       if id_ not in selected_ids]
    else:
        ids_to_keep = [id_ for id_ in distance_matrix.ids
                       if id_ in selected_ids]
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError as err:
        raise ValueError(
            "All samples were filtered out of the distance matrix.") from err
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write an alpha rarefaction visualization into `output_dir`.

    For every metric a ``<metric>.csv`` file is written, plus one JSONP file
    per metric (no metadata) or per metric/metadata-column pair (with
    metadata). Finally the ``index.html`` template is rendered and the
    ``dist`` asset directory is copied next to it.

    Parameters
    ----------
    output_dir
        Directory that receives all generated files.
    table
        Feature table to rarefy.
    max_depth, min_depth
        Bounds of the rarefaction depths; `max_depth` must exceed `min_depth`
        and must not exceed the largest per-sample total of `table`.
    phylogeny
        Tree needed by phylogenetic metrics.
    metrics
        Metrics to compute. Defaults to ``observed_otus`` and ``shannon``,
        plus ``faith_pd`` when a phylogeny is given. A caller-supplied set is
        never modified.
    metadata
        Optional sample metadata; must cover every sample of `table`.
    steps, iterations
        Forwarded to ``_compute_rarefaction_data``; `steps` may not exceed
        ``max_depth - min_depth``.

    Raises
    ------
    ValueError
        If `metrics` is an empty set, a phylogenetic metric is requested
        without a phylogeny, the depth/step arguments are inconsistent, the
        table is empty, or the metadata is missing samples of the table.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    rarefaction_data = _compute_rarefaction_data(table, min_depth, max_depth,
                                                 steps, iterations, phylogeny,
                                                 metrics)
    # The metadata frame does not depend on the metric, so build it once
    # instead of twice per loop iteration.
    full_metadata_df = None
    if metadata is not None:
        full_metadata_df = metadata.to_dataframe()

    for metric, metric_df in rarefaction_data.items():
        metric_name = quote(metric)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(metric_df, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = full_metadata_df.loc[metric_df.index]

            # Columns with no values for the rarefied samples are left out
            # of the plots and reported separately to the template.
            all_columns = metadata_df.columns
            metadata_df = metadata_df.dropna(axis='columns', how='all')
            empty_columns = set(all_columns) - set(metadata_df.columns)

            # Two-level column labels so the metadata can be joined onto the
            # tuple-labelled columns of the rarefaction frame.
            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = metric_df.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        # Flatten the column labels for the CSV; this has to happen after
        # the summaries above, which were computed from the unflattened frame.
        metric_df.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                             for t in metric_df.columns.values]
        csv_df = metric_df
        if metadata is not None:
            csv_df = metric_df.join(full_metadata_df, how='left')
        with open(os.path.join(output_dir, filename), 'w') as fh:
            csv_df.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))