def aldex2(table: pd.DataFrame, metadata: qiime2.Metadata, condition: str,
           mc_samples: int = 128, test: str = 't',
           denom: str = 'all') -> pd.DataFrame:
    """Run ALDEx2 differential abundance analysis via an external R script.

    Parameters
    ----------
    table : pd.DataFrame
        Feature table; written to disk as TSV for the R script.
    metadata : qiime2.Metadata
        Sample metadata; written to disk as a mapping file.
    condition : str
        Metadata column passed to ALDEx2 as the grouping condition.
    mc_samples : int
        Number of Monte Carlo samples (ALDEx2 option).
    test : str
        Statistical test identifier forwarded to the R script.
    denom : str
        Denominator choice forwarded to the R script.

    Returns
    -------
    pd.DataFrame
        Only the 'effect' column of the ALDEx2 summary.

    Raises
    ------
    Exception
        If the R process exits with a non-zero return code.
    """
    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.tsv.biom')
        map_fp = os.path.join(temp_dir_name, 'input.map.txt')
        summary_fp = os.path.join(temp_dir_name, 'output.summary.txt')

        # Serialize inputs so the R script can read them from disk.
        table.to_csv(biom_fp, sep='\t')
        metadata.to_dataframe().to_csv(map_fp, sep='\t')

        cmd = [
            'run_aldex2.R', biom_fp, map_fp, condition, mc_samples, test,
            denom, summary_fp
        ]
        # Every argument must be a string before being passed to subprocess.
        cmd = list(map(str, cmd))

        try:
            run_commands([cmd])
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running ALDEx2"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        summary = pd.read_csv(summary_fp, index_col=0)
        # don't return summary for now (TODO!)
        differentials = summary[['effect']]
    return differentials
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str=None,
                  feature_metadata: qiime2.Metadata=None):
    """Render an Emperor visualization (optionally Procrustes) to output_dir.

    Writes ``emperor.html`` plus Emperor's support files, then renders the
    plugin's index template with ``plot_name`` in the context.
    """
    mf = metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    # Supplying a second ordination turns this into a Procrustes plot.
    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, feature_mapping_file=feature_metadata,
                  procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run skbio's bioenv on a distance matrix + numeric metadata columns.

    Metadata is progressively filtered (non-numeric columns, samples with
    missing values, zero-variance columns) before bioenv is run; the names
    of dropped columns and the before/after sample counts are reported in
    the rendered visualization.
    """
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    df = metadata.to_dataframe()
    df = df.dropna()
    metadata = qiime2.Metadata(df)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)
    df = metadata.to_dataframe()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
def barplot(output_dir: str, table: pd.DataFrame, taxonomy: pd.Series,
            metadata: Metadata) -> None:
    """Write per-taxonomic-level JSONP data files and render the barplot.

    For each collapse level a ``lvl-<N>.jsonp`` file is emitted containing a
    ``load_data`` call with the taxa column names, all column names (taxa +
    metadata), and the joined records.
    """
    metadata = metadata.to_dataframe()
    filenames = []
    collapsed_tables = _extract_to_level(taxonomy, table)

    for level, df in enumerate(collapsed_tables, 1):
        # Join collapsed table with metadata
        taxa_cols = df.columns.values.tolist()
        df = df.join(metadata, how='left')
        df = df.reset_index(drop=False)  # Move SampleID index into columns
        df = df.fillna('')  # JS sort works best with empty strings vs null
        all_cols = df.columns.values.tolist()

        filename = 'lvl-%d.jsonp' % level
        filenames.append(filename)
        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('Level %d'," % level)
            json.dump(taxa_cols, fh)
            fh.write(",")
            json.dump(all_cols, fh)
            fh.write(",")
            df.to_json(fh, orient='records')
            fh.write(");")

    # Now that the tables have been collapsed, write out the index template
    index = os.path.join(TEMPLATES, 'barplot', 'index.html')
    q2templates.render(index, output_dir, context={'filenames': filenames})

    # Copy assets for rendering figure
    # NOTE(review): the source directory here is 'dst' while the destination
    # (and other barplot variants in this codebase) use 'dist' — confirm the
    # template asset directory is really named 'dst'.
    shutil.copytree(os.path.join(TEMPLATES, 'barplot', 'dst'),
                    os.path.join(output_dir, 'dist'))
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str=None):
    """Render an Emperor visualization for ``master`` into ``output_dir``.

    When ``other_pcoa`` is provided the plot becomes a Procrustes plot with
    'reference'/'other' labels. ``plot_name`` is passed through to the index
    template.
    """
    sample_df = metadata.to_dataframe()

    # A second ordination makes this a Procrustes plot.
    procrustes_refs = None if other_pcoa is None else [other_pcoa]

    emp = Emperor(master, sample_df, procrustes=procrustes_refs, remote='.')
    if custom_axes is not None:
        emp.custom_axes = custom_axes
    if other_pcoa:
        emp.procrustes_names = ['reference', 'other']

    page = emp.make_emperor(standalone=True)
    emp.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as out:
        out.write(page)

    q2templates.render(os.path.join(TEMPLATES, 'index.html'), output_dir,
                       context={'plot_name': plot_name})
def filter_results(
    results: GrowthResults,
    metadata: Metadata,
    query: str = None,
    exclude: bool = False,
) -> GrowthResults:
    """Filter samples from the simulation results."""
    rates = results.growth_rates
    exchanges = results.exchanges
    md = metadata.to_dataframe()
    if query is not None:
        md = md.query(query)

    # Keep the sample ids that match the (possibly queried) metadata,
    # or the complement when exclusion is requested.
    in_metadata = rates.sample_id.isin(md.index)
    kept_sids = rates.sample_id[~in_metadata] if exclude \
        else rates.sample_id[in_metadata]

    if len(kept_sids) == 0:
        raise ValueError("There are no samples left after filtering :O")

    return GrowthResults(
        growth_rates=rates[rates.sample_id.isin(kept_sids)],
        exchanges=exchanges[exchanges.sample_id.isin(kept_sids)],
        annotations=results.annotations,
    )
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run skbio's bioenv and render an HTML report of the results.

    Non-numeric metadata columns, samples with NaNs, and zero-variance
    columns are dropped before analysis; dropped column names and sample
    counts before/after distance-matrix filtering are shown to the user.
    """
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
def tabulate(output_dir: str, input: qiime2.Metadata,
             page_size: int = 100) -> None:
    """Render a paginated HTML table of the given metadata.

    Raises ValueError when fewer than one record per page is requested.
    """
    if page_size < 1:
        raise ValueError('Cannot render less than one record per page.')

    df = input.to_dataframe()
    df.reset_index(inplace=True)

    # JSON spec doesn't allow single quotes in string values, at all. It does
    # however allow unicode values.
    table = df.to_json(orient='split').replace("'", r'\u0027')

    q2templates.render(os.path.join(TEMPLATES, 'tabulate', 'index.html'),
                       output_dir,
                       context={'table': table, 'page_size': page_size})

    # Ship the DataTables assets alongside the rendered page.
    for kind in ('js', 'css'):
        asset = 'datatables.min.%s' % kind
        os.mkdir(os.path.join(output_dir, kind))
        shutil.copy(os.path.join(TEMPLATES, 'tabulate', asset),
                    os.path.join(output_dir, kind, asset))
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run skbio's bioenv and render an HTML report of the results.

    This is the same filtering pipeline as the other bioenv variant in this
    file: coerce columns to numeric where possible, drop categorical
    columns, samples with NaNs, and zero-variance columns, then run bioenv
    on the filtered distance matrix.
    """
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
def first_differences(metadata: qiime2.Metadata, state_column: str,
                      individual_id_column: str, metric: str,
                      replicate_handling: str = 'error',
                      baseline: float = None,
                      table: pd.DataFrame = None) -> pd.Series:
    """Compute first differences of a metric across states per individual.

    If ``table`` is given the metric is derived from the feature table and
    merged into the metadata; otherwise the metric must already be a numeric
    metadata column. Validation of the relevant columns happens before the
    computation is delegated to ``_first_differences``.
    """
    # find metric in metadata or derive from table and merge into metadata
    if table is not None:
        _validate_metadata_is_superset(metadata.to_dataframe(), table)
        metadata = _add_metric_to_metadata(table, metadata, metric)
    else:
        metadata = _load_metadata(metadata)
        _validate_is_numeric_column(metadata, metric)

    # validate columns
    _validate_input_columns(metadata, individual_id_column, None,
                            state_column, metric)

    return _first_differences(metadata, state_column, individual_id_column,
                              metric, replicate_handling, baseline=baseline,
                              distance_matrix=None)
def tabulate(output_dir: str, input: qiime2.Metadata,
             page_size: int = 100) -> None:
    """Render a paginated HTML table of the metadata and save a TSV copy.

    Column headers carry a second level holding each column's QIIME 2 type
    (categorical/numeric) so the UI can display it. The raw metadata is also
    saved as ``metadata.tsv`` for download.
    """
    if page_size < 1:
        raise ValueError('Cannot render less than one record per page.')

    df = input.to_dataframe()
    # Two-level column header: (column name, metadata column type).
    df_columns = pd.MultiIndex.from_tuples(
        [(n, t.type) for n, t in input.columns.items()],
        names=['column header', 'type'])
    df.columns = df_columns
    df.reset_index(inplace=True)

    table = df.to_json(orient='split')

    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index, output_dir, context={
        'table': table,
        'page_size': page_size
    })

    # Provide the original metadata as a downloadable TSV.
    input.save(os.path.join(output_dir, 'metadata.tsv'))

    js = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'datatables.min.js'))

    css = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.css')
    os.mkdir(os.path.join(output_dir, 'css'))
    shutil.copy(css, os.path.join(output_dir, 'css', 'datatables.min.css'))
def quadtree(metadata: qiime2.Metadata, y_coord: str, x_coord: str,
             threshold: int) -> (skbio.TreeNode, pd.DataFrame):
    """Build a quadtree over sample coordinates found in the metadata.

    The metadata is cleaned with the given coordinate columns, then the
    tree and per-sample assignments are computed at the given threshold.
    """
    md_df = metadata.to_dataframe()
    id_name = md_df.index.name
    cleaned = clean(md_df, y_coord, x_coord)
    # get_results already yields the (tree, samples) pair.
    return get_results(cleaned, threshold, id_name)
def normalize(metadata: q2.Metadata, rules_dir: q2.plugin.Str) -> q2.Metadata: """ Parameters ---------- metadata : q2.Metadata The sample metadata. rules_dir : q2.plugin.Str The path to the yaml rules folder. Returns ------- metadata_curated : q2.Metadata Curated metadata table. """ # TEMPORARY FUNCTION TO PASS THE DEFAULT FOLDER CONTAINING OUR 8 RULES # (A REAL USER SHOULD PASS ANOTHER FOLDER LOCATION TO '--p-rules-dir') variables_rules_dir = get_variables_rules_dir(rules_dir, RULES) # Collect rules from yaml files folder by instantiating a class rules = RulesCollection(variables_rules_dir) # Get metadata as pandas data frame md = metadata.to_dataframe() # get metadata variables that have rules focus = get_intersection(rules.get_variables_names(), md.columns.tolist()) # apply rules one variable at a time # for variable in focus: # md[variable] = rules.normalize(variable, md[variable]) # only during dev so that the function return something :) md_out = pd.DataFrame() return q2.Metadata(md_out)
def filter_models(
    models: CommunityModelDirectory,
    metadata: Metadata,
    query: str = None,
    exclude: bool = False,
) -> CommunityModelDirectory:
    """Filter samples from a set of community models."""
    md = metadata.to_dataframe()
    if query is not None:
        md = md.query(query)

    manifest = models.manifest.view(pd.DataFrame)
    keep = manifest.sample_id.isin(md.index)
    if exclude:
        keep = ~keep
    kept_manifest = manifest[keep]

    if kept_manifest.shape[0] == 0:
        raise ValueError("There are no samples left after filtering :O")

    out = CommunityModelDirectory()
    kept_manifest.to_csv(out.manifest.path_maker())
    # Copy each surviving model file into the new directory format.
    for sid in kept_manifest.sample_id:
        shutil.copy(
            models.model_files.path_maker(model_id=sid),
            out.model_files.path_maker(model_id=sid),
        )
    return out
def community_plot(output_dir: str, tree: NewickFormat,
                   feature_table: biom.Table,
                   sample_metadata: qiime2.Metadata,
                   pcoa: OrdinationResults = None,
                   feature_metadata: qiime2.Metadata = None,
                   ignore_missing_samples: bool = False,
                   filter_extra_samples: bool = False,
                   filter_missing_features: bool = False,
                   number_of_features: int = 5,
                   shear_tree: bool = True) -> None:
    """Visualizes a tree alongside community-level data.

    The functionality available in this visualization is a superset of the
    functionality in tree_plot() -- including sample metadata coloring /
    barplots, animations, and Emperor integration support.
    """
    if pcoa is not None and pcoa.features is not None:
        # select the top N most important features based on the vector's
        # magnitude (copied from q2-emperor)
        feats = pcoa.features.copy()
        # in cases where the axes are all zero there might be all-NA
        # columns
        feats.fillna(0, inplace=True)
        origin = np.zeros_like(feats.columns)
        feats['importance'] = feats.apply(euclidean, axis=1, args=(origin, ))
        feats.sort_values('importance', inplace=True, ascending=False)
        feats.drop(['importance'], inplace=True, axis=1)
        pcoa.features = feats[:number_of_features].copy()

    sample_metadata = sample_metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    t = get_bp(tree)
    viz = Empress(tree=t, table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata, ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_extra_samples=filter_extra_samples,
                  filter_missing_features=filter_missing_features,
                  shear_tree=shear_tree)
    save_viz(viz, output_dir)
def plot(output_dir: str, tree: NewickFormat, feature_table: pd.DataFrame,
         sample_metadata: qiime2.Metadata, pcoa: OrdinationResults = None,
         feature_metadata: qiime2.Metadata = None,
         ignore_missing_samples: bool = False,
         filter_missing_features: bool = False, number_of_features: int = 5,
         filter_unobserved_features_from_phylogeny: bool = True) -> None:
    """Render an Empress tree visualization into ``output_dir``.

    When an ordination with biplot features is supplied, only the
    ``number_of_features`` features with the largest vector magnitude are
    kept for display.
    """
    if pcoa is not None and pcoa.features is not None:
        # select the top N most important features based on the vector's
        # magnitude (copied from q2-emperor)
        feats = pcoa.features.copy()
        origin = np.zeros_like(feats.columns)
        feats['importance'] = feats.apply(euclidean, axis=1, args=(origin, ))
        feats.sort_values('importance', inplace=True, ascending=False)
        feats.drop(['importance'], inplace=True, axis=1)
        pcoa.features = feats[:number_of_features].copy()

    sample_metadata = sample_metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    # path to the actual newick file
    with open(str(tree)) as file:
        t = parse_newick(file.readline())
    trim_tree = filter_unobserved_features_from_phylogeny
    viz = Empress(tree=t, table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata, ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_missing_features=filter_missing_features,
                  filter_unobserved_features_from_phylogeny=trim_tree)
    with open(os.path.join(output_dir, 'empress.html'), 'w') as file:
        file.write(str(viz))

    viz.copy_support_files(output_dir)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
def parse_meta(
        query_results: MetaStormsSearchResultsDirFmt,
        metadata: Metadata,
        number_predicted: str = _default_params['number_predicted'],
        base_of_similarity: str = _default_params['base_of_similarity'],
        max_number_matches: str = _default_params['max_number_matches'],
        number_of_skipped: str = _default_params['number_of_skipped']) -> str:
    """Run the MetaStorms meta parser on previous search results.

    Writes the metadata to a temp TSV, builds the external parse-meta
    command, runs it, and returns the path of the produced result file.

    Note: the directory comes from ``tempfile.mkdtemp`` and is deliberately
    not removed here, because the returned result file lives inside it.
    """
    tmpdir = tempfile.mkdtemp()
    qr_path = os.path.join(str(query_results), 'query.out')
    result_fname = os.path.join(tmpdir, 'query.out.meta')
    md_fname = os.path.join(tmpdir, 'metadata.tsv')

    # The external tool reads metadata from a TSV file on disk.
    metadata.to_dataframe().to_csv(md_fname, sep='\t', index=True,
                                   header=True)

    run_command(
        _build_parse_meta_command(qr_path, md_fname, result_fname,
                                  number_predicted, base_of_similarity,
                                  max_number_matches, number_of_skipped))
    return result_fname
def label_seqs(seqs: pd.Series, delimiter: str,
               metadata: qiime2.Metadata = None, columns: str = None,
               missing_value: str = 'missing') \
        -> pd.Series:
    """Relabel sequence ids by joining the id with metadata column values.

    Each sequence id is first truncated at the first occurrence of
    ``delimiter``; the new label is the id plus the selected metadata column
    values joined by ``delimiter`` (missing values are replaced with
    ``missing_value``). ``metadata`` and ``columns`` must be supplied
    together or not at all.
    """
    if columns is not None and metadata is None \
            or metadata is not None and columns is None:
        raise ValueError('Columns and metadata must be passed or not passed '
                         'together.')

    # The delimiter must not appear in the placeholder, otherwise labels
    # could not be split back apart unambiguously.
    if delimiter in missing_value:
        raise ValueError(f'The provided delimiter ({repr(delimiter)}) cannot '
                         'be contained in the missing value placeholder '
                         f'({repr(missing_value)}).')

    # This is necessary because QIIME 2 will not accept an empty list as an
    # argument of type List[str]
    if columns is None:
        columns = []

    # Make sure we have strings at this point not skbio DNA objects because we
    # experienced a bizarre segmentation fault while using DNA objects
    seqs = seqs.apply(str)

    # Keep only the part of each id before the first delimiter.
    seqs.index = seqs.index.map(lambda x: x.split(delimiter)[0])

    if metadata is not None:
        md_df = metadata.to_dataframe()
        for column in columns:
            if column not in md_df.columns:
                raise ValueError(f'The column {repr(column)} is not present '
                                 'in the metadata')
        missing_ids = seqs.index.difference(md_df.index)
        if len(missing_ids):
            # Show at most 10 missing ids and summarize the rest.
            difference = \
                ' '.join(repr(value) for value in missing_ids.values[0:10])
            additional_missing = len(missing_ids.values[10:])
            error_message = ('The following ids are present in the sequences '
                             f'but not the metadata {difference}')
            if additional_missing > 0:
                error_message += (f' ({additional_missing} additional ids are'
                                  ' missing from metadata but omitted from'
                                  ' this list)')
            raise ValueError(error_message)
    else:
        md_df = pd.DataFrame({}, index=seqs.index)

    selected = md_df[columns]
    selected = selected.fillna(missing_value)
    # itertuples rows start with the index value, so each new label is the
    # id followed by the selected column values, delimiter-joined.
    rename = pd.Series([delimiter.join(row) for row in selected.itertuples()],
                       index=selected.index)
    seqs.index = seqs.index.map(rename)
    return seqs
def db(meta: Metadata, rank: str = "genus", threads: int = 1) -> JSONDirectory:
    """Create a model database from a set of SBML files."""
    manifest_df = meta.to_dataframe()
    out_dir = JSONDirectory()
    # Resolve the directory that will hold the JSON model files by asking
    # the format for a dummy member path and taking its parent.
    target = os.path.dirname(
        str(out_dir.json_files.path_maker(model_id="dummy")))
    manifest_df = build_database(manifest_df, target, rank, threads,
                                 compress=False)
    # Move the manifest into the location expected by the directory format.
    os.rename(os.path.join(target, "manifest.csv"),
              out_dir.manifest.path_maker())
    return out_dir
def generic_plot(output_dir: str, master: skbio.OrdinationResults,
                 metadata: qiime2.Metadata,
                 other_pcoa: skbio.OrdinationResults, plot_name: str,
                 info: str = None, custom_axes: str = None,
                 settings: dict = None,
                 ignore_missing_samples: bool = False,
                 feature_metadata: qiime2.Metadata = None):
    """Render an Emperor visualization (optionally Procrustes) to output_dir.

    ``info`` and ``settings`` are forwarded to the Emperor object; a second
    ordination in ``other_pcoa`` turns the plot into a Procrustes plot with
    'reference'/'other' names.
    """
    mf = metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, feature_mapping_file=feature_metadata,
                  ignore_missing_samples=ignore_missing_samples,
                  procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    viz.info = info
    viz.settings = settings

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
def pibble(table: pd.DataFrame, metadata: qiime2.Metadata, formula: str,
           learning_rate: float = 1e-3, beta1: float = 0.9,
           beta2: float = 0.99) -> (pd.DataFrame, pd.DataFrame):
    """Run the stray/fido pibble model via an external R script.

    The feature table and metadata are written to a temporary directory,
    ``run_pibble.R`` is invoked with the optimizer settings, and the
    resulting ALR differentials are centered into CLR-like coordinates.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The centered differentials (features x covariates) and the raw
        posterior table produced by the R script.

    Raises
    ------
    Exception
        If the R process exits with a non-zero return code.
    """
    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.tsv.biom')
        map_fp = os.path.join(temp_dir_name, 'input.map.txt')
        differential_fp = os.path.join(temp_dir_name,
                                       'output.differential.csv')
        posterior_fp = os.path.join(temp_dir_name, 'output.posterior.csv')

        # Serialize inputs for the R script.
        table.to_csv(biom_fp, sep='\t')
        metadata.to_dataframe().to_csv(map_fp, sep='\t')

        cmd = [
            'run_pibble.R', biom_fp, map_fp, formula, learning_rate,
            beta1, beta2, differential_fp, posterior_fp
        ]
        # subprocess arguments must all be strings.
        cmd = list(map(str, cmd))

        try:
            run_commands([cmd])
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running stray"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        lam_summary = pd.read_csv(differential_fp, index_col=0)
        # BUG FIX: the original referenced undefined names (`summary`,
        # `differentials`, `diff`) and misspelled `pd.DataFrame`, so this
        # section always raised NameError/AttributeError.
        alr_diffs = lam_summary[['covariate', 'coord', 'mean']]
        alr_diffs = alr_diffs.pivot(index='coord', columns='covariate',
                                    values='mean')
        # Append the implicit ALR reference coordinate (all zeros), then
        # center each column — converting ALR to CLR-like coordinates.
        diffs = np.vstack((alr_diffs.values, np.zeros(alr_diffs.shape[1])))
        diffs = diffs - diffs.mean(axis=0)
        differential = pd.DataFrame(diffs, columns=alr_diffs.columns,
                                    index=table.index)
        posterior = pd.read_csv(posterior_fp, index_col=0)
    return differential, posterior
def barplot(output_dir: str, proportions: pd.DataFrame,
            sample_metadata: Metadata,
            category_column: str = DEFAULT_CAT) -> None:
    """Render a source-contribution barplot by reusing the taxa barplot.

    Sample metadata is reshaped into sample/feature metadata views for the
    underlying ``_barplot`` call, then the generated JS bundle's UI strings
    are rewritten to use source-tracking terminology.
    """
    # scriptable metadata
    sample_metadata = sample_metadata.to_dataframe()

    # make the sample metadata
    # check if proportion index in metadata index
    if sum([i in sample_metadata.index for i in proportions.columns]) > 0:
        # then subset sample metadata by index
        mf_samples = sample_metadata.loc[proportions.columns, :]
        mf_samples.index.name = 'sampleid'
    else:
        # else subset sample metadata by category (in loo case)
        keep_ = sample_metadata[category_column].isin(proportions.columns)
        mf_samples = sample_metadata[keep_]
        mf_samples = mf_samples.set_index(category_column)
        mf_samples = mf_samples.loc[~mf_samples.index.duplicated(keep='first')]
        mf_samples[category_column] = list(mf_samples.index)
        mf_samples = mf_samples[mf_samples.columns[::-1]]
        mf_samples.index.name = 'sampleid'

    # make the feature metadata (mock taxonomy)
    keep_ = sample_metadata[category_column].isin(proportions.index)
    mf_feature = sample_metadata[keep_]
    mf_feature = mf_feature.set_index(category_column)
    mf_feature = mf_feature.loc[~mf_feature.index.duplicated(keep='first')]
    mf_feature.loc['Unknown', :] = 'Unknown'
    mf_feature[category_column] = list(mf_feature.index)
    mf_feature = mf_feature[mf_feature.columns[::-1]]
    mf_feature = mf_feature.astype(str).apply(lambda x: '; '.join(x), axis=1)
    mf_feature = pd.DataFrame(mf_feature, columns=['Taxon'])
    mf_feature.index.name = 'Feature ID'

    # make barplot
    _barplot(output_dir, proportions.T, pd.Series(mf_feature.Taxon),
             Metadata(mf_samples))

    # grab bundle location to fix
    bundle = os.path.join(output_dir, 'dist', 'bundle.js')
    # bundle terms to fix for our purpose
    bundle_rplc = {
        'Relative Frequency': 'Source Contribution',
        'Taxonomic Level': 'Source Grouping',
        'Sample': 'Sink'
    }
    # make small text change to the bundle
    with open(bundle) as f:
        newText = f.read()
    for prev, repl in bundle_rplc.items():
        newText = newText.replace(prev, repl)
    with open(bundle, "w") as f:
        f.write(newText)
def tree_plot(output_dir: str, tree: NewickFormat,
              feature_metadata: qiime2.Metadata = None) -> None:
    """Visualizes a tree (optionally with feature metadata)."""
    fm_df = None if feature_metadata is None \
        else feature_metadata.to_dataframe()
    viz = Empress(tree=get_bp(tree), feature_metadata=fm_df)
    save_viz(viz, output_dir)
def sample_random(ids: qiime2.Metadata, n: int, seed: int = None) \
        -> IDSelection:
    """Randomly mark ``n`` ids as included, without replacement.

    Raises ValueError if ``n`` exceeds the number of available ids.
    """
    if n > ids.id_count:
        raise ValueError("Value for n is larger than the number of IDs"
                         " present")

    id_df = ids.to_dataframe()
    chosen = id_df.sample(n, replace=False, random_state=seed)

    # Boolean inclusion mask over every id; True only for the sampled ones.
    mask = pd.Series(False, index=id_df.index)
    mask[chosen.index] = True

    return IDSelection(mask, ids, "sample_random")
def augment(table: biom.Table, sampling_depth: int, augment_times: int,
            output_path_metadata: str, raw_metadata: qiime2.Metadata,
            with_replacement: bool = False,
            rarefy_start: bool = True) -> biom.Table:
    """Augment a feature table by repeated subsampling.

    The table is subsampled ``augment_times`` times at ``sampling_depth``;
    each round's samples are renamed with a ``_<round>`` suffix and merged
    into the output table. The metadata is duplicated with matching suffixed
    ids and saved to ``output_path_metadata``.

    Returns
    -------
    biom.Table
        The merged (original-or-rarefied + augmented) table.

    Raises
    ------
    ValueError
        If the merged output table ends up empty.
    """
    metadata = raw_metadata.to_dataframe()
    metadata = metadata.sort_index()

    # Sort rows/columns so table and metadata line up deterministically.
    all_df = table.to_dataframe().sort_index().sort_index(axis=1)
    # change sorted table back to biom
    table = biom.Table(all_df.values, all_df.index.to_list(),
                       all_df.columns.to_list())

    # An all-zero copy of the table keeps every feature id present when
    # merging in subsampled tables (which may drop features).
    zero_df = all_df[all_df == 0].fillna(0)
    zero_table = biom.Table(zero_df.values, zero_df.index.to_list(),
                            zero_df.columns.to_list())

    sub_table = table.subsample(sampling_depth, axis='sample', by_id=False,
                                with_replacement=with_replacement)
    # FIX: idiomatic truthiness test instead of `== True`.
    if rarefy_start:
        output_table = zero_table.merge(sub_table)
    else:
        output_table = table
    output_metadata = metadata

    # FIX: iterate the suffix number directly instead of i + 1 bookkeeping.
    for num in range(1, augment_times + 1):
        sub_table = table.subsample(sampling_depth, axis='sample',
                                    by_id=False,
                                    with_replacement=with_replacement)
        sub_df = sub_table.to_dataframe().sort_index().sort_index(axis=1)

        # Rename samples with a per-round suffix so ids stay unique.
        sub_df.columns = ['%s_%d' % (name, num)
                          for name in sub_df.columns.to_list()]
        sub_table = biom.Table(sub_df.values, sub_df.index.to_list(),
                               sub_df.columns.to_list())
        output_table = output_table.merge(sub_table)

        # Duplicate metadata rows under the suffixed sample ids.
        tmp_metadata = metadata.copy()
        tmp_metadata.index = ['%s_%d' % (name, num)
                              for name in metadata.index.to_list()]
        # FIX: removed stray debug print of the accumulated metadata.
        output_metadata = pd.concat((output_metadata, tmp_metadata))

    output_metadata.index.name = 'sample-id'
    output_metadata = qiime2.metadata.Metadata(output_metadata)
    output_metadata.save(output_path_metadata)

    if output_table.is_empty():
        raise ValueError('The output table contains no features.')
    return output_table
def report(output_dir: str, pcoa: skbio.OrdinationResults, metadata: Metadata,
           alpha: pd.Series, table: biom.Table, taxonomy: pd.Series,
           samples: list) -> None:
    """Validate the inputs against each other and render the report page."""
    md_df = metadata.to_dataframe()
    _insanity_checker(samples, md_df, table, alpha, pcoa)

    q2templates.render(os.path.join(TEMPLATES, 'report', 'index.html'),
                       output_dir, context={'name': 'foo'})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'report', 'resources'),
                    os.path.join(output_dir, 'resources'))
def anova(output_dir: str, metadata: qiime2.Metadata, formula: str,
          sstype: str = 'II') -> None:
    """Run an ANOVA from a patsy-style formula over metadata columns.

    Fits an OLS model, writes the ANOVA table to ``anova.tsv``, runs
    pairwise t-tests (FDR-corrected) on categorical covariates, plots
    fitted values vs. residuals, and renders everything into a
    visualization.
    """
    # Grab metric and covariate names from formula
    metric, group_columns = _parse_formula(formula)
    columns = [metric] + list(group_columns)

    # Validate formula (columns are in metadata, etc)
    for col in columns:
        metadata.get_column(col)

    # store categorical column names for later use
    cats = metadata.filter_columns(column_type='categorical').columns.keys()

    metadata = metadata.to_dataframe()[columns].dropna()

    # Run anova
    lm = ols(formula, metadata).fit()
    results = pd.DataFrame(sm.stats.anova_lm(lm, typ=sstype)).fillna('')
    results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')

    # Run pairwise t-tests with multiple test correction
    pairwise_tests = pd.DataFrame()
    for group in group_columns:
        # only run on categorical columns — numeric columns raise error
        if group in cats:
            ttests = lm.t_test_pairwise(group, method='fdr_bh').result_frame
            pairwise_tests = pd.concat([pairwise_tests,
                                        pd.DataFrame(ttests)])
    if pairwise_tests.empty:
        pairwise_tests = False

    # Plot fit vs. residuals
    metadata['residual'] = lm.resid
    metadata['fitted_values'] = lm.fittedvalues
    res = _regplot_subplots_from_dataframe('fitted_values', 'residual',
                                           metadata, group_columns,
                                           lowess=False, ci=95,
                                           palette='Set1', fit_reg=False)

    # Visualize results
    _visualize_anova(output_dir, pairwise_tests=pairwise_tests,
                     model_results=results, residuals=res,
                     pairwise_test_name='Pairwise t-tests')
def barplot(output_dir: str, table: pd.DataFrame, taxonomy: pd.Series,
            metadata: Metadata = None) -> None:
    """Write per-level JSONP and CSV files and render the taxa barplot.

    When no metadata is supplied, a minimal id-only metadata table is
    synthesized from the feature table's sample ids. Raises ValueError if
    the table contains sample ids missing from the metadata.
    """
    if metadata is None:
        # Synthesize id-only metadata from the table's sample ids.
        metadata = Metadata(pd.DataFrame({'id': table.index}).set_index('id'))
    ids_not_in_metadata = set(table.index) - set(metadata.ids)
    if ids_not_in_metadata:
        raise ValueError('Sample IDs found in the table are missing in the '
                         f'metadata: {ids_not_in_metadata!r}.')

    metadata = metadata.to_dataframe()
    jsonp_files, csv_files = [], []
    collapsed_tables = _extract_to_level(taxonomy, table)

    for level, df in enumerate(collapsed_tables, 1):
        # Stash column labels before manipulating dataframe
        taxa_cols = df.columns.values.tolist()

        # Join collapsed table with metadata
        df = df.join(metadata, how='left')
        df = df.reset_index(drop=False)  # Move index into columns

        # Our JS sort works best with empty strings vs nulls
        df = df.fillna('')
        all_cols = df.columns.values.tolist()

        jsonp_file = 'level-%d.jsonp' % level
        csv_file = 'level-%d.csv' % level
        jsonp_files.append(jsonp_file)
        csv_files.append(csv_file)

        df.to_csv(os.path.join(output_dir, csv_file), index=False)

        with open(os.path.join(output_dir, jsonp_file), 'w') as fh:
            fh.write('load_data(%d,' % level)
            json.dump(taxa_cols, fh)
            fh.write(',')
            json.dump(all_cols, fh)
            fh.write(',')
            df.to_json(fh, orient='records')
            fh.write(');')

    # Now that the tables have been collapsed, write out the index template
    index = os.path.join(TEMPLATES, 'barplot', 'index.html')
    q2templates.render(index, output_dir, context={'jsonp_files': jsonp_files})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'barplot', 'dist'),
                    os.path.join(output_dir, 'dist'))
def plot(output_dir: str, model: STLDirFmt, metadata: qiime2.Metadata) -> None:
    """Deploy the ili 3D viewer with the model and per-sample features."""
    sample_df = metadata.to_dataframe()

    # copy the ili contents into the output folder
    copy_tree(os.path.join(ASSETS, 'ili'), output_dir)

    # we save the data to the workers folder since that's where the files are
    # loaded from, and to avoid requests to external sites, etc.
    workers_dir = os.path.join(output_dir, 'js/workers')
    sample_df.to_csv(os.path.join(workers_dir, 'features.csv'))
    copyfile(os.path.join(str(model.path), 'model.stl'),
             os.path.join(workers_dir, 'model.stl'))
def community_plot(output_dir: str, tree: NewickFormat,
                   feature_table: biom.Table,
                   sample_metadata: qiime2.Metadata,
                   pcoa: OrdinationResults = None,
                   feature_metadata: qiime2.Metadata = None,
                   ignore_missing_samples: bool = False,
                   filter_extra_samples: bool = False,
                   filter_missing_features: bool = False,
                   number_of_features: int = 5,
                   shear_to_table: bool = True) -> None:
    """Visualizes a tree alongside community-level data.

    The functionality available in this visualization is a superset of the
    functionality in tree_plot() -- including sample metadata coloring /
    barplots, animations, and Emperor integration support.
    """
    if pcoa is not None and pcoa.features is not None:
        # Reduce the biplot features to the top N (handled by the helper).
        pcoa = prepare_pcoa(pcoa, number_of_features)

    sample_metadata = sample_metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    t = get_bp(tree)
    viz = Empress(tree=t, table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata, ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_extra_samples=filter_extra_samples,
                  filter_missing_features=filter_missing_features,
                  shear_to_table=shear_to_table)
    save_viz(viz, output_dir)
def adonis(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata, formula: str,
           permutations: int = 999, n_jobs: int = 1) -> None:
    """Run the adonis PERMANOVA (vegan, via R) and render an HTML report.

    The distance matrix and (reordered) metadata are written to a temporary
    directory, handed to the ``run_adonis.R`` script, and the resulting TSV
    is rendered as an HTML table with q2templates.

    Raises
    ------
    ValueError
        If metadata IDs are not a superset of the distance matrix IDs, or if
        any metadata column referenced by ``formula`` contains missing
        values (adonis cannot handle NaNs).
    """
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))

    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula: every factor referenced on the right-hand side must
    # exist as a metadata column and must be NaN-free.
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            column = metadata.get_column(i.name())
            if column.has_missing_values():
                raise ValueError(
                    'adonis requires metadata columns with no '
                    'NaN values (missing values in column `%s`.)'
                    % (column.name, ))

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = [
            'run_adonis.R', dm_fp, md_fp, formula, str(permutations),
            str(n_jobs), results_fp
        ]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
def plot(output_dir: str, pcoa: skbio.OrdinationResults,
         metadata: qiime2.Metadata, custom_axis: str=None) -> None:
    """Generate an Emperor visualization of an ordination.

    Writes ``emperor.html`` plus Emperor's support files into ``output_dir``
    and renders the index template.
    """
    sample_df = metadata.to_dataframe()
    viz = Emperor(pcoa, sample_df, remote='.')

    # put custom_axis inside a list to workaround the type system not
    # supporting lists of types
    extra_kwargs = {} if custom_axis is None else {
        'custom_axes': [custom_axis]}
    html = viz.make_emperor(standalone=True, **extra_kwargs)

    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    q2templates.render(os.path.join(TEMPLATES, 'index.html'), output_dir)
def tabulate(output_dir: str, input: qiime2.Metadata,
             page_size: int=100) -> None:
    """Render metadata as an interactive, paginated DataTables view.

    Raises
    ------
    ValueError
        If ``page_size`` is less than 1.
    """
    if page_size < 1:
        raise ValueError('Cannot render less than one record per page.')

    records = input.to_dataframe()
    records.reset_index(inplace=True)

    q2templates.render(
        os.path.join(TEMPLATES, 'tabulate', 'index.html'), output_dir,
        context={'table': records.to_json(orient='split'),
                 'page_size': page_size})

    # Vendor the DataTables assets next to the rendered page so the
    # visualization works offline.
    asset_src = os.path.join(TEMPLATES, 'tabulate')
    for subdir, asset in (('js', 'datatables.min.js'),
                          ('css', 'datatables.min.css')):
        dest_dir = os.path.join(output_dir, subdir)
        os.mkdir(dest_dir)
        shutil.copy(os.path.join(asset_src, asset),
                    os.path.join(dest_dir, asset))
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run scikit-bio's bioenv and render the result as an HTML report.

    Metadata is progressively filtered (non-numeric columns, zero-variance /
    all-missing columns, then samples with any missing value) before the
    distance matrix is subset to the surviving samples; the counts of
    dropped columns/samples are reported in the rendered page.
    """
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
def adonis(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata, formula: str,
           permutations: int = 999, n_jobs: int = 1) -> None:
    """Run the adonis PERMANOVA (vegan, via R) and render an HTML report.

    The distance matrix and (reordered) metadata are written to a temporary
    directory, handed to the ``run_adonis.R`` script, and the resulting TSV
    is rendered as an HTML table with q2templates.

    Parameters
    ----------
    permutations : int
        Number of permutations passed to adonis.
    n_jobs : int
        Number of parallel jobs passed to adonis. Fix: annotation was
        ``str`` but the default is the int ``1`` and the value is converted
        with ``str(n_jobs)`` below; ``int`` matches both the default and the
        sibling ``adonis`` implementation in this file.

    Raises
    ------
    ValueError
        If metadata IDs are not a superset of the distance matrix IDs, or if
        ``formula`` references a column missing from the metadata.
    """
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))

    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula: get_column raises if a referenced column is absent,
    # so missing terms fail fast here instead of inside R.
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Generate interactive alpha rarefaction curves.

    For each metric, rarefaction data is computed across ``steps`` depths in
    ``[min_depth, max_depth]`` with ``iterations`` replicates per depth; one
    CSV and one (or, with metadata, one per categorical column) JSONP file
    are written per metric, then the index template and JS assets are
    emitted.

    Raises
    ------
    ValueError
        For an empty metric set, a phylogenetic metric without a phylogeny,
        inconsistent depth/step parameters, an empty table, a ``max_depth``
        exceeding the largest sample total, or metadata that is entirely
        filtered away.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        # Any phylogenetic metric requested explicitly requires a tree.
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        # MultiIndex columns so the metadata can be joined with the
        # (depth, iteration) MultiIndexed rarefaction data.
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    # NOTE: the loop variable `data` rebinds the dict name to each metric's
    # DataFrame; the iterator over .items() was created first, so this is
    # safe, if slightly confusing.
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            # One JSONP per (metric, metadata column) pair.
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        # Raw per-sample values, one column per (depth, iteration).
        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Test whether alpha diversity differs across metadata groups.

    For every non-numeric metadata column with more than one group (and
    fewer groups than samples), an overall and pairwise Kruskal-Wallis test
    is computed; pairwise p-values are FDR-corrected (Benjamini/Hochberg).
    Per-column JSONP payloads and pairwise CSVs are written, then the index
    template and JS assets are emitted.

    Raises
    ------
    ValueError
        If the metadata contains only numeric columns.
    """
    metadata_df = metadata.to_dataframe()
    # Coerce what can be coerced to numbers, then keep only the columns that
    # remained non-numeric: those are the candidate grouping columns.
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(exclude=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_numeric_categories = pre_filtered_cols - post_filtered_cols
    filtered_group_comparisons = []

    categories = metadata_df.columns
    metric_name = alpha_diversity.name

    if len(categories) == 0:
        raise ValueError('Only numeric data is present in metadata file.')

    filenames = []
    filtered_categories = []
    for category in categories:
        metadata_category = metadata.get_category(category).to_series()
        # Align to the diversity vector, then drop empty-string/missing
        # entries before grouping.
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.replace(r'', np.nan).dropna()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_category], axis=1,
                         join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_category.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[alpha_diversity.name]))

        # Skip columns with a single group or one group per sample — neither
        # supports a meaningful test.
        if (len(groups) > 1 and len(groups) != len(data.index)):
            escaped_category = quote(category)
            filename = 'category-%s.jsonp' % escaped_category
            filenames.append(filename)

            # perform Kruskal-Wallis across all groups
            kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

            # perform pairwise Kruskal-Wallis across all pairs of groups and
            # correct for multiple comparisons
            kw_H_pairwise = []
            for i in range(len(names)):
                for j in range(i):
                    try:
                        H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                                groups[j])
                        kw_H_pairwise.append([names[j], names[i], H, p])
                    except ValueError:
                        # e.g. all values identical in a pair; record the
                        # skipped comparison for the report.
                        filtered_group_comparisons.append(
                            ['%s:%s' % (category, names[i]),
                             '%s:%s' % (category, names[j])])
            kw_H_pairwise = pd.DataFrame(
                kw_H_pairwise,
                columns=['Group 1', 'Group 2', 'H', 'p-value'])
            kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
            kw_H_pairwise['q-value'] = multipletests(
                kw_H_pairwise['p-value'], method='fdr_bh')[1]
            kw_H_pairwise.sort_index(inplace=True)
            pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_category
            pairwise_path = os.path.join(output_dir, pairwise_fn)
            kw_H_pairwise.to_csv(pairwise_path)

            # Hand-assembled JSONP consumed by the visualization's JS.
            with open(os.path.join(output_dir, filename), 'w') as fh:
                df = pd.Series(groups, index=names)

                fh.write("load_data('%s'," % category)
                df.to_json(fh, orient='split')
                fh.write(",")
                json.dump({'initial': initial_data_length,
                           'filtered': filtered_data_length}, fh)
                fh.write(",")
                json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
                fh.write(",'")
                table = kw_H_pairwise.to_html(classes="table table-striped "
                                              "table-hover")
                table = table.replace('border="1"', 'border="0"')
                fh.write(table.replace('\n', ''))
                fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))
        else:
            filtered_categories.append(category)

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_numeric_categories': ', '.join(filtered_numeric_categories),
        'filtered_categories': ', '.join(filtered_categories),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    # NOTE(review): source directory is 'dst' here while sibling
    # visualizers copy from 'dist' — verify the on-disk assets folder for
    # alpha_group_significance_assets really is named 'dst'.
    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dst'),
        os.path.join(output_dir, 'dist'))
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str='spearman') -> None:
    """Correlate alpha diversity with each numeric metadata column.

    For every column that coerces to numeric, the chosen correlation
    (looked up in ``_alpha_correlation_fns``) is computed against the
    diversity vector after inner-joining and dropping missing values; a
    JSONP payload per column is written, then the index template and JS
    assets are emitted.

    Raises
    ------
    ValueError
        For an unknown ``method``, or when no numeric columns exist.
    """
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    metadata_df = metadata.to_dataframe()
    # Coerce what can be coerced, then keep only the numeric columns.
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(include=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_categories = pre_filtered_cols - post_filtered_cols

    categories = metadata_df.columns

    if len(categories) == 0:
        raise ValueError('Only non-numeric data is present in metadata file.')

    filenames = []
    for category in categories:
        metadata_category = metadata_df[category]
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.dropna()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_category, alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_category.name],
                                                  df[alpha_diversity.name])

        # Report how many samples were dropped, if any.
        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_category = quote(category)
        filename = 'category-%s.jsonp' % escaped_category
        filenames.append(filename)

        # Hand-assembled JSONP consumed by the visualization's JS.
        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % category)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_categories': ', '.join(filtered_categories)})

    # NOTE(review): source directory is 'dst' here while sibling
    # visualizers copy from 'dist' — verify the on-disk assets folder for
    # alpha_correlation_assets really is named 'dst'.
    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dst'),
                    os.path.join(output_dir, 'dist'))
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata=None) -> None:
    """Generate the feature-table summary visualization.

    Produces frequency histograms (per sample and per feature), summary
    tables, per-sample/per-feature frequency CSVs, and a multi-tab HTML
    report; optional sample metadata is embedded into ``data.jsonp`` for
    the interactive sample-detail tab.
    """
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))
            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True,
                                             bins=int(round(bins)))
        # Thousands-separated tick labels.
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        # Reset the implicit figure so the next plot starts clean.
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    # Formatted (string) copies for the HTML detail table.
    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)

    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
        fh.write("app.init(")
        # NOTE(review): truthiness test on a Metadata object — presumably
        # equivalent to `is not None` here; confirm Metadata defines no
        # falsy states before relying on it.
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode=None, metrics: set=None,
                      metadata: qiime2.Metadata=None, min_depth: int=1,
                      steps: int=10, iterations: int=10) -> None:
    """Generate interactive alpha rarefaction curves (legacy variant).

    For each metric, rarefaction data is computed across ``steps`` depths in
    ``[min_depth, max_depth]`` with ``iterations`` replicates per depth; one
    CSV and one (or, with metadata, one per non-empty metadata column) JSONP
    file are written per metric, then the index template and JS assets are
    emitted.

    Raises
    ------
    ValueError
        For an empty metric set, a phylogenetic metric without a phylogeny,
        inconsistent depth/step parameters, an empty table, a ``max_depth``
        exceeding the largest sample total, or metadata missing samples that
        are present in the table.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        # Any phylogenetic metric requested explicitly requires a tree.
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).'
                         % (max_depth, max_frequency))

    # Metadata (when given) must cover every sample in the table.
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)
    # NOTE: the loop variable `data` rebinds the dict name to each metric's
    # DataFrame; the iterator over .items() was created first, so this is
    # safe, if slightly confusing.
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]
            all_columns = metadata_df.columns
            # Drop columns that are entirely missing; report them.
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            # MultiIndex columns so the metadata can be joined with the
            # (depth, iteration) MultiIndexed rarefaction data.
            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        # Raw per-sample values, one column per (depth, iteration).
        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))