def distance_matrix(metadata: qiime2.MetadataCategory) -> skbio.DistanceMatrix:
    try:
        series = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError(
            "Encountered non-numeric values in the metadata category. A "
            "distance matrix can only be computed from numeric metadata. "
            "Original error message:\n\n%s" % e)

    # TODO this check can be removed when MetadataCategory is no longer
    # allowed to be empty
    if series.empty:
        raise ValueError(
            "Encountered metadata category that is empty, i.e. there are no "
            "samples or features in the metadata to compute distances "
            "between.")

    if series.hasnans:
        raise ValueError(
            "Encountered missing value(s) in the metadata category. Computing "
            "a distance matrix from missing values is not supported.")

    # This code is derived from @jairideout's scikit-bio cookbook recipe,
    # "Exploring Microbial Community Diversity"
    # https://github.com/biocore/scikit-bio-cookbook
    distances = scipy.spatial.distance.pdist(series.values[:, np.newaxis],
                                             metric='euclidean')
    return skbio.DistanceMatrix(distances, ids=series.index)
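
# Illustrative usage sketch for distance_matrix. The sample IDs and the
# 'DaysSinceExperimentStart' values below are hypothetical, and the module's
# own imports (qiime2, pandas as pd) are assumed to be in scope.
def _example_distance_matrix_usage():
    series = pd.Series([1.0, 3.0, 7.0],
                       index=['sample-1', 'sample-2', 'sample-3'],
                       name='DaysSinceExperimentStart')
    dm = distance_matrix(qiime2.MetadataCategory(series))
    # Each pair of samples is separated by the absolute difference of their
    # values, e.g. dm['sample-1', 'sample-3'] == 6.0
    return dm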
def regress_samples(output_dir: str, table: pd.DataFrame,
                    metadata: qiime2.MetadataCategory,
                    test_size: float=defaults['test_size'],
                    step: float=defaults['step'],
                    cv: int=defaults['cv'], random_state: int=None,
                    n_jobs: int=defaults['n_jobs'],
                    n_estimators: int=defaults['n_estimators'],
                    estimator: str='RandomForestRegressor',
                    optimize_feature_selection: bool=False,
                    stratify: str=False,
                    parameter_tuning: bool=False) -> None:

    # extract category name from MetadataCategory
    category = metadata.to_series().name

    # disable feature selection for unsupported estimators
    optimize_feature_selection, calc_feature_importance = \
        _disable_feature_selection(estimator, optimize_feature_selection)

    # specify parameters and distributions to sample from for parameter tuning
    estimator, param_dist, parameter_tuning = _set_parameters_and_estimator(
        estimator, table, metadata, category, n_estimators, n_jobs, cv,
        random_state, parameter_tuning, classification=True)

    estimator, cm, accuracy, importances = split_optimize_classify(
        table, metadata, category, estimator, output_dir,
        test_size=test_size, step=step, cv=cv, random_state=random_state,
        n_jobs=n_jobs, optimize_feature_selection=optimize_feature_selection,
        parameter_tuning=parameter_tuning, param_dist=param_dist,
        calc_feature_importance=calc_feature_importance,
        scoring=mean_squared_error, stratify=stratify, classification=False)

    _visualize(output_dir, estimator, cm, accuracy, importances,
               optimize_feature_selection)
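
# Illustrative usage sketch for regress_samples. The feature table, the 'pH'
# metadata values, and the output directory are hypothetical, and a real run
# needs enough samples to support the train/test split and cross-validation;
# this is only a sketch of the expected input shapes.
def _example_regress_samples_usage(output_dir):
    rng = np.random.RandomState(0)
    sample_ids = ['sample-%d' % i for i in range(20)]
    table = pd.DataFrame(rng.poisson(5, size=(20, 10)),
                         index=sample_ids,
                         columns=['feature-%d' % i for i in range(10)])
    metadata = qiime2.MetadataCategory(
        pd.Series(rng.uniform(4.0, 9.0, size=20), index=sample_ids,
                  name='pH'))
    # Writes the regression results and visualization assets to output_dir
    regress_samples(output_dir, table, metadata, cv=3, random_state=0,
                    n_estimators=10)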
def test_all_matched(self):
    metadata = MetadataCategory(
        pd.Series(['AAAA', 'CCCC', 'GGGG'],
                  index=['sample_a', 'sample_b', 'sample_c'],
                  name='Barcode'))

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = \
            self.demux_single_fn(self.muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
    # obs_untrimmed should be empty, since everything matched
    self.assert_untrimmed_results(b'', obs_untrimmed_art)
def test_typical(self):
    metadata = MetadataCategory(
        pd.Series(['AAAA', 'CCCC'], index=['sample_a', 'sample_b'],
                  name='Barcode'))

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = \
            self.demux_single_fn(self.muxed_sequences, metadata)

    self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
    self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                  obs_untrimmed_art)
def test_error_tolerance_high_enough_to_prevent_filtering(self):
    metadata = MetadataCategory(
        pd.Series(['AAAG', 'CCCC'], index=['sample_a', 'sample_b'],
                  name='Barcode'))

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = \
            self.demux_single_fn(self.muxed_sequences, metadata,
                                 error_tolerance=0.25)

    # This test should yield the same results as test_typical, above
    self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
    self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                  obs_untrimmed_art)
def permutation_fdr(table: pd.DataFrame,
                    metadata: qiime2.MetadataCategory,
                    statistical_test: str = 'meandiff',
                    transform_function: str = 'log',
                    alpha: float = 0.05,
                    permutations: int = 1000) -> pd.Series:
    # See q2-composition for more details
    # https://github.com/qiime2/q2-composition/blob/master/q2_composition/_ancom.py
    # TODO : Consider renaming the functions to match q2-composition

    # Make sure that metadata and table match up
    metadata_series = metadata.to_series()[table.index]
    reject_idx = _pfdr(table.values.T, metadata_series.values,
                       statistical_test, transform_function,
                       alpha, permutations)
    return reject_idx
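
# Illustrative usage sketch for permutation_fdr, showing the expected input
# shapes: a sample-by-feature table and a grouping category indexed by the
# same sample IDs. The counts and group labels are hypothetical.
def _example_permutation_fdr_usage():
    table = pd.DataFrame([[1, 5, 9], [2, 6, 10], [9, 1, 2], [10, 2, 1]],
                         index=['s1', 's2', 's3', 's4'],
                         columns=['f1', 'f2', 'f3'])
    metadata = qiime2.MetadataCategory(
        pd.Series(['a', 'a', 'b', 'b'],
                  index=['s1', 's2', 's3', 's4'], name='Group'))
    # Per the -> pd.Series annotation, the result indicates which features
    # are called significant after permutation-based FDR correction.
    return permutation_fdr(table, metadata, permutations=100)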
def test_variable_length_barcodes(self):
    metadata = MetadataCategory(
        pd.Series(['AAAAA', 'CCCCCC', 'GGGG'],
                  index=['sample_a', 'sample_b', 'sample_c'],
                  name='Barcode'))
    muxed_sequences_fp = self.get_data_path('variable_length.fastq.gz')
    muxed_sequences = Artifact.import_data(
        'MultiplexedSingleEndBarcodeInSequence', muxed_sequences_fp)

    with redirected_stdio(stderr=os.devnull):
        obs_demuxed_art, obs_untrimmed_art = \
            self.demux_single_fn(muxed_sequences, metadata)

    # This test should yield the same results as test_typical, above, just
    # with variable length barcodes
    self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
    self.assert_untrimmed_results(b'', obs_untrimmed_art)
def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
                 barcodes: qiime2.MetadataCategory,
                 error_tolerance: float=0.1) -> \
                    (CasavaOneEightSingleLanePerSampleDirFmt,
                     MultiplexedSingleEndBarcodeInSequenceDirFmt):
    barcodes = barcodes.to_series()
    per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
    untrimmed = MultiplexedSingleEndBarcodeInSequenceDirFmt()

    _write_empty_fastq_to_mux_barcode_in_seq_fmt(untrimmed)

    with tempfile.NamedTemporaryFile() as barcode_fasta:
        _write_barcode_fasta(barcodes, barcode_fasta)
        cmd = _build_demux_command(seqs, barcode_fasta, per_sample_sequences,
                                   untrimmed, error_tolerance)
        run_command(cmd)
    _rename_files(per_sample_sequences, barcodes)

    # Count the per-sample (demultiplexed) fastq files that were produced
    demuxed = len(
        list(per_sample_sequences.sequences.iter_views(FastqGzFormat)))
    if demuxed == 0:
        raise ValueError('No samples were demultiplexed.')

    return per_sample_sequences, untrimmed
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            permutations: int=999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance
    # matrix, and drop samples which have no data for this metadata category,
    # including those with empty strings as values.
    metadata = pd.to_numeric(metadata.to_series(), errors='ignore')
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # Filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groupings will be an OrderedDict mapping group id to the sample ids in
    # that group. The order is used both on the x-axis, and in the layout of
    # the boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    result = result.to_frame().to_html(classes="table table-striped "
                                               "table-hover")
    result = result.replace('border="1"', 'border="0"')

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result
    })
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str = 'spearman',
                     permutations: int = 999) -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Original error message follows:\n%s' %
                         str(e))

    initial_metadata_length = len(metadata)
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if len(ids_with_missing_metadata) > 0:
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(ids_with_missing_metadata))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(distance_matrix, metadata_distances,
                                          method=method,
                                          permutations=permutations,
                                          alternative=alt_hypothesis,
                                          strict=True)

    result = pd.Series(
        [method.title(), n, permutations, alt_hypothesis, metadata.name,
         r, p],
        index=[
            'Method', 'Sample size', 'Permutations', 'Alternative hypothesis',
            'Metadata category',
            '%s %s' % (method.title(), test_statistics[method]), 'p-value'
        ],
        name='Mantel test results')

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    scatter_data = []
    for id1, id2 in itertools.combinations(distance_matrix.ids, 2):
        scatter_data.append(
            (distance_matrix[id1, id2], metadata_distances[id1, id2]))
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    plt.figure()
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
    plt.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))

    index = os.path.join(TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_metadata_length': initial_metadata_length,
        'filtered_metadata_length': filtered_metadata_length,
        'result': result_html
    })
def beta_correlation(output_dir: str,
                     distance_matrix: skbio.DistanceMatrix,
                     metadata: qiime2.MetadataCategory,
                     method: str='spearman',
                     permutations: int=999) -> None:
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    try:
        metadata = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        raise ValueError('Only numeric data can be used with the Mantel test. '
                         'Non-numeric data was encountered in the sample '
                         'metadata. Original error message follows:\n%s' %
                         str(e))

    initial_metadata_length = len(metadata)
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()
    filtered_metadata_length = len(metadata)

    ids_with_missing_metadata = set(distance_matrix.ids) - set(metadata.index)
    if len(ids_with_missing_metadata) > 0:
        raise ValueError('All samples in distance matrix must be present '
                         'and contain data in the sample metadata. The '
                         'following samples were present in the distance '
                         'matrix, but were missing from the sample metadata '
                         'or had no data: %s' %
                         ', '.join(ids_with_missing_metadata))

    metadata_distances = _metadata_distance(metadata)
    r, p, n = skbio.stats.distance.mantel(
        distance_matrix, metadata_distances, method=method,
        permutations=permutations, alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), n, permutations, alt_hypothesis,
                        metadata.name, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis', 'Metadata category',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    scatter_data = []
    for id1, id2 in itertools.combinations(distance_matrix.ids, 2):
        scatter_data.append((distance_matrix[id1, id2],
                             metadata_distances[id1, id2]))
    x = 'Input distance'
    y = 'Euclidean distance of\n%s' % metadata.name
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])
    fig = sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False).get_figure()
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.png'))
    fig.savefig(os.path.join(output_dir, 'beta-correlation-scatter.pdf'))

    index = os.path.join(
        TEMPLATES, 'beta_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_metadata_length': initial_metadata_length,
        'filtered_metadata_length': filtered_metadata_length,
        'result': result_html
    })
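
# Illustrative usage sketch for beta_correlation: a small symmetric distance
# matrix paired with a numeric metadata category on the same sample IDs. The
# IDs, distances, values, and output directory are hypothetical.
def _example_beta_correlation_usage(output_dir):
    dm = skbio.DistanceMatrix([[0.00, 0.10, 0.50, 0.70],
                               [0.10, 0.00, 0.45, 0.65],
                               [0.50, 0.45, 0.00, 0.20],
                               [0.70, 0.65, 0.20, 0.00]],
                              ids=['s1', 's2', 's3', 's4'])
    metadata = qiime2.MetadataCategory(
        pd.Series([1.0, 2.0, 6.0, 8.0],
                  index=['s1', 's2', 's3', 's4'],
                  name='DaysSinceExperimentStart'))
    # Runs the Mantel test and writes the scatterplot and summary table
    # to output_dir
    beta_correlation(output_dir, dm, metadata, method='spearman',
                     permutations=99)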
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            pairwise: bool=False,
                            permutations: int=999) -> None:
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance
    # matrix, and drop samples which have no data for this metadata category,
    # including those with empty strings as values.
    metadata = pd.to_numeric(metadata.to_series(), errors='ignore')
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # Filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groupings will be an OrderedDict mapping group id to the sample ids in
    # that group. The order is used both on the x-axis, and in the layout of
    # the boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.clear()

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = pairwise_results.to_html(
            classes=("table table-striped table-hover"))
        pairwise_results_html = pairwise_results_html.replace(
            'border="1"', 'border="0"')
    else:
        pairwise_results_html = None

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
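
# Illustrative usage sketch for beta_group_significance, using a small
# distance matrix and a categorical metadata category. The sample IDs,
# distances, 'BodySite' labels, and output directory are hypothetical; with
# pairwise=True the per-pair results are also written out.
def _example_beta_group_significance_usage(output_dir):
    dm = skbio.DistanceMatrix([[0.00, 0.12, 0.75, 0.70],
                               [0.12, 0.00, 0.80, 0.77],
                               [0.75, 0.80, 0.00, 0.10],
                               [0.70, 0.77, 0.10, 0.00]],
                              ids=['s1', 's2', 's3', 's4'])
    metadata = qiime2.MetadataCategory(
        pd.Series(['gut', 'gut', 'skin', 'skin'],
                  index=['s1', 's2', 's3', 's4'], name='BodySite'))
    beta_group_significance(output_dir, dm, metadata, method='permanova',
                            pairwise=True, permutations=99)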
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.MetadataCategory,
          statistical_test: str = 'f_oneway',
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    index_fp = os.path.join(output_dir, 'index.html')
    if statistical_test not in statistical_tests():
        raise ValueError("Unknown statistical test: %s" % statistical_test)

    metadata_series = metadata.to_series()
    metadata_series = metadata_series.loc[table.index]
    if pd.isnull(metadata_series).any():
        missing_data_sids = metadata_series[pd.isnull(metadata_series)].index
        missing_data_sids = ', '.join(missing_data_sids)
        raise ValueError('Metadata category is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table. %s' % missing_data_sids)

    statistical_test = _sig_tests[statistical_test]
    ancom_results = skbio_ancom(table,
                                metadata_series,
                                significance_test=statistical_test)

    # scikit-bio 0.4.2 returns a single DataFrame from ancom, and scikit-bio
    # 0.5.0 returns two DataFrames. We want to support both scikit-bio
    # versions, so we tuplize ancom_results to support both. Similarly, the
    # "reject" column was renamed in scikit-bio 0.5.0, so we apply a rename
    # here (which does nothing if a column called "reject" isn't found).
    ancom_results = qiime2.core.util.tuplize(ancom_results)
    # sort in place so that the CSV and rendered table are ordered by W
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)

    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.csv'),
                            header=True, index=True)

    html = _volcanoplot(output_dir, table, metadata,
                        ancom_results[0],
                        transform_function, difference_function)

    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    with open(index_fp, 'w') as index_f:
        index_f.write('<html><body>\n')
        index_f.write('<h1>ANCOM statistical results</h1>\n')
        index_f.write('<a href="ancom.csv">Download as CSV</a><br>\n')
        index_f.write(
            q2templates.df_to_html(significant_features['W'].to_frame(),
                                   border=None, classes=None))
        if len(ancom_results) == 2:
            ancom_results[1].to_csv(os.path.join(output_dir,
                                                 'percent-abundances.csv'),
                                    header=True, index=True)
            index_f.write(('<h1>Percentile abundances of features '
                           'by group</h1>\n'))
            index_f.write(('<a href="percent-abundances.csv">'
                           'Download as CSV</a><br>\n'))
            index_f.write(
                q2templates.df_to_html(
                    ancom_results[1].loc[significant_features.index],
                    border=None, classes=None))
        index_f.write(html)
        index_f.write('</body></html>\n')
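
# Illustrative usage sketch for ancom: a sample-by-feature table of strictly
# positive counts (zeros are assumed to have been handled upstream, e.g. by
# adding a pseudocount) and a grouping category over the same sample IDs.
# The counts, group labels, and output directory are hypothetical.
def _example_ancom_usage(output_dir):
    table = pd.DataFrame([[11, 1, 2], [12, 2, 1], [1, 11, 12], [2, 12, 11]],
                         index=['s1', 's2', 's3', 's4'],
                         columns=['f1', 'f2', 'f3'])
    metadata = qiime2.MetadataCategory(
        pd.Series(['a', 'a', 'b', 'b'],
                  index=['s1', 's2', 's3', 's4'], name='Group'))
    # Writes ancom.csv, the volcano plot, and index.html to output_dir
    ancom(output_dir, table, metadata)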
def heatmap(output_dir, table: pd.DataFrame,
            metadata: qiime2.MetadataCategory = None, normalize: bool = True,
            title: str = None, metric: str = 'euclidean',
            method: str = 'average', cluster: str = 'both',
            color_scheme: str = 'rocket') -> None:
    if table.empty:
        raise ValueError('Cannot visualize an empty table.')

    # Validation
    if metadata is not None:
        table = _munge_metadata(metadata.to_series(), table, cluster)

    cbar_label = 'frequency'
    if normalize:
        table = table.apply(lambda x: np.log10(x + 1))
        cbar_label = 'log10 frequency'

    # Hard-coded values for reasonable plots
    scaletron, labelsize, dpi = 50, 8, 100
    sns.set(rc={'xtick.labelsize': labelsize,
                'ytick.labelsize': labelsize,
                'figure.dpi': dpi})
    width, height = table.shape[1] / scaletron, table.shape[0] / scaletron

    heatmap_plot = sns.clustermap(table, method=method, metric=metric,
                                  **_clustering_map[cluster],
                                  cmap=color_scheme, xticklabels=True,
                                  yticklabels=True,
                                  cbar_kws={'label': cbar_label})
    if title is not None:
        heatmap_plot.fig.suptitle(title)

    hm = heatmap_plot.ax_heatmap.get_position()
    cbar = heatmap_plot.cax.get_position()
    row = heatmap_plot.ax_row_dendrogram.get_position()
    col = heatmap_plot.ax_col_dendrogram.get_position()

    # Resize the plot to set cell aspect-ratio to square
    heatmap_plot.ax_heatmap.set_position([hm.x0, hm.y0, width, height])
    heatmap_plot.cax.set_position([cbar.x0, hm.y0 + height, cbar.width,
                                   cbar.height])
    heatmap_plot.ax_row_dendrogram.set_position([row.x0, row.y0, row.width,
                                                 height])
    heatmap_plot.ax_col_dendrogram.set_position([col.x0, hm.y0 + height,
                                                 width, col.height])

    # https://stackoverflow.com/a/34697479/3776794
    plt.setp(heatmap_plot.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.setp(heatmap_plot.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    for ext in ['png', 'svg']:
        img_fp = os.path.join(output_dir, 'feature-table-heatmap.%s' % ext)
        heatmap_plot.savefig(img_fp)

    index_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index_fp, output_dir, context={'normalize': normalize})
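
# Illustrative usage sketch for heatmap, showing a small feature table with
# an optional metadata category used for sample grouping. The counts, sample
# IDs, 'BodySite' labels, and output directory are hypothetical.
def _example_heatmap_usage(output_dir):
    table = pd.DataFrame([[10, 0, 3], [2, 8, 1], [0, 4, 9]],
                         index=['s1', 's2', 's3'],
                         columns=['f1', 'f2', 'f3'])
    metadata = qiime2.MetadataCategory(
        pd.Series(['gut', 'gut', 'skin'],
                  index=['s1', 's2', 's3'], name='BodySite'))
    # Writes feature-table-heatmap.png/.svg and the rendered index to
    # output_dir
    heatmap(output_dir, table, metadata=metadata, normalize=True,
            title='Example heatmap')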