def sample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                        context_seqs: DNAFASTAFormat = None,
                        start_date: str = None,
                        samples_per_interval: int = 7,
                        days_per_interval: int = 7,
                        seed: int = None) -> IDSelection:
    """Select ids from consecutive fixed-length date windows.

    The values of ``dates`` are parsed as datetimes (values that cannot be
    parsed become ``NaT``), the ids are grouped into windows of
    ``days_per_interval`` days, and each window is passed to the sampler
    built by ``_sample_group(samples_per_interval, seed)``.

    Parameters
    ----------
    dates
        Metadata column whose values are date strings, indexed by id.
    context_seqs
        Optional FASTA file; when given, only ids returned by
        ``ids_from_fasta`` for this file are considered.
    start_date
        Optional date; when given, ids dated before it are excluded and the
        first window is forced to begin at this date.
    samples_per_interval
        Forwarded to ``_sample_group``.
    days_per_interval
        Length of each window, in days.
    seed
        Forwarded to ``_sample_group``.

    Returns
    -------
    IDSelection
        Built from a boolean Series over the (possibly FASTA-filtered) ids
        that is True for the sampled ids, the ``dates`` column as
        ``qiime2.Metadata``, and the label ``'sample_longitudinal'``.
    """
    window_size = '%dD' % days_per_interval

    if context_seqs is not None:
        # filter dates to only include the ids that sequence data is
        # available for
        ids_to_include = ids_from_fasta(str(context_seqs))
        dates = dates.filter_ids(ids_to_include)

    # Materialize the column once; it is needed for the datetime index, the
    # 'ids' column and the index of the final selection.
    date_strings = dates.to_series()
    dt_series = pd.to_datetime(date_strings, errors='coerce')
    df = pd.DataFrame({'ids': date_strings.index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler
            # the purpose is to force Pandas to begin the window at this
            # time instead of the first observation (by making NaN the first
            # observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size, convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))

    # Everything starts unselected; only the sampled ids are flipped to True.
    selection = pd.Series(False, index=date_strings.index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'sample_longitudinal')
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test for group differences in a distance matrix and render a report.

    Writes into ``output_dir``: one ``<group>-boxplots.png`` and
    ``<group>-boxplots.pdf`` per group, ``raw_data.tsv`` with the pairwise
    distances behind the boxplots, ``<method>-pairwise.csv`` when
    ``pairwise`` is true, and the rendered ``index.html`` template.

    Parameters
    ----------
    output_dir
        Directory that receives every generated file.
    distance_matrix
        Distances between samples.
    metadata
        Categorical column assigning each sample to a group. Samples with a
        missing value are dropped from both metadata and distance matrix.
    method
        Key into ``_beta_group_significance_fns``.
    pairwise
        When true, additionally run the test for every pair of groups and
        report Benjamini-Hochberg corrected q-values.
    permutations
        Forwarded to the significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict([
        (group_name, list(members.index))
        for group_name, members in natsorted(metadata.groupby(metadata))
    ])

    # Percent-encode each group id once, with NO safe characters. The default
    # safe='/' would leave path separators in the file name, so an id such as
    # 'a/b' (or '../x', or '/x') would be written to a different directory
    # than output_dir. The same stems are reused for the links below so that
    # file names and links cannot drift apart.
    file_stems = collections.OrderedDict(
        (group_id, urllib.parse.quote(str(group_id), safe=''))
        for group_id in groupings)

    summary_columns = ['SubjectID1', 'SubjectID2', 'Group1', 'Group2',
                       'Distance']
    # Collect the per-group frames and concatenate once after the loop
    # instead of re-copying the accumulated frame on every iteration. The
    # leading empty frame keeps the column layout when there are no groups.
    pairs_frames = [pd.DataFrame(columns=summary_columns)]
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        pairs_frames.append(
            pd.DataFrame(group_pairs_summary, columns=summary_columns))

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        stem = file_stems[group_id]
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' % stem))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' % stem))
        fig.clear()

    pairs_summary = pd.concat(pairs_frames)
    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        # multipletests returns a tuple; element 1 holds the corrected
        # p-values.
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = [
        # We have to DOUBLE encode this, as the file/resource name is a literal
        # URI-encoded string, we do this to prevent issues with the filesystem
        # however, as a result, our links need to escape % so that the browser
        # asks for the right escaped name (instead of the original name, which
        # doesn't exist inside the visualization).
        urllib.parse.quote(stem, safe='')
        for stem in file_stems.values()
    ]
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    # Shrink the row width until it divides the group count evenly; a width
    # of 1 always does, so the loop terminates.
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g + row_count]
                  for g in range(0, group_count, row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM on a feature table and render the results as a report.

    Writes into ``output_dir``: the contents of the ``ancom`` template
    directory, ``ancom.tsv``, ``percent-abundances.tsv``, the rendered
    ``index.html`` and, when at least one difference value is not null,
    ``data.tsv`` with the values behind the volcano plot.

    Parameters
    ----------
    output_dir
        Directory that receives every generated file.
    table
        Feature table; its index is used to filter ``metadata`` and the
        transform is applied along ``axis=1``.
    metadata
        Categorical column defining the groups to compare. Must not contain
        missing values for ids present in ``table``.
    transform_function
        Key into ``_transform_functions``; also used as the x-axis field
        name of the volcano plot.
    difference_function
        Key into ``_difference_functions``. When ``None``,
        ``'mean_difference'`` is used for exactly two categories and
        ``'f_statistic'`` otherwise.

    Raises
    ------
    ValueError
        If ``metadata`` has missing values after filtering to the table ids.
    """
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    # Sort the categories so the argument order handed to the difference
    # function is the same on every run. Plain set iteration order depends on
    # string hashing, which could change the result of an order-sensitive
    # difference function from one run to the next.
    cats = sorted(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # any other number of categories
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        # One argument per category, each holding that category's values.
        args = _d_func(*[x[metadata == c] for c in cats])
        # Some difference functions return a tuple; only its first element
        # is used.
        if isinstance(args, tuple):
            return args[0]
        else:
            return args
    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        pre_filtered_ids = set(fold_change.index)
        # Drop NaN as well as +/-inf. The inf entries are masked explicitly
        # rather than through the 'mode.use_inf_as_na' option, which pandas
        # has deprecated.
        infinite = fold_change.isin([float('inf'), float('-inf')])
        fold_change = fold_change[~infinite].dropna(axis=0)
        filtered_ids = pre_filtered_ids - set(fold_change.index)
        filtered_ancom_results = ancom_results[0].drop(labels=filtered_ids)

        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': filtered_ancom_results.W})
        # The tooltip below reads datum['id'], so the index must come out of
        # reset_index under exactly that name.
        volcano_results.index.name = 'id'
        volcano_results.to_csv(os.path.join(output_dir, 'data.tsv'),
                               header=True, index=True, sep='\t')
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
                {'type': 'symbol',
                 'from': {'data': 'values'},
                 'encode': {
                     'hover': {
                         'fill': {'value': '#FF0000'},
                         'opacity': {'value': 1}},
                     'enter': {
                         'x': {'scale': 'xScale',
                               'field': transform_function_name},
                         'y': {'scale': 'yScale', 'field': 'W'}},
                     'update': {
                         'fill': {'value': 'black'},
                         'opacity': {'value': 0.3},
                         'tooltip': {
                             'signal': "{{'title': datum['id'], '{0}': "
                                       "datum['{0}'], 'W': datum['W']}}".format(
                                           transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)

        if filtered_ids:
            context['filtered_ids'] = ', '.join(sorted(filtered_ids))

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM on a feature table and write a hand-built HTML report.

    Writes into ``output_dir``: ``ancom.csv``, ``percent-abundances.csv``
    (only when the ANCOM call produced a second result) and ``index.html``.
    The volcano plot markup comes from ``_volcanoplot``; when that returns
    ``None`` a fallback message is written instead.

    Parameters
    ----------
    output_dir
        Directory that receives every generated file.
    table
        Feature table; its index is used to filter ``metadata``.
    metadata
        Categorical column defining the groups to compare. Must not contain
        missing values for ids present in ``table``.
    transform_function
        Forwarded to ``_volcanoplot``.
    difference_function
        Forwarded to ``_volcanoplot``.

    Raises
    ------
    ValueError
        If ``metadata`` has missing values after filtering to the table ids.
    """
    index_fp = os.path.join(output_dir, 'index.html')

    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    # scikit-bio 0.4.2 returns a single tuple from ancom, and scikit-bio 0.5.0
    # returns two tuples. We want to support both scikit-bio versions, so we
    # tuplize ancom_result to support both. Similarly, the "reject" column
    # was renamed in scikit-bio 0.5.0, so we apply a rename here (which does
    # nothing if a column called "reject" isn't found).
    ancom_results = qiime2.core.util.tuplize(ancom_results)
    # sort_values returns a new frame unless inplace=True is given; without
    # it the sorted result was thrown away and every output stayed unsorted.
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)

    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.csv'),
                            header=True, index=True)

    html = _volcanoplot(output_dir, table, metadata, ancom_results[0],
                        transform_function, difference_function)

    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]
    significant_features_present = not significant_features.empty
    insignificant_div = ('<div>No significant features identified!</div>')

    with open(index_fp, 'w') as index_f:
        index_f.write('<html>\n')
        # The <head> resources are only needed when there is a plot to show.
        if html is not None:
            index_f.write('<head>\n')
            index_f.write(INLINE.render())
            index_f.write('</head>\n')
        index_f.write('<body>\n')
        index_f.write('<h1>ANCOM statistical results</h1>\n')
        index_f.write('<a href="ancom.csv">Download complete table as CSV</a>'
                      '<br>\n')
        if significant_features_present:
            index_f.write(q2templates.df_to_html(
                significant_features['W'].to_frame(), border=None,
                classes=None))
        else:
            index_f.write(insignificant_div)
        # A second result is only present for the two-result return shape
        # described in the comment above.
        if len(ancom_results) == 2:
            ancom_results[1].to_csv(
                os.path.join(output_dir, 'percent-abundances.csv'),
                header=True, index=True)
            index_f.write(('<h1>Percentile abundances of features '
                           'by group</h1>\n'))
            index_f.write(('<a href="percent-abundances.csv">'
                           'Download complete table as CSV</a><br>\n'))
            if significant_features_present:
                index_f.write(q2templates.df_to_html(
                    ancom_results[1].loc[significant_features.index],
                    border=None, classes=None))
            else:
                index_f.write(insignificant_div)
        if html is not None:
            # Element 1 is written before element 0.
            # NOTE(review): presumably (script, div) from the plotting
            # helper -- confirm against _volcanoplot.
            index_f.write(html[1])
            index_f.write(html[0])
        else:
            index_f.write('<p>Unable to generate volcano plot, please check '
                          'the ANCOM statistical results (above).</p>\n')
        index_f.write('</body></html>\n')
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test for group differences in a distance matrix and render a report.

    Writes into ``output_dir``: one ``<group>-boxplots.png`` and
    ``<group>-boxplots.pdf`` per group (group id encoded with
    ``urllib.parse.quote_plus``), ``raw_data.tsv`` with the pairwise
    distances behind the boxplots, ``<method>-pairwise.csv`` when
    ``pairwise`` is true, and the rendered ``index.html`` template.

    Parameters
    ----------
    output_dir
        Directory that receives every generated file.
    distance_matrix
        Distances between samples.
    metadata
        Categorical column assigning each sample to a group. Samples with a
        missing value are dropped from both metadata and distance matrix.
    method
        Key into ``_beta_group_significance_fns``.
    pairwise
        When true, additionally run the test for every pair of groups and
        report Benjamini-Hochberg corrected q-values.
    permutations
        Forwarded to the significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict([
        (group_name, list(members.index))
        for group_name, members in natsorted(metadata.groupby(metadata))
    ])

    summary_columns = ['SubjectID1', 'SubjectID2', 'Group1', 'Group2',
                       'Distance']
    # Collect the per-group frames and concatenate once after the loop
    # instead of re-copying the accumulated frame on every iteration. The
    # leading empty frame keeps the column layout when there are no groups.
    pairs_frames = [pd.DataFrame(columns=summary_columns)]
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        pairs_frames.append(
            pd.DataFrame(group_pairs_summary, columns=summary_columns))

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black', 'markeredgewidth': 0.5,
            'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Encode the group id once so both file names are guaranteed to
        # share the same stem.
        stem = urllib.parse.quote_plus(str(group_id))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' % stem))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' % stem))
        fig.clear()

    pairs_summary = pd.concat(pairs_frames)
    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        # multipletests returns a tuple; element 1 holds the corrected
        # p-values.
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    # NOTE(review): the raw group ids are handed to the template while the
    # plot files above are named with the quote_plus-encoded id. Confirm the
    # template encodes ids the same way when it builds links to the plots.
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    # Shrink the row width until it divides the group count evenly; a width
    # of 1 always does, so the loop terminates.
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count]
                  for g in range(0, group_count, row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
def ancom(output_dir: str,
          table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM on a feature table and render the results as a report.

    Writes into ``output_dir``: the contents of the ``ancom`` template
    directory, ``ancom.tsv``, ``percent-abundances.tsv`` and the rendered
    ``index.html``. A Vega volcano-plot spec is added to the template
    context when at least one difference value is not null.

    Parameters
    ----------
    output_dir
        Directory that receives every generated file.
    table
        Feature table; its index is used to filter ``metadata`` and the
        transform is applied along ``axis=1``.
    metadata
        Categorical column defining the groups to compare. Must not contain
        missing values for ids present in ``table``.
    transform_function
        Key into ``_transform_functions``; also used as the x-axis field
        name of the volcano plot.
    difference_function
        Key into ``_difference_functions``. When ``None``,
        ``'mean_difference'`` is used for exactly two categories and
        ``'f_statistic'`` otherwise.

    Raises
    ------
    ValueError
        If ``metadata`` has missing values after filtering to the table ids.
    """
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    # Sort the categories so the argument order handed to the difference
    # function is the same on every run. Plain set iteration order depends on
    # string hashing, which could change the result of an order-sensitive
    # difference function from one run to the next.
    cats = sorted(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # any other number of categories
            difference_function = 'f_statistic'

    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        # One argument per category, each holding that category's values.
        args = _d_func(*[x[metadata == c] for c in cats])
        # Some difference functions return a tuple; only its first element
        # is used.
        if isinstance(args, tuple):
            return args[0]
        else:
            return args
    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_results[0].W})
        # The tooltip below reads datum['index']. reset_index only emits a
        # column called 'index' when the index is unnamed, so pin the name
        # to keep the tooltip working when the feature index carries a name.
        volcano_results.index.name = 'index'
        volcano_results = volcano_results.reset_index(drop=False)

        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
                {'type': 'symbol',
                 'from': {'data': 'values'},
                 'encode': {
                     'hover': {
                         'fill': {'value': '#FF0000'},
                         'opacity': {'value': 1}},
                     'enter': {
                         'x': {'scale': 'xScale',
                               'field': transform_function_name},
                         'y': {'scale': 'yScale', 'field': 'W'}},
                     'update': {
                         'fill': {'value': 'black'},
                         'opacity': {'value': 0.3},
                         'tooltip': {
                             'signal': "{{'title': datum['index'], '{0}': "
                                       "datum['{0}'], 'W': datum['W']}}".format(
                                           transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)