def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run BIOENV on numeric metadata and render the result as HTML.

    The metadata is converted to numeric columns where possible, then
    non-numeric columns, samples containing NaNs and zero-variance columns
    are removed. The distance matrix is filtered to the surviving samples
    before ``skbio.stats.distance.bioenv`` is called.

    Parameters
    ----------
    output_dir : str
        Directory the ``bioenv_assets/index.html`` template is rendered into.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.Metadata
        Sample metadata; only columns that parse as numeric are used.
    """
    def _numeric_if_possible(column):
        # Same contract as ``pd.to_numeric(column, errors='ignore')`` (a
        # column that cannot be parsed is returned untouched) without using
        # the deprecated ``errors='ignore'`` option.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df = metadata.to_dataframe()
    df = df.apply(_numeric_if_possible)

    # filter categorical columns, then drop samples that contain NaNs
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        # Sets have no stable iteration order; sort so the rendered list of
        # dropped columns is the same on every run.
        'filtered_categorical_cols':
            ', '.join(sorted(filtered_categorical_cols)),
        'filtered_zero_variance_cols':
            ', '.join(sorted(filtered_zero_variance_cols)),
        'result': result})
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           sample_metadata: qiime2.Metadata,
                           where: str = None) -> skbio.DistanceMatrix:
    """Keep only the samples of ``distance_matrix`` selected by the metadata.

    Parameters
    ----------
    distance_matrix : skbio.DistanceMatrix
        Matrix to filter.
    sample_metadata : qiime2.Metadata
        Metadata whose IDs (optionally restricted by ``where``) are retained.
    where : str, optional
        Passed through to ``sample_metadata.ids(where=...)``.

    Returns
    -------
    skbio.DistanceMatrix
        The filtered matrix, with IDs in the same relative order as in the
        input matrix.

    Raises
    ------
    ValueError
        If the filter fails with a ``DissimilarityMatrixError``, which is
        reported as every sample having been filtered out.
    """
    selected = set(sample_metadata.ids(where=where))
    # Walk the matrix's own IDs instead of handing a set to `filter`: the
    # output order then follows the input matrix rather than set iteration
    # order, so results are reproducible between runs.
    ids_to_keep = [id_ for id_ in distance_matrix.ids if id_ in selected]
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError as err:
        raise ValueError(
            "All samples were filtered out of the distance matrix.") from err
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run BIOENV on the numeric metadata columns and render an HTML report.

    Samples with missing values are dropped before the zero-variance column
    filter is applied, and the distance matrix is restricted to the samples
    that remain. The numbers of samples before and after filtering, and the
    names of the dropped columns, are passed to the template.
    """
    # Restrict the metadata to the IDs of the distance matrix (this also
    # ensures every distance matrix ID is present in the metadata).
    md = metadata.filter_ids(distance_matrix.ids)

    # Keep numeric columns only, remembering which ones were discarded.
    cols_before = set(md.columns)
    md = md.filter_columns(column_type='numeric')
    non_numeric_cols = cols_before - set(md.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    complete_rows = md.to_dataframe().dropna()
    md = qiime2.Metadata(complete_rows)

    # Remove zero-variance and empty columns, again tracking what was lost.
    cols_before = set(md.columns)
    md = md.filter_columns(drop_zero_variance=True, drop_all_missing=True)
    zero_variance_cols = cols_before - set(md.columns)

    frame = md.to_dataframe()

    # Restrict the distance matrix to the surviving samples and record the
    # sample counts on both sides of the filter for the report.
    samples_before = distance_matrix.shape[0]
    kept_dm = distance_matrix.filter(frame.index)
    samples_after = kept_dm.shape[0]

    bioenv_table = skbio.stats.distance.bioenv(kept_dm, frame)
    context = {
        'initial_dm_length': samples_before,
        'filtered_dm_length': samples_after,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': q2templates.df_to_html(bioenv_table),
    }
    template = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(template, output_dir, context=context)
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str=None,
                           exclude_ids: bool=False) -> skbio.DistanceMatrix:
    """Filter ``distance_matrix`` by the IDs selected from ``metadata``.

    Parameters
    ----------
    distance_matrix : skbio.DistanceMatrix
        Matrix to filter.
    metadata : qiime2.Metadata
        Metadata whose IDs (optionally restricted by ``where``) drive the
        filter.
    where : str, optional
        Passed through to ``metadata.ids(where=...)``.
    exclude_ids : bool, optional
        If true, the selected IDs are removed from the matrix instead of
        being the ones retained.

    Returns
    -------
    skbio.DistanceMatrix
        The filtered matrix, with IDs in the same relative order as in the
        input matrix.

    Raises
    ------
    ValueError
        If the filter fails with a ``DissimilarityMatrixError``, which is
        reported as every sample having been filtered out.
    """
    selected = set(metadata.ids(where=where))
    # Build the keep-list by walking the matrix's own IDs so the output
    # order follows the input matrix instead of set iteration order, which
    # makes the result reproducible between runs.
    if exclude_ids:
        ids_to_keep = [id_ for id_ in distance_matrix.ids
                       if id_ not in selected]
    else:
        ids_to_keep = [id_ for id_ in distance_matrix.ids if id_ in selected]
    try:
        return distance_matrix.filter(ids_to_keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError as err:
        raise ValueError(
            "All samples were filtered out of the distance matrix.") from err
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run BIOENV on the numeric metadata columns and render an HTML report.

    Non-numeric, zero-variance and empty columns are dropped, then samples
    with any missing value are dropped, and the distance matrix is restricted
    to the samples that remain. Columns that only become constant once those
    samples are removed are dropped as well and reported together with the
    other zero-variance columns.

    Parameters
    ----------
    output_dir : str
        Directory the ``bioenv_assets/index.html`` template is rendered into.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.Metadata
        Sample metadata.
    """
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # The zero-variance filter above ran while the incomplete samples were
    # still present, so a column can be left with a single distinct value
    # once they are gone. Re-check here so such columns are reported to the
    # user instead of being passed on to BIOENV.
    constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
    if constant_cols:
        zero_variance_cols.update(constant_cols)
        df = df.drop(columns=constant_cols)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
def mantel(output_dir: str, dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix, method: str = 'spearman',
           permutations: int = 999, intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    """Run a two-sided Mantel test on two distance matrices and render it.

    Writes ``mantel-scatter.svg`` (pairwise distances of ``dm1`` against
    ``dm2``) into ``output_dir`` and renders the ``mantel_assets`` template
    with the result table, the sample size and the mismatched IDs.

    Parameters
    ----------
    output_dir : str
        Directory that receives the scatter plot and the rendered template.
    dm1, dm2 : skbio.DistanceMatrix
        Matrices to compare.
    method : str
        Correlation method; ``'spearman'`` and ``'pearson'`` have a test
        statistic label defined here.
    permutations : int
        Passed through to ``skbio.stats.distance.mantel``.
    intersect_ids : bool
        If true, IDs not found in both matrices are discarded instead of
        raising an error.
    label1, label2 : str
        Axis labels for the scatter plot.

    Raises
    ------
    ValueError
        If the matrices' ID sets differ and ``intersect_ids`` is false.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # The following code to handle mismatched IDs, and subsequently filter the
    # distance matrices, is not technically necessary because skbio's mantel
    # function will raise an error on mismatches with `strict=True`, and will
    # handle intersection if `strict=False`. However, we need to handle the ID
    # matching explicitly to find *which* IDs are mismatched -- the error
    # message coming from scikit-bio doesn't describe those. We also need to
    # have the mismatched IDs to display as a warning in the viz if
    # `intersect_ids=True`. Finally, the distance matrices are explicitly
    # filtered to matching IDs only because their data are used elsewhere in
    # this function (e.g. extracting scatter plot data).

    # Find the symmetric difference between ID sets. It is sorted once so
    # both the error message and the template see a deterministic order.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = sorted(ids1 ^ ids2)

    if not intersect_ids and mismatched_ids:
        raise ValueError(
            'The following ID(s) are not contained in both distance matrices. '
            'This sometimes occurs when mismatched files are passed. If this '
            'is not the case, you can use `intersect_ids` to discard these '
            'mismatches and apply the Mantel test to only those IDs that are '
            'found in both distance matrices.\n\n%s'
            % ', '.join(mismatched_ids))

    if mismatched_ids:
        # Keep the shared IDs in dm1's order rather than set iteration order
        # so the filtered matrices are the same on every run.
        matched_ids = [id_ for id_ in dm1.ids if id_ in ids2]
        # Run in `strict` mode because the matches should all be found in both
        # matrices.
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # Run in `strict` mode because all IDs should be matched at this point.
    r, p, sample_size = skbio.stats.distance.mantel(
        dm1, dm2, method=method, permutations=permutations,
        alternative=alt_hypothesis, strict=True)

    result = pd.Series(
        [method.title(), sample_size, permutations, alt_hypothesis, r, p],
        index=['Method', 'Sample size', 'Permutations',
               'Alternative hypothesis',
               '%s %s' % (method.title(), test_statistics[method]),
               'p-value'],
        name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # We know the distance matrices have matching ID sets at this point, so we
    # can safely generate all pairs of IDs using one of the matrices' ID sets
    # (it doesn't matter which one).
    scatter_data = [(dm1[id1, id2], dm2[id1, id2])
                    for id1, id2 in itertools.combinations(dm1.ids, 2)]

    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])

    fig = plt.figure()
    try:
        sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
        fig.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))
    finally:
        # pyplot keeps every figure it creates alive until it is closed;
        # release it once the plot has been written (or has failed).
        plt.close(fig)

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test group differences in a distance matrix and render a report.

    Writes one PNG and one PDF boxplot per group, ``raw_data.tsv`` with the
    pairwise distances used for the boxplots and, when ``pairwise`` is true,
    ``<method>-pairwise.csv``; then renders the
    ``beta_group_significance_assets`` template into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory that receives all output files.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.CategoricalMetadataColumn
        Grouping of the samples; samples with missing values are dropped.
    method : str
        Key into ``_beta_group_significance_fns``.
    pairwise : bool
        If true, also run the test for every pair of groups and add
        ``fdr_bh``-corrected q-values.
    permutations : int
        Passed through to the significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        # The KeyError adds nothing to the message below, so suppress it.
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns))) from None

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict([
        (name, list(series.index))
        for name, series in natsorted(metadata.groupby(metadata))
    ])

    pair_columns = ['SubjectID1', 'SubjectID2', 'Group1', 'Group2',
                    'Distance']
    # Collect the per-group frames and concatenate once after the loop:
    # concatenating onto a growing frame copies all earlier rows on every
    # iteration and starts from an empty frame, which pandas warns about.
    pair_frames = []
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)
        pair_frames.append(
            pd.DataFrame(group_pairs_summary, columns=pair_columns))

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # The group id is URI-encoded once for the file name; both formats
        # share the same stem.
        plot_stem = os.path.join(
            output_dir, '%s-boxplots' % urllib.parse.quote(str(group_id)))
        fig.savefig(plot_stem + '.png')
        fig.savefig(plot_stem + '.pdf')
        fig.clear()

    if pair_frames:
        pairs_summary = pd.concat(pair_frames)
    else:
        pairs_summary = pd.DataFrame(columns=pair_columns)
    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([
                group1_id, group2_id,
                pairwise_result['sample size'], permutations,
                pairwise_result['test statistic'],
                pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = [
        # We have to DOUBLE encode this, as the file/resource name is a literal
        # URI-encoded string, we do this to prevent issues with the filesystem
        # however, as a result, our links need to escape % so that the browser
        # asks for the right escaped name (instead of the original name, which
        # doesn't exist inside the visualization).
        urllib.parse.quote(urllib.parse.quote(k))
        for k in groupings.keys()
    ]
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g + row_count]
                  for g in range(0, group_count, row_count)]

    index = os.path.join(TEMPLATES, 'beta_group_significance_assets',
                         'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': 12 // row_count,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
class PairwiseMantelTests(TestCase):
    """Tests for ``pwmantel`` against expected result tables stored on disk."""

    def setUp(self):
        self.minx = DistanceMatrix([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
        self.miny = DistanceMatrix([[0, 2, 7], [2, 0, 6], [7, 6, 0]])
        self.minz = DistanceMatrix([[0, 0.5, 0.25],
                                    [0.5, 0, 0.1],
                                    [0.25, 0.1, 0]])
        self.min_dms = (self.minx, self.miny, self.minz)

        # Variants of self.minx and self.minz that carry one additional ID
        # at the end.
        self.x_extra = DistanceMatrix([[0, 1, 2, 7],
                                       [1, 0, 3, 2],
                                       [2, 3, 0, 4],
                                       [7, 2, 4, 0]], ['0', '1', '2', 'foo'])
        self.z_extra = DistanceMatrix([[0, 0.5, 0.25, 3],
                                       [0.5, 0, 0.1, 24],
                                       [0.25, 0.1, 0, 5],
                                       [3, 24, 5, 0]], ['0', '1', '2', 'bar'])

        def load_expected(filename):
            # The p-value column (column index 3) has to be read as a string:
            # the in-memory results format p-values as strings with the
            # correct number of decimal places, so a float column would never
            # compare equal to them.
            return pd.read_csv(get_data_path(filename), sep='\t',
                               index_col=(0, 1), converters={3: str})

        self.exp_results_minimal = load_expected(
            'pwmantel_exp_results_minimal.txt')
        self.exp_results_minimal_with_labels = load_expected(
            'pwmantel_exp_results_minimal_with_labels.txt')
        self.exp_results_duplicate_dms = load_expected(
            'pwmantel_exp_results_duplicate_dms.txt')
        self.exp_results_na_p_value = load_expected(
            'pwmantel_exp_results_na_p_value.txt')
        self.exp_results_too_few_permutations = load_expected(
            'pwmantel_exp_results_too_few_permutations.txt')
        self.exp_results_reordered_distance_matrices = load_expected(
            'pwmantel_exp_results_reordered_distance_matrices'
            '.txt')

    def test_minimal_compatible_input(self):
        # IDs already match and are in the same order in every matrix.
        np.random.seed(0)

        observed = pwmantel(self.min_dms, alternative='greater')
        assert_frame_equal(observed, self.exp_results_minimal)

    def test_minimal_compatible_input_with_labels(self):
        np.random.seed(0)

        observed = pwmantel(self.min_dms, alternative='greater',
                            labels=('minx', 'miny', 'minz'))
        assert_frame_equal(observed, self.exp_results_minimal_with_labels)

    def test_duplicate_dms(self):
        same_three = (self.minx, self.minx, self.minx)
        observed = pwmantel(same_three, alternative='less')
        assert_frame_equal(observed, self.exp_results_duplicate_dms)

    def test_na_p_value(self):
        observed = pwmantel((self.miny, self.minx), method='spearman',
                            permutations=0)
        assert_frame_equal(observed, self.exp_results_na_p_value)

    def test_too_few_permutations_for_p_value(self):
        observed = pwmantel((self.miny, self.minx), method='spearman',
                            permutations=9)
        assert_frame_equal(observed, self.exp_results_too_few_permutations)

    def test_reordered_distance_matrices(self):
        # Same IDs everywhere, but each matrix lists them in its own order.
        x = self.minx.filter(['1', '0', '2'])
        y = self.miny.filter(['0', '2', '1'])
        z = self.minz.filter(['1', '2', '0'])

        np.random.seed(0)

        observed = pwmantel((x, y, z), alternative='greater')
        assert_frame_equal(observed,
                           self.exp_results_reordered_distance_matrices)

    def test_strict(self):
        # A mix of shared and unshared IDs, each matrix ordered differently.
        x = self.x_extra.filter(['1', '0', 'foo', '2'])
        y = self.miny.filter(['0', '2', '1'])
        z = self.z_extra.filter(['bar', '1', '2', '0'])

        np.random.seed(0)

        # strict=False should discard IDs that aren't found in both matrices
        observed = pwmantel((x, y, z), alternative='greater', strict=False)
        assert_frame_equal(observed,
                           self.exp_results_reordered_distance_matrices)

        with self.assertRaises(ValueError):
            pwmantel((x, y, z), strict=True)

    def test_id_lookup(self):
        # The matrices use different IDs, reconciled through a lookup.
        self.x_extra.ids = ['a', 'b', 'c', 'foo']
        self.z_extra.ids = ['d', 'e', 'f', 'bar']
        lookup = {'a': '0', 'b': '1', 'c': '2', 'foo': 'foo',
                  'd': '0', 'e': '1', 'f': '2', 'bar': 'bar',
                  '0': '0', '1': '1', '2': '2'}

        x = self.x_extra.filter(['b', 'a', 'foo', 'c'])
        y = self.miny.filter(['0', '2', '1'])
        z = self.z_extra.filter(['bar', 'e', 'f', 'd'])

        x_copy = x.copy()
        y_copy = y.copy()
        z_copy = z.copy()

        np.random.seed(0)

        observed = pwmantel((x, y, z), alternative='greater', strict=False,
                            lookup=lookup)
        assert_frame_equal(observed,
                           self.exp_results_reordered_distance_matrices)

        # The inputs must come back unmodified.
        self.assertEqual(x, x_copy)
        self.assertEqual(y, y_copy)
        self.assertEqual(z, z_copy)

    def test_too_few_dms(self):
        with self.assertRaises(ValueError):
            pwmantel([self.miny])

    def test_invalid_input_type(self):
        with self.assertRaises(TypeError):
            pwmantel([self.miny, self.minx, [[0, 42], [42, 0]]])

    def test_wrong_number_of_labels(self):
        with self.assertRaises(ValueError):
            pwmantel(self.min_dms, labels=['foo', 'bar'])

    def test_duplicate_labels(self):
        with self.assertRaises(ValueError):
            pwmantel(self.min_dms, labels=['foo', 'bar', 'foo'])

    def test_missing_ids_in_lookup(self):
        # mapping for '1' is missing
        lookup = {'0': 'a', '2': 'c'}

        with self.assertRaises(KeyError):
            pwmantel(self.min_dms, lookup=lookup)

    def test_no_matching_ids(self):
        self.minx.ids = ['foo', 'bar', 'baz']
        self.miny.ids = ['bro', 'fist', 'breh']

        with self.assertRaises(ValueError):
            pwmantel((self.minx, self.miny, self.minz), strict=False)
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            permutations: int=999) -> None:
    """Test group differences in a distance matrix and render a report.

    Writes one PNG and one PDF boxplot per group into ``output_dir`` and
    renders the ``beta_group_significance_assets`` template there.

    Parameters
    ----------
    output_dir : str
        Directory that receives the plots and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Grouping of the samples; samples whose value is missing or an empty
        string are dropped.
    method : str
        Key into ``_beta_group_significance_fns``.
    permutations : int
        Passed through to the significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        # The KeyError adds nothing to the message below, so suppress it.
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns))) from None

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance matrix,
    # and drop samples with have no data for this metadata
    # category, including those with empty strings as values.
    metadata = metadata.to_series()
    try:
        # Same contract as ``errors='ignore'`` (values that cannot be parsed
        # leave the series untouched) without the deprecated option.
        metadata = pd.to_numeric(metadata)
    except (ValueError, TypeError):
        pass
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(name, list(series.index))
         for name, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)
        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Both formats share the same URL-quoted file name stem.
        plot_stem = os.path.join(
            output_dir,
            '%s-boxplots' % urllib.parse.quote_plus(str(group_id)))
        fig.savefig(plot_stem + '.png')
        fig.savefig(plot_stem + '.pdf')
        fig.clear()

    result = result.to_frame().to_html(classes="table table-striped "
                                       "table-hover")
    result = result.replace('border="1"', 'border="0"')
    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result
    })
print("Running default parameters for " + name) is_distmatrix = name in dm_set #Boolean switch for distance-matrix specific code blocks if is_distmatrix: ##### Use specific X and y for distance matrix benchmarking, not amplicon experiment object if name=="jensenshannon": md = exp.sample_metadata existing_dm = DistanceMatrix.read(dir_prefix+"/beta-q2/"+"aitchison"+'.txt') print("Computing Jensen-Shannon Distance Matrix") dm = DistanceMatrix(data=distance.pdist(exp.data.todense(), metric="jensenshannon"), ids=existing_dm.ids) else: dm = DistanceMatrix.read(dir_prefix+"/beta-q2/"+name+'.txt') md = exp.sample_metadata md = md.filter(dm.ids,axis='index') dm = dm.filter(md.index, strict=True) X_dist = dm.data y_dist = md[target] # Make directory for this regressor if it does not yet exist dir_name = dir_prefix +'/' +dir_prefix + '-' + name print(dir_name) if not os.path.isdir(dir_name): os.mkdir(dir_name, mode=0o755) paramsList = list(ParameterGrid(grid)) # For each set of parameters, get scores for model across 10 folds for param_idx, param in enumerate(paramsList):
def mantel(output_dir: str, dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix, method: str = 'spearman',
           permutations: int = 999, intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    """Run a two-sided Mantel test on two distance matrices and render it.

    Writes ``mantel-scatter.svg`` (pairwise distances of ``dm1`` against
    ``dm2``) into ``output_dir`` and renders the ``mantel_assets`` template
    with the result table, the sample size and the mismatched IDs.

    Parameters
    ----------
    output_dir : str
        Directory that receives the scatter plot and the rendered template.
    dm1, dm2 : skbio.DistanceMatrix
        Matrices to compare.
    method : str
        Correlation method; ``'spearman'`` and ``'pearson'`` have a test
        statistic label defined here.
    permutations : int
        Passed through to ``skbio.stats.distance.mantel``.
    intersect_ids : bool
        If true, IDs not found in both matrices are discarded instead of
        raising an error.
    label1, label2 : str
        Axis labels for the scatter plot.

    Raises
    ------
    ValueError
        If the matrices' ID sets differ and ``intersect_ids`` is false.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # The following code to handle mismatched IDs, and subsequently filter the
    # distance matrices, is not technically necessary because skbio's mantel
    # function will raise an error on mismatches with `strict=True`, and will
    # handle intersection if `strict=False`. However, we need to handle the ID
    # matching explicitly to find *which* IDs are mismatched -- the error
    # message coming from scikit-bio doesn't describe those. We also need to
    # have the mismatched IDs to display as a warning in the viz if
    # `intersect_ids=True`. Finally, the distance matrices are explicitly
    # filtered to matching IDs only because their data are used elsewhere in
    # this function (e.g. extracting scatter plot data).

    # Find the symmetric difference between ID sets, sorted so that the error
    # message and the rendered warning list the IDs in a stable order.
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = sorted(ids1 ^ ids2)

    if mismatched_ids and not intersect_ids:
        raise ValueError(
            'The following ID(s) are not contained in both distance matrices. '
            'This sometimes occurs when mismatched files are passed. If this '
            'is not the case, you can use `intersect_ids` to discard these '
            'mismatches and apply the Mantel test to only those IDs that are '
            'found in both distance matrices.\n\n%s'
            % ', '.join(mismatched_ids))

    if mismatched_ids:
        # Take the shared IDs in dm1's order instead of set iteration order,
        # so the filtered matrices do not vary between runs.
        matched_ids = [id_ for id_ in dm1.ids if id_ in ids2]
        # Run in `strict` mode because the matches should all be found in both
        # matrices.
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # Run in `strict` mode because all IDs should be matched at this point.
    r, p, sample_size = skbio.stats.distance.mantel(
        dm1, dm2, method=method, permutations=permutations,
        alternative=alt_hypothesis, strict=True)

    result = pd.Series([method.title(), sample_size, permutations,
                        alt_hypothesis, r, p],
                       index=['Method', 'Sample size', 'Permutations',
                              'Alternative hypothesis',
                              '%s %s' % (method.title(),
                                         test_statistics[method]),
                              'p-value'],
                       name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # We know the distance matrices have matching ID sets at this point, so we
    # can safely generate all pairs of IDs using one of the matrices' ID sets
    # (it doesn't matter which one).
    scatter_data = [(dm1[id1, id2], dm2[id1, id2])
                    for id1, id2 in itertools.combinations(dm1.ids, 2)]

    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(scatter_data, columns=[x, y])

    fig = plt.figure()
    try:
        sns.regplot(x=x, y=y, data=scatter_data, fit_reg=False)
        fig.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))
    finally:
        # Figures created through pyplot stay registered until closed; close
        # this one whether or not plotting succeeded.
        plt.close(fig)

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(
        TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test group differences in a distance matrix and render a report.

    Writes one PNG and one PDF boxplot per group, ``raw_data.tsv`` with the
    pairwise distances used for the boxplots and, when ``pairwise`` is true,
    ``<method>-pairwise.csv``; then renders the
    ``beta_group_significance_assets`` template into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory that receives all output files.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.CategoricalMetadataColumn
        Grouping of the samples; samples with missing values are dropped.
    method : str
        Key into ``_beta_group_significance_fns``.
    pairwise : bool
        If true, also run the test for every pair of groups and add
        ``fdr_bh``-corrected q-values.
    permutations : int
        Passed through to the significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        # The KeyError adds nothing to the message below, so suppress it.
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns))) from None

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(name, list(series.index))
         for name, series in natsorted(metadata.groupby(metadata))])

    summary_columns = ['SubjectID1', 'SubjectID2', 'Group1', 'Group2',
                       'Distance']
    # Gather one frame per group and concatenate a single time afterwards;
    # growing a frame with `pd.concat` inside the loop re-copies every
    # earlier row and begins from an empty frame, which pandas warns about.
    summary_frames = []
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)
        summary_frames.append(
            pd.DataFrame(group_pairs_summary, columns=summary_columns))

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Both formats share the same URL-quoted file name stem.
        plot_stem = os.path.join(
            output_dir,
            '%s-boxplots' % urllib.parse.quote_plus(str(group_id)))
        fig.savefig(plot_stem + '.png')
        fig.savefig(plot_stem + '.pdf')
        fig.clear()

    if summary_frames:
        pairs_summary = pd.concat(summary_frames)
    else:
        pairs_summary = pd.DataFrame(columns=summary_columns)
    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)

        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [group_ids[g:g+row_count]
                  for g in range(0, group_count, row_count)]

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': 12 // row_count,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.MetadataCategory,
                            method: str='permanova',
                            pairwise: bool=False,
                            permutations: int=999) -> None:
    """Test group differences in a distance matrix and render a report.

    Writes one PNG and one PDF boxplot per group and, when ``pairwise`` is
    true, ``<method>-pairwise.csv``; then renders the
    ``beta_group_significance_assets`` template into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory that receives all output files.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples.
    metadata : qiime2.MetadataCategory
        Grouping of the samples; samples whose value is missing or an empty
        string are dropped.
    method : str
        Key into ``_beta_group_significance_fns``.
    pairwise : bool
        If true, also run the test for every pair of groups and add
        ``fdr_bh``-corrected q-values.
    permutations : int
        Passed through to the significance function.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        # The KeyError adds nothing to the message below, so suppress it.
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method,
                          ', '.join(_beta_group_significance_fns))) from None

    # Cast metadata to numeric (if applicable), which gives better sorting
    # in boxplots. Then filter any samples that are not in the distance matrix,
    # and drop samples with have no data for this metadata
    # category, including those with empty strings as values.
    metadata = metadata.to_series()
    try:
        # Same contract as ``errors='ignore'`` (values that cannot be parsed
        # leave the series untouched) without the deprecated option.
        metadata = pd.to_numeric(metadata)
    except (ValueError, TypeError):
        pass
    metadata = metadata.loc[list(distance_matrix.ids)]
    metadata = metadata.replace(r'', numpy.nan).dropna()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.index)
    filtered_dm_length = distance_matrix.shape[0]

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style("white")
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    groupings = collections.OrderedDict(
        [(name, list(series.index))
         for name, series in sorted(metadata.groupby(metadata))])

    for group_id in groupings:
        group_distances, x_ticklabels = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)
        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Both formats share the same URL-quoted file name stem.
        plot_stem = os.path.join(
            output_dir,
            '%s-boxplots' % urllib.parse.quote_plus(str(group_id)))
        fig.savefig(plot_stem + '.png')
        fig.savefig(plot_stem + '.pdf')
        fig.clear()

    result_html = result.to_frame().to_html(classes=("table table-striped "
                                                     "table-hover"))
    result_html = result_html.replace('border="1"', 'border="0"')

    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)
        pairwise_results_html = pairwise_results.to_html(
            classes=("table table-striped table-hover"))
        pairwise_results_html = pairwise_results_html.replace(
            'border="1"', 'border="0"')
    else:
        pairwise_results_html = None

    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'groupings': groupings,
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })