def test_nn_tally(self):
    """Check that joined neighborhood tallies keep the input row count.

    Calls ``hierdiff.neighborhood_tally`` on the generated peptide data with
    three neighborhood settings (``knn_radius=3``, ``knn_neighbors=30`` and
    ``knn_neighbors=0.1``), joins each result onto the data and asserts that
    the joined table has the same number of rows as the data.
    """
    dat, pw = generate_peptide_data()

    # (knn_neighbors, knn_radius) pairs; one of the two is None in each case.
    neighborhood_specs = [
        (None, 3),
        (30, None),
        (0.1, None),
    ]
    for knn_neighbors, knn_radius in neighborhood_specs:
        res = hierdiff.neighborhood_tally(dat,
                                          pwmat=pw,
                                          x_cols=['trait1'],
                                          count_col='count',
                                          knn_neighbors=knn_neighbors,
                                          knn_radius=knn_radius)
        res = dat.join(res)
        # assertEqual reports both values on failure; the message identifies
        # which neighborhood setting produced the mismatch.
        self.assertEqual(res.shape[0], dat.shape[0],
                         msg='knn_neighbors=%r, knn_radius=%r' % (knn_neighbors, knn_radius))
def test_running_nn_tally(self):
    """Check that the running tally matches the pre-computed-distance tally.

    Tallies neighborhoods twice with ``knn_radius=3``: once via
    ``hierdiff.neighborhood_tally`` using the pre-computed matrix ``pw`` and
    once via ``hierdiff.running_neighborhood_tally`` using
    ``_hamming_wrapper`` on the ``'seq'`` column. The two results must have
    the same number of rows and compare equal element-wise. Elapsed times for
    each stage are printed for information only.
    """
    # perf_counter is a monotonic clock meant for measuring intervals, so the
    # printed durations cannot be skewed by system clock adjustments.
    start = time.perf_counter()
    dat, pw = generate_peptide_data()
    print('Generated data and computed distances (%1.0fs)' % (time.perf_counter() - start))

    start = time.perf_counter()
    res = hierdiff.neighborhood_tally(dat,
                                      pwmat=pw,
                                      x_cols=['trait1'],
                                      count_col='count',
                                      knn_neighbors=None,
                                      knn_radius=3)
    print('Tallied neighborhoods with pre-computed distances (%1.0fs)'
          % (time.perf_counter() - start))

    start = time.perf_counter()
    rres = hierdiff.running_neighborhood_tally(dat,
                                               dist_func=_hamming_wrapper,
                                               dist_cols=['seq'],
                                               x_cols=['trait1'],
                                               count_col='count',
                                               knn_neighbors=None,
                                               knn_radius=3)
    print('Tallied neighborhoods without pre-computed distances (%1.0fs)'
          % (time.perf_counter() - start))

    # assertEqual reports both row counts if they differ.
    self.assertEqual(res.shape[0], rres.shape[0])
    # Element-wise comparison reduced over both axes to a single boolean.
    self.assertTrue((res == rres).all().all())
def test_nn_fishers(self):
    """Smoke-test the 'fishers' association test on tallied neighborhoods.

    No assertions are made: the test passes when the tally, the join onto the
    data and ``cluster_association_test`` all complete without raising.
    """
    peptide_df, distances = generate_peptide_data()

    # squareform converts between the condensed and square distance layouts.
    pwmat = scipy.spatial.distance.squareform(distances)
    tally = neighborhood_tally(
        peptide_df,
        pwmat=pwmat,
        x_cols=['trait1'],
        count_col='count',
        knn_neighbors=None,
        knn_radius=3,
    )

    tallied_df = peptide_df.join(tally)
    cluster_association_test(tallied_df, method='fishers')
def test_nn_tally(self):
    """Check row counts of neighborhood tallies, with and without cluster_ind.

    For three neighborhood settings (``knn_radius=3``, ``knn_neighbors=30``
    and ``knn_neighbors=0.1``) the tally is joined onto the data and the
    joined table must have the same number of rows as the data. A fourth call
    passes ``cluster_ind=np.arange(50)`` and the un-joined result must have
    50 rows.
    """
    # NOTE(review): SOURCE contains another method named test_nn_tally. If both
    # definitions live in the same class, the later one silently replaces the
    # earlier one and only one of them runs -- confirm and rename if so.
    dat, pw = generate_peptide_data()

    # squareform converts between the condensed and square distance layouts.
    # The conversion does not depend on the neighborhood settings, so do it
    # once instead of once per call.
    pwmat = scipy.spatial.distance.squareform(pw)

    # (knn_neighbors, knn_radius) pairs; one of the two is None in each case.
    neighborhood_specs = [
        (None, 3),
        (30, None),
        (0.1, None),
    ]
    for knn_neighbors, knn_radius in neighborhood_specs:
        res = hierdiff.neighborhood_tally(dat,
                                          pwmat=pwmat,
                                          x_cols=['trait1'],
                                          count_col='count',
                                          knn_neighbors=knn_neighbors,
                                          knn_radius=knn_radius)
        res = dat.join(res)
        self.assertEqual(res.shape[0], dat.shape[0],
                         msg='knn_neighbors=%r, knn_radius=%r' % (knn_neighbors, knn_radius))

    # Restrict the tally to the first n_clusters indices; this result is
    # checked directly, without joining it onto the data.
    n_clusters = 50
    res = hierdiff.neighborhood_tally(dat,
                                      pwmat=pwmat,
                                      x_cols=['trait1'],
                                      count_col='count',
                                      knn_neighbors=0.1,
                                      knn_radius=None,
                                      cluster_ind=np.arange(n_clusters))
    self.assertEqual(res.shape[0], n_clusters)
def test_nn_rect_tally(self):
    """Check the tally row count when a rectangular distance matrix is given.

    Passes the first 10 rows of ``pw`` as ``pwmat`` together with the first
    10 rows of the data as ``df_centroids`` and asserts that the result has
    10 rows.
    """
    dat, pw = generate_peptide_data()

    # One constant drives the matrix slice, the centroid slice and the
    # expected row count so the three cannot drift apart.
    n_centroids = 10
    res = hierdiff.neighborhood_tally(dat,
                                      pwmat=pw[:n_centroids, :],
                                      x_cols=['trait1'],
                                      df_centroids=dat.iloc[:n_centroids],
                                      count_col='count',
                                      knn_neighbors=0.1,
                                      knn_radius=None)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(res.shape[0], n_centroids)
def test_chm_NN(self):
    """Run the 'chm' neighborhood test on the clone data and check its size.

    First calls ``hierdiff.neighborhood_tally`` directly, then calls
    ``td.stats.neighborhood_diff`` with ``test_method='chm'`` and asserts that
    the result has one row per row of ``self.clone_df``.
    """
    # Diagnostic output describing the fixtures used by this test.
    print(self.clone_df.shape)
    print(self.clone_df.head())
    print(self.pw.shape)

    import hierdiff as hd
    # Diagnostic output showing what the imported hierdiff module exposes and
    # its import spec.
    print(dir(hd))
    print(hd.__spec__)

    # Direct call into hierdiff. The return value is not inspected, so this
    # only verifies that the call completes without raising. subset_ind is
    # not passed; the callee's default applies.
    hd.neighborhood_tally(df=self.clone_df,
                          pwmat=self.pw,
                          x_cols=['Visit', 'Stim'],
                          count_col='count',
                          knn_neighbors=50,
                          knn_radius=None,
                          cluster_ind=None)

    res = td.stats.neighborhood_diff(self.clone_df,
                                     self.pw,
                                     x_cols=['Visit', 'Stim'],
                                     test_method='chm')
    # assertEqual reports both row counts if they differ.
    self.assertEqual(res.shape[0], self.clone_df.shape[0])
def neighborhood_diff(clone_df, pwmat, x_cols, count_col='count', knn_neighbors=50,
                      knn_radius=None, subset_ind=None, cluster_ind=None,
                      test_method='fishers'):
    """Test association of categorical variables with each clone's neighborhood.

    Tests for association of the categorical variables in x_cols with the
    neighborhood around each TCR in clone_df. The neighborhood is defined by
    the K closest neighbors using pairwise distances in pwmat, or by a
    distance radius, knn_radius.

    Uses the hierdiff package (available on PyPI) for tallying counts in each
    cluster and running tests. The statistical tests made available by this
    function are limited and meant only as a way to scan for signals. More
    sophisticated testing/modeling frameworks should be considered for
    real-world problems.

    Use test_method=None to return a table of counts for all neighborhoods
    that can be saved as a CSV and used to run other, more sophisticated tests
    (e.g. edgeR or other regressions).

    Use Fisher's exact test (test_method='fishers') to detect
    enrichment/association of the neighborhood with one binary variable.
    For example, test the 2 x 2 table for each clone:

    +----+----+-------+--------+
    |         |  Neighborhood  |
    |         +-------+--------+
    |         | MEM+  | MEM-   |
    +----+----+-------+--------+
    |VAR |  0 | a     | b      |
    |    +----+-------+--------+
    |    |  1 | c     | d      |
    +----+----+-------+--------+

    Use the chi-squared test (test_method='chi2') to detect association across
    multiple variables. Note that with sparse neighborhoods chi-squared tests
    are unreliable.

    Use the Cochran-Mantel-Haenszel test (test_method='chm') to test
    stratified 2 x 2 tables: one VAR vs. neighborhood, over several strata
    defined in other variables. Use x_cols[0] as the primary (binary) variable
    and other x_cols for the categorical strata-defining variables. This tests
    the overall null that OR = 1 for x_cols[0]. A test is also performed for
    homogeneity of the ORs among the strata (Breslow-Day test).

    Parameters
    ----------
    clone_df : pd.DataFrame [nclones x metadata]
        Contains metadata for each clone.
    pwmat : np.ndarray [nclones x nclones]
        Square distance matrix for defining neighborhoods.
    x_cols : list
        List of columns to be tested for association with the neighborhood.
    count_col : str
        Column in clone_df that specifies counts. Defaults to 'count'. The
        value is forwarded unchanged to hierdiff; None is expected to mean a
        count of 1 cell for each row (TODO confirm against hierdiff).
    knn_neighbors : int
        Number of neighbors to include in the neighborhood. Defaults to 50.
    knn_radius : float
        Radius for inclusion of neighbors within the neighborhood.
        Specify K or R but not both.
    subset_ind : None or np.ndarray with partial index of clone_df, optional
        Provides option to tally counts only within a subset of clone_df, but
        to maintain the clustering of all individuals. Allows for one
        clustering of pooled TCRs, but tallying/testing within a subset
        (e.g. participants or conditions).
    cluster_ind : None or np.ndarray
        Indices into clone_df specifying the neighborhoods for testing.
    test_method : str or None
        Specifies Fisher's exact test ("fishers"), Chi-squared ("chi2") or
        Cochran-Mantel-Haenszel test ("chm") for testing. None skips testing
        and returns the tally only.

    Returns
    -------
    res : pd.DataFrame [nclones x results]
        Results from testing the neighborhood around each clone, or the
        untested tally when test_method is None.
    """
    res = hd.neighborhood_tally(df=clone_df,
                                pwmat=pwmat,
                                x_cols=x_cols,
                                count_col=count_col,
                                knn_neighbors=knn_neighbors,
                                knn_radius=knn_radius,
                                subset_ind=subset_ind,
                                cluster_ind=cluster_ind)
    if test_method is not None:
        # 'cmember' is passed as the y_col of the association test;
        # presumably the neighborhood-membership column of the tally -- verify.
        res = hd.cluster_association_test(res, y_col='cmember', method=test_method)
    return res