예제 #1
0
    def test_nn_tally(self):
        """Tally neighborhoods under three neighborhood settings.

        The settings are a radius of 3, an integer ``knn_neighbors`` of 30 and
        a float ``knn_neighbors`` of 0.1. Each tally is joined back onto the
        input data and must have as many rows as the input.
        """
        dat, pw = generate_peptide_data()

        # (knn_neighbors, knn_radius) pairs, run in this order.
        settings = [(None, 3), (30, None), (0.1, None)]
        for neighbors, radius in settings:
            tally = hierdiff.neighborhood_tally(dat,
                                                pwmat=pw,
                                                x_cols=['trait1'],
                                                count_col='count',
                                                knn_neighbors=neighbors,
                                                knn_radius=radius)
            joined = dat.join(tally)
            self.assertTrue(joined.shape[0] == dat.shape[0])
예제 #2
0
    def test_running_nn_tally(self):
        """Check that the running tally matches the pre-computed-distance tally.

        Prints the elapsed wall-clock time of each stage, then asserts that
        both result tables have the same number of rows and that every
        element compares equal.
        """
        started = time.time()
        dat, pw = generate_peptide_data()
        elapsed = time.time() - started
        print('Generated data and computed distances (%1.0fs)' % elapsed)

        # Stage 1: tally using the distance matrix returned with the data.
        started = time.time()
        res = hierdiff.neighborhood_tally(
            dat,
            pwmat=pw,
            x_cols=['trait1'],
            count_col='count',
            knn_neighbors=None,
            knn_radius=3)
        elapsed = time.time() - started
        print('Tallied neighborhoods with pre-computed distances (%1.0fs)' % elapsed)

        # Stage 2: tally with distances supplied through dist_func/dist_cols.
        started = time.time()
        rres = hierdiff.running_neighborhood_tally(
            dat,
            dist_func=_hamming_wrapper,
            dist_cols=['seq'],
            x_cols=['trait1'],
            count_col='count',
            knn_neighbors=None,
            knn_radius=3)
        elapsed = time.time() - started
        print('Tallied neighborhoods without pre-computed distances (%1.0fs)' % elapsed)

        self.assertTrue(res.shape[0] == rres.shape[0])
        elementwise_equal = (res == rres)
        self.assertTrue(elementwise_equal.all().all())
예제 #3
0
 def test_nn_fishers(self):
     """Smoke-test the 'fishers' association test on a radius-3 tally.

     The tally is joined onto the input data before testing. The method makes
     no assertions; it only checks that the calls complete.
     """
     dat, pw = generate_peptide_data()

     # The distances are passed through scipy's squareform before tallying.
     tally = neighborhood_tally(
         dat,
         pwmat=scipy.spatial.distance.squareform(pw),
         x_cols=['trait1'],
         count_col='count',
         knn_neighbors=None,
         knn_radius=3)
     joined = dat.join(tally)
     cluster_association_test(joined, method='fishers')
예제 #4
0
    def test_nn_tally(self):
        """Tally neighborhoods with squareform-converted distances.

        Three neighborhood settings are joined onto the input data and must
        keep the input's row count. A fourth run restricts the tally with
        ``cluster_ind=np.arange(50)`` and must return exactly 50 rows.
        """
        dat, pw = generate_peptide_data()

        def tally(**neighborhood_kwargs):
            # Arguments common to every call; the matrix is converted on each call.
            return hierdiff.neighborhood_tally(
                dat,
                pwmat=scipy.spatial.distance.squareform(pw),
                x_cols=['trait1'],
                count_col='count',
                **neighborhood_kwargs)

        # (knn_neighbors, knn_radius) pairs, run in this order.
        for neighbors, radius in [(None, 3), (30, None), (0.1, None)]:
            joined = dat.join(tally(knn_neighbors=neighbors, knn_radius=radius))
            self.assertTrue(joined.shape[0] == dat.shape[0])

        subset = tally(knn_neighbors=0.1,
                       knn_radius=None,
                       cluster_ind=np.arange(50))

        self.assertTrue(subset.shape[0] == 50)
예제 #5
0
    def test_nn_rect_tally(self):
        """Tally with a row-sliced distance matrix and matching centroid rows.

        The first 10 rows of the distance matrix are passed as ``pwmat`` and
        the first 10 rows of the data as ``df_centroids``; the result must
        have 10 rows.
        """
        dat, pw = generate_peptide_data()
        n_centroids = 10

        tally = hierdiff.neighborhood_tally(
            dat,
            pwmat=pw[:n_centroids, :],
            x_cols=['trait1'],
            df_centroids=dat.iloc[:n_centroids],
            count_col='count',
            knn_neighbors=0.1,
            knn_radius=None)

        self.assertTrue(tally.shape[0] == n_centroids)
예제 #6
0
    def test_chm_NN(self):
        """Run the 'chm' neighborhood test and check the result's row count.

        Diagnostic information about the fixtures and the imported hierdiff
        module is printed first. A direct hierdiff tally is then run, and its
        result is not inspected. Finally ``td.stats.neighborhood_diff`` is
        called with ``test_method='chm'`` and must return one row per clone.
        """
        # Diagnostics for the fixtures.
        print(self.clone_df.shape)
        print(self.clone_df.head())
        print(self.pw.shape)

        # Diagnostics for the hierdiff module actually imported.
        import hierdiff as hd
        print(dir(hd))
        print(hd.__spec__)

        # Direct tally call; the returned table is not checked.
        tally = hd.neighborhood_tally(
            df=self.clone_df,
            pwmat=self.pw,
            x_cols=['Visit', 'Stim'],
            count_col='count',
            knn_neighbors=50,
            knn_radius=None,
            cluster_ind=None)

        res = td.stats.neighborhood_diff(
            self.clone_df,
            self.pw,
            x_cols=['Visit', 'Stim'],
            test_method='chm')
        self.assertTrue(res.shape[0] == self.clone_df.shape[0])
예제 #7
0
def neighborhood_diff(clone_df,
                      pwmat,
                      x_cols,
                      count_col='count',
                      knn_neighbors=50,
                      knn_radius=None,
                      subset_ind=None,
                      cluster_ind=None,
                      test_method='fishers'):
    """Tests for association of categorical variables in x_cols with the neighborhood
    around each TCR in clone_df. The neighborhood is defined by the K closest neighbors
    using pairwise distances in pwmat, or defined by a distance radius, knn_radius.

    Uses hierdiff package (available on PyPI) for tallying counts in each cluster
    and running tests.

    The statistical tests made available by this function are limited and meant only
    as a way to scan for signals. More sophisticated testing/modeling frameworks
    should be considered for real-world problems.

    Use test_method=None to return a table of counts for all neighborhoods that can be saved as
    a CSV and used to run other, more sophisticated tests (e.g. edgeR or other regressions).

    Use Fisher's exact test (test_method='fishers') to detect enrichment/association of the
    neighborhood with one binary variable. For example, test the 2 x 2 table for each clone:

    +----+----+-------+--------+
    |         |  Neighborhood  |
    |         +-------+--------+
    |         | MEM+  |   MEM- |
    +----+----+-------+--------+
    |VAR |  0 | a     |    b   |
    |    +----+-------+--------+
    |    |  1 | c     |    d   |
    +----+----+-------+--------+

    Use the chi-squared test (test_method='chi2') to detect association across multiple variables.
    Note that with sparse neighborhoods Chi-squared tests are unreliable.

    Use the Cochran-Mantel-Haenszel test (test_method='chm') to test stratified 2 x 2 tables:
    one VAR vs. neighborhood, over several strata defined in other variables.
    Use x_cols[0] as the primary (binary) variable and other x_cols for the categorical
    strata-defining variables. This tests the overall null that OR = 1 for x_cols[0].
    A test is also performed for homogeneity of the ORs among the strata (Breslow-Day test).

    Params
    ------
    clone_df : pd.DataFrame [nclones x metadata]
        Contains metadata for each clone.
    pwmat : np.ndarray [nclones x nclones]
        Square distance matrix for defining neighborhoods
    x_cols : list
        List of columns to be tested for association with the neighborhood
    count_col : str
        Column in clone_df that specifies counts. Defaults to 'count'.
        Passing None is documented to assume a count of 1 cell for each row
        (handled by hierdiff; confirm against its documentation).
    knn_neighbors : int
        Number of neighbors to include in the neighborhood. Defaults to 50.
    knn_radius : float
        Radius for inclusion of neighbors within the neighborhood.
        Specify K or R but not both.
    subset_ind : None or np.ndarray with partial index of clone_df, optional
        Provides option to tally counts only within a subset of clone_df, but to maintain
        the clustering of all individuals. Allows for one clustering of pooled TCRs,
        but tallying/testing within a subset (e.g. participants or conditions)
    cluster_ind : None or np.ndarray
        Indices into clone_df specifying the neighborhoods for testing.
    test_method : str or None
        Specifies Fisher's exact test ("fishers"), Chi-squared ("chi2") or
        Cochran-Mantel-Haenszel test ("chm") for testing. Use None to skip
        testing and return the tallied counts only.

    Returns
    -------
    res : pd.DataFrame [nclones x results]
        Results from testing the neighborhood around each clone, or the table
        of neighborhood counts when test_method is None."""
    res = hd.neighborhood_tally(df=clone_df,
                                pwmat=pwmat,
                                x_cols=x_cols,
                                count_col=count_col,
                                knn_neighbors=knn_neighbors,
                                knn_radius=knn_radius,
                                subset_ind=subset_ind,
                                cluster_ind=cluster_ind)
    # With test_method=None the raw tally is returned untouched.
    if test_method is not None:
        res = hd.cluster_association_test(res,
                                          y_col='cmember',
                                          method=test_method)
    return res