def test_DCER_fit(self):
    np.random.seed(8888)
    graph = self.graph
    p_mat = self.p_mat
    dcsbe = DCSBMEstimator(directed=True, loops=False)
    dcsbe.fit(graph)
    assert_allclose(p_mat, dcsbe.p_mat_, atol=0.12)
def test_DCSBM_fit_supervised(self):
    p_mat = self.p_mat
    labels = self.labels
    g = self.g
    dcsbe = DCSBMEstimator(directed=True, loops=False)
    dcsbe.fit(g, y=labels)
    assert_allclose(dcsbe.p_mat_, p_mat, atol=0.1)
def test_DCSBM_score(self):
    p_mat = self.p_mat
    graph = self.g
    estimator = DCSBMEstimator()
    _test_score(estimator, p_mat, graph)
    with pytest.raises(ValueError):
        estimator.score_samples(graph=graph[1:100, 1:100])
def test_DCSBM_fit_unsupervised(self):
    np.random.seed(12345)
    n_verts = 1500
    # per-vertex degree-correction weights
    distances = np.random.beta(4, 1, n_verts)
    B = np.array([[0.7, 0.1, 0.1],
                  [0.1, 0.9, 0.1],
                  [0.05, 0.1, 0.75]])
    n = np.array([500, 500, 500])
    labels = _n_to_labels(n)
    p_mat = _block_to_full(B, labels, (n_verts, n_verts))
    p_mat = p_mat * np.outer(distances, distances)
    p_mat -= np.diag(np.diag(p_mat))
    graph = sample_edges(p_mat, directed=True, loops=False)
    dcsbe = DCSBMEstimator(directed=True, loops=False)
    dcsbe.fit(graph)
    assert adjusted_rand_score(labels, dcsbe.vertex_assignments_) > 0.95
    assert_allclose(p_mat, dcsbe.p_mat_, atol=0.12)
def dcsbm_pvalue(G1, G2, max_comm, num_perm, pooled_variance=True, min_comm=1,
                 epsilon1=1e-3, epsilon2=1e-3, Z1=None, Z2=None):
    """
    Estimate a two-sided p-value via parametric bootstrap: fit a DC-SBM to each graph,
    sample bootstrap replicates from the fits, and compare the observed test statistic
    to its null distribution over the replicates.
    """
    # if we fix the number of communities, we should also fix the number of latent
    # dimensions of the embedding; otherwise, when the algorithm chooses the number
    # of communities automatically, we also let it choose the number of latent dimensions
    if min_comm == max_comm:
        K = min_comm
    else:
        K = None
    obs_test_stat = gcorr_dcsbm(G1, G2, min_comm=min_comm, max_comm=max_comm,
                                pooled_variance=pooled_variance,
                                epsilon1=epsilon1, epsilon2=epsilon2)
    G1_dcsbm = DCSBMEstimator(directed=False, min_comm=min_comm, max_comm=max_comm,
                              n_components=K).fit(G1, y=Z1)
    G2_dcsbm = DCSBMEstimator(directed=False, min_comm=min_comm, max_comm=max_comm,
                              n_components=K).fit(G2, y=Z2)
    # create bootstrap samples
    G1_bootstrap = G1_dcsbm.sample(n_samples=num_perm)
    G2_bootstrap = G2_dcsbm.sample(n_samples=num_perm)
    null_test_stats = np.zeros(num_perm)
    for i in tqdm(range(num_perm)):
        null_test_stats[i] = gcorr_dcsbm(G1_bootstrap[i], G2_bootstrap[i],
                                         min_comm=min_comm, max_comm=max_comm,
                                         pooled_variance=pooled_variance,
                                         epsilon1=epsilon1, epsilon2=epsilon2)
    num_extreme = np.where(null_test_stats >= obs_test_stat)[0].size
    # two-sided p-value: twice the smaller tail, with a +1 correction so the
    # p-value is never exactly zero
    if num_extreme < num_perm / 2:
        # P(T >= t | H0) is the smaller tail
        return (2 * num_extreme + 1) / (num_perm + 1)
    else:
        # P(T <= t | H0) is the smaller tail
        return (2 * (num_perm - num_extreme) + 1) / (num_perm + 1)
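# A minimal usage sketch (not part of the original source): the graph size, edge
# density, and number of bootstrap replicates below are assumed values chosen only to
# illustrate the call signature of `dcsbm_pvalue`. It assumes numpy is imported as `np`
# and that the helpers `dcsbm_pvalue` depends on (e.g. `gcorr_dcsbm`) are importable.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n_verts = 60
    # two independent symmetric, hollow Bernoulli(0.3) adjacency matrices
    upper = np.triu(rng.uniform(size=(n_verts, n_verts)) < 0.3, k=1).astype(float)
    A = upper + upper.T
    upper = np.triu(rng.uniform(size=(n_verts, n_verts)) < 0.3, k=1).astype(float)
    B = upper + upper.T
    # the graphs are independent, so the p-value should typically not be small
    pval = dcsbm_pvalue(A, B, max_comm=2, num_perm=100)
    print(pval)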
def dcsbm_corr(n, p, r, theta, epsilon1=1e-3, epsilon2=1e-3, directed=False, loops=False):
    '''
    Sample a pair of correlated DC-SBM graphs with the same marginal edge probabilities
    '''
    Z = np.repeat(np.arange(0, np.array(n).size), n)
    R = r * np.ones((np.sum(n), np.sum(n)))
    # sample a DC-SBM with block probability matrix p and degree corrections theta
    G = sbm(n, p, dc=theta)
    # fit a DC-SBM to G to estimate the edge probability matrix P
    G_dcsbm = DCSBMEstimator(directed=False).fit(G, y=Z)
    p_mat = G_dcsbm.p_mat_
    # the estimated P can fall outside the valid range, so clip it to [epsilon1, 1 - epsilon2]
    p_mat[p_mat < epsilon1] = epsilon1
    p_mat[p_mat > 1 - epsilon2] = 1 - epsilon2
    # sample a pair of correlated graphs with marginal probabilities P and correlation R
    G1, G2 = sample_edges_corr(p_mat, R, directed, loops)
    return G1, G2
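# A minimal usage sketch (illustrative, not from the original source): sample a pair of
# rho-correlated DC-SBM graphs with `dcsbm_corr` and check that the empirical edge
# correlation is close to the requested value. Block sizes, block probabilities, and
# the degree-correction vector are assumed values; theta is normalized per block, as in
# the simulation script below.
if __name__ == "__main__":
    n = [100, 100]
    p = [[0.5, 0.2], [0.2, 0.5]]
    theta = np.linspace(100, 1, n[0])
    theta /= theta.sum()
    theta = np.concatenate([theta, theta])
    G1, G2 = dcsbm_corr(n, p, r=0.3, theta=theta)
    # empirical correlation between the off-diagonal entries of the two graphs
    mask = ~np.eye(G1.shape[0], dtype=bool)
    print(np.corrcoef(G1[mask], G2[mask])[0, 1])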
def test_DCSBM_nparams(self):
    n_verts = 3000
    n_class = 4
    graph = self.g
    labels = self.labels

    e = DCSBMEstimator(directed=True)
    e.fit(graph)
    assert e._n_parameters() == (n_verts + n_class - 1 + n_class ** 2)

    e = DCSBMEstimator(directed=True)
    e.fit(graph, y=labels)
    assert e._n_parameters() == (n_verts + n_class ** 2)

    e = DCSBMEstimator(directed=True, degree_directed=True)
    e.fit(graph, y=labels)
    assert e._n_parameters() == (2 * n_verts + n_class ** 2)

    e = DCSBMEstimator(directed=False)
    e.fit(graph, y=labels)
    assert e._n_parameters() == (n_verts + 10)
def test_DCSBM_sample(self):
    np.random.seed(8888)
    estimator = DCSBMEstimator(directed=True, loops=False)
    B = np.array([[0.9, 0.1],
                  [0.1, 0.9]])
    dc = np.random.uniform(0.25, 0.75, size=100)
    labels = _n_to_labels([50, 50])
    p_mat = _block_to_full(B, labels, (100, 100))
    p_mat = p_mat * np.outer(dc, dc)
    p_mat -= np.diag(np.diag(p_mat))
    g = sample_edges(p_mat, directed=True)

    with pytest.raises(NotFittedError):
        estimator.sample()

    estimator.fit(g, y=labels)
    with pytest.raises(ValueError):
        estimator.sample(n_samples=-1)
    with pytest.raises(TypeError):
        estimator.sample(n_samples="nope")

    estimator.p_mat_ = p_mat
    _test_sample(estimator, p_mat, n_samples=1000, atol=0.1)
def test_DCSBM_inputs(self):
    with pytest.raises(TypeError):
        DCSBMEstimator(directed="hey")
    with pytest.raises(TypeError):
        DCSBMEstimator(loops=6)
    with pytest.raises(TypeError):
        DCSBMEstimator(n_components="XD")
    with pytest.raises(ValueError):
        DCSBMEstimator(n_components=-1)
    with pytest.raises(TypeError):
        DCSBMEstimator(min_comm="1")
    with pytest.raises(ValueError):
        DCSBMEstimator(min_comm=-1)
    with pytest.raises(TypeError):
        DCSBMEstimator(max_comm="ay")
    with pytest.raises(ValueError):
        DCSBMEstimator(max_comm=-1)
    with pytest.raises(ValueError):
        DCSBMEstimator(min_comm=4, max_comm=2)

    graph = er_np(100, 0.5)
    bad_y = np.zeros(99)
    dcsbe = DCSBMEstimator()
    with pytest.raises(ValueError):
        dcsbe.fit(graph, y=bad_y)
    with pytest.raises(ValueError):
        dcsbe.fit(graph[:, :99])
    with pytest.raises(ValueError):
        dcsbe.fit(graph[..., np.newaxis])

    with pytest.raises(TypeError):
        DCSBMEstimator(cluster_kws=1)
    with pytest.raises(TypeError):
        DCSBMEstimator(embed_kws=1)
def gcorr_dcsbm(G1, G2, max_comm, pooled_variance=True, min_comm=1, epsilon1=1e-3,
                epsilon2=1e-3, Z1=None, Z2=None, return_fit=False, seed=None):
    """
    Compute a correlation test statistic based on DC-SBM fits to `G1` and `G2`.
    Note this test statistic doesn't require a shared vertex community assignment;
    known assignments can still be supplied via `Z1` and `Z2`.
    If `return_fit` is True, the fitted DC-SBM models are returned alongside the
    statistic so they can be reused and save computation time.
    """
    # if we fix the number of communities, we should also fix the number of latent
    # dimensions of the embedding; otherwise, when the algorithm chooses the number
    # of communities automatically, we also let it choose the number of latent dimensions
    if min_comm == max_comm:
        K = min_comm
    else:
        K = None
    G1_dcsbm = DCSBMEstimator(directed=False, min_comm=min_comm, max_comm=max_comm,
                              n_components=K,
                              cluster_kws={'random_state': seed}).fit(G1, y=Z1)
    G2_dcsbm = DCSBMEstimator(directed=False, min_comm=min_comm, max_comm=max_comm,
                              n_components=K,
                              cluster_kws={'random_state': seed}).fit(G2, y=Z2)
    # since the diagonal entries are forced to be zero in graphs with no loops,
    # ignore them in the calculation of the correlation
    g1 = off_diag(G1)
    g2 = off_diag(G2)
    phat = off_diag(G1_dcsbm.p_mat_)
    qhat = off_diag(G2_dcsbm.p_mat_)
    # trim the estimated probability matrices to [epsilon1, 1 - epsilon2]
    phat[phat < epsilon1] = epsilon1
    phat[phat > 1 - epsilon2] = 1 - epsilon2
    qhat[qhat < epsilon1] = epsilon1
    qhat[qhat > 1 - epsilon2] = 1 - epsilon2
    # calculate the test statistic
    if pooled_variance:
        T = np.sum((g1 - phat) * (g2 - qhat)) / np.sqrt(
            np.sum(np.square(g1 - phat)) * np.sum(np.square(g2 - qhat)))
    else:
        num_vertices = G1.shape[0]
        T = np.sum((g1 - phat) * (g2 - qhat) /
                   np.sqrt(phat * (1 - phat) * qhat * (1 - qhat))) / (
                       num_vertices * (num_vertices - 1))
    if return_fit:
        dcsbm_fit = {'G1': G1_dcsbm, 'G2': G2_dcsbm}
        return T, dcsbm_fit
    else:
        return T
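# A minimal usage sketch (illustrative, not from the original source): compute the
# DC-SBM-based correlation statistic on a correlated pair sampled with `dcsbm_corr`,
# using both the pooled-variance and the per-edge-variance versions. All parameter
# values are assumed, and `dcsbm_corr` is assumed to be importable here.
if __name__ == "__main__":
    n = [100, 100]
    p = [[0.6, 0.2], [0.2, 0.6]]
    theta = np.linspace(100, 1, n[0])
    theta /= theta.sum()
    theta = np.concatenate([theta, theta])
    G1, G2 = dcsbm_corr(n, p, r=0.4, theta=theta)
    t_pooled = gcorr_dcsbm(G1, G2, max_comm=2, seed=0)
    t_edgewise = gcorr_dcsbm(G1, G2, max_comm=2, pooled_variance=False, seed=0)
    print(t_pooled, t_edgewise)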
if args.sim == 'sbm':
    G1, G2 = sbm_corr(n, p, args.rho)
elif args.sim == 'dcsbm':
    theta = np.linspace(100, 1, n[0])
    theta /= theta.sum()
    theta = np.concatenate([theta, theta])
    G1, G2 = dcsbm_corr(n, p, args.rho, theta)

# null by block permutation
Z = community_estimation(G1, G2, min_components=max_comm)
# Z = np.repeat([0, 1], n)
G2_block_perm = block_permutation(G2, Z)

# null by parametric bootstrap
G1_dcsbm = DCSBMEstimator(directed=False).fit(G1)
G2_dcsbm = DCSBMEstimator(directed=False).fit(G2)
G1_bootstrap = G1_dcsbm.sample()[0]
G2_bootstrap = G2_dcsbm.sample()[0]

test_stats_alt['gcorr_block_perm'][i, rep] = gcorr(G1, G2, Z)
test_stats_null['gcorr_block_perm'][i, rep] = gcorr(G1, G2_block_perm, Z)
test_stats_alt['gcorr_param_bootstrap'][i, rep] = gcorr(G1, G2, Z)
test_stats_null['gcorr_param_bootstrap'][i, rep] = gcorr(G1_bootstrap, G2_bootstrap, Z)
test_stats_alt['gcorrDC_param_bootstrap'][i, rep] = gcorr_dcsbm(G1, G2, max_comm)
test_stats_null['gcorrDC_param_bootstrap'][i, rep] = gcorr_dcsbm(G1_bootstrap, G2_bootstrap, max_comm)
test_stats_alt['gcorrDC_block_perm'][i, rep] = gcorr_dcsbm(G1, G2, max_comm)
test_stats_null['gcorrDC_block_perm'][i, rep] = gcorr_dcsbm(G1, G2_block_perm, max_comm)
# compute power