def simulation(n, pi, normal_params, beta_params, cond_ind=True, errors=None, smooth=False, acorn=None):
    """
    Run one replicate of the joint graph/covariate classification experiment and
    append the test error of each classifier to `errors`.
    """
    #- Type checks
    if isinstance(normal_params, list):
        sbm_check = False
        # there are other checks to do..
    elif isinstance(normal_params, np.ndarray):
        if normal_params.ndim == 2:
            if np.array_equal(normal_params, normal_params.T):
                sbm_check = True
            else:
                msg = 'if normal_params is a 2 dimensional array it must be symmetric'
                raise ValueError(msg)
        else:
            msg = 'if normal_params is an array, it must be a 2 dimensional array'
            raise TypeError(msg)
    else:
        msg = 'normal_params must be either a list or a 2 dimensional array'
        raise TypeError(msg)

    if acorn is None:
        acorn = np.random.randint(10**6)
    np.random.seed(acorn)

    #- Multinomial trial
    counts = np.random.multinomial(n, [pi, 1 - pi])

    #- Hard code the number of blocks
    K = 2

    #- Set labels
    labels = np.concatenate((np.zeros(counts[0]), np.ones(counts[1])))

    #- Number of seeds = n_{i} / 10
    n_seeds = np.round(0.1 * counts).astype(int)

    #- Set training and test data
    class_train_idx = [
        range(np.sum(counts[:k]), np.sum(counts[:k]) + n_seeds[k]) for k in range(K)
    ]
    train_idx = np.concatenate(class_train_idx).astype(int)
    test_idx = [k for k in range(n) if k not in train_idx]

    #- Total number of seeds
    m = np.sum(n_seeds)

    #- Estimate class probabilities
    pi_hats = n_seeds / m

    #- Sample from beta distributions
    Z = beta_sampler(counts, beta_params)

    #- Sample from multivariate normal or SBM, either independently of the Zs or not
    if cond_ind:
        if sbm_check:
            A = sbm(counts, normal_params)
            ase_obj = ASE(n_elbows=1)
            X = ase_obj.fit_transform(A)
        else:
            X = MVN_sampler(counts, normal_params)
            if len(normal_params[0][0]) == 1:
                X = X[:, np.newaxis]
    else:
        if sbm_check:
            #- A big version of B, so connectivity probabilities can vary by node
            P = blowup(normal_params, counts)
            #- Geometric mean of each node's Zs; using the raw product risks
            #- disconnecting the graph when the Zs are small
            scales = np.prod(Z, axis=1)**(1 / Z.shape[1])
            new_P = P * np.outer(scales, scales)  # new probability matrix
            A = sbm(np.ones(n).astype(int), new_P)
            ase_obj = ASE(n_elbows=1)
            X = ase_obj.fit_transform(A)
        else:
            X = conditional_MVN_sampler(Z=Z, rho=1, counts=counts, params=normal_params, seed=None)
            if len(normal_params[0][0]) == 1:
                X = X[:, np.newaxis]

    XZ = np.concatenate((X, Z), axis=1)

    #- Estimate normal parameters using seeds
    params = []
    for i in range(K):
        temp_mu, temp_cov = estimate_normal_parameters(X[class_train_idx[i]])
        params.append([temp_mu, temp_cov])

    #- Using conditional independence assumption (RF, KNN used for posterior estimates)
    if errors is None:
        errors = [[] for i in range(6)]  # six classifiers are evaluated below

    rf1 = RF(n_estimators=100, max_depth=int(np.round(np.log(Z[train_idx].shape[0]))))
    rf1.fit(Z[train_idx], labels[train_idx])

    knn1 = KNN(n_neighbors=int(np.round(np.log(Z[train_idx].shape[0]))))
    knn1.fit(Z[train_idx], labels[train_idx])

    if smooth:
        temp_pred = classify(X[test_idx], Z[test_idx], params, rf1, m=m)
        temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
        errors[0].append(temp_error)

        temp_pred = classify(X[test_idx], Z[test_idx], params, knn1, m=m)
        temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
        errors[1].append(temp_error)
    else:
        temp_pred = classify(X[test_idx], Z[test_idx], params, rf1)
        temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
        errors[0].append(temp_error)

        knn1 = KNN(n_neighbors=int(np.round(np.log(m))))
        knn1.fit(Z[train_idx], labels[train_idx])

        temp_pred = classify(X[test_idx], Z[test_idx], params, knn1)
        temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
        errors[1].append(temp_error)

    temp_pred = QDA(X[test_idx], pi_hats, params)
    temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
    errors[2].append(temp_error)

    #- Not using conditional independence assumption (RF, KNN used for classification)
    rf2 = RF(n_estimators=100, max_depth=int(np.round(np.log(m))))
    rf2.fit(XZ[train_idx], labels[train_idx])

    temp_pred = rf2.predict(XZ[test_idx])
    temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
    errors[3].append(temp_error)

    knn2 = KNN(n_neighbors=int(np.round(np.log(m))))
    knn2.fit(XZ[train_idx], labels[train_idx])

    temp_pred = knn2.predict(XZ[test_idx])
    temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
    errors[4].append(temp_error)

    #- GCN baseline; `adj` and `features` (the graph adjacency and node features)
    #- are assumed to be defined in the enclosing scope -- they are not built here
    temp_accuracy = GCN(adj, features, train_idx, labels)
    temp_error = 1 - temp_accuracy
    errors[5].append(temp_error)

    return errors
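#- A minimal usage sketch for `simulation`. The parameter formats below are
#- assumptions for illustration only: `normal_params` is taken to be a list of
#- per-class [mean, covariance] pairs (matching what `estimate_normal_parameters`
#- returns), and `beta_params` a list of per-class Beta(a, b) parameters for
#- `beta_sampler`; the GCN baseline additionally needs `adj` and `features` in scope.
#
# normal_params = [[np.zeros(2), np.eye(2)],    # class-0 mean and covariance (assumed format)
#                  [np.ones(2), np.eye(2)]]     # class-1 mean and covariance (assumed format)
# beta_params = [[2, 5], [5, 2]]                # per-class Beta(a, b) parameters (assumed format)
#
# errors = None
# for _ in range(50):                           # Monte Carlo replicates; errors accumulate in place
#     errors = simulation(n=500, pi=0.6, normal_params=normal_params,
#                         beta_params=beta_params, cond_ind=True, errors=errors)
# mean_errors = [np.mean(e) for e in errors]    # one mean test error per classifier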
def wHardy_Weinberg(n, m, c0, c1, density=np.random.uniform, params=[0, 1], truncated=False, acorn=None):
    """
    Zero-inflated Z_+ weighted LSM network (model & methods).

    Let T_1, ..., T_n ~iid density on [0, 1]. Let h: [0, 1] -> R^d and X_i = h(T_i),
    so that the latent positions lie on a one-dimensional curve in R^d.
    (Start with h(t) = [t^2, 2t(1-t), (1-t)^2]^T, so the X_i's lie on the
    Hardy-Weinberg curve in Delta^2 subset [0, 1]^3 subset R^3.)

    (Sanity check: do ase(G) into R^3 for such an LSM, and you should get Xhat's
    around HW, up to orthogonal transformation.)

    Let p_ij = X_i^T X_j. Let B_ij ~ Bernoulli(p_ij) and Z_ij ~ G(p_ij) be
    independent -- independent of each other, and independent across ij. Let
    W_ij = Z_ij * I{B_ij}, so W_ij is 0-inflated -- 0 with probability 1 - p_ij and
    weighted otherwise. (Start with G(p_ij) = Poisson(c0 * p_ij), so we have a
    0-inflated Poisson LSM.)

    That is H0. For HA: generate the null G, choose m vertices uniformly at
    random -- S subset V = [n] -- and let H be the induced subgraph Omega(S; G).
    Let the edges W_ij of this induced subgraph be 0 with probability 1 - p_ij and
    Poisson(c1 * p_ij) with probability p_ij.

    So, for the "start with" case, we have just four parameters: n, m, c0, c1.
    """
    if acorn is None:
        acorn = np.random.randint(10**6)
    np.random.seed(acorn)

    V = range(n)
    V1 = np.random.choice(V, m, replace=False)

    t = sample(n, density, params)
    X = get_latent_positions(t)
    if truncated:
        X = X[:, :2]

    P = X @ X.T

    #- Null graph: zero-inflated Poisson(c0 * p_ij) weights
    A0 = sbm(np.ones(n).astype(int), P)
    pois = np.random.poisson
    L0 = c0 * np.ones((n, n)) * P
    Z0 = sample((n, n), pois, L0)
    W0 = A0 * Z0

    #- Alternative graph: same model, but Poisson(c1 * p_ij) weights on the subgraph induced by V1
    L1 = c1 * np.ones((n, n)) * P
    A1 = sbm(np.ones(n).astype(int), P)
    Z1 = sample((n, n), pois, L0)
    transplant = sample((n, n), pois, L1)
    Z1[np.ix_(V1, V1)] = transplant[np.ix_(V1, V1)]
    W1 = A1 * Z1

    ase0 = ASE(n_components=min(100, n - 1))
    ase0.fit(W0)

    ase1 = ASE(n_components=min(100, n - 1))
    ase1.fit(W1)

    return ase0, ase1
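#- For reference, a minimal sketch of the map h described in the docstring above.
#- The actual `get_latent_positions` is defined elsewhere in the repo; the helper
#- below is a hypothetical stand-in covering only the "start with" case
#- h(t) = [t^2, 2t(1-t), (1-t)^2]^T. Its rows sum to 1, so p_ij = X_i^T X_j
#- always lies in [0, 1].
def _hardy_weinberg_curve(t):
    # t: array-like of shape (n,) with entries in [0, 1]
    t = np.asarray(t)
    return np.stack([t**2, 2 * t * (1 - t), (1 - t)**2], axis=1)  # shape (n, 3)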
    #- Total likelihood
    likeli = surface_log_likelihood + gmm_log_likelihood + prop_log_likelihoods

    #- BIC
    bic_ = 2 * likeli - temp_n_params * np.log(n)

    #- ARI
    ari_ = ari(true_labels, temp_c_hat)

    return [combo, likeli, ari_, bic_]


np.random.seed(16661)

A = binarize(right_adj)
X_hat = np.concatenate(ASE(n_components=3).fit_transform(A), axis=1)
n, d = X_hat.shape

gclust = GCLUST(max_components=15)
est_labels = gclust.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(right_labels, est_labels)]
bic = [gclust.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)
class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

for k in range(len(unique_labels)):
    print(likeli, ari_, bic_)

    return [combo, likeli, ari_, bic_]


X = np.array([0.2, 0.2, 0.2])
n = 1000
pi = 0.9

A, counts = generate_cyclops(X, n, pi, None)
c = [0] * counts[0]
c += [1] * counts[1]
true_labels = c

ase = ASE(n_components=3)
X_hat = ase.fit_transform(A)

gclust = GCLUST(max_components=8)
est_labels = gclust.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(c, est_labels)]
bic = [gclust.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)
class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

for k in range(len(unique_labels)):