Example #1
def simulation(n,
               pi,
               normal_params,
               beta_params,
               cond_ind=True,
               errors=None,
               smooth=False,
               acorn=None):
    #- Type checks
    if isinstance(normal_params, list):
        sbm_check = False
        # there are other checks to do..
    elif isinstance(normal_params, np.ndarray):
        if normal_params.ndim == 2:
            if np.array_equal(normal_params, normal_params.T):
                sbm_check = True
            else:
                msg = 'if normal_params is a 2 dimensional array it must be symmetric'
                raise ValueError(msg)
        else:
            msg = 'if normal_params is an array, it must be a 2 dimensional array'
            raise TypeError(msg)
    else:
        msg = 'normal_params must be either a list or a 2 dimensional array'
        raise TypeError(msg)

    if acorn is None:
        acorn = np.random.randint(10**6)
    np.random.seed(acorn)

    #- Multinomial trial
    counts = np.random.multinomial(n, [pi, 1 - pi])

    #- Hard code the number of blocks
    K = 2

    #- Set labels
    labels = np.concatenate((np.zeros(counts[0]), np.ones(counts[1])))

    #- number of seeds = n_{i}/10
    n_seeds = np.round(0.1 * counts).astype(int)

    #- Set training and test data
    class_train_idx = [
        range(np.sum(counts[:k]),
              np.sum(counts[:k]) + n_seeds[k]) for k in range(K)
    ]
    train_idx = np.concatenate(class_train_idx).astype(int)

    test_idx = [k for k in range(n) if k not in train_idx]

    #- Total number of seeds
    m = np.sum(n_seeds)

    #- estimate class probabilities
    pi_hats = n_seeds / m

    #- Sample from beta distributions
    beta_samples = beta_sampler(counts, beta_params)
    Z = beta_samples

    #- Sample X from a multivariate normal or an SBM, either independently of the Zs or conditionally on them
    if cond_ind:
        if sbm_check:
            A = sbm(counts, normal_params)
            ase_obj = ASE(n_elbows=1)
            X = ase_obj.fit_transform(A)
        else:
            X = MVN_sampler(counts, normal_params)
            if len(normal_params[0][0]) == 1:
                X = X[:, np.newaxis]
    else:
        if sbm_check:
            # Blow B up to an n x n matrix so that connectivity probabilities
            # can be rescaled for individual nodes.
            P = blowup(normal_params, counts)
            # Geometric mean of each node's Z values; taking raw products of
            # small Z's would risk a disconnected graph.
            scales = np.prod(Z, axis=1)**(1 / Z.shape[1])
            new_P = P * np.outer(scales, scales)  # new probability matrix
            A = sbm(np.ones(n).astype(int), new_P)
            ase_obj = ASE(n_elbows=1)
            X = ase_obj.fit_transform(A)
        else:
            X = conditional_MVN_sampler(Z=Z,
                                        rho=1,
                                        counts=counts,
                                        params=normal_params,
                                        seed=None)
            if len(normal_params[0][0]) == 1:
                X = X[:, np.newaxis]

    XZ = np.concatenate((X, Z), axis=1)

    #- Estimate normal parameters using seeds
    params = []
    for i in range(K):
        temp_mu, temp_cov = estimate_normal_parameters(X[class_train_idx[i]])
        params.append([temp_mu, temp_cov])

    #- Using conditional independence assumption (RF, KNN used for posterior estimates)
    if errors is None:
        errors = [[] for _ in range(6)]

    rf1 = RF(n_estimators=100,
             max_depth=int(np.round(np.log(Z[train_idx].shape[0]))))
    rf1.fit(Z[train_idx], labels[train_idx])

    knn1 = KNN(n_neighbors=int(np.round(np.log(Z[train_idx].shape[0]))))
    knn1.fit(Z[train_idx], labels[train_idx])

    if smooth:
        temp_pred = classify(X[test_idx], Z[test_idx], params, rf1, m=m)
        temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
        errors[0].append(temp_error)

        temp_pred = classify(X[test_idx], Z[test_idx], params, knn1, m=m)
        temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
        errors[1].append(temp_error)
    else:
        temp_pred = classify(X[test_idx], Z[test_idx], params, rf1)
        temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
        errors[0].append(temp_error)

        knn1 = KNN(n_neighbors=int(np.round(np.log(m))))
        knn1.fit(Z[train_idx], labels[train_idx])

        temp_pred = classify(X[test_idx], Z[test_idx], params, knn1)
        temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
        errors[1].append(temp_error)

    temp_pred = QDA(X[test_idx], pi_hats, params)
    temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
    errors[2].append(temp_error)

    #- Not using conditional independence assumption (RF, KNN used for classification)
    XZseeds = np.concatenate((X[train_idx], Z[train_idx]), axis=1)

    rf2 = RF(n_estimators=100, max_depth=int(np.round(np.log(m))))
    rf2.fit(XZ[train_idx], labels[train_idx])
    temp_pred = rf2.predict(XZ[test_idx])
    temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
    errors[3].append(temp_error)

    knn2 = KNN(n_neighbors=int(np.round(np.log(m))))
    knn2.fit(XZ[train_idx], labels[train_idx])

    temp_pred = knn2.predict(XZ[test_idx])
    temp_error = 1 - np.sum(temp_pred == labels[test_idx]) / len(test_idx)
    errors[4].append(temp_error)

    # NOTE: adj and features are not defined in this example; they are assumed
    # to come from the surrounding script.
    temp_accuracy = GCN(adj, features, train_idx, labels)
    temp_error = 1 - temp_accuracy
    errors[5].append(temp_error)

    return errors
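
The classify and QDA helpers called in Example #1 are not shown on this page. Below is a minimal sketch of how a product-of-posteriors classifier could combine the estimated normal parameters for X with a fitted posterior estimator for Z under the conditional-independence assumption; the function name, signature, and use of scipy.stats here are assumptions, not the original helpers.

import numpy as np
from scipy.stats import multivariate_normal

def product_posterior_classify(X, Z, params, clf):
    # Posterior estimates for the labels given Z, from an already-fitted
    # classifier (the RF or KNN above).
    z_post = clf.predict_proba(Z)
    # Class-conditional Gaussian densities for X, one column per class,
    # using the parameters estimated from the seeds.
    x_like = np.column_stack([
        multivariate_normal.pdf(X, mean=mu, cov=cov, allow_singular=True)
        for mu, cov in params
    ])
    # Under conditional independence of X and Z given the label, the joint
    # posterior is proportional to the elementwise product of the two factors.
    return np.argmax(x_like * z_post, axis=1)
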
Example #2
def wHardy_Weinberg(n,
                    m,
                    c0,
                    c1,
                    density=np.random.uniform,
                    params=[0, 1],
                    truncated=False,
                    acorn=None):
    """
    zero-inflated Z_+ weighted LSM network (model & methods)

    Let T_1, ... T_n ~iid density on [0,1].

    Let h: [0,1] to R^d and X_i = h(T_i) so that the latent positions lie on a one-dimensional curve in R^d.
    (start with h(t) = [t^2 , 2t(1-t) , (1-t)^2]^{T} so the X_i's are on Hardy-Weinberg in Delta^2 subset [0,1]^3 subset R^3.)
    (for sanity check: do ase(G) into R^3 for such an LSM, and you should get Xhat's around HW (up to orthogonal transformation).)
    Let p_ij = X_i^{T} X_j
    Let B_ij ~ Bernoulli(p_ij) and Z_ij ~ G(p_ij) be independent -- independent of each other, and independent across ij.
    Let W_ij = Z_ij * I{B_ij}, so W_ij is 0-inflated -- 0 with probability 1 - p_ij and weighted otherwise.
    (start with G(p_ij) = Poisson(c0 * p_ij) so we have a 0-inflated Poisson LSM.) 
    that's H0.
    for HA:
    generate null G.
    choose m vertices uniformly at random -- S subset V = [n].
    let H be the induced subgraph Omega(S;G).
    let the edges W_ij for this induced subgraph to be of the form 
      W_ij is 0 with probability 1 - p_ij and is Poisson(c1 * p_ij) with probability p_ij.

    so ... for the "start with" case
    we have just four parameters: n, m, c, c'.
    """

    if acorn is None:
        acorn = np.random.randint(10**6)

    np.random.seed(acorn)
    V = range(n)
    V1 = np.random.choice(V, m, replace=False)

    t = sample(n, density, params)

    X = get_latent_positions(t)

    if truncated:
        X = X[:, :2]

    P = X @ X.T

    A0 = sbm(np.ones(n).astype(int), P)

    pois = np.random.poisson
    L0 = c0 * np.ones((n, n)) * P

    Z0 = sample((n, n), pois, L0)
    W0 = A0 * Z0

    L1 = c1 * np.ones((n, n)) * P
    A1 = sbm(np.ones(n).astype(int), P)

    Z1 = sample((n, n), pois, L0)
    transplant = sample((n, n), pois, L1)

    Z1[np.ix_(V1, V1)] = transplant[np.ix_(V1, V1)]

    W1 = A1 * Z1

    ase0 = ASE(n_components=min(100, n - 1))
    ase0.fit(W0)

    ase1 = ASE(n_components=min(100, n - 1))
    ase1.fit(W1)

    return ase0, ase1
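
The get_latent_positions helper used above is likewise not shown. A minimal sketch of the Hardy-Weinberg map h(t) = [t^2, 2t(1-t), (1-t)^2]^{T} described in the docstring could look as follows (the function name is hypothetical):

import numpy as np

def hardy_weinberg_positions(t):
    # Map each t in [0, 1] onto the Hardy-Weinberg curve in Delta^2.
    # The three coordinates are nonnegative and sum to 1, so every inner
    # product p_ij = X_i^T X_j lies in [0, 1] and can serve as an edge probability.
    t = np.asarray(t)
    return np.column_stack([t**2, 2 * t * (1 - t), (1 - t)**2])
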
Example #3
    #- Total likelihood
    likeli = surface_log_likelihood + gmm_log_likelihood + prop_log_likelihoods

    #- BIC
    bic_ = 2 * likeli - temp_n_params * np.log(n)

    #- ARI
    ari_ = ari(true_labels, temp_c_hat)

    return [combo, likeli, ari_, bic_]


np.random.seed(16661)
A = binarize(right_adj)
X_hat = np.concatenate(ASE(n_components=3).fit_transform(A), axis=1)
n, d = X_hat.shape

gclust = GCLUST(max_components=15)
est_labels = gclust.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(right_labels, est_labels)]
bic = [gclust.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)

class_idx = [np.where(est_labels == u)[0] for u in unique_labels]

for k in range(len(unique_labels)):
Example #4
    
    print(likeli, ari_, bic_)
    return [combo, likeli, ari_, bic_]

X = np.array([0.2, 0.2, 0.2])

n = 1000
pi = 0.9

A, counts = generate_cyclops(X, n, pi, None)
c = [0]*counts[0]
c += [1]*counts[1]

true_labels = c

ase = ASE(n_components=3)
X_hat = ase.fit_transform(A)

gclust = GCLUST(max_components=8)
est_labels = gclust.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(c, est_labels)]
bic = [gclust.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)

class_idx = [np.where(est_labels == u)[0] for u in unique_labels]

for k in range(len(unique_labels)):