def _insert_stats(stats_dict, docs, d_ref, z_ref, z_lda, phi, idx):
    # Flatten the topic-word matrix and the per-document word assignments
    phi_flat = np.array(phi).flatten()
    docsf = np.array([doc['w'] for doc in docs]).flatten()

    stats_dict['doc_ari'][idx] = ari(d_ref, docsf)
    stats_dict['ari'][idx] = ari(z_ref, z_lda)
    stats_dict['phi_std'][idx] = np.std(phi_flat)

    return stats_dict
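A minimal sketch of how this helper might be driven, assuming the usual aliases numpy as np and sklearn.metrics.adjusted_rand_score as ari (the imports are not part of the excerpt; the sample values below are hypothetical):

import numpy as np
from sklearn.metrics import adjusted_rand_score as ari

stats = {'doc_ari': np.zeros(3), 'ari': np.zeros(3), 'phi_std': np.zeros(3)}
docs = [{'w': [0, 0, 1]}, {'w': [1, 1, 0]}]   # two toy documents
d_ref = [0, 0, 1, 1, 1, 0]                    # reference labels, one per token
z_ref, z_lda = [0, 1, 1], [1, 0, 0]           # reference vs. inferred topic labels
phi = [[0.2, 0.8], [0.7, 0.3]]                # topic-word matrix
stats = _insert_stats(stats, docs, d_ref, z_ref, z_lda, phi, idx=0)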
Example #3
def get_accuracy(cluster_assignments, y_true, n_clusters):
    '''
    Computes the accuracy based on the provided kmeans cluster assignments
    and true labels, using the Munkres algorithm

    cluster_assignments:    array of labels, outputted by kmeans
    y_true:                 true labels
    n_clusters:             number of clusters in the dataset

    returns:    a tuple containing the accuracy and confusion matrix,
                in that order
    '''
    y_pred, confusion_matrix = get_y_preds(cluster_assignments, y_true,
                                           n_clusters)

    from sklearn.metrics import normalized_mutual_info_score as nmi
    nmi_score = nmi(y_true, y_pred)
    print('NMI: ' + str(np.round(nmi_score, 4)))

    from sklearn.metrics import adjusted_rand_score as ari
    ari_score = ari(y_true, y_pred)
    print('ARI: ' + str(np.round(ari_score, 4)))

    ProjectDir = get_project_root()
    with open(os.path.join(ProjectDir, 'Results.txt'), 'a') as my_file:
        my_file.write("\n")
        my_file.write('NMI: ' + str(np.round(nmi_score, 4)))
        my_file.write("\n")
        my_file.write('ARI: ' + str(np.round(ari_score, 4)))
        my_file.write("\n")

    # calculate the accuracy
    return np.mean(y_pred == y_true), confusion_matrix
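get_y_preds is not shown in this excerpt. A minimal sketch of what it might look like, using scipy's linear_sum_assignment (a Hungarian/Munkres implementation) and assuming cluster ids and true labels are integers in 0..n_clusters-1:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def get_y_preds(cluster_assignments, y_true, n_clusters):
    # Contingency table: rows are true labels, columns are cluster ids
    cm = confusion_matrix(y_true, cluster_assignments, labels=list(range(n_clusters)))
    # Hungarian matching: maximize matched counts by minimizing the negated table
    true_idx, cluster_idx = linear_sum_assignment(-cm)
    cluster_to_label = dict(zip(cluster_idx, true_idx))
    y_pred = np.array([cluster_to_label[c] for c in cluster_assignments])
    return y_pred, cm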
Example #4
 def form_matrix(self, result, dataset):
     n = int(sqrt(len(result)))
     res_matrix = np.empty((
         n,
         n,
     ))
     res_matrix[:] = np.nan
     if self.type == "sw":
         for result_row in result:
             p, beta, sw = result_row
             row = beta_to_row(beta)  # 50 - int(beta * 10)
             col = p_to_col(p)  # int(p * 10) - 10
             res_matrix[row, col] = sw
     if self.type == "ari":
         for result_row in result:
             p, beta, labels = result_row
             labels = labels.replace("[", "")
             labels = labels.replace("]", "")
             labels = np.fromstring(labels, sep=" ", dtype=int)
             row = beta_to_row(beta)  # 50 - int(beta * 10)
             col = p_to_col(p)  # int(p * 10) - 10
             labels_true = np.loadtxt(ds_directory + "/" +
                                      cut_extention(dataset) + ".lbs",
                                      skiprows=1)
             labels_true = labels_true.astype(int)
             res_matrix[row, col] = ari(labels_true, labels)
     return res_matrix
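beta_to_row and p_to_col are not included in the excerpt, but the inline comments above suggest what they do; a sketch under that assumption:

def beta_to_row(beta):
    # per the inline comment: 50 - int(beta * 10)
    return 50 - int(beta * 10)

def p_to_col(p):
    # per the inline comment: int(p * 10) - 10
    return int(p * 10) - 10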
Example #5
    def benchmark(self, name: str, features: np.ndarray,
                  labels: np.ndarray) -> Tuple[str, Dict]:
        """
        Returns the clustering performance results in str and dict format.

        The metrics used are as follows:
            1. Duration
            2. Adjusted RAND Score
            3. Normalized Mutual Information
            4. Davies-Bouldin Index
            5. Silhouette Score
            6. Calinski-Harabasz Score
            7. Clustering Accuracy

        Parameters
        ----------
        name: str
            The name of the benchmark.
        features: np.ndarray
            The test instances to cluster.
        labels: np.ndarray
            The test labels.

        Returns
        -------
        str
            The formatted string of the benchmark results.
        results: Dict
            The dictionary of benchmark results.
        """
        start_time = time.time()
        predictions = self.predict(features)

        results = {}

        results["name"] = name
        results["duration"] = time.time() - start_time
        results["ari"] = ari(labels_true=labels, labels_pred=predictions)
        results["nmi"] = nmi(labels_true=labels, labels_pred=predictions)
        results["dbi"] = davies_bouldin_score(features, predictions)
        results["silhouette"] = silhouette_score(features,
                                                 predictions,
                                                 metric="euclidean")
        results["ch_score"] = calinski_harabasz_score(features, predictions)
        results["clustering_accuracy"] = clustering_accuracy(
            target=labels, prediction=predictions)

        return (
            "%-9s\t%.2fs\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f" % (
                results.get("name"),
                results.get("duration"),
                results.get("dbi"),
                results.get("silhouette"),
                results.get("ch_score"),
                results.get("nmi"),
                results.get("ari"),
                results.get("clustering_accuracy"),
            ),
            results,
        )
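The metric functions used above are assumed to be imported elsewhere in the module with these sklearn aliases (clustering_accuracy is presumably a project-specific helper, e.g. Hungarian-matching accuracy):

from sklearn.metrics import (
    adjusted_rand_score as ari,
    normalized_mutual_info_score as nmi,
    davies_bouldin_score,
    silhouette_score,
    calinski_harabasz_score,
)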
def metriques(model, y_true, y_pred):
    pred1 = model.row_labels_
    nmi_ = nmi(y_true, pred1)
    ari_ = ari(y_true, pred1)
    accuracy = ACCURACY(y_true, pred1)
    print("NMI: {}\nARI: {} ".format(nmi_, ari_))
    print("ACCURACY: %s" % accuracy)
    return nmi_, ari_, accuracy
def tensfact_baseline():
    n_clusters = 81
    X_buzz = np.load('buzz_user_tensor_45.npy')
    print X_buzz.shape

    X_buzz = X_buzz[buzz_ground.keys()]
    buzz_ground1 = buzz_ground.values()

    km = KMeans(n_clusters=81, init='k-means++', n_init=1, verbose=False)
    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in xrange(10):
        km.fit(X_buzz)
        sc += nmi(buzz_ground1, km.labels_)
        sc1 += ari(buzz_ground1, km.labels_)
        sc2 += ami(buzz_ground1, km.labels_)

    print "BUZZ"
    print "nmi score %f" % (sc / float(10))
    print "ari score %f" % (sc1 / float(10))
    print "ami score %f" % (sc2 / float(10))

    X_poli = np.load('poli_user_tensor_75.npy')
    print X_poli.shape
    X_poli = X_poli[poli_ground.keys()]
    poli_ground1 = poli_ground.values()
    km1 = KMeans(n_clusters=310, init='k-means++', n_init=1, verbose=False)
    sc = 0.0
    sc1 = 0.0
    sc2 = 0.0
    for i in xrange(10):
        km1.fit(X_poli)
        sc += nmi(poli_ground1, km1.labels_)
        sc1 += ari(poli_ground1, km1.labels_)
        sc2 += ami(poli_ground1, km1.labels_)

    print "poli"
    print "nmi score %f" % (sc / float(10))
    print "ari score %f" % (sc1 / float(10))
    print "ami score %f" % (sc2 / float(10))
Example #8
def execute_algo(model, model_name, X, y, verbose=True):
    print("##############\n# {}\n##############".format(model_name))
    model.fit(X)
    res_nmi = nmi(model.row_labels_, y)
    res_ari = ari(model.row_labels_, y)
    res_acc = accuracy(model.row_labels_, y)
    if verbose:
        print("NMI =", res_nmi)
        print("ARI =", res_ari)
        print("ACC =", res_acc)
    return res_nmi, res_ari, res_acc
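Note that adjusted_rand_score and normalized_mutual_info_score are symmetric in their two arguments, so ari(model.row_labels_, y) above equals ari(y, model.row_labels_); a quick sanity check:

from sklearn.metrics import adjusted_rand_score as ari

a = [0, 0, 1, 1]
b = [1, 1, 0, 0]
assert ari(a, b) == ari(b, a) == 1.0   # identical partitions up to label permutation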
Example #9
    def train(self):
        x, y = np.load('images/64px_image_x.npy'), np.load(
            'images/64px_image_y.npy')
        x = np.reshape(x, (40000, 64, 64, 1))
        kmeans = KMeans(n_clusters=2, n_init=20)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights(
            [kmeans.cluster_centers_])

        loss = 0
        ae_loss = 0
        index = 0
        maxiter = 80000
        update_interval = 100
        index_array = np.arange(x.shape[0])
        batch_size = 16
        tol = 0.001

        # model.load_weights('DEC_model_final.h5')

        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.model.predict(x, verbose=0)
                # update the auxiliary target distribution p
                p = self.target_distribution(q)

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    print(
                        'Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f, loss=%.5f'
                        % (ite, acc, nmi, ari, loss))

                # check stop criterion - model convergence
                delta_label = np.sum(y_pred != y_pred_last).astype(
                    np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    break
            idx = np.random.randint(low=0, high=x.shape[0], size=batch_size)
            # ae_loss = ae.train_on_batch(x=x[idx], y=x[idx])
            loss = self.model.train_on_batch(x=x[idx], y=p[idx])
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

        self.model.save_weights('DEC_model_final_64px.h5')
        self.test_model()
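self.target_distribution is not shown in this excerpt; in the reference Keras DEC implementation it computes the auxiliary target distribution by squaring q and renormalizing. A sketch under that assumption, written as a plain function here (it is a method on the class in the original):

import numpy as np

def target_distribution(q):
    # Square the soft assignments and renormalize by per-cluster frequency,
    # so confident assignments are sharpened (DEC auxiliary distribution)
    weight = q ** 2 / q.sum(0)
    return (weight.T / weight.sum(1)).T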
Example #10
def for_loop_function(combo, X_hat, est_labels, true_labels, gclust_model, M):
    print(combo)
    n, d = X_hat.shape
    unique_labels, counts = np.unique(est_labels, return_counts=True)
    K = len(unique_labels)

    class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])
    temp_quad_labels = np.concatenate(class_idx[combo])

    surface_count = np.sum(counts[combo])
    surface_prop = surface_count / n

    temp_n = len(temp_quad_labels)
    temp_K = K - len(combo)
    temp_mean_params = temp_K * d
    temp_cov_params = temp_K * d * (d + 1) / 2
    temp_quad_params = (d - 1) * 2 + d - 1 + (d - 1) * (d - 2) / 2 + 1
    temp_n_params = temp_mean_params + temp_cov_params
    temp_n_params += temp_quad_params + temp_K - 1

    temp_label = min(combo)
    new_counts = np.zeros(temp_K)
    for i in range(temp_K):
        if unique_labels[i] == temp_label:
            new_counts[i] = surface_count
        else:
            new_counts[i] = counts[i]

    new_props = (new_counts / n) ** new_counts
    prop_log_likelihoods = np.sum(np.log(new_props))

    temp_c_hat = est_labels.copy()
    temp_c_hat[temp_quad_labels] = temp_label

    params, pcov = optimize.curve_fit(func, X_hat[temp_quad_labels, :2],
                                      X_hat[temp_quad_labels, 2])

    # integral = abs(monte_carlo_integration(X_hat[temp_quad_labels], func, params, M))
    delly = Delaunay(X_hat[temp_quad_labels, :-1])
    content = np.sum([calculate_simplex_content(X_hat[temp_quad_labels][del_])
                      for del_ in delly.simplices])

    quad_log_likelihood = quadratic_log_likelihood(X_hat[temp_quad_labels], params,
                                                   curve_density=False)
    quad_log_likelihood -= temp_n * np.log(content)
    gmm_log_likelihood = np.sum(gclust_model.model_.score_samples(X_hat[-temp_quad_labels]))

    log_likeli = quad_log_likelihood + gmm_log_likelihood + prop_log_likelihoods

    bic_ = 2 * log_likeli - temp_n_params * np.log(n)

    ari_ = ari(true_labels, temp_c_hat)

    print(log_likeli, ari_, bic_)
    return [combo, log_likeli, ari_, bic_]
Example #11
def calculate_ari(p, beta, dataset):
    start = time()
    dataset = ".".join([get_basename(dataset), "pts"])
    data_file = "/".join([data_directory, dataset])
    k_star = get_k_star(dataset)
    algorithm, t1, t2, t3, labels, cluster_structure = single_run(
        data_file, p, beta, k_star)
    labels_file = ".".join([cut_extention(data_file), "lbs"])
    labels_true = pd.read_csv(labels_file).to_numpy().astype(int).flatten()
    assert len(labels_true) == 1000
    ari_value = ari(labels_true=labels_true, labels_pred=labels)
    end = time()
    print("calculate ARI = {} in {:5.2f} sec".format(ari_value, end - start))
    return ari_value
def run_trial(X, labels, k):
    errors = '"'

    # Run our dbscan
    start = time()
    """
    if metric == 'seuclidean':
        db = KMeans(eps,minPts,metric=metric,metric_params={'V':V})
    else:
        db = kmean(,minPts,metric=metric)
    """
    db = KMeans(k, n_jobs=12)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start

    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        ss_score = ss(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan

    errors += '"'

    return [
        k, elapsed, ari_score, nmi_score, ss_score, vrc_score, dbs_score,
        errors
    ]
Example #13
def get_scores(x, y, n, k, dtr, dev):
    tx = tens0((n, k), dt=dtr, dev=dev)
    ty = tens0((n, k), dt=dtr, dev=dev)
    tx = tens_sel_set(tx, x, 1)
    ty = tens_sel_set(ty, y, 1)
    t = tx.t().matmul(ty)
    del tx, ty
    tt = t.max() - t
    tt = tt.cpu().numpy()
    row, col = ass(tt)
    del tt
    t = t.cpu().numpy()
    t = t[row, col].sum()
    t = t.tolist() / n
    x = x.cpu().numpy()
    y = y.cpu().numpy()
    s = {
        'nmi': nmi(x, y, average_method='geometric'),
        'ari': ari(x, y),
        'acc': t,
    }
    return s
Example #14
for index in range(1, 101):
    goldtopics = [gold[r] for r in results if math.floor(float(r)) == index]
    hyper_topics = [
        HyperLex[r] for r in results if math.floor(float(r)) == index
    ]
    spinglass_topics = [
        spinglass[r] for r in results if math.floor(float(r)) == index
    ]
    gold_count.append(len(set(goldtopics)))
    HyperLex_count.append(len(set(hyper_topics)))
    spinglass_count.append(len(set(spinglass_topics)))
    #     print(hyper_topics)
    #     print(goldtopics)
    #     exit()
    #     total_clusters.append(max(int(number.split(".")[1]) + 1 for number in hyper_topics))
    scores.append((index, ari(goldtopics, spinglass_topics)))

# HyperLex_count = [count for count in HyperLex_count if count > 1]
# spinglass_count = [count for count in spinglass_count if count > 1]
histogram(gold_count, "Gold standard")
histogram(HyperLex_count, "HyperLex")
histogram(spinglass_count, "Spinglass")
# histogram(total_clusters, "total")
print(scores)
print(gold_count)
print(HyperLex_count)
# print(total_clusters)
# print("ARI:", numpy.average(scores))
scores = sorted(scores, key=itemgetter(1))
topics = numpy.genfromtxt("topics.txt", dtype=None, skip_header=1)
print(scores)
Example #15
    def fit(self, x, y=None, maxiter=2e4, batch_size=256, tol=1e-3,
            update_interval=140, save_dir='./results/temp'):

        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / batch_size) * 5  # 5 epochs
        print('Save interval', save_interval)

        # Step 1: initialize cluster centers using k-means
        t1 = time()
        print('Initializing cluster centers with k-means.')
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])

        # Step 2: deep clustering
        # logging file
        import csv
        logfile = open(save_dir + '/dec_log.csv', 'w')
        logwriter = csv.DictWriter(logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'loss'])
        logwriter.writeheader()

        loss = 0
        index = 0
        index_array = np.arange(x.shape[0])
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.model.predict(x, verbose=0)
                p = self.target_distribution(q)  # update the auxiliary target distribution p

                # evaluate the clustering performance
                y_pred = q.argmax(1)
                if y is not None:
                    acc = np.round(metrics.acc(y, y_pred), 5)
                    nmi = np.round(metrics.nmi(y, y_pred), 5)
                    ari = np.round(metrics.ari(y, y_pred), 5)
                    loss = np.round(loss, 5)
                    logdict = dict(iter=ite, acc=acc, nmi=nmi, ari=ari, loss=loss)
                    logwriter.writerow(logdict)
                    print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' % (ite, acc, nmi, ari), ' ; loss=', loss)

                # check stop criterion
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('delta_label ', delta_label, '< tol ', tol)
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break

            # train on batch
            # if index == 0:
            #     np.random.shuffle(index_array)
            idx = index_array[index * batch_size: min((index+1) * batch_size, x.shape[0])]
            loss = self.model.train_on_batch(x=x[idx], y=p[idx])
            index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

            # save intermediate model
            if ite % save_interval == 0:
                print('saving model to:', save_dir + '/DEC_model_' + str(ite) + '.h5')
                self.model.save_weights(save_dir + '/DEC_model_' + str(ite) + '.h5')

            ite += 1

        # save the trained model
        logfile.close()
        print('saving model to:', save_dir + '/DEC_model_final.h5')
        self.model.save_weights(save_dir + '/DEC_model_final.h5')

        return y_pred
def for_loop_function(combo, X_hat, est_labels, true_labels, gclust_model, M):
    print(combo)

    # Grab number of data points and dimension of data
    n, d = X_hat.shape

    # Grab cluster labels and corresponding counts and the number of clusters
    unique_labels, counts = np.unique(est_labels, return_counts=True)
    K = len(unique_labels)

    # Partition the data by cluster
    class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

    # Combine clusters in combo into a single cluster
    temp_quad_labels = np.concatenate(class_idx[combo])

    # Grab the number of data included in the surface
    surface_count = np.sum(counts[combo])
    temp_n = len(temp_quad_labels)

    assert temp_n == surface_count

    temp_K = K - len(combo) + 1  # new number of "clusters"
    temp_mean_params = (temp_K - 1) * d  # temp_K - 1 means in d space
    temp_cov_params = (temp_K - 1) * d * (
        d - 1) / 2  # temp_K - 1 symmetric covariances in M^{d x d}
    temp_quad_params = (
        d - 1) + 1 + 1  # polynomial parameters + variance off surface

    #- Total parameters = parameters from gaussians + parameters from surface
    temp_n_params = temp_mean_params + temp_cov_params
    temp_n_params += temp_quad_params + temp_K - 1  # Include mixing proportions

    #- Give each cluster in combos the label of the smallest label in combos
    temp_label = min(combo)
    temp_c_hat = est_labels.copy()
    temp_c_hat[temp_quad_labels] = temp_label

    #- New counts vector
    new_counts = np.zeros(temp_K)
    for i in range(temp_K):
        if unique_labels[i] == temp_label:
            new_counts[i] = surface_count
        else:
            new_counts[i] = counts[i]

    #- New proportions vector
    new_props = (new_counts / n)**new_counts

    #- For BIC
    prop_log_likelihoods = np.sum(np.log(new_props))
    guess = (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
    #- Find fitted surface
    params, pcov = optimize.curve_fit(func, X_hat[temp_quad_labels, :-1],
                                      X_hat[temp_quad_labels, -1], guess)

    #- Estimate surface area (computationally expensive!!!)
    integral = abs(
        monte_carlo_integration(X_hat[temp_quad_labels], func, params, M))

    #- Find likelihood of surface
    surface_log_likelihood = quadratic_log_likelihood(X_hat[temp_quad_labels],
                                                      params,
                                                      curve_density=False)
    surface_log_likelihood -= surface_count * np.log(integral)

    #- Find likelihood of Gaussians
    gmm_log_likelihood = np.sum(gclust_model.model_.score_samples(X_hat[-temp_quad_labels]))

    #- Total likelihood
    likeli = surface_log_likelihood + gmm_log_likelihood + prop_log_likelihoods

    #- BIC
    bic_ = 2 * likeli - temp_n_params * np.log(n)

    #- ARI
    ari_ = ari(true_labels, temp_c_hat)

    return [combo, likeli, ari_, bic_]


np.random.seed(16661)
A = binarize(right_adj)
X_hat = np.concatenate(ASE(n_components=3).fit_transform(A), axis=1)
n, d = X_hat.shape

gclust = GCLUST(max_components=15)
est_labels = gclust.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(right_labels, est_labels)]
bic = [gclust.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)

class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

for k in range(len(unique_labels)):
    for combo in list(combinations(np.unique(est_labels), k + 1)):
        combo = np.array(list(combo)).astype(int)
        combos.append(combo)

M = 10**8

condensed_func = lambda combo: for_loop_function(combo, X_hat, est_labels,
                                                 right_labels, gclust, M)
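The excerpt ends with the helper; in the closely related example near the end of this page the combos are evaluated in parallel with joblib, and presumably the same pattern applies here:

from joblib import Parallel, delayed

# Hypothetical continuation, mirroring the Parallel(...) call in the later example
results = Parallel(n_jobs=15)(delayed(condensed_func)(combo) for combo in combos[1:])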
Example #18
def tensfact_baseline():
	G_buzz, N_buzz, C_buzz, G_poli, N_poli, C_poli = parse_graphs()
	n_news1 = N_buzz.shape[0]
	n_news2 = N_poli.shape[0]
	y_buzz = [0] * n_news1
	y_poli = [0] * n_news2
	y_buzz = np.array(y_buzz)
	y_poli = np.array(y_poli)
	y_buzz[91:] = 1
	y_poli[120:] = 1
	n_clusters = 81
	if not os.path.isfile('tensor_buzz.npy'):
		T = np.zeros((N_buzz.shape[0], G_buzz.shape[0], C_buzz.shape[1]))
		n_users = G_buzz.shape[0]
		n_news = N_buzz.shape[0]
		n_comm = C_buzz.shape[1]
		for i in xrange(n_news):
			for j in xrange(n_users):
				for k in xrange(n_comm):
					T[i,j,k] = N_buzz[i,j] * C_buzz[j, k] 
		np.save('tensor_buzz.npy', T)
	else:
		T_buzz = np.load('tensor_buzz.npy')
		print T_buzz.shape
		print "Buzz tensor loaded"
		#T = dtensor(T_buzz)
		#print T.shape
		#factors = parafac(T_buzz, rank=25, init='random')
		#T_buzz = tl.tensor(T_buzz)
		# Best so far [50, 100, 5]
		core, factors = tucker(T_buzz, ranks=[45, 100, 5])
		print core.shape
		print factors[0].shape
		print factors[1].shape
		#P, fit, itr, exectimes = cp_als(T, 35, init='random')
		#P, F, D, A, fit, itr, exectimes = parafac2.parafac2(T, 10, init=42)
		# Extracting news embeddings
		#X_buzz = T_buzz
		X_buzz = factors[1]
		#X_buzz = P.U[0]
		buzz_lsi = np.load('buzz_lsi.npy')
		#X_buzz = np.hstack((X_buzz, buzz_lsi))
		print X_buzz.shape	
		#caler = MinMaxScaler()
		#X_buzz = preprocessing.scale(X_buzz)
		#X_buzz = scaler.fit_transform(X_buzz)
		#assert np.where(np.isnan(X_buzz) == True)[0].shape[0] == 0
	
		#X_buzz = X_buzz[buzz_ground.keys()]

		buzz_ground1 = buzz_ground.values()

	
		km = KMeans(n_clusters=81, init='k-means++', n_init=1, verbose=False)
		print "Buzzfeed dataset's feat. extracted"
		#print X_buzz.shape 
        	#X_buzz, y_buzz = shuffle(X_buzz, y_buzz, random_state=42)
		sc = 0.0
		sc1 = 0.0
		sc2 = 0.0
		for i in xrange(10):
		        km.fit(X_buzz)
		        sc+=nmi(buzz_ground1, km.labels_)
		        sc1+=ari(buzz_ground1, km.labels_)
		        sc2+=ami(buzz_ground1, km.labels_)


		print "BUZZ"
		print "nmi score %f"%(sc/float(10))
		print "ari score %f"%(sc1/float(10))
		print "ami score %f"%(sc2/float(10))
		









	

	if not os.path.isfile('tensor_poli.npy'):
		T = np.zeros((N_poli.shape[0], G_poli.shape[0], C_poli.shape[1]))
		n_users = G_poli.shape[0]
		n_news = N_poli.shape[0]
		n_comm = C_poli.shape[1]
		for i in xrange(n_news):
			for j in xrange(n_users):
				for k in xrange(n_comm):
					T[i,j,k] = N_poli[i,j] * C_poli[j, k] 
		np.save('tensor_poli.npy', T)
	else:
		T_poli = np.load('tensor_poli.npy')
		print T_poli.shape
		print "Politifact tensor loaded"
		T = dtensor(T_poli)
		#factors = parafac(T_poli, rank=50)
		#P, fit, itr, exectimes = cp_als(T, 35,  init='random')
		# Best so far: [50, 100, 5]
		T_poli = tl.tensor(T_poli)
		core, factors = tucker(T_poli, ranks=[45, 100, 5])
		#print " Fit value, Itr and Exectimes are:"
		#print fit
		#print itr
		#print exectimes
		# Extracting news embeddings
		X_poli = factors[1]
		#X_poli = P.U[0]
		poli_lsi = np.load('poli_lsi.npy')
		
		
		#X_poli = X_poli[poli_ground.keys()]
		#X_poli = np.hstack((X_poli, poli_lsi))
		print X_poli.shape
		#X_buzz = preprocessing.scale(X_poli)
		#X_poli = scaler.fit_transform(X_poli)
		assert np.where(np.isnan(X_poli) == True)[0].shape[0] == 0
		print X_poli.shape
		print "Politifact news feats. extracted"
		poli_ground1 = poli_ground.values()
		km = KMeans(n_clusters=310, init='k-means++', n_init=1, verbose=False)
		print "Buzzfeed dataset's feat. extracted"
		#print X_buzz.shape 
        	#X_buzz, y_buzz = shuffle(X_buzz, y_buzz, random_state=42)
		sc = 0.0
		sc1 = 0.0
		sc2 = 0.0
		for i in xrange(10):
		        km.fit(X_poli)
		        sc+=nmi(poli_ground1, km.labels_)
		        sc1+=ari(poli_ground1, km.labels_)
		        sc2+=ami(poli_ground1, km.labels_)


		print "BUZZ"
		print "nmi score %f"%(sc/float(10))
		print "ari score %f"%(sc1/float(10))
		print "ami score %f"%(sc2/float(10))
T = np.zeros((n, m, l))
y = np.zeros(m)
for index, row in final.iterrows():
    T[row['user_le'], row['movie_le'], row['tag_le']] = 1
    y[row['movie_le']] = row['genre_le']

sparsity = 1 - (np.sum(T > 0) / np.product(T.shape))

model = CoClust(n_iterations=np.sum(T.shape) * 100,
                optimization_strategy=alg,
                path=output_path)
model.fit(T)

tau = model.final_tau_
n = nmi(model.y_, y, average_method='arithmetic')
a = ari(model.y_, y)

f.write(
    f"{T.shape[0]},{T.shape[1]},{T.shape[2]},,{len(set(y))},,,{tau[0]},{tau[1]},{tau[2]},,{n},,,{a},,{model._n_clusters[0]},{model._n_clusters[1]},{model._n_clusters[2]},{model.execution_time_},{sparsity},{alg}\n"
)
f.close()

gy = open(output_path + alg + "_assignments_ML_" + k + "_y.txt", 'w')
for i in range(T.shape[1]):
    gy.write(f"{i}\t{model._assignment[1][i]}\n")

gy.close()

gz = open(output_path + alg + "_assignments_ML_" + k + "_z.txt", 'w')
for i in range(T.shape[2]):
    gz.write(f"{i}\t{model._assignment[2][i]}\n")
Example #20
y_pred = torch.argmax(result['y_pred'], dim=-1)
y_true = result['y_true']
embeddings = result['embedding'].cpu().numpy()
print(y_pred.size())
print(y_true.size())
print(embeddings.shape)
'''
    calculate NMI and ARI
    normalized_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic')
    adjusted_rand_score(labels_true, labels_pred)
'''
kmeans = KMeans(n_clusters=n_class, random_state=0).fit(embeddings)
y_cluster = kmeans.labels_

nmi_score = nmi(y_true.cpu().numpy(), y_cluster)
ari_score = ari(y_true.cpu().numpy(), y_cluster)
print('nmi_score={}, ari_score={}'.format(nmi_score, ari_score))
'''
    visualize (using t-SNE)
'''
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=500)
tsne_results = tsne.fit_transform(embeddings)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
df_subset = pd.DataFrame(index=list(range(embeddings.shape[0])),
                         columns=['axis_x', 'axis_y'])
df_subset['axis_x'] = tsne_results[:, 0]
df_subset['axis_y'] = tsne_results[:, 1]
df_subset['y'] = y_true.cpu().numpy()

# plt.figure(figsize=(16,7))
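The final plotting call is commented out above; a minimal way the visualization might be finished with matplotlib, using the columns defined in df_subset:

import matplotlib.pyplot as plt

plt.figure(figsize=(16, 7))
plt.scatter(df_subset['axis_x'], df_subset['axis_y'],
            c=df_subset['y'], s=5, cmap='tab10')
plt.xlabel('axis_x')
plt.ylabel('axis_y')
plt.title('t-SNE of embeddings, colored by true label')
plt.show()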
Example #21
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(wineX)
        gmm.fit(wineX)
        SSE[k]['Wine'] = km.score(wineX)
        ll[k]['Wine'] = gmm.score(wineX)
        acc[k]['Wine']['Kmeans'] = cluster_acc(wineY.ravel(),
                                               km.predict(wineX))
        acc[k]['Wine']['GMM'] = cluster_acc(wineY.ravel(), gmm.predict(wineX))
        adjMI[k]['Wine']['Kmeans'] = ami(wineY.ravel(), km.predict(wineX))
        adjMI[k]['Wine']['GMM'] = ami(wineY.ravel(), gmm.predict(wineX))
        adjRI[k]['Wine']['Kmeans'] = ari(wineY.ravel(), km.predict(wineX))
        adjRI[k]['Wine']['GMM'] = ari(wineY.ravel(), gmm.predict(wineX))
        bic[k]['Wine']['Kmeans'] = -compute_bic(km, wineX)
        bic[k]['Wine']['GMM'] = gmm.bic(wineX)
        silh[k]['Wine']['Kmeans'] = silhouette_score(wineX, km.predict(wineX))
        silh[k]['Wine']['GMM'] = silhouette_score(wineX, gmm.predict(wineX))

        km.fit(digitX)
        gmm.fit(digitX)
        SSE[k]['Digit'] = km.score(digitX)
        ll[k]['Digit'] = gmm.score(digitX)
        acc[k]['Digit']['Kmeans'] = cluster_acc(digitY.ravel(),
                                                km.predict(digitX))
        acc[k]['Digit']['GMM'] = cluster_acc(digitY.ravel(),
                                             gmm.predict(digitX))
        adjMI[k]['Digit']['Kmeans'] = ami(digitY.ravel(), km.predict(digitX))
# Spectral clustering
# ================================================================================

from sklearn import metrics
from sklearn.cluster import SpectralClustering

chs = metrics.calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score as ari

neig = np.arange(5, 50, 5)
ychs = []
for k in neig:
    y_pred = SpectralClustering(n_clusters=3,
                                affinity='nearest_neighbors',
                                n_neighbors=k).fit_predict(X)
    s = ari(y_true, y_pred)
    ychs.append(s)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(neig, ychs)
ax.set_xlabel('numbers of neighbors')
ax.set_ylabel('ARI')
plt.show()  # inspect the plot to select the best number of clusters

t1 = time.time()
sc_pred = SpectralClustering(n_clusters=3,
                             affinity='nearest_neighbors',
                             n_neighbors=20).fit_predict(X)
t2 = time.time()
sc_t = t2 - t1
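The excerpt stops after timing the final fit. Presumably the prediction is then scored the same way as inside the loop, something like:

print('SpectralClustering: ARI = {:.4f}, fit time = {:.2f}s'.format(
    ari(y_true, sc_pred), sc_t))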
# ===================================================================
Example #23
def run_trial(X, labels, eps, minPts, metric, V):
    errors = '"'

    # Run our dbscan
    start = time()
    if metric == 'seuclidean':
        db = DBSCAN(eps,
                    minPts,
                    metric=metric,
                    metric_params={'V': V},
                    n_jobs=6)
    else:
        db = DBSCAN(eps, minPts, metric=metric, n_jobs=6)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start
    perc_noise = np.sum(pred_labels == -1) / len(pred_labels)
    n_clust = pred_labels.max()

    # Remove noisy points
    clean_idx = np.where(pred_labels != -1)
    nn_preds = pred_labels[clean_idx]
    nn_labels = labels[clean_idx]
    nn_X = X[clean_idx]

    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            ss_score = ss(X, pred_labels, metric=metric, V=V)
        else:
            ss_score = ss(X, pred_labels, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan

    try:
        nn_ari_score = ari(nn_preds, nn_labels)
    except Exception as e:
        errors += str(e) + '; '
        nn_ari_score = np.nan
    try:
        nn_nmi_score = nmi(nn_preds, nn_labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nn_nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            nn_ss_score = ss(nn_X, nn_preds, metric=metric, V=V)
        else:
            nn_ss_score = ss(nn_X, nn_preds, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        nn_ss_score = np.nan
    try:
        nn_vrc_score = vrc(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_vrc_score = np.nan
    try:
        nn_dbs_score = dbs(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_dbs_score = np.nan

    errors += '"'

    return [
        metric, eps, minPts, n_clust, perc_noise, elapsed, ari_score,
        nn_ari_score, nmi_score, nn_nmi_score, ss_score, nn_ss_score,
        vrc_score, nn_vrc_score, dbs_score, nn_dbs_score, errors
    ]
Example #24
def get_metrics(metrics,
                x,
                y_true,
                mu_true,
                y_pred,
                mu_pred,
                outliers_identified_by=-1):
    '''
    This function computes any of the metrics available in known_metrics (see below)
    and returns the requested metrics in the same order as the argument 'metrics'.
    '''
    from sklearn.metrics import adjusted_rand_score as ari
    known_metrics = [
        "RMSE", "ACC", "ARI", "DISTORSION", "n_sample", "OT_CENTERS"
    ]

    if not any(np.isin(element=known_metrics, test_elements=metrics)):
        raise Exception(
            'all desired metrics are unknown of the function get_metrics')
    if outliers_identified_by != -1:
        raise Exception(
            "outliers must be identified by '-1' in the assignment array y_true and y_pred"
        )
    if type(y_pred[0]) != np.int64 or type(y_true[0]) != np.int64:
        raise Exception("y_pred and y_true must contain integer")

    nb_outliers_true = np.sum(y_true == -1)
    # nb_outliers_pred = np.sum(y_pred==-1) # in case it is useful one day, but now it's not used

    res = np.zeros(len(metrics))
    for metric in metrics:
        # take one metric after one another and compute the corresponding piece of code
        if metric == "RMSE":
            mapp = mapping(y_true, y_pred)
            # position of RMSE
            position = np.isin(element=metrics, test_elements="RMSE")
            # compute RMSE
            rmse = RMSE(
                mu_true, mu_pred, mapp
            )  # RMSE in which a map indicate how points should be associated
            # store RMSE
            res[position] = rmse
#             print('rmse  . = '+str(rmse))

        elif metric == "RMSE_ot":
            # position of RMSE
            position = np.isin(element=metrics, test_elements="RMSE")
            # compute RMSE
            rmse = RMSE_ot(
                mu_true, mu_pred
            )  # RMSE in which the points are automatically associated so that the distance is minimal
            # store RMSE
            res[position] = rmse


#             print('RMSE_ot = '+str(rmse))

        elif metric == "ACC":
            mapp = mapping(y_true, y_pred)
            #             print('mapp in ACC'+str(mapp))
            # position of ACC
            position = np.isin(element=metrics, test_elements="ACC")
            # compute ACC
            acc = accuracy(y_true[nb_outliers_true:],
                           y_pred[nb_outliers_true:], mapp)
            # store ACC
            res[position] = acc

        elif metric == "ARI":
            # position of ARI
            position = np.isin(element=metrics, test_elements="ARI")
            # compute ARI
            ari = ari(y_true[nb_outliers_true:], y_pred[nb_outliers_true:])
            # store ARI
            res[position] = ari

        elif metric == "DISTORSION" or metric == "n_sample":
            # position of DISTORSION
            position = np.isin(element=metrics, test_elements="DISTORSION")
            # a == distorsion value obtained with data that are inliers for both y_pred and y_true against the empirical probability measure Pn (=against data)
            distorsion, inliers_qtty = my_inlier_distorsion(
                data=x, y_true=y_true, y_pred=y_pred, centers_pred=mu_pred)
            # store DISTORSION
            res[position] = distorsion
            if 'n_sample' in metrics:
                position = np.isin(element=metrics, test_elements='n_sample')
                res[position] = inliers_qtty

        elif metric == "OT_CENTERS":
            raise Exception("OT_CENTERS is not yet available in get_metrics")
            mapp = mapping(y_true, y_pred)
            # position of OT_CENTERS
            position = np.isin(element=metrics, test_elements="OT_CENTERS")
            # compute OT_CENTERS
            ot_centers = RMSE(mu_true, mu_pred, mapp)
            # store OT_CENTERS
            res[position] = ot_centers

        else:
            raise Exception("the desired metric " + str(metric) +
                            " is not supported in the function get_metrics")

    return (res)
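A hypothetical call computing only the ARI (the other branches rely on helpers such as mapping, RMSE and accuracy that are not shown in this excerpt); per the type check above, the label arrays must hold int64 values:

import numpy as np

y_true = np.array([0, 0, 1, 1, 2, 2], dtype=np.int64)
y_pred = np.array([1, 1, 0, 0, 2, 2], dtype=np.int64)
print(get_metrics(["ARI"], x=None, y_true=y_true, mu_true=None,
                  y_pred=y_pred, mu_pred=None))   # -> [1.]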
Example #25
def evaluate(labels, labels_pred):
    # Compare predicted labels to known ground-truths
    # Returns error rate, 1-NMI, 1-ARI
    return err_rate(labels, labels_pred), 1 - nmi(
        labels, labels_pred, average_method="geometric"), 1 - ari(
            labels, labels_pred)
buzz_featvec = buzz_featvec[buzz_ground.keys()]
poli_featvec = poli_featvec[poli_ground.keys()]

buzz_ground = buzz_ground.values()
poli_ground = poli_ground.values()

km = KMeans(n_clusters=81, n_init=1)
km1 = KMeans(n_clusters=310, n_init=1)

sc = 0.0
sc1 = 0.0
sc2 = 0.0
for i in xrange(10):
    km.fit(buzz_featvec)
    sc += nmi(buzz_ground, km.labels_)
    sc1 += ari(buzz_ground, km.labels_)
    sc2 += ami(buzz_ground, km.labels_)

print "BUZZ"
print "nmi score %f" % (sc / float(10))
print "ari score %f" % (sc1 / float(10))
print "ami score %f" % (sc2 / float(10))

sc = 0.0
sc1 = 0.0
sc2 = 0.0
for i in xrange(10):
    km1.fit(poli_featvec)
    sc += nmi(poli_ground, km1.labels_)
    sc1 += ari(poli_ground, km1.labels_)
    sc2 += ami(poli_ground, km1.labels_)
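The excerpt cuts off before the Politifact averages are reported; presumably the same print block as for BUZZ follows:

print "POLI"
print "nmi score %f" % (sc / float(10))
print "ari score %f" % (sc1 / float(10))
print "ami score %f" % (sc2 / float(10))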
Example #27
sparsity = 1 - (np.sum(T>0) / np.product(T.shape))
f, dt = CreateOutputFile("yelp", date = True)


output_path = f"./output/_yelp/" + dt[:10] + "_" + dt[11:13] + "." + dt[14:16] + "." + dt[17:19] + "/"
directory = os.path.dirname(output_path)
if not os.path.exists(directory):
    os.makedirs(directory)

model = CoClust(np.sum(T.shape) * 10, optimization_strategy = alg, path = output_path)
model.fit(T)

tau = model.final_tau_
nmi_x = nmi(y, model.x_, average_method='arithmetic')
ari_x = ari(y, model.x_)

f.write(f"{T.shape[0]},{T.shape[1]},{T.shape[2]},{len(set(y))},,,,{tau[0]},{tau[1]},{tau[2]},{nmi_x},,,{ari_x},,,{model._n_clusters[0]},{model._n_clusters[1]},{model._n_clusters[2]},{model.execution_time_},{sparsity},{alg}\n")
f.close()

gx = open(output_path + alg + "_assignments_"+ tensor + "_x.txt", 'w')
for i in range(T.shape[0]):
    gx.write(f"{i}\t{model._assignment[0][i]}\n")
gx.close()


gy = open(output_path + alg + "_assignments_"+ tensor + "_y.txt", 'w')
for i in range(T.shape[1]):
    gy.write(f"{i}\t{model._assignment[1][i]}\n")
gy.close()
tsne_ari, pca_ari, km_ari, sc_ari = [], [], [], []
n_features_range = range(50, n_features_max + 1, step)
for n_features in n_features_range:
    samples = 300
    X, y = generate_data(samples, n_features)
    tsne_proj = TSNE(random_state=1).fit_transform(X)
    tsne_pred = KMeans(n_clusters=2, random_state=9).fit_predict(tsne_proj)
    pca = PCA(n_components=int(n_features / 2)).fit_transform(X)
    pca_pred = KMeans(n_clusters=2, n_init=1).fit_predict(pca)
    km_pred = KMeans(init='random', n_clusters=2, n_init=10,
                     algorithm='full').fit_predict(X)
    sc_pred = SpectralClustering(n_clusters=2,
                                 affinity='nearest_neighbors',
                                 n_neighbors=20).fit_predict(X)

    t_a = ari(y, tsne_pred)
    p_a = ari(y, pca_pred)
    k_a = ari(y, km_pred)
    s_a = ari(y, sc_pred)

    tsne_ari.append(t_a)
    pca_ari.append(p_a)
    km_ari.append(k_a)
    sc_ari.append(s_a)

plt.plot(n_features_range, tsne_ari, linewidth=2, label="ARI of t-SNE")
plt.plot(n_features_range, pca_ari, linewidth=2, label="ARI of PCA")
plt.plot(n_features_range, km_ari, linewidth=2, label="ARI of k-means")
plt.plot(n_features_range, sc_ari, linewidth=2, label="ARI of SC")

plt.xlabel('n_features ')
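Presumably the figure is finished with a y-axis label, a legend, and plt.show(), e.g.:

plt.ylabel('ARI')
plt.legend()
plt.show()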
Example #29
# In[326]:


acc(y_test, y_pred_init)


# In[327]:


nmi(y_test,y_pred_init,average_method='arithmetic')


# In[328]:


ari(y_test,y_pred_init)


# ##### K-means on the encoded images

# In[329]:


encoder.compile(optimizer='adam', loss='categorical_crossentropy',metrics=["acc"])


# In[330]:


y_train_hot = to_categorical(y_train, 10)
y_test_hot = to_categorical(y_test, 10)
Example #30
num_classes_y = len(set(y))
num_classes_z = len(set(z))


f, dt = CreateOutputFile("DBLP4A", date = True)

output_path = f"./output/_DBLP4A/" + dt[:10] + "_" + dt[11:13] + "." + dt[14:16] + "." + dt[17:19] + "/"
directory = os.path.dirname(output_path)
if not os.path.exists(directory):
    os.makedirs(directory)

model = CoClust(np.sum(np.shape(T)) * 10, optimization_strategy = alg, path = output_path)
model.fit(T)
tau = model.final_tau_
nmi_y = nmi(y, model.y_, average_method='arithmetic')
ari_y = ari(y, model.y_)
nmi_z = nmi(z, model.z_, average_method='arithmetic')
ari_z = ari(z, model.z_)

sparsity = 1 - (np.sum(T>0) / np.product(T.shape))

f.write(f"{T.shape[0]},{T.shape[1]},{T.shape[2]},,{num_classes_y},{num_classes_z},,{tau[0]},{tau[1]},{tau[2]},,{nmi_y},{nmi_z},,{ari_y},{ari_z},{model._n_clusters[0]},{model._n_clusters[1]},{model._n_clusters[2]},{model.execution_time_},{sparsity},{alg}\n")
f.close()

gx = open(output_path + alg + "_assignments_x.txt", 'w')
gy = open(output_path + alg + "_assignments_y.txt", 'w')
gz = open(output_path + alg + "_assignments_z.txt", 'w')
for i in range(T.shape[0]):
    gx.write(f"{i}\t{model._assignment[0][i]}\n")
for i in range(T.shape[1]):
    gy.write(f"{i}\t{model._assignment[1][i]}\n")
Example #31
A, counts = generate_cyclops(X, n, pi, None)
c = [0]*counts[0]
c += [1]*counts[1]

true_labels = c

ase = ASE(n_components=3)
X_hat = ase.fit_transform(A)

gclust_model = GCLUST(max_components=8)
est_labels = gclust_model.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust_model.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(c, est_labels)]
bic = [gclust_model.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)

class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

for k in range(len(unique_labels)):
    for combo in list(combinations(np.unique(est_labels), k+1)):
        combo = np.array(list(combo)).astype(int)
        combos.append(combo)

M = 10**8

condensed_func = lambda combo : for_loop_function(combo, X_hat, est_labels, true_labels, gclust_model, M)
results = Parallel(n_jobs=15)(delayed(condensed_func)(combo) for combo in combos[1:])