Example #1
import codecs
import time

import numpy as np
import skfuzzy as fuzz
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
# davies_bouldin_score(data, labels, centroids) is a project-local variant that takes
# explicit centroids (scikit-learn's davies_bouldin_score only takes data and labels).


def run_fkmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf, labels_true, dataset_name, kk, ll):
    params = {
        'newsgroup': {
            'k': [20],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'ig': {
            'k': [13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'nips': {
            'k': [9],
            'l': [5, 7, 9, 11, 13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        }
    }
    output_file = codecs.open(dataset_name + '_fuzzy_cmeans_news_results.csv', 'w', 'utf-8')
    output_file.write('X,K,NMI,RAND,DAVIES\n')
    output_file.flush()
    for k in params[dataset_name]['k']:
        for data_str in params[dataset_name]['X']:
            data = eval(data_str)
            data = data.toarray().astype(np.float64)

            error_best = np.inf  # note: never updated; metrics for every restart are written to the CSV
            for _ in range(10):
                tick1 = time.time()
                centroids, U, _, _, errors, _, _ = fuzz.cluster.cmeans(
                    data.T,
                    k,
                    2,
                    error=1e-11,
                    maxiter=10000)
                tick2 = time.time()
                print(u'Took {} secs to train the {} model...'.format((tick2 - tick1), 'fkmeans'))

                labels_pred = np.argmax(U, axis=0)
                error = errors[-1]

                nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                rand_score = adjusted_rand_score(labels_true, labels_pred)
                davies_score = davies_bouldin_score(data, labels_pred, centroids)
                tick3 = time.time()
                print(u'Took {} secs to calculate {} metrics...'.format((tick3 - tick2), 'fkmeans'))

                output_file.write(u'{},{},{},{},{}\n'.format(data_str, k, nmi_score, rand_score, davies_score))
                output_file.flush()

                print('Execution: X: {}, k: {}'.format(data_str, k))
                print('NMI score: {}'.format(nmi_score))
                print('Rand score: {}'.format(rand_score))
                print('Davies score: {}'.format(davies_score))
                print('-----------------------------------------------\n')

    output_file.close()
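All four examples call davies_bouldin_score with three arguments (data, predicted labels, centroids), so this is a project-local helper rather than scikit-learn's two-argument metric. A minimal sketch of what such a helper might look like, assuming centroids are row vectors in the same feature space as data (hypothetical; the project's actual implementation is not shown):

import numpy as np

def davies_bouldin_score(data, labels, centroids):
    """Davies-Bouldin index with explicit centroids (lower is better). Sketch only."""
    k = centroids.shape[0]
    # Within-cluster scatter: mean distance of each cluster's points to its centroid
    scatter = np.zeros(k)
    for i in range(k):
        members = data[labels == i]
        if len(members) > 0:
            scatter[i] = np.mean(np.linalg.norm(members - centroids[i], axis=1))
    # For each cluster, keep the worst similarity ratio against every other cluster
    ratios = np.zeros(k)
    for i in range(k):
        worst = 0.0
        for j in range(k):
            if i == j:
                continue
            separation = np.linalg.norm(centroids[i] - centroids[j])
            if separation > 0:
                worst = max(worst, (scatter[i] + scatter[j]) / separation)
        ratios[i] = worst
    return float(np.mean(ratios))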
Example #2
import codecs
import time

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
# davies_bouldin_score(data, labels, centroids) is the same project-local helper used in example #1.


def run_kmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf, labels_true, dataset_name, kk, ll):
    params = {
        'newsgroup': {
            'k': [10, 15, 20, 25, 30],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'ig': {
            'k': [13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'nips': {
            'k': [9],
            'l': [5, 7, 9, 11, 13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        }
    }
    output_file = codecs.open(dataset_name + '_kmeans_news_results.csv', 'w', 'utf-8')
    output_file.write('X,K,NMI,RAND,DAVIES\n')
    for k in params[dataset_name]['k']:
        for data_str in params[dataset_name]['X']:
            data = eval(data_str)
            data = data.toarray().astype(np.float64)

            error_best = np.inf  # note: never updated; metrics for every restart are written to the CSV
            for _ in range(10):
                tick1 = time.time()
                datat = data.T  # unused here; referenced only by the commented-out experiment below
                # n, _ = data.shape
                # temp = np.diag(np.squeeze(np.asarray((data.dot(datat).dot(np.ones(n).reshape(n, 1))))))
                # d = datat.dot(np.sqrt(temp))
                estimator = KMeans(n_clusters=k, max_iter=10000)
                estimator.fit(data)
                tick2 = time.time()
                print(u'Took {} secs to train the {} model...'.format((tick2 - tick1), 'kmeans'))

                labels_pred = estimator.labels_
                centroids = estimator.cluster_centers_
                error = estimator.inertia_

                nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                rand_score = adjusted_rand_score(labels_true, labels_pred)
                davies_score = davies_bouldin_score(data, labels_pred, centroids)
                tick3 = time.time()
                print(u'Took {} secs to calculate {} metrics...'.format((tick3 - tick2), 'kmeans'))

                output_file.write(u'{},{},{},{},{}\n'.format(data_str, k, nmi_score, rand_score, davies_score))

            print('Execution: X: {}, k: {}'.format(data_str, k))
            print('NMI score: {}'.format(nmi_score))
            print('Rand score: {}'.format(rand_score))
            print('Davies score: {}'.format(davies_score))
            print('-----------------------------------------------\n')

    output_file.close()
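The four input matrices are prepared outside these functions. A hypothetical driver for the 'newsgroup' configuration, assuming the inputs are sparse count and tf-idf matrices (the functions call .toarray() on them), might look like the following; the variable names match the parameters above, but the preprocessing choices are illustrative rather than the original project's:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize

newsgroups = fetch_20newsgroups(subset='train')
labels_true = newsgroups.target

# Raw term counts, their L2-normalized form, and the corresponding tf-idf variants
X_train = CountVectorizer(max_features=2000).fit_transform(newsgroups.data)
X_train_norm = normalize(X_train)
X_train_tfidf = TfidfTransformer().fit_transform(X_train)
X_train_norm_tfidf = normalize(X_train_tfidf)

run_kmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf,
           labels_true, 'newsgroup', kk=None, ll=None)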
Example #3
import codecs
import subprocess
import time

import h5py
import numpy as np
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
# davies_bouldin_score and calculate_centroids_doc_mean are project-local helpers;
# the external './algos' binary is expected to read data.h5 and write U.csv, S.csv
# and error.csv.


def run_bin_ovnmtf(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf, labels_true, dataset_name, kk, ll):
    params = {
        'newsgroup': {
            'k': [20],
            'l': [15, 20, 25, 30],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'ig': {
            'k': [7, 10, 13, 16, 19],
            'l': [7, 10, 13, 16, 19],
            'X': ['X_train_norm_tfidf']
            # 'X': ['X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'nips': {
            'k': [9],
            'l': [6, 9, 12, 15, 18],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        }
    }

    if kk:
        filename = dataset_name + '_kk=' + str(kk) + '_ll=' + str(ll) + '_X=' + params[dataset_name]['X'][0] + '_bin_ovnmtf_news_results.csv'
        params[dataset_name]['k'] = [int(kk)]
        params[dataset_name]['l'] = [int(ll)]
    else:
        filename = dataset_name + '_bin_ovnmtf_news_results.csv'

    out_f = codecs.open(filename, 'w', 'utf-8')
    out_f.write('X,K,L,NMI,RAND,DAVIES\n')
    for k in params[dataset_name]['k']:
        for l in params[dataset_name]['l']:
            for data_str in params[dataset_name]['X']:
                data = eval(data_str)
                data = data.toarray().astype(np.float64)

                h5f = h5py.File('data.h5', 'w')
                h5f.create_dataset('X', data=data.T)
                h5f.close()

                error_best = np.inf
                for _ in range(10):
                    tick1 = time.time()
                    # U, S, V, labels_pred, _, error = fnmtf(data, k, l)

                    proc = subprocess.Popen(['./algos', 'bin_ovnmtf', str(k), str(l), '10000'],
                                            stdin=subprocess.PIPE,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE)

                    (out, err) = proc.communicate()
                    print('out: {}'.format(out))

                    U = np.genfromtxt('U.csv', delimiter=',')
                    S = np.genfromtxt('S.csv', delimiter=',')
                    # V = np.genfromtxt('V.csv', delimiter=',')
                    with open('error.csv') as f:
                        error = float(f.read())
                    labels_pred = np.argmax(U, axis=1)

                    tick2 = time.time()
                    print(u'Took {} secs to train the {} model...'.format((tick2 - tick1), 'bin_ovnmtf'))

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    rand_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, calculate_centroids_doc_mean(data, labels_pred, k))

                    out_f.write(u'{},{},{},{},{},{}\n'.format(data_str, k, l, nmi_score, rand_score, davies_score))

                    print('Execution: X: {}, k: {}, l: {}'.format(data_str, k, l))
                    print('Algo error: {}'.format(error))
                    print('NMI score: {}'.format(nmi_score))
                    print('Rand score: {}'.format(rand_score))
                    print('Davies score: {}'.format(davies_score))
                    print('-----------------------------------------------\n')

    out_f.close()
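calculate_centroids_doc_mean is not defined in this example; from the call site it should return one centroid per cluster, computed as the mean of that cluster's documents. A plausible sketch under that assumption (the real helper may differ):

import numpy as np

def calculate_centroids_doc_mean(data, labels, k):
    # One row per cluster: the mean of the documents assigned to it
    centroids = np.zeros((k, data.shape[1]))
    for i in range(k):
        members = data[labels == i]
        if len(members) > 0:
            centroids[i] = members.mean(axis=0)
    return centroids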
Example #4
import warnings

import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
# calinski_harabaz_score uses the pre-0.23 scikit-learn spelling (now calinski_harabasz_score).
# The clustering wrappers (k_means_clustering, dbscan_clustering, birch_clustering,
# meanshift_clustering), the helpers (split_data_in_clusters, wb_index, dunn_index)
# and the module-level settings (parallelism, min_clusters, metric_decimals) are
# defined elsewhere in the project.


def clustering_and_metrics(dataset, clustering_alg):

    samples_to_delete = np.array([])
    cleanlabels = np.array([])
    clusters = {}

    # Supported algorithm names (kept for reference; the dispatch below checks clustering_alg directly)
    l_clustering_alg = [
        'kmeans_++',
        'kmeans_random',
        'kmeans_pca',
        'dbscan',
        'birch',
        'meanshift',
    ]

    # Scale data
    scaleddata = StandardScaler().fit_transform(dataset)

    # Clustering phase

    if clustering_alg == 'kmeans_++':
        estimator, c_elap_time = k_means_clustering(data=scaleddata,
                                                    plot=0,
                                                    p_init='k-means++',
                                                    p_n_init=10,
                                                    p_n_jobs=parallelism)
    elif clustering_alg == 'kmeans_random':
        estimator, c_elap_time = k_means_clustering(data=scaleddata,
                                                    plot=0,
                                                    p_init='random',
                                                    p_n_init=10,
                                                    p_n_jobs=parallelism)
    elif clustering_alg == 'kmeans_pca':
        estimator, c_elap_time = k_means_clustering(data=scaleddata,
                                                    plot=0,
                                                    p_init='PCA-based',
                                                    p_n_init=10,
                                                    p_n_jobs=parallelism)
    elif clustering_alg == 'dbscan':
        estimator, c_elap_time = dbscan_clustering(data=scaleddata,
                                                   plot=0,
                                                   p_n_jobs=parallelism)
    elif clustering_alg == 'birch':
        estimator, c_elap_time = birch_clustering(data=scaleddata,
                                                  plot=0,
                                                  p_n_jobs=parallelism)
    elif clustering_alg == 'meanshift':
        estimator, c_elap_time = meanshift_clustering(data=scaleddata,
                                                      plot=0,
                                                      p_n_jobs=parallelism)

    else:
        print('Clustering algorithm not found')
        return {}, samples_to_delete, cleanlabels, {}

    # Split data in clusters
    clusters, sin_ele_clus, cleanscaleddata, cleanlabels, samples_to_delete, cluster_cnt, ignored_samples = split_data_in_clusters(
        estimator, scaleddata)

    for singleclus in clusters:
        print('Cluster {}: {}'.format(singleclus, len(clusters[singleclus])))

    # Compute clustering metrics
    clus_metrics = {}

    clus_metrics['name'] = clustering_alg
    clus_metrics['sin_ele_clus'] = sin_ele_clus
    clus_metrics['cluster_cnt'] = cluster_cnt
    clus_metrics['ignored_samples'] = ignored_samples

    # Check that more than 1 cluster was found
    if cluster_cnt <= 1:
        print('Fewer than 2 clusters found. Skipping metrics calculation')
        clus_metrics['dunn_index'] = None
        clus_metrics['calinski_harabaz_score'] = None
        clus_metrics['silhouette_score'] = None
        clus_metrics['time'] = 0
        clus_metrics['wb_index'] = None
        clus_metrics['davies_bouldin_score'] = None
    else:
        clus_metrics['time'] = round(c_elap_time, metric_decimals)
        clus_metrics['wb_index'] = float(
            round(wb_index(clusters, cleanscaleddata), metric_decimals))
        clus_metrics['dunn_index'] = float(
            round(dunn_index(clusters), metric_decimals))
        clus_metrics['calinski_harabaz_score'] = float(
            round(calinski_harabaz_score(cleanscaleddata, cleanlabels),
                  metric_decimals))
        clus_metrics['silhouette_score'] = float(
            round(
                silhouette_score(cleanscaleddata,
                                 cleanlabels,
                                 metric='euclidean',
                                 sample_size=None),
                metric_decimals))  # forcing data type due to insert error

        # Suppress the expected runtime "divide by zero" warning
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            clus_metrics['davies_bouldin_score'] = float(
                round(davies_bouldin_score(cleanscaleddata, cleanlabels),
                      metric_decimals))

    return clus_metrics, samples_to_delete, cleanlabels, clusters
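split_data_in_clusters, wb_index, dunn_index and the clustering wrappers are project-local and not shown. To illustrate the return contract the code above relies on, here is a hypothetical split_data_in_clusters that groups samples by predicted label, discards DBSCAN noise (label -1) and single-member clusters, and reports what was removed; the project's real helper may behave differently:

import numpy as np

def split_data_in_clusters(estimator, scaleddata):
    # Hypothetical reimplementation inferred from how the outputs are used above.
    labels = np.asarray(estimator.labels_)
    clusters = {}
    for idx, lab in enumerate(labels):
        clusters.setdefault(lab, []).append(scaleddata[idx])

    # DBSCAN marks noise points with label -1; count and discard them
    ignored_samples = len(clusters.pop(-1, []))

    # Single-member clusters break most internal metrics, so drop them as well
    sin_ele_clus = sum(1 for members in clusters.values() if len(members) == 1)
    keep = {lab for lab, members in clusters.items() if len(members) > 1}

    mask = np.array([lab in keep for lab in labels])
    samples_to_delete = np.where(~mask)[0]
    cleanscaleddata = scaleddata[mask]
    cleanlabels = labels[mask]
    clusters = {lab: members for lab, members in clusters.items() if lab in keep}
    cluster_cnt = len(clusters)

    return (clusters, sin_ele_clus, cleanscaleddata, cleanlabels,
            samples_to_delete, cluster_cnt, ignored_samples)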