def adjusted_rand_index():
	"""Print similarity scores between consecutive streamed histogram partitions.

	Two passes are made, each collecting every item yielded by a
	Streaming_AbstractGenerator.StreamAbsGen iterator into a list:

	1. "TextHistogramPartition" over a fixed list of /var/log files.
	2. "DictionaryHistogramPartition" over Streaming_SetPartitionAnalytics.txt.

	For each pass, every histogram is compared with the one before it: both
	are converted with tocluster() and the Adjusted Rand Index and Adjusted
	Mutual Information of the truncated label sequences are printed.

	Takes no arguments and returns None; all results go to stdout.
	"""
	#The text file is updated by a stream of data
	# Alternative stream sources, kept for reference:
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("file","StreamingData.txt")
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("AsFer_Encoded_Strings","NeuronRain")
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("Socket_Streaming","localhost")
	# NOTE(review): /var/log/kern.log appears twice in this list - confirm that is intended.
	inputf1=Streaming_AbstractGenerator.StreamAbsGen("TextHistogramPartition",["/var/log/kern.log","/var/log/syslog","/var/log/ufw.log","/var/log/dmesg","/var/log/kern.log"])
	histograms=[]
	for p in inputf1:
		histograms.append(p)
	# One-off comparison of the first two partitions, capped at 20000 labels each.
	# NOTE(review): requires at least two histograms, and presumably both label
	# sequences must end up the same length after slicing - verify against tocluster().
	ari=adjusted_rand_score(tocluster(histograms[0],"Text")[:20000],tocluster(histograms[1],"Text")[:20000])
	print "Adjusted Rand Index of first two histogram set partitions(truncated):",ari
	prev=0
	for n in range(1,len(histograms)):
		# Keep 90% of the shorter histogram's len() so both slices have a common bound.
		# NOTE(review): the bound comes from len(histogram), not from the length of
		# the tocluster() output - confirm the two are comparable.
		truncatedlen=int(min(len(histograms[prev]),len(histograms[n]))*0.9)
		ari=adjusted_rand_score(tocluster(histograms[prev],"Text")[:truncatedlen],tocluster(histograms[n],"Text")[:truncatedlen])
		print "Adjusted Rand Index(truncated):",ari
		ami=adjusted_mutual_info_score(tocluster(histograms[prev],"Text")[:truncatedlen],tocluster(histograms[n],"Text")[:truncatedlen])
		print "Adjusted Mutual Info Index(truncated):",ami
		prev=n
	#################################################################
	# Second pass: same pairwise comparison on dictionary-style histograms.
	histograms=[]
	inputf2=Streaming_AbstractGenerator.StreamAbsGen("DictionaryHistogramPartition","Streaming_SetPartitionAnalytics.txt")
	for p in inputf2:
		histograms.append(p)
	prev=0
	print "histograms:",histograms
	for n in range(1,len(histograms)):
		truncatedlen=int(min(len(histograms[prev]),len(histograms[n]))*0.9)
		ari=adjusted_rand_score(tocluster(histograms[prev],"Dict")[:truncatedlen],tocluster(histograms[n],"Dict")[:truncatedlen])
		print "Adjusted Rand Index (truncated):",ari
		ami=adjusted_mutual_info_score(tocluster(histograms[prev],"Dict")[:truncatedlen],tocluster(histograms[n],"Dict")[:truncatedlen])
		print "Adjusted Mutual Info Index (truncated):",ami
		prev=n
示例#2
0
def static_test():
    """Benchmark Tri and Autoclust on four fixed CSV datasets.

    For each dataset under ``data/`` the first two columns are used as point
    coordinates and the last column as the reference labelling.  Both
    clusterers are timed, scored with the adjusted Rand index, and one result
    dictionary per dataset is appended to the file ``res``.
    """
    for dataset in ('aggregation', 'compound', 'moons', 'circles'):
        table = np.genfromtxt('data/' + dataset + '.csv', delimiter=',')
        points = table[:, :2]
        truth = list(table[:, -1])
        n_points = len(truth)

        # Triangulation-based clusterer: time construction only.
        t0 = timer()
        tri_model = Tri(points)
        tri_elapsed = timer() - t0
        tri_assigned = labelset_to_labels(tri_model.labels, n_points)
        tri_ari = adjusted_rand_score(truth, tri_assigned)

        # Autoclust: same protocol.
        t0 = timer()
        auto_model = Autoclust(points)
        auto_elapsed = timer() - t0
        auto_assigned = labelset_to_labels(auto_model.labels, n_points)
        auto_ari = adjusted_rand_score(truth, auto_assigned)

        # Key names (including the singular 'tri_label') are part of the
        # output format and must stay as they are.
        summary = {
            'labels': truth,
            'tri_label': tri_assigned,
            'tri_score': tri_ari,
            'tri_time': tri_elapsed,
            'auto_labels': auto_assigned,
            'auto_score': auto_ari,
            'auto_time': auto_elapsed,
            'name': dataset,
        }

        with open('res', 'a') as sink:
            print(summary, file=sink)
示例#3
0
def train(times, X, y,c, lea, ep1, ep2, lamda1, lamda2 ):
    """Run the `sof` clustering `times` times and print quality statistics.

    Each run's predicted labels are re-numbered through an optimal one-to-one
    assignment on the confusion matrix before scoring, so that cluster ids
    line up with the true labels.  Per-run accuracy and Rand index are
    printed, followed by the last run's confusion matrix, the elapsed time
    and max/mean/std of ARI, RI and accuracy over all runs.

    Parameters:
        times: number of repetitions.
        X, y: data and true labels, forwarded to `sof`.
        c: NOTE(review): accepted but never used - `sof` is always called
            with the literal c=1; confirm whether this should be passed on.
        lea: forwarded to `sof` as `gamma`.
        ep1, ep2, lamda1, lamda2: forwarded to `sof` unchanged.

    Returns None; everything is printed to stdout.
    """
    t0 = time.time()
#     times = 1
    # for lea in [0.0001, 0.00001, 0.000001]:
#    lea = .00001
    print 'learn={}, ep1={}, ep2={}, la1={}, la2={}'.format(lea, ep1, ep2, lamda1, lamda2)
    ari,ri,accu = [], [], []
    for ddd in range(times):
        # Number of clusters is taken from the number of distinct true labels.
        y_pred_old = sof(X, y, k=len(np.unique(y)), c=1, 
                                 lamda1=lamda1,lamda2=lamda2, mu=2, 
                                 gamma=lea, ep1=ep1, ep2=ep2 )
        # Negating the confusion matrix turns the minimising assignment solver
        # into a maximiser of matched samples.
        row, col = linear_sum_assignment(-confusion_matrix(y, y_pred_old))
        y_pred = np.copy(y_pred_old)
        # Relabel from the untouched copy so earlier rewrites cannot cascade.
        # assumes label values coincide with confusion-matrix indices (0..k-1) - TODO confirm
        for i, q in enumerate(col):
            y_pred[y_pred_old==q] = i
        ari.append( adjusted_rand_score(y,y_pred) )
        ri.append(rand_score(y, y_pred))
        accu.append(accuracy_score(y,y_pred))
        print '\taccu={}, RI={}'.format(accuracy_score(y,y_pred),rand_score(y, y_pred))
    # print 'ARI: ', adjusted_rand_score(y,y_pred)
    # print 'RI: ', rand_score(y, y_pred)
    # print 'Accu: ', accuracy_score(y,y_pred)
            

    # Confusion matrix of the LAST run only.
    # NOTE(review): y_pred is unbound here when times == 0 (NameError).
    print confusion_matrix(y, y_pred)
    # print y_pred
    print 'time, ', time.time()-t0
    print 'title\tmax\tmean\tstd'
    print 'ARI, ', np.array(ari).max(), np.array(ari).mean(), np.array(ari).std()
    print 'RI, ', np.array(ri).max(), np.array(ri).mean(), np.array(ri).std()
    print 'Accu, ', np.array(accu).max(), np.array(accu).mean(), np.array(accu).std()
    print ''
def run_fkmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf, labels_true, dataset_name, kk, ll):
    """Run fuzzy c-means 10 times per (k, matrix) setting and log the scores.

    For the chosen dataset, every configured cluster count ``k`` is combined
    with each of the four input matrices.  Each combination is clustered ten
    times with ``fuzz.cluster.cmeans``; NMI, adjusted Rand index and the
    Davies-Bouldin score of every run are printed and appended to
    ``<dataset_name>_fuzzy_cmeans_news_results.csv``.

    Parameters:
        X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf: input
            matrices; each must provide ``toarray()``.
        labels_true: reference labels used for NMI / ARI.
        dataset_name: one of 'newsgroup', 'ig', 'igtoy', 'nips'.
        kk, ll: unused; kept so the signature matches the other runners.

    Raises:
        KeyError: if ``dataset_name`` is not a configured dataset.
    """
    all_matrices = ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
    params = {
        'newsgroup': {
            'k': [20],
            'X': list(all_matrices)
        },
        'ig': {
            'k': [13],
            'X': list(all_matrices)
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': list(all_matrices)
        },
        'nips': {
            'k': [9],
            'l': [5, 7, 9, 11, 13],
            'X': list(all_matrices)
        }
    }
    # Explicit name -> matrix table; replaces eval() on the configured names.
    matrices = {
        'X_train': X_train,
        'X_train_norm': X_train_norm,
        'X_train_tfidf': X_train_tfidf,
        'X_train_norm_tfidf': X_train_norm_tfidf,
    }
    # Look the dataset up before creating the file so an unknown name does
    # not leave an empty results file behind.
    config = params[dataset_name]

    # The context manager guarantees the file is closed even if a run fails.
    with codecs.open(dataset_name + '_fuzzy_cmeans_news_results.csv', 'w', 'utf-8') as output_file:
        output_file.write('X,K,NMI,RAND,DAVIES\n')
        output_file.flush()
        for k in config['k']:
            for data_str in config['X']:
                data = matrices[data_str].toarray().astype(np.float64)

                for _ in range(10):
                    tick1 = time.time()
                    # cmeans expects features x samples, hence the transpose;
                    # only the centroids and membership matrix are needed.
                    centroids, U = fuzz.cluster.cmeans(
                        data.T,
                        k,
                        2,
                        error=0.00000000001,
                        maxiter=10000)[:2]
                    tick2 = time.time()
                    print(u'Took {} secs to train the {} model...'.format((tick2 - tick1), 'fkmeans'))

                    # Harden the fuzzy memberships: most likely cluster per sample.
                    labels_pred = np.argmax(U, axis=0)

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    # Named ari_score so it cannot shadow a rand_score function.
                    ari_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, centroids)
                    tick3 = time.time()
                    print(u'Took {} secs to calculate {} metrics...'.format((tick3 - tick2), 'fkmeans'))

                    output_file.write(u'{},{},{},{},{}\n'.format(data_str, k, nmi_score, ari_score, davies_score))
                    # Flush per run so partial results survive an interruption.
                    output_file.flush()

                    print('Execution: X: {}, k: {}'.format(data_str, k))
                    print('NMI score: {}'.format(nmi_score))
                    print('Rand score: {}'.format(ari_score))
                    print('Davies score: {}'.format(davies_score))
                    print('-----------------------------------------------\n')
def test_non_consicutive_labels():
    # Regression test: label sets whose ids have gaps must score the same as
    # their gap-free equivalents.
    expected_hcv = (0.67, 0.42, 0.52)
    gapped_cases = (
        ([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2]),
        ([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]),
    )
    for truth, predicted in gapped_cases:
        h, c, v = homogeneity_completeness_v_measure(truth, predicted)
        for measured, wanted in zip((h, c, v), expected_hcv):
            assert_almost_equal(measured, wanted, 2)

    # The adjusted Rand index must not depend on the label numbering either.
    reference = [0, 0, 0, 1, 1, 1]
    ari_values = [adjusted_rand_score(reference, predicted)
                  for predicted in ([0, 1, 0, 1, 2, 2], [0, 4, 0, 4, 2, 2])]
    for ari in ari_values:
        assert_almost_equal(ari, 0.24, 2)
示例#6
0
def assert_fit_predict_correct(model, X):
    """Assert that ``fit(X).predict(X)`` and ``fit_predict(X)`` agree.

    The model is deep-copied before any fitting so both code paths start
    from the same unfitted state.  Agreement is checked up to a relabelling
    of the clusters (adjusted Rand index of exactly 1.0).
    """
    twin = copy.deepcopy(model)

    two_step_labels = model.fit(X).predict(X)
    one_step_labels = twin.fit_predict(X)

    agreement = adjusted_rand_score(two_step_labels, one_step_labels)
    assert agreement == 1.0
def test_bayesian_mixture_predict_predict_proba():
    # Bayesian counterpart of test_gaussian_mixture_predict_predict_proba():
    # predict() must agree with the argmax of predict_proba() and recover the
    # generating labels, for every prior type / covariance type combination.
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    not_fitted_msg = ("This BayesianGaussianMixture instance is not fitted "
                      "yet. Call 'fit' with appropriate arguments before "
                      "using this method.")
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            samples = rand_data.X[covar_type]
            true_labels = rand_data.Y
            model = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Predicting before fitting must raise NotFittedError.
            assert_raise_message(NotFittedError, not_fitted_msg,
                                 model.predict, samples)

            model.fit(samples)
            hard_labels = model.predict(samples)
            proba_labels = model.predict_proba(samples).argmax(axis=1)
            assert_array_equal(hard_labels, proba_labels)
            assert_greater_equal(
                adjusted_rand_score(true_labels, hard_labels), .95)
def run_kmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf, labels_true, dataset_name, kk, ll):
    """Run k-means 10 times per (k, matrix) setting and log the scores.

    For the chosen dataset, every configured cluster count ``k`` is combined
    with each of the four input matrices.  Each combination is clustered ten
    times with ``KMeans``; NMI, adjusted Rand index and the Davies-Bouldin
    score of every run are written to
    ``<dataset_name>_kmeans_news_results.csv``.  The scores of the last run
    of each combination are also printed.

    Parameters:
        X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf: input
            matrices; each must provide ``toarray()``.
        labels_true: reference labels used for NMI / ARI.
        dataset_name: one of 'newsgroup', 'ig', 'igtoy', 'nips'.
        kk, ll: unused; kept so the signature matches the other runners.

    Raises:
        KeyError: if ``dataset_name`` is not a configured dataset.
    """
    all_matrices = ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
    params = {
        'newsgroup': {
            'k': [10, 15, 20, 25, 30],
            'X': list(all_matrices)
        },
        'ig': {
            'k': [13],
            'X': list(all_matrices)
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': list(all_matrices)
        },
        'nips': {
            'k': [9],
            'l': [5, 7, 9, 11, 13],
            'X': list(all_matrices)
        }
    }
    # Explicit name -> matrix table; replaces eval() on the configured names.
    matrices = {
        'X_train': X_train,
        'X_train_norm': X_train_norm,
        'X_train_tfidf': X_train_tfidf,
        'X_train_norm_tfidf': X_train_norm_tfidf,
    }
    # Look the dataset up before creating the file so an unknown name does
    # not leave an empty results file behind.
    config = params[dataset_name]

    # The context manager guarantees the file is closed even if a run fails.
    with codecs.open(dataset_name + '_kmeans_news_results.csv', 'w', 'utf-8') as output_file:
        output_file.write('X,K,NMI,RAND,DAVIES\n')
        for k in config['k']:
            for data_str in config['X']:
                data = matrices[data_str].toarray().astype(np.float64)

                for _ in range(10):
                    tick1 = time.time()
                    estimator = KMeans(n_clusters=k, max_iter=10000)
                    estimator.fit(data)
                    tick2 = time.time()
                    print(u'Took {} secs to train the {} model...'.format((tick2 - tick1), 'kmeans'))

                    labels_pred = estimator.labels_
                    centroids = estimator.cluster_centers_

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    # Named ari_score so it cannot shadow a rand_score function.
                    ari_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, centroids)
                    tick3 = time.time()
                    print(u'Took {} secs to calculate {} metrics...'.format((tick3 - tick2), 'kmeans'))

                    output_file.write(u'{},{},{},{},{}\n'.format(data_str, k, nmi_score, ari_score, davies_score))

                # Summary is printed once per combination and reflects the
                # LAST of the ten runs (unchanged from the original layout).
                print('Execution: X: {}, k: {}'.format(data_str, k))
                print('NMI score: {}'.format(nmi_score))
                print('Rand score: {}'.format(ari_score))
                print('Davies score: {}'.format(davies_score))
                print('-----------------------------------------------\n')
示例#9
0
  def test_nn_classifier(self):
    """classify_nearest must recover the blob labels from two seed labels."""
    graphs, truth = self._make_blob_graphs(k=4)
    # Keep only the first and last label; every other entry is overwritten
    # with -1 (presumably the "unlabeled" marker - verify against the API).
    seeds = truth.copy()
    seeds[1:-1] = -1

    for graph in graphs:
      predicted = graph.classify_nearest(seeds)
      agreement = adjusted_rand_score(truth, predicted)
      self.assertGreater(agreement, 0.95)
示例#10
0
  def test_harmonic_classifier(self):
    """classify_harmonic (with CMN) must recover the blob labels from two seeds."""
    graphs, truth = self._make_blob_graphs(k=4)
    # Keep only the first and last label; every other entry is overwritten
    # with -1 (presumably the "unlabeled" marker - verify against the API).
    seeds = truth.copy()
    seeds[1:-1] = -1

    for graph in graphs:
      predicted = graph.classify_harmonic(seeds, use_CMN=True)
      agreement = adjusted_rand_score(truth, predicted)
      self.assertGreater(agreement, 0.95)
 def calculate_scores(self):
   """Compute and store the clustering quality scores on this instance.

   Sets v_measure, complete, adjusted_mutual and adjusted_rand from the
   pair (self.c, self.labels), silhouette from (self.x, self.c), and
   purity / partial_purity from self.__purity__().
   """
   x, c, labels = self.x, self.c, self.labels
   # Metrics that compare the two labelings, in the order they are stored.
   pairwise_metrics = (
     ('v_measure', v_measure_score),
     ('complete', completeness_score),
     ('adjusted_mutual', adjusted_mutual_info_score),
     ('adjusted_rand', adjusted_rand_score),
   )
   for attribute, metric in pairwise_metrics:
     setattr(self, attribute, metric(c, labels))
   # Silhouette is computed on the data with the `c` labeling.
   self.silhouette = silhouette_score(x, c)
   purity_pair = self.__purity__()
   self.purity, self.partial_purity = purity_pair
示例#12
0
  def test_lgc_classifier(self):
    """classify_lgc (RBF kernel) must recover the blob labels from two seeds."""
    graphs, truth = self._make_blob_graphs(k=11)
    # Keep only the first and last label; every other entry is overwritten
    # with -1 (presumably the "unlabeled" marker - verify against the API).
    seeds = truth.copy()
    seeds[1:-1] = -1

    lgc_options = dict(kernel='rbf', alpha=0.2, tol=1e-3, max_iter=30)
    for graph in graphs:
      predicted = graph.classify_lgc(seeds, **lgc_options)
      agreement = adjusted_rand_score(truth, predicted)
      self.assertGreater(agreement, 0.95)
示例#13
0
文件: hotel.py 项目: galindus/IAA
def __adjusted_rand_index(generated):
    """Score a generated assignment against the fixed 4 x 5 reference layout.

    The reference labelling is 20 items in 4 groups of 5 consecutive items
    (0,0,0,0,0,1,1,...,3).  The predicted labelling is the sequence of values
    of ``generated`` in its iteration order.  Both sequences are
    pretty-printed before scoring.

    Parameters:
        generated: mapping whose values are the predicted group of each item.

    Returns:
        The adjusted Rand index between the reference and the prediction.
    """
    # Reference assignment: group id repeated five times for each of 4 groups.
    expected = [group for group in range(4) for _ in range(5)]
    # values() exists on both Python 2 and 3 dicts; itervalues() is
    # Python 2 only and raises AttributeError on Python 3.
    predicted = list(generated.values())
    pprint(predicted)
    pprint(expected)
    return adjusted_rand_score(expected, predicted)
示例#14
0
def test_unsupervised_scores():
    # Clustering evaluated against known labels: the 'ari' entry of SCORERS
    # must give the same value as calling adjusted_rand_score directly.
    # (There are no truly unsupervised SCORERS yet.)
    features, targets = make_blobs(random_state=0, centers=2)
    train_X, test_X, _, test_y = train_test_split(features, targets,
                                                  random_state=0)
    clusterer = KMeans(n_clusters=3)
    clusterer.fit(train_X)
    via_scorer = SCORERS['ari'](clusterer, test_X, test_y)
    direct = adjusted_rand_score(test_y, clusterer.predict(test_X))
    assert_almost_equal(via_scorer, direct)
def test_unsupervised_scorers():
    """Check the clustering scorer against a direct metric call."""
    # There are no truly unsupervised scorers yet, so the clustering is
    # compared with the known blob labels.
    features, targets = make_blobs(random_state=0, centers=2)
    train_X, test_X, _, test_y = train_test_split(features, targets,
                                                  random_state=0)
    clusterer = KMeans(n_clusters=3)
    clusterer.fit(train_X)
    via_scorer = SCORERS['adjusted_rand_score'](clusterer, test_X, test_y)
    direct = adjusted_rand_score(test_y, clusterer.predict(test_X))
    assert_almost_equal(via_scorer, direct)
示例#16
0
def calc(center):
    """Score Tri and Autoclust on random blob datasets for one `center` value.

    Repeats 333 times over sample counts 100..500 (step 100) and feature
    counts 2..4.  Each dataset is generated with a freshly drawn seed; the
    adjusted Rand index of both clusterers is appended, together with the
    labels and the seed, to a file named ``S<n_sample>F<n_feature>C<center>``.
    """
    for _ in range(333):
        for n_sample in range(100, 501, 100):
            for n_feature in range(2, 5):
                # The seed is recorded so any single dataset can be rebuilt.
                seed = np.random.randint(0, 10000)
                points, truth = datasets.make_blobs(
                    n_samples=n_sample, n_features=n_feature,
                    cluster_std=0.5, centers=center, random_state=seed)
                truth = list(truth)

                tri_model = Tri(points)
                tri_assigned = labelset_to_labels(tri_model.labels, n_sample)
                tri_ari = adjusted_rand_score(truth, tri_assigned)

                auto_model = Autoclust(points)
                auto_assigned = labelset_to_labels(auto_model.labels, n_sample)
                auto_ari = adjusted_rand_score(truth, auto_assigned)

                # Key names (including the singular 'tri_label') are part of
                # the output format.
                record = {
                    'labels': truth,
                    'tri_label': tri_assigned,
                    'tri_score': tri_ari,
                    'auto_labels': auto_assigned,
                    'auto_score': auto_ari,
                    'seed': seed,
                }

                out_name = ''.join(
                    ['S', str(n_sample), 'F', str(n_feature), 'C', str(center)])
                with open(out_name, 'a') as sink:
                    print(record, file=sink)
示例#17
0
	def evaluate( self, partition, clustered_ids ):
		"""Score `partition` against the known classes of `clustered_ids`.

		Returns an empty dict when no class information is available,
		otherwise a dict with the keys "external-nmi", "external-ami" and
		"external-ari".
		"""
		if not self.has_class_info():
			return {}
		# Look up the class of every clustered item, in the same order as
		# the partition, so the two labelings can be compared position-wise.
		true_classes = np.zeros( len(clustered_ids) )
		for position, item_id in enumerate(clustered_ids):
			true_classes[position] = self.class_map[item_id]
		return {
			"external-nmi": normalized_mutual_info_score( true_classes, partition ),
			"external-ami": adjusted_mutual_info_score( true_classes, partition ),
			"external-ari": adjusted_rand_score( true_classes, partition ),
		}
示例#18
0
def Rand_index_cal(infile, infile2, prefix):
    """Calculate the adjusted Rand index between two clustering results.

    Both input files are parsed with prepare_rand_list(); the resulting
    label lists are compared with sklearn's adjusted_rand_score
    (from sklearn.metrics.cluster import adjusted_rand_score).

    Returns a single tab-separated, newline-terminated line of the form
    "<prefix>\\tadjusted_rand_score =\\t<score>".
    """
    first_clustering = prepare_rand_list(infile)
    second_clustering = prepare_rand_list(infile2)
    score = adjusted_rand_score(first_clustering, second_clustering)
    return "%s\tadjusted_rand_score =\t%f\n" % (prefix, score)
def sklearn_measures(U, V):
    #     http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    print U_labels, V_labels
#     V2_labels = np.nonzero(V2)[1]
    print 'entro(U)=',sym.entropy(U_labels),'entro(V)=',sym.entropy(V_labels), 'entro(U,V)=',sym.mutual_info_score(U_labels, V_labels)
    res = [ ['ari', 'nmi', 'ami', 'vm' ], \
            [ sym.adjusted_rand_score(U_labels, V_labels),\
              sym.normalized_mutual_info_score(U_labels, V_labels),\
              sym.adjusted_mutual_info_score(U_labels, V_labels),\
              sym.v_measure_score(U_labels, V_labels)]]
    print res
    return res
示例#20
0
    def evaluate(input_matrix, eigen_order):
        """Return (ARI of the predicted clusters, sign-prediction accuracy).

        Uses k, group_number, group_size, true_Q and N from the enclosing
        scope.
        """
        _, assigned = predict_cluster_labels(input_matrix, k, eigen_order)

        # Ground truth: group id g repeated group_size times, for each group.
        ground_truth = [group
                        for group in range(group_number)
                        for _ in range(group_size)]
        arc = adjusted_rand_score(ground_truth, assigned)

        # Partition-based sign prediction: fraction of the N x N entries
        # whose predicted sign equals the true one.
        predicted_signs = predict_signs_via_partition(assigned)
        n_matching = np.count_nonzero(true_Q == predicted_signs)
        p_acc = n_matching / (N * N)
        return arc, p_acc
示例#21
0
def main():
    """Cluster the tic-tac-toe data with SFCMdd and export the best run.

    The first nine columns of every row are used as features.  The fuzzy
    k-medoids algorithm is run 100 times; each run's objective value J,
    membership matrix U, prototypes G and classification rate are kept.
    The ten runs with the lowest J are printed, and the best one is written
    out as a fuzzy partition, a hard partition and the adjusted Rand index
    computed from the two columns of the hard partition.
    """
    ttt_data = loadData('tic-tac-toe.data')
    training_set = [row[:9] for row in ttt_data]
    dmatrix = Dissimilarity(training_set).calculate_dmatrix()

    # The classification rate below treats the first 626 rows as one class
    # and the rest as the other, out of 958 rows in total.
    first_class_rows = 626
    total_rows = 958.0

    results = []
    sfcmdd = SFCMdd(training_set, dmatrix)
    for _ in range(100):
        U, G, J = sfcmdd.compute(K=2, T=150, emax=(10.e-10), m=2, q=2)
        # A row counts as a success when its own class has the larger (or
        # an equal) membership degree.
        success = sum(1 for y, n in U[:first_class_rows] if not y < n)
        success += sum(1 for y, n in U[first_class_rows:] if not n < y)
        rate = success / total_rows
        print("Classification Rate: " + str(rate))
        # Keep the prototypes too, so the best run can be unpacked in full.
        results.append((J, U, G, rate))

    # Lowest objective value first.
    results.sort(key=lambda run: run[0])
    # Fixed: the original printed results[0] / results[2] (whole entries,
    # ignoring the loop variable) and concatenated them to a str, which
    # raises TypeError.
    for objective, _, _, rate in results[:10]:
        print("J: " + str(objective))
        print("Rate: " + str(rate))

    # Fixed: the original unpacked [J, U, rate] as
    # (fuzzy_partition, prototypes, best_rate), i.e. used J as the partition.
    _, fuzzy_partition, prototypes, best_rate = results[0]
    hard_partition = [e for e in hard_partition_generator(fuzzy_partition)]
    ars = adjusted_rand_score(
        [e1 for e1, e2 in hard_partition],
        [e2 for e1, e2 in hard_partition]
    )

    write_csv_partition(fuzzy_partition, "fuzzy_k_medoids_result.csv")
    write_csv_partition(hard_partition, "hard_partition.csv")
    write_csv_partition([ars], "adjusted_rand_score.csv")
def test_gaussian_mixture_fit_predict():
    """fit_predict(X) must match fit(X).predict(X) for every covariance type."""
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for covar_type in COVARIANCE_TYPE:
        samples = rand_data.X[covar_type]
        true_labels = rand_data.Y
        mixture_options = dict(
            n_components=rand_data.n_components,
            random_state=rng,
            weights_init=rand_data.weights,
            means_init=rand_data.means,
            precisions_init=rand_data.precisions[covar_type],
            covariance_type=covar_type)
        one_step_model = GaussianMixture(**mixture_options)

        # The copy is taken before any fitting, so both models start from
        # the same state; the two-step path runs first.
        two_step_model = copy.deepcopy(one_step_model)
        two_step_labels = two_step_model.fit(samples).predict(samples)
        one_step_labels = one_step_model.fit_predict(samples)
        assert_array_equal(two_step_labels, one_step_labels)
        assert_greater(adjusted_rand_score(true_labels, one_step_labels), .95)
def evaluation(X_selected, n_clusters, y):
    """
    This function calculates ARI, NMI and ACC of clustering results

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features}
            input data on the selected features
    n_clusters: {int}
            number of clusters
    y: {numpy array}, shape (n_samples,)
            true labels

    Output
    ------
    ari: {float}
        Adjusted Rand Index
    nmi: {float}
        Normalized Mutual Information
    acc: {float}
        Accuracy
    """
    # `precompute_distances` and `n_jobs` are no longer passed: both were
    # removed from KMeans in scikit-learn 1.0 and raise TypeError there.
    # The remaining arguments are spelled out so the configuration does not
    # drift with changing library defaults (e.g. n_init).
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, verbose=0, random_state=None, copy_x=True)

    k_means.fit(X_selected)
    y_predict = k_means.labels_

    # calculate ARI
    ari = adjusted_rand_score(y, y_predict)

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict)

    # calculate ACC: cluster ids are arbitrary, so they are first mapped onto
    # the true labels with best_map() before measuring accuracy.
    y_permuted_predict = best_map(y, y_predict)
    acc = accuracy_score(y, y_permuted_predict)

    return ari, nmi, acc
示例#24
0
def process_evaluation(args, model):
    """Print NMI, ARI and the confusion matrix against a true-labels file.

    Does nothing when args['true_row_labels'] is empty.  The file is read as
    whitespace-separated labels and compared with model.row_labels_.  Any
    failure during evaluation is logged as an error instead of propagating.
    """
    labels_path = args['true_row_labels']
    if labels_path:
        try:
            with open(labels_path, 'r') as handle:
                true_labels = handle.read().split()

            # Imported lazily so sklearn is only required when evaluating.
            from sklearn.metrics.cluster import normalized_mutual_info_score
            from sklearn.metrics.cluster import adjusted_rand_score
            from sklearn.metrics import confusion_matrix

            nmi_value = normalized_mutual_info_score(true_labels,
                                                     model.row_labels_)
            ari_value = adjusted_rand_score(true_labels, model.row_labels_)
            matrix = confusion_matrix(true_labels, model.row_labels_)

            print("nmi ==>" + str(nmi_value))
            print("adjusted rand index ==>" + str(ari_value))
            print()
            print(matrix)
        except Exception as e:
            # The message text, including the run of spaces produced by the
            # backslash continuation, is kept exactly as it was.
            logging.error("--true_row_labels option (evaluation) exception:\
                          %s" % e)
def test_gaussian_mixture_predict_predict_proba():
    """predict() must equal argmax(predict_proba()) and recover the labels."""
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    not_fitted_msg = ("This GaussianMixture instance is not fitted yet. "
                      "Call 'fit' with appropriate arguments before using "
                      "this method.")
    for covar_type in COVARIANCE_TYPE:
        samples = rand_data.X[covar_type]
        true_labels = rand_data.Y
        model = GaussianMixture(
            n_components=rand_data.n_components,
            random_state=rng,
            weights_init=rand_data.weights,
            means_init=rand_data.means,
            precisions_init=rand_data.precisions[covar_type],
            covariance_type=covar_type)

        # Predicting before fitting must raise NotFittedError.
        assert_raise_message(NotFittedError, not_fitted_msg,
                             model.predict, samples)

        model.fit(samples)
        hard_labels = model.predict(samples)
        proba_labels = model.predict_proba(samples).argmax(axis=1)
        assert_array_equal(hard_labels, proba_labels)
        assert_greater(adjusted_rand_score(true_labels, hard_labels), .95)
示例#26
0
def main(_):
  """Fit a two-cluster block-structured graph model to the karate data.

  Loads the adjacency data and reference assignment via karate("~/data"),
  builds the generative model, runs 250 iterations of ed.MAP with PointMass
  approximations, then prints the predicted and reference assignments and
  their adjusted Rand index.

  The positional argument is ignored (and later shadowed by the loop
  variable).
  """
  # Fixed seed so repeated runs give the same result.
  ed.set_seed(42)

  # DATA
  X_data, Z_true = karate("~/data")
  N = X_data.shape[0]  # number of vertices
  K = 2  # number of clusters

  # MODEL
  # gamma: cluster proportions; Pi: K x K connection probabilities;
  # Z: one cluster indicator row per vertex; X: Bernoulli edges with
  # probabilities Z Pi Z^T.
  gamma = Dirichlet(concentration=tf.ones([K]))
  Pi = Beta(concentration0=tf.ones([K, K]), concentration1=tf.ones([K, K]))
  Z = Multinomial(total_count=1.0, probs=gamma, sample_shape=N)
  X = Bernoulli(probs=tf.matmul(Z, tf.matmul(Pi, tf.transpose(Z))))

  # INFERENCE (EM algorithm)
  # Point estimates; softmax / sigmoid map the free variables into the
  # range each quantity needs (rows summing to one, values in (0, 1)).
  qgamma = PointMass(tf.nn.softmax(tf.get_variable("qgamma/params", [K])))
  qPi = PointMass(tf.nn.sigmoid(tf.get_variable("qPi/params", [K, K])))
  qZ = PointMass(tf.nn.softmax(tf.get_variable("qZ/params", [N, K])))

  inference = ed.MAP({gamma: qgamma, Pi: qPi, Z: qZ}, data={X: X_data})
  inference.initialize(n_iter=250)

  # NOTE(review): .run() without an explicit session presumably relies on a
  # default session being active - confirm.
  tf.global_variables_initializer().run()

  for _ in range(inference.n_iter):
    info_dict = inference.update()
    inference.print_progress(info_dict)

  # CRITICISM
  # Hard assignment: most probable cluster per vertex.
  Z_pred = qZ.mean().eval().argmax(axis=1)
  print("Result (label flip can happen):")
  print("Predicted")
  print(Z_pred)
  print("True")
  print(Z_true)
  print("Adjusted Rand Index =", adjusted_rand_score(Z_pred, Z_true))
示例#27
0
# Expected class name of every sample, one per line of classes.csv.
# The context manager closes the file once it has been read.
with open("./classes.csv") as classes_file:
    aClasses = [l.strip() for l in classes_file]

# Position of each known class name inside a candidate codification `a`.
CLASS_POSITIONS = {'Agents': 0, 'IR': 1, 'DB': 2, 'AI': 3, 'HCI': 4, 'ML': 5}

maxA = -10
# Defined up front so the report below cannot hit a NameError when `lk`
# yields no candidate.
maxComb = None
bestAClassesInt = None
aClassesInt = list()
# check the quality of every class codification
for a in lk:
    # A fresh list per candidate: the original cleared and refilled one
    # shared list, so bestAClassesInt (an alias of it) was overwritten by
    # every later candidate and ended up holding the last one, not the best.
    aClassesInt = list()
    for c in aClasses:
        if c in CLASS_POSITIONS:
            aClassesInt.append(a[CLASS_POSITIONS[c]])
        else:
            print("Wrong argument data in classes.csv file")
    tmpA = adjusted_rand_score(finalClasses, aClassesInt)
    # remember the codification with the best cluster quality
    if tmpA > maxA:
        maxA = tmpA
        maxComb = a
        bestAClassesInt = aClassesInt
# print results
print("Best classes's codification -> Cluster quality")
print(str(maxComb) + " -> " + str(maxA))
print()
示例#28
0
                set([i for (i, j) in enumerate(louvain) if j == l]))
        print("Louvain Modularity:",
              nx.algorithms.community.modularity(graph, communities))

        print()

        # AMI
        LPA_AMI = sk.adjusted_mutual_info_score(cluster, lpa)
        Louvain_AMI = sk.adjusted_mutual_info_score(cluster, louvain)
        print("LPA AMI:", LPA_AMI)
        print("Louvain AMI:", Louvain_AMI)

        print()

        # RI
        LPA_RI = sk.adjusted_rand_score(cluster, lpa)
        Louvain_RI = sk.adjusted_rand_score(cluster, louvain)
        print("LPA RI:", LPA_RI)
        print("Louvain RI:", Louvain_RI)

        print()

        # NMI
        LPA_NMI = sk.normalized_mutual_info_score(cluster, lpa)
        Louvain_NMI = sk.normalized_mutual_info_score(cluster, louvain)
        print("LPA NMI:", LPA_NMI)
        print("Louvain NMI:", Louvain_NMI)

    # divisive approach
    for i in range(len(graphs_div)):
        graph_file = graphs_div[i]
示例#29
0
            permuted_list = list()
    
            for perm in range(perm_num):
                for i in range(AllDataMatrix_temp.shape[1]):
                    np.random.shuffle(AllDataMatrix_temp[:,i])
                permuted_list.append(AllDataMatrix_temp)
            
            
            try:
                if "group" in SIMULATION_TYPES: 
                    results_group = Parallel(n_jobs = machine_cores_to_use)(delayed(get_gap_one_s_group)(i,AllDataMatrix = AllDataMatrix,permuted_list = permuted_list) for i in zip(s_list,[nclust]*len(s_list),[multi]*len(s_list)))  
     
                    best_sparse_kmeans_group,lgroup,wgroup  = sparse_kmeans(AllDataMatrix = AllDataMatrix, nclust = nclust, s = s_list[np.argmax(results_group)],
                                                                                                                   niter=niter,group = True, multi = multi) 
                    print(s_list[np.argmax(results_group)])
                    print(adjusted_rand_score(labels,lgroup))
    
                    sparse_group_res[sim,:] = adjusted_rand_score(labels,lgroup)
                    
                    path = path_to_save_files +"GROUP_n=" +str(n)+ "sigma=" +str(sigma) + "signal=" + SIGNAL_TYPE +".txt"
                    np.savetxt(path, sparse_group_res)
                if "sparse" in SIMULATION_TYPES: 

                    results = R_sparse_kmeans(data = numpy2ri(AllDataMatrix),nclust = nclust,nperms  = perm_num, s = -1) 
                    l =  np.array(results[0])
                    print adjusted_rand_score(labels,l)
                    sparse_res[sim,:] = adjusted_rand_score(labels,l)
                    
                    path = path_to_save_files +"sparse_n=" +str(n)+ "sigma=" +str(sigma) + "signal=" + SIGNAL_TYPE +".txt"
    
                    np.savetxt(path, sparse_res)                
示例#30
0
#plot sorted list

#checking the clustering using ARI
# Collect, position by position, the reference cluster and the currently
# assigned cluster of every sequence object.
TrueClusters = []
AssignedClusters = []
for i in range(len(seqOBJArr)):
    TrueClusters.append(seqOBJArr[i].trueCluster)
    AssignedClusters.append(seqOBJArr[i].currentCluster)

print("True Clusters: ", TrueClusters)
print("Assgined Clusters: ", AssignedClusters)

# Agreement between the two labelings (adjusted Rand index).
print(adjusted_rand_score(TrueClusters, AssignedClusters))

'''
alphaArr = [0.01, 0.1,0.2,0.5,0.75,1.0,2.0]
ariArr = [0.56206, 0.96054, 0.96053, 0.82823, 0.33733, 0.211045, 0.07455]
plt.plot(alphaArr, ariArr)
plt.show()
'''
#xhecking with leelu m actual
ans = []
q = open("/Users/mallika/PycharmProjects/DirichletBio/venv/lib/actual.txt", "r")

for line in q:
    values=line.split()
    if (values[0]=="2"):
        ans.append(2)
示例#31
0
# Parse the data file: every line is ", "-separated, the last field is the
# class label (any surrounding '.' removed) and the rest are numeric features.
X = []
y = []
for line in file.readlines():
    curLine = line.strip().split(", ")
    *feature_fields, label_field = curLine
    X.append(list(map(float, feature_fields)))
    y.append(label_field.strip('.'))


# Sweep the number of clusters and record the ARI of each KMeans fit.
glass_score = []
params = range(1, 19)
for param in params:
    algorithm = KMeans(n_clusters=param).fit(X)
    y_pred = algorithm.predict(X)
    s = adjusted_rand_score(y, y_pred)
    glass_score.append(s)
print('glass_score', glass_score)

# Plot ARI against the number of clusters, save the figure, then show it.
plt.figure(figsize=(6, 4), dpi=120)
plt.grid()
plt.xlabel('n_clusters for KMeans')
plt.xticks(params)
plt.plot(params, glass_score, label='glass_score', color='g')
plt.legend()
plt.title("glass KMeans score")

plt.savefig("img/KMeans.png")

plt.show()
def main():
    """Sweep entropy-based row pruning and print clustering agreement.

    Expects exactly three command-line arguments:

    1. vector type: 'symptoms', 'herbs' or 'both'
    2. label type: 'top', 'section' or 'subsection'
    3. similarity threshold (parsed as a float)

    With any other argument count a usage line is printed and the process
    exits. Otherwise the attribute-by-patient matrix is multiplied by the
    similarity matrix, and for each fraction 0.00, 0.05, ..., 0.95 the
    lowest-entropy rows of the product are dropped, the transposed remainder
    is clustered (agglomerative, cosine affinity, average linkage) into as
    many clusters as there are distinct true labels, and the adjusted Rand
    index is printed followed by the fraction.
    """
    # Sorting out arguments.
    if len(sys.argv) != 4:
        print('Usage: %s symptoms/herbs/both top/section/subsection'
            ' similarity_threshold' % sys.argv[0])
        # sys.exit() rather than the interactive-only exit() helper, which is
        # injected by the site module and is not guaranteed to exist.
        sys.exit()
    vector_type = sys.argv[1]
    assert vector_type in ['symptoms', 'herbs', 'both']
    label_type = sys.argv[2]
    assert label_type in ['top', 'section', 'subsection']
    # linkage = sys.argv[3]
    # assert linkage in ['average', 'complete']
    # # 'full' to use all herbs and symptoms. 'partial' to use only dictionary.
    # abridged = (sys.argv[4] == 'partial')
    similarity_threshold = float(sys.argv[3])

    # NOTE(review): the meaning of the second argument (False) is defined by
    # get_master_patient_dct, which is not visible here.
    feature_list, master_patient_dct = get_master_patient_dct(vector_type,
        False)

    # Get the patient by attribute matrix.
    (attribute_by_patient_matrix, section_labels, subsection_labels,
        file_num_labels) = get_attribute_by_patient_matrix(feature_list,
        master_patient_dct)

    # Picking the type of labels.
    if label_type == 'top':
        true_labels = get_label_to_index_conversions(file_num_labels)
    elif label_type == 'section':
        true_labels = get_label_to_index_conversions(section_labels)
    elif label_type == 'subsection':
        true_labels = get_label_to_index_conversions(subsection_labels)

    num_clusters = len(set(true_labels))

    # Uncomment this block if making changes to similarity matrix.
    similarity_matrix = get_similarity_matrix(feature_list,
        similarity_threshold)

    # similarity_matrix = get_top_k_elements_per_row_sim_mat(
    #     similarity_matrix, top_k)

    embedded_matrix = similarity_matrix * attribute_by_patient_matrix

    # embedded_matrix = upper_bound_matrix(np.array(embedded_matrix))

    # np.savetxt('./results/embedded_%s_matrix.txt' % vector_type,
    #     embedded_matrix)
    # exit()

    # embedded_matrix = np.loadtxt('./results/embedded_%s_matrix.txt' % (
    #     vector_type))

    # Get the list of entropies for the embedded matrix (one per row).
    entropy_list = np.apply_along_axis(entropy, axis=1, arr=embedded_matrix)

    # The entropy ranking does not change between iterations, so sort once
    # here instead of re-sorting inside the loop.
    entropy_order = entropy_list.argsort()

    # Delete the percentage% lowest entropy elements.
    for percentage in [p / 20.0 for p in range(20)]:
        num_att_to_delete = int(len(feature_list) * percentage)

        # Deleting lowest entropy attributes.
        att_indices_to_delete = entropy_order[:num_att_to_delete]

        # First, copy the embedded matrix. np.copy is kept on purpose: it
        # also turns an np.matrix input into a plain ndarray.
        feature_vectors = np.copy(embedded_matrix)

        # Delete the lowest entropy attributes, and transpose.
        feature_vectors = np.delete(feature_vectors,
            att_indices_to_delete, axis=0).T

        # random_state = 5191993
        # y_pred = KMeans(n_clusters=num_clusters,
        #     random_state=random_state).fit_predict(feature_vectors)
        # print 'k-means %g' % (adjusted_rand_score(true_labels, y_pred))

        # y_pred = SpectralClustering(n_clusters=num_clusters,
        #     eigen_solver='arpack', random_state=random_state,
        #     affinity="cosine").fit_predict(feature_vectors)
        # print 'spectral %g' % (adjusted_rand_score(true_labels, y_pred))

        y_pred = AgglomerativeClustering(n_clusters=num_clusters,
            affinity='cosine', linkage='average').fit_predict(
            feature_vectors)
        # cluster_dct = {}
        # for i, cluster_label in enumerate(y_pred):
        #     section_label = section_labels[i]
        #     subsection_label = subsection_labels[i]
        #     patient = (section_label, subsection_label)
        #     if cluster_label in cluster_dct:
        #         cluster_dct[cluster_label] += [patient]
        #     else:
        #         cluster_dct[cluster_label] = [patient]

        # out = open('./results/embedding_patient_clusters.txt', 'w')
        # for cluster_label in cluster_dct:
        #     patient_cluster = cluster_dct[cluster_label]
        #     for section_label, subsection_label in patient_cluster:
        #         out.write('%s,%s\t' % (section_label, subsection_label))
        #     out.write('\n')
        # out.close()

        rand_index = adjusted_rand_score(true_labels, y_pred)
        # if rand_index >= 0.292420:
        # Single pre-formatted string: same "<ari> <percentage>" output as the
        # old Python 2 print statement, but also valid on Python 3.
        print('%s %s' % (rand_index, percentage))
示例#33
0
# Disabled debug output:
#   print("Actual")
#   print(np.asarray(phi))
#   print(membership_act)
#
#   print("predicted")
#   print(pi_pred)
#   print(qgamma.mean().eval())

# Binarise the evaluated mean of X at 0.5 and count the entries that agree
# with the observed data; cnt is the N*N total used as the denominator.
X_pred = np.array(X.mean().eval() > 0.5, dtype=int)
cnt = N * N
correct = np.sum(X_data == X_pred)

# Observed data in the top panel, prediction in the bottom panel.
plt.subplot(211)
plt.imshow(X_data, cmap='Greys')

plt.subplot(212)
plt.imshow(X_pred, cmap='Greys')
plt.show()

print("Correctly predicted: ", correct)
print("Total entries: ", cnt)

print("Train Accuracy: ", correct / cnt)

print("Result (label flip can happen):")
print("Predicted")
print(Z_pred)
print("True")
print(Z_true)
print("Adjusted Rand Index =", adjusted_rand_score(Z_pred, Z_true))
示例#34
0
def ecac_run(X,
             n_clusters,
             data,
             pop_size=20,
             max_gens=2000,
             p_crossover=0.95,
             p_mutation=0.98,
             runs=10,
             y=None,
             log_file=False,
             evolutionary_plot=False):
    """Run the ECAC evolutionary clustering `runs` times and save the results.

    Each run builds a random initial population, evolves it for up to
    `max_gens` generations (binary tournament selection followed by
    `reproduce`), and keeps the fittest partition seen. A run stops early
    when the best fitness equals 1.

    Parameters
    ----------
    X : array-like
        Data to cluster; standardized with ``StandardScaler`` before use.
    n_clusters : int
        Number of clusters handed to the population/fitness helpers.
    data : str
        Dataset name, used in messages and in output paths.
    pop_size : int
        Number of individuals generated and selected per generation.
    max_gens : int
        Maximum number of generations per run.
    p_crossover, p_mutation : float
        Passed through to ``reproduce``.
    runs : int
        Number of independent runs.
    y : array-like or None
        Reference labels; when given, the adjusted Rand index of the best
        partition is computed, stored and printed.
    log_file : bool
        When true, the best fitness per generation is written to a log CSV.
    evolutionary_plot : bool
        When true, a scatter plot of the first two columns of `X`, coloured
        by the best partition, is saved for every generation.

    Returns
    -------
    None
        Results are written under ``ecac-out/`` (and ``figures/`` when
        plotting): one solution CSV per run, an optional log CSV per run,
        and an aggregated CSV that is rewritten after every run.
    """
    tifont = {
        'fontname': 'Times New Roman',
        'fontsize': 20,
        'fontweight': 'bold'
    }
    axfont = {'fontname': 'Times New Roman', 'fontsize': 16}

    # Path fragments shared by every output file of this configuration.
    run_tag = '{}_{}_{}_{}'.format(data, n_clusters, pop_size, max_gens)
    out_dir = 'ecac-out/{}'.format(run_tag)

    # Standardize once. The original re-applied the scaler at the start of
    # every run, which is redundant on already-standardized data.
    X = StandardScaler().fit_transform(X)

    for run in range(runs):
        print('============= TEST {} ============='.format(run + 1))
        print('Clustering started using ECAC')
        print('Dataset: {}, Clusters: {}, Instances: {}, Features: {}'.format(
            data, n_clusters, len(X), len(X[0])))
        print('Population size: {}, Generations: {}'.format(
            pop_size, max_gens))

        start = time.time()
        population = []
        fit_log = []

        print('Generating initial population')
        for _ in range(pop_size):
            individual = {'partition': random_gen(n_clusters, X)}
            individual['fitness'] = fitness_value(X, individual['partition'],
                                                  n_clusters)
            # Duplicates are dropped, so the initial population may be
            # smaller than pop_size.
            if individual not in population:
                population.append(individual)
        best = sorted(population, key=lambda k: k['fitness'], reverse=True)[0]

        print('Starting genetic process...')
        for i in range(max_gens):
            print('Generation {}'.format(i + 1))
            selected = [binary_tournament(population)
                        for _ in range(pop_size)]
            children = reproduce(selected, pop_size, p_crossover, p_mutation,
                                 n_clusters)
            for child in children:
                child['fitness'] = fitness_value(X, child['partition'],
                                                 n_clusters)
            children.sort(key=lambda l: l['fitness'], reverse=True)
            # ">=" lets an equally fit child replace the incumbent.
            if children[0]['fitness'] >= best['fitness']:
                best = children[0]
            population = children
            if log_file:
                fit_log.append((i + 1, best['fitness']))

            if evolutionary_plot:
                plt.figure(figsize=(12, 8), dpi=200)
                plt.title('ECAC - Generation {}'.format(i + 1), **tifont)
                plt.xlabel('Column 1', **axfont)
                plt.ylabel('Column 2', **axfont)
                colors = best['partition']
                plt.scatter(X[:, 0],
                            X[:, 1],
                            c=colors,
                            edgecolor='k',
                            cmap='YlGnBu')
                plt.tight_layout()
                fig_dir = 'figures/{}/{}'.format(data, run + 1)
                if not os.path.exists(fig_dir):
                    os.makedirs(fig_dir)
                plt.savefig('{}/scatter_{}.jpg'.format(fig_dir, i + 1),
                            format='jpg')
                # Release the figure; otherwise one figure per generation
                # stays open for the whole run.
                plt.close()
            if best['fitness'] == 1:
                break

        run_time = time.time() - start
        best['time'] = run_time
        print('Optimization finished in {:.2f}s with an objective of {:.4f}'.
              format(best['time'], best['fitness']))
        best['partition'] = np.array(best['partition'])

        d = dict()
        d['Dataset'] = data
        d['Algorithm'] = 'ecac'
        d['Clusters'] = n_clusters
        d['Instances'] = len(X)
        d['Features'] = len(X[0])
        d['Pop. size'] = pop_size
        d['Max. gens'] = max_gens
        d['No. objectives'] = 1
        d['Obj. 1 name'] = 'generalization'
        d['Objective 1'] = best['fitness']
        d['Obj. 2 name'] = np.nan
        d['Objective 2'] = np.nan
        d['Time'] = best['time']
        if y is None:
            d['Adjusted Rand Index'] = np.nan
            print('No labels provided')
        else:
            adj_rand_index = adjusted_rand_score(y, best['partition'])
            d['Adjusted Rand Index'] = adj_rand_index
            # The value must be passed to format(); an empty format() call
            # raises IndexError for the '{:.4f}' placeholder.
            print('Adjusted RAND index: {:.4f}'.format(adj_rand_index))
        # One column per instance holding its assigned cluster, as text.
        for i in range(len(best['partition'])):
            d['X{}'.format(i + 1)] = '{}'.format(best['partition'][i])

        out = pd.DataFrame(d, index=[data])
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out.to_csv('{}/solution-{}-{}.csv'.format(out_dir, run_tag, run + 1),
                   index=False)
        if log_file:
            log = pd.DataFrame(fit_log, columns=['gen', 'fitness'])
            log.to_csv('{}/log-{}-{}.csv'.format(out_dir, run_tag, run + 1),
                       index=False)

        # Re-aggregate every solution file found so far, so the combined CSV
        # is up to date after each run.
        filenames = glob.glob('{}/solution*'.format(out_dir))
        frames = [pd.read_csv(name) for name in filenames]
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        df = pd.concat(frames) if frames else pd.DataFrame()
        df.reset_index(drop=True, inplace=True)
        df.to_csv('ecac-out/solutions-{}-{}.csv'.format(run_tag, runs))
示例#35
0
def compute_stability_fold(samples, train, test, method='ward',
                           max_k=None, stack=False,
                           stability=True, cv_likelihood=False,
                           corr_score=None,
                           ground_truth=None, n_neighbors=1,  **kwargs):
    """
    Compute clustering stability measures on one cross-validation fold.

    For every k from 2 to `max_k`, a clustering learned on the training
    data is used to predict cluster labels on the test data, and that
    prediction is compared with a clustering computed directly on the test
    data (and, optionally, with a ground truth).

    Parameters:
    -----------
        samples : list of arrays
            Samples to cluster; each array has shape
            (n_samples, n_features) in PyMVPA terminology. The features
            (i.e., the nodes) are what gets clustered.
        train : list or array
            Indices into `samples` forming the training set.
        test : list or array
            Indices into `samples` forming the test set.
        method : {'complete', 'gmm', 'kmeans', 'ward'}
            Clustering method. Default is 'ward'.
        max_k : int or None
            Largest k to evaluate, starting from 2. When not given, it is
            set to the number of points, ``samples[0].shape[1]``.
        stack : bool
            If True the selected datasets are stacked vertically;
            otherwise (default) they are averaged.
        stability : bool
            Whether to compute the stability measure described in Lange et
            al., 2004. Default is True.
        cv_likelihood : bool
            Whether to compute the cross-validated likelihood of the
            mixture model; only valid with the 'gmm' method. Default is
            False.
        corr_score : {'pearson','spearman'} or None
            Type of correlation score to compute, if any. Default is None.
        ground_truth : array or None
            Ground-truth clustering of the data, useful to compare
            stability against ground truth in simulations.
        n_neighbors : int
            Number of neighbors of the K-nearest-neighbors classifier that
            transfers the training clustering to the test set. Only used by
            the `complete` and `ward` methods. Default is 1.
        kwargs : optional
            Extra keyword arguments forwarded to the clustering method
            (only for 'ward' and 'gmm').

    Returns:
    --------
        A list with the following ten entries, in this order. Every array
        has shape (max_k-1,) and entry `i` refers to `k` = ks[i].

        ks : array
            The `k` used at each iteration.
        ari : array
            Adjusted Rand Index between the predicted and the actual
            clustering of the test set.
        ami : array
            Adjusted Mutual Information between the predicted and the
            actual clustering of the test set.
        stab : array or None
            Stability measure of Lange et al., 2004 (un-normalized; it is
            normalized later in the process). None if `stability` is False.
        likelihood : array or None
            Cross-validated likelihood of the GMM solution when method is
            'gmm' and `cv_likelihood` is True; otherwise None.
        ari_gt : array or None
            Adjusted Rand Index between the predicted test clustering and
            `ground_truth`; None without ground truth.
        ami_gt : array or None
            Adjusted Mutual Information between the predicted test
            clustering and `ground_truth`; None without ground truth.
        stab_gt : array or None
            Stability measure between the predicted test clustering and
            `ground_truth`; None unless both `stability` and a ground
            truth are given.
        corr : array or None
            Average correlation for each fold. TODO
        corr_gt : array or None
            Avg correlation against GT. TODO

    Raises:
    -------
        ValueError
            If `method` is not in AVAILABLE_METHODS, or if `cv_likelihood`
            is requested for a method other than 'gmm'.
    """
    if method not in AVAILABLE_METHODS:
        raise ValueError('Method {0} not implemented'.format(method))

    if cv_likelihood and method != 'gmm':
        raise ValueError(
            "Cross-validated likelihood is only available for 'gmm' method")

    # A missing max_k falls back to the number of points.
    if not max_k:
        max_k = samples[0].shape[1]

    has_gt = ground_truth is not None
    has_corr = corr_score is not None

    # Preallocate one slot per k in 2..max_k; outputs that were not
    # requested stay None and are returned as such.
    n_slots = max_k - 1
    ks = np.zeros(n_slots, dtype=int)
    ari = np.zeros(n_slots)
    ami = np.zeros(n_slots)
    stab = np.zeros(n_slots) if stability else None
    likelihood = np.zeros(n_slots) if cv_likelihood else None
    corr = np.zeros(n_slots) if has_corr else None
    ari_gt = np.zeros(n_slots) if has_gt else None
    ami_gt = np.zeros(n_slots) if has_gt else None
    stab_gt = np.zeros(n_slots) if (has_gt and stability) else None
    corr_gt = np.zeros(n_slots) if (has_gt and has_corr) else None

    # Select the training and test datasets, then stack or average them.
    train_set = [samples[idx] for idx in train]
    test_set = [samples[idx] for idx in test]

    if stack:
        train_ds, test_ds = np.vstack(train_set), np.vstack(test_set)
    else:
        train_ds = np.mean(np.dstack(train_set), axis=2)
        test_ds = np.mean(np.dstack(test_set), axis=2)

    # Hierarchical methods build the full tree once and cut it per k below,
    # which is faster than re-clustering for every k.
    if method == 'complete':
        Y_train = complete(pdist(train_ds.T, metric='correlation'))
        Y_test = complete(pdist(test_ds.T, metric='correlation'))
    elif method == 'ward':
        children_train, _, n_leaves_train, _ = ward_tree(train_ds.T,
                                                         **kwargs)
        children_test, _, n_leaves_test, _ = ward_tree(test_ds.T, **kwargs)
    elif method in ('gmm', 'kmeans'):
        pass  # these have to be refit for each k
    else:
        raise ValueError("We shouldn't get here")

    for k in range(2, max_k + 1):
        i_k = k - 2  # position of this k in the result arrays

        if method in ('complete', 'ward'):
            # Cut both trees at k clusters.
            if method == 'complete':
                train_label = cut_tree_scipy(Y_train, k)
                test_label = cut_tree_scipy(Y_test, k)
            else:
                train_label = _hc_cut(k, children_train, n_leaves_train)
                test_label = _hc_cut(k, children_test, n_leaves_test)
            # Train a classifier on the training clustering and use it to
            # predict the clusters of the test set.
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            prediction_label = knn.predict(test_ds.T)
        elif method == 'gmm':
            gmm = GMM(n_components=k, **kwargs)
            # Fit on train and predict test ...
            gmm.fit(train_ds.T)
            prediction_label = gmm.predict(test_ds.T)
            if cv_likelihood:
                # Must be scored before the model is refit on the test set.
                log_prob = np.sum(gmm.score(test_ds.T))
            # ... then refit on test to get its own labels.
            gmm.fit(test_ds.T)
            test_label = gmm.predict(test_ds.T)
        elif method == 'kmeans':
            kmeans = KMeans(n_clusters=k)
            # Fit on train and predict test ...
            kmeans.fit(train_ds.T)
            prediction_label = kmeans.predict(test_ds.T)
            # ... then refit on test to get its own labels.
            kmeans.fit(test_ds.T)
            test_label = kmeans.predict(test_ds.T)
        else:
            raise ValueError("We shouldn't get here")

        # Scores of the prediction against the test-set clustering.
        ks[i_k] = k
        ari[i_k] = adjusted_rand_score(prediction_label, test_label)
        ami[i_k] = adjusted_mutual_info_score(prediction_label, test_label)
        if stability:
            stab[i_k] = stability_score(prediction_label, test_label, k)
        if cv_likelihood:
            likelihood[i_k] = log_prob
        if has_corr:
            corr[i_k] = correlation_score(prediction_label, test_label,
                                          test_ds, corr_score)

        # Same scores of the prediction against the ground truth.
        if has_gt:
            ari_gt[i_k] = adjusted_rand_score(prediction_label, ground_truth)
            ami_gt[i_k] = adjusted_mutual_info_score(prediction_label,
                                                     ground_truth)
            if stability:
                stab_gt[i_k] = stability_score(prediction_label,
                                               ground_truth, k)
            if has_corr:
                corr_gt[i_k] = correlation_score(prediction_label,
                                                 ground_truth,
                                                 test_ds, corr_score)

    return [ks, ari, ami, stab, likelihood,
            ari_gt, ami_gt, stab_gt, corr, corr_gt]
示例#36
0
def test_adjusted_rand_score_sparse():
    # The ARI computed from two label vectors must match the ARI computed
    # from their sparse contingency matrix alone.
    labels_a = np.array([1, 1, 1, 1, 1, 1,
                         2, 2, 2, 2, 2, 2,
                         3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1,
                         2, 2, 2, 2, 3, 1,
                         3, 3, 3, 2, 2])
    C_sparse = contingency_matrix(labels_a, labels_b, sparse=True)
    score_from_labels = adjusted_rand_score(labels_a, labels_b)
    score_from_contingency = adjusted_rand_score(None, None,
                                                 contingency=C_sparse)
    assert_almost_equal(score_from_labels, score_from_contingency)