from numpy import mean, array
from sklearn.cluster import AgglomerativeClustering


def clustering(data, params):

    # parse parameters explicitly (exec-based assignment does not
    # create local variables in Python 3)
    n_clusters = params['n_clusters']
    affinity = params['affinity']
    linkage = params['linkage']

    # apply Agglomerative Clustering to reduced data

    clusters = AgglomerativeClustering(n_clusters=n_clusters,
                                       affinity=affinity, linkage=linkage)
    clusters.fit(data)

    # Agglomerative Clustering does not give centers of clusters,
    # so use the mean of each cluster instead

    cluster_centers = []
    for i in range(n_clusters):
        mask = (clusters.labels_ == i)
        cluster_centers.append(mean(data[mask], axis=0))
    cluster_centers = array(cluster_centers)

    return [cluster_centers, clusters.labels_]
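
A hedged usage sketch of the helper above (synthetic data; the parameter keys mirror the dictionary lookups inside the function):

import numpy as np

data = np.random.RandomState(0).rand(30, 4)
params = {'n_clusters': 3, 'affinity': 'euclidean', 'linkage': 'ward'}
centers, labels = clustering(data, params)
print(centers.shape)  # (3, 4): one mean vector per cluster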
Example #2
def cluster_agg(cluster_data):
    clstr = AgglomerativeClustering(n_clusters=11, linkage='ward')
    clstr.fit(cluster_data)

    df['tier'] = clstr.labels_
    results = df[['Player', 'tier']]
    return results
Example #3
def Word2VecReduction(senlist, w2vec, ratio):
  slen = len(senlist)
  word_matrix = []
  word2label = {}
  idx2word = {}
  useword = set([])
  cnt = 0
  for i in range(0, slen):
    for word in senlist[i].word_used:
      if word not in useword: #and word in w2vec:
        idx2word[cnt] = word
        cnt += 1
        useword.add(word)
        word_matrix.append(w2vec[word])
  wlen = len(useword)
  print "use words:", wlen
  
  nclusters = max(int(0.9*wlen), 100)
  print nclusters
  AgloCluster = AgglomerativeClustering(n_clusters=nclusters,linkage="average", affinity='cosine')
  AgloCluster.fit(word_matrix)
  AgloCluster_labels = AgloCluster.labels_
  
  for i in range(0, wlen):
    word2label[idx2word[i]] = AgloCluster_labels[i]

  for i in range(0, slen):
    senlist[i].sen_words = [ str(word2label[w]) for w in senlist[i].word_used]
    senlist[i].word_dict = {}
    #print senlist[i].sen_words
  return
Example #4
def train_agglomerative():
	print("starting agglomerative clustering...")
	model = AgglomerativeClustering(n_clusters=num_clusters, affinity=aggl_affinity,
	                                linkage=aggl_linkage)
	model.fit(X)
	labels = model.labels_
	print(labels)
    def knn_connectivity(self, X):
        knn_graph = kneighbors_graph(X, 30, include_self=False)

        for connectivity in (None, knn_graph):
            n_clusters = 4
            plt.figure(figsize=(10, 4))
            for index, linkage in enumerate(('average', 'complete', 'ward')):
                plt.subplot(1, 3, index + 1)
                model = AgglomerativeClustering(linkage=linkage,
                                                connectivity=connectivity,
                                                n_clusters=n_clusters)
                t0 = time.time()
                model.fit(X)
                elapsed_time = time.time() - t0
                plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                            cmap=plt.cm.nipy_spectral)
                plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                          fontdict=dict(verticalalignment='top'))
                plt.axis('equal')
                plt.axis('off')

            plt.subplots_adjust(bottom=0, top=.89, wspace=0,
                                left=0, right=1)
            plt.suptitle('n_cluster=%i, connectivity=%r' %
                         (n_clusters, connectivity is not None), size=17)

        plt.show()
def test_connectivity_propagation():
    # Check that connectivity in the ward tree is propagated correctly during
    # merging.
    X = np.array(
        [
            (0.014, 0.120),
            (0.014, 0.099),
            (0.014, 0.097),
            (0.017, 0.153),
            (0.017, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.152),
            (0.018, 0.149),
            (0.018, 0.144),
        ]
    )
    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(n_clusters=4, connectivity=connectivity, linkage="ward")
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
Example #7
def __generate_dummy_data():
    from sklearn.cluster import AgglomerativeClustering
    import itertools
    X = np.array([[
         -5.27453240e-01,  -6.14130238e-01,  -1.63611427e+00,
         -9.26556498e-01,   7.82296885e-01,  -1.06286220e+00,
         -1.24368729e+00,  -1.16151964e+00,  -2.25816923e-01,
         -3.32354552e-02],
       [ -2.01273137e-01,   5.25758359e-01,   1.37940072e+00,
         -7.63256657e-01,  -1.27275323e+00,  -1.31618084e+00,
         -7.00167331e-01,   2.21410669e+00,   9.15456567e-01,
          7.93076923e-01],
       [  1.53249104e-01,  -5.48642411e-01,  -1.06559060e+00,
         -3.05253203e-01,  -1.93393495e+00,   1.39827978e-01,
          1.73359830e-01,   2.85576854e-02,  -1.19427027e+00,
          1.04395610e+00],
       [  1.00595172e+02,   1.01661346e+02,   1.00115635e+02,
          9.86884249e+01,   9.86506406e+01,   1.02214982e+02,
          1.01144087e+02,   1.00642778e+02,   1.01635339e+02,
          9.88981171e+01],
       [  1.01506262e+02,   1.00525318e+02,   9.93021764e+01,
          9.92514163e+01,   1.01199015e+02,   1.01771241e+02,
          1.00464097e+02,   9.97482396e+01,   9.96888274e+01,
          9.88297336e+01]])
    model = AgglomerativeClustering(linkage="average", affinity="cosine")
    model.fit(X)
    ii = itertools.count(X.shape[0])
    DEBUG(str([{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]))
    return model, model.labels_
Example #8
def wardHierarchical(img):
    # downscale first so the connectivity graph and the labels
    # share the same shape
    face = sp.misc.imresize(img, 0.10) / 255.
    connectivity = grid_to_graph(*face.shape)
    print("Compute structured hierarchical clustering...")
    st = time.time()
    n_clusters = 15  # number of regions
    ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                   connectivity=connectivity)

    X = np.reshape(face, (-1, 1))
    ward.fit(X)
    label = np.reshape(ward.labels_, face.shape)
    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)


    plt.figure(figsize=(5, 5))
    plt.imshow(face, cmap=plt.cm.gray)
    for l in range(n_clusters):
        plt.contour(label == l,
                    colors=[plt.cm.nipy_spectral(l / float(n_clusters)), ])
    plt.xticks(())
    plt.yticks(())
    plt.show()
def test_agglomerative_clustering_with_distance_threshold(linkage):
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering with distance_threshold.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    # test when distance threshold is set to 10
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn, linkage=linkage)
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))
        # test if the clusters produced match the point in the linkage tree
        # where the distance exceeds the threshold
        tree_builder = _TREE_BUILDERS[linkage]
        children, n_components, n_leaves, parent, distances = \
            tree_builder(X, connectivity=conn, n_clusters=None,
                         return_distance=True)
        num_clusters_at_threshold = np.count_nonzero(
            distances >= distance_threshold) + 1
        # test number of clusters produced
        assert num_clusters_at_threshold == num_clusters_produced
        # test clusters produced
        clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
                                        children=children,
                                        n_leaves=n_leaves)
        assert np.array_equiv(clusters_produced,
                              clusters_at_threshold)
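
Outside the test harness, the distance_threshold API can be exercised standalone; a minimal sketch with synthetic data (the threshold value is arbitrary):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

X = np.random.RandomState(0).randn(100, 5)
model = AgglomerativeClustering(n_clusters=None, distance_threshold=10.0)
model.fit(X)
# the cluster count falls out of the threshold instead of being fixed up front
print(len(np.unique(model.labels_)))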
Example #10
def test_compute_full_tree():
    """Test that the full tree is computed if n_clusters is small"""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    connectivity = kneighbors_graph(X, 5, include_self=False)

    # When n_clusters is small, the full tree should be built,
    # that is, the number of merges should be n_samples - 1
    agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert_equal(n_nodes, n_samples - 1)

    # When n_clusters is large (greater than the max of 100 and
    # 0.02 * n_samples), tree building should stop early, after
    # n_samples - n_clusters merges.
    n_clusters = 101
    X = rng.randn(200, 2)
    connectivity = kneighbors_graph(X, 10, include_self=False)
    agc = AgglomerativeClustering(n_clusters=n_clusters,
                                  connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert_equal(n_nodes, n_samples - n_clusters)
Example #11
	def agglomerative_clusters(self, word_vectors):
	
		#Pre-calculate BallTree object
		starting = time.time()
		Ball_Tree = BallTree(word_vectors, leaf_size = 200, metric = "minkowski")
		print("BallTree object in " + str(time.time() - starting))
		
		#Pre-calculate k_neighbors graph
		starting = time.time()
		connectivity_graph = kneighbors_graph(Ball_Tree, 
						n_neighbors = 1, 
						mode = "connectivity", 
						metric = "minkowski", 
						p = 2, 
						include_self = False, 
						n_jobs = workers
						)
		print("Pre-compute connectivity graph in " + str(time.time() - starting))

		#Agglomerative clustering
		starting = time.time()
		Agl = AgglomerativeClustering(n_clusters = 100, 
										affinity = "minkowski", 
										connectivity = connectivity_graph, 
										compute_full_tree = True, 
										linkage = "average"
										)
		
		Agl.fit(word_vectors)
		print("Agglomerative clustering in " + str(time.time() - starting))
		
		clusters = Agl.labels_
		
		return clusters
    def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):

        BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
        END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING

        data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

        labels = None
        if clusterType == 'kmeans':
            kmeans = KMeans(n_clusters=N_CLUSTERS)
            kmeans.fit(data)
            labels = kmeans.labels_
        elif clusterType == 'affinity_propagation':
            ap = AffinityPropagation(damping=0.75)
            ap.fit(data)
            labels = ap.labels_
            N_CLUSTERS = np.max(labels)+1
        elif clusterType == 'DBSCAN':
            dbscan = DBSCAN()
            dbscan.fit(data)
            labels = dbscan.labels_
            N_CLUSTERS = np.max(labels)+1
            print('N_CLUSTERS=' + str(N_CLUSTERS))
        elif clusterType == 'AgglomerativeClustering':
            ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
            ac.fit(data)
            labels = ac.labels_
        else:
            print('ERROR: clusterType: ' + clusterType + ' is not recognized')

        return (labels, N_CLUSTERS)
def programmer_3():

    standardizedfile = "data/standardized.xls"
    k = 3
    data = pd.read_excel(standardizedfile, index_col=u"基站编号")

    # hierarchical clustering
    model = AgglomerativeClustering(n_clusters=k, linkage="ward")
    model.fit(data)

    # output the original data together with its assigned cluster label
    r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1)
    r.columns = list(data.columns) + [u"聚类类别"]

    # plot the clusters, using a different line style for each one
    style = ["ro-", "go-", "bo-"]
    xlabels = [u"工作日人均停留时间", u"凌晨人均停留时间", u"周末人均停留时间", u"日均人流量"]
    pic_output = "tmp/type_"

    for i in range(k):
        plt.figure()
        tmp = r[r[u"聚类类别"] == i].iloc[:, :4]
        for j in range(len(tmp)):
            plt.plot(range(1, 5), tmp.iloc[j], style[i])

        plt.xticks(range(1, 5), xlabels, rotation=20)

        plt.title(u"商圈类别%s" % (i + 1))
        # 调整底部
        plt.subplots_adjust(bottom=0.15)
        plt.savefig(u"%s%s.png" % (pic_output, i + 1))
Example #14
File: hcm.py Project: harrylclc/ist557
def eval_dist(linkage='ward'):
    a_score = []
    idx = []
    d = [[] for i in range(3)]
    for k in range(2, 50 + 1):
        print('k={}'.format(k))
        est = AgglomerativeClustering(n_clusters=k, linkage=linkage)
        est.fit(x)
        ari_v = metrics.adjusted_rand_score(y, est.labels_)
        ds = calc_distance(k, est.labels_)
        for i in range(3):
            d[i].append(ds[i])
        print(ari_v)
        a_score.append(ari_v)
        idx.append(k)
    fig, axes = plt.subplots(nrows=1, ncols=2)
    axes[0].plot(idx, a_score)
#     plt.xlim(0, 220)
    axes[0].set_ylim(ymin=0)
    axes[0].set_ylabel('ARI')
    axes[0].set_xlabel('# of clusters')
#     plt.savefig('figs/hc_ari.png')
#     plt.show()
#     plt.close()
    labels = ['Minimum', 'Maximum', 'Average']
#     for i in xrange(3):
#         axes[1].plot(idx, d[i], label=labels[i])
    axes[1].plot(idx, d[1])
    axes[1].legend()
    axes[1].set_ylabel('distance')
    axes[1].set_xlabel('# of clusters')
#     plt.savefig('figs/hc_distance.png')
    plt.show()
def clustering_tweets_hc(labeled_tweets, num_cluster):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(labeled_tweets).toarray()
    # print(tweet_vec)
    n_clusters = num_cluster

    from sklearn.neighbors import kneighbors_graph

    knn_graph = kneighbors_graph(tweet_vec, 1, include_self=False)
    # print(knn_graph)

    connectivity = knn_graph
    from sklearn.cluster import AgglomerativeClustering

    model = AgglomerativeClustering(linkage='ward', connectivity=connectivity, n_clusters=n_clusters)
    model.fit(tweet_vec)
    c = model.labels_
    # print(c,len(c))

    clustered_tweets = []
    for i in range(0, num_cluster):
        similar_indices = (c == i).nonzero()[0]
        sent = ''
        for sid in similar_indices:
            sent = labeled_tweets[sid] + ' ' + sent
        clustered_tweets.append(sent)
    return clustered_tweets
Example #16
    def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
        # collect embeddings for mfi:
        X = np.asarray([self.w2v_model[w] for w in self.mfi \
                            if w in self.w2v_model], dtype='float32')
        # dimension reduction:
        tsne = TSNE(n_components=2)
        coor = tsne.fit_transform(X) # unsparsify

        plt.clf()
        sns.set_style('dark')
        plt.rcParams['axes.linewidth'] = 0.4
        fig, ax1 = plt.subplots()

        labels = self.mfi
        # first plot slices:
        x1, x2 = coor[:,0], coor[:,1]
        ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
        # clustering on top (add some colouring):
        clustering = AgglomerativeClustering(linkage='ward',
                            affinity='euclidean', n_clusters=nb_clusters)
        clustering.fit(coor)
        # add names:
        for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
            ax1.text(x, y, name, ha='center', va="center",
                     color=plt.cm.nipy_spectral(cluster_label / 10.),
                     fontdict={'family': 'Arial', 'size': 8})
        # control aesthetics:
        ax1.set_xlabel('')
        ax1.set_ylabel('')
        ax1.set_xticklabels([])
        ax1.set_xticks([])
        ax1.set_yticklabels([])
        ax1.set_yticks([])
        plt.savefig(outputfile, bbox_inches=0)
Example #17
 def pca_ward_tree(self):
     if not self.pca_reduced:
         self.pc_analysis()
     reduced_red = manifold.SpectralEmbedding(n_components=2).fit_transform(self.pca_reduced)
     clustering = AgglomerativeClustering(linkage='ward', n_clusters=3)
     clustering.fit(self.pca_reduced)
     self._plot_ward_tree(reduced_red, self.pca_reduced, self.player_value, clustering.labels_)
     return plt
 def agglomerative_clustering(self, samples):
     affinityArg = self.metric
     if self.metric == "gaussian":
         affinityArg = similairty_metrics.gaussianSimGraph
         
     ac = AgglomerativeClustering(linkage = self.linkage, n_clusters=self.num_clusters, affinity = affinityArg)
     ac.fit(samples)
     return ac.labels_
Example #19
def clusterWithSimMatrix(simMatrix, num):
  clustering = AgglomerativeClustering(n_clusters=num,
                                       affinity='precomputed',
                                       linkage='complete')
  #clustering = MiniBatchKMeans(n_clusters=num, init='k-means++', n_init=1,
  #				 init_size=1000, batch_size=1000, verbose=opts.verbose)
  clustering.fit(simMatrix)
  return clustering
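
For context, affinity='precomputed' expects an (n, n) distance matrix; a hedged sketch of producing one with pairwise_distances (if simMatrix above actually holds similarities, it would first need converting to distances, e.g. 1 - similarity):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(20, 5)
D = pairwise_distances(X, metric='cosine')  # (20, 20) distance matrix
model = AgglomerativeClustering(n_clusters=3, affinity='precomputed',
                                linkage='complete')
model.fit(D)
print(model.labels_)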
def openfaceExp(lfwAligned, net, cls):
    df = pd.DataFrame(columns=('nPpl', 'nImgs',
                               'trainTimeSecMean', 'trainTimeSecStd',
                               'predictTimeSecMean', 'predictTimeSecStd',
                               'accsMean', 'accsStd'))

    repCache = {}

    df_i = 0
    for nPpl in nPplVals:
        print(" + nPpl: {}".format(nPpl))
        cls = AgglomerativeClustering(n_clusters=nPpl)
        (X, y) = getData(lfwAligned, nPpl, nImgs, size=96, mode='rgb')
        nSampled = X.shape[0]
        ss = ShuffleSplit(nSampled, n_iter=10, test_size=0.1, random_state=0)

        allTrainTimeSec = []
        allPredictTimeSec = []
        accs = []

        for train, test in ss:
            X_train = []
            for img in X[train]:
                h = hash(str(img.data))
                if h in repCache:
                    rep = repCache[h]
                else:
                    rep = net.forward(img)
                    repCache[h] = rep
                X_train.append(rep)

            start = time.time()
            X_train = np.array(X_train)
            cls.fit(X_train, y[train])
            trainTimeSec = time.time() - start
            allTrainTimeSec.append(trainTimeSec)

            start = time.time()
            X_test = []
            for img in X[test]:
                X_test.append(net.forward(img))
            y_predict = cls.fit_predict(X_test)
            predictTimeSec = time.time() - start
            allPredictTimeSec.append(predictTimeSec / len(test))
            y_predict = np.array(y_predict)
            print(y[test], y_predict)
            acc = accuracy_score(y[test], y_predict)
            print(acc)
            accs.append(acc)

        df.loc[df_i] = [nPpl, nImgs,
                        np.mean(allTrainTimeSec), np.std(allTrainTimeSec),
                        np.mean(allPredictTimeSec), np.std(allPredictTimeSec),
                        np.mean(accs), np.std(accs)]
        df_i += 1

    return df
Example #21
def test_connectivity_callable():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False))
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
Example #22
def hierarchical(X, num_clusters):
    """
    Hierarchical Clustering on X for response y
    Returns array of cluster groups
    """
    model = AgglomerativeClustering(n_clusters=num_clusters)
    cleanX = preprocessing.scale(X.as_matrix())
    model.fit(cleanX)
    return model.labels_
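
A hedged usage sketch with a toy DataFrame (column names are hypothetical; assumes the snippet's preprocessing import, i.e. from sklearn import preprocessing):

import pandas as pd

df = pd.DataFrame({'a': [1.0, 1.1, 5.0, 5.2],
                   'b': [0.9, 1.0, 4.8, 5.1]})
print(hierarchical(df, num_clusters=2))  # two labels; the numbering is arbitrary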
Example #23
    def __init__(self, dataset, classes, active_selecting=True,
            subsample_qs=None, random_state=None):
        super(HierarchicalSampling, self).__init__(dataset)
        X = np.array(next(zip(*self.dataset.get_entries())))
        cluster = AgglomerativeClustering()
        cluster.fit(X)
        childrens = cluster.children_

        if subsample_qs is not None:
            if not isinstance(subsample_qs, QueryStrategy):
                raise TypeError("subsample_qs has to be a QueryStrategy")
            self.sub_qs = subsample_qs
        else:
            self.sub_qs = None

        self.active_selecting = active_selecting
        self.random_state_ = seed_random_state(random_state)
        self.n = len(childrens) + 1
        self.m = self.n * 2 - 1
        self.num_class = len(classes)
        self.classes = list(classes)
        self.class_id = dict(zip(self.classes, range(self.num_class)))

        self.parent = np.full(self.m, NO_NODE, dtype=int)
        self.size = np.zeros(self.m, dtype=int)
        self.depth = np.zeros(self.m, dtype=int)
        for i, (left_child, right_child) in enumerate(childrens):
            parent = i + self.n
            self.parent[left_child] = parent
            self.parent[right_child] = parent
        self.left_child = np.concatenate([np.full(self.n, NO_NODE), childrens[:,0]]).astype(int)
        self.right_child = np.concatenate([np.full(self.n, NO_NODE), childrens[:,1]]).astype(int)

        for i in range(self.n):
            node = i
            cur_depth = 0
            while node != NO_NODE:
                assert node >= 0 and node < self.m
                self.size[node] += 1
                self.depth[node] = max(self.depth[node], cur_depth)
                cur_depth += 1
                node = self.parent[node]

        self.count = np.zeros((self.m, self.num_class), dtype=int)
        self.total = np.zeros(self.m, dtype=int)
        self.upper_bound = np.ones((self.m, self.num_class), dtype=float)
        self.lower_bound = np.zeros((self.m, self.num_class), dtype=float)
        self.admissible = np.zeros((self.m, self.num_class), dtype=bool)
        self.best_label = np.full(self.m, NO_LABEL, dtype=int)
        self.split = np.zeros(self.m, dtype=bool)
        self.cost = self.size.copy()

        self.prunings = [self.m-1]

        for i, entry in enumerate(self.dataset.data):
            if entry[1] is not None:
                self.update(i, entry[1])
def hierarchical_clustering(corpus_fn, n_clusters=2, linkage='complete'):
    corpus = corpora.MmCorpus(corpus_fn)
    corpus = matutils.corpus2csc(corpus, num_terms=corpus.num_terms).transpose()
    svd = TruncatedSVD(n_components=100)
    new_corpus = svd.fit_transform(corpus)
    knn_graph = kneighbors_graph(new_corpus, 10, metric='euclidean')
    agg = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage=linkage, connectivity=knn_graph)
    agg.fit(new_corpus)
    return corpus, agg.labels_
Example #25
def pheno_cluster_scikit(data, **kwargs):
    """Use the scikit-learn package to cluster the data according to the
    phenotypic distances.

    """
    import matplotlib.pyplot as plt
    plt.subplot(221)
    model = AgglomerativeClustering(linkage='ward', n_clusters=10)
    model.fit(data)
    print('done')
Example #26
    def fn(inst):
        if not 'x' in inst:
            raise Exception('no x')

        x = inst['x']

        agg = AgglomerativeClustering(*args, **margs)
        agg.fit(x)

        return inst.set('model', agg).set('prediction', agg.labels_)
def agglomerative(doc_term_matrix, k, linkage) :
    ## Documentation here:
    ## http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
    agg = AgglomerativeClustering(n_clusters=k, linkage=linkage)
    print("Clustering sparse data with %s" % agg)
    t0 = time()
    ## This call does the job but it requires a dense doc_term_matrix.
    agg.fit(doc_term_matrix.todense())
    print("done in %0.3fs" % (time() - t0))
    return agg
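
A hedged usage sketch (hypothetical three-document corpus; CountVectorizer produces the sparse document-term matrix that agglomerative() densifies internally):

from time import time  # agglomerative() above relies on time()
from sklearn.feature_extraction.text import CountVectorizer

docs = ["apples and oranges", "oranges and pears", "cars and trucks"]
doc_term_matrix = CountVectorizer().fit_transform(docs)  # scipy sparse matrix
model = agglomerative(doc_term_matrix, k=2, linkage='complete')
print(model.labels_)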
Example #28
def test_connectivity_ignores_diagonal():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_known_output():
    #  Creating some test labels
    new_labels = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

    data, __, __, __ = parse_data('../data/test_dataset.gct')
    test_model = AgglomerativeClustering(linkage='average', n_clusters=2, affinity=str2func['custom_euclidean'])
    test_model.fit(data)
    # We know that the euclidean distance gets one label wrong
    assert count_mislabels(test_model.labels_, new_labels) == 1
Example #30
File: aggc.py Project: s1van/cse5243
def fit(fvecs, params):
	ncluster = int(params[0])
	# linkage : {"ward", "complete", "average"}
	linkage_ = params[1]
	# affinity : "euclidean", "l1", "l2", "manhattan", "cosine", or "precomputed"
	# metric = params[2]

	model = AgglomerativeClustering(n_clusters=ncluster, linkage=linkage_, affinity='manhattan')
	model.fit(fvecs)
	return model.labels_
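
A hedged usage sketch (params is positional: the cluster count as a string, then the linkage name; note the affinity is hardcoded to 'manhattan' above, so params[2] is unused):

import numpy as np

fvecs = np.random.RandomState(0).rand(12, 4)
labels = fit(fvecs, params=['3', 'complete'])  # 3 clusters, complete linkage
print(labels)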
Example #31
#Select features
featureset = pdf[[
    'engine_s', 'horsepow', 'wheelbas', 'width', 'length', 'curb_wgt',
    'fuel_cap', 'mpg'
]]

#Normalize data
from sklearn.preprocessing import MinMaxScaler
x = featureset.values
min_max_scaler = MinMaxScaler()
feature_mtx = min_max_scaler.fit_transform(x)

dist_matrix = distance_matrix(feature_mtx, feature_mtx)
agglom = AgglomerativeClustering(n_clusters=6, linkage='complete')
agglom.fit(feature_mtx)
pdf['cluster_'] = agglom.labels_

import matplotlib.cm as cm
n_clusters = max(agglom.labels_) + 1
colors = cm.rainbow(np.linspace(0, 1, n_clusters))
cluster_labels = list(range(0, n_clusters))

# Create a figure of size 16 inches by 14 inches.
plt.figure(figsize=(16, 14))
class ClusteringAlgorithmPool:

    # common parameters
    random_state = None
    n_jobs = None
    dist_metric = None
    n_cluster = None
    runtime = None

    # K means
    kmeans_obj = None

    # Common parameter for Hierarchical clustering
    linkage_method = None

    # Hierarchical clust from scipy
    hier_scipy_obj = None

    # Agglomerative
    agglo_obj = None

    # DBSCAN
    eps_dbscan = None
    min_pts_dbscan = None
    dbscan_obj = None

    # Setting up variables and objects.
    def __init__(self, n_cluster, random_state, n_jobs, dist_metric, linkage_method, eps_dbscan, min_pts_dbscan):
        self.n_cluster = n_cluster
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.dist_metric = dist_metric
        self.runtime = numpy.zeros(4)

        # K means
        self.kmeans_obj = KMeans(n_clusters=self.n_cluster, max_iter=100, random_state=self.random_state,
                                 n_jobs=self.n_jobs)
        # Hierarchical clust
        self.linkage_method = linkage_method
        self.agglo_obj = AgglomerativeClustering(n_clusters= self.n_cluster, affinity=self.dist_metric,
                                                 linkage=self.linkage_method)

        # DBSCAN
        self.eps_dbscan = eps_dbscan
        self.min_pts_dbscan = min_pts_dbscan
        self.dbscan_obj =  DBSCAN(eps=self.eps_dbscan, min_samples=self.min_pts_dbscan, metric=self.dist_metric,
                                  algorithm ='kd_tree', n_jobs = self.n_jobs)

    def kmeans_fit(self, X):
        # Record fitting runtime
        start_time = time.time()
        print("Clustering X using K-means")
        self.kmeans_obj.fit(X)
        self.runtime[0] = time.time() - start_time
        print("Clustering ended in " + "{0:.2f}".format(round(self.runtime[0], 2)) + " seconds")

    def hierarchy_scipy_fit(self, X):
        # Record fitting runtime
        start_time = time.time()
        print("Hierarchical (Agglomerative) clustering of X using Scipy library")
        self.hier_scipy_obj = linkage(y=X, method=self.linkage_method, metric=self.dist_metric)
        self.hier_scipy_obj = fcluster(Z=self.hier_scipy_obj, t=self.n_cluster, criterion='maxclust')
        self.runtime[1] = time.time() - start_time
        print("Clustering ended in " + "{0:.2f}".format(round(self.runtime[1], 2)) + " seconds")

    def hierarchy_sklearn_fit(self, X):
        # Record fitting runtime
        start_time = time.time()
        print("Hierarchical (Agglomerative) clustering of X using Sklearn library")
        self.agglo_obj.fit(X)
        self.runtime[2] = time.time() - start_time
        print("Clustering ended in " + "{0:.2f}".format(round(self.runtime[2], 2)) + " seconds")

    def dbscan_fit(self, X):
        # Record fitting runtime
        start_time = time.time()
        print("Clustering X using DBSCAN")
        self.dbscan_obj.fit(X)
        self.runtime[3] = time.time() - start_time
        print("Clustering ended in " + "{0:.2f}".format(round(self.runtime[3], 2)) + " seconds")

    def cluster_fit_all(self, X):
        self.kmeans_fit(X)
        self.hierarchy_scipy_fit(X)
        self.hierarchy_sklearn_fit(X)
        self.dbscan_fit(X)

    def get_cluster_results(self):

        cl_al_dict = {
            "kmeans": self.kmeans_obj.labels_,
            "agnes_scipy": self.hier_scipy_obj,
            "agnes_sklearn": self.agglo_obj.labels_,
            "dbscan": self.dbscan_obj.labels_,
        }

        return cl_al_dict
Example #33
def get_AgglomerativeAverage(dataframe, K):
    AgglomerativeAverage = AgglomerativeClustering(n_clusters=K,
                                                   linkage="average")
    AgglomerativeAverage.fit(dataframe)

    return AgglomerativeAverage.labels_
Example #34
# Iris dataset
from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

# Step 1: Load Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=3,
                                  affinity='euclidean',
                                  linkage='ward')

# Step 2: Training
cluster.fit(X)

# Step 3 Evaluation
# from sklearn import metrics
# print(metrics.accuracy_score(y,cluster.labels_))

plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.title('Original')
plt.subplot(1, 2, 2)
plt.scatter(X[:, 0], X[:, 1], c=cluster.labels_)
plt.title('Agglomerative Clustering')
plt.show()

# import scipy
from scipy.cluster.hierarchy import dendrogram, linkage
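
That last import is left unused by the snippet; a hedged continuation that actually draws the dendrogram with scipy, reusing the iris X loaded above:

Z = linkage(X, method='ward')               # full merge history of the samples
dendrogram(Z, truncate_mode='lastp', p=12)  # show only the last 12 merges
plt.show()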
Example #35
# -------------------------------------------------------------------------------------------
# KMeans model
x = data1[[
    "Impressions", "Clicks", "Total_Spend",
    "Orders_placed_within_1_week_of_a_click",
    "Product_Sales_within_1_week_of_a_click", "period"
]]

x_scaled = preprocessing.StandardScaler().fit(x)
# kmeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(x_scaled.transform(x))
data1["label"] = kmeans.labels_

# silhouette coefficient
silhouette_avg = silhouette_score(x, kmeans.labels_)
print("The average silhouette_score is : ", silhouette_avg)
tm = time.strftime("%Y%m%d%H%M%S", time.localtime())
# file_name = "kmeans_cluster_%s.xlsx" %tm
# data1.to_excel(file_name, encoding="utf8")

# -------------------------------------------------------------------------------------------
# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
for linkage in ('ward', 'average', 'complete'):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=3)
    clustering.fit(x)
    silhouette_avg = silhouette_score(x, clustering.labels_)
    print "The average silhouette_score is : ", silhouette_avg
    file_name = "agglomerative_%s_cluster_%s.xlsx" % (linkage, tm)
    data1.to_excel(file_name, encoding="utf8")
Example #36
data_pipeline = Pipeline(cfg.database_name, cfg.collection_name, binary_path=cfg.binary_path)
print("Getting raw data from the DB")
data_pipeline.get_titles_and_skills_data(min_skill_length=cfg.min_skill_length, drop_list=cfg.titles_to_drop)
print(len(data_pipeline.titles_raw))
print("Dropping titles from bad title list from data")
data_pipeline.drop_titles_from_data([], min_title_freq=cfg.min_title_freq)
print("Preparing data for CountVectorizer and TfidfTransformer")
data_pipeline.prepare_data_for_count_vectorizer(skill_depth=cfg.min_skill_depth)
print("Tranforming with CountVectorizer")
data_pipeline.setup_count_vectorizer_and_transform()
print("Transforming with TfidfTransformer")
data_pipeline.setup_tfidf_transformer_and_fit_transform(data_pipeline.data_count_matrix)
print("Integer encoding titles")
data_pipeline.setup_label_encoder_and_fit_transform()
print("Dumping binaries")
data_pipeline.dump_binaries()
# Note: This is where memory requirements increase drastically because we need to store the full matrix in memory
if cfg.subsample_depth > 0:
    print("Splitting data by title and subsampling")
    data_pipeline.subsample_data(min_skill_length=cfg.min_skill_length, subsample_depth=cfg.subsample_depth)
else:
    print("Dropping data points with too few skills")
    data_pipeline.drop_matrix_rows_by_sum(min_skill_length=cfg.min_skill_length)
print("Pipeline complete!")

print("Clustering")
# Create and fit the model, dump output to a pickle in case we need it later
model = AgglomerativeClustering(affinity=cfg.affinity, linkage=cfg.linkage, n_clusters=cfg.n_cluster_stop)
clustering = model.fit(data_pipeline.data_tfidf_matrix)
dump(model, cfg.binary_path + "clustering_model.joblib")
def test_agglomerative_clustering():
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average", "single"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(
                n_clusters=10,
                connectivity=connectivity,
                memory=tempdir,
                linkage=linkage,
            )
            clustering.fit(X)
            labels = clustering.labels_
            assert np.size(np.unique(labels)) == 10
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert np.size(np.unique(clustering.labels_)) == 10
        # Check that a wrong-shaped connectivity matrix raises a ValueError
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
            linkage=linkage,
        )
        with pytest.raises(ValueError):
            clustering.fit(X)

    # Test that using ward with a metric other than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        affinity="manhattan",
        linkage="ward",
    )
    with pytest.raises(ValueError):
        clustering.fit(X)

    # Test that metrics other than euclidean work with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            affinity=affinity,
            linkage="complete",
        )
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(n_clusters=10,
                                              connectivity=None,
                                              affinity=affinity,
                                              linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_,
                                         clustering.labels_), 1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity,
        affinity="precomputed",
        linkage="complete",
    )
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
Example #38
import numpy as np 
from scipy import ndimage 
from scipy.cluster import hierarchy 
from scipy.spatial import distance_matrix 
from matplotlib import pyplot as plt 
from sklearn import manifold, datasets 
from sklearn.cluster import AgglomerativeClustering 
from sklearn.datasets import make_blobs

#Make the blobs
X2, y2 = make_blobs(n_samples=50, centers=[[4,4], [-2, -1], [1, 1], [10,4]], cluster_std=0.9)
#Create the model and train it
agglom = AgglomerativeClustering(n_clusters = 4, linkage = 'average')
agglom.fit(X2,y2)
# Create a minimum and maximum range of X2.
x_min, x_max = np.min(X2, axis=0), np.max(X2, axis=0)
# Scale X2 down to the [0, 1] range.
X2 = (X2 - x_min) / (x_max - x_min)
#Create the distance matrix
dist_matrix = distance_matrix(X2,X2)
#Create the training data
Z = hierarchy.linkage(dist_matrix, 'complete')
#Create the dendrogram
dendro = hierarchy.dendrogram(Z)

# Create a figure of size 6 inches by 4 inches.
plt.figure(figsize=(6,4))
# These two lines of code are used to scale the data points down,
# Or else the data points will be scattered very far apart.
# Create a minimum and maximum range of X2.
x_min, x_max = np.min(X2, axis=0), np.max(X2, axis=0)
    for i in range(n_clusters):
        for j in range(n_clusters):
            plt.text(i, j, '%5.3f' % avg_dist[i, j],
                     verticalalignment='center',
                     horizontalalignment='center')

    plt.imshow(avg_dist, interpolation='nearest', cmap=plt.cm.gnuplot2,
               vmin=0)
    plt.xticks(range(n_clusters), labels, rotation=45)
    plt.yticks(range(n_clusters), labels)
    plt.colorbar()
    plt.suptitle("Interclass %s distances" % metric, size=18)
    plt.tight_layout()


# Plot clustering results
for index, metric in enumerate(["cosine", "euclidean", "cityblock"]):
    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    linkage="average", affinity=metric)
    model.fit(X)
    plt.figure()
    plt.axes([0, 0, 1, 1])
    for l, c in zip(np.arange(model.n_clusters), 'rgbk'):
        plt.plot(X[model.labels_ == l].T, c=c, alpha=.5)
    plt.axis('tight')
    plt.axis('off')
    plt.suptitle("AgglomerativeClustering(affinity=%s)" % metric, size=20)


#plt.show()
    plt.figure(figsize=(14, 12), facecolor='w')
    plt.cla()
    linkages = ("ward", "complete", "average")
    for index, (n_clusters, data, y) in enumerate(((4, data1, y1), (4, data1_noise, y1_noise),
                                                   (2, data2, y2), (2, data2_noise, y2_noise))):
        plt.subplot(4, 4, 4*index+1)
        plt.scatter(data[:, 0], data[:, 1], c=y, cmap=cm)
        plt.title('Prime', fontsize=17)
        plt.grid(b=True, ls=':')
        data_min1, data_min2 = np.min(data, axis=0)
        data_max1, data_max2 = np.max(data, axis=0)
        plt.xlim(extend(data_min1, data_max1))
        plt.ylim(extend(data_min2, data_max2))

        connectivity = kneighbors_graph(data, n_neighbors=7, mode='distance', metric='minkowski', p=2, include_self=True)
        connectivity = 0.5 * (connectivity + connectivity.T)
        for i, linkage in enumerate(linkages):
            ac = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',
                                         connectivity=connectivity, linkage=linkage)
            ac.fit(data)
            y = ac.labels_
            plt.subplot(4, 4, i+2+4*index)
            plt.scatter(data[:, 0], data[:, 1], c=y, cmap=cm)
            plt.title(linkage, fontsize=17)
            plt.grid(b=True, ls=':')
            plt.xlim(extend(data_min1, data_max1))
            plt.ylim(extend(data_min2, data_max2))
    plt.suptitle(u'Different merge strategies for hierarchical clustering', fontsize=20)
    plt.tight_layout(0.5, rect=(0, 0, 1, 0.95))
    plt.show()
Example #41
                         **rescale_params)

X = np.reshape(rescaled_coins, (-1, 1))

# #############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*rescaled_coins.shape)

# #############################################################################
# Compute clustering
print("Compute structured hierarchical clustering...")
st = time.time()
n_clusters = 27  # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                               connectivity=connectivity)
ward.fit(X)
label = np.reshape(ward.labels_, rescaled_coins.shape)
print("Elapsed time: ", time.time() - st)
print("Number of pixels: ", label.size)
print("Number of clusters: ", np.unique(label).size)

# #############################################################################
# Plot the results on an image
plt.figure(figsize=(5, 5))
plt.imshow(rescaled_coins, cmap=plt.cm.gray)
for l in range(n_clusters):
    plt.contour(label == l,
                colors=[plt.cm.nipy_spectral(l / float(n_clusters)), ])
plt.xticks(())
plt.yticks(())
plt.show()
X = scale(digits.data)
n_samples, n_features = X.shape
n_digits = len(np.unique(digits.target))
labels_true = digits.target
sample_size = 300

# #######################ward######################################################
# Compute clustering
print("Compute structured hierarchical clustering...")
# st = time.time()
# n_clusters = 27  # number of regions
# ward = AgglomerativeClustering()
# ward = ward.fit(X)
# AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
#                             connectivity=None,
#                             linkage='ward', memory=None, n_clusters=2,
#                             pooling_func='deprecated')
# labels = ward.labels_
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("NMI: %0.3f" % metrics.normalized_mutual_info_score(labels_true, labels, average_method='arithmetic'))
for linkage in ('ward', 'average', 'complete', 'single'):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)
    # t0 = time()
    clustering = clustering.fit(X)
    labels = clustering.labels_
    # labels = ward.labels_
    print(linkage + ":")
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("NMI: %0.3f" % metrics.normalized_mutual_info_score(labels_true, labels, average_method='arithmetic'))
Example #43
import getData, matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering, DBSCAN

dissimilarities = getData.get_embedding_data(['rightEye'], 'bottleneck', 'tsne', 'Angry', 'metric', 30, True)

agcluster = AgglomerativeClustering(linkage='average', n_clusters=10, affinity='precomputed')
#dbscanCluster = DBSCAN(eps=1, metric='precomputed')

agcluster.fit(dissimilarities)
#dbscanCluster.fit(dissimilarities)

print(agcluster.labels_)
#print(dbscanCluster.labels_)

for i in range(len(agcluster.labels_)-1):
    if agcluster.labels_[i] != agcluster.labels_[i+1] and [agcluster.labels_[i], agcluster.labels_[i+1]] in [[8,3],[3,8]]:
        print(f'[{agcluster.labels_[i]},{agcluster.labels_[i+1]}] -> [{i},{i+1}]')

 def agglomerative(obj, clusterCount=None, **kwargs):
     model = AgglomerativeClustering(n_clusters=clusterCount)
     model.fit(obj)
     labels = model.labels_
     return attMethods.labelsToCluster(obj, labels, clusterCount)
Example #45
#Define which KMeans algorithm to use and fit it
Y_Kmeans = KMeans(n_clusters=clusters)
Y_Kmeans.fit(X)
Y_Kmeans_labels = Y_Kmeans.labels_
Y_Kmeans_silhouette = metrics.silhouette_score(X,
                                               Y_Kmeans_labels,
                                               metric='sqeuclidean')
print("Silhouette for Kmeans: {0}".format(Y_Kmeans_silhouette))
print("Results for Kmeans: {0}".format(Y_Kmeans_labels))

#Define which hierarchical clustering algorithm to use and fit it
linkage_types = ['ward', 'average', 'complete']
Y_hierarchy = AgglomerativeClustering(linkage=linkage_types[2],
                                      n_clusters=clusters)
Y_hierarchy.fit(X)
Y_hierarchy_labels = Y_hierarchy.labels_
Y_hierarchy_silhouette = metrics.silhouette_score(X,
                                                  Y_hierarchy_labels,
                                                  metric='sqeuclidean')
print("Silhouette for Hierarchical Clustering: {0}".format(
    Y_hierarchy_silhouette))
print("Hierarchical Clustering: {0}".format(Y_hierarchy_labels))

#Define figure
colormap = np.array(
    [
        'cyan', 'black', 'magenta', 'red', 'orange', 'green', 'brown',
        'yellow', 'blue', 'white'
    ]
)  #Define colors to use in graph - could use c=Y but colors are too similar when only 2-3 clusters
Example #46
# In[23]:

pairwise = pd.DataFrame(data=corr,
                        index=signature.keys(),
                        columns=signature.keys())

# In[24]:

from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

cluster = AgglomerativeClustering(n_clusters=numClusters,
                                  affinity='euclidean',
                                  linkage='ward')
clusterers = cluster.fit(pairwise.values)

sil_score = metrics.silhouette_score(pairwise.values,
                                     clusterers.labels_,
                                     metric='euclidean')

# In[25]:

cells = pairwise.index

for idx in range(len(cells)):
    grid.loc[grid['unitArea'] == cells[idx],
             'cluster'] = clusterers.labels_[idx]

# In[26]:
    def grafica(self):
        import matplotlib.pyplot as plt
        from itertools import cycle
        import numpy as np

        plt.figure(figsize=(9, 3))
        plt.subplot(131)

        ms = MeanShift(bin_seeding=True)
        ms.fit(self.X)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique)

        colors = cycle('bgrcmyk')
        start_time = time()
        with parallel_backend('threading', n_jobs=n_jobs_parrallel):
            for k, col in zip(range(n_clusters_), colors):
                my_members = labels == k
                cluster_center = cluster_centers[k]
                plt.plot(self.X[my_members, 0], self.X[my_members, 1],
                         col + '.')
                plt.plot(cluster_center[0],
                         cluster_center[1],
                         'o',
                         markerfacecolor=col,
                         markeredgecolor='k',
                         markersize=14)
        elapsed_time = time() - start_time
        elapsed_time = format(elapsed_time, '.6f')
        salida = 'Execution time: ' + str(elapsed_time) + ' seconds'
        plt.title('MeanShift Estimated number of clusters:' +
                  str(n_clusters_) + '\n' + salida)

        plt.subplot(132)
        model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)

        model = model.fit(self.X)
        plt.title('Hierarchical Clustering Dendrogram')
        # plot the top three levels of the dendrogram
        plot_dendrogram(model, truncate_mode='level', p=3)
        plt.xlabel(
            "Number of points in node (or index of point if no parenthesis).")

        plt.subplot(133)

        import numpy as np
        from sklearn.cluster import DBSCAN
        from sklearn import metrics
        from sklearn.datasets import make_blobs
        from sklearn.preprocessing import StandardScaler

        # #############################################################################
        # Compute DBSCAN
        db = DBSCAN(eps=0.3, min_samples=10).fit(self.X)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)

        print('Estimated number of clusters: %d' % n_clusters_)
        print('Estimated number of noise points: %d' % n_noise_)
        # #############################################################################
        # Plot result
        import matplotlib.pyplot as plt

        # Black removed and is used for noise instead.
        unique_labels = set(labels)
        colors = [
            plt.cm.Spectral(each)
            for each in np.linspace(0, 1, len(unique_labels))
        ]
        for k, col in zip(unique_labels, colors):
            if k == -1:
                # Black used for noise.
                col = [0, 0, 0, 1]

            class_member_mask = (labels == k)

            xy = self.X[class_member_mask & core_samples_mask]
            plt.plot(xy[:, 0],
                     xy[:, 1],
                     'o',
                     markerfacecolor=tuple(col),
                     markeredgecolor='k',
                     markersize=14)

            xy = self.X[class_member_mask & ~core_samples_mask]
            plt.plot(xy[:, 0],
                     xy[:, 1],
                     'o',
                     markerfacecolor=tuple(col),
                     markeredgecolor='k',
                     markersize=6)
        plt.title('DBSCAN')

        if self.nombreFichero:
            plt.savefig(self.nombreFichero)
        else:
            plt.show()
def plot_dendrogram(model, **kwargs):
    children = model.children_

    # Uniform distances between merge levels (children_ carries no distances)
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

#%%
cah_fit = AgglomerativeClustering(n_clusters=10)

#%%
cah_fit = cah_fit.fit(clustering.kmeans_centers)

#%%
fig = plt.figure(1, figsize=(12, 7))
plot_dendrogram(cah_fit, labels = cah_fit.labels_)

#%%
cah_fit.labels_

#%%
tmp = Tools.read_np_picture('data/processed/models/autoencoder/train/k/20171109-192001.jpg',target_size = (27, 48), scale = 1/255)
tmp = tmp.reshape((1,27,48,3))
np.sum(model.get_encoded_prediction(tmp))

#%%
filenames = Tools.list_directory_filenames('data/processed/models/autoencoder/train/k/')
Example #49
    plt.figure(figsize=(6, 4))
    for i in range(X_red.shape[0]):
        plt.text(X_red[i, 0], X_red[i, 1], str(y[i]),
                 color=plt.cm.nipy_spectral(labels[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title, size=17)
    plt.axis('off')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

print("Computing embedding")
X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)
print("Done.")

from sklearn.cluster import AgglomerativeClustering

for linkage in ('ward', 'average', 'complete'):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)
    t0 = time()
    clustering.fit(X_red)
    print("%s :\t%.2fs" % (linkage, time() - t0))

    plot_clustering(X_red, clustering.labels_, "%s linkage" % linkage)


plt.show()
Example #50
def spatial_cluster(loc_dict, Cvar_dict, shapefile, cluster_num, file_path_elev, idx_list,
                    plot_2D, plot_3D, return_all):
    '''Spatial clustering based on scikit learn's agglomerative clustering

    Parameters
    ----------
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         cluster_num : int
              number of clusters
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         plot_2D : bool
              whether to plot maps of the clusters in 2d
         plot_3D : bool
              whether to plot maps of the clusters in 3d             
         return_all : bool
            whether or not to return all the outputs (needed for selecting cluster size)
            
    Returns
    ----------
         dictionary
             - a dictionary giving the cluster that each station is in
    '''

    x = []
    y = []

    proj_stations = {}
    for station in Cvar_dict.keys():
        if station in loc_dict.keys():
            coord = loc_dict[station]
            Plon1, Plat1 = pyproj.Proj('esri:102001')(
                coord[1], coord[0])  # longitude,lat
            Plat = float(Plat1)
            Plon = float(Plon1)
            x.append([Plon])
            y.append([Plat])
            proj_stations[station] = [Plat, Plon]
    X = [val+y[i] for i, val in enumerate(x)]
    X = np.array(X)
    # print(X)
    # Make the longitudinal transect of distance (lon, elev)

    Xi1_grd = []
    Yi1_grd = []
    elev_grd = []
    # Preparing the coordinates to send to the function that will get the elevation grid
    concat = np.array((x, y)).T
    send_to_list = concat[0].tolist()
    send_to_tuple = [tuple(x) for x in send_to_list]
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(
        send_to_tuple, file_path_elev, idx_list)

    for keys in elev_grd_dict.keys():  # The keys are each lat lon pair
        x = keys[0]
        y = keys[1]
        Xi1_grd.append(x)
        Yi1_grd.append(y)
        # Append the elevation data to the empty list
        elev_grd.append(elev_grd_dict[keys])

    lon = [i for i in Xi1_grd]  # copy of the longitude values
    lon_list = [[i] for i in lon]
    lat_list = [[i] for i in Yi1_grd]
    elev = [[i] for i in elev_grd]  # put into sublist so you can make pairs
    Xelev = [val+lat_list[i]+elev[i] for i, val in enumerate(lon_list)]
    Xelev = np.array(Xelev)

    # This is where we make the connectivity graph based on elevation

    knn_graph = kneighbors_graph(Xelev, 10, include_self=False)
    connectivity = knn_graph
    n_clusters = cluster_num

    linkage = 'ward'

    model = AgglomerativeClustering(
        linkage=linkage, connectivity=connectivity, n_clusters=n_clusters)

    model.fit(Xelev)  # fit with lat lon elev
    label = model.labels_

    if plot_3D:
        fig = plt.figure()
        ax = p3.Axes3D(fig)
        ax.view_init(7, -80)
        for l in np.unique(label):
            ax.scatter(Xelev[label == l, 0], Xelev[label == l, 1], Xelev[label == l, 2],
                       color=plt.cm.jet(float(l) / np.max(label + 1)),
                       s=20, edgecolor='k')
        plt.title('With connectivity constraints, Elevation inc.')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        ax.set_zlabel('Elevation (m)')

        plt.show()

    # This is where we make the connectivity graph where we can see on the map
    if plot_2D:

        fig, ax = plt.subplots(figsize=(15, 15))
        crs = {'init': 'esri:102001'}
        na_map = gpd.read_file(shapefile)

        na_map.plot(ax=ax, color='white', edgecolor='k', linewidth=1, alpha=1)

        plt.scatter(Xelev[:, 0], Xelev[:, 1], c=model.labels_,
                    cmap=plt.cm.tab20b, s=20, edgecolor='k')

        ax.tick_params(axis='both', which='both', bottom=False, top=False,
                       labelbottom=False, right=False, left=False, labelleft=False)
        ax.ticklabel_format(useOffset=False, style='plain')

        # plt.subplots_adjust(bottom=0, top=.83, wspace=0,
        # left=0, right=1)
        # plt.suptitle('n_cluster=%i, connectivity=%r' %
        # (n_clusters, connectivity is not None), size=17)

        plt.show()

    # Make a dictionary with each class
    station_class = {}

    count = 0
    for val in Xelev:
        # Look up the station(s) registered at this projected (lat, lon) pair
        key = [k for k, value in proj_stations.items()
               if value == [val[1], val[0]]]
        if 1 <= len(key) <= 3:
            # We add 1, because for the random selection the groups start at 1
            for station in key:
                station_class[station] = label[count] + 1
        else:
            print('Too many stations have the same lat lon.')
        count += 1

    if count != label.shape[0]:
        print('The groups and label matrix do not match')

    if return_all:
        return label, Xelev, station_class
    else:
        return station_class
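Since return_all exists to support choosing the cluster count, here is a minimal sketch (pick_cluster_num is a hypothetical helper, not part of the original) that scores candidate counts with silhouette_score over the same elevation-aware connectivity graph:

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.neighbors import kneighbors_graph

def pick_cluster_num(Xelev, candidates=range(2, 12)):
    # Reuse the same 10-neighbour connectivity constraint as above
    connectivity = kneighbors_graph(Xelev, 10, include_self=False)
    scores = {}
    for k in candidates:
        labels = AgglomerativeClustering(linkage='ward',
                                         connectivity=connectivity,
                                         n_clusters=k).fit_predict(Xelev)
        scores[k] = silhouette_score(Xelev, labels)
    return max(scores, key=scores.get), scores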
Example #51
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn import datasets

iris = datasets.load_iris()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=30)
# ------------K means Clustering
from sklearn.cluster import KMeans

km_cls = KMeans(n_clusters=2)

km_cls = km_cls.fit(X_train) 

#km_cls.predict(X_test)

# homogeneity_score expects (labels_true, labels_pred)
print(metrics.homogeneity_score(y_test, km_cls.predict(X_test)))


# -----------------Agglomerative Clustering -------

from sklearn.cluster import AgglomerativeClustering
agg_cls = AgglomerativeClustering(n_clusters=3)
agg_cls = agg_cls.fit(X_train)
# AgglomerativeClustering has no predict(); fit_predict() re-fits on X_test
print(metrics.homogeneity_score(y_test, agg_cls.fit_predict(X_test)))
plt.subplot(2, 1, 1)
plt.plot(X_train, y_train, 'ro')

plt.subplot(2, 1, 2)
plt.plot(agg_cls.fit_predict(X_train), y_train)
plt.show()
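Homogeneity alone can be gamed by over-splitting the data; completeness and V-measure round out the picture. A short follow-up sketch using the objects defined above:

labels_pred = agg_cls.fit_predict(X_test)
print(metrics.completeness_score(y_test, labels_pred))
print(metrics.v_measure_score(y_test, labels_pred))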
Example #52
 
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering

# kmeans clustering of cell line data
X = data_in.values
y = np.concatenate((np.zeros(20), np.ones(20)))

sklearn_KMeans = KMeans(n_clusters=2, random_state=0)
sklearn_KMeans.fit(X)

print("KMeans clustering results: labels")
print(sklearn_KMeans.labels_)
print("KMeans clustering results: cluster centers")
print(sklearn_KMeans.cluster_centers_)

# hierarchical clustering
sklearn_agglomerative_clustering = AgglomerativeClustering(n_clusters=2, linkage='complete', affinity='euclidean')
sklearn_agglomerative_clustering.fit(X)
print("agglomerative clustering results: complete linkage, euclidean distance")
print(sklearn_agglomerative_clustering.labels_)

sklearn_agglomerative_clustering = AgglomerativeClustering(n_clusters=2, linkage='average', affinity='euclidean')
sklearn_agglomerative_clustering.fit(X)
print("agglomerative clustering results: average linkage, euclidean distance")
print(sklearn_agglomerative_clustering.labels_)

sklearn_agglomerative_clustering = AgglomerativeClustering(n_clusters=2, linkage='complete', affinity='manhattan')
sklearn_agglomerative_clustering.fit(X)
print("agglomerative clustering results: complete linkage, manhattan distance")
print(sklearn_agglomerative_clustering.labels_)

# -----------------------------------------------------
# 8. use logistic regression for classification of the cell line data
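The three near-identical blocks above differ only in their parameters; a compact equivalent (a sketch over the same X) loops through the combinations:

for linkage, affinity in [('complete', 'euclidean'),
                          ('average', 'euclidean'),
                          ('complete', 'manhattan')]:
    clustering = AgglomerativeClustering(n_clusters=2, linkage=linkage, affinity=affinity)
    clustering.fit(X)
    print("agglomerative clustering results: %s linkage, %s distance" % (linkage, affinity))
    print(clustering.labels_)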
Example #53
def clustering_Agglomerative(self, semi_matrix, n_clusters=15):
    # semi_matrix: a precomputed (square, symmetric) distance matrix
    model_Agg = AgglomerativeClustering(affinity='precomputed', linkage='average', n_clusters=n_clusters)
    model_Agg.fit(semi_matrix)
    return model_Agg.labels_
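With affinity='precomputed', fit() expects a distance matrix rather than raw features. A sketch of how such a matrix might be built (X here stands in for whatever feature data the caller has):

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

D = pairwise_distances(X, metric='cosine')  # X: hypothetical (n_samples, n_features) data
labels = AgglomerativeClustering(affinity='precomputed', linkage='average',
                                 n_clusters=15).fit_predict(D)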
Example #54
from sklearn import tree
from sklearn.cluster import KMeans, AgglomerativeClustering

tfidf_mat = tfidf_vectorizer.fit_transform(train)

tfidf_mat_test = tfidf_vectorizer.transform(test)

###### Kmeans

kmeans = KMeans(n_clusters=30, random_state=0, n_jobs=-1).fit(tfidf_mat)

print(accuracy(train, train_class, kmeans.predict(tfidf_mat)))

print(accuracy(test, test_class, kmeans.predict(tfidf_mat_test)))

### Decision trees

clf = tree.DecisionTreeClassifier(max_depth=10)
clf = clf.fit(tfidf_mat, train_class)

train_result = clf.predict(tfidf_mat)

print(decision_accuracy(train_result, train_class))

test_result = clf.predict(tfidf_mat_test)
print(decision_accuracy(test_result, test_class))

#### Hierarchical clustering

model = AgglomerativeClustering(n_clusters=30)
# fit() takes no labels; the clustering is unsupervised
model.fit(tfidf_mat.toarray())
print(model.labels_)
print(accuracy(train, train_class, model.labels_))
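Note that unlike KMeans, the agglomerative model above cannot label the held-out tf-idf matrix. One common workaround (a hypothetical helper, not from the original) assigns each test point to the nearest training-cluster centroid:

import numpy as np

def assign_to_nearest_centroid(train_X, train_labels, test_X):
    # centroid of each training cluster, stacked into a (k, d) array
    centroids = np.vstack([train_X[train_labels == c].mean(axis=0)
                           for c in np.unique(train_labels)])
    # squared Euclidean distance from every test point to every centroid
    dists = ((test_X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=-1)
    return dists.argmin(axis=1)

test_labels = assign_to_nearest_centroid(tfidf_mat.toarray(), model.labels_,
                                         tfidf_mat_test.toarray())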
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import pandas as pd

features = pd.read_csv(r'T:\tbase\tbase_data_imputed.csv', sep=',')
features = features.drop('Unnamed: 0', axis=1)
features_imp = features.copy()
feature_list = list(features.columns)

# Use diagnosis data only
columns_to_use = [x for x in features.columns if 'Diag' in x]
features = features[columns_to_use]

# Alternative: use all features except the IDs and the target
# (enable these drops instead of the diagnosis-only selection above;
# after that selection these columns no longer exist and the drops would fail)
# features.drop('Longterm_TransplantOutcome', axis=1, inplace=True)
# features.drop('TransplantationID', axis=1, inplace=True)
# features.drop('PatientID', axis=1, inplace=True)
# features.drop('tenure', axis=1, inplace=True)

dendrogram = sch.dendrogram(sch.linkage(features, method='ward'))

model = AgglomerativeClustering(n_clusters=3,
                                affinity='euclidean',
                                linkage='ward')
model.fit(features)
labels = model.labels_

clusters = pd.DataFrame(labels, columns=['cluster'])

features = features_imp.join(clusters)
features.to_csv(r'T:\tbase\tbase_data_hierarchical_3_clusters.csv')
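Since a ward linkage was already computed for the dendrogram, the three flat clusters could equally be read off with scipy instead of re-fitting AgglomerativeClustering; a sketch (assuming it runs before `features` is joined with the cluster column above):

Z = sch.linkage(features, method='ward')
labels_at_cut = sch.fcluster(Z, t=3, criterion='maxclust')  # exactly 3 flat clusters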

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering


def plot_dendrogram(model, **kwargs):
    # Build a scipy-style linkage matrix from the fitted model, then plot it

    # count the samples under each node of the merge tree
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


iris = load_iris()
X = iris.data

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)

model = model.fit(X)
plt.title('Hierarchical Clustering Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
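A related usage worth knowing (a sketch on the same iris data): with a nonzero distance_threshold and n_clusters=None, the model decides the number of clusters itself and reports it via n_clusters_. The threshold value here is an arbitrary illustration:

flat_model = AgglomerativeClustering(distance_threshold=10.0, n_clusters=None)
flat_model = flat_model.fit(X)
print('clusters found:', flat_model.n_clusters_)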
Example #57
    temp = np.average(a=model_values, weights=solution_weights, axis=0)

    agg_data = aggregate_data(temp, case=case)

    all_model_data = agg_data.reshape(12, -1, order='A').T  # shape = (9900, 12); 9900 = 11*25*36

    model_data = pd.DataFrame.from_records(all_model_data).dropna(axis=0)

    ocean_points_index = model_data.index.values  # ocean points index
    model_data = model_data.values  # train data

    model_labels = np.zeros(shape=(len(all_model_data)), dtype=int)

    hac = AgglomerativeClustering(n_clusters=nb_classes)
    hac = hac.fit(som_model.codebook)
    ocean_predicted_labels = get_reverse_classification(ctk.findbmus(
        sm=som_model, Data=model_data),
                                                        hac_labels=hac.labels_)
    # shift by 1 because label 0 is reserved for land
    model_labels[ocean_points_index] = ocean_predicted_labels.flatten() + 1

    if case.upper() == 'ALL':
        model_labels_ = model_labels.reshape(11, 25, 36, order='A')
    elif case.upper() == 'SEL':
        model_labels_ = model_labels.reshape(11, 13, 12, order='A')

    perf_vector = get_projection_errors(true_labels=true_labels,
                                        pred_labels=model_labels_)
    print(f'{"*"*10} Genetic algorithm results {"*"*10}')
    print(f'\t\t[+] solution weights : {solution_weights}\n')
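get_reverse_classification is external to this snippet; a hypothetical reimplementation (my naming and signature, shown only to clarify the idea) would let each sample inherit the HAC label of its best-matching SOM unit:

import numpy as np

def reverse_classification(bmu_indices, hac_labels):
    # bmu_indices: index of each sample's best-matching unit in the codebook
    bmu_indices = np.asarray(bmu_indices, dtype=int).ravel()
    return np.asarray(hac_labels)[bmu_indices]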
Example #58
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

arr = np.array(ClusterDF[['time', 'check_in_count']])
indices = np.random.randint(0,arr.shape[0],200)
X = arr[indices]

plt.figure(figsize=(9, 10))  
plt.title("Data Dendrogram Single Link")  
dend = shc.dendrogram(shc.linkage(X, method='ward'))  
print ('\nprint dend single link\n')
print(dend)

plt.show() 



cluster = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')  
print(cluster.fit(X))

plt.figure(figsize=(9, 10))  
plt.scatter(X[:,0], X[:,1], c=cluster.labels_, cmap='rainbow')  
plt.title("Single Link")
plt.show()


#Average Link
plt.figure(figsize=(9, 10))  
plt.title("Data Dendrogram Average Link")  
dend = shc.dendrogram(shc.linkage(X, method='average'))  
print ('\nprint dend average link\n')
print(dend)

plt.show() 
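The per-linkage blocks above repeat the same plotting code; a compact sketch that keeps titles and methods in sync by looping:

for method in ('single', 'average', 'ward'):
    plt.figure(figsize=(9, 10))
    plt.title("Data Dendrogram %s Link" % method.capitalize())
    shc.dendrogram(shc.linkage(X, method=method))
    plt.show()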
Example #59
def plot_cluster_components(df,
                            decomposition='tsne',
                            lle_method='standard',
                            plot='2D',
                            n_clusters=3,
                            n_components=3,
                            titlex='XXX',
                            fname=None,
                            azim=90,
                            elev=90):
    import numpy as np
    from sklearn.cluster import AgglomerativeClustering, KMeans
    from sklearn.decomposition import PCA, KernelPCA
    from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding, MDS, SpectralEmbedding
    import matplotlib.pyplot as plt

    clusterx = AgglomerativeClustering(n_clusters=n_clusters,
                                       affinity='euclidean',
                                       linkage='ward')
    clusterx.fit(df)

    # decompose data and plot 2Pcs
    n_components = 3
    n_neighbors = 10
    if decomposition == 'isomap':
        data_projected = Isomap(n_components=n_components).fit_transform(df)
    elif decomposition == 'tsne':
        data_projected = TSNE(n_components=n_components,
                              init='pca',
                              random_state=0).fit_transform(df)
    elif decomposition == 'mds':
        data_projected = MDS(n_components, max_iter=100,
                             n_init=1).fit_transform(df)
    elif decomposition == 'spectral':
        data_projected = SpectralEmbedding(
            n_components=n_components,
            n_neighbors=n_neighbors).fit_transform(df)
    elif decomposition == 'lle':
        # valid lle_method values: 'standard', 'ltsa', 'hessian', 'modified'
        data_projected = LocallyLinearEmbedding(
            n_neighbors, n_components, eigen_solver='auto',
            method=lle_method).fit_transform(df)
    elif decomposition == 'kpca':
        kpca = KernelPCA(
            n_components=n_components,
            kernel="rbf",
            fit_inverse_transform=True,
            gamma=10,
        )
        data_projected = kpca.fit_transform(df)
    elif decomposition == 'pca':
        pca = PCA(n_components=n_components)
        data_projected = pca.fit_transform(df)

    colors = ['g', 'r', 'b', 'm', 'k', 'c']  # one distinct color per cluster

    if plot == '2D':
        fig = plt.figure(figsize=(10, 10))
        for i in range(n_clusters):
            ds = data_projected[np.where(clusterx.labels_ == i)]
            # select only data observations with cluster label == i
            plt.xlabel('PC1', fontsize=20, weight='bold', labelpad=15)
            plt.ylabel('PC2', fontsize=20, weight='bold', labelpad=15)
            plt.plot(ds[:, 0], ds[:, 1], 'o', c=colors[i])
            plt.axis('tight')
            #if title:
            plt.title(titlex, fontsize=20)


    elif plot == '3D':
        print(data_projected.shape)

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.set_xlabel('PC1', fontsize=20, weight='bold', labelpad=15)
        ax.set_ylabel('PC2', fontsize=20, weight='bold', labelpad=15)
        ax.set_zlabel('PC3', fontsize=20, weight='bold', labelpad=15)

        for i in range(n_clusters):
            ds = data_projected[np.where(clusterx.labels_ == i)]
            ax.scatter(ds[:, 0], ds[:, 1], ds[:, 2], c=colors[i], s=200)

        ax.view_init(elev=elev, azim=azim)
        plt.title(titlex)

    plt.xticks(fontsize=15, weight='bold')
    plt.yticks(fontsize=15, weight='bold')

    if fname:
        plt.savefig(fname, dpi=300, bbox_inches='tight', transparent=True)
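A usage sketch on synthetic data (df_demo and its shape are made up for illustration):

import numpy as np
import pandas as pd

df_demo = pd.DataFrame(np.random.RandomState(0).rand(60, 8))
plot_cluster_components(df_demo, decomposition='pca', plot='2D',
                        n_clusters=3, titlex='PCA demo')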
Example #60
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering


def plot_dendrogram(model, **kwargs):
    # Children of hierarchical clustering (each row merges two nodes)
    children = model.children_

    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    shc.dendrogram(linkage_matrix, **kwargs)

# Perform agglomerative clustering with sklearn
model = AgglomerativeClustering(n_clusters=5, linkage='single')
model.fit(termDocMatrix.T)
plt.title("Messages Dendrogram with sklearn")
plot_dendrogram(model)
plt.show()


'''

    CORRELATION BETWEEN CLUSTERS USING DTW

'''

# Import DTW library
from dtaidistance import dtw
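
# A minimal check that the import works (toy series, values made up):
import numpy as np
s1 = np.array([0.0, 1, 2, 1, 0])
s2 = np.array([0.0, 0, 1, 2, 1, 0])
print(dtw.distance(s1, s2))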

# Get time information from messages dataset