Example #1
def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """
    Given a word2vec model and a clustering method ("kmeans" or "spectral"),
    make a plot of all word vectors in the model.
    """
    X, keys = make_data_matrix(model)

    for i, key in enumerate(keys):
        X[i,] = model[key]

    if cluster == "kmeans":
        k_means = KMeans(n_clusters=8)
        labels = k_means.fit_predict(X)

    elif cluster == "spectral":
        sp_clust = SpectralClustering()
        labels = sp_clust.fit_predict(X)

    # PCA
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)

    return sklearn_pca.explained_variance_ratio_
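
A minimal, self-contained sketch of the same standardize / PCA / KMeans flow on synthetic data, since make_data_matrix() and scatter_plot() above are project-specific helpers that are not shown; the array shape and cluster count below are illustrative assumptions.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 50))        # stand-in for the word-vector matrix

labels = KMeans(n_clusters=8, random_state=0).fit_predict(X)

X_std = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X_std)

print(X_2d.shape)                     # (200, 2), ready for a 2-D scatter plot
print(pca.explained_variance_ratio_)  # variance captured by the two axes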
Example #2
 def add_kmeans_col(self, iter = 1000, n_init = 10, n = 4):
     '''Add a new k_means cluster column to X data'''
     logging.info('Adding kmeans %d clusters to X' %(n))
     km = KMeans(n_clusters=n, max_iter=iter, n_init=n_init)
     km.fit(self.X[:,1:]) # XXX: This might not be kosher as it affects all of X
     self.models['km-col'] = km        
     self.X = np.hstack( (self.X, km.predict(self.X[:,1:]).reshape(-1,1)) )   
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print "sckit centers"
        print km_sci.cluster_centers_
Example #4
def re_classify_dict():
    dict_file = open("_dictionary.pickle", "rb")
    sc_list = cPickle.load(dict_file)
    sc_list = np.concatenate(sc_list)

    Dh_dict = sc_list[:, 144:]
    Dl_dict = sc_list[:, :144]

    k_means = KMeans(n_clusters=15)
    k_means = k_means.fit(Dl_dict)
    y_predict = k_means.predict(Dl_dict)

    num = []
    y_tmp = np.asarray(y_predict, dtype=int) * 0 + 1
    for i in range(len(np.unique(y_predict))):
        num.append(np.sum(y_tmp[y_predict == i]))
    rand = np.asarray(num).argsort()  # class indices sorted by patch count per class, from fewest to most

    classified_hdict = []
    classified_patch = []
    for i in rand:
        predict_temp = y_predict == i
        classified_hdict.append(Dh_dict[predict_temp])
        print len(classified_hdict[-1])

    for i in range(9):
        x = i % 3
        y = i / 3
        # run one coefficient-coding test
        patch_show(classified_hdict[i+5][:100], [0.05+x*0.31, 0.05+y*0.31, 0.3, 0.3], i)

    plt.show()
Example #5
def Kmeans_cluster_analysis(x,y,n_clusters):
    X = np.hstack((x.reshape((x.shape[0],1)),y.reshape((y.shape[0],1))))
    X = Scaler().fit_transform(X)
    km = KMeans(n_clusters)
    km.fit(X)
    labels = km.labels_
    cluster_centers = km.cluster_centers_
    
    labels_unique = set(labels) #np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk' #cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    #colors = pl.cm.Spectral(np.linspace(0, 1, len(labels_unique)))
    for i in xrange(len(labels_unique)):
        my_members = labels == i
        cluster_center = cluster_centers[i]
        plt.scatter(X[my_members, 0], X[my_members, 1],s=90,c=colors[i],alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1],marker='+',s=280,c=colors[i])
    tolx = (X[:,0].max()-X[:,0].min())*0.03
    toly = (X[:,1].max()-X[:,1].min())*0.03
    plt.xlim(X[:,0].min()-tolx,X[:,0].max()+tolx)
    plt.ylim(X[:,1].min()-toly,X[:,1].max()+toly)
    plt.show()
    return labels
def csv_parser(fileName):
    data = open(fileName, 'rU').readlines()
    outfile = fileName[:-4] + '_kmeans.csv'
    fhout = open(outfile, 'w')
    outfile = data[0].strip() + ',Label' + '\n'
    fhout.write(outfile)


    vaf = []

    for line in data[1:]:
        flds = line.split(',')
        vaf.append([float(flds[7]), float(flds[8])])

    print vaf[:5]

    vaf_np = np.array(vaf)
    print len(vaf_np)
    print vaf_np[:5]

    kmeansModel = KMeans(n_clusters=6, init='k-means++', n_init=100, max_iter=3000)

    labels = kmeansModel.fit_predict(vaf_np)

##    clustDist = model.transform(vaf_np)
    print labels[:30]

    for j in range(1, len(data)):
        outline = data[j].strip() + ',' + str(labels[j-1]) + '\n'
        fhout.write(outline)
    fhout.close()
Example #7
    def initialize_hypers(self, W):
        mu_0 = W.mean(axis=(0,1))
        sigma_0 = np.diag(W.var(axis=(0,1)))

        # Set the global cov
        nu_0 = self._cov_model.nu_0
        self._cov_model.sigma_0 = sigma_0 * (nu_0 - self.B - 1)

        # Set the mean
        for c1 in xrange(self.C):
            for c2 in xrange(self.C):
                self._gaussians[c1][c2].mu_0 = mu_0
                self._gaussians[c1][c2].sigma = self._cov_model.sigma_0
                self._gaussians[c1][c2].resample()

        if self.special_case_self_conns:
            W_self = W[np.arange(self.N), np.arange(self.N)]
            self._self_gaussian.mu_0 = W_self.mean(axis=0)
            self._self_gaussian.sigma_0 = np.diag(W_self.var(axis=0))
            self._self_gaussian.resample()

        # Cluster the neurons based on their rows and columns
        from sklearn.cluster import KMeans
        features = np.hstack((W[:,:,0], W[:,:,0].T))
        km = KMeans(n_clusters=self.C)
        km.fit(features)
        self.c = km.labels_.astype(np.int)

        print "Initial c: ", self.c
Example #8
def update_clusters():
    num_reviews = Review.objects.count()
    update_step = ((num_reviews/100)+1) * 5
    if num_reviews % update_step == 0: 
    # Create a sparse matrix from user reviews
        all_usernames = map(lambda x: x.username, User.objects.only("username"))
        all_wine_ids = set(map(lambda x: x.wine.id, Review.objects.only("wine")))
        num_users = len(all_usernames)
        # m is often used to denote a matrix
        ratings_m = dok_matrix((num_users, max(all_wine_ids)+1), dtype=np.float32)
        for i in range(num_users): 
            # each user corresponds to a row, in the order of all_usernames
            user_reviews = Review.objects.filter(user_name=all_usernames[i])
            for user_review in user_reviews:
                ratings_m[i,user_review.wine.id] = user_review.rating

        # Perform kmeans clustering
        k = int(num_users / 10) + 2
        kmeans = KMeans(n_clusters=k)
        clustering = kmeans.fit(ratings_m.tocsr())

        # Update clusters
        Cluster.objects.all().delete()
        new_clusters = {i: Cluster(name=i) for i in range(k)}
        for cluster in new_clusters.values(): # clusters need to be saved before referring to users
            cluster.save()
        for i,cluster_label in enumerate(clustering.labels_):
            new_clusters[cluster_label].users.add(User.objects.get(username=all_usernames[i]))
Example #9
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives same result as fit_predict
    # There's a very small chance of failure with elkan on unstructured dataset
    # because predict method uses fast euclidean distances computation which
    # may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    #     test_predict_equal_labels.  This test has the added effect of
    #     testing idempotence of the fitting procedure which appears to
    #     be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)

        assert_array_equal(labels_1, labels_2)
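
The equivalence this test checks can also be demonstrated with a short standalone snippet; the blob parameters below are arbitrary.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=5, random_state=42)

# fit().predict() on the same data should match fit_predict()
km = KMeans(n_clusters=5, random_state=42)
labels_a = km.fit(X).predict(X)
labels_b = KMeans(n_clusters=5, random_state=42).fit_predict(X)

assert np.array_equal(labels_a, labels_b)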
Example #10
def kmeans_clustering(matrix, N):
    km = KMeans(n_clusters=N, n_jobs=-1)
    clusters = km.fit_predict(matrix)
    res = [[] for _ in range(N) ]
    for i, c in enumerate(clusters):
        res[c].append(i)
    return res
 def treeGenerator(self, rootLabel, points,names):
     # rootLabel is label of root
     # points is list of Feature Vectors
     # names is the name of the image corresponding Feature vector is in
     # print rootLabel, len(points)
     if len(points) < self.threshold:
         self.adjancency[rootLabel]=[]
         if rootLabel not in self.leafLabels:
             self.leafLabels.append(rootLabel)
         return
     else:
         localModel = KMeans(n_clusters = self.branches,n_jobs=4)
         localModel.fit(points)
         adj = []
         localTree = {}
         for i in localModel.cluster_centers_:
             self.treeMap[self.nodes]=i
             self.nodeImages[self.nodes]=[] # A map for node and the Images It has
             localTree[tuple(i)]=self.nodes
             adj.append(self.nodes)
             self.nodes = self.nodes + 1
         self.adjancency[rootLabel]=adj
         localClusterPoints = [[] for i in range(self.branches)]
         localClusterImgNames = [[] for i in range(self.branches)]
         # A local array to store which FV is in which cluster
         for i in range(len(points)):
             localClusterPoints[localModel.labels_[i]].append(points[i])
             localClusterImgNames[localModel.labels_[i]].append(names[i])
             if names[i] not in self.nodeImages[localTree[tuple(localModel.cluster_centers_[localModel.labels_[i]])]]:
                  self.nodeImages[localTree[tuple(localModel.cluster_centers_[localModel.labels_[i]])]].append(names[i])
         for i in range(self.branches):
             thisClusterCenter = tuple(localModel.cluster_centers_[i])
             self.treeGenerator(localTree[thisClusterCenter],localClusterPoints[i],localClusterImgNames[i])
Example #12
def make_tsne_plot(model, rel_wds, plot_lims, title):

    dim = 30
    X, keys = make_data_matrix(model)

    # first we actually do PCA to reduce the
    # dimensionality to make tSNE easier to calculate
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=dim)
    X = sklearn_pca.fit_transform(X_std)[:,:dim]

    # do downsample
    k = 5000
    sample = []
    important_words = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                np.random.choice(len(keys), k-10, replace = False),
             ))
    X = X[sample,:]
    keys = [keys[i] for i in sample]



    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)
Example #13
def showClustering(data):
    kmeans = KMeans()
    kmeans.fit(data)
    labels = kmeans.labels_
    uniqueLabels = numpy.unique(labels)
    nCluster = len(uniqueLabels)
    centers = kmeans.cluster_centers_
    import matplotlib.pyplot as plt
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    plt.figure(1)
    plt.clf()
    for center in centers:
        print center
    for k,col in zip(range(nCluster),colors):
        members = labels == k
        print "plotting %dth cluster" % k
        print "label type" ,labels, type(labels)
        print "members are:", members, type(members)
        print "data[members,0]",data[members,0],type(data[members,0])
        center = centers[k]
        plt.plot(data[members,0],data[members,1],col +'.')
        plt.plot(center[0],center[1],'o',markerfacecolor=col,
                 markeredgecolor = 'k',markersize = 14)
    plt.title("clusters")
    plt.show()
def kmeans_cluster(G, graph_name, num_clusters):
    subgraphs = []
    #Find a way to determine the number of clusters automatically
    write_directory = os.path.join(Constants.KMEANS_PATH,graph_name)
    if not os.path.exists(write_directory):
        os.makedirs(write_directory)
    nodeList = G.nodes()
    matrix_data = nx.to_numpy_matrix(G, nodelist = nodeList)
    kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)   
    kmeans.fit(matrix_data)
    label = kmeans.labels_
    clusters = {}
    
    for nodeIndex, nodeLabel in enumerate(label):
        if nodeLabel not in clusters:
            clusters[nodeLabel] = []
        clusters[nodeLabel].append(nodeList[nodeIndex])
        
    #countNodes is used to test whether we have all the nodes in the clusters 
    countNodes = 0    
    for clusterIndex, subGraphNodes in enumerate(clusters.keys()):
        subgraph = G.subgraph(clusters[subGraphNodes])
        subgraphs.append(subgraph)
        nx.write_gexf(subgraph, os.path.join(write_directory,graph_name+str(clusterIndex)+Constants.GEXF_FORMAT))
        #countNodes = countNodes + len(clusters[subGraphNodes])
        pass
    return num_clusters
Example #15
File: clustering.py Project: SwoJa/ruman
def kmeans(content_list):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize, \
    lowercase=False)
    '''
    tokenizer: specifies the tokenization function
    lowercase: whether to lowercase all text before tokenizing; since this
    handles Chinese text, it is best left False
    '''
    tfidf_matrix = tfidf_vectorizer.fit_transform(content_list)
    num_clusters = 20
    km_cluster = KMeans(n_clusters=num_clusters, max_iter=300, n_init=8, \
                        init='k-means++',n_jobs=8)
    '''
    n_clusters: the value of K
    max_iter: maximum number of iterations for a single initialization
    n_init: number of times the initial centers are re-chosen
    init: the algorithm used to pick the initial centers
    n_jobs: number of processes; -1 means use all available CPUs
    Note that the computation for a single initialization always runs in a
    single process; parallelism only applies across different initializations.
    For example, with n_init=10 and n_jobs=40 on a server with 20 CPUs that
    could run 40 processes, only 10 processes are actually started.
    '''
    # return the cluster index assigned to each document
    result = km_cluster.fit_predict(tfidf_matrix)
    print "Predicting result: ", result
    return result
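
For readers without the Chinese tokenizer used above, a rough English-only sketch of the same TF-IDF plus KMeans flow could look like this; the toy documents and the choice of two clusters are made up for illustration.

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "the cat sat on the mat",
    "dogs and cats are common pets",
    "stock prices fell sharply today",
    "the market rallied after the earnings report",
]

tfidf_matrix = TfidfVectorizer().fit_transform(docs)
labels = KMeans(n_clusters=2, n_init=8, random_state=0).fit_predict(tfidf_matrix)
print(labels)   # one cluster index per document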
Example #16
def match_line_cluster(gdf1, gdf2):
    """
    Try to match two layers of linestrings with KMeans cluster analysis based
    on a triplet of descriptive attributes:
    (centroid coords., rounded length, approximate bearing)

    Parameters
    ----------
    gdf1: GeoDataFrame
        The reference dataset.
    gdf2: GeoDataFrame
        The collection of LineStrings to match.

    Returns
    -------
    matching_table: pandas.Series
        A table (index-based on *gdf1*) containing the id of the matching
        feature found in *gdf2*.
    """
    param1, param2 = list(map(mparams, [gdf1, gdf2]))
    k_means = KMeans(init='k-means++', n_clusters=len(gdf1),
                     n_init=10, max_iter=1000)
    k_means.fit(np.array((param1+param2)))
    df1 = pd.Series(k_means.labels_[:len(gdf1)])
    df2 = pd.Series(k_means.labels_[len(gdf1):])
#    gdf1['fid_layer2'] = \
#        df1.apply(lambda x: df2.where(gdf2['key'] == x).notnull().nonzero()[0][0])
    return pd.DataFrame(
        index=list(range(len(gdf1))),
        data=df1.apply(
            lambda x: df2.where(df2 == x).notnull().nonzero())
        )
Example #17
 def pca_k_means(self):
     if not self.pca_reduced:
         self.pc_analysis()
     kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)
     kmeans.fit(self.pca_reduced, self.player_value)
     h = .02
     x_min, x_max = self.pca_reduced[:, 0].min() - 1, self.pca_reduced[:, 0].max() + 1
     y_min, y_max = self.pca_reduced[:, 1].min() - 1, self.pca_reduced[:, 1].max() + 1
     xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
     Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
     Z = Z.reshape(xx.shape)
     plt.figure(1)
     plt.clf()
     plt.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                cmap=plt.cm.Paired, aspect='auto', origin='lower')
     plt.plot(self.pca_reduced[:, 0], self.pca_reduced[:, 1], 'k.', markersize=2)
     centroids = kmeans.cluster_centers_
     labels = self.pca_labels = kmeans.labels_
     inertia = kmeans.inertia_
     plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10)
     plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
               'Centroids are marked with white cross')
     plt.xlim(x_min, x_max)
     plt.ylim(y_min, y_max)
     plt.xticks(())
     plt.yticks(())
     return {'plt': plt, 'centroids': centroids, 'labels': labels, 'inertia': inertia}
Example #18
def Corpus_K_Means(TestSample,num_topic): 
    Theta = TestSample.Theta
    ThetaPredict = np.zeros(Theta.shape)
    
    W = TestSample.Word
    W = np.array(W,dtype='double')
    
    estimators = KMeans(n_clusters=num_topic,n_init=5)
    estimators.fit(W)
    BetaPredict=estimators.cluster_centers_

    Q = 2*BetaPredict.dot(BetaPredict.transpose())
    Q = matrix(Q)
    P = W.dot(BetaPredict.transpose())
    G = -np.eye(num_topic)
    G = matrix(G)
    h = np.zeros([num_topic,1])
    h = matrix(h)
    A = np.ones([1,num_topic])
    A = matrix(A)
    b = matrix(1.0)
    
    solvers.options['show_progress'] = False
    
    for i in range(num_topic):
        p = matrix(P[[i],:].transpose())
        sol=solvers.qp(Q, p, G, h, A, b)
        ThetaPredict[:,[i]] = np.array(sol['x'])
        
    Err = ThetaPredict - Theta

    return np.square(np.linalg.norm(Err))
class AdvancedModel():
    
    clusters = []
    
    # price class regression
    price_reg = LinearRegression()
        
    def fit(self, X_train, y_train, n_clusters=4):
        y_train_mat = np.array(y_train).reshape((-1,1))
        
        # 1. determine clusters
        self.km = KMeans(n_clusters=5)
        self.km.fit(y_train_mat)
        clusters = self.km.cluster_centers_
        cluster_indices = self.km.predict(y_train_mat)
        print(clusters)
        
        # 2. fit naive bayes
        #self.nb.fit(X_train, ...)
        #self
        
        # 3. train regression model
        #price_reg.fit
        
    def predict(self, X):
        pass
        
    def get_weights(self):
        return np.append(self.price_reg.coef_, [self.price_reg.intercept_])
        
    def set_weights(self, w):
        self.price_reg.coef_ = w[:-1]
        self.price_reg.intercept_ = w[-1]
        
Example #20
def cluster(dat):
	kmean=KMeans(init='k-means++', n_clusters=numclusters, n_init=10)
	y=kmean.fit_predict(dat)
	partition=[[] for i in range(numclusters)]
	for i in range(len(dat)):
		partition[y[i]].append(dat[i])
	return [partition,kmean]
def iris_h2o_vs_sciKmeans(ip,port):
  # Connect to a pre-existing cluster
  h2o.init(ip,port)  # connect to localhost:54321

  iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
  iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
  iris_sci = iris_sci[:,0:4]

  s =[[4.9,3.0,1.4,0.2],
  [5.6,2.5,3.9,1.1],
  [6.5,3.0,5.2,2.0]]

  start = h2o.H2OFrame(s)
  start_key = start.send_frame()

  h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

  sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
  sci_km.fit(iris_sci)

  # Log.info("Cluster centers from H2O:")
  print "Cluster centers from H2O:"
  h2o_centers = h2o_km.centers()
  print h2o_centers

  # Log.info("Cluster centers from scikit:")
  print "Cluster centers from scikit:"
  sci_centers = sci_km.cluster_centers_.tolist()
  print sci_centers

  for hcenter, scenter in zip(h2o_centers, sci_centers):
    for hpoint, spoint in zip(hcenter,scenter):
      assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
Example #22
def main():
	songIds = open("songIDsofFirst100Users.txt","r")
	try:
		for line in songIds:
			songIDsToCluster.append(int(line))
	finally:
		songIds.close()
	print len(songIDsToCluster)

	f= sio.loadmat('/home/dmitriy/workspace/MLFinalProject/MatlabFiles/finalVectors.mat')

	full = np.nan_to_num(np.matrix(f['finalVectors']))
	# fullSplit = np.array_split(full, 360)

	# print("Done Reading")
	# mtx = fullSplit[0]
	# print(len(mtx))
	mtx = full[songIDsToCluster]
	mtx /= np.max(np.abs(mtx),axis=0)
	for clusters in range(25,50):
		errors = 0
		num_clusters = clusters
		ClusteringKmeans = KMeans(n_clusters=num_clusters)
		ClusteringKmeans.fit(mtx)
		result = ClusteringKmeans.labels_
		#silhouette = metrics.silhouette_score(mtx,result,metric='euclidean')
		#plot(mtx,result)
		writeSongIDandClusterToFile(result,clusters)
		print("Clusters:", clusters, "Retest Error:", errors)
Example #23
def run_kmeans(gene_folder, n_clusters):
	pars, fitness = load_all_generations_as_DataFrame(gene_folder)
	kmeans = KMeans(n_clusters=n_clusters)
	kmeans.fit(pars)
	means = map(lambda c: fitness[kmeans.labels_ == c].mean()['longest_interval_within_margin'], range(n_clusters))
	stds = map(lambda c: fitness[kmeans.labels_ == c].std()['longest_interval_within_margin'], range(n_clusters))
	return kmeans, means, stds
Example #24
def perform_cluster_analysis(dataset):

    filename = 'elbow_plot.dat'

    if os.path.exists(cpath + filename):
        data = joblib.load(cpath + filename)
        K = data[0]
        meandistortions = data[1]
    else:
        X = dataset
        print 'X Shape: ', X.shape

        #K = range(1, 50, 5)
        K = [1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
        #K = [1, 2, 5, 10, 50, 100]
        meandistortions = []
        cluster_centers = []
        for k in K:
            print k
            kmeans = KMeans(n_clusters=k, n_jobs=3)
            kmeans.fit(X)
            #import ipdb; ipdb.set_trace() # debugging code
            #meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/X.shape[0])
            meandistortions.append(kmeans.inertia_)
            cluster_centers.append(kmeans.cluster_centers_)
            #print 'k: ', k, ' Cluster Centers: ', kmeans.cluster_centers_
        data = [K, meandistortions]
        joblib.dump(data, cpath + filename, compress=8)

    plot_name = "elbow_plot.png"
    title = 'Selecting k with the Elbow Method'
    xlabel = 'Number of Clusters (k)'
    ylabel = 'Average Distortion'
    xyplot(K, meandistortions, 0, 0, 0, 0, title, xlabel, ylabel, staticpath + plot_name, line=1, y_log=0)
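
A compact version of the elbow search above, using inertia_ directly and skipping the caching and custom plotting helpers; the dataset and candidate k values are placeholders.

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=1000, centers=6, random_state=0)

ks = list(range(1, 11))
inertias = [KMeans(n_clusters=k, random_state=0).fit(X).inertia_ for k in ks]

plt.plot(ks, inertias, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Selecting k with the Elbow Method')
plt.show()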
Example #25
File: Xu.py Project: allinox/kemlglearn
    def fit(self, X):
        """

        :param X:
        :return:
        """

        lcl = range(1, self._maxc+1)


        # compute the fractal dimension
        ldistorsion = []
        for i in range(1, self._maxc+1):
            cluster = KMeans(n_clusters=i, n_jobs=-1)
            cluster.fit(X)
            ldistorsion.append(within_scatter_matrix_score(X, cluster.labels_))

        print(X.shape[1])
        print(ldistorsion)

        PCF = []
        for x,y in zip(ldistorsion, lcl):
            print(x,y, np.power(y, 2.0/X.shape[1]))
            PCF.append(x * np.power(y, 2.0/X.shape[1]))

        print(PCF)

        self._M = np.argmin(PCF)
        print(self._M)
Example #26
def reduce_colors(image, n_clusters):

	image = img_as_float(image)
	height = len(image)
	width = len(image[0])
	image = image.reshape((height*width,3))

	image_mean = {}
	image_median = {}

	kmeans = KMeans(n_clusters = n_clusters, init='k-means++', random_state=241)
	classes = kmeans.fit_predict(image)

	means, medians = [], []
	for cl in range(n_clusters):
		means.append( np.mean(image[classes == cl], axis = 0))
		medians.append( np.median(image[classes == cl], axis = 0))
	
	image_mean = image.copy().astype(float)
	image_median = image.copy().astype(float)
	for cl in range(n_clusters):
		image_mean[classes == cl] = means[cl]
		image_median[classes == cl] = medians[cl]

	logging.info('Clusters: %s, PSNR(mean): %s, PSRN(median): %s'%(n_clusters, PSNR(image, image_mean), PSNR(image, image_median)))

	image_mean = image_mean.reshape(height,width,3)

	string_image = StringIO()
	plt.imsave(string_image, image_mean)

	return string_image
Example #27
    def partition_FOV_KMeans(self,tradeoff_weight=.5,fx=.25,fy=.25,n_clusters=4,max_iter=500):
        """
        Partition the FOV into clusters that group pixels that are close in space and mutually correlated

        Parameters
        ------------------------------
        tradeoff_weight: between 0 and 1, weights the contributions of distance and correlation in the overall metric
        fx,fy: downsampling factor to apply to the movie
        n_clusters,max_iter: KMeans algorithm parameters

        Outputs
        -------------------------------
        fovs: 2D array encoding the partition of the FOV
        mcoef: matrix of pairwise correlation coefficients
        distanceMatrix: matrix of pixel distances

        Example

        """

        _,h1,w1=self.shape
        self.resize(fx,fy)
        T,h,w=self.shape
        Y=np.reshape(self,(T,h*w))
        mcoef=np.corrcoef(Y.T)
        idxA,idxB =  np.meshgrid(list(range(w)),list(range(h)));
        coordmat=np.vstack((idxA.flatten(),idxB.flatten()))
        distanceMatrix=euclidean_distances(coordmat.T);
        distanceMatrix=old_div(distanceMatrix,np.max(distanceMatrix))
        estim=KMeans(n_clusters=n_clusters,max_iter=max_iter);
        kk=estim.fit(tradeoff_weight*mcoef-(1-tradeoff_weight)*distanceMatrix)
        labs=kk.labels_
        fovs=np.reshape(labs,(h,w))
        fovs=cv2.resize(np.uint8(fovs),(w1,h1),old_div(1.,fx),old_div(1.,fy),interpolation=cv2.INTER_NEAREST)
        return np.uint8(fovs), mcoef, distanceMatrix
Example #28
def findColor(frame):
    t = time()
    # dim = np.array(frame.size)/2
    # frame.thumbnail(dim, Image.ANTIALIAS)
    # print "Thumbnail in %0.3f seconds." % (time() - t)
    # t = time()
    points = imresize(np.array(frame, dtype=np.float64), 0.3)
    w,h,d = points.shape
    data = np.reshape(points, (w*h, d))
    sample = shuffle(data, random_state=0)[:len(data)/3]
    print "Reshape and shuffle in %0.3f seconds." % (time() - t)
    t = time()
    kmeans = KMeans(n_clusters=k_colors, n_jobs=jobs).fit(sample)
    labels = kmeans.predict(data)
    print "Fit and predict in %0.3f seconds." % (time() - t)
    t = time()
    colors = [map(int, color) for color in kmeans.cluster_centers_]
    # hsvs = np.array([rgb_to_hsv(*values) for values in colors])
    # frequent = np.argmax(hsvs[:,1])
    # frequent = colors[frequent]
    print "Found in %0.3f seconds." % (time() - t)
    frequents = defaultdict(int)
    for l in labels:
        frequents[l] += 1
    frequents = sorted(frequents.items(), key=lambda x:x[1], reverse=True)
    frequents = [colors[i[0]] for i in frequents[:3]]
    # print "Counted in %0.3f seconds." % (time() - t)
    # print "Top 3 colors [RGB]: ", frequents[:3]
    return frequents[2] if len(frequents) == 3 else frequents[0]
def makecluster():

	n_points=6
	n_dim=2
	n_clusters=6
	model=KMeans(init='k-means++',n_clusters=4,n_init=10)
   
	data=np.zeros((16,2))
	#print data
	data1=np.array(temp)
	data[0:4,:]=2
	data[4:8,:]=1
	data[8:12,:]=-1
	data[12:16,:]=-2
	data[(0,4,8,12),1]=2
	data[(1,5,9,13),1]=1
	data[(2,6,10,14),1]=-1
	data[(3,7,11,15),1]=-2

	#data[3,1]=2
	#data[4,1]=3
	#data[5,1]=2
	#data[0,1]=3

	model.fit(data1)
	print data1
	print model.labels_
Example #30
def create_fiveline(image):
    edges = cv2.Canny(image, 50, 150, apertureSize=3)

    ys = list()
    minLineLength = 1
    maxLineGap = 10

    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 70, minLineLength=minLineLength, maxLineGap=maxLineGap)
    
    for line in lines:
        for x1, y1, x2, y2 in line:
            cv2.line(image, (x1,y1), (x2,y2), (0, 255, 0), 2)
            if abs(y1 - y2) < 4:
                innerlist = list()
                innerlist.append((y1 + y2) / 2)
                ys.append(innerlist)
    
    cv2.imwrite('images/houghlines.jpg', image)
    display_image(image)

    kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
    kmeans.fit(np.asarray(ys))

    fiveline = list()
    for innerlist in kmeans.cluster_centers_:
        fiveline.append(innerlist[0])

    fiveline.sort()
    print "K-MEANS centers"
    print fiveline
    return fiveline
Example #31
File: 67.py Project: RandyGen/NLP100
model = KeyedVectors.load_word2vec_format(
    './data/GoogleNews-vectors-negative300.bin.gz',
    binary=True
)

# Collect the country names
countries = set()
with open('data/analogy_data_add.txt', 'r') as f:
  for line in f:
    line = line.split()
    if line[0] in ['capital-common-countries', 'capital-world']:
      countries.add(line[2])
    elif line[0] in ['currency', 'gram6-nationality-adjective']:
      countries.add(line[1])
countries = list(countries)

# Get the word vectors for the countries
countries_vec = [model[country] for country in countries]

from sklearn.cluster import KMeans
import numpy as np

# k-means clustering
kmeans = KMeans(n_clusters=5)
kmeans.fit(countries_vec)
for i in range(5):
    cluster = np.where(kmeans.labels_ == i)[0]
    print('cluster', i)
    print(', '.join([countries[k] for k in cluster]))
    
Example #32
def k_means(feature_matrix, num_clusters=10):
    km = KMeans(n_clusters=num_clusters, max_iter=10000)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters
Example #33

plt.figure(figsize=(20,20))
for index, (image, label) in enumerate(zip(train_images[0:100], clusterAssignement[0:100])):
    plt.subplot(5, 20, index + 1)
    plt.axis("off")
    plt.imshow(np.reshape(image, (28,28)), cmap=plt.cm.gray)
    plt.title(label, fontsize = 20)
    plt.show()



## comparison to the sklearn algorithm

pca = PCA(n_components=10)
kmeans = KMeans(n_clusters=10,n_init=1)
predictor = Pipeline([('pca', pca), ('kmeans', kmeans)])
predict = predictor.fit(test_images).predict(test_images)

acc = 0
for i in range(len(predict)):
    acc += predict[i] == test_labels[i]
print("accuracy = ", acc/len(predict))


plt.figure(figsize=(20,20))
for index, (image, label) in enumerate(zip(train_images[0:100], predict[0:100])):
    plt.subplot(5, 20, index + 1)
    plt.axis("off")
    plt.imshow(np.reshape(image, (28,28)), cmap=plt.cm.gray)
    plt.title(label, fontsize = 20)
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [((e_Values[i]), e_Vectors[:,i]) for i in range(len(e_Values))]


# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort(key=lambda x: x[0], reverse=True)
#projection matrix
matrix_w = np.hstack((eig_pairs[0][1].reshape(13,1), 
                      eig_pairs[1][1].reshape(13,1)
                    ))


#kmeans clustering
startingpoint = np.vstack((XTrain[0,],XTrain[1,]))

kmeans_model = KMeans(algorithm='full', copy_x=True, init=startingpoint,max_iter=300,\
                      n_clusters=2, n_init=1).fit(data_scaled)

#centroid values the algorithm generated

y_predict=kmeans_model.fit_predict(data_scaled)
#print(y_predict)

centroids = kmeans_model.cluster_centers_
print("The Centroids:",centroids)




#Projecting the centered data
transformed = np.dot(data_scaled,matrix_w) 
print(transformed) 
Example #35
model.add(Conv2D(16, kernel_size=(3,3), activation='relu', padding='same'))
model.add(MaxPooling2D(pool_size=(2,2),padding='same'))
model.add(Conv2D(8, kernel_size=(3,3), activation='relu', padding='same'))
model.add(MaxPooling2D(pool_size=(2,2),padding='same'))
# decoder part : (conv + relu + upsampling) x 3
model.add(Conv2D(8, kernel_size=(3,3), activation='relu', padding='same'))
model.add(UpSampling2D(size=(2,2)))
model.add(Conv2D(16, kernel_size=(3,3), activation='relu', padding='same'))
model.add(UpSampling2D(size=(2,2)))
model.add(Conv2D(32, kernel_size=(3,3), activation='relu', padding='same'))
model.add(UpSampling2D(size=(2,2)))
model.add(Conv2D(3, kernel_size=(3,3), activation='sigmoid', padding='same'))
# compile and train the model
model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])
model.fit(X, X, epochs=10, batch_size=5, shuffle=True, verbose=1)

#---------- 2. Retrieve encoded image and classify pathways ----------
get_encoded_layer = backend.function([model.layers[0].input],[model.layers[5].output])
encoded_layer = get_encoded_layer([X])[0]
X_encoded = encoded_layer.reshape(encoded_layer.shape[0], -1)
km = KMeans(n_clusters)
km.fit(X_encoded)

#---------- 3. Print percentage of each path and corresponding example image ----------
X_clustered = km.labels_
N = float(len(X_clustered))
paths, counts = np.unique(X_clustered, return_counts=True)
print "---Output---"
for each_path, each_count in zip(paths, counts):
    idx = np.where(X_clustered==each_path)[0][0]
    print "path%d (%.2f) %s"%(each_path+1, each_count/N, img_list[idx])
Example #36
    # 'key',
    # 'loudness',
    # 'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    # 'liveness',
    'valence',
    # 'tempo',
    # 'time_signature'
]

data = np.array([[track[k] for k in feature_keys] for track in features])
std_data = StandardScaler().fit_transform(data)

clustering = KMeans(n_clusters=N_CLUSTERS, random_state=123)
clustering.fit(std_data)
cluster_labels = clustering.labels_

tsne = TSNE(n_components=2, random_state=123)
reduced = tsne.fit_transform(std_data)

df = pd.DataFrame(data)
df.columns = feature_keys
df['x'] = reduced[:, 0]
df['y'] = reduced[:, 1]
df['added_by'] = display_names
df['cluster'] = cluster_labels
df['name'] = [track['track']['name'] for track in tracks]
df['artists'] = [', '.join(artist['name'] for artist in track['track']['artists']) for track in tracks]
df['id'] = [track['track']['id'] for track in tracks]
Example #37
            float(v[23]),
            float(v[25]),
            float(v[27]),
            float(v[28])
        ]
    iris_target.append(stateCode[str(v[4]).strip()])
    iris_data.append(line_data)
labels_true = np.array(iris_target)
data = np.array(iris_data)
n_sample = len(data)
X = data[:, :2]
# Incorrect number of clusters
# #############################################################################
# Compute clustering with Means

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0

# #############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++',
                      n_clusters=2,
                      batch_size=batch_size,
                      n_init=10,
                      max_no_improvement=10,
                      verbose=0)
t0 = time.time()
mbk.fit(X)
Example #38
# -*- coding: utf-8 -*-
"""
Created on Thu Jun  6 15:36:36 2019

@author: KIIT
"""

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
df=pd.read_csv('crime_data.csv')
features=df.iloc[:,[1,2,4]].values
pca=PCA(n_components=2)
features=pca.fit_transform(features)
kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 0)
pred_cluster = kmeans.fit_predict(features)
plt.scatter(features[pred_cluster == 0, 0], features[pred_cluster == 0, 1], c = 'blue', label = 'LowCrime')
plt.scatter(features[pred_cluster == 1, 0], features[pred_cluster == 1, 1], c = 'red', label = 'MedCrime')
plt.scatter(features[pred_cluster == 2, 0], features[pred_cluster == 2, 1], c = 'green', label = 'HighCrime')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c = 'yellow', label = 'Centroids')
plt.title('Crime Data')
plt.xlabel('P1 Features')
plt.ylabel('P2 Features')
plt.legend()
plt.show()




Example #39
def kmeanspp(X, k):
    kmeans = KMeans(n_clusters=k, max_iter=1, init='k-means++', n_init=1).fit(X)
    return kmeans.cluster_centers_
digits_test = pandas.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes',
                               header=None)

X_train = digits_train[numpy.arange(64)]
y_train = digits_train[64]

print(X_train)
print(y_train)

print('-----------------------------------------')

X_test = digits_test[numpy.arange(64)]
y_test = digits_test[64]

# Initialize the KMeans model with 10 cluster centers
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_train)

# Assign each test image to its cluster center
y_predict = kmeans.predict(X_test)
# 0.6592893679369013,  0.6621773801044615
print(metrics.adjusted_rand_score(y_test, y_predict))


plt.subplot(3, 2, 1)

x1 = numpy.array([1, 2, 3, 1, 5, 6, 5, 5, 6, 7, 8, 9, 7, 9])
x2 = numpy.array([1, 3, 2, 2, 8, 6, 7, 6, 7, 1, 2, 1, 1, 3])

print(x1)
print(x2)
Example #41
# display first 5 rows
matrix.head()

# Code ends here

# --------------
# import packages
from sklearn.cluster import KMeans

# Code starts here

# initialize KMeans object
cluster = KMeans(n_clusters=5,
                 init='k-means++',
                 max_iter=300,
                 n_init=10,
                 random_state=0)

# create 'cluster' column
matrix['cluster'] = cluster.fit_predict(matrix[matrix.columns[1:]])
matrix.head()
# Code ends here

# --------------
# import packages
from sklearn.decomposition import PCA

# Code starts here

# initialize pca object with 2 components
Example #42
def forgy(X, k):
    kmeans = KMeans(n_clusters=k, max_iter=1, init='random', n_init=1).fit(X)
    return kmeans.cluster_centers_
    # retro_clustering_algo = AgglomerativeClustering(n_clusters=args.clusters, connectivity=proximity)
    # retro_cluster_ids = clustering_algo.fit_predict(X=new_vectors)

# do agglomerative clustering with structure
print('agglomerative clustering', file=sys.stderr, )
clustering_algo = AgglomerativeClustering(n_clusters=args.clusters, connectivity=proximity, affinity=args.affinity, linkage=args.linkage)
cluster_ids = clustering_algo.fit_predict(X=vectors)
color_names = [cluster_colors[c] for i, c in enumerate(cluster_ids) if locations[eligible_cities[i]][-2] == "DE"]
cMap = colors.ListedColormap(cluster_colors)
print('done', file=sys.stderr, )

# do kmeans clustering
print('kmeans clustering', file=sys.stderr, )
dumb_cluster_ids = []
for x in range(KMEANS_AVG):
    dumb_cluster = KMeans(n_jobs=-1, n_clusters=args.clusters)
    dumb_cluster_ids.append(dumb_cluster.fit_predict(vectors))
    # dumb_cluster_ids = dumb_cluster.fit_predict(X_train_tfidf)
print('done', file=sys.stderr, )


if args.show_nuts:
    NUTS_shape_file = '/Users/dirkhovy/Dropbox/working/lowlands/GeoStats/data/nuts/NUTS_RG_03M_2010.shp'
    print("reading country outline from %s" % NUTS_shape_file, end=' ', file=sys.stderr)
    NUTS_shapes = fiona.open(NUTS_shape_file)

    NUTS2_outlines = {}
    NUTS3_outlines = {}
    for item in islice(NUTS_shapes, None):
        nuts_id = None
        if item['properties']['STAT_LEVL_'] == 2:
Example #44
# # Import KMeans
from sklearn.cluster import KMeans

# Create a KMeans instance with 3 clusters: model
model = KMeans(n_clusters=3)

# Fit model to points
model.fit(points)

# Determine the cluster labels of new_points: labels
labels = model.predict(new_points)

# Print cluster labels of new_points
print(labels)



# Import pyplot
import matplotlib.pyplot as plt

# Assign the columns of new_points: xs and ys
xs = new_points[:,0]
ys = new_points[:,1]
# Make a scatter plot of xs and ys, using labels to define the colors
plt.scatter(xs,ys,c=labels,alpha=0.5)

# Assign the cluster centers: centroids
centroids = model.cluster_centers_

# Assign the columns of centroids: centroids_x, centroids_y
centroids_x = centroids[:,0]
Example #45
import numpy as np
from sklearn.cluster import KMeans


def extractFeatures(filename):
    features = []
    features = np.array([features])
    for line in open(filename):
        row = line.split(',')
        features = np.append(features, np.array([float(x) for x in row[0:5]]))
    return np.array(features, dtype=int)


filename1 = "/home/tharindra/PycharmProjects/WorkBench/DataMiningAssignment/LabelingBeforeClustering.csv"
features = extractFeatures(filename1)
features = features.reshape(4149, 5)

kmeans = KMeans(n_clusters=3)
kmeans.fit(features)

centroids = kmeans.cluster_centers_
label = kmeans.labels_

for i in range(len(features)):
    #print("coordinate:",features[i], "label:", label[i])
    print(label[i])
Example #46
#Get Max and Min for exercised_stock_options
dframe = pd.DataFrame.from_dict(data_dict , orient='index')
dframe_filtered = dframe[dframe[feature_1] != "NaN"]
print ("max is %s" % dframe_filtered[feature_1].max())
print ("min is %s" % dframe_filtered[feature_1].min())

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to 
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2, f3 in finance_features:
    plt.scatter( f1, f2, f3 )
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=3, random_state=0)
pred = clf.fit_predict( finance_features )



### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters3points.pdf", f1_name=feature_1, f2_name=feature_2)
except NameError:
    print "no predictions object named pred found, no clusters to plot"

Example #47
def test_using_sklearn(label_true, label_true_test, dataset, datatest):
    X = numpy.array(dataset)
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    cluster_train = kmeans.labels_
    arr_test = numpy.array(datatest)
    cluster_test = kmeans.predict(arr_test)

    # Evaluation for Full Training
    print(
        "\n------------------------ SCIKIT LEARN --------------------------------"
    )
    print(
        "--------------- K-MEANS SCORE USING DATA TRAIN -----------------------"
    )
    print("ARI SCORE: " + str(
        adjusted_rand_score(numpy.array(label_true), numpy.array(
            cluster_train))))
    print("MUTUAL INFO SCORE: " + str(
        adjusted_mutual_info_score(numpy.array(label_true),
                                   numpy.array(cluster_train))))
    print("HOMOGENEITY SCORE: " + str(
        homogeneity_score(numpy.array(label_true), numpy.array(cluster_train)))
          )
    print("COMPLETENESS SCORE: " + str(
        completeness_score(numpy.array(label_true), numpy.array(
            cluster_train))))
    print("V MEASURE SCORE: " + str(
        v_measure_score(numpy.array(label_true), numpy.array(cluster_train))))
    print("FOWLKES-MALLOWS SCORE: " + str(
        fowlkes_mallows_score(numpy.array(label_true),
                              numpy.array(cluster_train))))
    # print("SILHOUETTE SCORE: " + str(silhouette_score(numpy.array(dataset), numpy.array(label_true), metric="euclidean")))
    print("CALINSKI-HARABAZ SCORE: " + str(
        calinski_harabaz_score(numpy.array(dataset), numpy.array(label_true))))

    # Evaluation for Split Validation
    print(
        "--------------- K-MEANS SCORE USING DATA TEST -----------------------"
    )
    print("ARI SCORE: " + str(
        adjusted_rand_score(numpy.array(label_true_test),
                            numpy.array(cluster_test))))
    print("MUTUAL INFO SCORE: " + str(
        adjusted_mutual_info_score(numpy.array(label_true_test),
                                   numpy.array(cluster_test))))
    print("HOMOGENEITY SCORE: " + str(
        homogeneity_score(numpy.array(label_true_test),
                          numpy.array(cluster_test))))
    print("COMPLETENESS SCORE: " + str(
        completeness_score(numpy.array(label_true_test),
                           numpy.array(cluster_test))))
    print("V MEASURE SCORE: " + str(
        v_measure_score(numpy.array(label_true_test), numpy.array(
            cluster_test))))
    print("FOWLKES-MALLOWS SCORE: " + str(
        fowlkes_mallows_score(numpy.array(label_true_test),
                              numpy.array(cluster_test))))
    # print("SILHOUETTE SCORE: " + str(silhouette_score(numpy.array(dataset), numpy.array(label_true_test), metric="euclidean")))
    print("CALINSKI-HARABAZ SCORE: " + str(
        calinski_harabaz_score(numpy.array(datatest),
                               numpy.array(label_true_test))))

    return None
    def _get_masks(self,output, utt_info):
        '''estimate the masks

        Args:
            output: the output of a single utterance of the neural network
                    tensor of dimension [Txfeature_dimension*emb_dim]

        Returns:
            the estimated masks'''

        embeddings = output['bin_emb']
        noise_filter = output['noise_filter']
        #only the non-silence bins will be used for the clustering
        mix_to_mask, _ = self.mix_to_mask_reader(self.pos)

        [T,F] = np.shape(mix_to_mask)
        emb_dim = np.shape(embeddings)[1]/F
        N = T*F
        if np.shape(embeddings)[0] != T:
            raise ValueError('Number of frames in usedbins does not match the sequence length')
        if np.shape(noise_filter)[0] != T:
            raise ValueError('Number of frames in usedbins does not match the sequence length')
        if np.shape(noise_filter)[1] != F:
            raise ValueError('Number of noise filter outputs does not match number of frequency bins')
        #reshape the outputs
        emb_vec = embeddings[:T,:]
        emb_vec_resh = np.reshape(emb_vec,[T*F,emb_dim])

        X_hat_clean = np.multiply(mix_to_mask,noise_filter[:T,:])
        maxbin = np.max(X_hat_clean)
        floor=maxbin/self.usedbin_threshold

        #apply floor to get the used bins
        usedbins=np.greater(X_hat_clean,floor)
        noise_filter_reshape = np.reshape(noise_filter[:T,:],[T*F,1])


        usedbins_resh = np.reshape(usedbins, T*F)

        #Only keep the active bins (above threshold) for clustering
        output_speech_resh = emb_vec_resh[usedbins_resh] # dim:N' x embdim (N' is number of bins that are used N'<N)
        if np.shape(output_speech_resh)[0] < 2:
            print 'insufficient bins with energy'
            return np.zeros([self.nrS,T,F])
        #apply kmeans clustering and assign each bin to a cluster
        kmeans_model=KMeans(n_clusters=self.nrS, init='k-means++', n_init=10, max_iter=100, n_jobs=-1)
        for _ in range(5):
            # Sometimes it fails with an IndexError and I'm not sure why; just retry, at most 5 times
            try:
                kmeans_model.fit(output_speech_resh)
            except IndexError:
              continue
            break

        A = kmeans_model.cluster_centers_ # dim: nrS x embdim


        prod_1 = np.matmul(A,emb_vec_resh.T) # dim: nrS x N
        numerator = np.exp(prod_1-np.max(prod_1,axis=0))
        denominator = np.sum(numerator,axis=0)
        M = numerator/denominator
        M_final = np.multiply(M,np.transpose(noise_filter_reshape))

        #reconstruct the masks from the cluster labels
        masks = np.reshape(M_final,[self.nrS,T,F])
        np.save(os.path.join(self.center_store_dir,utt_info['utt_name']),kmeans_model.cluster_centers_)
        return masks
Example #49
    plt.show()


#data_plot(df)

df[df['Grad.Rate'] > 100]

df['Grad.Rate']['Cazenovia College'] = 100

df[df['Grad.Rate'] > 100]

sns.set_style('darkgrid')
g = sns.FacetGrid(df, hue="Private", palette='coolwarm', size=6, aspect=2)
g = g.map(plt.hist, 'Grad.Rate', bins=20, alpha=0.7)

kmeans = KMeans(n_clusters=2)

kmeans.fit(df.drop('Private', axis=1))

kmeans.cluster_centers_


def converter(cluster):
    if cluster == 'Yes':
        return 1
    else:
        return 0


df['Cluster'] = df['Private'].apply(converter)
Example #50
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
plt.figure(figsize=(12, 12))

n_samples = 1500
random_state = 170
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Incorrect number of clusters
y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(X)
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title("Incorrect Number of Blobs")

# Anisotropicly distributed data
transformation = [[ 0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X_aniso = np.dot(X, transformation)
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_aniso)

plt.subplot(222)
plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
plt.title("Anisotropicly Distributed Blobs")

# Different variance
X_varied, y_varied = make_blobs(n_samples=n_samples,
                                cluster_std=[1.0, 2.5, 0.5],
                                random_state=random_state)
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)
Example #51
 def fit(self, data, n_clusters):
     data = np.array(data)
     data = preprocessing.MinMaxScaler().fit_transform(data)
     model = KMeans(n_clusters=n_clusters)
     clustering = model.fit(data)
     return clustering
Example #52
def document_clustering(year):
    """ Cluster the documents of the year given as a parameter.
        --------------------
        Parameter:
            year: the year of interest
        Return:
            None
    """
    #preprocess(year,year)

    #query_docs didn't work (MemoryError), so I wrote quite similar code below
    #reports = query_docs(2013, 2014)

    #Create list of reports
    reports = []
    #Create list of year directory's reports
    companies = os.listdir('cleaned' + os.sep + str(year))
    #The command above inserted some "DS.store"-string in the beginning, so I remove it
    companies.remove(companies[0])
    #Create list of selected companies
    company = []
    #
    amount_of_files = 100
    for i in range(amount_of_files):
        # Open the report

        with open('cleaned/' + str(year) + '/' + companies[i], 'r') as file:
            data = file.read().replace('\n', '')
        # Append report to the list
        reports.append(data)
        #Append selected company to another list
        company.append(companies[i])

    #tf-idf
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(reports)
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf = transformer.fit_transform(X)

    #K-means clustering
    num_clusters = 5
    km = KMeans(n_clusters=num_clusters,
                init='k-means++',
                max_iter=100,
                n_init=1)
    km.fit(tfidf)
    clusters = km.labels_.tolist()

    idea = {
        'Filename': company,
        'Cluster': clusters
    }  #Creating dict having report's filename with the corresponding cluster number.
    frame = pd.DataFrame(idea,
                         index=[clusters],
                         columns=['Filename', 'Cluster'
                                  ])  # Converting it into a dataframe.

    #Printing the results
    for i in range(num_clusters):
        print("Cluster" + str(i + 1) + ":")
        cluster_i = frame.loc[[i]]
        fra = cluster_i['Filename'].tolist()
        for i in fra:
            print(i)
Example #53
detectors = {'sift': cv2.xfeatures2d.SIFT_create, 'surf': cv2.xfeatures2d.SURF_create, 'orb': cv2.ORB_create}
detector = detectors.get(sys.argv[2])

if sys.argv[1] == 'train':
    k = int(sys.argv[5])
    print('|||||||| First Class ||||||||')
    im_class0 = ImagesManager(sys.argv[3], detector())
    print('|||||||| Second Class ||||||||')
    im_class1 = ImagesManager(sys.argv[4], detector())
    data = []
    for d in im_class0.__get_all_descriptors__():
        data.append(d)
    for d in im_class1.__get_all_descriptors__():
        data.append(d)
    print('|||||||| Kmeans ||||||||')
    kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
    print('|||||||| First Class ||||||||')
    im_class0.__compute_bows__(kmeans, k, True)
    print('|||||||| Second Class ||||||||')
    im_class1.__compute_bows__(kmeans, k, True, sum(nb_d for nb_d in im_class0.number_descriptors))

    logistic = LogisticRegression()
    labels = []
    for i in range(0, len(im_class0.files)):
        labels.append(0)
    for i in range(0, len(im_class1.files)):
        labels.append(1)
    bows = []
    for b in im_class0.__get_bows__():
        bows.append(b)
    for b in im_class1.__get_bows__():
Example #54
test_standardscal = standard_scaler.fit(train).transform(test)

def compute_error(label,predict):
    
    label_list = label.transpose().tolist()
    count = 0
    for i in range(len(predict)):
        if label_list[i] == predict[i]:
            count += 1
    error = 1-((count / len(predict)))
    return '{:.4f} '.format(error)
scaler = MinMaxScaler()
train_scal = scaler.fit(train).transform(train)
train_log = np.log(train_scal)
##kmeans ==================================================
kmeans = KMeans(n_clusters = 2).fit(train_standardscal)                   
pred_kmeans = kmeans.labels_
import collections
collections.Counter(pred_kmeans)
compute_error(label,pred_kmeans)
kmeans_pred = kmeans.predict(test_standardscal)               
#==========================================================                  
train_scale = scale(train)
test_scale = scale(test)

label = train_label[:,0]
   # 'reg_lambda':[1,2,3,4,5],              
cv_params = {'n_estimators':[1300,1100]}
ind_params = { 'seed':0, 
              'subsample':0.7,
              'min_child_weight':3,
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs

attributes, clusters = make_blobs(cluster_std=1)

plt.scatter(attributes[:, 0], attributes[:, 1], c=clusters)
plt.show()

from sklearn.cluster import KMeans

attributes, clusters = make_blobs()

# Better n_init to be large!
k_means = KMeans(3, init="random", n_init=10)
assigned = k_means.fit_predict(attributes)

# Original, generated clusters
plt.scatter(attributes[:, 0], attributes[:, 1], c=clusters)
plt.show()

# Assigned clusters
plt.scatter(attributes[:, 0], attributes[:, 1], c=assigned)
plt.show()

k_means = KMeans(3, init="k-means++")
assigned = k_means.fit_predict(attributes)
assigned = k_means.fit_predict(attributes)

# Original, generated clusters
import numpy as np
import matplotlib.pyplot as plt
# Though the following import is not directly being used, it is required
# for 3D projection to work
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import KMeans
from sklearn import datasets

np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target

estimators = [('k_means_iris_8', KMeans(n_clusters=8)),
              ('k_means_iris_3', KMeans(n_clusters=3)),
              ('k_means_iris_bad_init',
               KMeans(n_clusters=3, n_init=1, init='random'))]

fignum = 1
titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization']
for name, est in estimators:
    fig = plt.figure(fignum, figsize=(4, 3))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    est.fit(X)
    labels = est.labels_

    ax.scatter(X[:, 3],
               X[:, 0],
               X[:, 2],
def call_KM(genre1, genre2, genre3):
    movies = pd.read_csv('mysite/movies.csv')
    ratings = pd.read_csv('mysite/ratings.csv')

    # genre1='Adventure'
    # genre2='Sci-Fi'
    # genre3='Action'
    my_clusters = 0
    helper.set_Variables(genre1, genre2, genre3)

    genre_ratings = helper.get_genre_ratings(ratings, movies, [genre1, genre2],
                                             [Dict[genre1], Dict[genre2]])
    biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5)

    print("Number of records: ", len(biased_dataset))
    biased_dataset.head()
    helper.draw_scatterplot(biased_dataset[Dict[genre2]], Dict[genre2],
                            biased_dataset[Dict[genre1]], Dict[genre1],
                            'mysite/static/mysite/Normal.png')
    # plt.savefig('mysite/static/mysite/Normal.png')
    #
    # plt.close('mysite/static/mysite/Normal.png')

    X = biased_dataset[[Dict[genre2], Dict[genre1]]].values

    # TODO: Create an instance of KMeans to find two clusters
    kmeans_1 = KMeans(n_clusters=2, random_state=0)
    predictions = kmeans_1.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions,
                         'mysite/static/mysite/TwoCluster.png')
    # plt.savefig('mysite/static/mysite/TwoCluster.png')
    # plt.close('TwoCluster.png')

    # TODO: Create an instance of KMeans to find three clusters
    kmeans_2 = KMeans(n_clusters=3, random_state=1)
    predictions_2 = kmeans_2.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions_2,
                         'mysite/static/mysite/ThreeCluster.png')
    # plt.savefig('mysite/static/mysite/ThreeCluster.png')
    # plt.close('ThreeCluster.png')

    # TODO: Create an instance of KMeans to find four clusters
    kmeans_3 = KMeans(n_clusters=4, random_state=3)
    predictions_3 = kmeans_3.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions_3,
                         'mysite/static/mysite/FourCluster.png')
    # plt.savefig('mysite/static/mysite/FourCluster.png')
    # plt.close('FourCluster.png')

    possible_k_values = range(2, len(X) + 1, 5)
    errors_per_k = [helper.clustering_errors(k, X) for k in possible_k_values]
    list(zip(possible_k_values, errors_per_k))
    fig, ax = plt.subplots(figsize=(16, 6))
    ax.set_xlabel('K - number of clusters')
    ax.set_ylabel('Silhouette Score (higher is better)')
    ax.plot(possible_k_values, errors_per_k)
    fig.savefig('mysite/static/mysite/score.png')
    plt.close(fig)

    # Ticks and grid
    xticks = np.arange(min(possible_k_values), max(possible_k_values) + 1, 5.0)
    ax.set_xticks(xticks, minor=False)
    ax.set_xticks(xticks, minor=True)
    ax.xaxis.grid(True, which='both')
    yticks = np.arange(round(min(errors_per_k), 2), max(errors_per_k), .05)
    ax.set_yticks(yticks, minor=False)
    ax.set_yticks(yticks, minor=True)
    ax.yaxis.grid(True, which='both')

    # TODO: Create an instance of KMeans to find seven clusters
    kmeans_4 = KMeans(n_clusters=7, random_state=6)
    predictions_4 = kmeans_4.fit_predict(X)
    helper.draw_clusters(biased_dataset,
                         predictions_4,
                         'mysite/static/mysite/BestCluster.png',
                         cmap='Accent')
    # plt.savefig('mysite/static/mysite/BestCluster.png')
    # plt.close('BestCluster.png')

    biased_dataset_3_genres = helper.get_genre_ratings(
        ratings, movies, [genre1, genre2, genre3],
        [Dict[genre1], Dict[genre2], Dict[genre3]])
    biased_dataset_3_genres = helper.bias_genre_rating_dataset(
        biased_dataset_3_genres, 3.2, 2.5).dropna()
    print("Number of records: ", len(biased_dataset_3_genres))

    X_with_action = biased_dataset_3_genres[[
        Dict[genre2], Dict[genre1], Dict[genre3]
    ]].values

    # TODO: Create an instance of KMeans to find seven clusters
    kmeans_5 = KMeans(n_clusters=7)
    predictions_5 = kmeans_5.fit_predict(X_with_action)
    helper.draw_clusters_3d(biased_dataset_3_genres, predictions_5,
                            'mysite/static/mysite/3DCluster.png')
    # plt.savefig('mysite/static/mysite/3DCluster.png')
    # plt.close('3DCluster.png')

    #Merge the two tables then pivot so we have Users X Movies dataframe
    ratings_title = pd.merge(ratings,
                             movies[['movieId', 'title']],
                             on='movieId')
    user_movie_ratings = pd.pivot_table(ratings_title,
                                        index='userId',
                                        columns='title',
                                        values='rating')
    user_movie_ratings.iloc[:6, :10]
    n_movies = 30
    n_users = 18
    most_rated_movies_users_selection = helper.sort_by_rating_density(
        user_movie_ratings, n_movies, n_users)
    most_rated_movies_users_selection.head()

    helper.draw_movies_heatmap(most_rated_movies_users_selection,
                               'mysite/static/mysite/HeatMap.png')
Example #58
    elif classify == 3:
        cLabel = 'SVM'
        clf = SVC()

    elif classify == 4:
        cLabel = 'Linear Discriminant Analysis'
        clf = LDA()

    elif classify == 5:
        cLabel = 'Random Forest Classifier'
        clf = RandomForestClassifier(n_estimators=5)
        #SVR(C = 1.0, epsilon=0.2)

    elif classify == 6:
        cLabel = 'K-means clustering'
        clf = KMeans(n_clusters=512, init='random')

    t0 = time.time()
    clf.fit(train_instances, train_labels)
    t1 = time.time()
    nd = len(use_idx)

    # prediction on training and test data
    accuracyTr, dev_acc_train, predicted_labels_binary_train = deviceErrors(
        clf, nd, train_instances, train_labels, train_labels_binary)
    accuracyTs, dev_acc_test, predicted_labels_binary_test = deviceErrors(
        clf, nd, test_instances, test_labels, test_labels_binary)

    # prediction of device energy consumption
    agg_energy_train = train_instances[:, 5]
    actEnergy_train = actDevEnergy(device_power, device_timer, nd)
Example #59
from sklearn.cluster import KMeans
import cPickle as pickle
from time import time
import numpy as np

if __name__ == "__main__":

    with open("../lesson10/dataset.pickle", "rb") as f:
        X = np.load(f)
        print "shape of dataset:", X.shape

    km = KMeans(init='k-means++', n_clusters=500, verbose=1)
    t0 = time()
    km.fit(X)
    print "done in %0.3fs" % (time() - t0)

    with open("km.pickle", "wb") as f:
        pickle.dump(km, f, pickle.HIGHEST_PROTOCOL)
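
If the pickled model from this example is reloaded later, scoring new data might look like the sketch below; the file name is taken from the snippet above, while the query array is an assumption.

import pickle
import numpy as np

with open("km.pickle", "rb") as f:
    km = pickle.load(f)

# assign some new points to the learned clusters
new_points = np.random.rand(10, km.cluster_centers_.shape[1])
print(km.predict(new_points))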
Example #60
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import confusion_matrix

#load data and assign headers
iris = load_iris()
x = pd.DataFrame(iris.data)
x.columns = ['Sepal-Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']

#fit kmeans model
kmeans = KMeans(n_clusters=3)
kmodel = kmeans.fit(x)

#fit em model
gmm = GaussianMixture(n_components=3)
gmm.fit(x)
gmm_labels = gmm.predict(x)

#print confusion matrices for both classifications
print("Kmeans algorithm:\n ", confusion_matrix(iris.target, kmodel.labels_))
print("\nEM algorithm:\n ", confusion_matrix(iris.target, gmm_labels))

#print scatter plots for iris target clusters and kmeans-em classifications
colormap = np.array(['red', 'blue', 'green'])
plt.subplot(2, 2, 1)
plt.scatter(x.Petal_Length, x.Petal_Width, c=colormap[iris.target], s=40)