def semesters_cluster(self, **kwargs):
     start_year = self._ha_df[self.year_attr].values.min()
     end_year = self._ha_df[self.year_attr].values.max()
     self.se_df = self.semesters_features
     self.se_df = self.se_df[ (self.se_df[self.year_attr].values >= start_year) & (self.se_df[self.year_attr].values <= end_year) ]
     _h_program = hash( self._program )
     # Fill missing values, build the feature matrix, and cluster the semesters.
     self.se_df = self.se_df.fillna(0)
     data = self.se_df[self.SEMESTERS_F_LABELS].values
     if not kwargs:
          C = self._C_k
          try:
              # Reuse previously saved cluster centers when they exist.
              c_init = genfromtxt('./data/%s/centers_%d.csv' % (self.source, _h_program), delimiter=',')
              km = kmeans(init=c_init, n_clusters=C, n_init=10)
              km.fit(data)
          except OSError:
              # No saved centers yet: fit from scratch and persist the centers.
              km = kmeans(init='k-means++', n_clusters=C, n_init=10)
              km.fit(data)
              savetxt('./data/%s/centers_%d.csv' % (self.source, _h_program), km.cluster_centers_, delimiter=',')
     else:
          km = kmeans(**kwargs)
          km.fit(data)
     cntr, L = km.cluster_centers_, km.labels_
     #print 'L',L
     #self.semesters_features['km_cluster_ID'] = L
     self.se_df['km_cluster_ID'] = L
     self.cntr_se = cntr
Example #2
	def compute_centers(self, X, layers=1):
		#use kmeans to compute centroids
		k1, k2 = self.l1_size, self.l2_size
		kmns = kmeans(n_clusters=k1, compute_labels=False,
						n_init=3, max_iter=200)
		kmns.fit(X)
		if layers==1: return kmns.cluster_centers_
		#handle only two layers right now
		Xc = kmns.transform(X)
		#cluster in transformed space
		kmns2 = kmeans(n_clusters=k2, compute_labels=False,
						n_init=3, max_iter=200)
		kmns2.fit(Xc)
		return (kmns.cluster_centers_, kmns2.cluster_centers_)
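A minimal self-contained sketch of the same two-layer idea, assuming kmeans above is sklearn.cluster.MiniBatchKMeans (the compute_labels argument matches that estimator); the toy data is illustrative:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(200, 5)
layer1 = MiniBatchKMeans(n_clusters=8, n_init=3, max_iter=200).fit(X)
Xc = layer1.transform(X)  # distances of each sample to the 8 layer-1 centers
layer2 = MiniBatchKMeans(n_clusters=4, n_init=3, max_iter=200).fit(Xc)
print(layer1.cluster_centers_.shape, layer2.cluster_centers_.shape)  # (8, 5) (4, 8)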
Example #3
def calculate_wss(points, kmax):
  """Calculates wss."""

  # For the Elbow Method, compute the Within-Cluster-Sum of Squared Errors (wss)
  # for different values of k and choose the k for which wss becomes first stars
  # to diminish.

  sse = []
  for k in range(1, kmax + 1):
    kmeans = cluster.KMeans(n_clusters=k, max_iter=1000).fit(points)
    centroids = kmeans.cluster_centers_
    pred_clusters = kmeans.predict(points)
    curr_sse = 0

    # calculate square of Euclidean distance of each point from its cluster
    # center and add to current wss

    for i in range(len(points)):
      curr_center = centroids[pred_clusters[i]]
      curr_sse += ((points[i, 0] - curr_center[0])**2 +
                   (points[i, 1] - curr_center[1])**2)

    sse.append(curr_sse)

  return sse
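A minimal usage sketch for calculate_wss, assuming cluster refers to sklearn.cluster and the points are 2-D; the blob data is illustrative:

from sklearn import cluster
from sklearn.datasets import make_blobs

points, _ = make_blobs(n_samples=300, centers=4, n_features=2, random_state=0)
for k, wss in enumerate(calculate_wss(points, kmax=8), start=1):
    print(k, round(wss, 1))  # look for the k where the drop in WSS starts to flatten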
Example #4
def segregate(transaction_details, n):
    convergence_warning = None
    vectorizer = CountVectorizer()
    vector = vectorizer.fit_transform(transaction_details)
    word_counts = vector.toarray()
    k_means = kmeans(n_clusters=n)
    try:
        with catch_warnings():
            filterwarnings('error')
            try:
                k_means.fit(word_counts)
            except Warning as w:
                convergence_warning = str(w).split(".")[0]
            cluster_ids = k_means.labels_
        unique_clustor_ids = set(cluster_ids)
        unique_clusters = {}
        for clustor_id in unique_clustor_ids:
            common_words = []
            for j in range(len(cluster_ids)):
                if(clustor_id == cluster_ids[j]):
                    common_words.append(transaction_details[j])
            unique_clusters.update({clustor_id: findstem(list(set(common_words)))})
        clusters = []
        for clustor_id in cluster_ids:
            clusters.append((clustor_id, unique_clusters[clustor_id]))
        return (clusters,convergence_warning)
    except Exception:
        convergence_warning = "Enable to find "+str(n)+" clusters"
    return ([],convergence_warning)
def getSparseModel(num_inducing):
    random_state = np.random.RandomState(1)
    X, Y = getTrainingHoldOutData()
    km = kmeans(n_clusters=num_inducing, random_state=random_state).fit(X)
    Z = km.cluster_centers_
    model = GPflow.sgpr.SGPR(X, Y, kern=getRbfKernel(), Z=Z)
    return model
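A minimal sketch of the inducing-point selection step on its own, assuming kmeans is sklearn.cluster.KMeans (the GPflow model construction is left to the original API); the data is illustrative:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(1).rand(500, 3)
Z = KMeans(n_clusters=20, random_state=1).fit(X).cluster_centers_  # 20 inducing inputs
print(Z.shape)  # (20, 3)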
Example #6
def cluster_vectors(word2vec):
	"""Clusters a set of word vectors"""

	# Load Data
	df = pd.read_csv("data/input/thought.csv", na_filter=False, encoding="utf-8", error_bad_lines=False)
	seers_grouped = df.groupby('Seer', as_index=False)
	seers = dict(list(seers_grouped))
	thoughts = list(seers["msevrens"]["Thought"])
	ken = vectorize(thoughts, min_df=1)
	sorted_ken = sorted(ken.items(), key=operator.itemgetter(1))
	sorted_ken.reverse()

	keys_in_word2vec = word2vec.vocab.keys()
	tokens = [x[0] for x in sorted_ken[0:2500] if x[0] in keys_in_word2vec]
	X = np.array([word2vec[t].T for t in tokens])

	print(tokens)

	# Clustering
	clusters = kmeans(n_clusters=100, max_iter=5000, batch_size=128, n_init=250)
	clusters.fit(X)
	word_clusters = {word:label for word, label in zip(tokens, clusters.labels_)}
	sorted_clusters = sorted(word_clusters.items(), key=operator.itemgetter(1))
	collected = collections.defaultdict(list)

	for k in sorted_clusters:
		collected[k[1]].append(k[0])

	for key in collected.keys():
		safe_print(key, collected[key], "\n")

	# Visualize with t-SNE
	t_SNE(word2vec, tokens, collected)
Example #7
 def __init__(self):
     self.tagger = Mecab.Tagger('-Ochasen')
     self.alnum_Reg = re.compile(
         r'^[a-zA-Z0-9_(:;,./?*+&%#!<>|\u3000)~=]+$')
     # These lists hold the Twitter data:
     # the first four variables hold the fields of a line split on tab characters,
     # the last three hold pieces of the context split on spaces.
     # rt_flags: whether the tweet is a retweet
     # reps:     whom the tweet replied to
     # texts:    the actual tweet message
     self.ids, self.dates, self.contexts, self.unk_nums, self.rt_flags = [], [], [], [], []
     self.reps, self.texts, self.lines, self.morpho = [], [], [], []
     self.noun_count, self.adj_count, self.verb_count = defaultdict(
         int), defaultdict(int), defaultdict(int)
     self.word_count = defaultdict(int)
     self.total_noun, self.total_adj, self.total_verb = 0, 0, 0
     self.total_words = 0
     self.corpList = []
     self.existCount, self.totalCount = 0, 0
     self.dictIDWord, self.dictWordID = {}, {}
     self.dictIDProb, self.dictIDNum, self.dictNumFeat = {}, {}, {}
     self.dictNumID = {}
     self.features = np.array([])
     self.clf = kmeans(n_clusters=3,
                       init='random',
                       n_init=10,
                       max_iter=30,
                       tol=1e-05,
                       random_state=0)
     self.keyID = []
def reducer_creation(df_final, target_column, reducer, dataset):
    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]

    my_scorer = make_scorer(cluster_acc, greater_is_better=True)
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    components = np.linspace(2,
                             len(X.columns) - 1,
                             5,
                             dtype=np.int64,
                             endpoint=True)
    estimators = [('reduce_dim', reducer), ('clf', km)]
    param_grid = [
        dict(reduce_dim__n_components=components, clf__n_clusters=components)
    ]
    pipe = Pipeline(estimators)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=my_scorer)
    grid_search.fit(X, Y)
    mean_scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(
        len(components), -1, len(components))
    plotReducerAndCluster(mean_scores, components)

    estimators = [('reduce_dim', reducer), ('clf', gmm)]
    param_grid = [
        dict(reduce_dim__n_components=components, clf__n_components=components)
    ]
    pipe = Pipeline(estimators)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=my_scorer)
    grid_search.fit(X, Y)

    mean_scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(
        len(components), -1, len(components))
    plotReducerAndCluster(mean_scores, components)
Example #9
def Kmeans(seq, cluster=2):
    # Flatten the leading dimensions so each row is one sample, then fit k-means.
    seq = seq.reshape(-1, seq.shape[-1])
    # Note: n_jobs and precompute_distances were removed from KMeans in newer
    # scikit-learn releases; the fitted estimator is returned (labels via .labels_).
    y_pred = kmeans(n_clusters=cluster,
                    init='k-means++',
                    n_jobs=-1,
                    precompute_distances=True).fit(seq)
    return y_pred
Example #10
File: specPy.py  Project: nv73/specPy
    def build_kmeans(self, hsi_df, cluster=8):

        model = kmeans(n_clusters=cluster)

        model.fit(hsi_df)

        return (model)
Example #11
def cmeans(image, n_clusters):

	input_image = np.array(image_processing(image))
	# input_image = np.reshape(input_image, (input_image.shape[0], 1))

	classifier = kmeans(n_clusters=n_clusters)
	image_k = image

	new_image = classifier.fit_predict(input_image.T)

	comp = np.argsort(classifier.cluster_centers_[:,0], axis=0)
	indexes = list(range(len(comp)))  # list() so entries can be reassigned below

	for pos in range(len(comp)):
		indexes[comp[pos]] = pos
	indexes = np.array(indexes)


	set_color = COLORS[indexes].tolist()

	# set_color = classifier.cluster_centers_

	print(classifier.cluster_centers_)

	p = 0
	for pos_l, line in enumerate(image):
		for pos_p, point in enumerate(line):
				image_k[pos_l][pos_p] = set_color[new_image[p]]
				p+=1

	return image_k
Example #12
def clustering(X_train, X_test, X_train2, X_test2):
    X_train_ = X_train[['1', '2', '3']]
    X_test_ = X_test[['1', '2', '3']]
    X_train2_ = X_train2[['1', '2', '3', '4']]
    X_test2_ = X_test2[['1', '2', '3', '4']]

    # X_train = np.asarray(X_train)
    # X_test = np.asarray(X_test)
    # X_train2 = np.asarray(X_train2)
    # X_test2 = np.asarray(X_test2)

    clusters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40]
    st = clock()
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    km = kmeans(random_state=15)
    gmm = GMM(random_state=50)
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X_train_)
        gmm.fit(X_train_)
        # km.score = Opposite of the value of X on the K-means objective.
        #         =Sum of distances of samples to their closest cluster center
        SSE[k]['Diamond'] = km.score(X_test_)
        ll[k]['Diamond'] = gmm.score(X_test_)

        km.fit(X_train2_)
        gmm.fit(X_train2_)
        SSE[k]['CreditCard'] = km.score(X_test2_)
        ll[k]['CreditCard'] = gmm.score(X_test2_)

    return SSE, ll
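A minimal self-contained sketch of the same scoring loop on toy data, assuming kmeans is sklearn.cluster.KMeans and GMM is sklearn.mixture.GaussianMixture:

import numpy as np
from sklearn.cluster import KMeans as kmeans
from sklearn.mixture import GaussianMixture as GMM

X = np.random.RandomState(0).rand(200, 3)
SSE, ll = {}, {}
for k in (2, 4, 8):
    SSE[k] = kmeans(n_clusters=k, random_state=15).fit(X).score(X)  # negative inertia
    ll[k] = GMM(n_components=k, random_state=50).fit(X).score(X)    # mean log-likelihood per sample
print(SSE, ll)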
Example #13
def k_means_tree(data, branch_factor=5, leaf_size=50, max_iter=7):
    ndata, ndim = np.shape(data)
    tree = []
    if ndata < branch_factor:
        tree.append((data, 0, data.mean(axis=0)))
    else:
        km = kmeans(n_clusters=branch_factor, n_init=1,
                    max_iter=max_iter).fit(data)
        # The stack is the to-do list of (cluster data, node number, is-leaf flag) tuples
        tree.append((None, 0, km.cluster_centers_))
        stack = []
        for i in range(branch_factor):
            stack.append((data[km.labels_ == i, :], i + 1,
                          np.sum(km.labels_ == i) < leaf_size))
        while stack:
            data, node, leaf = stack.pop()
            if leaf:
                tree.append((data, node, None))
            else:
                km = km.fit(data)
                tree.append((None, node, km.cluster_centers_))
                for i in range(branch_factor):
                    stack.append((data[km.labels_ == i, :],
                                  node * branch_factor + i + 1,
                                  np.sum(km.labels_ == i) < leaf_size))

    return tree
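A minimal usage sketch for k_means_tree on random data, assuming kmeans is sklearn.cluster.KMeans:

import numpy as np
from sklearn.cluster import KMeans as kmeans

data = np.random.RandomState(0).rand(500, 3)
tree = k_means_tree(data, branch_factor=5, leaf_size=50, max_iter=7)
# Each entry is (points or None, node number, centers or None); leaves keep their points.
print(len(tree), tree[0][2].shape)  # the first entry is the root with the 5 top-level centers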
Example #14
def split_step(data):
    km = kmeans(n_clusters=2, random_state=0).fit(data)
    # Python 3 rewrite: boolean masks replace the tuple-unpacking lambdas of Python 2.
    data = np.asarray(data)
    return data[km.labels_ == 0], data[km.labels_ == 1]
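A minimal usage sketch for split_step, assuming kmeans is sklearn.cluster.KMeans; the two blobs are illustrative:

import numpy as np
from sklearn.cluster import KMeans as kmeans

rng = np.random.RandomState(0)
data = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(6, 1, (50, 2))])
left, right = split_step(data)
print(left.shape, right.shape)  # roughly (50, 2) each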
Example #15
 def compute_centers(self, X, layers=1):
     #use kmeans to compute centroids
     k1, k2 = self.l1_size, self.l2_size
     kmns = kmeans(n_clusters=k1,
                   compute_labels=False,
                   n_init=3,
                   max_iter=200)
     kmns.fit(X)
     if layers == 1: return kmns.cluster_centers_
     #handle only two layers right now
     Xc = kmns.transform(X)
     #cluster in transformed space
     kmns2 = kmeans(n_clusters=k2,
                    compute_labels=False,
                    n_init=3,
                    max_iter=200)
     kmns2.fit(Xc)
     return (kmns.cluster_centers_, kmns2.cluster_centers_)
Example #16
def plot_centers(train_images):
    im_clus = kmeans(10)
    im_clus.fit(train_images)
    centers = im_clus.cluster_centers_[:,0:784]
    fig = plt.figure(1)
    fig.clf()
    ax = fig.add_subplot(111)
    plot_images(centers, ax, ims_per_row=10)
    plt.savefig('centroid.png')
Example #17
def target_clusterer(df, cluster_cols):
    cluster_df = df.loc[df['cycles_to_failure'] == 0]
    cluster_df = cluster_df[cluster_cols]
    kmeans_fit = kmeans(n_clusters=2, random_state=0).fit(cluster_df)
    df = pd.DataFrame({
        'unit_id': df['unit_id'].unique(),
        'target_cluster': kmeans_fit.labels_
    })
    return (df)
Example #18
def run_adult_analysis(adultX, adultY):
    np.random.seed(0)
    clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 25, 30]

    print(
        'Part 1 - Running clustering algorithms on original datasets...adult')
    SSE = defaultdict(dict)
    BIC = defaultdict(dict)
    homo = defaultdict(lambda: defaultdict(dict))
    compl = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(adultX)
        gmm.fit(adultX)
        SSE[k]['Adult SSE'] = km.score(adultX)
        BIC[k]['Adult BIC'] = gmm.bic(adultX)  #BAYESIAN INFORMATION CRITERION
        #true = np.transpose(adultY)
        #pred = np.transpose(km.predict(adultX))
        homo[k]['Adult']['Kmeans'] = homogeneity_score(
            adultY, km.predict(adultX))  # Agreement of labels
        homo[k]['Adult']['GMM'] = homogeneity_score(adultY,
                                                    gmm.predict(adultX))
        compl[k]['Adult']['Kmeans'] = completeness_score(
            adultY, km.predict(adultX)
        )  #A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster
        compl[k]['Adult']['GMM'] = completeness_score(adultY,
                                                      gmm.predict(adultX))
        adjMI[k]['Adult']['Kmeans'] = ami(
            adultY, km.predict(adultX))  #ADJUSTED MUTUAL INFORMATION
        adjMI[k]['Adult']['GMM'] = ami(adultY, gmm.predict(adultX))

        print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    BIC = pd.DataFrame(BIC).T
    homo = pd.Panel(homo)  # pd.Panel and .ix below require an older pandas release
    compl = pd.Panel(compl)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_Cluster_Select_Kmeans.csv'
    )
    BIC.to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_Cluster_Select_GMM.csv'
    )
    homo.ix[:, :, 'Adult'].to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_homo.csv')
    compl.ix[:, :, 'Adult'].to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_compl.csv')
    adjMI.ix[:, :, 'Adult'].to_csv(
        './P1_Clustering_Algorithms_Non_Transformed/Adult_adjMI.csv')
Example #19
def KMeans(tfidf, n_TOPICS):
  km = kmeans(n_clusters=n_TOPICS, init='k-means++',\
              max_iter=100, n_init=1, verbose=True)
  km.fit(tfidf)
  kmeans_embedding = km.transform(tfidf)
  kmeans_embedding = -(kmeans_embedding -\
                  kmeans_embedding.mean(axis=0))\
                  /kmeans_embedding.std(axis=0)
  return km, kmeans_embedding
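A minimal usage sketch for the KMeans helper above, assuming kmeans is sklearn.cluster.KMeans and the tf-idf matrix comes from TfidfVectorizer; the documents are illustrative:

from sklearn.cluster import KMeans as kmeans
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "dogs chase cats", "stocks rose today", "markets fell sharply"]
tfidf = TfidfVectorizer().fit_transform(docs)
km, embedding = KMeans(tfidf, n_TOPICS=2)
print(embedding.shape)  # (4, 2): standardized negative distances to the two centroids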
Example #20
def treinaRBF(xin, yin, p, metodo='kmeans'):

    ######### Gaussian radial basis function #########
    # Helper to evaluate the (multivariate) Gaussian PDF
    def pdf_mv(x, m, K, n):
        if n == 1:
            r = np.sqrt(K)
            px = ((1 / (np.sqrt(2 * np.pi * r * r))) * np.exp(-0.5 * ((x - m) /
                                                                      (r))**2))
        else:
            parte1 = 1 / (((2 * np.pi)**(n) * (det(K))))
            parte2 = -0.5 * matmul(matmul((x - m).T, (inv(K))), (x - m))
            px = parte1 * np.exp(parte2)
        return (px)

    ##################################################

    N = xin.shape[0]  # Number of samples
    n = xin.shape[1]  # Input dimension

    if metodo == 'kmeans':
        # Cluster the inputs with scikit-learn's k-means
        xclust = kmeans(n_clusters=p).fit(xin)

    elif metodo == 'rand':
        xclust = seleciona_centros_rand(xin, p)

    else:
        print(f'Invalid method: {metodo}')
        return

    # Store the centers of the basis functions
    m = xclust.cluster_centers_

    covlist = []

    for i in range(p):
        xci = xin[xclust.labels_ == i]
        if n == 1:
            covi = np.var(xci.T)
        else:
            covi = np.cov(xci.T)
        covlist.append(covi)
    H = np.zeros((N, p))

    for j in range(N):
        for i in range(p):
            mi = np.array(m[i, :])
            cov = np.array(covlist[i])
            H[j, i] = pdf_mv(xin[j, :], mi,
                             cov + 1e-3 * np.identity(cov.shape[0]), n)

    Haug = np.append(np.ones((N, 1)), H, axis=1)
    W = matmul(np.linalg.pinv(Haug), (yin))

    return (m, covlist, W, H, xclust)
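A minimal usage sketch for treinaRBF on 2-D toy data, assuming kmeans is sklearn.cluster.KMeans and matmul, det, inv come from numpy / numpy.linalg:

import numpy as np
from numpy import matmul
from numpy.linalg import det, inv
from sklearn.cluster import KMeans as kmeans

rng = np.random.RandomState(0)
xin = rng.rand(150, 2)
yin = np.sin(2 * np.pi * xin[:, [0]]) + 0.1 * rng.randn(150, 1)
m, covlist, W, H, xclust = treinaRBF(xin, yin, p=6, metodo='kmeans')
print(m.shape, W.shape)  # (6, 2) centers and (7, 1) weights (p + 1 including the bias)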
Example #21
def Kmeans_model_predict_test(testX,testY):
    testX=testX.tolist()
    testY=testY.tolist()
    model=kmeans(n_clusters=11)
    labels=model.fit_predict(testX)
    count=0
    for l in range(len(testY)):
        if str(labels[l]) in testY[l]:
            count+=1
    acc=(count/float(len(testY)))*100.
    return acc
Example #22
def Kmeans_model_predict(trainX,trainY):
    trainX=trainX.tolist()
    trainY=trainY.tolist()
    model=kmeans(n_clusters=11)
    labels=model.fit_predict(trainX)
    count=0
    for l in range(len(trainY)):
        if labels[l]==trainY[l]:
            count+=1
    acc=(count/float(len(trainY)))*100.
    return acc
Example #23
def plot_projected_centers(encoder, decoder, enc_w, dec_w):
    latent_images = encoder(enc_w, train_images)[0]
    im_clus = kmeans(10)
    im_clus.fit(latent_images)
    centers = im_clus.cluster_centers_
    im_cents = decoder(dec_w, centers)
    fig = plt.figure(1)
    fig.clf()
    ax = fig.add_subplot(111)
    plot_images(im_cents, ax, ims_per_row=10)
    plt.savefig('centroid.png')
def clustering_append_centroid(X_data, algorithm, clusters=[2, 5, 10, 25]):
    X_data2_ = X_data[[
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
        '14', '15', '16', '17', '18', '19', '20', '21', '22', '23'
    ]]
    km = kmeans(random_state=15)
    for k in clusters:
        km.set_params(n_clusters=k)
        km = km.fit(X_data2_)
        X_data2_['Cluster' + str(k)] = km.labels_
        #X_train2_.to_csv(str(algorithm)+'Transformed-Add-Cluster-2-5-10-25-Credit_card_train.csv')
    return X_data2_
Example #25
def run_adult_NN_dim_red():
    print(
        'Part 5 - Running neural network with dimensionally reduced adult dataset...'
    )
    np.random.seed(0)
    algo_name = ['PCA', 'ICA', 'RP', 'RF']
    clusters = [2, 3, 4, 5, 6, 7, 8]
    nn_arch = [(8, ), (8, 8), (8, 8, 8), (14, ), (14, 14), (14, 14, 14),
               (28, 14, 28)]
    nn_lr = [.001, .006, .01, .06, .1, .6, 1]

    algo_name.append('original')

    # Run the NN on the dimensionally reduced and original datasets with an additional cluster dimension
    for i in range(len(algo_name)):
        # load datasets
        adult = pd.read_hdf('datasets.hdf', 'adult_' + algo_name[i])
        adultX = adult.drop('Class', 1).copy().values
        adultY = adult['Class'].copy().values

        km = kmeans(random_state=5)
        gmm = myGMM(random_state=5)

        grid = {
            'addClustKM__n_clusters': clusters,
            'NN__learning_rate_init': nn_lr,
            'NN__hidden_layer_sizes': nn_arch
        }
        mlp = MLPClassifier(max_iter=2000, early_stopping=True, random_state=5)
        pipe = Pipeline([('addClustKM', appendClusterDimKM(cluster_algo=km)),
                         ('NN', mlp)])
        gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

        gs.fit(adultX, adultY)
        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv('./P5_Neural_Networks_Reduced_With_Clusters/adult_km_' +
                   algo_name[i] + '.csv')

        grid = {
            'addClustGMM__n_clusters': clusters,
            'NN__learning_rate_init': nn_lr,
            'NN__hidden_layer_sizes': nn_arch
        }
        mlp = MLPClassifier(max_iter=2000, early_stopping=True, random_state=5)
        pipe = Pipeline([('addClustGMM',
                          appendClusterDimGMM(cluster_algo=gmm)), ('NN', mlp)])
        gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

        gs.fit(adultX, adultY)
        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv('./P5_Neural_Networks_Reduced_With_Clusters/adult_gmm_' +
                   algo_name[i] + '.csv')
Example #26
 def kmns_centers(self, X): #returns numCenters centroids using kmeans method applied to the input X
     X1 = X
     k= self.numCenters
     kmns = kmeans(n_clusters=k, compute_labels=False,
         n_init=1, max_iter=200)
     if X[0].shape == () :#if the data is 1D, we have to reshape it like this : [[],[],...] in order for kmeans to work
         X1 = X.reshape(-1,1)
     kmns.fit(X1)
     c = kmns.cluster_centers_
     if X[0].shape == () :
         return(c.reshape(len(c),))
     else :
         return(c)
Example #27
def compute_silhouette_score(X, kmax):
  """Computes the silhouette for different k and chooses the lower."""

  sil = []

  # dissimilarity would not be defined for a single cluster, thus, minimum
  # number of clusters should be 2

  for k in range(2, kmax + 1):
    kmeans = cluster.KMeans(n_clusters=k, max_iter=1000).fit(X)
    labels = kmeans.labels_
    sil.append(silhouette_score(X, labels, metric='euclidean'))
  return sil
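A minimal usage sketch, assuming cluster is sklearn.cluster and silhouette_score comes from sklearn.metrics; the blobs are illustrative:

from sklearn import cluster
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
scores = compute_silhouette_score(X, kmax=8)
print(2 + scores.index(max(scores)))  # scores[0] corresponds to k=2; pick the highest score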
Example #28
def cluster_vects(word2vect):
	#use sklearn minibatch kmeans to cluster the vectors.
	clusters = kmeans(n_clusters= 25, max_iter=10,batch_size=200,
						n_init=1,init_size=2000)
	X = np.array([i.T for i in word2vect.values()])
	y = list(word2vect.keys())
	
	print('fitting kmeans, may take some time')
	clusters.fit(X)
	print('done.')
	
	#now we can get a mapping from word->label
	#which will let us figure out which other words are in the same cluster
	return {word:label for word,label in zip(y,clusters.labels_)}
Example #29
def treinaRBF(xin, yin, p):

    ######### Gaussian radial basis function #########
    # Helper to evaluate the (multivariate) Gaussian PDF
    def pdf_mv(x, m, K, n):
        # Standard (multivariate) Gaussian density; the original expression was garbled.
        if n == 1:
            r = np.sqrt(K)
            px = (1 / np.sqrt(2 * np.pi * r**2)) * np.exp(-0.5 * ((x - m) / r)**2)
        else:
            px = 1 / np.sqrt((2 * np.pi)**n * det(K))
            px = px * np.exp(-0.5 * matmul(matmul((x - m).T, inv(K)), (x - m)))
        return (px)

    ##################################################

    N = xin.shape[0]  # Number of samples
    n = xin.shape[1]  # Input dimension

    # Cluster the inputs with scikit-learn's k-means
    xclust = kmeans(n_clusters=p).fit(xin)

    # Store the centers of the basis functions
    m = xclust.cluster_centers_
    covlist = []

    for i in range(p):
        xci = xin[xclust.labels_ == i, :]

        if n == 1:
            covi = np.var(xci)
        else:
            covi = np.cov(xci.T)  # observations are rows, so transpose for np.cov
        covlist.append(covi)

    H = np.zeros((N, p))

    for j in range(N):
        for i in range(p):
            mi = m[i, :]
            cov = covlist[i]
            H[j, i] = pdf_mv(xin[j, :], mi, cov, n)

    # print(H)
    Haug = np.append(np.ones((N, 1)), H, axis=1)
    W = matmul(matmul(inv(matmul(Haug.T, Haug)), Haug.T), yin)  # least-squares weights (normal equations)

    return (m, covlist, W, H)
Example #30
def label_alanine(traj):
    ''' use dihedral angles to cluster and label alanine dipeptide
    '''
    # calculating psi and phi angles
    psi_atoms = [6,8,14,16]
    phi_atoms = [4,6,8,14]
    indices = np.asarray([phi_atoms,psi_atoms])
    dihedrals = md.compute_dihedrals(traj,indices)
#    print(dihedrals)
    trans_di = np.transpose(dihedrals)
#    print(trans_di)
    plt.scatter(trans_di[0],trans_di[1])
    plt.xlabel('phi')
    plt.ylabel('psi')
    plt.savefig("/output/tempplot")

    # deal with the periodicity: manually shift angles so each cluster stays contiguous
    def shift_phi(x):
        if x>2.2:
            return -2*np.pi+x
        else: 
            return x
    def shift_psi(x):
        if x<-2.2:
            return 2*np.pi+x 
        else:
            return x
    dihedrals_shift = np.asarray(
                [np.vectorize(shift_phi)(trans_di[0]),
                 np.vectorize(shift_psi)(trans_di[1])]) 
#    print(trans_di)
    plt.figure()
    plt.scatter(dihedrals_shift[0],dihedrals_shift[1])
    plt.xlabel('phi')
    plt.ylabel('psi')
    plt.savefig("/output/tempplot1")
    print(dihedrals.shape)
    print(dihedrals_shift.shape)
   
    # do clustering with given initial centers
    centers = np.array([[55,48],[-77,138],[-77, -39],[60, -72]])*np.pi/180.0
    clu = kmeans(n_clusters=4,init=centers)
#    labels = clu.fit_predict(dihedrals)
    labels = clu.fit_predict(np.transpose(dihedrals_shift))
    print("centers:")
    print(clu.cluster_centers_*180.0/np.pi)
    return labels
Example #31
def run_clustering(out, perm_x, perm_y, housing_x, housing_y):
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(perm_x)
        gmm.fit(perm_x)
        SSE[k]['perm'] = km.score(perm_x)
        ll[k]['perm'] = gmm.score(perm_x)
        acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km.predict(perm_x))
        acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm.predict(perm_x))
        adjMI[k]['perm']['Kmeans'] = ami(perm_y, km.predict(perm_x))
        adjMI[k]['perm']['GMM'] = ami(perm_y, gmm.predict(perm_x))

        km.fit(housing_x)
        gmm.fit(housing_x)
        SSE[k]['housing'] = km.score(housing_x)
        ll[k]['housing'] = gmm.score(housing_x)
        acc[k]['housing']['Kmeans'] = cluster_acc(housing_y,
                                                  km.predict(housing_x))
        acc[k]['housing']['GMM'] = cluster_acc(housing_y,
                                               gmm.predict(housing_x))
        adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x))
        adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))
        print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'housing'].to_csv(out + 'Housing acc.csv')
    acc.ix[:, :, 'perm'].to_csv(out + 'Perm acc.csv')
    adjMI.ix[:, :, 'housing'].to_csv(out + 'Housing adjMI.csv')
    adjMI.ix[:, :, 'perm'].to_csv(out + 'Perm adjMI.csv')
Example #32
def cluster_vects(word2vect):
    #use sklearn minibatch kmeans to cluster the vectors.
    clusters = kmeans(n_clusters=25,
                      max_iter=10,
                      batch_size=200,
                      n_init=1,
                      init_size=2000)
    X = np.array([i.T for i in word2vect.values()])
    y = [i for i in word2vect.keys()]

    print('fitting kmeans, may take some time')
    clusters.fit(X)
    print('done.')

    #now we can get a mapping from word->label
    #which will let us figure out which other words are in the same cluster
    return {word: label for word, label in zip(y, clusters.labels_)}
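A minimal usage sketch for cluster_vects, assuming kmeans is sklearn.cluster.MiniBatchKMeans (the batch_size / init_size arguments match that estimator); the fake word vectors are illustrative:

import numpy as np
from sklearn.cluster import MiniBatchKMeans as kmeans

rng = np.random.RandomState(0)
word2vect = {'word%d' % i: rng.rand(50) for i in range(40)}  # 40 fake 50-d word vectors
word_to_cluster = cluster_vects(word2vect)
print(sorted(word_to_cluster.items())[:5])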
Example #33
def km(clusters, X, Y, name):
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=500,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10)

    gs.fit(X, Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(name + '_nn_km.csv')
Example #34
 def centers_y(self, X, Y): #returns numCenters centroids using kmeans method applied to the whole trainingset (including the desired output Y)
     X1 = X
     k = self.numCenters
     kmns = kmeans(n_clusters = k, compute_labels=False, n_init=1, max_iter = 200)
     if X[0].shape == ():
         D = np.concatenate((X.reshape(-1,1),Y.reshape(-1,1)), axis = 1)
     elif Y[0].shape == () :
         D = np.concatenate((X,Y.reshape(-1,1)), axis=1)
     else:
         D = np.concatenate((X,Y), axis=1)
     kmns.fit(D)
     cy = kmns.cluster_centers_
     if X[0].shape == ():
         c = cy[:,0].reshape(self.numCenters,)
     else:
         c = cy[:,:X[0].shape[0]]
     return(c)
def other(c_alg, X, y, name, clusters):
    grid = None
    if c_alg == 'km':
        grid = {'km__n_clusters':clusters, 'NN__alpha':nn_reg, 'NN__hidden_layer_sizes':nn_arch}
    else:
        grid = {'gmm__n_components':clusters, 'NN__alpha':nn_reg, 'NN__hidden_layer_sizes':nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    clust = kmeans(random_state=5) if c_alg == 'km' else myGMM(random_state=5)
    pipe = Pipeline([(c_alg, clust), ('NN', mlp)], memory=memory)
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(X, y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv('{}/{} cluster {}.csv'.format(OUT, name, c_alg))

    X2D = TruncatedSVD(random_state=5).fit_transform(X)
    #X2D = TSNE(verbose=10, random_state=5).fit_transform(X)
    csv = pd.DataFrame(np.hstack((X2D, np.atleast_2d(y).T)), columns=['x', 'y', 'target'])
    csv.to_csv('{}/{}.csv'.format(OUT, name))
Example #36
def cluster_map(X, k, itr):
	mapper = kmeans(n_clusters=k, n_init=itr) #only run once (weak comp); n_jobs dropped (removed from KMeans in scikit-learn 1.0)
	Xnew = mapper.fit_transform(X)
	
	#now apply the triangle method for sparsity
	#basically, it says to set to zero all distances which are 
	#further than the distance to the total mean
	x_mean = X.mean(axis=0) 
	#if the distance from the centroid is > dist to mean, set to 0
	transformeddata = []
	for x_old, x_new in zip(X, Xnew):
		#we can compute dist to mean as
		disttomean = norm(x_old-x_mean)
		#now we can compare that with each centroid dist
		transformedrow = [max(disttomean-z,0.0) for z in x_new]
		transformeddata.append(transformedrow)
	
	return np.matrix(transformeddata), mapper.cluster_centers_
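A minimal usage sketch for cluster_map, assuming kmeans is sklearn.cluster.KMeans and norm is numpy.linalg.norm; the data is illustrative:

import numpy as np
from numpy.linalg import norm
from sklearn.cluster import KMeans as kmeans

X = np.random.RandomState(0).rand(200, 4)
features, centers = cluster_map(X, k=10, itr=3)
print(features.shape, centers.shape)  # (200, 10) sparse-ish features and (10, 4) centers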
def clustering_creation(df_final, target_column, dataset):

    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]

    clusters = np.linspace(2, len(X.columns), 3, dtype=np.int64, endpoint=True)
    SSE = defaultdict(dict)
    ll = defaultdict(lambda: defaultdict(dict))
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    SS = defaultdict(lambda: defaultdict(dict))
    SSS = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)
        SSE[k][dataset] = km.score(X)
        ll[k][dataset]['AIC'] = gmm.aic(X)
        ll[k][dataset]['BIC'] = gmm.bic(X)
        SS[k][dataset]['Kmeans'] = cluster_silhouette_score(X, km.predict(X))
        SS[k][dataset]['GMM'] = cluster_silhouette_score(X, gmm.predict(X))
        SSS[k][dataset]['Kmeans'] = cluster_sample_silhouette_score(
            X, km.predict(X))
        SSS[k][dataset]['GMM'] = cluster_sample_silhouette_score(
            X, gmm.predict(X))
        acc[k][dataset]['Kmeans'] = cluster_acc(Y, km.predict(X))
        acc[k][dataset]['GMM'] = cluster_acc(Y, gmm.predict(X))
        adjMI[k][dataset]['Kmeans'] = ami(Y.squeeze(1), km.predict(X))
        adjMI[k][dataset]['GMM'] = ami(Y.squeeze(1), gmm.predict(X))
        print(k)

        cluster_labels_km = km.predict(X)
        cluster_labels_gm = gmm.predict(X)

        plot_silhouette_score(X, SS, SSS, k, dataset, cluster_labels_km,
                              cluster_labels_gm)
    plot_cluster_accuracy(dataset, acc, clusters)
    plot_cluster_information(dataset, adjMI, clusters)
    KMeans_ELBOW(dataset, SSE, clusters)
    BICandAIC(dataset, ll, clusters)
	tmp = None
	print("cool")
	return X_reduce, u, v_T

print "Reading data"
user_dict, movie_dict, data = readData()
cut1, cut2 = int(len(data)*0.6), int(len(data)*0.8)
train, validate,test = data[:cut1, :], data[cut1:cut2, :], data[cut2:, :]
num_user, num_movie = len(user_dict), len(movie_dict)

w, l = int(num_user/2), int(num_movie/2)

print(w, l)
print("kmeans user")
train_matrix = buildMatrix(train, user_dict, movie_dict, num_user, num_movie)
user_clf = kmeans(n_clusters = w, n_jobs = 4)
user_clf.fit(train_matrix)
new_rate = []
for centroid in user_clf.cluster_centers_:
	new_rate.append(centroid[1])
new_rate = np.array(new_rate)

print "kmeans item"
item_clf = kmeans(n_clusters = l, n_jobs = 4)
item_clf.fit(new_rate.transpose())
final_rate = []
for centroid in item_clf.cluster_centers_:
	final_rate.append(centroid[1])
train_reduce = np.array(final_rate).transpose()

print "save train"
Example #39
import logging
import biggus
import h5py
from itertools import zip_longest
from sklearn.cluster import MiniBatchKMeans as kmeans

# https://github.com/bmcfee/ml_scraps/blob/master/BufferedEstimator.py
from BufferedEstimator import BufferedEstimator

wordvec = biggus.NumpyArrayAdapter(h5py.File('model/wordvec.hdf5')['wordvec'])

def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

clusters = kmeans(n_clusters=500, max_iter=10, random_state=0)

def generator(X):
    for i, x in enumerate(X):
        if i % 10000 == 0:
            logging.info('Throwing seq #{0}'.format(i))
        yield x.ndarray()

buf_est = BufferedEstimator(clusters, batch_size=1000)
buf_est.fit(generator(wordvec))
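The BufferedEstimator wraps the standard partial_fit streaming pattern; a minimal sketch of that pattern with MiniBatchKMeans alone (the chunking below is illustrative):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

km = MiniBatchKMeans(n_clusters=500, random_state=0)
rng = np.random.RandomState(0)
for _ in range(50):           # 50 incoming chunks of 1000 vectors each
    batch = rng.rand(1000, 100)
    km.partial_fit(batch)     # update the centroids incrementally
print(km.cluster_centers_.shape)  # (500, 100)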
	if not u and not v_T:
		u, s, v_T = svd(X)
	X_reduce = ((u[:, :w].transpose()).dot(X)).dot(v_T[:l, :].transpose())
	return X_reduce, u, v_T

print "Reading data"
user_dict, movie_dict, data = readData()
cut1, cut2 = int(len(data)*0.6), int(len(data)*0.8)
train, validate,test = data[:cut1, :], data[cut1:cut2, :], data[cut2:, :]
num_user, num_movie = len(user_dict), len(movie_dict)

w, l = int(num_user/2), int(num_movie/2)

print "train kmeans classifier"
train_reduce = [0,0,0.0] * len(train)
user_clf = kmeans(n_clusters=w, max_iter= 30, n_jobs=4)
print "train user"
train_category = user_clf.fit_predict(train[:,[0,2]])
for i in range(len(train_category)):
	train_reduce[i][0] = train_category[i]
	train_reduce[i][1] = train[i, 1]
	train_reduce[i][2] = user_clf.cluster_centers_[train_category[i], 1]
print "train product"
product_clf = kmeans(n_clusters=l, max_iter= 30, n_jobs=4)
train_category = product_clf.fit_predict(train_reduce[:,[1,2]])
for i in range(len(train_category)):
	train_reduce[i][1] = train_category[i]
	train_reduce[i][2] = product_clf.cluster_centers_[train_category[i], 1]

print "validate_reduction"
validate_reduce = [0,0,0.0] * len(validate)