def semesters_cluster(self, **kwargs):
    start_year = self._ha_df[self.year_attr].values.min()
    end_year = self._ha_df[self.year_attr].values.max()
    self.se_df = self.semesters_features
    self.se_df = self.se_df[
        (self.se_df[self.year_attr].values >= start_year) &
        (self.se_df[self.year_attr].values <= end_year)
    ]
    _h_program = hash(self._program)
    self.se_df = self.se_df.fillna(0)  # fillna returns a copy, so reassign it
    data = self.se_df[self.SEMESTERS_F_LABELS].values  # as_matrix() is deprecated
    if not kwargs:
        C = self._C_k
        try:
            # reuse previously saved centers if they exist
            c_init = genfromtxt('./data/%s/centers_%d.csv' % (self.source, _h_program),
                                delimiter=',')
            km = kmeans(init=c_init, n_clusters=C, n_init=10)
            km.fit(data)
        except IOError:
            km = kmeans(init='k-means++', n_clusters=C, n_init=10)
            km.fit(data)
            savetxt('./data/%s/centers_%d.csv' % (self.source, _h_program),
                    km.cluster_centers_, delimiter=',')
    else:
        km = kmeans(**kwargs)
        km.fit(data)
    cntr, L = km.cluster_centers_, km.labels_
    #self.semesters_features['km_cluster_ID'] = L
    self.se_df['km_cluster_ID'] = L
    self.cntr_se = cntr
def compute_centers(self, X, layers=1):
    # use k-means to compute centroids
    k1, k2 = self.l1_size, self.l2_size
    kmns = kmeans(n_clusters=k1, compute_labels=False, n_init=3, max_iter=200)
    kmns.fit(X)
    if layers == 1:
        return kmns.cluster_centers_
    # handle only two layers right now
    Xc = kmns.transform(X)
    # cluster in the transformed (distance-to-centroid) space
    kmns2 = kmeans(n_clusters=k2, compute_labels=False, n_init=3, max_iter=200)
    kmns2.fit(Xc)
    return (kmns.cluster_centers_, kmns2.cluster_centers_)
def calculate_wss(points, kmax):
    """Calculate the Within-Cluster Sum of Squared errors (WSS)."""
    # For the Elbow Method, compute the WSS for different values of k and
    # choose the k at which the WSS first starts to diminish.
    sse = []
    for k in range(1, kmax + 1):
        kmeans = cluster.kmeans(n_clusters=k, max_iter=1000).fit(points)
        centroids = kmeans.cluster_centers_
        pred_clusters = kmeans.predict(points)
        curr_sse = 0
        # add the squared Euclidean distance of each point to its cluster
        # center to the current WSS
        for i in range(len(points)):
            curr_center = centroids[pred_clusters[i]]
            curr_sse += ((points[i, 0] - curr_center[0]) ** 2 +
                         (points[i, 1] - curr_center[1]) ** 2)
        sse.append(curr_sse)
    return sse
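# A minimal usage sketch for the elbow method built on calculate_wss above.
# The 2-D toy data and the matplotlib plotting are illustrative assumptions,
# not part of the original snippet.
import numpy as np
import matplotlib.pyplot as plt

points = np.random.RandomState(0).rand(200, 2)  # hypothetical 2-D toy data
wss = calculate_wss(points, kmax=10)

plt.plot(range(1, 11), wss, marker='o')
plt.xlabel('k')
plt.ylabel('within-cluster sum of squares')
plt.savefig('elbow.png')  # look for the "elbow" where the curve stops dropping sharply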
def segregate(transaction_details, n):
    convergence_warning = None
    vectorizer = CountVectorizer()
    vector = vectorizer.fit_transform(transaction_details)
    word_counts = vector.toarray()
    k_means = kmeans(n_clusters=n)
    try:
        with catch_warnings():
            filterwarnings('error')
            try:
                k_means.fit(word_counts)
            except Warning as w:
                convergence_warning = str(w).split(".")[0]
        cluster_ids = k_means.labels_
        unique_cluster_ids = set(cluster_ids)
        unique_clusters = {}
        for cluster_id in unique_cluster_ids:
            common_words = []
            for j in range(len(cluster_ids)):
                if cluster_id == cluster_ids[j]:
                    common_words.append(transaction_details[j])
            unique_clusters.update({cluster_id: findstem(list(set(common_words)))})
        clusters = []
        for cluster_id in cluster_ids:
            clusters.append((cluster_id, unique_clusters[cluster_id]))
        return (clusters, convergence_warning)
    except Exception:
        convergence_warning = "Unable to find " + str(n) + " clusters"
        return ([], convergence_warning)
def getSparseModel(num_inducing):
    random_state = np.random.RandomState(1)
    X, Y = getTrainingHoldOutData()
    # initialise the inducing points Z at the k-means cluster centers
    km = kmeans(n_clusters=num_inducing, random_state=random_state).fit(X)
    Z = km.cluster_centers_
    model = GPflow.sgpr.SGPR(X, Y, kern=getRbfKernel(), Z=Z)
    return model
def cluster_vectors(word2vec):
    """Clusters a set of word vectors"""
    # Load data
    df = pd.read_csv("data/input/thought.csv", na_filter=False,
                     encoding="utf-8", error_bad_lines=False)
    seers_grouped = df.groupby('Seer', as_index=False)
    seers = dict(list(seers_grouped))
    thoughts = list(seers["msevrens"]["Thought"])
    ken = vectorize(thoughts, min_df=1)
    sorted_ken = sorted(ken.items(), key=operator.itemgetter(1))
    sorted_ken.reverse()
    keys_in_word2vec = word2vec.vocab.keys()
    tokens = [x[0] for x in sorted_ken[0:2500] if x[0] in keys_in_word2vec]
    X = np.array([word2vec[t].T for t in tokens])
    print(tokens)

    # Clustering
    clusters = kmeans(n_clusters=100, max_iter=5000, batch_size=128, n_init=250)
    clusters.fit(X)
    word_clusters = {word: label for word, label in zip(tokens, clusters.labels_)}
    sorted_clusters = sorted(word_clusters.items(), key=operator.itemgetter(1))
    collected = collections.defaultdict(list)

    for k in sorted_clusters:
        collected[k[1]].append(k[0])

    for key in collected.keys():
        safe_print(key, collected[key], "\n")

    # Visualize with t-SNE
    t_SNE(word2vec, tokens, collected)
def __init__(self):
    self.tagger = Mecab.Tagger('-Ochasen')
    self.alnum_Reg = re.compile(r'^[a-zA-Z0-9_(:;,./?*+&%#!<>|\u3000)~=]+$')
    # These lists are prepared for the Twitter data.
    # The first 4 variables hold the fields of a line that is divided by tabs;
    # the last 3 hold the parts of the context that is divided by spaces.
    #   rt_flags: whether the tweet is a retweet or not
    #   reps:     to whom the tweet was a reply
    #   texts:    the actual tweet message
    self.ids, self.dates, self.contexts, self.unk_nums, self.rt_flags = [], [], [], [], []
    self.reps, self.texts, self.lines, self.morpho = [], [], [], []
    self.noun_count, self.adj_count, self.verb_count = (defaultdict(int),
                                                        defaultdict(int),
                                                        defaultdict(int))
    self.word_count = defaultdict(int)
    self.total_noun, self.total_adj, self.total_verb = 0, 0, 0
    self.total_words = 0
    self.corpList = []
    self.existCount, self.totalCount = 0, 0
    self.dictIDWord, self.dictWordID = {}, {}
    self.dictIDProb, self.dictIDNum, self.dictNumFeat = {}, {}, {}
    self.dictNumID = {}
    self.features = np.array([])
    self.clf = kmeans(n_clusters=3, init='random', n_init=10,
                      max_iter=30, tol=1e-05, random_state=0)
    self.keyID = []
def reducer_creation(df_final, target_column, reducer, dataset):
    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]
    my_scorer = make_scorer(cluster_acc, greater_is_better=True)
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)
    components = np.linspace(2, len(X.columns) - 1, 5, dtype=np.int64, endpoint=True)

    # k-means pipeline: dimensionality reduction followed by clustering
    estimators = [('reduce_dim', reducer), ('clf', km)]
    param_grid = [dict(reduce_dim__n_components=components, clf__n_clusters=components)]
    pipe = Pipeline(estimators)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=my_scorer)
    grid_search.fit(X, Y)
    mean_scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(
        len(components), -1, len(components))
    plotReducerAndCluster(mean_scores, components)

    # GMM pipeline: same reducer, expectation-maximisation clustering
    estimators = [('reduce_dim', reducer), ('clf', gmm)]
    param_grid = [dict(reduce_dim__n_components=components, clf__n_components=components)]
    pipe = Pipeline(estimators)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=my_scorer)
    grid_search.fit(X, Y)
    mean_scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(
        len(components), -1, len(components))
    plotReducerAndCluster(mean_scores, components)
def Kmeans(seq, cluster=2):
    seq = seq.reshape(-1, seq.shape[-1])
    y_pred = kmeans(n_clusters=cluster, init='k-means++',
                    n_jobs=-1, precompute_distances=True).fit(seq)
    return y_pred
def build_kmeans(self, hsi_df, cluster=8):
    model = kmeans(n_clusters=cluster)
    model.fit(hsi_df)
    return model
def cmeans(image, n_clusters):
    input_image = np.array(image_processing(image))
    # input_image = np.reshape(input_image, (input_image.shape[0], 1))
    classifier = kmeans(n_clusters=n_clusters)
    image_k = image
    new_image = classifier.fit_predict(input_image.T)
    comp = np.argsort(classifier.cluster_centers_[:, 0], axis=0)
    # build the inverse permutation so clusters are coloured by center order
    indexes = list(range(len(comp)))  # range() is not item-assignable in Python 3
    for pos in range(len(comp)):
        indexes[comp[pos]] = pos
    indexes = np.array(indexes)
    set_color = COLORS[indexes].tolist()
    # set_color = classifier.cluster_centers_
    print(classifier.cluster_centers_)
    p = 0
    for pos_l, line in enumerate(image):
        for pos_p, point in enumerate(line):
            image_k[pos_l][pos_p] = set_color[new_image[p]]
            p += 1
    return image_k
def clustering(X_train, X_test, X_train2, X_test2):
    X_train_ = X_train[['1', '2', '3']]
    X_test_ = X_test[['1', '2', '3']]
    X_train2_ = X_train2[['1', '2', '3', '4']]
    X_test2_ = X_test2[['1', '2', '3', '4']]
    # X_train = np.asarray(X_train)
    # X_test = np.asarray(X_test)
    # X_train2 = np.asarray(X_train2)
    # X_test2 = np.asarray(X_test2)
    clusters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40]
    st = clock()
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    km = kmeans(random_state=15)
    gmm = GMM(random_state=50)

    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X_train_)
        gmm.fit(X_train_)
        # km.score is the opposite of the value of X on the k-means objective,
        # i.e. the (negated) sum of distances of samples to their closest cluster center
        SSE[k]['Diamond'] = km.score(X_test_)
        ll[k]['Diamond'] = gmm.score(X_test_)

        km.fit(X_train2_)
        gmm.fit(X_train2_)
        SSE[k]['CreditCard'] = km.score(X_test2_)
        ll[k]['CreditCard'] = gmm.score(X_test2_)

    return SSE, ll
def k_means_tree(data, branch_factor=5, leaf_size=50, max_iter=7):
    ndata, ndim = np.shape(data)
    tree = []
    if ndata < branch_factor:
        tree.append((data, 0, data.mean(axis=0)))
    else:
        km = kmeans(n_clusters=branch_factor, n_init=1, max_iter=max_iter).fit(data)
        # The stack is the to-do list of (data, node number, is_leaf) entries
        tree.append((None, 0, km.cluster_centers_))
        stack = []
        for i in range(branch_factor):
            stack.append((data[km.labels_ == i, :], i + 1,
                          np.sum(km.labels_ == i) < leaf_size))
        while stack:
            data, node, leaf = stack.pop()
            if leaf:
                tree.append((data, node, None))
            else:
                km = km.fit(data)
                tree.append((None, node, km.cluster_centers_))
                for i in range(branch_factor):
                    stack.append((data[km.labels_ == i, :],
                                  node * branch_factor + i + 1,
                                  np.sum(km.labels_ == i) < leaf_size))
    return tree
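# A hedged usage sketch for k_means_tree above: the random toy data and the
# leaf-counting convention (third tuple element is None for leaves) are
# assumptions drawn from the function body, not from separate documentation.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(1000, 8)  # hypothetical toy data: 1,000 points in 8 dimensions

tree = k_means_tree(X, branch_factor=5, leaf_size=50, max_iter=7)
# internal nodes store (None, node_id, centers); leaves store (points, node_id, None)
n_leaves = sum(1 for node in tree if node[2] is None)
print(len(tree), "nodes,", n_leaves, "leaves")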
def split_step(data):
    km = kmeans(n_clusters=2, random_state=0).fit(data)
    # materialise the pairs: in Python 3, zip() is a one-shot iterator
    to_split = list(zip(data, km.labels_))
    cluster_0 = [entry for entry, cluster in to_split if cluster == 0]
    cluster_1 = [entry for entry, cluster in to_split if cluster == 1]
    return np.asarray(cluster_0), np.asarray(cluster_1)
def plot_centers(train_images):
    im_clus = kmeans(n_clusters=10)
    im_clus.fit(train_images)
    centers = im_clus.cluster_centers_[:, 0:784]
    fig = plt.figure(1)
    fig.clf()
    ax = fig.add_subplot(111)
    plot_images(centers, ax, ims_per_row=10)
    plt.savefig('centroid.png')
def target_clusterer(df, cluster_cols):
    cluster_df = df.loc[df['cycles_to_failure'] == 0]
    cluster_df = cluster_df[cluster_cols]
    kmeans_fit = kmeans(n_clusters=2, random_state=0).fit(cluster_df)
    df = pd.DataFrame({
        'unit_id': df['unit_id'].unique(),
        'target_cluster': kmeans_fit.labels_
    })
    return df
def run_adult_analysis(adultX, adultY):
    np.random.seed(0)
    clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 20, 25, 30]
    print('Part 1 - Running clustering algorithms on original datasets...adult')

    SSE = defaultdict(dict)
    BIC = defaultdict(dict)
    homog = defaultdict(lambda: defaultdict(dict))
    compl = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(adultX)
        gmm.fit(adultX)
        SSE[k]['Adult SSE'] = km.score(adultX)
        BIC[k]['Adult BIC'] = gmm.bic(adultX)  # Bayesian information criterion
        #true = np.transpose(adultY)
        #pred = np.transpose(km.predict(adultX))
        # Homogeneity: agreement of cluster assignments with the true labels
        homog[k]['Adult']['Kmeans'] = homogeneity_score(adultY, km.predict(adultX))
        homog[k]['Adult']['GMM'] = homogeneity_score(adultY, gmm.predict(adultX))
        # Completeness: all points of a given class end up in the same cluster
        compl[k]['Adult']['Kmeans'] = completeness_score(adultY, km.predict(adultX))
        compl[k]['Adult']['GMM'] = completeness_score(adultY, gmm.predict(adultX))
        # Adjusted mutual information
        adjMI[k]['Adult']['Kmeans'] = ami(adultY, km.predict(adultX))
        adjMI[k]['Adult']['GMM'] = ami(adultY, gmm.predict(adultX))
        print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    BIC = pd.DataFrame(BIC).T
    homog = pd.Panel(homog)
    compl = pd.Panel(compl)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_Cluster_Select_Kmeans.csv')
    BIC.to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_Cluster_Select_GMM.csv')
    homog.ix[:, :, 'Adult'].to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_homo.csv')
    compl.ix[:, :, 'Adult'].to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_compl.csv')
    adjMI.ix[:, :, 'Adult'].to_csv('./P1_Clustering_Algorithms_Non_Transformed/Adult_adjMI.csv')
def KMeans(tfidf, n_TOPICS):
    km = kmeans(n_clusters=n_TOPICS, init='k-means++',
                max_iter=100, n_init=1, verbose=True)
    km.fit(tfidf)
    # standardise the (negated) cluster-distance embedding
    kmeans_embedding = km.transform(tfidf)
    kmeans_embedding = -(kmeans_embedding - kmeans_embedding.mean(axis=0)) \
        / kmeans_embedding.std(axis=0)
    return km, kmeans_embedding
def treinaRBF(xin, yin, p, metodo='kmeans'):
    ######### Gaussian radial basis function #########
    # Helper that evaluates the (multivariate) normal PDF
    def pdf_mv(x, m, K, n):
        if n == 1:
            r = np.sqrt(K)
            px = (1 / (np.sqrt(2 * np.pi * r * r))) * np.exp(-0.5 * ((x - m) / r) ** 2)
        else:
            # standard multivariate normal normalisation: 1 / sqrt((2*pi)^n |K|)
            parte1 = 1 / np.sqrt(((2 * np.pi) ** n) * det(K))
            parte2 = -0.5 * matmul(matmul((x - m).T, inv(K)), (x - m))
            px = parte1 * np.exp(parte2)
        return px
    ##################################################

    N = xin.shape[0]  # number of samples
    n = xin.shape[1]  # input dimension

    if metodo == 'kmeans':
        # cluster the inputs with scikit-learn's k-means
        xclust = kmeans(n_clusters=p).fit(xin)
    elif metodo == 'rand':
        xclust = seleciona_centros_rand(xin, p)
    else:
        print(f'Invalid method {metodo}')
        return

    # store the centers of the basis functions
    m = xclust.cluster_centers_
    covlist = []
    for i in range(p):
        xci = xin[xclust.labels_ == i]
        if n == 1:
            covi = np.var(xci.T)
        else:
            covi = np.cov(xci.T)
        covlist.append(covi)

    H = np.zeros((N, p))
    for j in range(N):
        for i in range(p):
            mi = np.array(m[i, :])
            cov = np.array(covlist[i])
            H[j, i] = pdf_mv(xin[j, :], mi,
                             cov + 1e-3 * np.identity(cov.shape[0]), n)

    # least-squares output weights on the bias-augmented design matrix
    Haug = np.append(np.ones((N, 1)), H, axis=1)
    W = matmul(np.linalg.pinv(Haug), yin)
    return (m, covlist, W, H, xclust)
def Kmeans_model_predict_test(testX, testY):
    testX = testX.tolist()
    testY = testY.tolist()
    model = kmeans(n_clusters=11)
    labels = model.fit_predict(testX)
    count = 0
    for l in range(len(testY)):
        if str(labels[l]) in testY[l]:
            count += 1
    acc = (count / float(len(testY))) * 100.
    return acc
def Kmeans_model_predict(trainX, trainY):
    trainX = trainX.tolist()
    trainY = trainY.tolist()
    model = kmeans(n_clusters=11)
    labels = model.fit_predict(trainX)
    count = 0
    for l in range(len(trainY)):
        if labels[l] == trainY[l]:
            count += 1
    acc = (count / float(len(trainY))) * 100.
    return acc
def plot_projected_centers(encoder, decoder, enc_w, dec_w):
    latent_images = encoder(enc_w, train_images)[0]
    im_clus = kmeans(n_clusters=10)
    im_clus.fit(latent_images)
    centers = im_clus.cluster_centers_
    im_cents = decoder(dec_w, centers)
    fig = plt.figure(1)
    fig.clf()
    ax = fig.add_subplot(111)
    plot_images(im_cents, ax, ims_per_row=10)
    plt.savefig('centroid.png')
def clustering_append_centroid(X_data, algorithm, clusters=[2, 5, 10, 25]):
    X_data2_ = X_data[[
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
        '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23'
    ]]
    km = kmeans(random_state=15)
    for k in clusters:
        km.set_params(n_clusters=k)
        km = km.fit(X_data2_)
        # append the cluster assignment for this k as an extra feature column
        X_data2_['Cluster' + str(k)] = km.labels_
    # X_train2_.to_csv(str(algorithm)+'Transformed-Add-Cluster-2-5-10-25-Credit_card_train.csv')
    return X_data2_
def run_adult_NN_dim_red():
    print('Part 5 - Running neural network with dimensionally reduced adult dataset...')
    np.random.seed(0)
    algo_name = ['PCA', 'ICA', 'RP', 'RF']
    clusters = [2, 3, 4, 5, 6, 7, 8]
    nn_arch = [(8,), (8, 8), (8, 8, 8), (14,), (14, 14), (14, 14, 14), (28, 14, 28)]
    nn_lr = [.001, .006, .01, .06, .1, .6, 1]
    algo_name.append('original')

    # Run the NN on dimensionally reduced and original datasets with an added cluster dimension
    for i in range(len(algo_name)):
        # load datasets
        adult = pd.read_hdf('datasets.hdf', 'adult_' + algo_name[i])
        adultX = adult.drop('Class', 1).copy().values
        adultY = adult['Class'].copy().values

        km = kmeans(random_state=5)
        gmm = myGMM(random_state=5)

        grid = {
            'addClustKM__n_clusters': clusters,
            'NN__learning_rate_init': nn_lr,
            'NN__hidden_layer_sizes': nn_arch
        }
        mlp = MLPClassifier(max_iter=2000, early_stopping=True, random_state=5)
        pipe = Pipeline([('addClustKM', appendClusterDimKM(cluster_algo=km)), ('NN', mlp)])
        gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
        gs.fit(adultX, adultY)
        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv('./P5_Neural_Networks_Reduced_With_Clusters/adult_km_' + algo_name[i] + '.csv')

        grid = {
            'addClustGMM__n_clusters': clusters,
            'NN__learning_rate_init': nn_lr,
            'NN__hidden_layer_sizes': nn_arch
        }
        mlp = MLPClassifier(max_iter=2000, early_stopping=True, random_state=5)
        pipe = Pipeline([('addClustGMM', appendClusterDimGMM(cluster_algo=gmm)), ('NN', mlp)])
        gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
        gs.fit(adultX, adultY)
        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv('./P5_Neural_Networks_Reduced_With_Clusters/adult_gmm_' + algo_name[i] + '.csv')
def kmns_centers(self, X):
    # returns numCenters centroids found by k-means applied to the input X
    X1 = X
    k = self.numCenters
    kmns = kmeans(n_clusters=k, compute_labels=False, n_init=1, max_iter=200)
    if X[0].shape == ():
        # if the data is 1-D, reshape it to a column ([[x0], [x1], ...]) so k-means accepts it
        X1 = X.reshape(-1, 1)
    kmns.fit(X1)
    c = kmns.cluster_centers_
    if X[0].shape == ():
        return c.reshape(len(c),)
    else:
        return c
def compute_silhouette_score(X, kmax):
    """Computes the silhouette score for different k; the best k is the one with the highest score."""
    sil = []
    # dissimilarity is not defined for a single cluster, so the minimum
    # number of clusters is 2
    for k in range(2, kmax + 1):
        kmeans = cluster.kmeans(n_clusters=k, max_iter=1000).fit(X)
        labels = kmeans.labels_
        sil.append(silhouette_score(X, labels, metric='euclidean'))
    return sil
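# A hedged usage sketch for compute_silhouette_score above: the toy data is an
# assumption, and it relies on the same module-level cluster/silhouette_score
# imports the function itself uses.
import numpy as np

X = np.random.RandomState(0).rand(300, 4)  # hypothetical toy data
scores = compute_silhouette_score(X, kmax=10)
best_k = int(np.argmax(scores)) + 2  # scores start at k=2
print("best k by silhouette:", best_k)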
def treinaRBF(xin, yin, p):
    ######### Gaussian radial basis function #########
    # Helper that evaluates the (multivariate) normal PDF
    def pdf_mv(x, m, K, n):
        if n == 1:
            r = np.sqrt(K)
            px = 1 / (np.sqrt(2 * np.pi * r ** 2)) * np.exp(-0.5 * ((x - m) / r) ** 2)
        else:
            # standard multivariate normal normalisation: 1 / sqrt((2*pi)^n |K|)
            px = 1 / (np.sqrt(((2 * np.pi) ** n) * det(K)))
            px = px * np.exp(-0.5 * (matmul(matmul((x - m).T, inv(K)), (x - m))))
        return px
    ##################################################

    N = xin.shape[0]  # number of samples
    n = xin.shape[1]  # input dimension

    # cluster the inputs with scikit-learn's k-means
    xclust = kmeans(n_clusters=p).fit(xin)

    # store the centers of the basis functions
    m = xclust.cluster_centers_
    covlist = []
    for i in range(p):
        xci = xin[xclust.labels_ == i, :]
        if n == 1:
            covi = np.var(xci)
        else:
            covi = np.cov(xci.T)
        covlist.append(covi)

    H = np.zeros((N, p))
    for j in range(N):
        for i in range(p):
            mi = m[i, :]
            cov = covlist[i]
            H[j, i] = pdf_mv(xin[j, :], mi, cov, n)
    # print(H)

    # least-squares weights: W = (H'H)^-1 H' y on the bias-augmented design matrix
    Haug = np.append(np.ones((N, 1)), H, axis=1)
    W = matmul(matmul(inv(matmul(Haug.T, Haug)), Haug.T), yin)
    return (m, covlist, W, H)
def label_alanine(traj):
    '''use dihedral angles to cluster and label alanine dipeptide'''
    # calculate the psi and phi angles
    psi_atoms = [6, 8, 14, 16]
    phi_atoms = [4, 6, 8, 14]
    indices = np.asarray([phi_atoms, psi_atoms])
    dihedrals = md.compute_dihedrals(traj, indices)

    trans_di = np.transpose(dihedrals)
    plt.scatter(trans_di[0], trans_di[1])
    plt.xlabel('phi')
    plt.ylabel('psi')
    plt.savefig("/output/tempplot")

    # deal with the periodic boundary: manually shift angles so each cluster stays together
    def shift_phi(x):
        if x > 2.2:
            return -2 * np.pi + x
        else:
            return x

    def shift_psi(x):
        if x < -2.2:
            return 2 * np.pi + x
        else:
            return x

    dihedrals_shift = np.asarray([np.vectorize(shift_phi)(trans_di[0]),
                                  np.vectorize(shift_psi)(trans_di[1])])

    plt.figure()
    plt.scatter(dihedrals_shift[0], dihedrals_shift[1])
    plt.xlabel('phi')
    plt.ylabel('psi')
    plt.savefig("/output/tempplot1")

    print(dihedrals.shape)
    print(dihedrals_shift.shape)

    # cluster with the given initial centers (converted from degrees to radians)
    centers = np.array([[55, 48], [-77, 138], [-77, -39], [60, -72]]) * np.pi / 180.0
    clu = kmeans(n_clusters=4, init=centers)
    # labels = clu.fit_predict(dihedrals)
    labels = clu.fit_predict(np.transpose(dihedrals_shift))
    print("centers:")
    print(clu.cluster_centers_ * 180.0 / np.pi)
    return labels
def run_clustering(out, perm_x, perm_y, housing_x, housing_y):
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    st = clock()
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)

        km.fit(perm_x)
        gmm.fit(perm_x)
        SSE[k]['perm'] = km.score(perm_x)
        ll[k]['perm'] = gmm.score(perm_x)
        acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km.predict(perm_x))
        acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm.predict(perm_x))
        adjMI[k]['perm']['Kmeans'] = ami(perm_y, km.predict(perm_x))
        adjMI[k]['perm']['GMM'] = ami(perm_y, gmm.predict(perm_x))

        km.fit(housing_x)
        gmm.fit(housing_x)
        SSE[k]['housing'] = km.score(housing_x)
        ll[k]['housing'] = gmm.score(housing_x)
        acc[k]['housing']['Kmeans'] = cluster_acc(housing_y, km.predict(housing_x))
        acc[k]['housing']['GMM'] = cluster_acc(housing_y, gmm.predict(housing_x))
        adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x))
        adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))
        print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'housing'].to_csv(out + 'Housing acc.csv')
    acc.ix[:, :, 'perm'].to_csv(out + 'Perm acc.csv')
    adjMI.ix[:, :, 'housing'].to_csv(out + 'Housing adjMI.csv')
    adjMI.ix[:, :, 'perm'].to_csv(out + 'Perm adjMI.csv')
def cluster_vects(word2vect):
    # use sklearn mini-batch k-means to cluster the vectors
    clusters = kmeans(n_clusters=25, max_iter=10, batch_size=200,
                      n_init=1, init_size=2000)
    X = np.array([i.T for i in word2vect.values()])   # itervalues()/iterkeys() are Python 2 only
    y = [i for i in word2vect.keys()]
    print('fitting kmeans, may take some time')
    clusters.fit(X)
    print('done.')
    # now we have a mapping from word -> label, which lets us figure out
    # which other words are in the same cluster
    return {word: label for word, label in zip(y, clusters.labels_)}
def km(clusters, X, Y, name):
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu', max_iter=500,
                        early_stopping=True, random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10)
    gs.fit(X, Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(name + '_nn_km.csv')
def centers_y(self, X, Y):
    # returns numCenters centroids found by k-means applied to the whole
    # training set (including the desired output Y)
    k = self.numCenters
    kmns = kmeans(n_clusters=k, compute_labels=False, n_init=1, max_iter=200)
    if X[0].shape == ():
        D = np.concatenate((X.reshape(-1, 1), Y.reshape(-1, 1)), axis=1)
    elif Y[0].shape == ():
        D = np.concatenate((X, Y.reshape(-1, 1)), axis=1)
    else:
        D = np.concatenate((X, Y), axis=1)
    kmns.fit(D)
    cy = kmns.cluster_centers_
    # keep only the input part of each centroid
    if X[0].shape == ():
        c = cy[:, 0].reshape(self.numCenters,)
    else:
        c = cy[:, :X[0].shape[0]]
    return c
def other(c_alg, X, y, name, clusters):
    if c_alg == 'km':
        grid = {'km__n_clusters': clusters, 'NN__alpha': nn_reg,
                'NN__hidden_layer_sizes': nn_arch}
    else:
        grid = {'gmm__n_components': clusters, 'NN__alpha': nn_reg,
                'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000,
                        early_stopping=True, random_state=5)
    clust = kmeans(random_state=5) if c_alg == 'km' else myGMM(random_state=5)
    pipe = Pipeline([(c_alg, clust), ('NN', mlp)], memory=memory)
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(X, y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv('{}/{} cluster {}.csv'.format(OUT, name, c_alg))

    X2D = TruncatedSVD(random_state=5).fit_transform(X)
    # X2D = TSNE(verbose=10, random_state=5).fit_transform(X)
    csv = pd.DataFrame(np.hstack((X2D, np.atleast_2d(y).T)),
                       columns=['x', 'y', 'target'])
    csv.to_csv('{}/{}.csv'.format(OUT, name))
def cluster_map(X, k, itr):
    mapper = kmeans(n_clusters=k, n_jobs=-1, n_init=itr)  # only run once (weak computer)
    Xnew = mapper.fit_transform(X)
    # Now apply the triangle method for sparsity: set to zero every centroid
    # distance that is larger than the distance to the overall mean.
    x_mean = X.mean(axis=0)
    # if the distance to a centroid is > the distance to the mean, set it to 0
    transformeddata = []
    for x_old, x_new in zip(X, Xnew):  # it.izip is Python 2 only
        # distance to the mean
        disttomean = norm(x_old - x_mean)
        # compare it with each centroid distance
        transformedrow = [max(disttomean - z, 0.0) for z in x_new]
        transformeddata.append(transformedrow)
    return np.matrix(transformeddata), mapper.cluster_centers_
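# A hedged usage sketch for cluster_map above: the random toy data and sizes are
# illustrative assumptions; it relies on the same module-level norm/kmeans
# imports the function itself uses.
import numpy as np

X = np.random.RandomState(0).rand(500, 16)  # hypothetical toy data
features, centers = cluster_map(X, k=10, itr=3)
print(features.shape, centers.shape)  # expected: (500, 10) sparse features, (10, 16) centers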
def clustering_creation(df_final, target_column, dataset):
    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]
    clusters = np.linspace(2, len(X.columns), 3, dtype=np.int64, endpoint=True)

    SSE = defaultdict(dict)
    ll = defaultdict(lambda: defaultdict(dict))
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    SS = defaultdict(lambda: defaultdict(dict))
    SSS = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)
        SSE[k][dataset] = km.score(X)
        ll[k][dataset]['AIC'] = gmm.aic(X)
        ll[k][dataset]['BIC'] = gmm.bic(X)
        SS[k][dataset]['Kmeans'] = cluster_silhouette_score(X, km.predict(X))
        SS[k][dataset]['GMM'] = cluster_silhouette_score(X, gmm.predict(X))
        SSS[k][dataset]['Kmeans'] = cluster_sample_silhouette_score(X, km.predict(X))
        SSS[k][dataset]['GMM'] = cluster_sample_silhouette_score(X, gmm.predict(X))
        acc[k][dataset]['Kmeans'] = cluster_acc(Y, km.predict(X))
        acc[k][dataset]['GMM'] = cluster_acc(Y, gmm.predict(X))
        adjMI[k][dataset]['Kmeans'] = ami(Y.squeeze(1), km.predict(X))
        adjMI[k][dataset]['GMM'] = ami(Y.squeeze(1), gmm.predict(X))
        print(k)

        cluster_labels_km = km.predict(X)
        cluster_labels_gm = gmm.predict(X)
        plot_silhouette_score(X, SS, SSS, k, dataset, cluster_labels_km, cluster_labels_gm)

    plot_cluster_accuracy(dataset, acc, clusters)
    plot_cluster_information(dataset, adjMI, clusters)
    KMeans_ELBOW(dataset, SSE, clusters)
    BICandAIC(dataset, ll, clusters)
# Tail of an SVD-reduction helper whose definition is truncated in this snippet:
#     tmp = None
#     print("cool")
#     return X_reduce, u, v_T

print("Reading data")
user_dict, movie_dict, data = readData()
cut1, cut2 = int(len(data) * 0.6), int(len(data) * 0.8)
train, validate, test = data[:cut1, :], data[cut1:cut2, :], data[cut2:, :]
num_user, num_movie = len(user_dict), len(movie_dict)
w, l = int(num_user / 2), int(num_movie / 2)
print(w, l)

print("kmeans user")
train_matrix = buildMatrix(train, user_dict, movie_dict, num_user, num_movie)
user_clf = kmeans(n_clusters=w, n_jobs=4)
user_clf.fit(train_matrix)
new_rate = []
for centroid in user_clf.cluster_centers_:
    new_rate.append(centroid[1])
new_rate = np.array(new_rate)

print("kmeans item")
item_clf = kmeans(n_clusters=l, n_jobs=4)
item_clf.fit(new_rate.transpose())
final_rate = []
for centroid in item_clf.cluster_centers_:
    final_rate.append(centroid[1])
train_reduce = np.array(final_rate).transpose()
print("save train")
import logging

import biggus
import h5py
from itertools import *
from sklearn.cluster import MiniBatchKMeans as kmeans

# https://github.com/bmcfee/ml_scraps/blob/master/BufferedEstimator.py
from BufferedEstimator import BufferedEstimator

wordvec = biggus.NumpyArrayAdapter(h5py.File('model/wordvec.hdf5')['wordvec'])


def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)


clusters = kmeans(n_clusters=500, max_iter=10, random_state=0)


def generator(X):
    # stream the word vectors one at a time so the whole array never has to fit in memory
    for i, x in enumerate(X):
        if i % 10000 == 0:
            logging.info('Throwing seq #{0}'.format(i))
        yield x.ndarray()


buf_est = BufferedEstimator(clusters, batch_size=1000)
buf_est.fit(generator(wordvec))
# Body of an SVD-reduction helper whose definition is truncated in this snippet:
#     if not u and not v_T:
#         u, s, v_T = svd(X)
#     X_reduce = ((u[:, :w].transpose()).dot(X)).dot(v_T[:l, :].transpose())
#     return X_reduce, u, v_T

print("Reading data")
user_dict, movie_dict, data = readData()
cut1, cut2 = int(len(data) * 0.6), int(len(data) * 0.8)
train, validate, test = data[:cut1, :], data[cut1:cut2, :], data[cut2:, :]
num_user, num_movie = len(user_dict), len(movie_dict)
w, l = int(num_user / 2), int(num_movie / 2)

print("train kmeans classifier")
# one (cluster_id, movie_id, rating) row per training example; a flat list built
# as [0, 0, 0.0] * len(train) is not indexable per-row, so use an array instead
train_reduce = np.zeros((len(train), 3))
user_clf = kmeans(n_clusters=w, max_iter=30, n_jobs=4)

print("train user")
train_category = user_clf.fit_predict(train[:, [0, 2]])
for i in range(len(train_category)):
    train_reduce[i][0] = train_category[i]
    train_reduce[i][1] = train[i, 1]
    train_reduce[i][2] = user_clf.cluster_centers_[train_category[i], 1]

print("train product")
product_clf = kmeans(n_clusters=l, max_iter=30, n_jobs=4)
train_category = product_clf.fit_predict(train_reduce[:, [1, 2]])
for i in range(len(train_category)):
    train_reduce[i][1] = train_category[i]
    train_reduce[i][2] = product_clf.cluster_centers_[train_category[i], 1]

print("validate_reduction")
validate_reduce = np.zeros((len(validate), 3))