def use_k_means(x_pca, y):
    """Run k-means on the PCA-reduced MNIST data, score it and visualise the clusters."""
    k = 10
    y_pred, centroids_pred = k_mean(k, x_pca)
    score = get_score(y, y_pred)
    print(score)
    title = 'k-means-mnist-score-' + str(score)
    k_show(x_pca, title, y_pred, centroids_pred)
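# A minimal sketch of what the `k_mean(k, x)` helper used above might look like
# (an assumption, not the original implementation): plain Lloyd's iterations that
# return predicted labels and final centroids, matching the
# `y_pred, centroids_pred` unpacking in use_k_means.
import numpy as np

def k_mean_sketch(k, x, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # Initialise centroids with k distinct points from the data
    centroids = x[rng.choice(len(x), k, replace=False)].astype(float)
    for _ in range(n_iter):
        # Assign each point to its nearest centroid (squared Euclidean distance)
        labels = np.argmin(((x[:, None, :] - centroids[None, :, :]) ** 2).sum(-1), axis=1)
        # Move each centroid to the mean of its assigned points
        for j in range(k):
            if np.any(labels == j):
                centroids[j] = x[labels == j].mean(axis=0)
    return labels, centroids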
def k_mean_validation(K):
    D = 2
    B = 10000
    KM = km.k_mean("data2D.npy")
    # Required arguments: number of clusters, dimension of points, number of points
    _, segment_ids, X_data, mu = KM.cluster(K, D, B, 1.0 / 3.0)
    # Take the validation set as input to compute the loss from the cluster centers
    loss, _ = KM.cal_loss(KM.validation.astype(np.float32), mu, D)
    with tf.Session():
        print("K =", K, ":", loss.eval())
def k_comparison(K):
    D = 2
    B = 10000
    KM = km.k_mean("data2D.npy")
    _, segment_ids, X_data, mu = KM.cluster(K, D, B)
    # Count how many points fall into each cluster
    data = tf.ones(shape=[B])
    division = tf.unsorted_segment_sum(data, segment_ids, K)
    with tf.Session():
        # Fraction of points assigned to each cluster
        print("K =", K, ":", division.eval() / B)
    plot.plot_cluster(segment_ids, X_data, mu, K)
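# A hedged usage sketch for the two helpers above: sweep K over a small range and
# print, for each value, the validation loss and the fraction of points per cluster.
# It assumes the `km.k_mean` class and the "data2D.npy" file referenced above are available.
for K in range(1, 6):
    k_mean_validation(K)
    k_comparison(K)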
def cluster_articles(supervised=False, target_dimension=10, sentence=False, path_result="result.csv"):
    """Cluster the articles with a k-means algorithm, write the results to a csv
    file and return the clusters together with the reduced dataframe."""
    chemin = "preprocessed_df" + sentence * "_sentence" + ".csv"
    try:
        # Load a preprocessed df
        df = pd.read_csv(chemin, index_col=0)
    except IOError:
        # Build the preprocessed df if it is missing
        print("Missing preprocessed file")
        df = create_df("df_brown.csv", sentence=sentence)
        df.to_csv(chemin)
    if supervised:
        real_clusters = df["real_cluster"]
        df = df.drop("real_cluster", axis=1)
    df = df.drop("text", axis=1)
    if PCA_val:
        # Reduce the dimension with a PCA
        pca = PCA(n_components=target_dimension, svd_solver='full')
        print("Fit and transform...")
        pca.fit(df)
        print("Transform...")
        df_reduced = pd.DataFrame(pca.transform(df))
    if MDS_val:
        # Reduce the dimension with an MDS
        mds = manifold.MDS(target_dimension, max_iter=100, n_init=1)
        df_reduced = pd.DataFrame(mds.fit_transform(df))
    # Delete the initial df
    del df
    # The score at each iteration of the algorithm
    scores = []
    model = None
    centroids = None
    for i in range(N):
        if centroids:
            # Centroids already initialized
            clusters, cost, centroids = k_mean(df_reduced, number_clusters, centroids=centroids)
            labels = [value for (key, value) in sorted(clusters.items())]
            score = silhouette_score(df_reduced, labels, metric='cosine')
        if not centroids:
            best_score = 0
            # First iteration: run the algorithm multiple times and keep the best centroids
            for _ in range(num_first_tests):
                clusters, cost, centroids = k_mean(df_reduced, number_clusters)
                labels = [value for (key, value) in sorted(clusters.items())]
                score = silhouette_score(df_reduced, labels, metric='cosine')
                if score > best_score:
                    # Best run so far, save the configuration
                    best_centroids = centroids
                    best_score = score
                    best_labels = labels
                    best_clusters = clusters
            centroids = best_centroids
            score = best_score
            labels = best_labels
            clusters = best_clusters
        scores += [score]
        # Pairwise distances between all points
        distances = pairwise_distances(df_reduced, metric='cosine')
        # Silhouette score for each point, computed from the precomputed distance matrix
        sil_samples = silhouette_samples(distances, labels, metric='precomputed')
        # Find the misclassified elements
        min_elements = choose_min_elements(sil_samples)
        # Load the initial dataset and split it into training and testing sets
        df_brown = pd.read_csv("df_brown.csv", index_col=0)
        df_test_brown = df_brown.iloc[min_elements]
        df_train_brown = df_brown.drop(df_brown.index[min_elements])
        labels_as_dict = {}
        for a in range(len(labels)):
            if a not in min_elements:
                labels_as_dict[a] = labels[a]
        train_Y = pd.DataFrame(list(labels_as_dict.values()))
        labels = pd.get_dummies(train_Y[0])
        # Number of clusters in the df
        s = labels.shape[1]
        # Correction when a cluster has been emptied by the algorithm
        if s < number_clusters:
            headers = list(labels)
            size_train = df_train_brown.shape[0]
            for i in range(number_clusters):
                if i not in headers:
                    labels[i] = np.array([0] * size_train)
        train_Y.to_csv("train_y_nn.csv")
        df_train_brown.to_csv("train_nn.csv")
        df_test_brown.to_csv("test_nn.csv")
        # The neural network returns the predictions, the new vectors and the model
        predictions, new_vectors, model = predict_neural_network(
            df_train_brown, labels, df_test_brown, target_dimension, number_clusters, model)
        # Update the vectors in the dataframe
        for j in range(len(min_elements)):
            index = min_elements[j]
            vector = new_vectors[j]
            for k in range(len(vector)):
                df_reduced[k][index] = vector[k]
    # Write the results to a csv file
    predicted_clusters = [value for (key, value) in sorted(clusters.items())]
    df_brown["pred_cluster"] = predicted_clusters
    df_brown.to_csv(path_result)
    # Uncomment to plot the silhouette over the iterations
    """
    plt.xlabel("Number of clusters")
    plt.ylabel("Silhouette")
    plt.title("Evolution of the silhouette after " + str(N) + " iterations")
    plt.plot(range(N), scores)
    plt.show()
    """
    return clusters, df_reduced
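# A minimal usage sketch for cluster_articles, assuming the module-level settings it
# relies on (PCA_val, MDS_val, N, number_clusters, num_first_tests) are configured elsewhere.
clusters, df_reduced = cluster_articles(supervised=False, target_dimension=10,
                                        sentence=False, path_result="result.csv")
print("Found", len(set(clusters.values())), "clusters")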
def main():
    data_input = input_pipeline()
    print("input pipeline ready")
    net = k_mean()
    _solver = solver(net, data_input, './log')
    _solver.train_and_test()
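# Hypothetical entry point (an assumption): run the training driver when the
# module is executed directly.
if __name__ == '__main__':
    main()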
i_time = config.getint('sys', 'i_time')
# Record the expected total run count and the current process id
with open(complete_file_path, 'w') as f:
    f.write("0," + str(i_time * 11))
with open(os.path.join(file_path, "pid.txt"), 'w') as f:
    f.write(str(os.getpid()))

from knn import knn
from ada_boost import ada_boost
from random_forest import random_forest
from logistic import logistic
from svm import svm
from decision_tree import c4_5, cart
from k_mean import k_mean
from xgboost_clf import xgboost
from gbdt_clf import gbdt
from net import net

print('knn:', knn(i_time=i_time))
print('AdaBoost:', ada_boost(i_time=i_time))
print('random forest:', random_forest(i_time=i_time))
print('logistic regression:', logistic(i_time=i_time))
print('C4.5:', c4_5(i_time=i_time))
print('cart:', cart(i_time=i_time))
print('k_mean:', k_mean(i_time=i_time))
print('xgboost:', xgboost(i_time=i_time))
print('gbdt:', gbdt(i_time=i_time))
print('SVM:', svm(i_time=i_time))
print('net:', net(i_time=i_time))
ratio = random.random()
raw_cubs.append(
    Lion(selected_male.x * ratio + selected_female.x * (1 - ratio)))
raw_cubs.append(
    Lion(selected_female.x * ratio + selected_male.x * (1 - ratio)))
# Select a few cubs to mutate
selected_mutation_cubs = random.sample(
    raw_cubs, int(len(raw_cubs) * mutation_ratio))
for i in selected_mutation_cubs:
    i.mutation(maxvalue=y, minvalue=x)
# Split the cubs into two genders with k_mean
common_gender_cubs = k_mean.k_mean(raw_cubs, 2, 10)
boy_cubs = common_gender_cubs[0]
girl_cubs = common_gender_cubs[1]
# Remove the surplus weaker cubs
diff = len(boy_cubs) - len(girl_cubs)
if diff > 0:
    for i in range(diff):
        boy_cubs.remove(min(boy_cubs, key=attrgetter('fitness')))
else:
    for i in range(-diff):
        girl_cubs.remove(min(girl_cubs, key=attrgetter('fitness')))
# Territorial defense
for meow in range(generate_nomad_time):
    # Generate a nomad lion
    nomads.append(Lion(random.uniform(x, y)))
    return dis


def cost(m, data):
    # Total distance of all points in `data` to the centre `m`
    total = 0
    for i in range(len(data)):
        dist = euclidean_distance(m, data[i])
        total = total + dist
    return total


# Bisecting k-means: repeatedly split the cluster with the largest cost in two,
# until kn clusters have been formed
kn = 4
cluster = [[] for x in range(kn)]
mean = [[] for x in range(kn)]
i = 0
k = 0
cluster[k], cluster[k + 1], mean[k], mean[k + 1] = k_mean(my_data)
while True:
    cost_dist = []
    # Evaluate the cost of every existing cluster (k + 2 of them after k splits)
    for i in range(k + 2):
        cost1 = cost(mean[i], cluster[i])
        cost_dist.append(cost1)
    # Split the cluster with the highest cost into two
    a = cost_dist.index(max(cost_dist))
    cluster[a], cluster[k + 2], mean[a], mean[k + 2] = k_mean(cluster[a])
    k = k + 1
    if kn == k + 2:
        break
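# A minimal sketch of the two-way `k_mean(data)` split the loop above relies on
# (an assumption, not the original code), for points given as coordinate sequences:
# it returns two clusters and their two means, matching the
# `cluster[a], cluster[k + 2], mean[a], mean[k + 2]` unpacking.
import random

def k_mean_two_way(data, n_iter=50):
    # Start from two distinct points as initial means
    m1, m2 = random.sample(list(data), 2)
    for _ in range(n_iter):
        # Assign each point to the nearer of the two means
        c1 = [p for p in data if euclidean_distance(m1, p) <= euclidean_distance(m2, p)]
        c2 = [p for p in data if euclidean_distance(m1, p) > euclidean_distance(m2, p)]
        # Recompute each mean coordinate-wise over its cluster
        if c1:
            m1 = [sum(col) / len(c1) for col in zip(*c1)]
        if c2:
            m2 = [sum(col) / len(c2) for col in zip(*c2)]
    return c1, c2, m1, m2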