def main():
    # reads files <trainSet> <testSet> given as command line arguments
    trainSet, testSet, stopwords_bn = read_files()

    # True = binary sentiment classification / False = six-way topic classification (multi-class labels)
    use_sentiment = False

    # divides the files into tokenized documents and class labels (used by the SVM)
    trainDoc, trainClass = modify_corpus(trainSet, use_sentiment)
    testDoc, testClass = modify_corpus(testSet, use_sentiment)

    # show the distribution of classes in the training and testing sets
    distribution(trainClass, testClass)

    # run the best of the three models (answer Y below to see the output of every model)
    print("\n\nRunning the best model - non-linear SVM:")
    run_best_model(trainDoc, trainClass, testDoc, testClass, stopwords_bn, use_sentiment)

    # run both linear and non-linear classifiers with different C/gamma values
    print("\n\nDo you want to see the output of all variants of the SVM classifier?")
    c = str(input("[Y/N]:"))
    if c == 'Y' or c == 'y':
        # run all three classifiers
        run_all_classifiers(trainDoc, trainClass, testDoc, testClass, stopwords_bn, use_sentiment)

    # run K-Means clustering
    print("\n\nDo you want to run K-Means clustering?")
    c = str(input("[Y/N]:"))
    if c == 'Y' or c == 'y':
        # K-Means clustering on all documents/labels; use_sentiment = False (six-way topic classification)
        K_Means.run_all_classifiers(trainDoc + testDoc, trainClass + testClass, stopwords_bn, use_sentiment)

    # run the Multinomial Naive Bayes classifier
    print("\n\nDo you want to run the Multinomial Naive Bayes classifier?")
    c = str(input("[Y/N]:"))
    if c == 'Y' or c == 'y':
        # NB classifier; use_sentiment = False (six-way topic classification)
        Naive_Bayes.run_all_classifiers(trainDoc, trainClass, testDoc, testClass, stopwords_bn, use_sentiment)

    # run the Decision Tree / KNN classifiers
    print("\n\nDo you want to run the Decision Tree/KNN classifier?")
    c = str(input("[Y/N]:"))
    if c == 'Y' or c == 'y':
        # use_sentiment = False (six-way topic classification)
        DT_KNN.run_all_classifiers(trainDoc, trainClass, testDoc, testClass, stopwords_bn, use_sentiment)
def start(self, data):
    print("from html req:", data)
    result = ""
    if len(data) <= 1:
        result = "<p>Error</p>"
    else:
        name = data.get("name", "")
        if name == "expansion_data":
            result = smote_expansion_data.expansion_data()
        elif name == "smote_compare":
            result = smote_compare.draw_pic()
        elif name == "DBSCAN":
            result = DBSCAN.deal(data.get("all_file", ""))
        elif name == const.DECISION or name == const.RANDOM_FOREST:
            result = decision.deal(data)
        elif name == "XGBoost":
            result = XGBoost.deal_real(data)
        elif name == "KMeans":
            result = K_Means.deal(data["all_file"])
    self.send_response(200)
    self.send_header("Content-type", "text/html;charset=UTF-8")
    self.send_header("Access-Control-Allow-Origin", "*")
    self.end_headers()
    print("send to html response: {}".format(result))
    self.wfile.write(result.encode())
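# A minimal sketch of how a handler like start() might be wired up. The
# original framework is not shown, so everything here is an assumption: the
# standard-library http.server, the Handler class name, GET query parsing,
# and the port. The stub start() below only echoes; the real start() method
# above would replace it inside the handler class.
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import urlparse, parse_qs

class Handler(BaseHTTPRequestHandler):
    def start(self, data):
        # stub; paste the start(self, data) method shown above here
        self.send_response(200)
        self.send_header("Content-type", "text/html;charset=UTF-8")
        self.end_headers()
        self.wfile.write("<p>{}</p>".format(data).encode())

    def do_GET(self):
        # flatten ?name=KMeans&all_file=... query parameters into a dict
        query = {k: v[0] for k, v in parse_qs(urlparse(self.path).query).items()}
        self.start(query)

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8000), Handler).serve_forever()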
def total():
    check_dic()
    csv_dealer.csv_deal_to_file('./data/training_csv/', no_total=1, to_addr='./data/training_csv_cut')
    csv_dealer.csv_deal_to_file('./data/test_csv/', no_total=1, to_addr='./data/test_csv_cut')
    csv_dealer.csv_deal_to_file('./data/final_test/', no_total=1, to_addr='./data/final_test_cut')
    mat_list = nmf_sklearn.generate_new_mat_list('./data/training_csv')
    total_mat = csv_dealer.mat_list_to_total(mat_list)
    np.savetxt("./data/total.csv", total_mat, delimiter=',', fmt="%f")
    w, h = nmf_sklearn.nmf_sklearn(3, total_mat)
    K_Means.k_means(w, 250, new_file_addr='./data/class_list.txt')
    probability.run()
    get_Sit.run()
    cal_f1.run()
def clustering_and_scheduling(layout, channel_losses_mat, n_links_on, clustering_method):
    N = np.shape(layout)[0]
    assert np.shape(layout) == (N, 4)
    assert np.shape(channel_losses_mat) == (N, N)
    assert 1 <= n_links_on <= N
    if clustering_method == "Spectral Clustering":
        clusters_one_layout = Spectral_Clustering.clustering(layout, channel_losses_mat, n_links_on)
        adj_mat = Spectral_Clustering.construct_adj_mat(channel_losses_mat)
        allocs_one_layout = adjacency_mat_based_scheduling(adj_mat, clusters_one_layout)
    elif clustering_method == "Hierarchical Clustering":
        clusters_one_layout = Hierarchical_Clustering.clustering(layout, channel_losses_mat, n_links_on)
        adj_mat = Spectral_Clustering.construct_adj_mat(channel_losses_mat)
        allocs_one_layout = adjacency_mat_based_scheduling(adj_mat, clusters_one_layout)
    elif clustering_method == "K-Means":
        clusters_one_layout, centroids_one_layout = K_Means.clustering(layout, n_links_on)
        allocs_one_layout = GLI_based_scheduling(layout, centroids_one_layout, clusters_one_layout)
    elif clustering_method == "Hierarchical Clustering EqualSize":
        clusters_one_layout = Hierarchical_Clustering_EqualSize.clustering(layout, channel_losses_mat, n_links_on)
        adj_mat = Spectral_Clustering.construct_adj_mat(channel_losses_mat)
        allocs_one_layout = adjacency_mat_based_scheduling(adj_mat, clusters_one_layout)
    elif clustering_method == "K-Means EqualSize":
        clusters_one_layout, centroids_one_layout = K_Means_EqualSize.clustering(layout, n_links_on)
        allocs_one_layout = GLI_based_scheduling(layout, centroids_one_layout, clusters_one_layout)
    else:
        print("Invalid clustering method name: {}!".format(clustering_method))
        exit(1)
    return clusters_one_layout, allocs_one_layout
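# A minimal usage sketch for clustering_and_scheduling, assuming the project
# modules it calls (K_Means, GLI_based_scheduling, etc.) are in scope. The
# random layout and channel-loss matrix are placeholder values shaped to
# satisfy the asserts above, not data from the original project.
import numpy as np

np.random.seed(0)
N = 10
layout = np.random.uniform(0, 100, size=(N, 4))  # N x 4 link layout, as asserted above
channel_losses_mat = np.random.rand(N, N)        # N x N loss matrix, as asserted above
clusters, allocs = clustering_and_scheduling(layout, channel_losses_mat,
                                             n_links_on=3,
                                             clustering_method="K-Means")
print(clusters, allocs)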
def apply_algorithm(self, alg_name, training_vectors, test_vectors_clean, test_vectors_anomalous):
    """Applies the specified algorithm to the feature sets."""
    if alg_name == 'lof':
        result_clean, result_anomalous, result_training = LOF.local_outlier_detection(
            training_vectors, test_vectors_clean, test_vectors_anomalous)
    elif alg_name == 'svm':
        result_clean, result_anomalous, result_training = SVM.one_class_svm(
            training_vectors, test_vectors_clean, test_vectors_anomalous)
    elif alg_name == 'dbscan':
        result_clean, result_anomalous, result_training = DBSCAN.dbscan(
            training_vectors, test_vectors_clean, test_vectors_anomalous)
    elif alg_name == 'kmeans':
        result_clean, result_anomalous, result_training = K_Means.k_means(
            training_vectors, test_vectors_clean, test_vectors_anomalous)
    else:
        raise NameError("Invalid algorithm name")
    return result_clean, result_anomalous, result_training
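# A minimal usage sketch for apply_algorithm, assuming a hypothetical owner
# class `Detector` exposing the method above, with random feature vectors as
# stand-ins for the real feature sets; the shapes and the 'kmeans' choice are
# illustrative assumptions only.
import numpy as np

detector = Detector()  # hypothetical class containing apply_algorithm
training_vectors = np.random.rand(100, 8)       # training feature vectors
test_vectors_clean = np.random.rand(20, 8)      # clean test vectors
test_vectors_anomalous = np.random.rand(20, 8)  # anomalous test vectors
result_clean, result_anomalous, result_training = detector.apply_algorithm(
    'kmeans', training_vectors, test_vectors_clean, test_vectors_anomalous)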
output.close()
print('Parameters saved to ' + filenames['params'] + '.pkl')

#########################################################
print('Initiating K-Means...')
k_results = list()
for i in range(numRounds):
    start = datetime.now()
    iGenerations = qk_rounds_genNum[i]
    numInits = int(iGenerations * numOracles * initsPercentage)
    k_centroids, k_assignment, k_timings, k_inertia = K_Means.k_means(mixture, numClusters, numInits)
    k_results.append([k_centroids, k_assignment, k_timings, k_inertia])
    round_time = (datetime.now() - start).total_seconds()
    print(float(i + 1) * 100 / numRounds, '%\t', 'round ', i, ':', round_time,
          's - estimated:', (float(numRounds - 1) - i) * round_time, 's / ',
          (float(numRounds - 1) - i) * round_time / 60, 'm')

print('Preparing K-Means data structures...')
k_rounds = dict()
k_rounds['centroids'] = list()
k_rounds['assignment'] = list()
k_rounds['times'] = list()
k_rounds['inertia'] = list()
for i in range(numRounds):
    k_rounds['centroids'].append(k_results[i][0])
def gaussian_mixture(data, Km, init_method, epsilon, niterations, plotflag, r, RSEED=123):
    '''
    Fits a Gaussian mixture model via EM with K mixture components, for each
    K in 1..Km-1, and scores each fit with a BIC-penalized log-likelihood.
    The data are contained in the N x d "data" matrix.

    INPUTS
      data: N x d real-valued data matrix
      Km: exclusive upper bound on the number of clusters (mixture components)
      init_method: 0 to initialize memberships, 1 to initialize parameters,
                   any other value to initialize from k-means
      epsilon: convergence threshold on the change in penalized log-likelihood
      niterations: maximum number of EM iterations to perform
      plotflag: 1 to plot parameters during learning, 0 for no plotting
      r, RSEED (optional): seed values for the random number generator

    OUTPUTS
      llBIC: list of BIC-penalized log-likelihoods, one per value of K
      Ks: list of the corresponding K values

    Interpretation of the internal quantities:
      - z[k] is the probability that a randomly selected row belongs to
        component (cluster) k, i.e. the "cluster size".
      - compMat[i][k] = p(cluster k | x_i), the membership probability
        (computed via Bayes' rule) that vector x_i was generated by cluster k,
        according to the generative probabilistic model.
    '''
    N = len(data)
    llBIC = []
    Ks = []
    for K in range(1, Km):
        mu = [np.array([0., 0.]) for j in range(K)]
        sigma = [np.array([[0., 0.], [0., 0.]]) for j in range(K)]
        z = [1 / float(K) for i in range(K)]
        eFirst = False
        mFirst = False
        prevLikelihood = 0
        compMat = list()
        # initialize memberships, parameters, or k-means centroids
        if init_method == 0:
            compMat = randomWeights(N, K)
            mFirst = True
        elif init_method == 1:
            mu, sigma = randomParams(len(data[0]), K)
            eFirst = True
        else:
            clusters = K_Means.kMeans(data, K, 1)
            mu = [clusters[j].mean for j in range(K)]
            sigma = [np.eye(len(clusters[0].mean)) for i in range(K)]
            eFirst = True
        done = False
        iterations = 0
        #plt.ion()
        iter = []
        ll = []
        while not done:
            iterations += 1
            # perform E-step: recompute membership probabilities via Bayes' rule
            if eFirst:
                list1 = [[] for j in range(K)]
                compMat = list()
                for d in data:
                    x = [gaussian(d, mu[k], sigma[k]) * z[k] for k in range(K)]
                    x = [j / sum(x) for j in x]
                    compMat.append(x)
                    list1[np.array(x).argmax()].append(d)
                compMat = np.array(compMat)
                #plotCluster(list1, mu, sigma)
                mFirst = True
            # perform M-step: re-estimate weights, means, and covariances
            if mFirst:
                z = [sum(compMat[:, k]) / float(len(data)) for k in range(K)]
                mu = [np.array([0., 0.]) for j in range(K)]
                for k in range(K):
                    for j in range(len(data)):
                        mu[k] += compMat[j][k] * data[j]
                    mu[k] /= float(z[k] * len(data))
                sigma = [np.array([[0., 0.], [0., 0.]]) for j in range(K)]
                for k in range(K):
                    p = z[k] * len(data)
                    for j in range(len(data)):
                        c = data[j] - mu[k]
                        sigma[k] += compMat[j][k] * np.outer(c, c)
                    sigma[k] /= p
                eFirst = True
            if iterations > niterations:
                done = True
            print("Iteration....", iterations)

            # compute the BIC-penalized log-likelihood and check for convergence
            final_sum = 0
            for d in data:
                log_likelihood = 0
                for j in range(K):
                    log_likelihood += z[j] * gaussian(d, mu[j], sigma[j])
                final_sum += np.log(log_likelihood)
            final_sum -= (3 * K * np.log(len(data))) / 2  # BIC penalty term
            ll.append(final_sum)
            if abs(prevLikelihood - final_sum) < epsilon:
                print("Converged after...", iterations)
                done = True
            prevLikelihood = final_sum
            iter.append(iterations)
        llBIC.append(final_sum)
        Ks.append(K)
    #plt.ioff()
    #plt.show()
    return llBIC, Ks
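# A minimal model-selection sketch for gaussian_mixture on synthetic 2-D blobs.
# It assumes the helpers the function calls (gaussian, randomWeights,
# randomParams, and K_Means.kMeans for init_method != 0, 1) are importable from
# the surrounding project; the blob data and parameter values are made up.
import numpy as np
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=300, centers=3, n_features=2, random_state=123)
llBIC, Ks = gaussian_mixture(data, Km=6, init_method=2, epsilon=1e-4,
                             niterations=500, plotflag=0, r=0)
best_K = Ks[int(np.argmax(llBIC))]  # K with the highest BIC-penalized log-likelihood
print("Best K by BIC:", best_K)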
from sklearn import datasets

import K_Means
import Plot

blobs = datasets.make_blobs()[0]
centers, clusters, colors, color_labels = K_Means.k_means(blobs, 3)
Plot.plot(blobs, centers, colors, color_labels)
Investigated the performance of your neural network for different sizes of hidden layer."""

import random

import MNIST_Loader
import RBFNetwork
import K_Means

# MNIST_Loader.load_data() returns the 70,000 MNIST examples split into
# training and testing lists of (image, label) tuples:
#   image: the input data as a 784x1 vector of normalized greyscale values
#   label: the target data as a 10x1 vector with a 1 at the index of the target value
training_dataset, testing_dataset = MNIST_Loader.load_data()

num_centroids = 16
num_iterations = 100

random.shuffle(training_dataset)

print("K_Means clustering in progress...")
final_centroids, final_clusters = K_Means.k_means_clustering(
    num_centroids, training_dataset[:1000].copy(), num_iterations)
print("Number of centroids:", len(final_centroids))

# hyper-parameters
learning_rate = 0.1
epochs = 10
num_folds = 5

scores = RBFNetwork.k_fold_cross_validation(training_dataset[:1000], num_folds,
                                            learning_rate, epochs,
                                            final_centroids, final_clusters)
print('\nMean Classification Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))

print("\nTesting with MNIST testing dataset")
# initialize and train our network
n = int(n)
total_bytes = float((n * d + k * d + n * k) * 4)
print('Memory used by arrays:\t', total_bytes / 1024, '\tKBytes')
print('\t\t\t', total_bytes / (1024 * 1024), '\tMBytes')
print('Memory used by data: \t', n * d * 4 / 1024, '\t', 'KBytes')

## Generate data
#data = np.random.random((n, d)).astype(np.float32)
data, groundTruth = datasets.make_blobs(n_samples=n, n_features=d, centers=k,
                                        center_box=(-1000.0, 1000.0))
data = data.astype(np.float32)

grouper = K_Means(N=n, D=d, K=k)
centroids = grouper._init_centroids(data)

times = dict()

# Distance matrix
start = timer()
dist_mat_np = grouper._np_calc_dists(data, centroids)
times["dist_mat np"] = timer() - start

start = timer()
dist_mat_cu_auto = grouper._cu_calc_dists(data, centroids, gridDim=None,
                                          blockDim=None, memManage='auto',
                                          keepDataRef=False)
times["dist_mat cuda manual"] = timer() - start
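# A small follow-up check, not in the original script: compare the CUDA
# distance matrix against the NumPy reference and report the timings collected
# in `times`. The relative tolerance is an assumption for float32 arithmetic.
assert np.allclose(dist_mat_np, dist_mat_cu_auto, rtol=1e-4), \
    "CUDA distance matrix diverges from the NumPy reference"
for name, t in times.items():
    print('{}:\t{:.6f} s'.format(name, t))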
import xlrd as xd
import numpy as np
from sklearn.cluster import AgglomerativeClustering

import K_Means

# K-means clustering algorithm
# load the data
print('K-means clustering')
data = xd.open_workbook('data.xls')
table = data.sheets()[0]
height = table.col_values(3)[1:]
weight = table.col_values(4)[1:]
dataSet = np.vstack((height, weight)).T

# clustering
k = 2
centroids, clusterAssment = K_Means.kmeans(dataSet, k)

# visualization
K_Means.showCluster(dataSet, k, centroids, clusterAssment)

# hierarchical clustering algorithm
# set up the hierarchical clustering model
print('hierarchical clustering')
linkages = ['ward', 'average', 'complete']
n_clusters_ = 2
ac = AgglomerativeClustering(linkage=linkages[2], n_clusters=n_clusters_)
# fit the data
ac.fit(dataSet)
# cluster label of each sample
labels = ac.labels_
# visualization
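# The original snippet ends at the visualization comment. Below is a minimal
# matplotlib sketch (an assumption, not the original code) that scatters the
# height/weight points colored by their hierarchical cluster labels.
import matplotlib.pyplot as plt

plt.scatter(dataSet[:, 0], dataSet[:, 1], c=labels, cmap='viridis')
plt.xlabel('height')
plt.ylabel('weight')
plt.title('AgglomerativeClustering (complete linkage, k=2)')
plt.show()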
pickle.dump(params, output)
output.close()
print('Parameters saved to ' + filenames['params'] + '.pkl')

#########################################################
print('Initiating K-Means...')
print("Each round will have", numInits, "K-Means initializations.")
k_results = list()
for i in range(numRounds):
    start = datetime.now()
    k_centroids, k_assignment, k_timings, k_inertia = K_Means.k_means(
        mixture, numClusters, numInits)
    k_results.append([k_centroids, k_assignment, k_timings, k_inertia])
    round_time = (datetime.now() - start).total_seconds()
    print(float(i + 1) * 100 / numRounds, '%\t', 'round ', i, ':', round_time,
          's - estimated:', (float(numRounds - 1) - i) * round_time, 's / ',
          (float(numRounds - 1) - i) * round_time / 60, 'm')

print('Preparing K-Means data structures...')
k_rounds = dict()
k_rounds['centroids'] = list()
k_rounds['assignment'] = list()
k_rounds['times'] = list()
Y = CL.Moving_Mean(X, 5)      #[CL.aritmeticky_prumer_fce(X, x, 5) for x in range(len(X))]
Z = CL.Exp_Moving_Mean(X, 5)  #[CL.suma_zleva_fce(X, x, 5) for x in range(len(X))]
XX = np.vstack((Y, Z, X)).T
XX
# np.shape(XX)
# plt.scatter(Y, X)
# plt.show()
np.shape(Z)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Y, Z, X)
plt.show()

km.Nakresly(XX, 2, "3d")

clf = km.K_means(2)
clf.fit(XX)
predictions = []
for pred in XX:
    predictions.append(clf.Predict(pred))
print(predictions)

plt.plot(np.arange(len(X)), X, linewidth=0.5)
plt.scatter(np.arange(len(X)), X, c=predictions, cmap=plt.cm.plasma)
plt.show()

ZZ = np.load(way + "No2.npy")
ZZ[0]
plt.plot(ZZ[0], ZZ[1])
plt.show()