def main():
    f = "colours"
    length = 151
    r = 90
    c = 120
    nbins = 10
    colors = np.zeros((length, r, c, 3), dtype=np.uint8)
    grays = np.zeros((length, r, c), dtype=np.uint8)
    # Read the images and store them in color and grayscale formats
    for i in range(length):
        im = Image.open(f + "/dwc" + str(i + 1).zfill(3) + ".png").convert('RGB')
        colors[i] = np.asarray(im, dtype=np.uint8)
        grays[i] = np.asarray(im.convert('L'))
    # Save the color and grayscale images
    for i in range(length):
        out_col = Image.fromarray(colors[i])
        out_gray = Image.fromarray(grays[i])
        out_col.save("out/col_im/dwc_col" + str(i + 1).zfill(3) + ".png")
        out_gray.save("out/gray_im/dwc_gray" + str(i + 1).zfill(3) + ".png")
    # Draw histograms
    colorhs, grayhs = histograms(length, nbins, colors, grays)
    # Kmeans
    clustering(length, nbins, colorhs, grayhs, colors, grays)
    # Other Methods
    other(length, nbins, grayhs, grays)
def main_test():
    current_dir = os.path.dirname(os.path.abspath(__file__))
    GO_Term_path = current_dir + '/example/GO_Term.xlsx'
    outpath = current_dir + '/example/test_dealing.csv'
    batch_read_GO_Term(GO_Term_path=GO_Term_path, outputpath=outpath)
    Save_path = current_dir + '/example/test_duplicate_removal_1.csv'
    screen_Term_gene_1(GO_Term=outpath, Save_path=Save_path)
    Save_path_1 = current_dir + '/example/test_duplicate_removal.csv'
    screen_Term_gene(GO_Term=Save_path, Save_path=Save_path_1)
    GO_Term_gene_expression_path = current_dir + '/example/test_count_matrix.csv'
    outputpath1 = current_dir + '/example/Term_matrix'
    batch_read_GO_Term_matrix(
        GO_Term_gene_path=Save_path_1,
        GO_Term_gene_expression_path=GO_Term_gene_expression_path,
        outputpath=outputpath1)
    outputpath2 = current_dir + '/example/Term_matrix'
    Save_path_feature_matrix = current_dir + '/example/test_feature_matrix.csv'
    batch_exacting_feature_KPCA(read_path=outputpath2,
                                outputpath=Save_path_feature_matrix,
                                n_components=0.4)
    label_path = current_dir + '/example/test_label.csv'
    clustering(read_path=Save_path_feature_matrix, n_clusters=5, label_path=label_path)
def main():
    n_regions = 4
    # Load the region labels, or calculate and cache them if not yet saved
    try:
        with np.load('region_labels.npz') as r_labels:
            region_labels = r_labels['matrix']
    except:
        region_labels = region_calculation(n_regions=n_regions, show_silhouette=True)
        np.savez_compressed('region_labels.npz', matrix=region_labels)
    print('Fetching Data...')
    with np.load('av_model_data30.npz') as m:
        av_matrix = m['matrix']
    with open("datetimes.txt", "rb") as fp:
        dates = pickle.load(fp)
    print('Finished Fetching Data')
    # Clustering parameters
    mode = 'kmeans'
    n_clusters = [2, 3, 4]  # Comparing results with 2, 3, and 4 temporal clusters
    data = []
    s_avg = []
    # Keeping relevant dates
    new_d = []
    for i in range(int(len(dates) / 30.4325)):
        new_d.append(dates[int(i * 30.4325)])
    # Clustering with regional average by chemical
    for i in range(4):
        for n in n_clusters:
            data.append(average_by_region(matrix=av_matrix, chemical=i,
                                          r_labels=region_labels, n_regions=n_regions))
            # Clustering
            if mode == 'kmeans':
                clustered_data = clustering(data=data[i], n_clusters=n,
                                            mode='kmeans', verbose=False)
            elif mode == 'hierarchical':
                clustered_data = clustering(data=data[i], n_clusters=n,
                                            mode='hierarchical', verbose=False)
            print("The " + str(n) + " cluster sizes are:")
            cluster_sizes = [len(list(compress(data[i], clustered_data.labels_ == cluster)))
                             for cluster in range(n)]
            print(cluster_sizes)
            s_avg.append(silhouette_plot(labels=clustered_data.labels_, data=data[i],
                                         plotGraph=False, n_clusters=n))
            # Two different ways of visualizing the results
            timeseries_plot(data=clustered_data.labels_, t=new_d)
            timeClustersVisualization(labels=clustered_data.labels_,
                                      data_points_per_year=12, n_clusters=n)
    print(s_avg)
def main():
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print("Wrong number of arguments, usage: {0} input_file [output_file] "
              "(\"output.txt\" by default)".format(sys.argv[0]))
        sys.exit()
    input_filename = sys.argv[1]
    if len(sys.argv) == 3:
        clustering.clustering(input_filename, sys.argv[2], console_out=True)
    else:
        clustering.clustering(input_filename, console_out=True)
def main(args):
    node_embeddings = load_embeddings(args.embedding_file)
    if args.label_file:
        labels = read_node_label(args.label_file)
    if args.modularity:
        print("Modularity")
        modularity(args, node_embeddings, args.min_k, args.max_k)
    if args.reconstruction:
        print("Graph reconstruction")
        reconstr(args, node_embeddings, args.k_nbrs)
    if args.clustering:
        print("Clustering")
        clustering(node_embeddings, labels, args.exp_times)
    if args.link_prediction:
        print("Link prediction")
        link_prediction(args.input, node_embeddings)
    if args.classification:
        X = list(labels.keys())
        Y = list(labels.values())
        print("Node classification")
        clf_ratio_list = args.clf_ratio.strip().split(',')
        result_list = {}
        train_ratio = np.asarray(range(1, 10)) * .1
        for clf_ratio in train_ratio:  # clf_ratio_list:
            result_per_test = []
            for ti in range(args.exp_times):
                clf = Classifier(vectors=node_embeddings, clf=LogisticRegression())
                myresult = clf.split_train_evaluate(X, Y, float(clf_ratio))
                result_per_test.append(myresult)
            result_list[clf_ratio] = result_per_test
        print('-------------------')
        for clf_ratio in train_ratio:
            print('Train percent:', clf_ratio)
            results = result_list[clf_ratio]
            for index, result in enumerate(results):
                print('Shuffle #%d: ' % (index + 1), result)
            avg_score = defaultdict(float)
            for score_dict in results:
                for metric, score in score_dict.items():
                    avg_score[metric] += score
            for metric in avg_score:
                avg_score[metric] /= len(results)
            print('Average score:', dict(avg_score))
            print('-------------------')
def figure_5_parameter_values(subjects=None, shape=None, fignum=105):
    """Paper figure.

    Plots, on the left, the histogram of alphas. On the right, the
    inferred shapes for all selected subjects.
    """
    if subjects is None:
        subjects = range(35)
    if shape is None:
        # shapes = [shape_pars[0] for shape_pars in invp.__rds__()]
        shape = 'exponential'
    fig = plt.figure(fignum, figsize=(9, 5))
    fig.clear()
    num_colors = 3
    cmap = figure_colors('lines_cmap')
    colors = [cmap(x) for x in np.linspace(0, 1, num_colors)]
    maax_alpha = plt.subplot2grid((2, 2), (0, 0), rowspan=1)
    maax_bias = plt.subplot2grid((2, 2), (0, 1), rowspan=1)
    maax_lnc = plt.subplot2grid((2, 2), (1, 0), rowspan=1)
    maax_clu = plt.subplot2grid((2, 2), (1, 1), rowspan=1)
    alpha_hist(subjects, maax=maax_alpha, shapes=[shape],
               color=figure_colors('histograms'), divisors='auto')
    bias_hist(subjects, maax=maax_bias, shapes=[shape],
              color=figure_colors('histograms'), divisors=np.linspace(0.6, 1.2, 7))
    if shape == 'unimodal_s':
        centroids, membership, _ = cl.clustering(
            subjects, k=3, clustering_type='kmeans', shape=shape)
        cluster_names = [[]] * 3
        cluster_names[centroids[:, 1].argmax()] = r'$\uparrow \sigma$'
        cluster_names[centroids[:, 0].argmax()] = r'$\uparrow \mu$'
        cluster_names[centroids.sum(axis=1).argmin()] = r'$\downarrow \mu$'
    elif shape == 'exponential':
        centroids, membership, distorsion = cl.clustering(
            subjects, k=2, clustering_type='kmeans', shape=shape)
        cluster_names = [[]] * 2
        cluster_names[centroids.argmax()] = '$highSTP$'
        cluster_names[centroids.argmin()] = '$lowSTP$'
    plot_cluster_shapes(subjects=subjects, shape=shape, maax=maax_lnc,
                        colors=colors, centroids=centroids, legends=cluster_names)
    scatter_kappas(subjects=subjects, shape=shape, maax=maax_clu,
                   membership=membership, centroids=centroids, colors=colors)
    maax_lnc.set_label('Label via method')
    maax_lnc.legend(loc='upper left')
    plt.tight_layout()
    plt.show(block=False)
def build_model(self, path):
    list_crawling = []
    if self.crawling:
        with open(os.path.join(os.getcwd(), "ind_url/urls.txt"), "r",
                  encoding="utf8", errors="ignore") as f:
            document = f.read()
        list_crawling = [item for item in document.split('\n')]
    self.build = True
    docs = glob.glob(os.path.join(path + "/**", "*.txt"), recursive=True)
    print(docs)
    if not self.lineEdit_select_directory.text() == '':
        clustering.clustering(path + "/", 4)
    for i in reversed(range(self.tableWidget_relevant.rowCount())):
        self.tableWidget_relevant.removeRow(i)
    for item in range(len(docs)):
        row_position = self.tableWidget_relevant.rowCount()
        qwidget = QWidget()
        checkbox = QCheckBox()
        checkbox.setChecked(False)
        qhboxlayout = QHBoxLayout(qwidget)
        qhboxlayout.addWidget(checkbox)
        # qhboxlayout.setAlignment(Qt.AlignCenter)
        qhboxlayout.setContentsMargins(0, 0, 0, 0)
        name = os.path.basename(docs[item])
        if self.crawling:
            name = list_crawling[item]
        self.tableWidget_relevant.insertRow(row_position)
        self.tableWidget_relevant.setItem(
            row_position, 0, QTableWidgetItem(clustering.all_label_from_cluster()))
        self.tableWidget_relevant.setItem(row_position, 1, QTableWidgetItem(name))
        self.tableWidget_relevant.setCellWidget(row_position, 2, qwidget)
    self.disable_buttons()
    json_value = json.dumps({'action': 'build', 'path': path})
    modelo.model(json_value)
    self.enable_buttons()
    self.indexing_label.setText("")
def perform_clustering(self):
    self.cluster_dialog.textEdit.setText('Clustering...')
    if self.cluster_dialog.checkbox_pca.isChecked():
        dim = int(self.cluster_dialog.line_pca.text())
        matrix = reduce_dimensionality(self.dicom_data.data_array, dim)
    else:
        matrix = self.dicom_data.data_array
    fr = int(self.cluster_dialog.fr.text())
    to = int(self.cluster_dialog.to.text())
    matrix[:, :, :fr, :] = 0
    matrix[:, :, to:, :] = 0
    coordinates = self.cluster_dialog.checkbox_coords.isChecked()
    clusters = int(self.cluster_dialog.line_clusters.text())
    if self.cluster_dialog.kmean.isChecked():
        function = MiniBatchKMeans
    elif self.cluster_dialog.agglomerative.isChecked():
        function = AgglomerativeClustering
    elif self.cluster_dialog.dbscan.isChecked():
        function = DBSCAN
    elif self.cluster_dialog.optics.isChecked():
        function = OPTICS
    if self.cluster_dialog.slicewise.isChecked():
        w = matrix.shape[0]
        h = matrix.shape[1]
        slices = matrix.shape[2]
        stacks = matrix.shape[3]
        z = self.slider_slice.value()
        if self.plane == 'tra':
            matrix = matrix[:, :, z, :].reshape([w, h, 1, stacks])
            self.dicom_data.cluster_array[:, :, z, 0] = clustering(
                matrix, function, clusters, coordinates)[:, :, 0, 0]
        if self.plane == 'cor':
            matrix = matrix[z, :, :, :].reshape([1, h, slices, stacks])
            self.dicom_data.cluster_array[z, :, :, 0] = clustering(
                matrix, function, clusters, coordinates)[0, :, :, 0]
        if self.plane == 'sag':
            matrix = matrix[:, z, :, :].reshape([w, 1, slices, stacks])
            self.dicom_data.cluster_array[:, z, :, 0] = clustering(
                matrix, function, clusters, coordinates)[:, 0, :, 0]
    else:
        self.dicom_data.cluster_array = clustering(matrix, function, clusters, coordinates)
    self.cluster_dialog.textEdit.setText('Clustering accomplished!')
    self.labeling_window.lineEdit.setText(str(clusters))
    self.set_view('segments')
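# (Added illustration, not part of the project above.) The four estimators passed as `function`
# all share scikit-learn's fit_predict interface on an (n_samples, n_features) array, which is
# presumably what lets the wrapped clustering() helper treat them interchangeably. A minimal,
# self-contained sketch of that assumption:
import numpy as np
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering, DBSCAN, OPTICS

X = np.random.RandomState(0).rand(60, 3)  # toy data standing in for the flattened voxel features
for estimator in (MiniBatchKMeans(n_clusters=3),
                  AgglomerativeClustering(n_clusters=3),
                  DBSCAN(eps=0.5),
                  OPTICS(min_samples=5)):
    labels = estimator.fit_predict(X)
    print(type(estimator).__name__, "->", np.unique(labels))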
def solution_test_main():
    filenames = ("solution_test1.txt", "solution_test2.txt", "solution_test3.txt")
    expected_partition1 = (('A', 'B', 'C'), ('D', 'E', 'F'))
    expected_partition2 = (('0', '1', '2', '3', '4', '5'),
                           ('6', '7', '8', '9', '10', '11', '12'),
                           ('13', '14', '15', '16'))
    expected_partition3 = (('0', '1', '2', '3'), ('4', '5', '6', '7'),
                           ('8', '9', '10', '11'), ('12', '13', '14', '15'))
    expected_partitions = (expected_partition1, expected_partition2, expected_partition3)
    for filename, expected_partition in zip(filenames, expected_partitions):
        clusters_partition = clustering.clustering(filename, "solution_test_output.txt")
        for cluster in expected_partition:
            number = clusters_partition[cluster[0]]
            for node in cluster:
                if clusters_partition[node] != number:
                    os.remove("solution_test_output.txt")
                    return False
        labels_clusters = []
        for cluster in expected_partition:
            if clusters_partition[cluster[0]] in labels_clusters:
                os.remove("solution_test_output.txt")
                return False
            else:
                labels_clusters.append(clusters_partition[cluster[0]])
    os.remove("solution_test_output.txt")
    return True
def use_case_kmeans(users_skills, clusters_ground_truth):
    print("Clustering")
    clustering_model, times = clustering(users_skills, range(*clustering_range), True)
    print("- Number of clusters found", len(clustering_model.cluster_centers_))
    print("- Real number of clusters", len(skills_sets))
    evaluate_clustering(clusters_ground_truth, clustering_model.labels_)
    if False:
        pca = PCA(n_components=2)
        # pca.fit(users_skills)
        new_data = pca.transform(users_skills)
        # pca.fit(clustering_model.cluster_centers_)
        new_data2 = pca.transform(clustering_model.cluster_centers_)
        c = np.concatenate((clustering_model.labels_,
                            np.array([6] * len(clustering_model.cluster_centers_))))
        new_data = np.concatenate((new_data, new_data2), axis=0)
        # axs[1, 0].scatter(new_data.T[0], new_data.T[1], c=c, alpha=0.5)
    print("Plotting graph")
    plot_graph(G, None, colors=clustering_model.labels_)
    return times
def klasteryzacja():
    x_points, y_points, details = clustering()
    all_points = []
    for idx, xy in enumerate(zip(x_points, y_points)):
        point = Point(idx + 1, tuple(xy))
        all_points.append(point)
    return all_points, details
def use_case_kmeans(G, users_skills, clusters_ground_truth):
    clustering_range = (2, 10)
    distance_function = "euclidean"
    print("Clustering")
    print("Using KMeans")
    clustering_model = clustering(users_skills, range(*clustering_range), True)
    print("- Number of clusters found", len(clustering_model.cluster_centers_))
    print("- Real number of clusters", len(np.unique(clusters_ground_truth)))
    users_distances_to_centers = cdist(users_skills, clustering_model.cluster_centers_,
                                       metric=distance_function)
    print("Link prediction")
    model, y_train, predicted_train, y_test, predicted_test = link_prediction(
        G, users_distances_to_centers)
    print("Evaluation")
    print("- Train")
    print_evaluate(y_train, predicted_train)
    print("- Test")
    print_evaluate(y_test, predicted_test)
    print("Visualization")
    visualization(model, G, users_distances_to_centers, clustering_model.labels_)
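# (Added illustration, not from the repository.) Standalone example of the cdist call used above:
# it returns an (n_users, n_centers) matrix of pairwise distances, one row per user.
import numpy as np
from scipy.spatial.distance import cdist

users = np.array([[0.0, 0.0], [1.0, 1.0]])
centers = np.array([[0.0, 1.0], [2.0, 2.0]])
print(cdist(users, centers, metric="euclidean"))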
def main() -> None:
    small_data_answer = clustering(get_small_data(), k=4)
    print("Question 1 answer:", small_data_answer)
    big_graph = get_big_data()
    kosaraju = Kosaraju(big_graph)
    components = kosaraju.run()
    largest_k = len(components)
    print("Question 2 answer:", largest_k)
def testBasic(self):
    point_objs = []
    for i in range(5):
        for j in range(10):
            point_objs.append(Point(i * 10000 + j, i * 10000 + j))
    km = clustering(point_objs, 5)
    clusters = km.k_means(False)
    self.assertEqual(len(clusters), 5)
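# (Added sketch.) The test above constructs Point(x, y) objects before handing them to
# clustering(); the real point.py is not shown here, so this is only a hypothetical stand-in
# illustrating the minimal interface the test appears to rely on. Attribute names are guessed
# from the p.latit / p.longit comments in the CSV scripts further below.
class Point(object):
    def __init__(self, latit, longit):
        self.latit = latit
        self.longit = longit

    def __repr__(self):
        return "Point(%s, %s)" % (self.latit, self.longit)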
def test_solution_correctness(filename, expected_partition):
    clusters_partition = clustering.clustering(filename, file_output=False)
    for cluster in expected_partition:
        number = clusters_partition[cluster[0]]
        for node in cluster:
            assert clusters_partition[node] == number
    labels_clusters = []
    for cluster in expected_partition:
        assert clusters_partition[cluster[0]] not in labels_clusters
        labels_clusters.append(clusters_partition[cluster[0]])
def main():
    # start = time.time()
    num = 5
    data = ['pollen_human', 'goolam', 'petal_human', 'biase', 'kelin', 'zeisel']
    cluster_num = [11, 5, 5, 4, 4, 7]
    n_component = [0.4, 0.4, 0.4, 0.4, 0.6, 0.8]
    data_name = data[num]
    n_components = n_component[num]
    n_clusters = cluster_num[num]
    GO_Term_path = 'GO_BP_MF_CC.xlsx'
    outputpath = data_name + '_dealing.csv'
    batch_read_GO_Term(GO_Term_path, outputpath)
    GO_Term = data_name + '_dealing.csv'
    Save_path = data_name + '_duplicate_removal_1.csv'
    screen_Term_gene_1(GO_Term, Save_path)
    GO_Term_1 = data_name + '_duplicate_removal_1.csv'
    Save_path_1 = data_name + '_duplicate_removal.csv'
    screen_Term_gene(GO_Term_1, Save_path_1)
    Save_path_2 = data_name + '_duplicate_removal.csv'
    GO_Term_gene_expression_path = data_name + '_count_matrix.csv'
    outputpath1 = 'Term_matrix'
    batch_read_GO_Term_matrix(
        GO_Term_gene_path=Save_path_2,
        GO_Term_gene_expression_path=GO_Term_gene_expression_path,
        outputpath=outputpath1)
    read_path = 'Term_matrix'
    outputpath2 = data_name + '_KPCA_' + str(n_components) + '_cosine.csv'
    batch_exacting_feature_KPCA(read_path, outputpath2, n_components)
    read_path = data_name + '_KPCA_' + str(n_components) + '_cosine.csv'
    label_path = data_name + '_label.csv'
    clustering(n_clusters, read_path, label_path)
def evaluate_single_extraction(prediction, truth, index_test, labelled_entities):
    correct = True
    atleast_one = False
    entities = labelled_entities[index_test]
    cluster_map = clustering(entities)
    assert len(entities) == len(prediction)
    assert len(entities) == len(truth)
    true_clusters = set()
    for test, entity in zip(truth, entities):
        if test != 1.0:
            continue
        cluster = cluster_map[entity]
        if cluster > -1:
            true_clusters.add(cluster)
    for pred, test, entity in zip(prediction, truth, entities):
        pred_is_one = pred == 1.0
        test_is_one = test == 1.0
        test_is_zero = test == 0.0
        cluster = cluster_map[entity]
        if pred_is_one and cluster > -1:
            if cluster in true_clusters:
                atleast_one = True
            else:
                correct = False
        else:
            if pred_is_one and test_is_zero:
                correct = False
            if pred_is_one and test_is_one:
                atleast_one = True
    # if (correct and atleast_one) or (sum(prediction) == 0 and sum(truth) == 0):
    if correct or (sum(prediction) == 0 and sum(truth) == 0):
        return 1
    else:
        return 0
def factfile(DAG, TR, out, DAG_name='temp'):
    N = DAG.number_of_nodes()
    E = DAG.number_of_edges()
    [c_plus, c_zero, c_minus] = clus.clustering(DAG)
    N_TR = TR.number_of_nodes()
    E_TR = TR.number_of_edges()
    [c_plus_TR, c_zero_TR, c_minus_TR] = clus.clustering(TR)
    degree_test(DAG, TR, DAG_name)
    out.write('DAG:' + '\n')
    out.write('Number of nodes: ' + str(N) + '\n')
    out.write('Number of edges: ' + str(E) + '\n')
    out.write('clustering_plus: ' + str(c_plus) + '\n')
    out.write('clustering_zero: ' + str(c_zero) + '\n')
    out.write('clustering_minus: ' + str(c_minus) + '\n')
    out.write('Transitive Reduction of DAG:' + '\n')
    out.write('Number of nodes: ' + str(N_TR) + '\n')
    out.write('Number of edges: ' + str(E_TR) + '\n')
    out.write('clustering_plus: ' + str(c_plus_TR) + '\n')
    out.write('clustering_zero: ' + str(c_zero_TR) + '\n')
    out.write('clustering_minus: ' + str(c_minus_TR) + '\n')
def scatter_kappas(subjects=None, shape=None, membership=None, colors=None,
                   centroids=None, labels=None, maax=None, fignum=6):
    """Makes a scatter plot of kappa values, coloring them according to
    their membership.
    """
    if subjects is None:
        subjects = range(35)
    if colors is None:
        colors = ['blue', 'red', 'green']
    if labels is None:
        labels = colors
    if shape is None:
        shape = 'exponential'
    if membership is None:
        _, membership, _ = cl.clustering(subjects, k=2,
                                         clustering_type='kmeans', shape=shape)
    membership = np.array([membership[key] for key in membership.keys()])
    num_clusters = np.unique(membership).size
    if maax is None:
        fig = plt.figure(fignum)
        fig.clear()
        maax = fig.add_subplot(111)
    best_pars = ba.best_model(subjects, shapes=[shape])
    kappa = np.zeros(len(subjects))
    for subject in best_pars.keys():
        kappa[subject] = best_pars[subject][1][0][-1][1]
    counter = 1
    for c_cluster in range(num_clusters):
        c_subs = np.where(membership == c_cluster)[0]
        maax.scatter(range(counter, counter + len(c_subs)), kappa[c_subs],
                     color=colors[c_cluster])
        if centroids is not None:
            maax.plot(range(counter, counter + len(c_subs)),
                      np.tile(centroids[c_cluster], len(c_subs)),
                      color=colors[c_cluster])
        counter += len(c_subs)
    maax.set_xlim(0, len(subjects) + 1)
    maax.set_ylim(0, 110)
    maax.set_yticklabels(np.array(maax.get_yticks() / 10, dtype=int))
    maax.set_xlabel('Subjects')
    maax.set_ylabel('Sensitivity to points (STP)')
    maax.set_title('D', loc='left')
    plt.show(block=False)
def evaluate_clustering():
    warnings.filterwarnings(action="ignore", category=ConvergenceWarning)
    X = []
    Y = []
    nb_cluster_found = []
    max_skills_sets_sizes = 30
    for i in range(3, max_skills_sets_sizes):
        print(i)
        X.append(i)
        skills_sets = generate_skills_sets(i, 5, 7)
        users_skills, clusters_ground_truth = generate_user_skills(skills_sets, 500, 1, 2)
        clustering_range = (3, max_skills_sets_sizes)
        clustering_model = clustering(users_skills, range(*clustering_range), False)
        nb_cluster_found.append(len(clustering_model.cluster_centers_))
        info_score = normalized_mutual_info_score(clusters_ground_truth,
                                                  clustering_model.labels_)
        Y.append(info_score)
    plt.figure(figsize=(10, 5))
    plt.tight_layout()
    plt.title("Normalized mutual info score over number of skills sets/jobs")
    plt.xlabel("Number of skill sets/jobs")
    plt.ylabel("Normalized mutual info score")
    X = np.array(X)
    Y = np.array(Y)
    plt.plot(X, Y)
    is_correct = np.array(X) == np.array(nb_cluster_found)
    correct_indices = np.where(is_correct)[0]
    incorrect_indices = np.where(np.logical_not(is_correct))[0]
    plt.scatter(X[correct_indices], Y[correct_indices], color="blue",
                label="Correct number of cluster found")
    plt.scatter(X[incorrect_indices], Y[incorrect_indices], color="red",
                label="Incorrect number of cluster found")
    plt.ylim(0.95, 1.05)
    plt.legend()
    plt.savefig("evaluation_clustering.png")
    plt.show()
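# (Added illustration.) normalized_mutual_info_score, used above, compares two labelings while
# ignoring the actual label ids, so a perfect clustering scores 1.0 even if the cluster numbers
# are permuted. Tiny standalone example:
from sklearn.metrics import normalized_mutual_info_score

ground_truth = [0, 0, 1, 1, 2, 2]
predicted = [1, 1, 0, 0, 2, 2]  # same grouping, different ids
print(normalized_mutual_info_score(ground_truth, predicted))  # -> 1.0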
def ck(adjm, nodes):
    import clustering as cl
    import degree as dg
    dist = {}
    for j in nodes:
        deg = dg.degree(adjm, j)
        cls = cl.clustering(adjm, j)
        if if_present(deg, dist) == False:
            dist[deg] = [cls]
        if if_present(deg, dist) == True:
            dist[deg].append(cls)
    dist = popit(dist)
    # dist = ckdist(dist)
    # dist[deg] = cls
    return dist
def ck(adjm, nodes):
    import clustering as cl
    import degree as dg
    dist = {}
    for j in nodes:
        deg = dg.degree(adjm, j)
        cls = cl.clustering(adjm, j)
        if if_present(deg, dist) == False:
            dist[deg] = [cls]
        if if_present(deg, dist) == True:
            dist[deg].append(cls)
    dist = popit(dist)
    # dist = ckdist(dist)
    # dist[deg] = cls
    return dist
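# (Added sketch.) cl.clustering(adjm, j) in the two ck() snippets above presumably returns the
# local clustering coefficient of node j; the real module is not shown, so this is only a
# minimal stand-in for an unweighted, symmetric adjacency matrix:
def local_clustering(adjm, j):
    neighbours = [k for k, v in enumerate(adjm[j]) if v and k != j]
    deg = len(neighbours)
    if deg < 2:
        return 0.0
    # edges actually present among the neighbours of j
    links = sum(adjm[u][v] for i, u in enumerate(neighbours) for v in neighbours[i + 1:])
    return 2.0 * links / (deg * (deg - 1))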
def run():
    start_time = time.clock()
    jieba.set_dictionary('jieba/dict.txt.big')
    jieba.initialize()
    print ("jieba " + str(time.clock() - start_time))

    start_time = time.clock()
    news_rss_url = "http://hk.news.yahoo.com/rss/hong-kong"
    # news_rss_url = "http://hk.news.yahoo.com/rss/china"
    info = feedparser.parse(news_rss_url)

    start_time = time.clock()
    for entry in info.entries:
        # word count of each word of summary
        word_list = getBagOfWords(preprocess(jieba.cut(stripTag(entry.summary))))
        # word count of each word of title
        bag_of_word_of_title = getBagOfWords(preprocess(jieba.cut(stripTag(entry.title))))
        # Combine word count of both summary and title and title weights more
        bag_of_word = Counter()
        for i in range(3):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word
    print ("preprocess " + str(time.clock() - start_time))

    # result = Counter()
    # for entry in info.entries:
    #     result.update(entry["bag_of_words"])
    # printList(result)

    # Clustering them
    start_time = time.clock()
    clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])
    print ("clustering " + str(time.clock() - start_time))

    # Print the result
    newsList = []
    for (index, cluster) in enumerate(clusters):
        for vector in cluster.listOfVectors:
            news = News(index, (vector == cluster.centroidVector),
                        vector.data["title"], vector.data["published"], vector.data["link"])
            newsList.append(news.__dict__)
    return json.dumps(newsList)
def plot_cluster_shapes(subjects=None, maax=None, shape=None, colors=None,
                        centroids=None, labels=None, legends=None, fignum=4):
    """Finds the centers of the clusters in the data and plots the shapes
    that these centers create.
    """
    if subjects is None:
        subjects = range(35)
    if shape is None:
        shape = 'unimodal_s'
    flag_noshow = True
    if maax is None:
        fig = plt.figure(fignum)
        maax = fig.add_subplot(111)
        flag_noshow = False
    if colors is None:
        colors = ['blue', 'red', 'green']
    if labels is None:
        labels = colors
    if legends is None:
        legends = colors
    if centroids is None:
        centroids, _, _ = cl.clustering(subjects, k=3,
                                        clustering_type='kmeans', shape=shape)
    mabes = bc.betMDP(nS=72, thres=60)
    for ix_cen, centroid in enumerate(centroids):
        shape_pars = [shape] + [par for par in centroid]
        lnc = mabes.set_prior_goals(shape_pars=shape_pars, convolute=False,
                                    cutoff=False, just_return=True)
        lnc /= lnc.max()
        maax.plot(lnc, color=colors[ix_cen], linewidth=3, label=legends[ix_cen])
    maax.set_xlim([0, 71])
    maax.set_xticklabels(np.array(maax.get_xticks(), dtype=int) * 10)
    shaded_threshold = np.arange(mabes.thres, mabes.nS)
    bg_color = figure_colors('past_threshold_background')
    maax.fill_between(shaded_threshold, 0, maax.get_ylim()[-1], alpha=0.1, color=bg_color)
    # maax.set_title('Cluster centers'' shapes')
    maax.set_title('C', loc='left')
    maax.set_xlabel('Points')
    maax.set_ylabel('Valuation (a.u.)')
    if not flag_noshow:
        plt.show(block=False)
def main():
    n = 5  # Number of nodes
    e = 7  # Number of edges
    A = [[0 for x in xrange(n)] for x in xrange(n)]  # Adjacency matrix
    i = 1  # iterator
    while i <= e:
        a = random.randint(0, n - 1)
        b = random.randint(0, n - 1)
        if a != b and A[a][b] != 1:
            A[a][b] = 1
            A[b][a] = 1
        else:
            i = i - 1
        i += 1
    print A, "\n"
    total_cost = []
    for i in xrange(len(A)):
        start = i
        costs = dijkstra(A, start)
        print i, ":", costs,
        total_cost.append(sum(costs))
    print "\nTotal cost:", total_cost, "\n"
    characterstic_pathlength = sum(total_cost) / float(n)
    print "Characteristic pathlength:", characterstic_pathlength, "\n"
    # Clustering coefficient of the graph:
    clustering_coefficient = clustering(A)
    print "Clustering coefficients are:", clustering_coefficient, "\n"
    # Average clustering coefficient of the graph
    aver_clustering = sum(clustering_coefficient) / n
    print "Average clustering coefficient of the network:", aver_clustering, "\n"
    return 2
def interval_test(DAG, start, end):
    lp = dl.lpd(DAG, start, end)
    length = lp[2]
    print 'The longest path between %s and %s is %d edges long' % (start, end, length)
    interval = lc.interval(DAG, start, end)
    N = interval.number_of_nodes()
    E = interval.number_of_edges()
    print 'The interval contains %d nodes and %d edges' % (N, E)
    c = clus.clustering(interval)
    print 'For the interval, c+ is %f, c0 is %f, c- is %f' % (c[0], c[1], c[2])
    # MMd = MM.MM_dimension(interval)
    MPSD = mp.mpsd(interval, lp[1])
    # print 'The MM dimension of the interval is %f and the MPSD is %f' % (MMd, MPSD)
    print 'The MPSD is %f' % MPSD[0]
def main():
    n = 5  # Number of nodes
    e = 7  # Number of edges
    A = [[0 for x in xrange(n)] for x in xrange(n)]  # Adjacency matrix
    i = 1  # iterator
    while i <= e:
        a = random.randint(0, n - 1)
        b = random.randint(0, n - 1)
        if a != b and A[a][b] != 1:
            A[a][b] = 1
            A[b][a] = 1
        else:
            i = i - 1
        i += 1
    print A, "\n"
    total_cost = []
    for i in xrange(len(A)):
        start = i
        costs = dijkstra(A, start)
        print i, ":", costs,
        total_cost.append(sum(costs))
    print "\nTotal cost:", total_cost, "\n"
    characterstic_pathlength = sum(total_cost) / float(n)
    print "Characteristic pathlength:", characterstic_pathlength, "\n"
    # Clustering coefficient of the graph:
    clustering_coefficient = clustering(A)
    print "Clustering coefficients are:", clustering_coefficient, "\n"
    # Average clustering coefficient of the graph
    aver_clustering = sum(clustering_coefficient) / n
    print "Average clustering coefficient of the network:", aver_clustering, "\n"
    return 2
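# (Added sketch.) dijkstra(A, start) in the snippet above is assumed to return the list of
# shortest-path costs from `start` to every node of the unweighted adjacency matrix A; the real
# helper is not shown. A minimal stand-in:
import heapq

def dijkstra(A, start):
    n = len(A)
    costs = [float('inf')] * n
    costs[start] = 0
    heap = [(0, start)]
    while heap:
        d, u = heapq.heappop(heap)
        if d > costs[u]:
            continue
        for v in range(n):
            if A[u][v] and d + 1 < costs[v]:
                costs[v] = d + 1
                heapq.heappush(heap, (d + 1, v))
    return costs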
def clustering(self, new_feat_names, nr_clusters):
    self.df_all = self.df_all.dropna(subset=new_feat_names)
    new_feat_df = self.df_all.loc[:, new_feat_names]
    x_matrix = new_feat_df.as_matrix()
    # kmeans = cl.KMeans(n_clusters=nr_clusters).fit(x_matrix)
    # cluster_centers = kmeans.cluster_centers_
    clusterer = clust.clustering(self.df_all, new_feat_names)
    cluster_centers, self.df_all = clusterer.dtw_clustering()
    x = np.arange(4)
    for cluster_center in cluster_centers:
        plt.plot(x, cluster_center)
    plt.show()
    # self.df_all['labels'] = kmeans.labels_
    cluster_groups = self.df_all.groupby(['label'])
    count = 0
    highest_group_len = 0
    highest_group = None
    highest_group_name = 0
    for name, group in cluster_groups:
        cluster_center_list = [cluster_centers[int(name)]] * group.shape[0]
        # group.loc[:, new_feat_names] = cluster_center_list
        # self.create_dataframe_from_groups(group, count)
        if group.shape[0] > highest_group_len:
            if count > 0:
                second_highest_group_len = highest_group_len
                second_highest_group = highest_group
                second_highest_group_name = highest_group_name
                second_highest_group_index = highest_group_index
            highest_group_len = group.shape[0]
            highest_group = group
            highest_group_name = cluster_centers[int(name)]
            highest_group_index = name
        count += 1
    group_strats = highest_group.loc[:, new_feat_names].as_matrix()
    print('highest group name: %s' % str(highest_group_name))
    print('highest group len: %s' % str(highest_group_len))
    print('highest group index: %s' % str(highest_group_index))
    for strat in group_strats:
        plt.plot(x, strat)
    # plt.plot(x, highest_group_name)
    plt.show()
def resultpage(f):
    Query = unicode(f.getvalue('query', ''), 'utf-8')
    (RequestURL, numbers, items) = googleajaxsearch(Query)
    # items: a list with one {title, url, content} dict per site
    html_ans = u'<p>検索ワード:<big><b>%s</b></big> - %s<br>%s</p>' % (
        Query, numbers, unicode(RequestURL, 'utf-8'))
    # items: update each dict to {title, url, content, (extract,) unigram}
    """
    for item in items:
        item["extract"] = extracting(Query, item["title"], item["content"])
    """
    morphing(items, target='title|content')
    result = clustering(items)
    # result = [[group1][group2]...[groupk]]
    html_tree_cates = u''
    groupid = 1
    for group in result:
        site_num = 0
        for itemid in group:
            html_site = u'<font color="#ff0000">タグ: '
            for unigram in items[itemid]["unigram"]:
                html_site = html_site + u'%s ' % unigram
            html_site = html_site + u'</font><br><b>%s</b><br>%s<br>%s<br><br>' % (
                items[itemid]["title"], items[itemid]["url"], items[itemid]["content"])
            if site_num == 0:
                # top site of each category
                html_this_cate = html_tree_site1 % html_site
                site_num = len(group)
            elif site_num == 1:
                # last site of each category
                html_this_cate = html_this_cate + html_tree_site3 % html_site
            else:
                html_this_cate = html_this_cate + html_tree_site2 % html_site
            site_num -= 1
        html_tree_cates += html_tree_category % (
            unicode(str(groupid), 'utf-8'), unicode(str(groupid), 'utf-8'),
            unicode(str(groupid), 'utf-8'), unicode(str(groupid), 'utf-8'),
            unicode(str(groupid), 'utf-8'), html_this_cate)
        groupid = groupid + 1
    html_ans = html_ans + html_tree_script + html_tree_head + html_tree_cates + html_tree_foot
    html = html_head + html_form % Query + html_ans + html_foot
    return html.encode('utf-8')
def conjunto_segmentados(segmentados, n=3):
    import pandas as pd
    segments_df = pd.DataFrame()
    lens = []
    inicio = 0
    for conj in segmentados:
        segments_df = segments_df.append(conj)
        lens.append([inicio, conj.shape[0]])
        inicio = conj.shape[0]
    fit = clustering(segments_df, n)
    segments_df['cluster'] = fit.labels_
    segments_df['cluster'] = segments_df['cluster'].astype(str)
    for x in range(len(lens)):
        segmentados[x] = segments_df.iloc[lens[x]]
    confidences = []
    for i in range(n):
        confidences.append(extremos_incertidumbre(fit, filter_numerical(segments_df), i))
    return segmentados, fit, confidences, segments_df
def histogram_clusters(subjects=None, shape=None, membership=None, colors=None,
                       labels=None, maax=None, fignum=5):
    """Makes a bar plot for membership of subjects in clusters."""
    if subjects is None:
        subjects = range(35)
    if colors is None:
        colors = ['blue', 'red', 'green']
    if labels is None:
        labels = colors
    if shape is None:
        shape = 'unimodal_s'
    if membership is None:
        _, membership, _ = cl.clustering(subjects, k=3,
                                         clustering_type='kmeans', shape=shape)
    membership = np.array([membership[key] for key in membership.keys()])
    ix_centroid = np.unique(membership)
    ix_centroid.sort()
    count_members = np.array([len(np.where(membership == x)[0]) for x in ix_centroid])
    if maax is None:
        fig = plt.figure(fignum)
        fig.clear()
        maax = fig.add_subplot(111)
    maax.bar(ix_centroid, count_members, width=1, color=colors)
    maax.set_xticks(ix_centroid + 0.5)
    maax.set_xticklabels(labels)
    maax.set_ylim(1.2 * np.array(maax.get_ylim()))
    # maax.set_title('Cluster members')
    maax.set_title('C', loc='left')
    maax.set_ylabel('Number of subjects')
    plt.show(block=False)
#!coding:utf-8
# Scripted so the whole computation can run in one shot
import sys
import make_data as md
import clustering as cls

# ---------- [start] data generation to similarity-matrix computation ----------
if len(sys.argv) > 1:
    # read command-line options
    N = int(sys.argv[1])  # N: number of individuals
    M = int(sys.argv[2])  # M: dimensionality of each individual's feature vector
    K = md.make_K(N=N, M=M)  # generate feature vectors -> build similarity matrix
else:
    # without options, build the similarity matrix with default values (N=5, M=?)
    K = md.make_K()
print K
# ---------- [end] data generation to similarity-matrix computation ----------

# ---------- clustering ----------
# clustering(N,M,W,alpha,beta)
# runs collapsed Gibbs sampling given the similarity matrix and the other parameters
M = 5
cls.clustering(K=K, M=M, W=K, alpha=1, beta=2)
        if i == 0:
            cluster_pred = cluster_pred_tem.copy()
        else:
            cluster_pred = pd.concat([cluster_pred, cluster_pred_tem])
    return (total_pred.sort_values(by='id').reset_index(drop=True),
            cluster_pred.sort_values(by='id').reset_index(drop=True))


# execution
if __name__ == '__main__':
    data_path = 'data/'
    perform_raw, rating, test_raw = load_data(data_path, trend=False, weather=False, query=False)
    train_var, test_var = make_variable(perform_raw, test_raw, rating)
    raw_data, y_km, train_len = preprocess(train_var, test_var, 0.03, 3, inner=False)
    data = mk_trainset(raw_data, categorical=True)  # categorical=True only for lgbm; other models need False -> one-hot encoding
    train, val, robustScaler = clustering(data, y_km, train_len, test=True)  # set test=True only when testing
    # Variables to drop based on permutation importance.
    # This list was built for lgbm, so it contains categorical variables that are not one-hot encoded;
    # when running random search for other models those names will not exist
    # (e.g. min -> min_0, min_1, min_2, ...), so either drop them as the errors appear
    # or remove the categorical variables from the drop list beforehand.
    # Also, the drops were chosen for the 0/1 models, so they may not match a cluster-wise random search;
    # Hyerin was told to use the intersection of the two lists for now, but try something better if you can think of one.
    drop1_ = ['min_sales_med', 'min_sales_std', 'day_sales_rank', 'min_sales_rank', 'min_order_rank',
              'cate_sales_rank', 'cate_order_rank', 'cate_order_med', 'cate_sales_med', 'prime',
              'min_order_std', 'min_sales_mean', 'day_order_rank', 'cate_order_std', 'min_order_med',
              'min_order_mean', 'cate_sales_mean', 'cate_sales_std', 'day_order_std', 'rating',
              'day_sales_med', 'min', 'day_order_med']
predictions = clf.predict(feature_vector)
predictions_prob = clf.predict_proba(feature_vector)
correctness = evaluate_single_extraction(predictions, target_vector_reshaped,
                                         idx, talker_entities)
all_entities = [entity["entity"] for entity in entry["talker"]]
# print("idx", idx)
print("quote", entry["quote"])
print("url", entry["source"])
# print("all_entities", all_entities)
cluster_map, inverse_cluster_map = clustering(all_entities, return_inverse=True)
print("cluster_map", cluster_map)
# if correctness == 0:
#     pass
#     print("wrong:")
#     # url_counts["wrong"].append(url_map_count[entry["source"]])
#     entities_counts["wrong"].append(len(all_entities))
#     # print("prediction")
#     talker_candidates = get_talker_candidates(predictions_prob, all_entities,
#                                               cluster_map, inverse_cluster_map)
#     pprint(talker_candidates)
#     # print("truth")
#     pprint([entry["talker"][i]["entity"] for i, p in enumerate(target_vector_reshaped) if p == 1])
#
#print(score_idx)
# print(sorted_score_vector)
#print(score_matrix)

# extract top N candidate vanishing points
if len(VP) < numPointRank:
    numPointRank = len(VP)
topN_VP = []
for i in range(numPointRank):
    topN_VP.append(VP[score_idx[i]][0:2])
    print(topN_VP[i])
# print(topN_VP)

a = clustering(topN_VP)

#print(tmp_product_norm[0])
#print(tmp_product_norm)
#VD3D.append()
#cv2.line(rgb_img, (x1,y1), (x2,y2), (0,0,255),2)
#cv2.imshow("RGB",rgb_img)
#cv2.imshow("CANNY",edges)
#cv2.waitKey(300000)
cv2.imshow("CANNY", rgb_img)
info = feedparser.parse(news_rss_url)
printList(info.entries)
for entry in info.entries:
    # word count of each word of summary
    word_list = getBagOfWords(preprocess(pseg.cut(stripTag(entry.summary))))
    # word count of each word of title
    bag_of_word_of_title = getBagOfWords(preprocess(pseg.cut(stripTag(entry.title))))
    # Combine word count of both summary and title and title weights more
    bag_of_word = Counter()
    for i in range(3):
        bag_of_word.update(bag_of_word_of_title)
    bag_of_word.update(word_list)
    entry["bag_of_words"] = bag_of_word

# result = Counter()
# for entry in info.entries:
#     result.update(entry["bag_of_words"])
# printList(result)

# Clustering them
clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])

# Print the result
for cluster in clusters:
    print "____FINAL___CLUSTER___"
    printList("CENTROID: " + cluster.centroidVector.data["title"])
    for vector in cluster.listOfVectors:
        printList(vector.data["title"])
    print "____END_OF_CLUSTER___"
start = time.time()
geo_locs = []
#loc_ = Point(0.0, 0.0)  #tuples for location
#geo_locs.append(loc_)
#read the fountains location from the csv input file and store each fountain location as a Point(latit,longit) object
f = open('data.csv', 'r')
reader = csv.reader(f, delimiter=",")
for line in reader:
    loc_ = Point(line[0], float(line[1]), float(line[2]))  #tuples for name and location
    geo_locs.append(loc_)
#print len(geo_locs)
#for p in geo_locs:
#    print "%f %f" % (p.latit, p.longit)
#let's run k_means clustering. the second parameter is the no of clusters
cluster = clustering(geo_locs, 4)
flag = cluster.k_means()
end = time.time()
if flag == -1:
    print "Error in arguments!"
else:
    print "%f" % (end - start)
    #the clustering results is a list of lists where each list represents one cluster
    print "Clustering results:"
    cluster.print_clusters(cluster.clusters)
n_components = args.n_components
n_clusters = args.n_clusters
GO_Term_path = args.GO_Term_path
outputpath = data_name + '_dealing.csv'
batch_read_GO_Term(GO_Term_path, outputpath)
GO_Term = outputpath
Save_path = data_name + '_duplicate_removal_1.csv'
screen_Term_gene_1(GO_Term, Save_path)
GO_Term_1 = Save_path
Save_path_1 = data_name + '_duplicate_removal.csv'
screen_Term_gene(GO_Term_1, Save_path_1)
Save_path_2 = Save_path_1
GO_Term_gene_expression_path = args.expression_path
outputpath1 = args.outputpath1
batch_read_GO_Term_matrix(GO_Term_gene_path=Save_path_2,
                          GO_Term_gene_expression_path=GO_Term_gene_expression_path,
                          outputpath=outputpath1)
read_path = outputpath1
outputpath2 = data_name + '_KPCA_' + str(n_components) + '_cosine.csv'
batch_exacting_feature_KPCA(read_path, outputpath2, n_components)
read_path = outputpath2
label_path = args.label_path
clustering(n_clusters, read_path, label_path)
def interpret_clustering_ten_minutes(self, times, entropy=False, minimum_traversals=2,
                                     minute_interval=10, days=False):
    if days == True:
        max_val = 1440 * 7
    else:
        max_val = 1440
    if len(times) > 1:
        c = clustering()
        clusters = c.cluster_path_times(times, False, days)
        #print('clusters are:')
        #for key in clusters.keys():
        #    print clusters[key]
        averages = []
        main_average = 0
        total = 0
        for key in clusters.keys():
            av_duration = 0
            for (duration, date) in clusters[key]:
                av_duration += duration
                main_average += duration
                total += 1
            av_duration /= len(clusters[key])
            averages.append(av_duration)
        main_average /= total
        #print averages
        #print main_average
        bins_amts = []
        bins_ests = []
        zeroCount = []
        partitions = []
        #print 'days', days
        #print 'max_val', max_val
        for i in range(0, max_val / minute_interval):  # create a bin for each ten minutes of the day
            bins_ests.append(0)
            bins_amts.append(0)
            zeroCount.append(0)
            partitions.append(i * minute_interval)
        partitions.append(max_val)
        #print clusters.keys()
        for i in range(len(clusters.keys())):
            #zeroCount = 0
            for (duration, date) in clusters[clusters.keys()[i]]:
                if duration == 0:
                    zeroCount[int(date) / minute_interval] += 1
                else:
                    bins_amts[int(date) / minute_interval] += 1
                    bins_ests[int(date) / minute_interval] += averages[i]
        for i in range(0, max_val / minute_interval):
            if bins_amts[i] > 0:
                bins_ests[i] /= bins_amts[i]
            else:
                bins_ests[i] = main_average
            if zeroCount[i] > bins_amts[i]:
                bins_ests[i] = -1  # special value for if this edge is blocked
        entropies = []
        if entropy is True:
            entropies = []
            for i in range(0, max_val / minute_interval):
                bin = []
                for j in range(len(clusters.keys())):
                    for (duration, date) in clusters[clusters.keys()[j]]:
                        if (date >= i * minute_interval) & (date < (i + 1) * minute_interval):
                            # print(i * minute_interval, (i + 1) * minute_interval)
                            bin.append(j)
                counts = dict()
                total = len(bin) * 1.0
                for value in bin:
                    try:
                        counts[value] += 1
                    except KeyError:
                        counts[value] = 1.
                #print counts
                entropy = 0
                if total >= minimum_traversals:
                    for x in counts.keys():
                        # print('counts/total', counts[x]/total)
                        #print 'counts[x]', counts[x]
                        #print 'total', total
                        if counts[x] > 0:
                            entropy -= counts[x] / total * math.log(counts[x] / total)
                else:
                    entropy = sys.maxint
                entropies.append(entropy)
            return partitions, bins_ests, entropies
        else:
            return partitions, bins_ests
    else:
        partitions = []
        bin_ests = []
        entropies = []
        for i in range(0, max_val / minute_interval):
            partitions.append(i * minute_interval)
            bin_ests.append(sys.maxint)
            entropies.append(sys.maxint)
        if entropies:
            return partitions, bin_ests, entropies
        else:
            return partitions, bin_ests
video = pd.merge(video1, video2, right_index=True, left_index=True)
video = pd.merge(video, video3, right_index=True, left_index=True)
video.index = [idx[:7] for idx in video.index]
video_norm = pd.DataFrame(pca.fit_transform(normalize(video)), index=video.index)
print(video_norm.head(1))
print(sum(pca.explained_variance_ratio_))

path = 'result/merge/'

# Merging the modalities
audio_video = pd.merge(video_norm, audio_norm, right_index=True, left_index=True)
clustering(audio_video, path + 'audio_video.html', nb_cluster=1)

audio_text = pd.merge(audio_norm, text_norm, right_index=True, left_index=True)
clustering(audio_text, path + 'audio_text.html', nb_cluster=1)

video_text = pd.merge(video_norm, text_norm, right_index=True, left_index=True)
clustering(video_text, path + 'video_text.html', nb_cluster=1)

audio_video_text = pd.merge(video_text, audio_norm, right_index=True, left_index=True)
audio_video_text.to_csv('features/merge/all.csv', sep='§', index_label='Sequence')
# audio_video_text_norm = pd.DataFrame(normalize(audio_video_text), index=audio_video_text.index, columns=audio_video_text.columns)
# word count of each word of summary
word_list = getBagOfWords(preprocess(pseg.cut(stripTag(entry.summary))))
# word count of each word of title
bag_of_word_of_title = getBagOfWords(preprocess(pseg.cut(stripTag(entry.title))))
# Combine word count of both summary and title and title weights more
bag_of_word = Counter()
for i in range(3):
    bag_of_word.update(bag_of_word_of_title)
bag_of_word.update(word_list)
entry["bag_of_words"] = bag_of_word

# result = Counter()
# for entry in info.entries:
#     result.update(entry["bag_of_words"])
# printList(result)

# Clustering them
clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])

# Print the result
for cluster in clusters:
    print "____FINAL___CLUSTER___"
    printList("CENTROID: " + cluster.centroidVector.data["title"])
    for vector in cluster.listOfVectors:
        printList(vector.data["title"])
    print "____END_OF_CLUSTER___"
import csv
import sys

geo_locs = []
#loc_ = Point(0.0, 0.0)  #tuples for location
#geo_locs.append(loc_)
#read the fountains location from the csv input file and store each fountain location as a Point(latit,longit) object
if len(sys.argv) > 2:
    print "Input CSV file:" + sys.argv[2]
    f = open(sys.argv[2], 'r')
else:
    print "Input CSV file: it-2004.sites.gpscoords.csv"
    f = open('it-2004.sites.gpscoords.csv', 'r')
reader = csv.reader(f, delimiter=",")
for line in reader:
    loc_ = Point(float(line[0]), float(line[1]))  #tuples for location
    geo_locs.append(loc_)
#print len(geo_locs)
#for p in geo_locs:
#    print "%f %f" % (p.latit, p.longit)
#let's run k_means clustering. the second parameter is the no of clusters
k_value = sys.argv[1]
print "K_Value: " + k_value
cluster = clustering(geo_locs, int(k_value))
flag = cluster.k_means(False)
if flag == -1:
    print "Error in arguments!"
else:
    #the clustering results is a list of lists where each list represents one cluster
    print "clustering results:"
    cluster.print_clusters(cluster.clusters)
import random as rand
from clustering import clustering
from point import Point
import csv

geo_locs = []
#loc_ = Point(0.0, 0.0)  #tuples for location
#geo_locs.append(loc_)
#read the fountains location from the csv input file and store each fountain location as a Point(latit,longit) object
f = open('./drinking_fountains.csv', 'r')
reader = csv.reader(f, delimiter=",")
for line in reader:
    loc_ = Point(float(line[1]), float(line[2]))  #tuples for location
    geo_locs.append(loc_)
print(len(geo_locs))
#for p in geo_locs:
#    print "%f %f" % (p.latit, p.longit)
#let's run k_means clustering. the second parameter is the no of clusters
cluster = clustering(geo_locs, 8)
flag = cluster.k_means(False)
if flag == -1:
    print("Error in arguments!")
else:
    #the clustering results is a list of lists where each list represents one cluster
    print("clustering results:")
    cluster.print_clusters(cluster.clusters)
#######################
# Dimension reduction
print("Reducing dimensionality ...")
vectors = dimensionreduction.reduce_dimension(vectors=vectors,
                                              method="pca",
                                              pre_normalization=False,
                                              target_dim=2,
                                              params={})
print("Dimensionality=%d" % vectors.shape[1])

#######################
# Clustering
print("Clustering ...")
model = clustering.clustering(vectors=vectors,
                              method="gmm",
                              params={"n_clusters": 10,
                                      "covariance_type": "full"})
cluster_ids = model.get_cluster_assignments()
cluster_names = np.asarray(["C%s" % c_id for c_id in cluster_ids])
cluster_centers = model.get_cluster_centers()
cluster_covariances = model.get_cluster_covariances()
print("# of clusters=%d" % model.n_clusters)

#######################
# Visualization
print("Visualizing ...")
cluster_order = ["C%s" % c_id for c_id in range(model.n_clusters)]
visualizers.scatter(vectors=vectors,
                    categories=cluster_names,
                    category_name="Cluster",
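# (Added illustration, not the project's clustering wrapper.) A rough scikit-learn equivalent of
# the GMM clustering step above, showing where assignments, centers and covariances come from:
import numpy as np
from sklearn.mixture import GaussianMixture

toy_vectors = np.random.RandomState(0).rand(200, 2)
gmm = GaussianMixture(n_components=10, covariance_type="full", random_state=0).fit(toy_vectors)
toy_cluster_ids = gmm.predict(toy_vectors)      # analogous to get_cluster_assignments()
toy_cluster_centers = gmm.means_                # analogous to get_cluster_centers()
toy_cluster_covariances = gmm.covariances_      # analogous to get_cluster_covariances()
print("# of clusters=%d" % gmm.n_components)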
#coding:utf-8
import clustering as cl

if __name__ == '__main__':
    '''
    Write classes 0-4 out as a CSV.
    The character encoding is utf-8.
    To view it in Excel, convert it to Shift_JIS first with an editor such as Sakura Editor.
    '''
    current = cl.clustering('sony.csv')
    current.output_csv()
from combineCSV import combCSV
from clustering import clustering

#class_Combine = combCSV('combinedNew.csv')
#class_Combine.readCSV('tokenDatanew.csv', 'tokenDatanew2.csv')

class_Cluster = clustering('combinedNew.csv')
#class_Cluster.pca()
#print class_Cluster.top_words()
#print class_Cluster.getFrequency()
class_Cluster.write()
print 'End of the code'