def affinity(data):
    space = {'damping': hp.uniform('damping', 0.5, 0.99)}
    algo = partial(tpe.suggest, n_startup_jobs=10)
    best = fmin(hyper_affinity, space, algo=algo, max_evals=30)
    model = AffinityPropagation(damping=best['damping'])
    # fit once and reuse the labels instead of refitting for every return value
    labels = model.fit_predict(data)
    return best, labels, sil_score(data, labels), model
def clustering(relation_all_df, all_keys, txt_name, method='AffinityPropagation', n_clusters=5):
    print('begin clustering')
    data = relation_all_df.iloc[:, :].values
    if method == 'AffinityPropagation':
        clustering = AffinityPropagation(damping=0.8).fit(data)
    elif method == 'AgglomerativeClustering':
        clustering = AgglomerativeClustering(n_clusters=n_clusters,
                                             affinity='euclidean', linkage='ward')
        clustering.fit_predict(data)
    res_dict = dict()
    for i, label in enumerate(clustering.labels_):
        res_dict.setdefault(label, []).append(all_keys[i])
    for k, v in res_dict.items():
        print(k, v)
    # for key in res_dict.keys():
    #     if type(key) is not str:
    #         res_dict[str(key)] = res_dict[key]
    #         del res_dict[key]
    with open(txt_name, 'w') as the_file:
        # the_file.write(json.dumps(list(res_dict.items())))
        if method == 'AgglomerativeClustering':
            the_file.write('n_clusters: \n')
            the_file.write(str(n_clusters) + '\n')
        for label, keys in res_dict.items():
            the_file.write(str(label) + '\n')
            the_file.write(','.join(str(x) for x in keys) + "\n")
def cluster(self, tracklet):
    # tracklet: [[pid, time, img_id, pseudo_id], ...]; only img_id is used
    ids = [arr[2] for arr in tracklet]  # a list, not map(): it is indexed again below
    # cluster = SpectralClustering(n_clusters=2, affinity='precomputed')
    track_features = [self.feature[img_id] for img_id in ids]
    similarity = -euclidean_distances(track_features, squared=True)
    # cls = cluster.fit_predict(affinity)
    cluster = AffinityPropagation(preference=np.median(similarity))
    cls = cluster.fit_predict(track_features).reshape(-1)
    # score name kept from the commented-out SpectralClustering variant
    self.spectral_score += metrics.adjusted_rand_score(
        [info[0] for info in tracklet], cls)
    cls_cnt = len(set(cls))
    # cluster = SpectralClustering(n_clusters=cls_cnt)
    # cls = cluster.fit_predict(track_features).reshape(-1)
    # self.ap_score += metrics.adjusted_rand_score([info[0] for info in tracklet], cls)
    cluster = KMeans(n_clusters=cls_cnt)
    cls = cluster.fit_predict(track_features).reshape(-1)
    self.kmeans_score += metrics.adjusted_rand_score(
        [info[0] for info in tracklet], cls)
    self.tracklet_cnt += 1
    return [(ids[i], cls[i]) for i in range(len(ids))]
def get_w2v_field(zeta_res, model, zeta_scope, mode):
    mask = zeta_res[zeta_scope] > 0 if mode == 0 else zeta_res[zeta_scope] < 0
    words = zeta_res.index[mask]
    vecs, kept_words = [], []
    for word in words:
        try:
            vecs.append(model[word])
            kept_words.append(word)
        except KeyError:
            pass  # word not in the embedding vocabulary
    word_matrix = np.asarray(vecs)
    # the preference must have one entry per row of word_matrix, so index the
    # zeta values by the words that were actually kept
    clu = AffinityPropagation(preference=zeta_res[zeta_scope][kept_words])
    clu.fit_predict(word_matrix)
    cluster_frame = pd.DataFrame(clu.cluster_centers_)
    cluster_frame["Category"] = mode
    return cluster_frame
def AffProp(SM):
    af = AffinityPropagation(preference=None, affinity='precomputed')
    af.fit_predict(SM)
    cluster_centers_indices = af.cluster_centers_indices_
    labels_ = af.labels_
    n_clusters_ = len(cluster_centers_indices)
    return n_clusters_, labels_
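# A minimal, hypothetical usage sketch for AffProp above: with
# affinity='precomputed', the input must be a square *similarity* matrix, so a
# common choice is negated (squared) pairwise distances. The data here is made up.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X_demo = np.random.RandomState(0).rand(20, 3)
SM_demo = -euclidean_distances(X_demo, squared=True)  # larger = more similar
n_clusters_demo, labels_demo = AffProp(SM_demo)
print(n_clusters_demo, labels_demo)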
def clusteringAlgorithm(self, typeOfAlgorithm):
    # define the model, fit it, and assign a cluster to each example
    clusterPoint = None
    if typeOfAlgorithm == 'affinityPropagation':
        model = AffinityPropagation(damping=0.9)
        model.fit(x)
        clusterPoint = model.predict(x)
    elif typeOfAlgorithm == 'agglomerativeClustering':
        model = AgglomerativeClustering(n_clusters=2)
        clusterPoint = model.fit_predict(x)
    elif typeOfAlgorithm == 'BIRCH':
        model = Birch(threshold=0.01, n_clusters=2)
        model.fit(x)
        clusterPoint = model.predict(x)
    elif typeOfAlgorithm == "DBSCAN":
        model = DBSCAN(eps=0.30, min_samples=9)
        clusterPoint = model.fit_predict(x)
    elif typeOfAlgorithm == "KMeans":
        model = KMeans(n_clusters=2)
        model.fit(x)
        clusterPoint = model.predict(x)
    elif typeOfAlgorithm == "MiniBatchKMeans":
        model = MiniBatchKMeans(n_clusters=2)
        model.fit(x)
        clusterPoint = model.predict(x)
    elif typeOfAlgorithm == "MeanShift":
        model = MeanShift()
        clusterPoint = model.fit_predict(x)
    elif typeOfAlgorithm == "OPTICS":
        model = OPTICS(eps=0.8, min_samples=10)
        clusterPoint = model.fit_predict(x)
    elif typeOfAlgorithm == "SpectralClustering":
        model = SpectralClustering(n_clusters=2)
        clusterPoint = model.fit_predict(x)
    elif typeOfAlgorithm == "GaussianMixture":
        model = GaussianMixture(n_components=2)
        model.fit(x)
        clusterPoint = model.predict(x)
    else:
        # fail early instead of falling through with clusterPoint = None
        raise ValueError('unknown algorithm: ' + typeOfAlgorithm)
    # retrieve unique clusters and scatter-plot each one
    clusters = unique(clusterPoint)
    for cluster in clusters:
        # get row indexes of samples in this cluster
        rowIndexes = where(clusterPoint == cluster)
        # create scatter of these samples
        pyplot.scatter(x[rowIndexes, 0], x[rowIndexes, 1])
    pyplot.savefig('img/' + typeOfAlgorithm + '.png')
    pyplot.clf()
    pyplot.cla()
    pyplot.close()
def affinityaropagation(tfidf_matrix):
    ap_cluster = AffinityPropagation(damping=0.5, max_iter=200,
                                     convergence_iter=15, copy=True,
                                     preference=None, affinity='euclidean',
                                     verbose=False)
    labels = ap_cluster.fit_predict(tfidf_matrix)  # fit once and reuse the labels
    print('affinityaropagation number of clusters:', end="")
    print(len(set(labels)))
    return labels
def feat_based_cluster(uniquePhrases, gold, phraseFeats, clutter, damping,
                       affinity='cosine', saveto=None):
    phrase_dic = read_phrase(uniquePhrases)
    gt_df = pd.read_csv(gold, encoding="utf-8")
    feat_matrix = np.load(phraseFeats)
    img_list = gt_df.image.unique()
    result = []
    for img in img_list:
        gt_df_img = gt_df.query('image == %i' % img)
        feats = []
        phrases = []
        for _, item in gt_df_img.iterrows():
            phrase = item.phrase
            phrases.append(phrase)
            feats.append(feat_matrix[phrase_dic[phrase]])
        feats = np.vstack(feats)
        labels = []
        if len(phrases) > 1:
            if affinity == 'cosine':
                similarity = cosine_similarity(feats)
                pref = np.percentile(similarity, clutter)
                af = AffinityPropagation(preference=pref, affinity='precomputed',
                                         damping=damping)
                labels = af.fit_predict(similarity)
            elif affinity == 'euclidean':
                distance = -euclidean_distances(feats, squared=True)
                pref = np.percentile(distance, clutter)
                af = AffinityPropagation(preference=pref, damping=damping)
                labels = af.fit_predict(feats)
            else:
                raise RuntimeError('invalid affinity metric')
            if np.isnan(labels).any():  # when af did not converge
                labels = np.arange(labels.size)
        else:
            labels = [1]
        for i in range(len(phrases)):
            result.append({'image': img, 'phrase': phrases[i], 'label': labels[i]})
    return pd.DataFrame(result)
def clusterSamples(model, trainDataIn, testDataIn, params):
    if model == 'SOM':
        # Map size
        msz0 = params[0]
        msz1 = params[1]
        sm = SOM.SOM('sm', trainDataIn, mapsize=[msz0, msz1],
                     norm_method='var', initmethod='pca')
        sm.train(n_job=1, shared_memory='no', verbose='off')
        # sm.set_data_labels(list(instancesCorpus))
        if params[2] == True:
            # Hitmap for CORPUS (train, red) and TARGET (test, blue) data:
            sm.hit_map(testDataIn)
        testData_proj = sm.project_data(testDataIn)
        trainData_proj = sm.project_data(trainDataIn)
        testData_loc = sm.ind_to_xy(testData_proj)[:, 2]
        trainData_loc = sm.ind_to_xy(trainData_proj)[:, 2]
        return trainData_loc, testData_loc, sm
    if model == 'AffinityPropagation':
        model = AffinityPropagation()
        model.fit(trainDataIn)
        return model.predict(trainDataIn), model.predict(testDataIn), model
    if model == 'DBSCAN':
        model = DBSCAN()
        model.fit(trainDataIn)
        # note: fit_predict refits on each input rather than reusing the
        # training fit (DBSCAN has no separate predict method)
        return model.fit_predict(trainDataIn), model.fit_predict(testDataIn), model
    if model == 'KMeans':
        model = KMeans(n_clusters=params[0])
        model.fit(trainDataIn)
        return model.predict(trainDataIn), model.predict(testDataIn), model
    if model == 'AgglomerativeClustering':
        model = AgglomerativeClustering(n_clusters=params[0])
        model.fit(trainDataIn)
        return model.fit_predict(trainDataIn), model.fit_predict(testDataIn), model
def affinity_propagation(D, preference='median'):
    assert D.shape[0] == D.shape[1], 'Matrix is not square!'
    if preference in ['minimum', 'min']:
        preference = np.min(-D)
    elif preference in ['maximum', 'max']:
        preference = np.max(-D)
    elif preference in ['median', 'med']:
        preference = np.median(-D)
    else:
        raise ValueError('preference must be one of: minimum, maximum, median')
    clusterer = AffinityPropagation(affinity='precomputed', preference=preference)
    clusterer.fit_predict(-D)
    return clusterer
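# The helper above negates the distance matrix D so it can be handed to
# AffinityPropagation as a similarity; the preference argument then steers the
# granularity (the minimum similarity tends to yield few clusters, the maximum
# many). A small, hypothetical sketch of how it might be called, on made-up data:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
pts = np.vstack([rng.randn(30, 2), rng.randn(30, 2) + 5])
D_demo = euclidean_distances(pts, squared=True)
for pref in ('min', 'med', 'max'):
    clusterer = affinity_propagation(D_demo, preference=pref)
    print(pref, len(clusterer.cluster_centers_indices_))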
def cluster(playlist):
    arq = 'Total ' + playlist + '.csv'
    n_clusters = 0
    Full_data = pd.read_csv(arq)
    Full_data = Full_data.dropna(axis=1, how='all')
    Full_data = Full_data.dropna(axis=0, how='any')
    ID = Full_data['id']
    Mode = Full_data['mode']
    length = Full_data['duration_ms']
    artist = Full_data['artist']
    Full_data = Full_data.drop(columns=['track', 'album_id', 'artist', 'id', 'mode'])
    Fdata = Full_data.values
    scaler = Scaler()
    data_u = scaler.fit_transform(Fdata)
    # pca_transf = PCA(0.8)
    # PCA_data = pca_transf.fit_transform(data_u)
    clusterer = AffinityPropagation(random_state=None, preference=-500)
    # clusterer = HDBSCAN(min_cluster_size=20)
    # clusterer = MeanShift()
    labels = clusterer.fit_predict(data_u)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    labels.shape = (len(labels), 1)
    Full_data['cluster'] = labels + 1
    Full_data['id'] = ID
    Full_data['mode'] = Mode
    Full_data['artist'] = artist
    Full_data['duration_ms'] = length
    # Full_data.sort_values(by='cluster')
    Full_data.to_csv('clustered.csv', index=False)
    # sns.pairplot(Full_data, hue="cluster", palette='YlGnBu')
    # plt.show()
    return n_clusters
def execute(args):
    if len(args) < 1:
        usage()
        sys.exit()
    names, labels_true, X = parse(args[0])
    indices = [int(i) for i in args[1:]]
    relevant_names = names[1:]
    if len(indices) > 0:
        X = np.asarray([[sample[i] for i in indices] for sample in X])
        relevant_names = [relevant_names[i] for i in indices]
    print("Clustering on", str(relevant_names) + "...")
    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50)
    # cluster_centers_indices = af.cluster_centers_indices_
    # labels = af.labels_
    # n_clusters_ = len(cluster_centers_indices)
    y_pred = af.fit_predict(X)
    if y_pred is None or len(y_pred) == 0 or isinstance(y_pred[0], np.ndarray):
        return 0
    counts = get_cluster_counts(labels_true, y_pred)
    print(counts)
def visual(c, X, y):
    from sklearn.cluster import AffinityPropagation
    cluster_object = AffinityPropagation()
    y_pred = cluster_object.fit_predict(X)
    colors = ['red', 'green', 'blue', 'cyan', 'black', 'yellow', 'magenta',
              'brown', 'orange', 'silver', 'goldenrod', 'olive', 'dodgerblue']
    clusters = np.unique(y_pred)
    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)
    # plot the dataset colored by the *true* labels, so iterate over np.unique(y)
    # rather than the predicted cluster ids
    for cluster in np.unique(y):
        row_idx = np.where(y == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1])
    plt.title('Dataset')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
    # plot the dataset colored by the predicted clusters
    for cluster in clusters:
        row_idx = np.where(y_pred == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1])
    plt.title('Clusters')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def cluster_trajectories(curves):
    """Given a list of curves, cluster_trajectories will cluster them."""
    n_curves = len(curves)
    X_2B_clstrd = np.zeros((n_curves, 4))
    X_2B_clstrd[:, 0] = np.array([curves[k][0, 0] for k in range(n_curves)])
    X_2B_clstrd[:, 1] = np.array([curves[k][1, 0] for k in range(n_curves)])
    X_2B_clstrd[:, 2] = np.array([curves[k][0, -1] for k in range(n_curves)])
    X_2B_clstrd[:, 3] = np.array([curves[k][1, -1] for k in range(n_curves)])
    for col in range(4):
        X_2B_clstrd[:, col] /= X_2B_clstrd[:, col].std()

    def distance_metric(a, b):
        # A distance metric on R^4 modulo the involution
        # (x1, x2, x3, x4) -> (x3, x4, x1, x2)
        d = lambda a, b: np.sqrt(np.sum((a - b) ** 2))
        T = lambda x: np.array([x[2], x[3], x[0], x[1]])
        return min(d(a, b), d(T(a), b))

    from sklearn.cluster import AffinityPropagation
    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i + 1, n_curves):
            aff[i, j] = np.exp(-distance_metric(X_2B_clstrd[i], X_2B_clstrd[j]) ** 2)
            aff[j, i] = aff[i, j]
    cluster_labels = clusterer.fit_predict(aff)
    out = []
    for label in set(cluster_labels):
        # list comprehensions instead of Python 2 map/filter
        out.append([curves[k] for k in range(n_curves) if cluster_labels[k] == label])
    return [align_cluster(cluster) for cluster in out]
def community_detection_by_affinity(graph, weight_type=WeightType.ABSOLUTE):
    # Has a high impact on Girvan-Newman clustering
    graph = nx.algorithms.tree.mst.maximum_spanning_tree(Graph.to_undirected(graph))
    mat, node_to_int = prepare_matrix(graph)
    af = AffinityPropagation(preference=-50)
    labels = af.fit_predict(mat)
    inv_node_to_int = {v: k for k, v in node_to_int.items()}
    clusters = {}
    for index, lab in enumerate(labels):
        class_name = inv_node_to_int[index]
        if lab not in clusters:
            clusters[lab] = []
        clusters[lab].append(class_name)
    print(f"\nClusters: {clusters}")
    print(f"Total Clusters: {len(clusters)}")
    pos = nx.spring_layout(graph)
    nx.draw_networkx(graph, pos=pos, edgelist=[], node_color=labels,
                     with_labels=True, node_size=250, font_size=8)
    return [cluster for cluster in clusters.values()]
def fit_predict(self, preprocessed_documents: List[List[tuple]],
                documents_features: List[np.ndarray], dataset: Dataset,
                hyperparams=None, task=None) -> List[Segmentation]:
    assert len(documents_features) == len(preprocessed_documents)
    x_scaled = []
    for doc_features in documents_features:
        x_scaled.append(StandardScaler().fit_transform(doc_features))
    predicted_label_lists = []
    for i in range(len(documents_features)):
        start_time = time.time()
        x = documents_features[i]  # alternatively x_scaled[i]
        true_n_clusters = dataset.segmentations[i].author_count
        assert x.shape[0] == len(preprocessed_documents[i])
        diarizer = AffinityPropagation(damping=hyperparams['damping'],
                                       preference=hyperparams['preference'],
                                       copy=True, affinity='euclidean',
                                       max_iter=100, convergence_iter=5)
        labels = diarizer.fit_predict(x).tolist()
        predicted_label_lists.append(labels)
        estimated_n_clusters = len(set(labels))
        print('Document', i + 1, '/', len(documents_features), x.shape,
              'in', time.time() - start_time, 's')
        print('Real author count = {}, estimated = {}'.format(true_n_clusters,
                                                              estimated_n_clusters))
        print()
    return generate_segmentation(preprocessed_documents, documents_features,
                                 predicted_label_lists, dataset.documents, task=task)
def plotly_embedding(value, key):
    time5 = time.perf_counter()  # time.clock() was removed in Python 3.8
    # standardisation
    x_min, x_max = np.min(value, 0), np.max(value, 0)
    reducer = (value - x_min) / (x_max - x_min)
    # clusterisation
    clusterer = AffinityPropagation()
    cluster_labels = clusterer.fit_predict(reducer)
    X_projected = reducer
    trace1 = go.Scatter3d(x=X_projected[:, 0], y=X_projected[:, 1],
                          z=X_projected[:, 2], mode='markers',
                          marker=dict(size=12, color=cluster_labels,
                                      colorscale='Paired', opacity=0.8))
    data = [trace1]
    layout = go.Layout(title="TSNE", margin=dict(l=0, r=0, b=0, t=0))
    fig = dict(data=data, layout=layout)
    plot(fig)
def group_clusters(roughHull):
    # AffinityPropagation algorithm: fit once and reuse the labels rather than
    # calling fit and then fit_predict on the same data
    points = roughHull.squeeze(1)
    af = AffinityPropagation(preference=-100).fit(points)
    cluster_indicators = af.labels_
    # map the labels the affinity provided back onto roughHull, then find the
    # mean point of each cluster
    cluster_centers = []
    for current_cluster in range(len(af.cluster_centers_indices_)):
        cluster = [roughHull[i][0].tolist()
                   for i, x in enumerate(cluster_indicators) if x == current_cluster]
        # now find the average point of these
        average = [sum(y[0] for y in cluster) / len(cluster),
                   sum(y[1] for y in cluster) / len(cluster)]
        cluster_centers.append(np.array([average]))
    cc = np.array(cluster_centers)
    return cc
def score_based_cluster(gold, pair_score, affinity_save_path, clutter, damping):
    gt_df = pd.read_csv(gold, encoding="utf-8")
    img_list = gt_df.image.unique()
    result = []
    for img in img_list:
        gt_df_img = gt_df.query('image == %i' % img)
        phrases = []
        for _, item in gt_df_img.iterrows():
            phrases.append(item.phrase)
        scores = np.load(affinity_save_path + '/' + str(img) + '.npy')
        if scores.size > 1:
            pref = np.percentile(scores, clutter)
            af = AffinityPropagation(preference=pref, affinity='precomputed',
                                     damping=damping)
            labels = af.fit_predict(scores)
            if np.isnan(labels).any():  # when af did not converge
                labels = np.arange(labels.size)
        else:
            labels = [1]
        for i in range(len(phrases)):
            result.append({'image': img, 'phrase': phrases[i], 'label': labels[i]})
    return pd.DataFrame(result)
def hyper_affinity(args):
    global basic_data
    global all_data
    ap = AffinityPropagation(damping=args['damping'])
    pred = ap.fit_predict(basic_data)
    temp = sil_score(all_data, pred)
    # print(args)
    # hyperopt minimizes, so return the negated silhouette score
    return -temp
def apply_affinity_prop_consort(include_transformed):
    (X, y) = extract.generate_labelled_data(
        valid_labels=['1'], label_type='consort',
        include_transformed=include_transformed)
    am = AffinityPropagation()
    preds = am.fit_predict(X)
    return (X, preds)
def cluster_ap_blobs():
    clustering_blobs = AffinityPropagation(affinity='euclidean',
                                           convergence_iter=5, damping=0.9,
                                           preference=-10.0)
    y_blobs = clustering_blobs.fit_predict(X_blobs)
    plt.scatter(X_blobs[:, 0], X_blobs[:, 1], c=y_blobs)
    print(y_blobs)
def test_sparse_input_for_fit_predict():
    # Test to make sure sparse inputs are accepted for fit_predict
    # (non-regression test for issue #20049)
    af = AffinityPropagation(affinity="euclidean", random_state=42)
    rng = np.random.RandomState(42)
    X = csr_matrix(rng.randint(0, 2, size=(5, 5)))
    labels = af.fit_predict(X)
    assert_array_equal(labels, (0, 1, 1, 2, 3))
def fit(self, vectors: List[Tuple[int, Any]]) -> List[Tuple[int, int]]:
    vectors_ = list(zip(*vectors))[1]  # keep only the vectors, drop the ids
    cluster_model = AffinityPropagation(damping=0.96, max_iter=10000,
                                        convergence_iter=15)
    cluster = cluster_model.fit_predict(vectors_)
    show_two_dimensions_plot(vectors_, cluster)
    return [(i, label) for i, label in enumerate(cluster)]
class APModel(ClusteringModel):
    def __init__(self, n_clusters):
        super().__init__()
        # kept for interface compatibility; affinity propagation determines
        # the number of clusters itself
        self.n_clusters = n_clusters
        self.ap = AffinityPropagation(verbose=True)

    def fit_predict(self, feat):
        pred = self.ap.fit_predict(feat)
        return pred
def semantic_clusters(lemmas, unique=True):
    words = lemmas
    if unique:
        words = list(set(lemmas))
    words = _filter_w2v(words)
    m = np.array(_get_matrix(words))
    agg = AffinityPropagation(affinity="precomputed")
    u = agg.fit_predict(m)
    return _group_words(words, agg.labels_)
def get_affinity_clusters(listings):
    """Returns a list of cluster IDs based on relative similarity between listings."""
    a = get_similarity_matrix(listings)
    clf = AffinityPropagation(affinity='precomputed')
    clusters = clf.fit_predict(a)
    return clusters
def main():
    args = parse_arguments()
    verb2vec, subject2vec, object2vec = get_vectors(args.vector_path)
    lines, _, _, _ = get_dict_and_samples(args.input_path, args.min_count,
                                          args.first_n, args.step)
    concatenated = concat_vectors(lines, verb2vec, subject2vec, object2vec)
    print(f"Shape: {concatenated.shape}")
    ap = AffinityPropagation()
    result = ap.fit_predict(concatenated)
    groups = group_result(result, lines)
    print(f"Number of clusters: {len(groups)}")
def affinity(data, damping):
    # metric_list = ['euclidean', 'manhattan', 'chebyshev']
    db = AffinityPropagation(damping=damping)
    pred = db.fit_predict(data)  # fits and predicts in one pass
    score = sil_score(data, pred)
    print(score)
    return db, pred, score
def build_families(self, smiles, affin_matrix):
    cluster = AffinityPropagation()
    cls = cluster.fit_predict(affin_matrix)
    fam = {}
    for a, b in zip(smiles, cls):
        if b in fam:
            fam[b].add(a)
        else:
            fam[b] = {a}
    return fam
def _AffinityPropagation(corpus, labels):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    affinity_propagation = AffinityPropagation(damping=.5, max_iter=200,
                                               convergence_iter=25, copy=False)
    result_affinity_propagation = affinity_propagation.fit_predict(X.toarray())
    print('AffinityPropagation:',
          normalized_mutual_info_score(result_affinity_propagation, labels))
def affinity_propagation():
    """
    AffinityPropagation creates clusters by sending messages between pairs of
    samples until convergence. The messages sent between pairs represent the
    suitability of one sample to be the exemplar of the other, and are updated
    in response to the values from other pairs. These updates occur iteratively
    until convergence, at which point the final exemplars are chosen and hence
    the final clustering is given.

    Algorithm: the messages sent between pairs belong to one of two categories.
    The first is the responsibility, r(i,k), the accumulated evidence that
    sample k should be the exemplar for sample i. The second is the
    availability, a(i,k), the accumulated evidence that sample i should choose
    sample k as its exemplar, taking into account the support from all other
    samples for k being an exemplar. Exemplars are thus chosen by samples if
    they are:
      - similar enough to many samples, and
      - chosen by many samples to be representative of themselves.
    """
    # Generate a generic data sample.
    n_samples = 300
    std = 0.3
    seed = 0
    centers = [[-1., 0.], [0., 1.5], [1., 0.]]
    data, target = make_blobs(n_samples=n_samples, centers=centers,
                              cluster_std=std, random_state=seed)

    # Set the preference for each point: samples with large preference values
    # are more likely to be chosen as exemplars, so the preferences influence
    # the number of exemplars, i.e., clusters. If preferences are not passed
    # as arguments, they default to the median of the input similarities.
    # pref = [np.random.randint(low=-50, high=0) for x in range(n_samples)]
    pref = -50

    # Compute affinity propagation.
    clf = AffinityPropagation(preference=pref)
    aff_y = clf.fit_predict(data)

    # Count mismatches between predicted and true values (cluster labels are
    # arbitrary, so this comparison is only meaningful when they line up).
    cnt = 0
    for idx in range(n_samples):
        if target[idx] != aff_y[idx]:
            cnt += 1

    # Print results.
    print('Approximated number of clusters ', len(clf.cluster_centers_indices_))
    print('Accuracy ', float(n_samples - cnt) / float(n_samples))
    print('Homogeneity ', metrics.homogeneity_score(target, clf.labels_))
    print('Completeness ', metrics.completeness_score(target, clf.labels_))

    # Plot resulting clusters.
    plt.figure(figsize=(8, 8))
    plt.scatter(data[:, 0], data[:, 1], c=aff_y, s=50)
    plt.title('Affinity clustering')
    plt.show()
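# The docstring above describes the responsibility/availability messages in
# prose. What follows is a hypothetical plain-NumPy sketch of those two update
# rules (with damping), assuming the similarity matrix S carries the
# preferences on its diagonal; it is illustrative only, and
# sklearn.cluster.AffinityPropagation remains the reference implementation.
import numpy as np

def ap_message_passing_sketch(S, damping=0.5, n_iter=200):
    n = S.shape[0]
    A = np.zeros((n, n))  # availabilities a(i, k)
    R = np.zeros((n, n))  # responsibilities r(i, k)
    for _ in range(n_iter):
        # r(i,k) <- s(i,k) - max_{k' != k} [a(i,k') + s(i,k')]
        AS = A + S
        idx = np.argmax(AS, axis=1)
        first_max = AS[np.arange(n), idx]
        AS[np.arange(n), idx] = -np.inf
        second_max = AS.max(axis=1)
        R_new = S - first_max[:, None]
        R_new[np.arange(n), idx] = S[np.arange(n), idx] - second_max
        R = damping * R + (1 - damping) * R_new
        # a(i,k) <- min(0, r(k,k) + sum_{i' not in {i,k}} max(0, r(i',k)))
        Rp = np.maximum(R, 0)
        np.fill_diagonal(Rp, np.diag(R))  # keep r(k,k) itself in the column sum
        A_new = Rp.sum(axis=0)[None, :] - Rp
        diag_A = np.diag(A_new).copy()    # a(k,k) has no min(0, .) clamp
        A_new = np.minimum(A_new, 0)
        np.fill_diagonal(A_new, diag_A)
        A = damping * A + (1 - damping) * A_new
    # each sample's exemplar maximizes a(i,k) + r(i,k)
    return np.argmax(A + R, axis=1)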
def cluster_articles():
    ms = MongoStore()
    articles = [a for a in ms.get_pending_articles()]
    if len(articles) > 0:
        tfidf = TfidfVectorizer(tokenizer=preprocess)
        good_articles = [article for article in articles
                         if article["text_content"].strip() != ""]
        texts = [article["text_content"] for article in good_articles]
        X_tfidf = tfidf.fit_transform(texts)
        print(X_tfidf)
        ap = AffinityPropagation(damping=0.95, max_iter=4000,
                                 convergence_iter=400, copy=True, preference=-4,
                                 affinity='euclidean', verbose=True)
        C = ap.fit_predict(X_tfidf)
        print(X_tfidf.shape, C.shape)
        print(C)
        centers = ap.cluster_centers_indices_
        clusters = []
        for c, center in enumerate(centers):
            members = np.where(C == c)[0]
            K = cosine_similarity(X_tfidf[members], X_tfidf[center])
            member_sims = [(m, float(k)) for m, k in zip(members, K)]
            member_sims.sort(key=lambda x: x[1], reverse=True)
            cluster = {"articles": [], "date": datetime.now(), "summarized": False}
            if len([member for member, sim in member_sims if sim > .55]) >= 3:
                print(texts[center][:75].replace("\n", " "))
                for member, sim in member_sims:
                    print("\t{:3.3f} ".format(sim), end="")
                    print(good_articles[member]["title"][:60].replace("\n", " "))
                    cluster["articles"].append((good_articles[member]["_id"], sim))
            else:
                continue
            clusters.append(cluster)
        if len(clusters) > 0:
            ms.insert_clusters(clusters)
        ms.set_clustered_flag(articles)
def evaluate_clustering():
    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    matrix_size = len(similarity_matrix)
    print('got matrix')
    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')
    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('dbscan')
    distance_matrix = np.ndarray((matrix_size, matrix_size))
    for i in range(matrix_size):
        for j in range(matrix_size):
            distance_matrix[i, j] = 1 - similarity_matrix[i, j]
    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])
    print('created distance matrix')
    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)
    print(cluster_map1)
    print(cluster_map2)
    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='euclidean')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='euclidean')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)
    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)
    print('Number of clusters Affinity Propagation: %f' % len(cluster_map1))
    print('Number of clusters DBSCAN: %f' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silhouette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silhouette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)
def geo_worker_(job_queue, result_queue, **kwargs):
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    geocache = get_resource_manager(u"GeoCacheResource")
    geoquery = GeoQuery(geocache.get_tsv_path())
    event = kwargs.get(u"event")
    while not job_queue.empty():
        try:
            string_tsv_path, geo_tsv_path = job_queue.get(block=False)
            with gzip.open(string_tsv_path, u"r") as f:
                string_df = pd.io.parsers.read_csv(f, sep="\t", quoting=3, header=0)
            loc_strings = [loc_string
                           for loc_string in string_df[u"locations"].tolist()
                           if not isinstance(loc_string, float)]
            coords = []
            for loc_string in loc_strings:
                for location in loc_string.split(","):
                    coord = geoquery.lookup_location(location)
                    if coord is not None:
                        coords.append(coord)
            centers = set()
            if len(coords) > 0:
                coords = np.array(coords)
                D = -geoquery.compute_distances(coords[:, None], coords)
                ap = AffinityPropagation(affinity=u"precomputed")
                Y = ap.fit_predict(D)
                if ap.cluster_centers_indices_ is not None:
                    for center in ap.cluster_centers_indices_:
                        centers.add((coords[center][0], coords[center][1]))
            centers = [{u"lat": lat, u"lng": lng} for lat, lng in centers]
            centers_df = pd.DataFrame(centers, columns=[u"lat", u"lng"])
            with gzip.open(geo_tsv_path, u"w") as f:
                centers_df.to_csv(f, sep="\t", index=False,
                                  index_label=False, na_rep="nan")
            result_queue.put(None)
        except Queue.Empty:
            pass
    return True
def mhd_cluster_trajectories(curves):
    """Returns clusters based upon the modified Hausdorff distance."""
    n_curves = len(curves)
    from sklearn.cluster import AffinityPropagation
    from modified_Hausdorff_distance import modified_Hausdorff_distance as mhd
    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i + 1, n_curves):
            # a precomputed affinity must be a similarity, so negate the distance
            aff[i, j] = -mhd(curves[i].transpose(), curves[j].transpose())
            aff[j, i] = aff[i, j]
    cluster_labels = clusterer.fit_predict(aff)
    out = []
    for label in set(cluster_labels):
        out.append([curves[k] for k in range(n_curves) if cluster_labels[k] == label])
    return [align_cluster(cluster) for cluster in out]
def plot_similarity_clusters(desc1, desc2, files, plot=None):
    """
    find similar sounds using Affinity Propagation clusters

    :param desc1: first descriptor values
    :param desc2: second descriptor values
    :returns:
      - euclidean_labels: labels of clusters
    """
    if plot == True:
        print((Fore.MAGENTA + "Clustering"))

    min_max = preprocessing.scale(np.vstack((desc1, desc2)).T,
                                  with_mean=False, with_std=False)
    pca = PCA(n_components=2, whiten=True)
    y = pca.fit(min_max).transform(min_max)
    euclidean = AffinityPropagation(convergence_iter=1800, affinity='euclidean')
    euclidean_labels = euclidean.fit_predict(y)

    if plot == True:
        time.sleep(5)
        # message translated from Spanish:
        print((Fore.WHITE + "Each number is the group the sound belongs to as an "
               "exemplar of other(s). Group '0' is colored blue, group '1' red and "
               "group '2' yellow. Check the plot to see which sounds are exemplars "
               "of the others"))
        print(np.vstack((euclidean_labels, files)).T)
        time.sleep(6)
        plt.scatter(y[euclidean_labels == 0, 0], y[euclidean_labels == 0, 1], c='b')
        plt.scatter(y[euclidean_labels == 1, 0], y[euclidean_labels == 1, 1], c='r')
        plt.scatter(y[euclidean_labels == 2, 0], y[euclidean_labels == 2, 1], c='y')
        plt.scatter(y[euclidean_labels == 3, 0], y[euclidean_labels == 3, 1], c='g')
        plt.show()

    return euclidean_labels
def cluster(self, normalize=False):
    """
    Cluster the nodes based on the PMI similarity measure. The clustering
    algorithm used is affinity propagation, which automatically chooses the
    number of clusters.

    :param normalize: If true, normalize the similarity measure (i.e., the
        PMI) to lie between -1 and 1.
    :return: The cluster labels.
    """
    if normalize:
        # use normalized PMI for the similarity metric
        similarity = self.pmi / -np.log(self.joint_probs)
        similarity[np.diag_indices_from(similarity)] = 1.0
    else:
        similarity = self.pmi
        similarity[np.diag_indices_from(similarity)] = 1.1 * similarity.max()
    clustering = AffinityPropagation(affinity='precomputed',
                                     verbose=self.verbose,
                                     preference=similarity.min())
    clusters = clustering.fit_predict(similarity)
    if self.verbose:
        print('Found', len(np.unique(clusters)), 'clusters.')
    return clusters
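# A toy check of the normalization used above, npmi = pmi / (-log p(x, y)),
# which maps PMI into [-1, 1]; the probabilities below are made up.
import numpy as np

p_x, p_y, p_xy = 0.2, 0.3, 0.12   # hypothetical marginals and joint
pmi = np.log(p_xy / (p_x * p_y))
npmi = pmi / (-np.log(p_xy))
print(pmi, npmi)  # npmi reaches 1 only when the events always co-occur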
def create_tag_categories():
    """Cluster MSE tags into categories using sklearn AffinityPropagation.

    Any existing category system in the database will be overwritten.
    """
    con = connect_db()
    cur = con.cursor()
    query = """
        SELECT T.id, T.name, COUNT(Q.question_id) AS count
        FROM (
            SELECT tags.id, tags.name, COUNT(qt.question_id) AS count
            FROM tags
            JOIN question_tags AS qt ON qt.tag_id=tags.id
            WHERE tags.name NOT IN
                ('advice', 'applications', 'big-list', 'education',
                 'intuition', 'learning', 'math-history', 'math-software',
                 'reference-request', 'self-learning', 'soft-question',
                 'teaching', 'alternative-proof-strategy', 'proof-writing',
                 'visualization', 'alternative-proof', 'proof-strategy',
                 'proof-verification', 'solution-verification', 'definition',
                 'examples-counterexamples', 'mathematica', 'wolfram-alpha',
                 'maple', 'matlab', 'sage', 'octave', 'floor-function',
                 'ceiling-function', 'article-writing', 'publishing',
                 'combinatorial-species', 'gromov-hyperbolic-spaces',
                 'chemistry', 'book-recommendation')
            GROUP BY tags.name
        ) AS T
        JOIN question_tags AS Q ON T.id=Q.tag_id
        GROUP BY T.id"""
    cur.execute(query)
    tag_ids = []
    tag_names = []
    tag_indices = dict()
    tag_name_indices = dict()
    counts = []
    for q in cur:
        tag_ids.append(q['id'])
        tag_names.append(q['name'])
        tag_indices[q['id']] = len(tag_ids) - 1
        tag_name_indices[q['name']] = len(tag_ids) - 1
        counts.append(q['count'])
    tag_ids = np.array(tag_ids)
    tag_names = np.array(tag_names)
    query = """
        SELECT t1.id AS tag1, t2.id AS tag2, COUNT(qt1.question_id) AS count
        FROM question_tags AS qt1
        JOIN question_tags AS qt2 ON qt1.question_id=qt2.question_id
        JOIN tags AS t1 ON t1.id=qt1.tag_id
        JOIN tags AS t2 ON t2.id=qt2.tag_id
        WHERE t1.id IN ({taglist}) AND t2.id IN ({taglist})
        GROUP BY t1.name, t2.name""".format(taglist=','.join(str(i) for i in tag_ids))
    cur.execute(query)
    paircounts = [[0 for i in range(len(tag_ids))] for j in range(len(tag_ids))]
    for q in cur:
        i1 = tag_indices[q['tag1']]
        i2 = tag_indices[q['tag2']]
        c = q['count']
        if i1 == i2:
            paircounts[i1][i1] = int(c / 2)
        else:
            paircounts[i1][i2] = c
    sim = np.array(paircounts, dtype=np.float_)
    cluster = AffinityPropagation(affinity='precomputed', damping=0.5)
    labels = cluster.fit_predict(sim)
    classes = sorted(set(labels))
    cur.execute("DELETE FROM categories WHERE 1;")
    cur.execute("DELETE FROM tag_categories WHERE 1;")
    # each category is named after the exemplar tag of its cluster
    catnames = [tag_names[cluster.cluster_centers_indices_[c]] for c in classes]
    query = "INSERT INTO categories (id,name) VALUES "
    query += ','.join("({},'{}')".format(c, catnames[c]) for c in classes)
    cur.execute(query)
    query = "INSERT INTO tag_categories (tag_id, category_id) VALUES "
    query += ','.join("({},{})".format(tag_ids[i], labels[i])
                      for i in range(len(labels)))
    cur.execute(query)
    con.commit()
def main(argv):
    inputFile = sys.argv[1]
    outputFile = sys.argv[2]
    if len(sys.argv) < 4:
        # pick a default value.
        thisDamping = .92
    else:
        # The third argument contains parameters in the format
        # key1:value1|key2:value2. In this case we only expect one: "damping".
        paramList = sys.argv[3].split("|")
        for thisParam in paramList:
            # first and only parameter should be damping
            paramSplit = thisParam.split(":")
            if paramSplit[0] == "damping":
                thisDamping = float(paramSplit[1])
    print('Input file is:', inputFile)
    print('Output file is:', outputFile)
    print('thisDamping is:', str(thisDamping))
    with open(inputFile, 'r', newline='') as csvfile:
        csvReader = csv.reader(csvfile, delimiter=',', quotechar='|')
        # First line is the number of distinct nodes.
        headerRows = next(csvReader)
        imax = int(headerRows[0])
        jmax = int(headerRows[0])
        print(str(imax) + " " + str(jmax))
        # define the matrix
        simMatrix = np.zeros((imax, jmax), dtype=float)
        currentNodeIndex = 0
        # We build a map between the matrix we want to build and the node
        # identifiers as we read in the rows.
        thisI = 0
        thisJ = 0
        nodeMap = dict()
        # we also want a list that maps the indices to the node names
        indexList = list()
        for row in csvReader:
            if row[0] in nodeMap:
                thisI = nodeMap[row[0]]
            else:
                nodeMap[row[0]] = currentNodeIndex
                indexList.append(row[0])
                thisI = currentNodeIndex  # record the index of the new node
                currentNodeIndex += 1
            if row[1] in nodeMap:
                thisJ = nodeMap[row[1]]
            else:
                nodeMap[row[1]] = currentNodeIndex
                indexList.append(row[1])
                thisJ = currentNodeIndex  # record the index of the new node
                currentNodeIndex += 1
            # matrix is symmetric
            simMatrix[thisI, thisJ] = float(row[2])
            simMatrix[thisJ, thisI] = float(row[2])
    for i in range(0, imax):
        # Set all of the diagonals to 1
        simMatrix[i, i] = 1.
    db = AffinityPropagation(affinity='precomputed', damping=thisDamping)
    labels = db.fit_predict(simMatrix)
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # print('Estimated number of clusters: %d' % n_clusters_)
    print(labels, len(labels))
    with open(outputFile, 'w', newline='') as csvoutfile:
        csvWriter = csv.writer(csvoutfile, delimiter=',', quotechar='|')
        for i in range(0, imax):
            csvWriter.writerow([indexList[i], labels[i]])
print("size X",len(X)) #kernel = gaussian_kde(X_p.T) pref = [(-(mvn.pdf(x,[0,0],[[1,0],[0,.1]])))*100 for x in X] alpha = 1 dists = np.array([ -( euclidean( u/(alpha*-np.log(mvn.pdf(u,[0,0],[[.01,0],[0,.01]]))), v/(alpha*-np.log(mvn.pdf(v,[0,0],[[.01,0],[0,.01]]))) ) # ( # (-np.log(mvn.pdf(u,[0,0],[[1,0],[0,1]])) # +(-np.log(mvn.pdf(v,[0,0],[[1,0],[0,1]]))) # ) # ) ) for u in X for v in X]).reshape((len(X),len(X))) ap = AffinityPropagation(affinity = "precomputed", #preference=pref ) labels =ap.fit_predict(dists) print("n labels", len(set(labels))) import matplotlib.pyplot as plt cmap = dict((label,np.random.beta(1,1,3)) for label in labels) for x,label in zip(X,labels): plt.scatter(x[0],x[1],color=cmap[label]) plt.show()
def cluster_affinity_propagation(similarity_matrix, desired_keys=None):
    numpy_matrix = similarity_matrix_to_numpy(similarity_matrix, desired_keys)
    clusterer = AffinityPropagation()
    return clusterer.fit_predict(numpy_matrix)
data_thr.rateC, data_thr.rateCA]

Html_file = open("clustering_files/affinitypropagation.html", "w")

# consider only 10000 data points (spectral clustering memory complexity):
ind = np.array(10000 * [1] + (X.shape[0] - 10000) * [0]).astype(bool)
ind = shuffle(ind)
data_thr10 = pd.DataFrame(X[ind])
data_thr10.columns = data.columns

scaler = StandardScaler()
X = scaler.fit_transform(X)
X = X[ind]

km = AffinityPropagation(damping=0.95)
preds = km.fit_predict(X)
print("components:", set(preds))
print(np.bincount(preds))

data_thr10['preds'] = pd.Series(preds).astype("category")
color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"] * 25
title = str(np.bincount(preds))
TOOLS = "wheel_zoom,box_zoom,reset,box_select,pan"
plot_width = 900
plot_height = 300
x_name = 'rateCA'
y_name = 'rate'
xmin_p = np.percentile(data_thr10[x_name], 0.1)
def main():
    options = docopt.docopt(__doc__)
    features_file = h5py.File(options['<keypoints>'])
    cap = cv2.VideoCapture(options['<video>'])
    frame_idx = -1
    tracks = None
    frame_pair = (None, None)
    tracking = Tracking()
    cluster_tracks = []
    video_writer = None
    clusters = []

    while options['--max-frames'] is None or frame_idx < int(options['--max-frames']):
        # Read in frame image
        rv, frame = cap.read()
        frame_idx += 1

        # If we failed to read in a frame, exit
        if not rv:
            break

        if options['--no-video']:
            output_frame = np.zeros_like(frame)
        else:
            output_frame = np.copy(frame)

        if video_writer is None:
            h, w = frame.shape[:2]
            video_writer = cv2.VideoWriter(options['<output>'],
                                           cv2.cv.FOURCC(*'MJPG'), 25, (w, h))

        # Show progress
        if frame_idx % 100 == 0:
            print('Frame index: {0} => {1} tracks'.format(frame_idx, len(tracking.tracks)))

        # Convert to greyscale
        frame_gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)

        # Update frame pair
        frame_pair = (frame_pair[1], frame_gray)

        # Work out where in the keypoints file this frame's keypoints start and end
        frame_kp_start, n_kps = features_file['frames'][frame_idx]

        # Find keypoint locations and descriptors
        kp_locs = features_file['keypoints'][frame_kp_start:(frame_kp_start + n_kps)]
        kp_descs = features_file['descriptors'][frame_kp_start:(frame_kp_start + n_kps)]

        # Convert locations to image space
        kp_im_locs = np.array(kp_locs, dtype=np.float32)
        h, w = frame_gray.shape
        kp_im_locs[:, 0] += 0.5 * w
        kp_im_locs[:, 1] += 0.5 * h

        # Construct a list of keypoints
        kps = list(Keypoint(frame_idx, loc[:2], loc[2], desc)
                   for loc, desc in zip(kp_im_locs, kp_descs))

        if options['--show-kps']:
            for kp in kps:
                x, y = kp.location
                cv2.circle(output_frame, (int(x), int(y)), 5, (0, 0, 200),
                           lineType=cv2.CV_AA)

        # Track this frame's keypoints
        tracking.add_frame(frame_pair[0], frame_pair[1], frame_idx, kps)

        # All states and covariances for this frame
        frame_states, frame_covars, frame_track_kps = [], [], []
        for t in tracking.tracks:
            if t.final_frame_idx < frame_idx or t.initial_frame_idx > frame_idx:
                continue
            frame_states.append(t.states[frame_idx - t.initial_frame_idx])
            frame_covars.append(t.covariances[frame_idx - t.initial_frame_idx].copy())
            frame_track_kps.append(t.associated_keypoints[-1])

        # Draw trails if required
        trail_length = int(options['--trail-length'])
        if trail_length > 0:
            for t in tracking.tracks:
                if t.final_frame_idx <= frame_idx - trail_length or t.initial_frame_idx > frame_idx:
                    continue
                start_frame = frame_idx - trail_length + 1
                start_idx = start_frame - t.initial_frame_idx
                for s1, s2 in zip(t.states[start_idx:-1], t.states[start_idx + 1:]):
                    cv2.line(output_frame, (int(s1[0]), int(s1[1])),
                             (int(s2[0]), int(s2[1])), (200, 0, 200),
                             lineType=cv2.CV_AA)

        # Convert states to an array
        frame_states = np.array(frame_states)

        if not options['--no-cluster']:
            # Log-PDF floor for choosing a keypoint uniformly from the image
            h, w = frame.shape[:2]
            non_cluster_pdf = -30

            # Best existing cluster for each state and the associated PDF
            state_association = [(-1, non_cluster_pdf)] * frame_states.shape[0]

            # PDF of choosing states from each active cluster
            for c_idx, cluster in enumerate(clusters):
                # skip elderly clusters
                if cluster.last_update_frame_idx != frame_idx - 1:
                    continue
                cluster_mu, cluster_sigma = cluster.predict(frame_idx)
                for s_idx in range(len(state_association)):  # xrange in the original
                    s = frame_states[s_idx, :]
                    c = frame_covars[s_idx]
                    _, current_pdf = state_association[s_idx]
                    pdf = mv_gaussian_log_pdf(s, cluster_mu, cluster_sigma + c)[0]
                    if pdf > current_pdf:
                        state_association[s_idx] = (c_idx, pdf)

            # Go through associations
            unassigned_states, unassigned_covars = [], []
            cluster_states = [None] * len(clusters)
            for s, c, assoc in zip(frame_states, frame_covars, state_association):
                c_idx = assoc[0]
                if c_idx < 0:
                    unassigned_states.append(s)
                    unassigned_covars.append(c)
                    continue
                if cluster_states[c_idx] is None:
                    cluster_states[c_idx] = [(s, c)]
                else:
                    cluster_states[c_idx].append((s, c))

            for cluster, assignment in zip(clusters, cluster_states):
                if assignment is None:
                    if cluster.final_frame_idx >= frame_idx - 3:
                        cluster.update(frame_idx)
                else:
                    states = np.array(list(s for s, c in assignment))
                    if states.shape[0] >= 2:
                        sigma = np.cov(states.T)
                    else:
                        sigma = cluster.covariances[-1].copy()
                    mu = np.mean(states, axis=0)
                    for _, cov in assignment:
                        sigma += cov
                    cluster.update(frame_idx, mu, sigma)

                    minx, maxx = states[:, 0].min(), states[:, 0].max()
                    miny, maxy = states[:, 1].min(), states[:, 1].max()
                    if maxx - minx > 300 or maxy - miny > 300:
                        continue
                    cv2.rectangle(output_frame, (int(minx), int(miny)),
                                  (int(maxx), int(maxy)), (0, 0, 200),
                                  lineType=cv2.CV_AA)
                    state, cov = cluster.predict(frame_idx)
                    draw_cov(output_frame, cov[:2, :2], state[:2], (0, 0, 200),
                             lineType=cv2.CV_AA)

        # Draw 'o' over each frame state
        sc = 10.0
        filtered_states = []
        filtered_covs = []
        for kp, s, c in zip(frame_track_kps, frame_states, frame_covars):
            # Extract sigmas
            sigmas = np.diag(np.linalg.cholesky(c))

            # Only sufficiently 'good' features pass
            if options['--max-position-sigma'] is not None:
                if np.any(sigmas[:2] > float(options['--max-position-sigma'])):
                    continue
            if options['--max-velocity-sigma'] is not None:
                if np.any(sigmas[2:4] > float(options['--max-velocity-sigma'])):
                    continue

            ## Only those with keypoints at this frame
            # if kp.frame_idx != frame_idx:
            #     continue

            ## Only those with minimum velocity
            # speed = np.sqrt(np.sum(s[2:4] * s[2:4]))
            # if speed < 0.5:
            #     continue

            filtered_states.append(s)
            filtered_covs.append(c)

            if not options['--no-show-states']:
                draw_cov(output_frame, c[:2, :2], s[:2], (255, 0, 0),
                         lineType=cv2.CV_AA)
                cv2.line(output_frame, (int(s[0]), int(s[1])),
                         (int(s[0] + sc * s[2]), int(s[1] + sc * s[3])),
                         (0, 200, 0), lineType=cv2.CV_AA)
                draw_cov(output_frame, sc * sc * c[2:4, 2:4], s[:2] + sc * s[2:4],
                         (0, 200, 0), lineType=cv2.CV_AA)

        filtered_states = np.array(filtered_states)

        # Cluster unlabelled states
        if not options['--no-cluster'] and len(unassigned_states) > 4:
            cluster_states = np.copy(np.array(unassigned_states))
            cluster_covs = list(unassigned_covars)

            clustering = AffinityPropagation()
            labels = clustering.fit_predict(cluster_states)

            # Process labels
            for label in np.unique(labels):
                label_indices = np.nonzero(labels == label)[0]
                if label_indices.shape[0] < 2:
                    continue
                label_states = cluster_states[label_indices, :]
                label_covs = list(cluster_covs[i] for i in label_indices)
                mu = np.mean(label_states, axis=0)
                sigma = np.cov(label_states.T)
                for c in label_covs:
                    sigma += c
                new_cluster = Cluster()
                new_cluster.update(frame_idx, mu, sigma)
                clusters.append(new_cluster)
                draw_cov(output_frame, sigma[:2, :2], mu, (0, 200, 200),
                         lineType=cv2.CV_AA)

        # Write output
        video_writer.write(output_frame)

    del video_writer
build_class_labels()
num_classes = len(urls)
sim_matrix = np.zeros((num_classes, num_classes))
record_in_matrix(sim_matrix)
sim_matrix = np.sqrt(sim_matrix)
np.savetxt("sim_mat.txt", sim_matrix)

clst = AffinityPropagation(affinity='precomputed')
# clst = SpectralClustering(n_clusters=7, affinity='precomputed')
classes = clst.fit_predict(sim_matrix)

with open("ap/centers.txt", "w") as f:
    # distinct loop names avoid shadowing `clst`; the enumerate counter is the
    # cluster id and the value is the exemplar's index
    for cluster_id, center_idx in enumerate(clst.cluster_centers_indices_):
        f.write(all_urls[center_idx])
        f.write(" ")
        f.write(str(cluster_id))
        f.write("\n")

with open("ap/clusters.txt", "w") as f:
    for idx, cls in enumerate(classes):
        f.write(all_urls[idx])
        f.write(" ")
        f.write(str(cls))
# R1 = C1.fit_predict(Gram)
n = len(Gram)
Di = np.reshape(np.diag(Gram), (n, 1))
M = Di.dot(np.ones((1, n)))
D = M + M.T - 2 * Gram  # squared distances induced by the Gram matrix

C2 = AffinityPropagation(affinity='precomputed')
C1 = KMeans(n_clusters=5)
C3 = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage='average')
C4 = SpectralClustering(n_clusters=5, affinity='precomputed')
C5 = SpectralBiclustering(n_clusters=(5, 5))

R1 = C1.fit_predict(D)
R2 = C2.fit_predict(D)
R3 = C3.fit_predict(D)
R4 = C4.fit_predict(Gram + 11)
R5 = C5.fit(D)
print(R4)

modèle = TSNE(n_components=2, metric='precomputed')
Trans = modèle.fit_transform(D)
G_ACP = ACP(Gram, precomputed=True)
trace_ACP(G_ACP, [10] * 5)

## import propre_TSNE as pt
# cluster3 = vectorLinspace([4, 1], [7, 9], num=50)
# cluster3 = cluster1 + np.random.normal(5, .1, cluster3.shape)
# cluster4 = vectorLinspace([-1, 4], [-4, 2], num=50)
# cluster4 = cluster1 + np.random.normal(-5, .1, cluster4.shape)
X = cluster1  # np.append(cluster1, np.append(cluster2, np.append(cluster3, cluster4, axis=0), axis=0), axis=0)
print(X)
print(pearsonr(X[:, 0], X[:, 1]), spearmanr(X[:, 0], X[:, 1]))

dists = np.zeros((len(X), len(X)))
for i1, x1 in enumerate(X):
    print(i1, "/", len(X))
    for i2, x2 in enumerate(X):
        # for i3, x3 in enumerate(X):
        #     if i1 != i2 and i2 != i3 and i1 != i3:
        #         tmp = np.append(x1, np.append(x2, x3, axis=0), axis=0).reshape((-1, 2))
        #         # print(tmp)
        #         c = spearmanr(tmp[:, 0], tmp[:, 1])[0]
        # note: cosine() is a *distance*; AffinityPropagation with
        # affinity="precomputed" expects similarities, so a negation may be intended
        dists[i1, i2] = cosine(x1, x2)
print(dists)

from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation(affinity="precomputed")
y_pred = ap.fit_predict(dists)
print(len(set(y_pred)))

cmap = dict((y, np.random.beta(1, 1, 3)) for y in y_pred)
import matplotlib.pyplot as plt
for x, y in zip(X, y_pred):
    # plt.annotate(y, x, color=cmap[y])
    pass
plt.scatter(X[:, 0], X[:, 1])
plt.scatter(cluster2[:, 0], cluster2[:, 1])
plt.show()
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE

dataset = pd.read_csv('~/data/gene_expr_170104.csv')
data = np.array(dataset)[:, 1:].astype(float).T

Y = TSNE().fit_transform(data)
clus = AffinityPropagation()
lab = clus.fit_predict(Y)

x, y = Y.T
plt.scatter(x, y, alpha=0.9,
            c=plt.cm.Spectral(lab.astype(float) / lab.max()),
            edgecolors='none')
# for i, j, t in zip(x, y, range(x.shape[0])):
#     plt.text(i, j, t, color='purple')
plt.show()

x, y = SOS(iterations=10, alpha=1, beta=0, delta=0, theta=3.5).fit_transform(data).T
plt.scatter(x, y, alpha=0.4,
            c=plt.cm.Spectral(lab.astype(float) / lab.max()),
            edgecolors='none')
# for i, j, t in zip(x, y, range(x.shape[0])):
if embeddings.shape[1] != 2:
    print("tsne")
    tsne = TSNE(2)
    embeddings_transformed = tsne.fit_transform(embeddings)
else:
    embeddings_transformed = embeddings
# tsne = TSNE(2)
# embeddings_transformed = tsne.fit_transform(embeddings)

print("clustering")
c2c = [5, 6, 7, 8]
labels = dict()
from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation()
for c in c2c:
    labels[c] = ap.fit_predict([emb for emb, concept
                                in zip(embeddings_transformed, concepts)
                                if concept == c])
print(labels)

print("plotting")
import matplotlib.pyplot as plt
import seaborn
cmap = dict((key, np.random.beta(1, 1, 3)) for key in cognate_classes)
counters = {5: 0, 6: 0, 7: 0, 8: 0}
for asjp_word, emb, cognate_class, concept in zip(asjp_words,
                                                  embeddings_transformed,
                                                  cognate_classes, concepts):
    # plt.annotate(asjp_word, emb, color=cmap[cognate_class])
    if concept == 5:
        plt.subplot(2, 2, 1)
        label = labels[5][counters[5]]
        plt.annotate(asjp_word + "_" + str(label), emb, color=cmap[cognate_class])
        counters[5] += 1
    if concept == 6:
for i in range(size_berlin):
    for j in range(size_berlin):
        if i != j:
            matrix_berlin[i][j] = (list_of_berlin_person[i]
                                   .distance_of_two_persons(list_of_berlin_person[j]))
for i in range(size_newcomers):
    for j in range(size_newcomers):
        if i != j:
            matrix_newcomer[i][j] = (list_of_newcomer_person[i]
                                     .distance_of_two_persons(list_of_newcomer_person[j]))
print(matrix_berlin)
print(matrix_newcomer)
print('_____________________________________')
clusterer.fit(matrix_newcomer, y=None)
print('_____________________________________')
clusterer.fit_predict(matrix_newcomer, y=None)
print('_____________________________________')
# `af` is used below, so this fit must actually run (it was commented out)
af = AffinityPropagation().fit(matrix_newcomer)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
print(labels)
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
# print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
# print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))