def _kmeanspp(self, X, random_state):
    # Based on: https://en.wikipedia.org/wiki/K-means%2B%2B
    Xp = type(X)(X, shape=X.shape, dtype=X.dtype, copy=True) if sparse.issparse(X) else np.copy(X)
    # use an integer dtype that is valid for both the sparse and the dense case
    idx_dtype = Xp.indptr.dtype if sparse.issparse(Xp) else np.intp
    idx = random_state.randint(X.shape[0], size=(1,), dtype=idx_dtype)[0]
    # keep the centroid matrix 2-D in the dense case as well
    centroids = Xp[idx] if sparse.issparse(Xp) else Xp[idx].reshape(1, -1)
    Xp = self.delete_row_csr(Xp, idx) if sparse.issparse(Xp) else np.delete(Xp, idx, axis=0)

    while (centroids.shape[0] < self.n_clusters):
        clustering, distances = pairwise_distances_argmin_min(X=Xp, Y=centroids, metric='cosine')
        # Calculate weighted probability distribution
        d = np.power(distances, 2)
        p = d / d.sum()
        dist = rv_discrete(values=(np.arange(Xp.shape[0]), p), seed=random_state)
        # Choose next centroid
        idx = dist.rvs()
        centroids = sparse.vstack((centroids, Xp[idx])) if sparse.issparse(Xp) else np.concatenate((centroids, Xp[idx].reshape(1, -1)), axis=0)
        # Delete center from `Xp`
        Xp = self.delete_row_csr(Xp, idx) if sparse.issparse(Xp) else np.delete(Xp, idx, axis=0)

    return centroids
def closest_image(self, pixel):
    index, _ = pairwise_distances_argmin_min(pixel, self.rgb_means)
    img_path = self.file_paths[index[0]]
    img = io.imread(img_path)
    return img
def compute_data_labels(fname, dfilec, dfile, sensorref, sensor):
    """
    Computes the labels of the data using the centroids of the clusters in the file.
    The labels are relabeled according to the matching with the reference sensor.

    :param dfile:
    :param sensor:
    :return:
    """
    f = h5py.File(fname + '.hdf5', 'r')

    d = f[dfilec + '/' + sensor + '/Clustering/' + 'Centers']
    centers = d[()]
    d = f[dfile + '/' + sensor + '/' + 'PeaksResamplePCA']
    data = d[()]
    d = f[dfilec + '/' + sensorref + '/Clustering/' + 'Centers']
    centersref = d[()]
    f.close()

    # clabels, _ = pairwise_distances_argmin_min(centers, centersref)
    #
    # m = Munkres()
    # dist = euclidean_distances(centers, centersref)
    # indexes = m.compute(dist)
    # print(indexes)
    # print(clabels)

    labels, _ = pairwise_distances_argmin_min(data, centers)
    return labels  # [indexes[i][1] for i in labels]
def get_only_nug2_ruptures(scores):
    # get the indices, sorting the true
    sort_idx = np.argsort(scores.idx_true)
    true = np.array([scores.idx_true[i] for i in sort_idx]).reshape(-1, 1)
    pred = np.array(scores.idx_predicted).reshape(-1, 1)
    # not interested in anything with just two ruptures
    if (len(true) < 3 or len(pred) < 3):
        return scores
    # POST: something to do; at least 3 ruptures
    # pairwise_distances_argmin_min:
    # for each row in X (true), the index of the row of Y (pred) which
    # is closest (according to the specified distance).
    idx_closest_pred_to_true, _ = metrics.pairwise_distances_argmin_min(X=true, Y=pred)
    # only interested from the second to the next to last, since the 6 ruptures
    # are: alpha3D, NUG2 (4 of these), biotin/streptavidin
    logical_fec_slice = slice(1, -1, None)
    slice_true = sort_idx[logical_fec_slice]
    idx_we_want = idx_closest_pred_to_true[logical_fec_slice]
    pred_slice = lambda x: [x[i] for i in idx_we_want]
    true_slice = lambda x: [x[i] for i in slice_true]
    scores.ruptures_true = true_slice(scores.ruptures_true)
    scores.ruptures_predicted = pred_slice(scores.ruptures_predicted)
    # also update all the indices and such
    scores.true_x = true_slice(scores.true_x)
    scores.pred_x = pred_slice(scores.pred_x)
    scores.idx_true = true_slice(scores.idx_true)
    scores.idx_predicted = pred_slice(scores.idx_predicted)
    return scores
def finding_correct_centers(list_underlyings, scenarios, cluster_centers):
    # The indices in `scenarios` are the same as in `list_underlyings`.
    # In `scenarios_bis` several rows are removed, but the indices are unchanged,
    # so the row positions no longer match the indices!
    # `cluster_problem` lists all the clusters (between 0 and nclusters) whose centers must be changed.
    scenarios_bis = scenarios
    nclusters = len(cluster_centers)
    cluster_problem = range(nclusters)
    # Retrieve the list of centers in list_underlyings AND scenarios
    list_cluster_index, _ = pairwise_distances_argmin_min(cluster_centers, scenarios_bis.drop(['Underlying'], axis=1), metric='l2')
    while (len(cluster_problem) > 0):
        # Look for the centers that can be priced
        cluster_problem = test_pricing_centers(scenarios, list_underlyings, list_cluster_index, cluster_problem)
        # If there are still problems
        if (len(cluster_problem) > 0):
            # Start by removing the underlyings that cause a problem
            for ind in cluster_problem:
                index_centre = list_cluster_index[ind]
                name = str(list_underlyings['EliotName'][index_centre])
                scenarios_bis = scenarios_bis.drop([name])
            # Look for the underlyings closest to the centers of the affected clusters
            list_cluster_index, _ = pairwise_distances_argmin_min(cluster_centers, scenarios_bis.drop(['Underlying'], axis=1), metric='l2')
            list_cluster_index = new_cluster_index(scenarios, scenarios_bis, list_cluster_index)
    # --------------------------------------------------------------------------
    for i in range(nclusters):
        ind = list_cluster_index[i]
        index_centre_cluster = list_underlyings['Label'][ind]
        print("The priceable center of cluster {} is in cluster {}".format(i, index_centre_cluster))
    return list_cluster_index
def euclidean_rupture_spectrum_distance(self):
    safe_log = lambda x: np.log10(x) if x > 0 else -10
    spectrum_tuple = lambda x: [safe_log(x.loading_rate * 1e12), x.rupture_force * 1e12]
    all_tuples = lambda list_v: np.array([spectrum_tuple(x) for x in list_v])
    X = all_tuples(self.ruptures_true)
    Y = all_tuples(self.ruptures_predicted)
    # get the distances from x to y and from y to x
    if (len(Y) == 0 or len(X) == 0):
        dist_1 = []
        dist_2 = [sum(x**2) for x in X]
    else:
        _, dist_1 = metrics.pairwise_distances_argmin_min(X=X, Y=Y)
        _, dist_2 = metrics.pairwise_distances_argmin_min(X=Y, Y=X)
    all_distances = list(dist_1) + list(dist_2)
    return all_distances
def chunked(X, Y, axis=1, metric="euclidean", batch_size=500, **kwargs):
    """Return the argmin index and minimum distance along the selected axis.

    axis 0 is along X
    axis 1 is along Y
    """
    return pairwise_distances_argmin_min(X, Y, axis=axis, batch_size=batch_size,
                                         metric=metric, **kwargs)
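# A minimal usage sketch for the `chunked` wrapper above, with made-up toy data.
# Note: older scikit-learn releases accepted `batch_size` in
# pairwise_distances_argmin_min; newer releases dropped it in favour of the
# global sklearn.set_config(working_memory=...) setting, so treat this as a
# sketch for the older API.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(1000, 8)   # query points
Y_demo = rng.rand(20, 8)     # candidates, e.g. cluster centers
idx, dist = chunked(X_demo, Y_demo, axis=1, metric="euclidean", batch_size=200)
# idx[i] is the row of Y_demo closest to X_demo[i]; dist[i] is that distance.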
def kmeans(X):
    num_clusters = int(sys.argv[2])
    kmeans_model = KMeans(n_clusters=num_clusters)
    kmeans_model.fit(X)
    if sys.argv[3] == 'c':
        print(kmeans_model.cluster_centers_)
    else:
        closest, _ = pairwise_distances_argmin_min(kmeans_model.cluster_centers_, X)
        for point in closest:
            print(X[point])
def get_Kmeans(self):
    '''Set up Kmeans algorithm with arbitrary clusters'''
    k = 100
    vect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    X = vect.fit_transform(self.corpus)
    model = KMeans(k)
    model.fit(X)
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vect.get_feature_names()
    self.centroids = order_centroids
    self.model = model
    self.vect = vect
    return model, pairwise_distances_argmin_min(model.cluster_centers_, X, metric='cosine')
def __init__(self, IdxFrom, IdxTo):
    _, distActualToPred = pairwise_distances_argmin_min(IdxFrom, IdxTo)
    self.MeanToLabel = np.mean(distActualToPred)
    self.MedianToLabel = np.median(distActualToPred)
    self.MaxToLabel = np.max(distActualToPred)
    self.MinToLabel = np.min(distActualToPred)
    maxV = max(distActualToPred)
    cond = (np.abs(distActualToPred) > 0.5)
    numWrongByAtLeastOne = sum(cond)
    nBins = 10
    bins = np.linspace(start=0, stop=maxV, num=nBins, endpoint=True)
    self.histZeros = np.histogram(distActualToPred, bins=bins)
    nonZeroDistance = distActualToPred[np.where(cond)]
    self.histNoZeros = np.histogram(nonZeroDistance, bins=bins)
def compute_data_labels(dfilec, dfile, sensor):
    """
    Computes the labels of the data using the centroids of the clusters in the file

    :param dfile:
    :param sensor:
    :return:
    """
    f = h5py.File(datainfo.dpath + datainfo.name + ext + '.hdf5', 'r')

    d = f[dfilec + '/' + sensor + '/Clustering/' + 'Centers']
    centers = d[()]
    d = f[dfile + '/' + sensor + '/' + 'PeaksResamplePCA']
    data = d[()]

    labels, _ = pairwise_distances_argmin_min(data, centers)
    f.close()
    return labels
def compute_data_labels(fname, dfilec, dfile, sensor):
    """
    Computes the labels of the data using the centroids of the clusters in the file.
    The labels are relabeled according to the matching with the reference sensor.
    The association using the Hungarian algorithm is disabled, so the cluster
    indices are the original ones.

    :param dfile:
    :param sensor:
    :return:
    """
    f = h5py.File(datainfo.dpath + '/' + fname + '/' + fname + '.hdf5', 'r')

    d = f[dfilec + '/' + sensor + '/Clustering/' + 'Centers']
    centers = d[()]
    d = f[dfile + '/' + sensor + '/' + 'PeaksResamplePCA']
    data = d[()]
    f.close()

    labels, _ = pairwise_distances_argmin_min(data, centers)
    return labels
def compute_peaks_labels(self, f, dfile, sensor, nclusters, globalc=False, distances=False):
    """
    Computes the labels of the data using the centroids of the clusters in the first file

    :param nclusters:
    :param dfile:
    :param sensor:
    :return:
    """
    if globalc:
        d = f["All/" + sensor + "/Clustering/" + str(nclusters) + "/Centers"]
    else:
        d = f[self.datafiles[0] + "/" + sensor + "/Clustering/" + str(nclusters) + "/Centers"]
    centers = d[()]
    d = f[dfile + "/" + sensor + "/" + "PeaksResamplePCA"]
    data = d[()]
    labels, dist = pairwise_distances_argmin_min(data, centers)

    if distances:
        # return the computed distances, not the boolean flag
        return labels, dist
    else:
        return labels
def fit(self, X, y=None):
    random_state = check_random_state(self.random_state)
    X = self._check_fit_data(X)

    # Init CosineMeans
    if (isinstance(self.init, np.ndarray)):
        self.cluster_centers_ = self.init
    elif (self.init == 'random'):
        idx = random_state.randint(X.shape[0], size=(self.n_clusters,))
        self.cluster_centers_ = X[idx].A if sparse.issparse(X) else X[idx]
    elif (self.init == 'k-means++'):
        self.cluster_centers_ = self._kmeanspp(X=X, random_state=random_state)
    else:
        raise ValueError('Unknown param passed to `init`: {}. Allowed values are '
                         '"random", "k-means++" or an ndarray'.format(self.init))

    # Run CosineMeans
    centroids = np.zeros((self.n_clusters, X.shape[1]))  # sparse.csr_matrix((self.n_clusters, X.shape[1]))
    for _ in range(self.max_iter):
        clustering, distances = pairwise_distances_argmin_min(X=X, Y=self.cluster_centers_, metric='cosine')
        # http://stackoverflow.com/questions/29629821/sum-over-rows-in-scipy-sparse-csr-matrix
        # Todo: This really needs improvement
        for yi in np.unique(clustering):
            row_idx = np.where(clustering == yi)[0]
            if (sparse.issparse(X)):
                centroids[yi] = np.asarray(X[row_idx].multiply(1 / len(row_idx)).sum(axis=0))
            else:
                centroids[yi] = np.multiply(X[row_idx], 1 / len(row_idx)).sum(axis=0)
        # Convergence check
        if (np.all(np.abs(self.cluster_centers_ - centroids) < self.tol)):
            break
        self.cluster_centers_ = centroids

    self.cluster_centers_ = centroids
    self.labels_ = clustering
    return self
def upsample(test_indices, training_set_cluster_IDs, data, method='k-means', usecols=None):
    N_samples = test_indices.size + training_set_cluster_IDs.size
    assert N_samples == data.shape[0]

    full_set_cluster_IDs = np.zeros(N_samples, dtype=int)
    training_indices = np.setdiff1d(np.arange(N_samples), test_indices, True)
    full_set_cluster_IDs[training_indices] = training_set_cluster_IDs

    if usecols is not None:
        usecols = list(usecols)
        data = np.take(data, usecols, 1)

    training_data = np.delete(data, test_indices, axis=0)
    max_ID = np.amax(training_set_cluster_IDs)
    centroids = np.zeros((max_ID + 1, data.shape[1]), dtype=float)
    for cluster in range(max_ID + 1):  # `xrange` in the original; `range` for Python 3
        samples_in_cluster = np.where(training_set_cluster_IDs == cluster)[0]
        if method == 'hierarchical':
            centroids[cluster] = np.median(training_data[samples_in_cluster], axis=0)
        else:
            centroids[cluster] = training_data[samples_in_cluster].mean(axis=0)

    test_data = np.take(data, test_indices, axis=0)
    test_set_cluster_IDs, _ = pairwise_distances_argmin_min(
        test_data, centroids,
        metric='manhattan' if method == 'hierarchical' else 'euclidean')
    full_set_cluster_IDs[test_indices] = test_set_cluster_IDs
    return full_set_cluster_IDs
def geneticlabels(dataframe, centers):
    return pairwise_distances_argmin_min(dataframe, centers, metric='minkowski')
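# A small usage sketch for `geneticlabels` (illustrative data; assumes the
# dataframe columns line up with the columns of `centers`).
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.random.rand(50, 3), columns=['g1', 'g2', 'g3'])
centers = np.random.rand(4, 3)
labels, dists = geneticlabels(frame, centers)
# labels[i] is the index of the nearest center for row i of the dataframe,
# dists[i] the corresponding Minkowski distance.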
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# X = np.random.randn(10, 4)  # generate a 10 row, 4 column random number matrix
X = np.array([[1, 1, 1, 2], [5, 4, 5, 6], [2, 1, 1, 2], [6, 7, 6, 4], [8, 10, 9, 8],
              [10, 8, 9, 8], [1, 2, 3, 2], [3, 1, 2, 1], [9, 10, 7, 9], [9, 9, 7, 7]])

# clustering
print("X is: \n", X)
km = KMeans(n_clusters=3).fit(X)  # create 3 clusters

# closest gives, per cluster center, the closest data point: the first entry is the
# point closest to the first cluster center, the second to the second cluster, etc.
closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, X)
print("closest to each cluster: ", closest)

# sort and output the closest data points
centers = np.array(km.cluster_centers_)  # the center points
num_closest = 4  # number of closest points to cluster center
num_clusters = 3
print("\n...clustering into 3 clusters...")
dist = km.transform(X)
print("distance matrix: \n", dist)
print("\n")
for i in range(0, num_clusters):
    print("cluster ", i, ", center: ", centers[i])
    d = dist[:, i]
    print("d to cluster center", i, ":", d)
    # the original snippet is truncated here; a plausible completion is assumed:
    ind = np.argsort(d)[:num_closest]  # indices of the num_closest points to this center
    print("closest", num_closest, "points to cluster", i, ":\n", X[ind])
ref_point = np.squeeze(eval(d["DATA"]))
print(f"Ref point: {ref_point}")
pref.response = pd.DataFrame(
    np.atleast_2d(ref_point),
    columns=pref.content["dimensions_data"].columns,
)
color_point = pref.response.values
_, pref = evolver.iterate(pref)
objectives_ = evolver.population.objectives

### KMEANS
# fit to n_clusters and find the closest solutions to each cluster's centroid
kmeans = KMeans(n_clusters=n_clusters, verbose=0)
kmeans.fit(objectives_)
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, objectives_)
labels_kmeans = kmeans.labels_
print(labels_kmeans)

labelled_objectives = []
labelled_variables = []
for label_n in range(n_clusters):
    labelled_objectives.append(objectives_[labels_kmeans == label_n])
    labelled_variables.append(evolver.population.individuals[labels_kmeans == label_n])

objectives = objectives_[closest]
variables = evolver.population.individuals[closest]

### DBSCAN
print('=' * 40)
print('KMMCDUE-based ALGO')
print('=' * 40)
for al_iters in range(al_steps):
    t = time.time()
    # 1) get MCDUEs
    print('Starting AL iteration #', al_iters)
    mcdues = get_mcdues(X_pool_current)
    print('AL iteration #', al_iters, ': got MCDUEs')
    # 2) pick n_pick samples with top mcdues
    km_model = KMeans(n_clusters=sample_each_step, verbose=2)
    inds = np.argsort(mcdues)[::-1][::-1]
    km_model.fit(X_pool_current[inds[:int(0.1 * X_train_current.shape[0])]])  # KMeans on top 10%
    print('Fitted KMeans with', sample_each_step, 'clusters')
    inds, _ = pairwise_distances_argmin_min(km_model.cluster_centers_, X_pool_current)
    print(sample_each_step, 'samples picked')
    # 3) add them to the training set
    X_train_current = np.concatenate([X_train_current, X_pool_current[inds, :]])
    y_train_current = np.concatenate([y_train_current, y_pool_current[inds, :]])
    print('Added to training set, new sizes:', X_train_current.shape, y_train_current.shape)
    # 4) remove them from the pool
    X_pool_current = np.delete(X_pool_current, inds, axis=0)
    y_pool_current = np.delete(y_pool_current, inds, axis=0)
    print('Deleted from pool set, new sizes:', X_pool_current.shape, y_pool_current.shape)
    # 5) uptrain the NN
    prev_test_error = 1e+10
    sample_selection_time = time.time() - t
    t_big = time.time()
    t = time.time()
    for cnt in range(uptrain_epochs):
book = [sentence for sentence in book if len(sentence) > 20][100:200]  # truncate length
book_sequences = batch_sequence(book, dictionary, maxlen=maxlen)  # padding
encoded, attention = sess.run([model.get_thought, model.attention],
                              feed_dict={model.INPUT: book_sequences})

n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans = kmeans.fit(encoded)
avg = []
closest = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
print('. '.join([book[closest[idx]] for idx in ordering]))
print("*" * 100)

indices = np.argsort(attention.mean(axis=0))[::-1]
rev_dictionary = {v: k for k, v in dictionary.items()}
print([rev_dictionary[i] for i in indices[:10]])
def K_Means_Clustering(self):
    data = self.KNN_ser.iloc[:, 0:10]
    data['Mode'] = self.KNN_ser.iloc[:, -1].values
    print(data)
    car = data['Mode'] == 'Car'
    metro = data['Mode'] == 'Metro'
    bus = data['Mode'] == 'Bus'
    walking = data['Mode'] == 'Walking'
    still = data['Mode'] == 'Still'
    # print(data[still].iloc[0:900, 0:10])
    car = data[bus].iloc[:, 0:10]
    # print(car.iloc[:, 0:1].values)
    print(car[('Acc', 'f_1')].values)
    df = pd.DataFrame({
        'f1': car[('Acc', 'f_1')].values,
        'f2': car[('Acc', 'f_2')].values,
        'f3': car[('Acc', 'f_3')].values,
        'f4': car[('Acc', 'f_4')].values,
        'f5': car[('Acc', 'f_5')].values,
        'f6': car[('Acc', 'f_6')].values,
        'f7': car[('Acc', 'f_7')].values,
        'f8': car[('Acc', 'f_8')].values,
        'f9': car[('Acc', 'f_9')].values,
        'f10': car[('Acc', 'f_10')].values,
    })
    print(df)

    num_clusters = 1
    kmeans = KMeans(n_clusters=1).fit(df)
    centers = np.array(kmeans.cluster_centers_)
    m_clusters = kmeans.labels_.tolist()
    print(centers)
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, df)
    print(closest)

    # print(car.loc[901].values)
    # print(self.car_fft)
    # N = 501
    # T = 10.0 / 900.0
    # x = np.linspace(0.0, N*T, N)
    # xf = fftfreq(N, T)
    # xf = fftshift(xf)
    # yplot = fftshift(self.bus_fft[closest[0]])
    # yplot1 = fftshift(self.walking_fft[closest[1]])
    # plt.plot(xf, 1.0/N * np.abs(yplot))
    # plt.plot(xf, 1.0/N * np.abs(yplot1))
    # plt.show()

    closest_data = []
    for i in range(num_clusters):
        center_vec = centers[i]
        data_idx_within_i_cluster = [idx for idx, clu_num in enumerate(m_clusters) if clu_num == i]

        one_cluster_tf_matrix = np.zeros((len(data_idx_within_i_cluster), centers.shape[1]))
        for row_num, data_idx in enumerate(data_idx_within_i_cluster):
            one_row = df.values[data_idx]  # the original referenced an undefined `tf_matrix`; the local feature frame is used here
            one_cluster_tf_matrix[row_num] = one_row

        closest, _ = pairwise_distances_argmin_min(center_vec.reshape(1, -1), one_cluster_tf_matrix)
        closest_idx_in_one_cluster_tf_matrix = closest[0]
        closest_data_row_num = data_idx_within_i_cluster[closest_idx_in_one_cluster_tf_matrix]
        data_id = df.index[closest_data_row_num]  # the original referenced an undefined `all_data` list

        closest_data.append(data_id)

    closest_data = list(set(closest_data))
    assert len(closest_data) == num_clusters

    d = kmeans.transform(car['Acc'])[:, 1]
    ind = np.argsort(df)[::-1][:50]
    print(ind)

    plt.scatter(df['car'], df['metro'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
    plt.scatter(centers[:, 0], centers[:, 1], c='red', s=50)
    plt.show()
print(n_clusters)
kmeans = KMeans(n_clusters=n_clusters)
kmeans = kmeans.fit(encoded)

# Step-6: Summarization
# The candidate sentence is chosen to be the sentence whose vector representation
# is closest to the cluster center.
from sklearn.metrics import pairwise_distances_argmin_min

avg = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
    print('INSIDE THE FOR LOOP')
    print(idx)
    print(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)  # computes the smallest distance
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
summary = ' '.join([sentences[closest[idx]] for idx in ordering])
print([email[closest[1]]])
print('ordering')
print(ordering)
print('closest')
print(closest)
print('closest')
print(closest, )
print(summary)
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics.pairwise import pairwise_distances_argmin

data_x = data_2020[[
    'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
    'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'
]]

# need to decide n_clusters
# Elbow method
WSS = {}
for i in range(2, 30):
    k_means = KMeans(init='k-means++', n_clusters=i, n_init=10)
    k_means.fit(data_x)
    k_means_labels, k_means_distance = metrics.pairwise_distances_argmin_min(
        data_x, k_means.cluster_centers_)
    WSS[i] = np.sum(np.sqrt(k_means_distance))
WSS_pd = pd.DataFrame(WSS.values(), index=WSS.keys(), columns=['WSS'])
WSS_pd.plot()
# no clean answer

from sklearn.metrics import silhouette_score
sil = {}
for i in range(2, 30):
    k_means = KMeans(init='k-means++', n_clusters=i, n_init=10)
    k_means.fit(data_x)
    k_means_labels, k_means_distance = metrics.pairwise_distances_argmin_min(
        data_x, k_means.cluster_centers_)
    sil[i] = silhouette_score(data_x, k_means_labels, metric='euclidean')
sil_pd = pd.DataFrame(sil.values(), index=sil.keys(), columns=['silhouette'])
sil_pd.plot()
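# Side note (a shorter sketch, not taken from the snippet above): the conventional
# elbow curve uses the within-cluster sum of squared distances, which KMeans
# already exposes as `inertia_`, so the extra pairwise_distances_argmin_min call
# can be skipped; the loop above sums the square roots of the distances instead.
WSS_inertia = {}
for i in range(2, 30):
    km = KMeans(init='k-means++', n_clusters=i, n_init=10).fit(data_x)
    WSS_inertia[i] = km.inertia_  # sum of squared distances to the closest center
pd.DataFrame(WSS_inertia.values(), index=WSS_inertia.keys(), columns=['WSS']).plot()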
else:
    how_many_summaries = 500
    summary = [None] * how_many_summaries
    for rv in range(how_many_summaries):
        review = df['sent_tokens'].iloc[rv]
        enc_email = get_sent_embedding(review)
        if (len(enc_email) > 0):
            n_clusters = int(np.ceil(len(enc_email)**0.5))
            kmeans = KMeans(n_clusters=n_clusters, random_state=0)
            kmeans = kmeans.fit(enc_email)
            avg = []
            closest = []
            for j in range(n_clusters):
                idx = np.where(kmeans.labels_ == j)[0]
                avg.append(np.mean(idx))
            closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                       enc_email)
            ordering = sorted(range(n_clusters), key=lambda k: avg[k])
            summary[rv] = ' '.join([review[closest[idx]] for idx in ordering])
        else:
            print("This is not a valid review")

    if (cmdline):
        print(f'{summary}')
    else:
        df_500 = df.iloc[:how_many_summaries]
        print(df_500.head())
        df_500['PredictedSummary'] = summary
        df_500[['Text', 'PredictedSummary']].to_csv('top_500_summary.csv')
def active_cluster_svm_margin(foldname): twenty_train_data = getattr(prepare_data, foldname + '_train_data') twenty_train_target = getattr(prepare_data, foldname + '_train_target') twenty_test_data = getattr(prepare_data, foldname + '_test_data') twenty_test_target = getattr(prepare_data, foldname + '_test_target') #baseline active learning solution alpha = 20 #initial training set betha = int(len(twenty_train_data) / alpha) - 2 #number of iterations gamma = 20 #sampling volume tfidf_transformer = Pipeline([ ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()) ]) #try to implement silhouette analysis for number of clusters #cluster = AgglomerativeClustering(n_clusters=20,affinity='cosine', linkage='complete') cluster = KMeans(n_clusters=20) unlabeled_train_data = twenty_train_data unlabeled_train_target = twenty_train_target #print 'start transforming' unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data) #print 'start fitting' #print datetime.now() res = cluster.fit_predict(unlabeled_matrix) #print datetime.now() #print 'clustering result' #print OrderedDict(Counter(res)) #print res.shape closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, unlabeled_matrix, metric='cosine') #print closest ''' results = defaultdict(list) for idx, val in enumerate(res): results[val].append(idx) take_idx = [] for cluster_num in range(0, 20): idxset = results[cluster_num] ''' #create labeled and unlabeled training set #labeled_train_data = twenty_train_data[: alpha] #labeled_train_target = twenty_train_target[: alpha] #unlabeled_train_data = twenty_train_data[alpha:] #unlabeled_train_target = twenty_train_target[alpha:] labeled_train_data = [] labeled_train_target = [] labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, closest) #print labeled_train_data.shape baseline_active_clf = Pipeline([ ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LinearSVC()) ]) baseline_active_clf.fit(labeled_train_data, labeled_train_target) predicted = baseline_active_clf.predict(twenty_test_data) score = f1_score(twenty_test_target, predicted, average='macro') #print 'active cluster svm margin solution' scores = baseline_active_clf.decision_function(unlabeled_train_data) prob = np.divide(1, np.add(1, np.exp(np.multiply(np.array(scores), -1)))) diploma_res_print(foldname, len(labeled_train_data), score, np.amax(prob)) for t in range(1, betha): #to do use labeled dataset to train sigmoid #f1 for labeled set #pred_lab = baseline_active_clf.predict(labeled_train_data) #print 'f1 score for labeled:', f1_score(labeled_train_target, pred_lab, average='macro') #count p1 p2 p3 p4 ''' def count_p(arr): p1 = arr.min() p4 = arr.max() sorted_arr = sorted(arr) a1 = [i for i in sorted_arr if i < 0] a2 = [i for i in sorted_arr if i > 0] p2 = -100500 p3 = +100500 if len(a1) > 0: p2 = max(a1) if len(a2) > 0: p3 = min(a2) return [p1, p2, p3, p4] #prom_arr = [] norm_scores = LA.norm(scores) n_scores = np.divide(scores, norm_scores) ''' ''' plus_norm = 0 min_norm = 0 for line in scores: for elem in line: if (elem > 0): plus_norm += elem ** 2 else: min_norm += elem ** 2 plus_norm = math.sqrt(plus_norm) min_norm = math.sqrt(min_norm) n_scores = np.array(scores) for i in range(0, len(n_scores)): for j in range(0, len(n_scores[i])): if (n_scores[i][j] > 0): n_scores[i][j] = n_scores[i][j] / plus_norm else: n_scores[i][j] = n_scores[i][j] / 
min_norm ''' ''' #print n_scores prom_arr = [] for lin in range(0, len(n_scores)): prom_arr.append(count_p(n_scores[lin])) t_prom_arr = np.transpose(np.array(prom_arr)) #print t_prom_arr #p1 = np.amin(t_prom_arr[0]) #p2 = np.amax(t_prom_arr[1]) #p3 = np.amin(t_prom_arr[2]) #p4 = np.amax(t_prom_arr[3]) #print 'p1:', p1, 'p2:', p2, 'p3:', p3, 'p4:', p4 ''' #prob = np.divide(1, np.add(1, np.exp(np.multiply(np.array(n_scores), -1)))) #print 'norm matrix min proba:', np.amin(prob), 'norm matrix max proba:', np.amax(prob) doc_score = {} for i in range(0, len(unlabeled_train_data)): last_elems = (sorted(scores[i]))[-2:] doc_score[i] = np.abs(last_elems[0] - last_elems[1]) sorted_doc_score = sorted(doc_score.items(), key=operator.itemgetter(1)) #print 'sorted doc score minimum active cluster svm margin', sorted_doc_score[0] sample_numbers = [] for i in range(0, gamma): sample_numbers = sample_numbers + [sorted_doc_score[i][0]] labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, sample_numbers) baseline_active_clf.fit(labeled_train_data, labeled_train_target) predicted = baseline_active_clf.predict(twenty_test_data) score = f1_score(twenty_test_target, predicted, average='macro') scores = baseline_active_clf.decision_function(unlabeled_train_data) prob = np.divide(1, np.add(1, np.exp(np.multiply(np.array(scores), -1)))) #print 'min proba:', np.amin(prob), 'max proba:', np.amax(prob) diploma_res_print(foldname, len(labeled_train_data), score, np.amax(prob))
print(fila["LSQN"], fila["Grupo"]) if fila["Grupo"] == 6: print(fila["LSQN"], fila["Grupo"]) #Generating table with # of clients and their respective group labels = kmeans.predict(datos_reelevantes) colores = ['red', 'green', 'blue', 'cyan', 'yellow', 'pink', 'black'] copia = pd.DataFrame() copia['LSQN'] = datos_cargados['LSQN'].values copia['OCLTV'] = datos_cargados['OCLTV'].values copia['label'] = labels cantidadGrupo = pd.DataFrame() cantidadGrupo['color'] = colores cantidadGrupo['cantidad'] = copia.groupby('label').size() #cantidadGrupo #Generate table,in notebook #We see the representative of the group, the user close to his centroid cercanos, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, datos_reelevantes) cercanos #We look for the id of the closest clients usuarios = datos_cargados['LSQN'].values for fila in cercanos: print(usuarios[fila]) #Now we associate the centroid with its id passing as a parameter the list of centroids labels = kmeans.predict(datos_reelevantes) colores = ['red', 'green', 'blue', 'cyan', 'yellow', 'pink', 'black'] copia = pd.DataFrame() copia['LSQN'] = datos_cargados['LSQN'].values copia['OCLTV'] = datos_cargados['OCLTV'].values copia['Grupo'] = labels cantidadGrupo = pd.DataFrame() cantidadGrupo['color'] = colores cantidadGrupo['cantidad'] = copia.groupby('Grupo').size()
        sentence_vec += w2v.wv[word]
    X.append(sentence_vec)

## ======== clustering Kmean =========
n_clusters_kmean = Number_line
kmeans = KMeans(n_clusters=n_clusters_kmean)
kmeans = kmeans.fit(X)
print(kmeans.labels_)

# ======== Determining the closest point to the center
avg = []
for j in range(n_clusters_kmean):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
ordering = sorted(range(n_clusters_kmean), key=lambda k: avg[k])
summary = ' '.join([sentences[closest[idx]] for idx in ordering])
print("\n*** Using Kmean clustering:\n")
print(summary)

## ======== hierarchical clustering =========
n_clusters_hierarchy = Number_line  # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters_hierarchy)
ward = ward.fit(X)
print(ward.labels_)

# ======== Determining the closest point to the center of each cluster
X_Clusters = []
idx_cluster = []
def run_umap(X=None, y=None, method='unsupervised', scaler=None, neighbor=10, dist=0.1,
             metric='correlation', color_code=None, annotate_names=None, annotate=False,
             test_set=True, title=None, savefig_path=False, X_test=None, y_test=None,
             color_code_test=None, plot=True):
    reducer = umap.UMAP(
        n_components=dimension,
        n_neighbors=neighbor,
        min_dist=dist,
        metric=metric,
        random_state=seed_value
    )  # , TSNE(n_components=k, random_state=seed_value), PCA(n_components=k, random_state=seed_value)]
    reducer_name = 'umap'  # , 'tsne', 'pca']

    pipeline = Pipeline([
        ('normalization', scaler),
        ('reducer', reducer),
    ])

    y_encoded = LabelEncoder().fit_transform(y)

    if method == 'supervised':
        X_reduced = pipeline.fit_transform(X, y_encoded)
    elif method == 'metric_learning':
        X_reduced = pipeline.fit_transform(X, y_encoded)
        X_reduced_test = pipeline.transform(X_test)
    elif method == 'unsupervised':
        X_reduced = pipeline.fit_transform(X)

    print('running kmeans...')
    # Set k to amount of subreddits
    k = len(np.unique(y))
    # Fit kmeans
    km = KMeans(n_clusters=k, random_state=seed_value).fit(X_reduced)
    # Obtain euclidean distance between centroids
    centers = km.cluster_centers_
    # find centroid labels
    closest, _ = pairwise_distances_argmin_min(centers, X_reduced)
    data = pd.DataFrame(X_reduced, columns=['x1', 'x2'])
    data['label'] = y
    centers_labels = list(data.loc[closest].label)

    # Plot in 2D
    if plot:
        assert dimension == 2
        if method == 'metric_learning':
            # train: first time point
            scatter_plot(X_reduced, y, color_code, method, annotate=annotate,
                         title='First time step (train set)', savefig_path=savefig_path)
            # test: next time points
            scatter_plot(X_reduced_test, y_test, color_code_test, method, annotate=annotate,
                         title=title, savefig_path=savefig_path)
        else:
            scatter_plot(X_reduced, y, color_code, method, annotate=annotate, title=title,
                         savefig_path=savefig_path, centers=centers)

    if method == 'metric_learning':
        return X_reduced, X_reduced_test
    else:
        return X_reduced, centers, centers_labels
def active_cluster_svm_margin_cluster():
    # baseline active learning solution
    alpha = 20   # initial training set
    betha = 600  # number of iterations
    gamma = 20   # sampling volume
    tfidf_transformer = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer())
    ])
    # try to implement silhouette analysis for number of clusters
    # cluster = AgglomerativeClustering(n_clusters=20, affinity='cosine', linkage='complete')
    cluster = KMeans(n_clusters=20)
    unlabeled_train_data = twenty_train_data
    unlabeled_train_target = twenty_train_target
    print('start transforming')
    unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)
    print('start fitting')
    print(datetime.now())
    res = cluster.fit_predict(unlabeled_matrix)
    print(datetime.now())
    print('clustering result')
    print(OrderedDict(Counter(res)))
    print(res.shape)
    closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, unlabeled_matrix, metric='cosine')
    print(closest)
    '''
    results = defaultdict(list)
    for idx, val in enumerate(res):
        results[val].append(idx)
    take_idx = []
    for cluster_num in range(0, 20):
        idxset = results[cluster_num]
    '''
    # create labeled and unlabeled training set
    # labeled_train_data = twenty_train_data[: alpha]
    # labeled_train_target = twenty_train_target[: alpha]
    # unlabeled_train_data = twenty_train_data[alpha:]
    # unlabeled_train_target = twenty_train_target[alpha:]
    labeled_train_data = []
    labeled_train_target = []
    labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(
        labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, closest)
    print(labeled_train_data.shape)
    baseline_active_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC())
    ])
    baseline_active_clf.fit(labeled_train_data, labeled_train_target)
    predicted = baseline_active_clf.predict(twenty_test_data)
    score = f1_score(twenty_test_target, predicted, average='macro')
    print('active cluster svm margin cluster solution')
    diploma_res_print(len(labeled_train_data), score)

    for t in range(1, betha):
        sample_numbers = np.array([])
        # to do: use labeled dataset to train sigmoid
        scores = baseline_active_clf.decision_function(unlabeled_train_data)
        doc_score = {}
        for i in range(0, len(unlabeled_train_data)):
            last_elems = (sorted(scores[i]))[-2:]
            doc_score[i] = np.abs(last_elems[0] - last_elems[1])
        sorted_doc_score = sorted(doc_score.items(), key=operator.itemgetter(1))
        print('sorted doc score minimum active cluster svn margin cluster', sorted_doc_score[0])
        if (t % 2) == 0:
            sample_numbers = np.array([])
            # to add
            for i in range(0, gamma):
                sample_numbers = np.append(sample_numbers, sorted_doc_score[i][0])
        else:
            unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)
            print(datetime.now())
            res = cluster.fit_predict(unlabeled_matrix)
            print(datetime.now())
            sample_numbers, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, unlabeled_matrix, metric='cosine')
            print(sample_numbers)
        labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(
            labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, sample_numbers)
        baseline_active_clf.fit(labeled_train_data, labeled_train_target)
        predicted = baseline_active_clf.predict(twenty_test_data)
        score = f1_score(twenty_test_target, predicted, average='macro')
        diploma_res_print(len(labeled_train_data), score)
def predict(self, X):
    clustering, _ = pairwise_distances_argmin_min(X=X, Y=self.cluster_centers_, metric='cosine')
    self.labels_ = clustering
    return self.labels_
def closest_n_index(X, n_c=10):
    kmeans = KMeans(n_clusters=n_c, random_state=0).fit(X)
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
    return closest, kmeans.labels_
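# A minimal usage sketch for `closest_n_index` (random data for illustration).
import numpy as np

X_demo = np.random.RandomState(0).rand(200, 5)
closest, labels = closest_n_index(X_demo, n_c=10)
# `closest` holds, for each of the 10 centroids, the index of the sample
# nearest to it; `labels` holds the cluster assignment of every sample.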
def execute(self, context): try: mesh = bpy.data.objects[self.currentobject] except: mesh = context.active_object if (mesh is not None): only_gisif_colors, k, gisif_name = getGISIFColorsInner( context, mesh) only_gisif_colors = only_gisif_colors.reshape( only_gisif_colors.shape[0], 1) #Normalize the gisif colors only_gisif_colors = only_gisif_colors / np.sqrt( np.sum(only_gisif_colors**2)) # k1_list, k2_list, sx, p1_list, p2_list, mean_list, gaussian_list, normals = need_curvatures(mesh); # features = np.hstack((normals, k1_list.reshape(k1_list.shape[0],1), k2_list.reshape(k2_list.shape[0],1), p1_list, p2_list, only_gisif_colors.reshape(only_gisif_colors.shape[0],1))); # mu, transformedFeatures = pcaTransform(context, mesh, features, K=12); gisif_colors = only_gisif_colors gisif_colors = StandardScaler().fit_transform(gisif_colors) count_n = mesh.gisif_markers_n gmm = GaussianMixture(n_components=count_n, covariance_type='full').fit(gisif_colors) labels_gmm = gmm.predict(gisif_colors) labels_gmm.shape = (labels_gmm.shape[0], 1) # gmm_sorted_indices = np.argsort(gmm.means_.T).flatten(); # gmm_sorted_values = np.sort(gmm.means_.T).flatten(); gmm_sorted_indices = np.array([i for i in range(count_n)]) gmm_sorted_values = gmm.means_ print(gmm.means_, gmm_sorted_indices) keyindices = [] print('=' * 40) for i in range(count_n): gmm_label_index = gmm_sorted_indices[i] gmm_value = gmm_sorted_values[gmm_label_index] gmm_subset, __ = np.where(labels_gmm == gmm_label_index) cluster_values = gisif_colors[gmm_subset] print(gmm_value, gmm_value.shape, cluster_values.shape) closest, __ = pairwise_distances_argmin_min( gmm_value.reshape(1, -1), cluster_values) closest_index = gmm_subset[closest[0]] closest_value = gisif_colors[closest_index] keyindices.append(closest_index) print('-----------------') # print('GMM VALUES (Mean: %f, Closest: %f, Closest Index: %d, In Subset Value: %f, In Subset Index: %d) ::: '%(gmm_value, closest_value, closest_index, cluster_values[closest[0]], closest[0])); faces = getMeshFaces(mesh) for vid in keyindices: uvw = [0.0, 0.0, 0.0] faces_rows, faces_column = np.where(faces == vid) face_row_index, face_column_index = faces_rows[ 0], faces_column[0] face_row = faces[face_row_index] uvw[face_column_index] = 1.0 vid1, vid2, vid3 = face_row.tolist() print(vid1, vid2, vid3) co = mesh.data.vertices[face_row[face_column_index]].co addConstraint(context, mesh, uvw, [vid1, vid2, vid3], co, faceindex=face_row_index, create_visual_landmarks=False) if (mesh.gisif_symmetries): print('~' * 40) for o_vid in keyindices: #EQuation 10 in the paper for finding the symmetry points where the euclidean distance will be zero for symmetry delta_gisif_colors = np.sqrt( (only_gisif_colors[o_vid] - only_gisif_colors)**2) # delta_gisif_colors[o_vid] = np.finfo(float).max; vidrows, __ = np.where(delta_gisif_colors == 0.0) print(delta_gisif_colors[vidrows]) print(vidrows) filtered_vid_values = delta_gisif_colors[vidrows] vid = vidrows[filtered_vid_values.argmin()] print(o_vid, vid) uvw = [0.0, 0.0, 0.0] faces_rows, faces_column = np.where(faces == vid) face_row_index, face_column_index = faces_rows[ 0], faces_column[0] face_row = faces[face_row_index] uvw[face_column_index] = 1.0 vid1, vid2, vid3 = face_row.tolist() print(vid1, vid2, vid3) co = mesh.data.vertices[face_row[face_column_index]].co addConstraint(context, mesh, uvw, [vid1, vid2, vid3], co, faceindex=face_row_index, create_visual_landmarks=False) # bpy.ops.genericlandmarks.createlandmarks('EXEC_DEFAULT', currentobject=mesh.name, updatepositions 
= True); bpy.ops.genericlandmarks.changelandmarks('EXEC_DEFAULT', currentobject=mesh.name) return {'FINISHED'}
def __innerQmc(outPath='./', path='Models/', FILE_DOT_OUT='analysis.out', CSV_NAME='model.csv', MAX_CLUSTERS=10, PER_CONNECT=0.5, SIL_DAMPING=0.1, NORM_METHOD='StandardScaler', clustering_names=[ 'AffinityPropagation', 'DBSCAN', 'KMeans', 'MeanShift', 'SpectralClustering', 'Ward' ], modellerScores=['molpdf', 'DOPE', 'DOPEHR', 'GA341', 'NDOPE'], molprobityScores=['outlier', 'allowed'], theColors='bgrcmykbgrcmykbgrcmykbgrcmyk', saveFig=False, molprobity=False): ''' The Quality-Models Clusterizer private method, its performs the analysis, call the pther methods and evaluate the dataset. PARAMETERS ---------- outPath : string (Default = ./ ) The path to save the csv and data analysis. path : string (Default = ./Models/ ) The path of Molprobity pdf. (All files must be on same folder and its names MUST be on modeller output file!). FILE_DOT_OUT : string (default = analysis.out) Name of output file. CSV_NAME : string (default = analysis.out) Name of .csv file with data from Modeller and Molprobity outputs MAX_CLUSTERS : int (default = 10) Maximum number of clusters for k-dependent methods. PER_CONNECT : double (default = 0.5) Percentage of the data size used as number of neighbors for Ward. SIL_DAMPING : double (default = 0.1) Minimum percentage of silhouette number to be considered as actual increasing. NORM_METHOD : string (default = StandardScaler) Method for normilize the data. Options : {'StandardScaler', 'MinMax'} saveFig : Boolean (default = False) Save a figure of all cluster results Yes (True)/No (False). clustering_names : List[string] (default = ['AffinityPropagation', 'DBSCAN', 'KMeans', 'MeanShift', 'SpectralClustering', 'Ward']) List of Method names. Supported methods are: KMeans, AddinityPropagation, MeanShift, SpecrtalClustering, Ward, DBSCAN. modellerScores: List[string] (default = ['molpdf', 'DOPE', 'DOPEHR', 'GA341', 'NDOPE']) List of Modeller attributes to evaluate. Options : {'molpdf', 'DOPE', 'DOPEHR', 'GA341', 'NDOPE'} molprobityScores: List[string] (default = ['outlier', 'allowed']) List of Molprobity attributes to evaluate. Options : {'outlier', 'allowed'} theColors : string (default = bgrcmykbgrcmykbgrcmykbgrcmyk) A stirng which each letter is a matplotlib color. 
(b : blue; g : green; r : red; c : cyan; m : magenta; y : yellow; k : black; w : white) RETURNS ------- ''' ########################################## PREPARING DATA ################################################ log.info('\n\n\t\tQuality-Models Clusterizer\n\n') if not modellerScores or not any( x in ['molpdf', 'DOPE', 'DOPEHR', 'GA341', 'NDOPE'] for x in modellerScores): log.error( "modellerScores list has no valid value or its empty.\nValid values are: molpdf, DOPE, DOPEHR, GA341, NDOPE\n\nABORTING EXECUTION" ) exit() if not molprobityScores or not any(x in ['outlier', 'allowed'] for x in molprobityScores): log.error( "molprobityScores list has no valid value or its empty.\nValid values are: outlier, allowed\n\nABORTING EXECUTION" ) exit() if molprobity: os.system('mkdir Modelos') log.info('####### Preparing data...') t0 = time.time() clustering_names.sort() # colors used after on the plot colors = np.array([x for x in theColors]) colors = np.hstack([colors] * 20) plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5)) plt.subplots_adjust(left=.05, right=.98, bottom=.1, top=.96, wspace=.2, hspace=.2) plot_num = 1 D = [] with open(FILE_DOT_OUT, 'r') as content: c = content.read() c_list = c.split('>>')[1].split('--------')[-1].strip().split('\n') for line in c_list: v = line.split() pdb, var = v[0], v[1::] rt = pdb.split('.pdb')[0] if bool(re.match('^[-]+$', rt)): continue pdf = path + rt + '.pdf' var = [float(i) for i in var] #print(pdf) # This code should be uncommented when you have not already generated the 'MolProbity Ramachandran analysis' for the pdb files to be analyzed. # It's necessary to install molprobity to run it. if molprobity: os.system( 'java -Xmx256m -cp /home/medina/Documentos/Paper_Agrupamento_Proteinas/molprobity/lib/chiropraxis.jar chiropraxis.rotarama.Ramalyze -pdf ' + pdb + ' ' + pdf) os.system('mv *.pdf ./Modelos') aux_path = './Modelos/' + rt + '.pdf' d = dict() gen = do_grep(aux_path, 'allowed') outputs = [output for output in gen] if 'allowed' in molprobityScores: try: d['allowed'] = float( re.sub( ',', '.', outputs[0].split('%')[0].split('\'')[1].strip())) except: d['allowed'] = 0 #s = os.popen('pdfgrep allowed '+pdf).read() #p = float(re.sub(',','.',s.split('%')[0].strip())) #s = os.popen('pdfgrep outliers '+pdf).read() gen = do_grep(aux_path, 'outliers') outputs = [output for output in gen] if 'outlier' in molprobityScores: try: d['outlier'] = int(outputs[0].split('outliers')[0].split( 'were')[-1].strip()) except: d['outlier'] = 0 d['pdb'] = rt if 'molpdf' in modellerScores: d['molpdf'] = var[0] if 'DOPE' in modellerScores: d['DOPE'] = var[1] if 'DOPEHR' in modellerScores: d['DOPEHR'] = var[2] #if 'GA341' in modellerScores: # d['GA341' ] = var[3] if 'NDOPE' in modellerScores: d['NDOPE'] = var[4] D.append(d) D = pd.DataFrame(D) # Find uniform columns # nunique = D.apply(pd.Series.nunique) # cols_to_drop = nunique[nunique == 1].index # D.drop(cols_to_drop, axis=1) # Create a csv with data D.to_csv(path + CSV_NAME, index=False) # Create a csv with data #aux = pd.read_csv(path + CSV_NAME) # Concatenate scores listOfAtrr = modellerScores + molprobityScores allowedScores = ['molpdf', 'DOPE', 'DOPEHR', 'NDOPE', 'outlier', 'allowed'] # Remove uniform columns # for dropThis in cols_to_drop: # #print(dropThis) # listOfAtrr.remove(dropThis) # allowedScores.remove(dropThis) #print(allowedScores) # Remove not allowed values listOfAtrr = list(filter(lambda i: i in allowedScores, listOfAtrr)) #print(listOfAtrr) X = D[listOfAtrr] #print(X) pdb_names = D['pdb'] dt = 
np.asarray(X) #print(dt) if NORM_METHOD == 'MinMax': # normalize the data in the space of 0 to 1 for i in range(len(dt[0])): # If column is uniform discard it #if np.all(dt[0:i] == dt[:i], axis=1): # dt = np.delete(dt, i, axis=1) # #dt = np.delete(dt, i, axis=2) # continue if sum(dt[:, i]) != 0: #print("\n\nCOLUNA MM: " + str(i)) #print("\nDIVISOR DO MINMAX: " + str(abs(dt[:, i]).max())) dt[:, i] = (dt[:, i] / abs(dt[:, i]).max())**2 #print(dt[:, i]) else: if NORM_METHOD != 'StandardScaler': log.warn( "NORM_METHOD must be either MinMax or StandardScaler, running as StandardScaler, since it is the default method" ) # normalize the data with mean 0 and stf 1 for i in range(len(dt[0])): mean_c = np.mean(dt[:, i]) std_c = np.std(dt[:, i]) #print("\n\nCOLUNA SS: " + str(i)) #print("\nMEDIA CALC: " + str(mean_c)) #print("\nDESVIO CALC: " + str(std_c)) if std_c < 1e-4: std_c = 1 dt[:, i] = ((dt[:, i] - mean_c) / std_c) #print(dt[:, i]) #print(dt) # run PCA for the normalized data pca = PCA(n_components=2) print("\nAntes do PCA\n") #print(X) print(D[listOfAtrr]) X = pca.fit(dt).transform(dt) print("\nDepois do PCA\n") print(X) # PCA process results results = pca.components_ print("\nResultados PCA: " + str(results)) covm = pca.explained_variance_ratio_ print("\nVariance PCA: " + str(covm)) if not os.path.exists('./../' + NORM_METHOD + '_pca_results.txt'): f = open('./../' + NORM_METHOD + '_pca_results.txt', 'w') head_line = 'pbd' for c in range(2): for at in allowedScores: head_line = head_line + ', ' + at + '_coor' + str(c + 1) head_line = head_line + ', coef_var_coor1, coef_var_coor2\n' print("HEAD LINE PCA: " + head_line) f.write(head_line) f.close() f = open('./../' + NORM_METHOD + '_pca_results.txt', 'a+') f.write( rt.split('.')[0] + ', ' + str([*results[0], *results[1], *covm])[1:-1] + '\n') f.close() #f = open('./../' + NORM_METHOD + '_corr_mtx.txt', 'a+') corr_mtx = pd.DataFrame(X).corr() corr_mtxd = pd.DataFrame(dt).corr() print("\nCorrelation Matriz: \n") print(corr_mtx) print(corr_mtxd) #f.close() # connectivity matrix for structured Ward n_neig = int(len(X) * PER_CONNECT) connectivity = kneighbors_graph(X, n_neighbors=n_neig, include_self=True) # make connectivity symmetric affinity = 'euclidean' connectivity = 0.5 * (connectivity + connectivity.T) connectivity, n_components = cluster.hierarchical._fix_connectivity( X, connectivity, affinity) # define cutoff for DBSCAN if NORM_METHOD == 'MinMax': n_eps = 0.1 else: if NORM_METHOD != 'StandardScaler': log.warn( "NORM_METHOD must be either MinMax or StandardScaler, running as StandardScaler, since it is the default method" ) n_eps = 2 * 2.57 * 0.05 t1 = time.time() log.info('\tTime spended (preparing data): %f s' % (t1 - t0)) ########################################## METHODS DEFINITION ############################################## #clustering_names = ['AffinityPropagation', 'DBSCAN', 'KMeans', 'MeanShift', 'SpectralClustering', 'Ward'] log.info('\n####### Defining clustering methods...') t0 = time.time() # create clustering estimators clustering_algorithms = [] if 'AffinityPropagation' in clustering_names: try: affinity_propagation = cluster.AffinityPropagation( damping=0.9) #,preference=-1) clustering_algorithms.append(affinity_propagation) except Exception as e: log.warn( "Problems were found while running Affinity Propagation clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) print( "Problems were found while running Affinity Propagation clustering algorithm for " + NORM_METHOD + 
" normalization, skipping its execution.\nProblem: " + str(e)) if 'DBSCAN' in clustering_names: try: dbscan = cluster.DBSCAN(eps=n_eps, min_samples=5, algorithm='kd_tree', metric='euclidean') clustering_algorithms.append(dbscan) except Exception as e: log.warn( "Problems were found while running DBSCAN clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) print( "Problems were found while running DBSCAN clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) if 'KMeans' in clustering_names: log.info('\n\t(K-means) Searching best k-number... ') try: k, _, _ = __best_k_of_clusters('KMeans', X, MAX_CLUSTERS) log.info('\tk_best = ' + str(k)) two_means = cluster.KMeans(n_clusters=k, init='k-means++') clustering_algorithms.append(two_means) except Exception as e: log.warn( "Problems were found while running KMeans clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) print( "Problems were found while running KMeans clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) if 'MeanShift' in clustering_names: try: ms = cluster.MeanShift() clustering_algorithms.append(ms) except Exception as e: log.warn( "Problems were found while running MeanShift clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) print( "Problems were found while running MeanShift clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) if 'SpectralClustering' in clustering_names: log.info('\n\t(Spectral) Searching best k-number... ') try: k, _, _ = __best_k_of_clusters('SpectralClustering', X, MAX_CLUSTERS) #print(k) log.info('\tk_best = ' + str(k)) #spectral = cluster.SpectralClustering(n_clusters=k, eigen_solver='arpack', affinity='nearest_neighbors') spectral = cluster.SpectralClustering(n_clusters=k, eigen_solver=None, random_state=None, n_init=10, gamma=1., affinity='rbf', n_neighbors=10, eigen_tol=0.0, degree=3, coef0=1, kernel_params=None) clustering_algorithms.append(spectral) except Exception as e: log.warn( "Problems were found while running Spectral clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) print( "Problems were found while running Spectral clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) if 'Ward' in clustering_names: log.info('\n\t(Ward) Searching best k-number... 
') try: k, _, _ = __best_k_of_clusters('Ward', X, MAX_CLUSTERS, connectivity=connectivity) log.info('\tk_best = ' + str(k)) ward = cluster.AgglomerativeClustering(n_clusters=k, linkage='ward', connectivity=connectivity) clustering_algorithms.append(ward) except Exception as e: log.warn( "Problems were found while running Ward clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) print( "Problems were found while running Ward clustering algorithm for " + NORM_METHOD + " normalization, skipping its execution.\nProblem: " + str(e)) #clustering_algorithms = [two_means, affinity_propagation, ms, spectral, ward, dbscan] t1 = time.time() log.info('\n\tTime spended (defining clustering methods): %f s' % (t1 - t0)) ########################################## CLUSTERS & PLOTS ############################################### log.info('\n####### Cluster & Dispersion graphs...') t0 = time.time() for name, algorithm in zip(clustering_names, clustering_algorithms): # predict cluster memberships algorithm.fit(X) if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) # plot plt.subplot(2, len(clustering_algorithms) // 2, plot_num) plt.title(name, size=18) plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10) if hasattr(algorithm, 'cluster_centers_'): centers = algorithm.cluster_centers_ center_colors = colors[:len(centers)] plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors) plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), transform=plt.gca().transAxes, size=15, horizontalalignment='right') plot_num += 1 t1 = time.time() log.info('\tTime spended (clst. disp. graf.): %f s' % (t1 - t0)) ########################################## OUTPUT FILES ################################################### log.info('\n####### Generating output files...') t0 = time.time() # File containing clusters data cluster_date_dir = 'Clusters_Data_' + NORM_METHOD if not os.path.isdir(outPath + cluster_date_dir): os.makedirs(outPath + cluster_date_dir) for name, algorithm in zip(clustering_names, clustering_algorithms): # Read labels of the algorithm X_labels = algorithm.labels_ # Try to write the representative model for the clusters on analysis.out try: # Adding results on the moddeler file analysis.out with open(FILE_DOT_OUT, 'a') as arq: if clustering_names[0] == name: arq.writelines( '\n\n##############################################################################################' ) arq.writelines( '\n>>Clustering results - Representative structure - ' + name) arq.writelines('\nCluster\t\tFile_Name\n') Vec = [] # If the clustering method has cluster_centers_ attribute and it isn't nor KMeans neither MeanShift (on these Medoid != Centroid) # In this set of clustering methods - AffinityPropagation if hasattr(algorithm, 'cluster_centers_') and ( name != 'KMeans') and (name != 'MeanShift'): centers = algorithm.cluster_centers_[:] r = int(centers[:, 0].size) for j in range(r): m = __aprx_medoid(X, centers[j, :]) nm = __medoid_name(X, pdb_names, [0, 1], [m[0], m[1]]) arq.write(str(j) + '\t\t') arq.write(nm + '\n') x_aux = dict() x_aux['Nome_pdb'] = nm #str(c) x_aux['Cluster'] = j Vec.append(x_aux) else: algorithm.cluster_centers_ = [] for lb in set(algorithm.labels_): labels = algorithm.labels_ data_frame = pd.DataFrame(X) algorithm.cluster_centers_.append( data_frame[labels == lb].mean(axis=0).values) medians, _ = metrics.pairwise_distances_argmin_min( algorithm.cluster_centers_, 
data_frame.values) j = 0 # find medoids for m in medians: nm = __medoid_name(X, pdb_names, [0, 1], [X[m, 0], X[m, 1]]) arq.write(str(j) + '\t\t') arq.write(str(nm) + '.pdb\n') x_aux = dict() c = 'MEDOID:\t' + str(nm) + '.pdb' x_aux['Cluster'] = j x_aux['\tFilename'] = str(c) Vec.append(x_aux) j = j + 1 if clustering_names[-1] == name: arq.writelines( '##############################################################################################' ) # create results vector for the clustering method for i in range(pdb_names.size): x_aux = dict() c = '\t' + pdb_names[i] + '.pdb' x_aux['Cluster'] = X_labels[i] x_aux['\tFilename'] = str(c) Vec.append(x_aux) # sort results vector by n-cluster Vec = sorted(Vec, key=lambda k: k['Cluster']) Vec = pd.DataFrame(Vec) # n-cluster == -1 are Outlier data (for DBscan) Vec.loc[Vec.Cluster == -1, ['Cluster']] = 'Outlier' # Write .csv results Vec.to_csv(outPath + cluster_date_dir + '/' + name + '_Data.csv', index=False) except Exception as ex: log.error('Error 1: {0}'.format(ex)) t1 = time.time() log.info('\tTime spended (Generating output files): %f s' % (t1 - t0)) log.info('\n\n\t\tThat\'s it!\n\n\n') if saveFig == True: plt.savefig(NORM_METHOD + '_dispersion_graph.png') plt.show()
def interpret(self,
              image_paths,
              num_samples=1000,
              batch_size=50,
              save_path='normlime_weights.npy'):
    """
    Main function of the interpreter.

    Args:
        image_paths (list of strs): A list of image filepaths.
        num_samples (int, optional): LIME sampling numbers. A larger number of samples
            usually gives a more accurate interpretation. Default: 1000
        batch_size (int, optional): Number of samples to forward each time. Default: 50
        save_path (str, optional): The .npy path to save the NormLIME weights. It is a
            dictionary where the key is a label and the value is segmentation ids with
            their importance. Default: 'normlime_weights.npy'

    :return: NormLIME weights: {label_i: weights on features}
    :rtype: dict

    Example::

        def paddle_model(image_input):
            import paddle.fluid as fluid
            class_num = 1000
            model = ResNet50()
            logits = model.net(input=image_input, class_dim=class_num)
            probs = fluid.layers.softmax(logits, axis=-1)
            return probs

        # The model can be downloaded from
        # http://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar
        # More pretrained models can be found in
        # https://github.com/PaddlePaddle/models/tree/release/1.8/PaddleCV/image_classification

        # 10 images are used here for example, but more images should be used.
        dataset_dir = "assets"
        image_paths = sorted(glob.glob(dataset_dir + "/*.png"))
        image_paths = image_paths[:10]

        normlime = it.NormLIMECVInterpreter(paddle_model,
                                            "assets/ResNet50_pretrained")
        # this can be very slow.
        normlime.interpret(image_paths, num_samples=2000, batch_size=50)
    """
    _, h_pre_models_kmeans = get_pre_models()
    kmeans_model = load_pickle_file(h_pre_models_kmeans)

    # compute lime weights and put in self.all_lime_weights
    for i in tqdm(range(len(image_paths))):
        image_path = image_paths[i]
        self._get_lime_weights(image_path,
                               num_samples,
                               batch_size,
                               auto_save=(i % 10 == 0))
    np.savez(self.filepath_to_save, **self.all_lime_weights)

    # convert superpixel indexes to cluster indexes.
    normlime_weights_all_labels = {}
    for i, image_path in enumerate(image_paths):
        temp = self.all_lime_weights[image_path]
        if isinstance(temp, np.ndarray):
            temp = temp.item()
        fextractor = FeatureExtractor()
        f = fextractor.forward(temp['input'][np.newaxis, ...]).transpose(
            (1, 2, 0))
        X = extract_superpixel_features(f, temp['segmentation'])
        try:
            cluster_labels = kmeans_model.predict(X)  # a list. len = number of sp.
        except AttributeError:
            from sklearn.metrics import pairwise_distances_argmin_min
            cluster_labels, _ = pairwise_distances_argmin_min(
                X, kmeans_model.cluster_centers_)
        lime_weights = temp['lime_weights']
        pred_labels = lime_weights.keys()
        for y in pred_labels:
            normlime_weights_label_y = normlime_weights_all_labels.get(y, {})
            w_f_y = [abs(w[1]) for w in lime_weights[y]]
            w_f_y_l1norm = sum(w_f_y)
            for w in lime_weights[y]:
                seg_label = w[0]
                weight = w[1] * w[1] / w_f_y_l1norm
                tmp = normlime_weights_label_y.get(cluster_labels[seg_label], [])
                tmp.append(weight)
                normlime_weights_label_y[cluster_labels[seg_label]] = tmp
            normlime_weights_all_labels[y] = normlime_weights_label_y

    # compute normlime weights.
    for y in normlime_weights_all_labels:
        normlime_weights = normlime_weights_all_labels.get(y, {})
        for k in normlime_weights:
            normlime_weights[k] = sum(normlime_weights[k]) / len(
                normlime_weights[k])

    # check normlime
    if len(normlime_weights_all_labels.keys()) < max(
            normlime_weights_all_labels.keys()) + 1:
        print(
            "\nWarning: !!!\n"
            "There are at least {} classes, ".format(
                max(normlime_weights_all_labels.keys()) + 1) +
            "but NormLIME has results for only {} classes.\n".format(
                len(normlime_weights_all_labels.keys())) +
            "This may cause unstable results in the later computation, "
            "but it can be improved by computing more test samples.\n")

    if os.path.exists(save_path):
        n = 0
        tmp = save_path.split('.npy')[0]
        while os.path.exists(f'{tmp}-{n}.npy'):
            n += 1
        np.save(f'{tmp}-{n}.npy', normlime_weights_all_labels)
    else:
        np.save(save_path, normlime_weights_all_labels)

    return normlime_weights_all_labels
st.subheader("Disambiguation Parameter") hasil_disambiguation = pd.DataFrame(disambiguation_df) st.dataframe(hasil_disambiguation) vector = hasil_disambiguation SUMMARY_SIZE = st.sidebar.slider("Berapa Jumlah Cluster?", 1, len(sentences), len(sentences) // 3) n = SUMMARY_SIZE avg = [] n_clusters = len(sentences) // n modelkm = KMeans(n_clusters=n_clusters, init='k-means++') modelkm = modelkm.fit(vector) for j in range(n_clusters): idx = np.where(modelkm.labels_ == j)[0] avg.append(np.mean(idx)) closest, _ = pairwise_distances_argmin_min(modelkm.cluster_centers_, vector) ordering = sorted(range(n_clusters), key=lambda k: avg[k]) col5, col6 = st.beta_columns([1, 1]) col5.subheader("Closest Cluster") col5.dataframe(closest) col6.subheader("Ordering Cluster") col6.dataframe(ordering) st.subheader("Summary Result") # summary = itemgetter(*ordering)(sentences) # hasilRingkasan = [] # for sent in summary: # a = ' '.join(sent) # hasilRingkasan.append(a) # st.write(hasilRingkasan) # summary = ' '.join([list_sentences[closest[idx]] for idx in ordering])
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

from sklearn.neighbors import NearestCentroid
from sklearn.metrics import pairwise_distances_argmin_min, pairwise_distances

closest, min_dist = pairwise_distances_argmin_min(X, clusterer.cluster_centers_)

# Distance of each point to the 9 clusters
X_np = np.array(X)
centroids = clusterer.cluster_centers_
distance = []
for i in X_np:
    data_point = np.array(i).reshape(1, -1)
    distance_to_point = pairwise_distances(data_point, centroids)
    distance.extend(distance_to_point)
distance_all_centroids_df = pd.DataFrame(distance)
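# --- Hedged sketch with synthetic data and hypothetical names: the per-point loop above can
# be collapsed into a single pairwise_distances call, and taking argmin/min over that matrix
# matches what pairwise_distances_argmin_min returns directly.
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances, pairwise_distances_argmin_min

rng = np.random.RandomState(42)
points = rng.rand(50, 4)
centers_demo = rng.rand(9, 4)                 # stands in for clusterer.cluster_centers_

dist_matrix = pairwise_distances(points, centers_demo)          # shape (50, 9)
closest_demo, min_dist_demo = pairwise_distances_argmin_min(points, centers_demo)

assert np.array_equal(closest_demo, dist_matrix.argmin(axis=1))
assert np.allclose(min_dist_demo, dist_matrix.min(axis=1))

distance_all_centroids_demo = pd.DataFrame(dist_matrix)   # one row per point, one column per centroid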
# /home/genian/SEQUOIA/branches/CURRENT/src/devops/ml/sklearn/clustering/A595940_F8803.csv
df_pre = pd.read_csv('./train.csv')
dataset = df_pre.values
Hash_Name = dataset[:, 0]
MalwareBenign = dataset[:, 1]
Feature = dataset[:, 2:]

# n_clusters = 8803
print("[+] Kmeans Start n: 30")
kmeans = KMeans(n_clusters=30, random_state=0, n_jobs=-1).fit(Feature)
kmeans_result = (kmeans.labels_).tolist()
centroid = (kmeans.cluster_centers_).tolist()

# closest = index of the nearest point to each centroid
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, Feature)

fp1 = open("./output.csv", 'w')  # output file write
fp1.write("Cluster Num." + ", " + "File_Name" + ", " + "Benign_Malware" + "\n")

closest = closest.tolist()
for i in range(0, len(closest)):
    cluster_num = str(i)
    malware_benign = str(MalwareBenign[closest[i]])
    file_name = str(Hash_Name[closest[i]])
    fp1.write(cluster_num + "," + file_name + ", " + malware_benign + "\n")

# Nearest Malware

if __name__ == "__main__":
    pass
#%% Cluster features
feat = impute_nan_inf(feat)
column_linkage = linkage(feat.T, method=method, metric=metric)
clusters = fcluster(column_linkage, n_clusters, criterion='maxclust')
un, n = np.unique(clusters, return_counts=True)
# print(n)

# Get cluster centers
cluster_centers = (feat.T).groupby(by=clusters).mean()

# get the index of the feature closest to the centroid of the cluster
central, _ = pairwise_distances_argmin_min(cluster_centers, feat.T,
                                           metric='cosine')
assert (np.unique(central).shape[0] == n_clusters)

# get the name of the feature closest to the centroid of the cluster
central = feat.columns.to_numpy()[central]

#%% Make dataframe
df = pd.DataFrame(index=feat.columns,
                  columns=['group_label', 'stat_label', 'motion_label'])
df['group_label'] = clusters
stats = np.array(['10th', '50th', '90th', 'IQR'])
df['stat_label'] = [
    np.unique([x for x in stats if x in ft]) for ft in df.index
]
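# --- Hedged sketch with synthetic data and hypothetical names: the pattern above reduced to
# its core: cluster the columns of a feature matrix, average each cluster, then use
# pairwise_distances_argmin_min to pick the one real column closest to each cluster mean as
# that cluster's representative feature.
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import pairwise_distances_argmin_min

rng = np.random.RandomState(1)
demo_feat = pd.DataFrame(rng.rand(100, 12), columns=[f'f{i}' for i in range(12)])
n_groups = 4

col_linkage = linkage(demo_feat.T, method='average', metric='cosine')
groups = fcluster(col_linkage, n_groups, criterion='maxclust')

group_means = demo_feat.T.groupby(by=groups).mean()        # one row per group
rep_idx, _ = pairwise_distances_argmin_min(group_means, demo_feat.T, metric='cosine')
representatives = demo_feat.columns.to_numpy()[rep_idx]
print(dict(zip(group_means.index, representatives)))       # group label -> representative column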
COM_action[i] = data[i + 1] - data[i]
# print(COM_action[i], "COM action")
# print(np.shape(data))

X = np.array(COM_action[:5000])
print(np.shape(X))
X = X.reshape(5000, 24)  # first entry = X_length
X_embedded = TSNE(n_components=2).fit_transform(X)
print(X_embedded)

n_clusters = 100
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X_embedded)
labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_
closest, _ = pairwise_distances_argmin_min(cluster_centers, X_embedded)
print(closest, 'closest to the centroids')

## Getting ID of the individual cluster entries
# id_per_cluster = np.zeros((n_clusters))
# for i in range(n_clusters):
#     for j in range(len(labels)):
#         if labels[j] == i:
#             id_per_cluster[i] = j
#             break
#     print(id_per_cluster)

actions_modified = []
# Now that we have the id's per cluster, let's see what they are.
f = codecs.open(input_vector_file, 'r', 'utf-8')
df, labels_array, array_len = build_word_vector_matrix(total_words_set)
clusters_to_make = int(200)
kmeans_model = KMeans(init='k-means++', n_clusters=clusters_to_make, n_init=10)
kmeans_model.fit(df)

cluster_labels = kmeans_model.labels_
cluster_inertia = kmeans_model.inertia_
cluster_centers = kmeans_model.cluster_centers_
cluster_to_words = find_word_clusters(labels_array, cluster_labels)

for key in cluster_to_words:
    if len(cluster_to_words[key]) != 1:
        clusterList, cluster_label_array, cluster_len = build_word_vector_matrix(
            cluster_to_words[key])
        centroid = []
        centroid.append(cluster_centers[key])
        closest, _ = pairwise_distances_argmin_min(centroid, clusterList)
    else:
        closest = [0]
    cluster_to_words[key] = [cluster_to_words[key][closest[0]]] + cluster_to_words[key]

with open(pickle_file_cluster, 'w+b') as out_file:
    pickle.dump(cluster_labels, out_file)
    pickle.dump(cluster_inertia, out_file)
    pickle.dump(cluster_centers, out_file)
    pickle.dump(cluster_to_words, out_file)
def summarize(self, corpus, top_k=3, important_words=3, return_cluster=True):
    """
    Summarize a list of strings / corpus.

    Parameters
    ----------
    corpus: str, list
    top_k: int, (default=3)
        number of summarized strings
    important_words: int, (default=3)
        number of important words

    Returns
    -------
    dict: {'summary', 'top-words'} and, if return_cluster, 'cluster-top-words'
    """
    if not isinstance(top_k, int):
        raise ValueError('top_k must be an integer')
    if not isinstance(important_words, int):
        raise ValueError('important_words must be an integer')
    if not isinstance(return_cluster, bool):
        raise ValueError('return_cluster must be a boolean')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a list')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = corpus.replace('\n', '.')
        corpus = split_by_dot(corpus)
    else:
        corpus = [c + '.' for c in corpus]
        corpus = ' '.join(corpus)
        corpus = re.findall('(?=\S)[^.\n]+(?<=\S)', corpus)
    corpus = [summary_textcleaning(i) for i in corpus]
    sequences = _skip_thought.batch_sequence(corpus,
                                             self.dictionary,
                                             maxlen=self._maxlen)
    encoded, attention = self._sess.run(
        [self._logits, self._attention],
        feed_dict={self._X: np.array(sequences)},
    )
    attention = attention.sum(axis=0)
    kmeans = KMeans(n_clusters=top_k, random_state=0)
    kmeans = kmeans.fit(encoded)
    avg = []
    for j in range(top_k):
        idx = np.where(kmeans.labels_ == j)[0]
        avg.append(np.mean(idx))
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)
    indices = np.argsort(attention)[::-1]
    top_words = [self._rev_dictionary[i] for i in indices[:important_words]]
    ordering = sorted(range(top_k), key=lambda k: avg[k])
    summarized = '. '.join([corpus[closest[idx]] for idx in ordering])
    if return_cluster:
        return {
            'summary': summarized,
            'top-words': top_words,
            'cluster-top-words': cluster_words(top_words),
        }
    return {'summary': summarized, 'top-words': top_words}
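# --- Hedged sketch, not the library's implementation: summarize() above boils down to this
# pattern: encode sentences, k-means them into top_k clusters, take the sentence closest to
# each centroid via pairwise_distances_argmin_min, and order the picks by each cluster's mean
# sentence position. TF-IDF stands in here for the skip-thought encoder; all data is synthetic.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances_argmin_min

sentences_demo = [
    "The cat sat on the mat.",
    "Dogs are loyal companions.",
    "Cats sleep most of the day.",
    "A dog will fetch the ball.",
    "The stock market rose sharply today.",
    "Investors welcomed the earnings report.",
]
top_k_demo = 3

encoded_demo = TfidfVectorizer().fit_transform(sentences_demo).toarray()
km_demo = KMeans(n_clusters=top_k_demo, n_init=10, random_state=0).fit(encoded_demo)

# mean position of each cluster's sentences, used to keep the summary in document order
avg_position = [np.mean(np.where(km_demo.labels_ == j)[0]) for j in range(top_k_demo)]
closest_demo, _ = pairwise_distances_argmin_min(km_demo.cluster_centers_, encoded_demo)
ordering_demo = sorted(range(top_k_demo), key=lambda k: avg_position[k])
print('. '.join(sentences_demo[closest_demo[idx]] for idx in ordering_demo))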
def _kmeans_single_banilla(X, sparsity, n_clusters, centers, max_iter,
                           verbose, tol, debug_directory, debug_header,
                           **kargs):
    n_samples = X.shape[0]
    labels_old = np.zeros((n_samples, ), dtype=int)

    debug_label_on = kargs.get('debug_label_on', True)
    debug_centroid_on = kargs.get('debug_centroid_on', True)

    for n_iter_ in range(1, max_iter + 1):
        _iter_time = time.time()

        # assignment step: nearest center (cosine) plus the distances for the inertia
        labels, distances = pairwise_distances_argmin_min(X, centers,
                                                          metric='cosine')
        centers = _update(X, labels, distances, n_clusters)
        inertia = distances.sum()

        if n_iter_ == 0:
            n_diff = n_samples
        else:
            diff = np.where((labels_old - labels) != 0)[0]
            n_diff = len(diff)
        labels_old = labels

        if isinstance(sparsity, str) and sparsity == 'sculley':
            radius = kargs.get('radius', 10)
            epsilon = kargs.get('epsilon', 5)
            centers = _sculley_projections(centers, radius, epsilon)
        elif isinstance(sparsity, str) and sparsity == 'minimum_df':
            minimum_df_factor = kargs.get('minimum_df_factor', 0.01)
            centers = _minimum_df_projections(X, centers, labels_old,
                                              minimum_df_factor)

        _iter_time = time.time() - _iter_time

        degree_of_sparsity = check_sparsity(centers)
        ds_strf = ', sparsity={:.3}'.format(
            degree_of_sparsity) if degree_of_sparsity is not None else ''
        state = 'n_iter={}, changed={}, inertia={}, iter_time={} sec{}'.format(
            n_iter_, n_diff, '%.3f' % inertia, '%.3f' % _iter_time, ds_strf)

        if debug_directory:
            # Log message
            log_path = '{}/{}_logs.txt'.format(debug_directory, debug_header)
            with open(log_path, 'a', encoding='utf-8') as f:
                f.write('{}\n'.format(state))

            # Temporal labels
            if debug_label_on:
                label_path = '{}/{}_label_iter{}.txt'.format(
                    debug_directory, debug_header, n_iter_)
                with open(label_path, 'a', encoding='utf-8') as f:
                    for label in labels:
                        f.write('{}\n'.format(label))

            # Temporal cluster centroids
            if debug_centroid_on:
                center_path = '{}/{}_centroids_iter{}.csv'.format(
                    debug_directory, debug_header, n_iter_)
                np.savetxt(center_path, centers)

        if verbose:
            print(state)

        if n_diff <= tol:
            if verbose and (n_iter_ + 1 < max_iter):
                print('Early converged.')
            break

    return centers, labels, inertia, n_iter_
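# --- Hedged sketch with synthetic data: the real _update and projection helpers are not shown
# above, so a plain mean update stands in. This shows one assignment-plus-update step of the
# loop: pairwise_distances_argmin_min does the assignment and simultaneously returns the
# distances whose sum is that iteration's inertia.
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 30)
k_demo = 5
centers_demo = X_demo[rng.choice(len(X_demo), k_demo, replace=False)]

labels_demo, distances_demo = pairwise_distances_argmin_min(X_demo, centers_demo,
                                                            metric='cosine')
inertia_demo = distances_demo.sum()

# plain mean update; empty clusters keep their previous center
new_centers = centers_demo.copy()
for c in range(k_demo):
    members = X_demo[labels_demo == c]
    if len(members) > 0:
        new_centers[c] = members.mean(axis=0)
print('inertia after assignment: %.3f' % inertia_demo)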
if os.path.exists(ISOMAP_FILE):
    iso_df = isomap_df
else:
    # need to recompute features
    # my_d = pd.read_csv(FEATURES_FILE, index_col=0)
    print("DataFrame read!")
    iso_df = iso_map()
    print("IsoMap done!")

if 'cluster' not in iso_df.columns:
    # compute clustering
    centers, labels = clustering(iso_df[['comp1', 'comp2']], show_img=True)
    arr = iso_df[['comp1', 'comp2']].to_numpy()
    closest, _ = pairwise_distances_argmin_min(centers, arr)
    for c in closest:
        print(iso_df.iloc[c])

    # save clusters' labels
    iso_df["cluster"] = labels
    iso_df.to_csv(ISOMAP_FILE, index=True)
    print("First clustering done!\n")

grp = iso_df.groupby(by=['cluster'])
# do something
def phi(x, clusters):
    argmin, distances = pairwise_distances_argmin_min([x], clusters,
                                                      metric='euclidean')
    return argmin[0], distances[0], argmin
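# --- Hedged sketch: a minimal, synthetic usage of phi() above (its import of
# pairwise_distances_argmin_min is assumed to be in scope). The third returned value is the
# full argmin array; for a single input row it is a one-element array whose entry equals the
# first returned value.
import numpy as np

demo_clusters = np.array([[0.0, 0.0], [5.0, 5.0], [10.0, 0.0]])
demo_point = np.array([4.5, 4.0])
idx, dist, raw = phi(demo_point, demo_clusters)
print(idx, round(float(dist), 3), raw)   # expected: 1, the distance to (5, 5), array([1])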
# Sampling centroids and points nearby

# In[24]:

centroids = kmeans.cluster_centers_

# In[25]:

from sklearn.metrics import pairwise_distances_argmin_min

label, distances = pairwise_distances_argmin_min(X_train, kmeans.cluster_centers_)

# In[26]:

# distances from cluster centroids
clusters['distance'] = distances

# In[27]:

a1 = []
# split data based on labels
for i in range(0, c):
f.searchOpinions()
ops = f.getOpinions()
corpus = ops['sentenceText'].tolist()
X = vectorizer.fit_transform(corpus)

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

from sklearn.cluster import KMeans

num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf)

from sklearn.metrics import pairwise_distances_argmin_min

clusExamp, _ = pairwise_distances_argmin_min(km.cluster_centers_, tfidf)
for i in range(len(clusExamp)):
    examp = corpus[clusExamp[i]]
    print('Cluster {}: '.format(i) + examp)

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()
def score_post():
    request_data = request.json
    plaintext_dir = DATA_DIR + str(request_data["plaintext_dir"])
    manual_summary_dir = MANUAL_DIR + str(request_data["plaintext_dir"])
    print(plaintext_dir, manual_summary_dir)
    modeling = str(request_data["model"])
    method = str(request_data["method"])

    file = open(plaintext_dir, 'r', encoding='utf8')
    plaintext = file.read()
    file.close()
    file = open(manual_summary_dir, 'r', encoding='utf8')
    manual_summary = file.read()
    file.close()

    m_s = process(manual_summary)
    processed = process(plaintext)
    sentences = nltk.sent_tokenize(m_s)
    nsum1 = len(sentences)
    print(nsum1, end=' ')

    summary = ""
    if modeling == 'bert':
        summary = ''.join(
            model(body=processed,
                  ratio=float(nsum1),
                  min_length=0,
                  use_first=False))
        summary = summary.replace('_', ' ')
    if modeling == 'word2vec':
        sentences = nltk.sent_tokenize(plaintext)
        X = []
        for sentence in sentences:
            sentence = ViTokenizer.tokenize(sentence)
            words = sentence.split(" ")
            sentence_vec = np.zeros((300))
            for word in words:
                if word in vocab:
                    sentence_vec += vocab[word]
                    break  # note: as written, only the first in-vocabulary word is added
            X.append(sentence_vec)
        kmeans = KMeans(n_clusters=nsum1)
        kmeans.fit(X)
        avg = []
        for j in range(nsum1):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
        ordering = sorted(range(nsum1), key=lambda k: avg[k])
        summary = ' '.join([sentences[closest[idx]] for idx in ordering])
        summary = summary.replace('...', '')

    print(len(summary.strip().split('. ')))
    p, r, f1 = 0, 0, 0
    print(m_s)
    print(summary)
    if method == 'bert':
        p, r, f1 = bert_score_compute(summary, manual_summary, 'vi')
    if method == 'rouge':
        p, r, f1 = rouge_score_compute(summary, manual_summary, 'l')

    resp = {
        "model-summarized": summary,
        "manual-summarized": m_s,
        "paragraph": plaintext,
        "p": p,
        "r": r,
        "f1": f1
    }
    return jsonify(resp)
def active_init_1():
    # baseline active learning solution
    alpha = 20   # initial training set
    betha = 100  # number of iterations
    gamma = 20   # sampling volume
    tfidf_transformer = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer())
    ])
    # try to implement silhouette analysis for number of clusters
    # cluster = AgglomerativeClustering(n_clusters=20, affinity='cosine', linkage='complete')
    cluster = KMeans(n_clusters=20)

    unlabeled_train_data = twenty_train_data
    unlabeled_train_target = twenty_train_target

    print('start transforming')
    unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)
    print('start fitting')
    print(datetime.now())
    res = cluster.fit_predict(unlabeled_matrix)
    print(datetime.now())
    print('clustering result')
    print(OrderedDict(Counter(res)))
    print(res.shape)

    closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_,
                                               unlabeled_matrix,
                                               metric='cosine')
    print(closest)
    '''
    results = defaultdict(list)
    for idx, val in enumerate(res):
        results[val].append(idx)
    take_idx = []
    for cluster_num in range(0, 20):
        idxset = results[cluster_num]
    '''
    # create labeled and unlabeled training set
    # labeled_train_data = twenty_train_data[:alpha]
    # labeled_train_target = twenty_train_target[:alpha]
    # unlabeled_train_data = twenty_train_data[alpha:]
    # unlabeled_train_target = twenty_train_target[alpha:]
    labeled_train_data = []
    labeled_train_target = []
    labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = \
        diploma_range_sampling(labeled_train_data, labeled_train_target,
                               unlabeled_train_data, unlabeled_train_target,
                               closest)
    print(labeled_train_data.shape)

    baseline_active_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC())
    ])
    baseline_active_clf.fit(labeled_train_data, labeled_train_target)
    predicted = baseline_active_clf.predict(twenty_test_data)
    score = f1_score(twenty_test_target, predicted, average='macro')
    print('baseline active clustering solution')
    diploma_res_print(len(labeled_train_data), score)

    for t in range(1, betha):
        unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)
        print(datetime.now())
        res = cluster.fit_predict(unlabeled_matrix)
        print(datetime.now())
        closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_,
                                                   unlabeled_matrix,
                                                   metric='cosine')
        print(closest)
        labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = \
            diploma_range_sampling(labeled_train_data, labeled_train_target,
                                   unlabeled_train_data, unlabeled_train_target,
                                   closest)
        baseline_active_clf.fit(labeled_train_data, labeled_train_target)
        predicted = baseline_active_clf.predict(twenty_test_data)
        score = f1_score(twenty_test_target, predicted, average='macro')
        diploma_res_print(len(labeled_train_data), score)
def createAnnotationOutput_mimic_oracle(args, model, data_loader, gold_dict,
                                        output_dict):
    sentence_index = []
    sentence_index_sent = {}
    sentence_index_tokens = defaultdict(list)
    typeTag, typeTagIndex, tokenIndex_map, sentIndex_map, _ = readTrain(
        args.test_path)
    data = {}
    with codecs.open(args.to_annotate, "w", encoding='utf-8') as fout, codecs.open(
            args.debug, "w", encoding='utf-8') as fdebug:
        sorted_type = sorted(model.approxTypeErrors.items(),
                             key=lambda kv: kv[1],
                             reverse=True)[:args.k]
        fdebug.write("TOKEN\tTYPE\tGOLD\tPRED\tPREDPROB\tERRORS\n")
        for (type, error_percent) in sorted_type:
            token_pred_error = model.predTypeErrors[type]
            token_tag_error = model.approxTokenClassErrors[type]
            sorted_token_tag_error = sorted(token_tag_error.items(),
                                            key=lambda kv: kv[1],
                                            reverse=True)
            errors = []
            maxTag = sorted_token_tag_error[0][0]
            for (tagId, error) in sorted_token_tag_error:
                tag = data_loader.id2tags["POS"][tagId]
                errors.append(tag + "=" + str(error))

            predErrors = []
            sorted_tag_error = sorted(token_pred_error.items(),
                                      key=lambda kv: kv[1],
                                      reverse=True)
            for (tagId, error) in sorted_tag_error:
                tag = data_loader.id2tags["POS"][tagId]
                predErrors.append(tag + "=" + str(error))

            token_indices = list(model.type_tokenIndices[type])
            required_embeddings, gold_token_tags, pred_token_tags = [], [], []
            for token_index in token_indices:
                embedding = model.token_embeddings[token_index]
                gammaVal = model.token_gamma_key[token_index][maxTag]
                prob = np.exp(gammaVal)
                required_embeddings.append(embedding * prob)
                (token_, tag_, sent_index_, relative_index_) = tokenIndex_map[token_index]
                one_sent_ = sentIndex_map[sent_index_]
                pred_path_ = output_dict[" ".join(one_sent_)]
                gold_path_ = gold_dict[" ".join(one_sent_)]
                pred_token_tags.append(pred_path_[relative_index_])
                gold_token_tags.append(gold_path_[relative_index_])

            cluster_center = centeroidnp(np.array(required_embeddings))
            closest, _ = pairwise_distances_argmin_min(
                np.array([cluster_center]), required_embeddings)
            centroid = token_indices[closest[0]]
            (token, tag, sent_index, relative_index) = tokenIndex_map[centroid]
            one_sent = sentIndex_map[sent_index]
            sentence_index.append(sent_index)
            pred_path = output_dict[" ".join(one_sent)]
            gold_path = gold_dict[" ".join(one_sent)]
            sentence_index_sent[sent_index] = (one_sent, gold_path, pred_path)
            sentence_index_tokens[sent_index].append(relative_index)
            data[token] = {
                "tokenindices": token_indices,
                "weighted": required_embeddings,
                "centroid_center": cluster_center,
                "pred": pred_token_tags,
                "gold": gold_token_tags
            }
            fdebug.write(
                str(centroid) + "\t" + data_loader.id_to_word[type] + "\t" +
                gold_path[relative_index] + "\t" + pred_path[relative_index] +
                "\t" + "@".join(predErrors) + "\t" + "@".join(errors) + "\n")

    covered = set()
    count = 0
    with open("./" + args.model_name + "approx.pkl", "wb") as f:
        pickle.dump(data, f)

    with codecs.open(args.to_annotate, "w", encoding='utf-8') as fout:
        for sent_index in sentence_index:
            if sent_index not in covered:
                covered.add(sent_index)
                (sent, gold_path, pred_path) = sentence_index_sent[sent_index]
                path = deepcopy(pred_path)
                for token_index in sentence_index_tokens[sent_index]:
                    path[token_index] = "UNK"
                for token, tag_label, gold_tag in zip(sent, path, gold_path):
                    fout.write(token + "\t" + tag_label + "\t" + gold_tag + "\n")
                    if tag_label == "UNK":
                        count += 1
                fout.write("\n")
ax2.legend(labels)
ax2.grid()
fig2.savefig("/Users/angieryu2202/Desktop/3d_PCA_kmeans.png")

# ## (4) Articles Closest to Centroids for Each Cluster

# In[37]:

import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

# km = KMeans(n_clusters=31).fit(topic_distribution_df)
# This function computes, for each row in X, the index of the row of Y which is closest
# (according to the specified distance). The minimal distances are also returned.
kmeans_closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_,
                                                  topic_distribution_df)
print(kmeans_closest)

# In[38]:

kmeans_closest_titles = []
for index in kmeans_closest:
    kmeans_closest_titles.append(caregiver_df.title[index])
    print(str(index) + ": " + str(caregiver_df.title[index]))

# In[39]:
    tffss.append(tfs)
    labels_resultados.append(kmeans.labels_)

best_s = np.argmax(silhouette_scores)
best_ch = np.argmax(ch_scores)
print("Best k according to silhouette_score: ", best_s + num_initial_clusters)
print("Best k according to calinski_harabaz_score: ", best_ch + num_initial_clusters)
true_k = max(best_s, best_ch)
print("Saving ", true_k + num_initial_clusters, "-kmeans...")
clusters = resultados[true_k]
centros_clusters = centros_resultados[true_k]
kmeans.labels_ = labels_resultados[true_k]
closest, _ = pairwise_distances_argmin_min(centros_clusters, tfidf_matrix)
with open(
        path + "resumenes-temas-relacionados-" +
        str(true_k + num_initial_clusters) + "-means.txt", 'w') as f:
    with open(
            path + "centros-temas-relacionados-" +
            str(true_k + num_initial_clusters) + "-means.txt", 'w') as f2:
        for cluster in range(len(clusters)):
            f.write("Cluster " + str(cluster + 1) + ": " +
                    sentences_centers[closest[cluster]] + "\n")
            f2.write("Cluster " + str(cluster + 1) + ": " +
                     sentences_centers[closest[cluster]] + "\n")
            cl = []
            for i, sentence in enumerate(clusters[cluster]):
                c = indices_centros[sentence]