def dbscan(similarity, concepts=2, euclid=False):
    """Cluster samples with DBSCAN using fixed eps=0.6 and min_samples=10.

    Parameters
    ----------
    similarity : array-like
        With ``euclid=True`` it is passed to DBSCAN unchanged. Otherwise it
        is treated as a similarity matrix and ``1 - similarity`` is used as
        a precomputed distance matrix.
    concepts : int
        Unused; kept for signature compatibility.
    euclid : bool
        Selects between the two modes described above.

    Returns
    -------
    The labels returned by ``DBSCAN.fit_predict``.
    """
    options = dict(eps=0.6, min_samples=10, algorithm='auto', leaf_size=30)
    if euclid:
        data = similarity
    else:
        # Similarities are turned into distances for the precomputed metric.
        options['metric'] = 'precomputed'
        data = 1 - similarity
    return DBSCAN(**options).fit_predict(data)
def plot_dbscan():
    """Plot DBSCAN clusterings of a 12-point blob dataset on a 3x4 grid.

    Rows vary ``min_samples`` over (2, 3, 5) and columns vary ``eps`` over
    (1, 1.5, 2, 3). Each setting's cluster assignment is printed, all
    points are drawn small, and core samples are drawn again larger.
    """
    X, _ = make_blobs(random_state=0, n_samples=12)

    fig, axes = plt.subplots(3, 4, figsize=(11, 8),
                             subplot_kw={'xticks': (), 'yticks': ()})
    # Plot clusters as red, green and blue, and outliers (-1) as white
    colors = ['r', 'g', 'b']
    markers = ['o', '^', 'v']

    # iterate over settings of min_samples and eps
    for i, min_samples in enumerate([2, 3, 5]):
        for j, eps in enumerate([1, 1.5, 2, 3]):
            dbscan = DBSCAN(min_samples=min_samples, eps=eps)
            clusters = dbscan.fit_predict(X)
            print("min_samples: %d eps: %f cluster: %s"
                  % (min_samples, eps, clusters))
            if np.any(clusters == -1):
                # Prepend the noise colour/marker so label -1 maps to white.
                c = ['w'] + colors
                m = ['o'] + markers
            else:
                c = colors
                m = markers
            discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j],
                             c=c, s=8, markers=m)
            inds = dbscan.core_sample_indices_
            # visualize core samples and clusters.
            if len(inds):
                discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds],
                                 ax=axes[i, j], s=15, c=colors,
                                 markers=markers)
            axes[i, j].set_title("min_samples: %d eps: %.1f"
                                 % (min_samples, eps))
    fig.tight_layout()
def search_charges(self, data, z=0, threshold=30):
    """Locate charges as clustered local extrema of a quadratic fit to `data`.

    ``deriv(data, z)`` is expected to return six arrays ``A[0..5]`` that are
    used here as the value, the two first derivatives and the three second
    derivatives (xx, xy, yy) of a local quadratic model.
    NOTE(review): that interpretation is inferred from how ``A`` is used --
    confirm against ``deriv``.

    For every pixel the stationary point ``(dx, dy)`` of the model is solved
    for. Pixels are kept when it lies within one pixel, the model value
    there exceeds `threshold` in magnitude and the Hessian determinant is
    positive. The sub-pixel positions are grouped with DBSCAN (eps=1,
    min_samples=1) and each group's mean position and value are handed to
    ``self.set_charges``.

    Returns
    -------
    self
    """
    A = deriv(data, z)
    print('Searching charges...')
    time0 = time.time()

    # Solve grad = 0 for the quadratic model:
    #   A1 + A3*dx + A4*dy = 0
    #   A2 + A4*dx + A5*dy = 0
    det = A[3] * A[5] - A[4] ** 2
    dx = -(A[1] * A[5] - A[2] * A[4]) / det
    # The original referenced an undefined name `Aa` here (NameError).
    dy = -(A[2] * A[3] - A[1] * A[4]) / det
    datamax = (A[0] + A[1] * dx + A[2] * dy
               + A[3] * dx ** 2 / 2 + A[4] * dx * dy + A[5] * dy ** 2 / 2)

    t = np.where((np.abs(dx) < 1) * (np.abs(dy) < 1)
                 * (np.abs(datamax) > threshold) * (det > 0))
    # Column 0 is the x coordinate (second array axis), column 1 is y.
    x = np.array([t[1] + dx[t], t[0] + dy[t]]).T

    db = DBSCAN(min_samples=1, eps=1)
    labels = db.fit_predict(x)
    n_charges = np.max(labels) + 1

    qi = np.zeros(n_charges)
    xi = np.zeros((3, n_charges))  # third row is left at zero
    values = datamax[t]
    for i in range(n_charges):
        members = labels == i
        xi[0:2, i] = np.mean(x[members, :], axis=0)
        qi[i] = np.mean(values[members])

    self.set_charges(qi, xi)
    print('Done! Elapsed time: ' + str(time.time() - time0))
    return self
def _fit_dbscan(self, x):
    """Fit DBSCAN `self.repeats` times and score each fit via a GMM stand-in.

    For every repeat the labels and core sample indices are stored in
    ``self._labels[r]`` and ``self._parameters[r]``. A GMM with one
    component per found cluster is then populated by hand and used to fill
    ``self._ll[r]`` and, depending on ``self.gof_type`` ('aic' or 'bic'),
    ``self._gof[r]``.
    """
    for r in range(self.repeats):
        # fit and evaluate model
        model = DBSCAN(eps=1.0, min_samples=100)
        model.fit_predict(x)
        # Number of clusters, not counting the noise label (-1).
        k = len(set(model.labels_)) - (1 if -1 in model.labels_ else 0)
        self._labels[r] = model.labels_
        self._parameters[r] = model.core_sample_indices_

        # build equivalent gmm
        model_gmm = GMM(n_components=k, covariance_type="full")
        # NOTE(review): core_sample_indices_ holds sample indices, not
        # coordinates -- confirm this is the intended value for the means.
        model_gmm.means_ = model.core_sample_indices_
        model_gmm.covars_ = sp.ones(
            (k, self.input_dim)) * self.sigma_factor
        model_gmm.weights_ = sp.array(
            [(self._labels[r] == i).sum() for i in range(k)])

        # evaluate goodness of fit
        self._ll[r] = model_gmm.score(x).sum()
        if self.gof_type == 'aic':
            self._gof[r] = model_gmm.aic(x)
        if self.gof_type == 'bic':
            self._gof[r] = model_gmm.bic(x)

        # debug info: emitted after the fit because `k` is only known then
        # (the original printed it before assignment and raised NameError).
        if self.debug is True:
            print('\t[%s][c:%d][r:%d] %s'
                  % (self.clus_type, k, r + 1, self._gof[r]))
def current_datapoints_dbscan(self):
    """
    Cluster the current outlier points into slice-clusters using DBSCAN.

    Reads ``self.current_datapoints``, a mapping whose values provide the
    'ids', 'array' and 'weights' arrays, and uses ``self.eps`` as the
    DBSCAN neighbourhood radius (min_samples is fixed at 5).

    Returns a dict keyed by cluster label. Each value is a list of dicts
    with keys 'id', 'lng', 'lat', 'weight' and 'is_core'. Noise points
    (label -1) are dropped. An empty dict is returned when there is
    nothing to cluster.
    """
    nets = list(self.current_datapoints.keys())
    # concatenate() raises on an empty sequence, so bail out first.
    if not nets:
        return {}

    ids = concatenate([self.current_datapoints[x]['ids'] for x in nets])
    coords = concatenate([self.current_datapoints[x]['array'] for x in nets])
    weights = concatenate([self.current_datapoints[x]['weights'] for x in nets])
    if len(ids) == 0:
        return {}

    clustering = DBSCAN(eps=self.eps, min_samples=5)
    labels = clustering.fit_predict(coords)
    # Build the core-id lookup once instead of scanning an array per point.
    # assumes ids is 1-D with hashable entries -- TODO confirm
    core_ids = set(ids[clustering.core_sample_indices_].tolist())

    clustered = labels > -1
    ret_tab = {}
    for label, point_id, coord, weight in zip(labels[clustered],
                                              ids[clustered],
                                              coords[clustered],
                                              weights[clustered]):
        ret_tab.setdefault(label, []).append({
            'id': point_id,
            'lng': coord[0],
            'lat': coord[1],
            'weight': weight,
            'is_core': point_id in core_ids,
        })
    return ret_tab
def DBScan_Flux(phots, ycenters, xcenters, dbsClean=0, useTheForce=False):
    """Flag samples that DBSCAN assigns to the cluster labelled `dbsClean`.

    The flux values are flattened and non-finite entries are replaced by
    the median of the finite ones. Each of the y-centers, x-centers and
    fluxes is standardised on its own, and the three columns are clustered
    with a default DBSCAN.

    Args:
        phots: array of flux values; a flattened copy is used.
        ycenters: array of y positions, indexed as ``ycenters[:, None]``.
        xcenters: array of x positions, indexed as ``xcenters[:, None]``.
        dbsClean: cluster label to compare against.
        useTheForce: unused.

    Returns:
        Boolean array, True where the DBSCAN label equals `dbsClean`.
    """
    flux = np.copy(phots.ravel())
    finite = np.isfinite(flux)
    flux[~finite] = np.median(flux[finite])

    scaler = StandardScaler()
    # The scaler is refit for every column, so each is scaled independently.
    columns = [scaler.fit_transform(values[:, None]).ravel()
               for values in (ycenters, xcenters, flux)]
    features = np.transpose(columns)

    predicted = DBSCAN().fit_predict(features)
    return predicted == dbsClean
def _cluster(params):
    """Cluster the shared precomputed matrix and score the result.

    The clustering method and the matrix are read from the shared constants
    ``sh.getConst('method')`` and ``sh.getConst('mat')``. Only 'dbscan' is
    implemented; the matrix is passed with ``metric='precomputed'``.

    Parameters
    ----------
    params : mapping with keys 'eps' and 'min_samples'.

    Returns
    -------
    (labels, perf) where perf is a dict with 'silhouette_score' and
    'calinski_harabaz_score'. Both are None unless the number of distinct
    labels is between 2 and ``len(labels) - 1``.

    Raises
    ------
    AssertionError
        For 'kmedoid' (not implemented) or any unknown method. Raised
        explicitly so the check survives ``python -O``.
    """
    method = sh.getConst('method')
    if method == 'kmedoid':
        raise AssertionError('kmedoid clustering is not implemented')
    elif method == 'dbscan':
        from sklearn.cluster import DBSCAN
        cls = DBSCAN(eps=params['eps'], min_samples=params['min_samples'],
                     metric='precomputed')
    else:
        raise AssertionError('FATAL: unknown cluster method')

    mat = sh.getConst('mat')
    labels = cls.fit_predict(mat)
    # Distinct labels; the noise label (-1) counts as one of them here.
    nLabels = len(set(labels))

    sil = None
    cal = None
    if (nLabels >= 2) and (nLabels <= len(labels) - 1):
        # `metric` is passed by keyword: newer scikit-learn rejects it
        # positionally, older releases accept both forms.
        sil = met.silhouette_score(mat, labels, metric='precomputed')
        # NOTE(review): `mat` is a distance matrix but this score treats
        # its rows as feature vectors -- confirm that is intended.
        cal = met.calinski_harabaz_score(mat, labels)
    perf = dict(silhouette_score=sil, calinski_harabaz_score=cal)
    return (labels, perf)
def cluster_dbscan(matrix, distance_measure="sts", eps=1):
    """Cluster `matrix` with DBSCAN (min_samples=2).

    Parameters
    ----------
    matrix: np.matrix
        For ``distance_measure == "sts"`` this is used as a precomputed
        distance matrix. For any other measure it is the data matrix
        (described upstream as ngenes x nsamples) and the measure name is
        passed to DBSCAN as its ``metric``.
    distance_measure: str
        "sts" (default, short time-series distance) or a metric name that
        DBSCAN accepts, e.g. 'cityblock', 'cosine', 'euclidean',
        'manhattan', 'chebyshev', 'correlation', 'hamming', 'jaccard'.
        Note: multiple time-series is NOT supported for distances other
        than "sts".
    eps: number
        DBSCAN neighbourhood radius.

    Returns
    -------
    cluster_labels: array of int
        One label per row of `matrix`.
    """
    metric = 'precomputed' if distance_measure == "sts" else distance_measure
    model = DBSCAN(eps=eps, metric=metric, min_samples=2)
    return model.fit_predict(matrix)
def cluster_DBSCAN(args):
    """
    Cluster graphs with DBSCAN after vectorization and truncated SVD.

    Graphs are read from ``args.input_file``, vectorized, projected to
    ``args.n_components`` dimensions and clustered with ``eps = args.eps``.
    The vectorizer and the labels are written to ``args.output_dir_path``.
    """
    # load data
    graphs = node_link_data.node_link_data_to_eden(input=args.input_file,
                                                   input_type="file")
    vec = graph.Vectorizer(r=args.radius, d=args.distance, nbits=args.nbits)
    logger.info('Vectorizer: %s' % vec)

    X = vec.transform(graphs, n_jobs=args.n_jobs)
    n_instances, n_features = X.shape[0], X.shape[1]
    logger.info('Instances: %d Features: %d with an avg of %d features per instance'
                % (n_instances, n_features, X.getnnz() / X.shape[0]))

    # project to lower dimensional space to use clustering algorithms
    svd = TruncatedSVD(n_components=args.n_components)
    X_dense = svd.fit_transform(X)

    # log statistics on data (the average is still taken from the sparse X)
    logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance'
                % (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0]))

    # clustering
    y = DBSCAN(eps=args.eps).fit_predict(X_dense)
    logger.info('Predictions statistics: ' + util.report_base_statistics(y))

    # save model for vectorizer
    vectorizer_name = "vectorizer"
    eden_io.dump(vec, output_dir_path=args.output_dir_path,
                 out_file_name=vectorizer_name)
    logger.info("Written file: %s/%s", args.output_dir_path, vectorizer_name)

    # save result
    labels_name = "labels"
    eden_io.store_matrix(matrix=y, output_dir_path=args.output_dir_path,
                         out_file_name=labels_name, output_format="text")
    logger.info("Written file: %s/%s", args.output_dir_path, labels_name)
def get_clusters(tracks):
    """Cluster tracks by their mean (x, y) position with DBSCAN.

    ``min_samples`` and ``eps`` are read from the ``neighborsSpin`` and
    ``neighborDistanceSpin`` widgets on ``g.m``. Returns one label per
    track.
    """
    min_neighbors = g.m.neighborsSpin.value()
    max_distance = g.m.neighborDistanceSpin.value()
    centres = np.array([[track['mean_x'], track['mean_y']]
                        for track in tracks])
    model = DBSCAN(eps=max_distance, min_samples=min_neighbors)
    return model.fit_predict(centres)
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    The columns are standardised before clustering with a default DBSCAN.
    Rows that DBSCAN labels as noise (-1) are returned.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """
    scaled = StandardScaler().fit_transform(df)

    db = DBSCAN()
    # One fit is enough; fit_predict leaves `db` fitted.
    labels = db.fit_predict(scaled)

    # Noise is label -1. The original used np.nonzero(labels), which also
    # selected every regular cluster except cluster 0.
    df_o = df.iloc[np.flatnonzero(labels == -1)]
    return db, df_o
def cluster_with_dbscan(vectors, epsilon=0.5, min_samples=5, distances=None, metric="euclidean"):
    """Run DBSCAN on a precomputed pairwise distance matrix.

    When `distances` is None the matrix is computed from `vectors` with
    `metric` on all cores (``n_jobs=-1``); otherwise `distances` is used
    as given and `vectors`/`metric` are ignored. Returns the labels.
    """
    precomputed = distances
    if precomputed is None:
        # precomputing our distances lets us use multiple cores
        precomputed = pairwise_distances(vectors, n_jobs=-1, metric=metric)
    model = DBSCAN(eps=epsilon, min_samples=min_samples, metric="precomputed")
    return model.fit_predict(precomputed)
def cluster_lvl1(self, data):
    """Cluster map cells with DBSCAN and return labels shaped like the map.

    Every cell becomes a float32 row made of its two grid indices and its
    value from `data`. ``self.distances_for_lvl1`` turns these rows into a
    distance matrix, which is clustered with eps=2 and min_samples=2.
    """
    height, width = self.map_height, self.map_width
    grid = np.mgrid[:height, :width].reshape(2, -1)
    samples = np.float32(np.vstack([grid, data.ravel()])).T
    distances = self.distances_for_lvl1(samples)
    model = DBSCAN(eps=2., min_samples=2, metric='precomputed')
    return model.fit_predict(distances).reshape(height, width)
def regroup(self, maxdistance, minsize, algo='auto'):
    """Cluster ``self.primarylist`` with DBSCAN and log the label count.

    Parameters
    ----------
    maxdistance : DBSCAN ``eps``.
    minsize : DBSCAN ``min_samples``.
    algo : DBSCAN ``algorithm``.

    The logged count is the number of distinct labels, so the noise label
    (-1) is included when present. Nothing is returned.
    """
    self.__loginfo('Regrouping')
    model = DBSCAN(eps=maxdistance, min_samples=minsize, algorithm=algo)
    # A single fit_predict replaces the original fit() + fit_predict() pair,
    # which ran the clustering twice.
    dbsresult = model.fit_predict(self.primarylist)
    group_count = len(set(dbsresult))
    self.__loginfo('Group label count: %s' % group_count)
def main(datafile, feature1, feature2, normalize, clusteroutput, percentile, copula):
    """Cluster two features of a dataset with DBSCAN, save and plot the result.

    A random sample of 10000 rows (drawn with replacement) is taken. The
    two selected features are z-scored (`normalize`), copula-transformed
    (`copula`) or left as they are, then clustered with DBSCAN (eps=100,
    min_samples=1). ``id,cluster`` lines are written to `clusteroutput`
    and a scatter plot coloured by cluster is shown. With
    ``percentile > 0`` the axes are clipped to the
    [percentile, 100 - percentile] range of the full feature columns.
    """
    X, features = read_sah_h5(datafile, just_good=False)
    if 'id' not in features:
        ids = np.arange(len(X))
    else:
        ids = X[:, features.index('id')]

    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]
    D = np.column_stack([x, y])

    idx = np.random.randint(len(X), size=10000)
    D = D[idx]
    ids = ids[idx]

    if normalize:
        mean = np.average(D, axis=0)
        std = np.std(D, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0  # Avoid NaNs
        Dnorm = (D - mean) / std
    elif copula:
        Dnorm = np.column_stack([copula_transform(f) for f in D.T])
    else:
        Dnorm = D

    dbscan = DBSCAN(eps=100.0, min_samples=1)
    C = dbscan.fit_predict(Dnorm)
    print(C)

    with open(clusteroutput, 'w+') as out:
        for c, i in zip(C, ids):
            out.write('%d,%d\n' % (i, c))

    # Guard the colour normalisation: with a single cluster the largest
    # label is 0 and the original division produced NaNs.
    scale = float(max(np.max(C), 1))
    pl.scatter(D[:, 0], D[:, 1], color=pl.cm.spectral(C.astype(float) / scale))

    if percentile > 0:
        pl.xlim(
            scoreatpercentile(x, percentile),
            scoreatpercentile(x, 100 - percentile)
        )
        pl.ylim(
            scoreatpercentile(y, percentile),
            scoreatpercentile(y, 100 - percentile)
        )

    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
def clusterize(_features):
    """Mark the rows that fall into the most common DBSCAN cluster.

    Columns from index 2 onward are clustered with a default DBSCAN. The
    result stacks columns 0 and 1 of `_features` with a flag that is true
    where the row's label equals ``major_index(labels)``.
    """
    from sklearn.cluster import DBSCAN

    est = DBSCAN()
    Y = est.fit_predict(_features[:, 2:])
    # Compute the majority label once instead of once per row.
    majority = major_index(Y)
    y_pred = [(label == majority) for label in Y]
    return np.c_[_features[:, 0], _features[:, 1], y_pred]
def dbscan_clusterize(regions, eps, min_samples):
    """Group regions by their (cx, cy) centre with DBSCAN.

    Returns ``(clusters, noise)`` as produced by ``convert_clusters``.
    With fewer regions than `min_samples` nothing is clustered and
    ``([], regions)`` is returned.
    """
    if len(regions) < min_samples:
        return [], regions

    centres = np.array([[region.cx, region.cy] for region in regions])
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(centres)
    clusters, noise = convert_clusters(regions, labels)
    return clusters, noise
def dbscan_cluster(docs, eps=None):
    """Clean, vectorize and cluster documents with DBSCAN (min_samples=3).

    When `eps` is None it is taken from the first element of
    ``estimate_eps`` applied to the distance matrix of the vectors.
    Returns the cluster labels.
    """
    vectorizer = Vectorizer()
    cleaned = [clean(doc) for doc in docs]
    vectors = vectorizer.vectorize(cleaned, train=True)

    if eps is None:
        distances = build_dist_mat(vectors)
        eps = estimate_eps(distances)[0]

    model = DBSCAN(min_samples=3, metric='euclidean', eps=eps)
    return model.fit_predict(vectors)
def DBSCAN_cluster(init_ds, ts_flag=False):
    '''
    Append a DBSCAN cluster label to every row of a 2D list.

    Parameters:
        init_ds - 2D list of data
        ts_flag - boolean specifying if the first column of init_ds is a
                  datetime object or not; when true that column is dropped
                  before clustering and is absent from the result
    Returns:
        2D list with additional column denoting which cluster said row
        falls into
    '''
    rows = [row[1:] for row in init_ds] if ts_flag else init_ds
    labels = DBSCAN().fit_predict(rows)
    return [rows[position] + [labels[position]]
            for position, _ in enumerate(rows)]
def evaluate_clustering():
    """Compare Affinity Propagation and DBSCAN on a sense-similarity matrix.

    Both algorithms are fitted on the similarity sub-matrix of the first
    10000 senses. Cluster counts, size statistics, silhouette scores and
    Dunn indices (computed on ``1 - similarity``) are printed.
    """
    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    print('got matrix')

    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')

    # NOTE(review): both models receive the similarity matrix as if its
    # rows were feature vectors (no precomputed affinity/metric is set) --
    # confirm that this is intended.
    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('print dbscan')

    # Vectorised replacement for the original element-by-element double
    # loop over the whole matrix.
    distance_matrix = 1.0 - np.asarray(similarity_matrix, dtype=float)
    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])
    print('created distance matrix')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)
    print(cluster_map1)
    print(cluster_map2)

    # NOTE(review): metric='euclidean' treats the rows of the distance
    # matrix as coordinates; 'precomputed' may have been intended.
    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1,
                                           metric='euclidean')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2,
                                           metric='euclidean')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    print('Number of clusters Affinity Propagation: %f' % len(cluster_map1))
    print('Number of clusters DBSCAN: %f' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silouhette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silouhette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)
def cv_iteration(n_jobs=2, eps=1., min_samples=30, metric='euclidean', algorithm='brute', leaf_size=30, p=2.):
    """Fit DBSCAN once on the loaded data and report score and cluster sizes.

    All parameters are forwarded to the DBSCAN constructor.

    Returns a dict with keys 'result' (list holding the single score),
    'confusion_matrices', 'score_name' and 'cluster_sizes'.
    """
    X, y_train, _ = load_data()

    clusterer = DBSCAN(n_jobs=n_jobs, eps=eps, min_samples=min_samples,
                       metric=metric, algorithm=algorithm,
                       leaf_size=leaf_size, p=p)
    predictions = clusterer.fit_predict(X)
    score, confusion_matrix = scoring_function(y_train, predictions)

    scores = [score]
    cms = [serialise_confusion_matrix(confusion_matrix)]  # confusion matrices
    label_counts = np.unique(predictions, return_counts=True)
    cluster_sizes = [serialise_confusion_matrix(label_counts)]

    # NOTE(review): eval(str(...)) round-trips the serialised values through
    # their repr; kept as is. eval must never see untrusted input.
    return {
        'result': scores,
        'confusion_matrices': eval(str(cms)),
        'score_name': string_enhancer(str(scoring_function)),
        'cluster_sizes': eval(str(cluster_sizes)),
    }
def main(datafile, normalize, ndims, copula, clusteroutput, subsample):
    """Embed a dataset with t-SNE, cluster the embedding and plot it.

    The data are optionally z-scored, subsampled to `subsample` rows (with
    replacement), optionally copula-transformed, embedded into `ndims`
    dimensions with ``bh_sne`` and clustered with DBSCAN (eps=1.75,
    min_samples=5). Feature importances from an ExtraTrees classifier
    trained to predict the cluster labels are printed, ``id,cluster``
    lines are written to `clusteroutput`, and the embedding is plotted
    coloured by cluster.
    """
    X, features = read_sah_h5(datafile)
    _, all_features = read_sah_h5(datafile, just_good=False)
    if 'id' in all_features:
        # NOTE(review): the column index comes from `all_features` but is
        # applied to X, which was loaded with the default feature set --
        # confirm the two column orders agree.
        ids = X[:, all_features.index('id')]
    else:
        ids = np.arange(len(X)).astype(int)

    if normalize:
        mean = np.average(X, axis=0)
        std = np.std(X, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0  # Avoid NaNs
        X = (X - mean) / std

    idx = np.random.randint(len(X), size=subsample)
    X = X[idx]
    ids = ids[idx]

    if copula:
        X = np.column_stack([copula_transform(x) for x in X.T])

    Y = bh_sne(X, d=ndims)

    dbscan = DBSCAN(eps=1.75, min_samples=5)
    C = dbscan.fit_predict(Y)

    tree = ExtraTreesClassifier(n_estimators=100)
    tree.fit(X, C)
    for name, importance in zip(features, tree.feature_importances_):
        print('%s: %f' % (name, importance))

    with open(clusteroutput, 'w+') as out:
        for c, i in zip(C, ids):
            out.write('%d,%d\n' % (i, c))

    # Guard the colour normalisation: the largest label is 0 with a single
    # cluster and -1 when everything is noise, which made the original
    # division produce NaNs or flipped values.
    scale = float(max(np.max(C), 1))
    pl.scatter(Y[:, 0], Y[:, 1], color=pl.cm.spectral(C.astype(float) / scale))

    # Zero-size bars exist only to create one legend entry per cluster.
    for c in np.unique(C):
        pl.bar(0, 0, lw=0, ec='none',
               fc=pl.cm.spectral(float(c) / scale),
               label='Cluster %d' % c)
    pl.legend()
    pl.show()
def predict_window_bps(self, pkt_featurizer):
    """Report whether the newest window is a DBSCAN outlier in bytes/sec.

    `pkt_featurizer` is added to the windows, every window is converted
    with ``self.to_bytes_sec`` and the values are clustered with a default
    DBSCAN. The entry just added is then popped again, and the last window
    is removed if that leaves it empty.

    Returns True when the last window is labelled as noise (-1).
    """
    self.add_to_windows(pkt_featurizer)

    # List comprehensions instead of map(): on Python 3 a lazy map object
    # cannot be reshaped by np.array or plotted.
    windows_bps = [self.to_bytes_sec(window) for window in self.windows]
    windows_bps_shaped = np.array(windows_bps).reshape(-1, 1)

    labels = DBSCAN().fit_predict(windows_bps_shaped)
    is_outlier = labels[-1] == -1

    if self.plot:
        x_range = [self.to_mid_time(window) for window in self.windows]
        self.plot_1d_dbscan(windows_bps_shaped, labels, x_range,
                            self.windows_fig, "Mid Time of Window",
                            "Average bytes/sec", "Windowed Bps Clustering")

    # Undo the temporary addition made for this prediction.
    self.windows[-1].pop()
    if not self.windows[-1]:
        self.windows.pop()
    return is_outlier
def cluster_characters(self):
    """Assign each character to a group from scene co-occurrence counts.

    A character-by-character matrix is filled from the characters of every
    scene, normalised to sum to one and clustered row-wise with DBSCAN
    (eps = mean matrix entry, min_samples=1). The label is stored on each
    character as ``character.cluster``.
    """
    n_characters = len(self.characters)
    counts = np.zeros((n_characters, n_characters))
    for scene in self:
        for first in scene.characters:
            for second in scene.characters:
                # Both orderings of a pair are visited, so an off-diagonal
                # cell grows by 2 per shared scene and a diagonal cell by 1.
                counts[first.id, second.id] += 1.0
                counts[second.id, first.id] = counts[first.id, second.id]
    counts = counts / counts.sum()

    model = DBSCAN(eps=counts.mean(), min_samples=1)
    labels = model.fit_predict(counts)
    for character in self.characters:
        # check if this propagates
        character.cluster = labels[character.id]
def clusterize(_features):
    """Mark the rows that fall into the most common DBSCAN cluster.

    Columns from index 2 onward are clustered with a default DBSCAN. The
    result stacks columns 0 and 1 of `_features` with a flag that is true
    where the row's label is the most frequent label (noise, -1, competes
    like any other label).
    """
    from collections import Counter

    est = DBSCAN()
    Y = est.fit_predict(_features[:, 2:])
    # Count once. The original rebuilt the histogram for every row and
    # relied on the Python-2-only dict.iteritems().
    majority = Counter(Y.tolist()).most_common(1)[0][0]
    y_pred = [(label == majority) for label in Y]
    return np.c_[_features[:, 0], _features[:, 1], y_pred]
def cluster_subgraphs(self, matrix, nth_neighbor=1):
    '''
    Cluster the rows of `matrix` with DBSCAN using a data-driven eps.

    For every row the distance to its first neighbour at non-zero distance
    is collected. eps is the value at fraction ``self.dbscan_range`` of
    those sorted distances. ``self.min_clustersize`` is used as
    min_samples; values below 1.0 are read as a fraction of the row count.

    Returns the DBSCAN labels.
    '''
    neigh = NearestNeighbors(n_neighbors=nth_neighbor + 1, metric='euclidean')
    neigh.fit(matrix)
    dist, _ = neigh.kneighbors(matrix)

    if self.min_clustersize < 1.0:
        # Rounded up to an integer: "count >= 2.5" and "count >= 3" select
        # the same core points, and newer scikit-learn rejects a float.
        minsamp = max(1, int(np.ceil(matrix.shape[0] * self.min_clustersize)))
    else:
        minsamp = self.min_clustersize

    def distances_select_first_non_id_neighbor(distances):
        # nonzero() lists entries row by row, so the first occurrence of
        # each row index is that row's nearest non-zero distance. Rows
        # whose distances are all zero contribute nothing.
        x, y = distances.nonzero()
        _, idd = np.unique(x, return_index=True)
        return distances[x[idd], y[idd]]

    dists = np.sort(distances_select_first_non_id_neighbor(dist))
    # Clamp so that dbscan_range == 1.0 picks the largest distance instead
    # of indexing one past the end.
    idx = min(int(len(dists) * self.dbscan_range), len(dists) - 1)
    dist = dists[idx]
    if self.debug:
        print("name_subgraph: choosing dist %d of %d" % (idx, len(dists)))

    # get the clusters
    scan = DBSCAN(eps=dist, min_samples=minsamp)
    return scan.fit_predict(matrix)
def split_dbscan(self, eps, min_samples):
    """Split ``self.files`` into ``Cluster`` objects using DBSCAN labels.

    One cluster is created per label 0..max plus a final one renamed 'na'
    that receives the noise files (label -1).
    """
    # Extract dataset from files
    samples = [f.dataset for f in self.files]

    # Fit dataset
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(samples)

    # One slot per regular label, plus a trailing slot for noise.
    count = max(labels) + 2
    clusters = []
    for number in range(count):
        clusters.append(Cluster(self.directory, self.name + '-' + str(number)))
    clusters[-1].name = 'na'

    for member, label in zip(self.files, labels):
        # -1 % count == count - 1, so noise lands in the 'na' cluster.
        clusters[label % count].add_file(member)
    return clusters
class DBScan:
    """
    DBScan model: a thin wrapper that pairs a DBSCAN estimator with the
    features it is fitted on.
    """

    def __init__(self, features, eps, min_samples):
        """
        Initialisation method for DBScan

        :param features: trajectory feature to learn from
        :type features: list()
        :param eps: DBSCAN neighbourhood radius
        :param min_samples: DBSCAN minimum neighbourhood size
        """
        self.features = features
        self.model = DBSCAN(eps=eps, min_samples=min_samples)
        self.cluster_labels = []

    def fit_predict(self):
        """
        Fits the model to the data and return the cluster labels.

        The labels are also stored in ``self.cluster_labels``.
        """
        self.cluster_labels = self.model.fit_predict(self.features)
        # The original stored the labels but returned None despite the
        # documented contract.
        return self.cluster_labels
def main():
    """Cluster the sample documents with cosine DBSCAN and report timings.

    Loads the vectorized sample data, clusters it (eps=0.53, min_samples=2,
    brute-force cosine metric), evaluates the clustering with
    ``cluster_quality`` and prints how long each stage took.
    """
    print("")
    started = time.time()

    vectorized_data_words, vectorized_class_labels = get_sample_data(False)
    data_proc_time = time.time() - started
    print("Data processing took " + str(data_proc_time) + " seconds")

    print("Clustering data...")
    estimator = DBSCAN(eps=0.53, min_samples=2, metric="cosine",
                       algorithm="brute")
    prediction = estimator.fit_predict(vectorized_data_words)
    print("Clustering took " + str(time.time() - started - data_proc_time) + " seconds")

    cluster_quality(prediction, vectorized_data_words, vectorized_class_labels)

    print("Total running time: " + str(time.time() - started) + " seconds")
    print("")
def get_clusters(self, eps_range=(0.001, 0.002, 0.003, 0.0035, 0.004, 0.0045, 0.005, 0.0055, 0.006, 0.007, 0.008, 0.009, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, .1)):
    """Pick the DBSCAN eps that yields the most distinct labels.

    DBSCAN is fitted on ``self.feature_vectors`` once per value in
    `eps_range`, with ``self.kwargs`` forwarded to the constructor. The
    first fit with the largest number of distinct labels wins; the noise
    label (-1) counts as a label.

    Returns
    -------
    (labels, similarities)
        `labels` are the winning fit's labels. `similarities` is a list
        with, per sample, the largest dot product against the winning
        fit's core samples, or all ones when there are no core samples.

    Raises
    ------
    ValueError
        If `eps_range` is empty.
    """
    eps_values = list(eps_range)
    if not eps_values:
        raise ValueError("eps_range must contain at least one value")

    best_num_clusters = 0
    best_fitted_response, best_cluster = 0, None
    for epsilon in eps_values:
        clusterer = DBSCAN(eps=epsilon, **self.kwargs)
        fitted_response = clusterer.fit_predict(self.feature_vectors)

        # Check if there are more clusters
        num_unique_responses = len(set(fitted_response))
        if num_unique_responses > best_num_clusters:
            best_num_clusters = num_unique_responses
            best_cluster = clusterer
            best_fitted_response = fitted_response

    if not best_cluster.components_.shape[0]:
        # Already a list; the original called .tolist() on it and raised
        # AttributeError.
        similarities = [1] * self.feature_vectors.shape[0]
    else:
        similarities = np.max(
            self.feature_vectors.dot(best_cluster.components_.T),
            axis=1).tolist()
    return best_fitted_response, similarities
def train(args, model, device, optimizer, exp_dir):
    """Run episodic self-training with DBSCAN pseudo-labels.

    Every iteration extracts features for the training images, clusters
    them with DBSCAN on the re-ranked distance matrix, keeps the images of
    sufficiently large clusters as a pseudo-labelled dataset, trains the
    model on it for ``args.epochs`` epochs and evaluates with ``test``.
    NMI/AMI/SMI of the pseudo-labels against the true labels and the test
    accuracy are collected per iteration and printed.

    `exp_dir` is not used in this function.

    NOTE(review): the indentation of this function was reconstructed from
    a whitespace-collapsed source -- confirm the loop boundaries.
    """
    # change the rho value according to the number of training samples
    if args.trainsize > 2000 and args.trainsize <= 6000:
        rho = 1.7e-3
    elif args.trainsize > 6000 and args.trainsize <= 8000:
        rho = 1.5e-3
    elif args.trainsize > 8000 and args.trainsize <= 10000:
        rho = 1.3e-3
    elif args.trainsize > 10000 and args.trainsize <= 12000:
        rho = 1.1e-3
    elif args.trainsize > 12000 and args.trainsize <= 14000:
        rho = 0.9e-3
    elif args.trainsize > 14000 and args.trainsize <= 16000:
        rho = 0.7e-3
    else:
        # outside the ranges above, fall back to the configured value
        rho = args.rho

    # start episodic training
    total_NMI = np.zeros(args.iteration)
    total_AMI = np.zeros(args.iteration)
    total_SMI = np.zeros(args.iteration)
    # one extra slot: index 0 holds the accuracy before any self-training
    total_ACCU = np.zeros(args.iteration + 1)

    # Testing before self-training
    print('Tesing before self-training')
    accu = test(args, model, device)
    total_ACCU[0] = accu

    for iter_n in range(args.iteration):
        # generate data loader (unshuffled, so features align with splittxt)
        extraction_loader = DataLoader(
            dataset.Omniglot(
                root=args.data_dir,
                train=True,
                size=args.trainsize,
                transform=transforms.Compose([
                    transforms.Resize(32),
                    #transforms.Grayscale(num_output_channels=3),
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307, ), (0.3081, ))
                ])),
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=4,
            pin_memory=True)

        # extract all data features
        train_features, target_labels = extract_features(
            model=model, data_loader=extraction_loader, device=device)

        # rerank to get the jaccard distance
        rerank_dist = re_ranking(features=train_features,
                                 MemorySave=args.memory_save)

        # build the DBSCAN model: eps is the mean of the smallest `rho`
        # fraction of the non-zero upper-triangle distances
        tri_mat = np.triu(rerank_dist, 1)  # tri_mat.dim=2
        tri_mat = tri_mat[np.nonzero(tri_mat)]  # tri_mat.dim=1
        tri_mat = np.sort(tri_mat, axis=None)
        top_num = np.round(rho * tri_mat.size).astype(int)
        eps = tri_mat[:top_num].mean()
        print('eps in cluster: {:.3f}'.format(eps))
        cluster = DBSCAN(eps=eps,
                         min_samples=4,
                         metric='precomputed',
                         n_jobs=8)

        # select & cluster images as training set of this episode
        print('Clustering and labeling...')
        train_features = train_features.cpu().numpy()
        labels = cluster.fit_predict(rerank_dist)

        # calculate NMI of chosen data points of current episode
        TL = target_labels
        list_true = [int(TL[i].cpu().numpy()) for i in range(len(TL))]
        list_pred = labels.tolist()
        NMI = nmi_withGT(list_pred, list_true)
        AMI = ami_withGT(list_pred, list_true)
        SMI = sampling_NMI_withGT(list_pred, list_true)
        total_NMI[iter_n] = NMI
        total_AMI[iter_n] = AMI
        total_SMI[iter_n] = SMI
        # distinct labels minus one (subtracts the noise label; this is
        # only exact when a noise label is actually present)
        num_ids = len(set(labels)) - 1

        # generate new dataset: skip noise and clusters with fewer than 6
        # members
        new_dataset = []
        unique_labels, label_count = np.unique(labels, return_counts=True)
        for i in range(len(extraction_loader.dataset.splittxt)):
            idd = np.where(unique_labels == labels[i])[0][0]
            if labels[i] == -1 or label_count[idd] < 6:
                continue
            new_dataset.append(
                (extraction_loader.dataset.splittxt[i], labels[i], 0))
        LL = [new_dataset[i][1] for i in range(len(new_dataset))]
        print(np.unique(LL, return_counts=True))
        print(
            'Iteration {} have {} training ids, {} training images, NMI is {}, AMI is {}, SMI is {}'
            .format(iter_n + 1, num_ids, len(new_dataset), NMI, AMI, SMI))

        # training dataloader
        BS = args.batch_size * args.ims_per_id
        train_loader = DataLoader(
            dataset.Omniglot_clustering(root=args.data_dir,
                                        dat_set=new_dataset,
                                        transform=transforms.Compose([
                                            transforms.Resize(32),
                                            transforms.ToTensor(),
                                            transforms.Normalize((0.1307, ), (0.3081, ))
                                        ])),
            batch_size=BS,
            num_workers=4,
            sampler=RandomIdentitySampler(new_dataset, args.ims_per_id),
            pin_memory=True,
            drop_last=True)

        # training with prototypical learning methods
        for ep in range(args.epochs):
            # Adjust Learning Rate
            adjust_lr_exp(optimizer, args.base_lr, ep + 1, args.epochs,
                          args.exp_decay_at_epoch)

            model.train()
            protoacc_meter = AverageMeter()
            protoloss_meter = AverageMeter()
            ep_st = time.time()

            for data, target in tqdm(train_loader):
                #pdb.set_trace()
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                feat, x_hat = model(data)
                protoloss, acc = loss_fn(feat,
                                         target=target,
                                         n_support=args.train_shot)
                protoloss = protoloss.to(device)
                protoloss.backward()
                optimizer.step()
                protoacc_meter.update(acc.item())
                protoloss_meter.update(protoloss.item())

            # Epoch log
            time_log = 'Ep {}, {:.2f}s'.format(
                ep,
                time.time() - ep_st,
            )
            loss_log = (', acc {:.2%}, protoloss {:.4f}'.format(
                protoacc_meter.avg, protoloss_meter.avg))
            final_log = time_log + loss_log
            print(final_log)

        # adjust learning rate back to the initial learning rate
        print('Learning rate adjuested back to base learning rate {:.10f}'.
              format(args.base_lr))
        for g in optimizer.param_groups:
            g['lr'] = args.base_lr

        accu = test(args, model, device)
        total_ACCU[iter_n + 1] = accu

    print('total NMI value is, ', total_NMI)
    print('total AMI value is, ', total_AMI)
    print('total SMI value is, ', total_SMI)
    print('total ACCU value is, ', total_ACCU)
class Hdbscan(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    '''
    Primitive that applies Hierarchical Density-Based Clustering or
    Density-Based Clustering algorithms. This is an unsupervised,
    clustering primitive, but has been represented as a supervised
    classification problem to produce a compliant primitive.

    The ``algorithm`` hyperparameter selects the estimator: 'HDBSCAN'
    builds an ``hdbscan.HDBSCAN``, any other value builds a ``DBSCAN``.

    Training inputs: D3M dataset with features and labels, and D3M indices
    Outputs: D3M dataset with predicted labels and D3M indices
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "ca014488-6004-4b54-9403-5920fbe5a834",
        'version': __version__,
        'name': "hdbscan",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Clustering'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Unsupervised",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.29.14',
        }, {
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/NewKnowledge/D3M-Unsupervised.git@{git_commit}#egg=D3MUnsupervised'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.clustering.hdbscan.Hdbscan',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DBSCAN,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        """Build the estimator selected by the ``algorithm`` hyperparameter."""
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        if self.hyperparams['algorithm'] == 'HDBSCAN':
            self.clf = hdbscan.HDBSCAN(
                min_cluster_size=self.hyperparams['min_cluster_size'],
                min_samples=self.hyperparams['min_samples'],
                cluster_selection_method=self.
                hyperparams['cluster_selection_method'])
        else:
            self.clf = DBSCAN(eps=self.hyperparams['eps'],
                              min_samples=self.hyperparams['min_samples'])

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Cluster the attribute columns of `inputs` and return the labels.

        `timeout` and `iterations` are not used in this method.

        Parameters
        ----------
        inputs : dataframe with attached metadata for semi-supervised or unsupervised data

        Returns
        ----------
        Outputs
            The output depends on the required_output hyperparameter and is either a dataframe
            containing a single column where each entry is the cluster ID, or the input dataframe
            with the cluster ID of each row added as an additional feature.
        """

        # find target and index variables
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        # NOTE(review): this repeats the TrueTarget query made just above;
        # presumably a different semantic type was intended -- confirm.
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        # the feature matrix is the input without the first index column
        # and without the target columns
        X_test = inputs.copy()
        if len(index):
            X_test = X_test.drop(columns=list(inputs)[index[0]])
        if len(target_names):
            X_test = X_test.drop(columns=target_names)
        X_test = X_test.values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs,
                                                 np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        if self.hyperparams['required_output'] == 'feature':
            # 'feature' mode: append the labels to the input as one new
            # attribute column named 'cluster_labels'

            hdb_df = d3m_DataFrame(
                pandas.DataFrame(self.clf.fit_predict(X_test),
                                 columns=['cluster_labels']))

            # describe the single new column as an integer attribute
            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = 'cluster_labels'
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            # describe the column dimension (one column)
            df_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 1
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(utils_cp.append_columns(inputs, hdb_df))
        else:
            # otherwise: return d3mIndex plus the labels under the name of
            # the first target column (indexing target_names[0] and
            # index_names[0] requires both lists to be non-empty)

            hdb_df = d3m_DataFrame(
                pandas.DataFrame(self.clf.fit_predict(X_test),
                                 columns=[target_names[0]]))

            hdb_df = pandas.concat([inputs.d3mIndex, hdb_df], axis=1)

            # column 0: the primary key
            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            # column 1: the predicted target
            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = target_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 1), col_dict)

            # describe the column dimension (two columns)
            df_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 2
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(hdb_df)
class DBScanCluster:
    """DBSCAN clustering of feature vectors with entropy/variance reporting.

    The object keeps the fitted estimator in ``self.dbsc`` and, while
    ``generate_clusters`` is running, the cluster assignment in
    ``self.clusters`` (cluster label -> list of feature-vector objects).
    """

    def __init__(self, epsilon=0.3, min_pts=10):
        """
        function: constructor
        ---------------------
        instantiate a dbscan clustering algorithm

        :param epsilon: value passed to DBSCAN as ``eps``
        :param min_pts: value passed to DBSCAN as ``min_samples``
        """
        self.name = "dbscan"
        self.dbsc = None
        self.epsilon = epsilon
        self.min_pts = min_pts
        self.clusters = {}

    ###########################################################################
    ######### general helper functions for measuring cluster quality ##########
    ###########################################################################

    def __compute_variance(self):
        """
        function: compute_variance
        --------------------------
        compute the (population) variance of the cluster sizes

        returns: variance of the clustering, 0.0 when there are no clusters
        """
        if not self.clusters:
            # Nothing to average over; avoid dividing by zero.
            return 0.0
        sizes = [len(cluster) for cluster in self.clusters.values()]
        mean = float(sum(sizes)) / len(sizes)
        return sum((size - mean) ** 2 for size in sizes) / len(sizes)

    def __compute_entropy(self, dataset):
        """
        function: compute_entropy
        -------------------------
        compute the size-weighted topic entropy of a grouping

        :param dataset: mapping of group key -> collection of feature vectors
            (each exposing a ``topics`` container)
        returns: entropy score of the grouping; relies on ``self.vectors`` and
            ``self.topics`` having been set beforehand
        """
        total = len(self.vectors)
        entropy = 0.0
        for cluster in dataset.values():
            size = len(cluster)
            if not size:
                # An empty group contributes nothing.
                continue
            weight = float(size) / total
            cluster_entropy = 0.0
            for topic in self.topics:
                hits = sum(1 for fv in cluster if topic in fv.topics)
                if hits:
                    share = hits / float(size)
                    cluster_entropy -= share * log(share, 2)
            entropy += weight * cluster_entropy
        return entropy

    ###########################################################################
    ################ mains to generate and test the clustering ################
    ###########################################################################

    def generate_clusters(self, feature_vectors):
        """
        function: generate_clusters
        ---------------------------
        run DBSCAN on the feature vectors and print quality statistics

        Prints the entropy before/after clustering, the entropy gain, the
        variance of the cluster sizes and the elapsed clustering time, then
        resets ``self.clusters`` to an empty dict.

        :param feature_vectors: dict of key -> feature-vector object exposing
            ``vector`` and ``topics``
        """
        cluster_start = time.time()

        # Freeze the key order once so every label produced by DBSCAN can be
        # mapped back to the entry it was computed from, whatever the keys are.
        keys = list(feature_vectors)
        fv_space = [feature_vectors[key].vector for key in keys]
        topic_space = [feature_vectors[key].topics for key in keys]

        self.dbsc = DBSCAN(eps=self.epsilon, min_samples=self.min_pts)
        labels = self.dbsc.fit_predict(fv_space)

        # split dataset based on clusters
        for key, label in zip(keys, labels):
            self.clusters.setdefault(label, []).append(feature_vectors[key])
        cluster_time = time.time() - cluster_start

        # set object members for entropy calculation
        self.vectors = feature_vectors
        self.topics = set().union(*topic_space)

        # compute entropy of clusters + gain
        before = self.__compute_entropy({0: list(self.vectors.values())})
        after = self.__compute_entropy(self.clusters)
        print("Entropy Before Clustering: %s" % (before,))
        print("Entropy After Clustering : %s" % (after,))
        print("Overall Gain in Entropy: %s" % (before - after,))

        # compute variance of cluster sizes + time
        print("Clustering Variance: %s" % (self.__compute_variance(),))
        print("Time for Clustering: %s seconds" % (cluster_time,))

        # reset clusters
        self.clusters = {}
def main(args):
    """Self-train a re-ID model pair on a target dataset via DBSCAN pseudo labels.

    Every outer iteration extracts target features with the current model,
    computes re-ranked distances, clusters them with DBSCAN, splits the
    target images into clustered samples and noise samples, and trains
    ``model``/``coModel`` with ``RCoTeaching`` for ``args.epochs`` epochs.
    After the last iteration the model is evaluated once more and saved to
    ``<args.logs_dir>/RCT.pth``.

    Args:
        args: argument namespace.  Attributes read here: ``seed``,
            ``num_instances``, ``batch_size``, ``height``, ``width``,
            ``arch``, ``src_dataset``, ``tgt_dataset``, ``data_dir``,
            ``workers``, ``resume``, ``print_freq``, ``margin``, ``lr``,
            ``iteration``, ``lambda_value``, ``rho``, ``epochs``,
            ``logs_dir``.  ``height``/``width`` are filled in when ``None``.

    Returns:
        Tuple ``(rank_score.map, rank_score.market1501[0])`` of the final
        evaluation.

    Raises:
        AssertionError: ``num_instances`` is not > 1 or does not divide
            ``batch_size``.
        RuntimeError: ``args.src_dataset`` has no known identity count.
        RuntimeWarning: ``args.resume`` is empty (no checkpoint to start from).
    """
    from collections import defaultdict

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.benchmark = True

    # Create data loaders
    assert args.num_instances > 1, "num_instances should be greater than 1"
    assert args.batch_size % args.num_instances == 0, \
        'num_instances should divide batch_size'
    if args.height is None or args.width is None:
        if args.arch == 'inception':
            args.height, args.width = 144, 56
        else:
            args.height, args.width = 256, 128

    # get source data
    src_dataset, src_extfeat_loader = get_source_data(
        args.src_dataset, args.data_dir, args.height, args.width,
        args.batch_size, args.workers)
    # get target data
    tgt_dataset, num_classes, tgt_extfeat_loader, test_loader = get_data(
        args.tgt_dataset, args.data_dir, args.height, args.width,
        args.batch_size, args.workers)

    # Create model
    # Hacking here to let the classifier be the number of source ids
    source_id_counts = {
        'dukemtmc': 632,
        'market1501': 676,
        'msmt17': 1041,
        'cuhk03': 1230,
    }
    if args.src_dataset not in source_id_counts:
        raise RuntimeError('Please specify the number of classes (ids) of the network.')
    n_source_ids = source_id_counts[args.src_dataset]
    model = models.create(args.arch, num_classes=n_source_ids, pretrained=False)
    coModel = models.create(args.arch, num_classes=n_source_ids, pretrained=False)

    # Load from checkpoint
    if not args.resume:
        raise RuntimeWarning('Not using a pre-trained model.')
    print('Resuming checkpoints from finetuned model on another dataset...\n')
    checkpoint = load_checkpoint(args.resume)
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    coModel.load_state_dict(checkpoint['state_dict'], strict=False)

    model = nn.DataParallel(model).cuda()
    coModel = nn.DataParallel(coModel).cuda()

    evaluator = Evaluator(model, print_freq=args.print_freq)

    # Criterion: one triplet loss per network
    criterion = [
        TripletLoss(args.margin, args.num_instances, isAvg=False, use_semi=False).cuda(),
        TripletLoss(args.margin, args.num_instances, isAvg=False, use_semi=False).cuda(),
    ]

    # Optimizer: one Adam instance per network
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    coOptimizer = torch.optim.Adam(coModel.parameters(), lr=args.lr)
    optims = [optimizer, coOptimizer]

    # training stage transformer on input images
    normalizer = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_transformer = T.Compose([
        T.Resize((args.height, args.width)),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        normalizer,
        T.RandomErasing(probability=0.5, sh=0.2, r1=0.3),
    ])

    # ``epoch`` is read after the loops for the checkpoint; give it a value so
    # a run with zero iterations or zero epochs does not hit a NameError.
    epoch = -1

    # Start training
    for iter_n in range(args.iteration):
        if args.lambda_value == 0:
            source_features = 0
        else:
            # get source datas' feature
            source_features, _ = extract_features(
                model, src_extfeat_loader, print_freq=args.print_freq, numStripe=None)
            # synchronization feature order with src_dataset.train
            source_features = torch.cat(
                [source_features[f].unsqueeze(0) for f, _, _ in src_dataset.train], 0)

        # extract training images' features
        print('Iteration {}: Extracting Target Dataset Features...'.format(iter_n + 1))
        target_features, _ = extract_features(
            model, tgt_extfeat_loader, print_freq=args.print_freq, numStripe=None)
        # synchronization feature order with dataset.train
        target_features = torch.cat(
            [target_features[f].unsqueeze(0) for f, _, _ in tgt_dataset.trainval], 0)

        # calculate distance and rerank result
        print('Calculating feature distances...')
        target_features = target_features.numpy()
        rerank_dist = re_ranking(source_features, target_features,
                                 lambda_value=args.lambda_value)

        if iter_n == 0:
            # The DBSCAN estimator (and its eps) is built once, from the first
            # iteration's distances, and reused by every later iteration.
            tri_mat = np.triu(rerank_dist, 1)  # tri_mat.dim=2
            tri_mat = tri_mat[np.nonzero(tri_mat)]  # tri_mat.dim=1
            tri_mat = np.sort(tri_mat, axis=None)
            top_num = np.round(args.rho * tri_mat.size).astype(int)
            eps = tri_mat[:top_num].mean()
            print('eps in cluster: {:.3f}'.format(eps))
            cluster = DBSCAN(eps=eps, min_samples=4, metric='precomputed', n_jobs=8)

        # select & cluster images as training set of this epochs
        print('Clustering and labeling...')
        labels = cluster.fit_predict(rerank_dist)
        # Only discount the noise label (-1) when DBSCAN actually produced it.
        num_ids = len(set(labels)) - (1 if -1 in labels else 0)
        print('Iteration {} have {} training ids'.format(iter_n + 1, num_ids))

        # generate new dataset
        new_dataset, unknown_dataset = [], []
        # assign label for target ones
        unknownLab = labelNoise(torch.from_numpy(target_features),
                                torch.from_numpy(labels))
        unCounter = 0
        realIDs, fakeIDs = defaultdict(list), []
        samples = zip(tgt_dataset.trainval, labels)
        for index, ((fname, realPID, cam), label) in enumerate(samples):
            realIDs[realPID].append(index)
            if label == -1:
                # noise sample: takes the next label produced by labelNoise
                noise_label = int(unknownLab[unCounter])
                unCounter += 1
                unknown_dataset.append((fname, noise_label, cam))
                fakeIDs.append(noise_label)
            else:
                new_dataset.append((fname, label, cam))
                fakeIDs.append(label)
        print('Iteration {} have {} training images'.format(iter_n + 1, len(new_dataset)))

        # noise entries in fakeIDs carry their labelNoise label, never -1
        precision, recall, fscore = calScores(realIDs, np.asarray(fakeIDs))
        print('precision:{}, recall:{}, fscore: {}'.format(
            100 * precision, 100 * recall, fscore))

        train_loader = DataLoader(
            Preprocessor(new_dataset, root=tgt_dataset.images_dir,
                         transform=train_transformer),
            batch_size=args.batch_size, num_workers=4,
            sampler=RandomIdentitySampler(new_dataset, args.num_instances),
            pin_memory=True, drop_last=True
        )
        # hard samples
        unLoader = DataLoader(
            Preprocessor(unknown_dataset, root=tgt_dataset.images_dir,
                         transform=train_transformer),
            batch_size=args.batch_size, num_workers=4,
            sampler=RandomIdentitySampler(unknown_dataset, args.num_instances),
            pin_memory=True, drop_last=True
        )

        # train model with new generated dataset
        trainer = RCoTeaching(
            model, coModel, train_loader, unLoader, criterion, optims
        )

        # Start training; remRate grows linearly from 0.2 towards 1.0 over
        # the outer iterations.
        for epoch in range(args.epochs):
            trainer.train(epoch, remRate=0.2 + (0.8 / args.iteration) * (1 + iter_n))

        # test only
        rank_score = evaluator.evaluate(test_loader, tgt_dataset.query, tgt_dataset.gallery)

    # Evaluate
    rank_score = evaluator.evaluate(test_loader, tgt_dataset.query, tgt_dataset.gallery)
    save_checkpoint({
        'state_dict': model.module.state_dict(),
        'epoch': epoch + 1,
        'best_top1': rank_score.market1501[0],
    }, True, fpath=osp.join(args.logs_dir, 'RCT.pth'))
    return rank_score.map, rank_score.market1501[0]
"""
Created on Tue Apr 2 17:16:08 2019

@author: cankozan

Cluster the first two columns of ``data.xlsx`` with k-means, agglomerative
clustering and DBSCAN, and draw a dendrogram of their hierarchical linkage.
"""
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import DBSCAN

# Load the sheet and discard every row that has a missing value.
df = pd.read_excel('data.xlsx').dropna()

# k-means with six clusters: fit first, then label the very same rows.
km = KMeans(n_clusters=6)
km.fit(df.iloc[:, :2])
labeled = km.predict(df.iloc[:, :2])

# Agglomerative clustering with six clusters.
ac = AgglomerativeClustering(n_clusters=6)
labeled_ac = ac.fit_predict(df.iloc[:, :2])

# Hierarchical linkage of the same two columns, drawn as a dendrogram.
z = linkage(df.iloc[:, :2])
dendrogram(z)

# Density-based clustering of the same two columns.
dbs = DBSCAN(min_samples=2, eps=10)
labeled_dbs = dbs.fit_predict(df.iloc[:, :2])
# Import every estimator from the public ``sklearn.cluster`` package; the
# private ``sklearn.cluster.hierarchical`` module path is not available in
# newer scikit-learn releases.
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN


def _show_two_clusters(points, labels, title):
    """Scatter the rows labelled 0 (green) and 1 (red), then show the figure.

    Rows carrying any other label are not drawn.
    """
    plt.scatter(points[labels == 0, 0], points[labels == 0, 1], c='green')
    plt.scatter(points[labels == 1, 0], points[labels == 1, 1], c='red')
    plt.title(title)
    plt.show()


# k-means clustering
km = KMeans(init='random', max_iter=150, n_clusters=2, random_state=0)
y_km = km.fit_predict(X)
_show_two_clusters(X, y_km, "KMeans")

# Agglomerative clustering with complete linkage
aggcl = AgglomerativeClustering(n_clusters=2, linkage='complete')
y_agcl = aggcl.fit_predict(X)
_show_two_clusters(X, y_agcl, "Aggolomerative Clustering")

# Demonstrating clustering using a density-based approach
dbs = DBSCAN(eps=0.2, min_samples=5)
y_dbs = dbs.fit_predict(X)
_show_two_clusters(X, y_dbs, "Density based(DBSCAN) Clustering")
def test_weighted_dbscan():
    """Check that ``sample_weight`` is validated and honoured by DBSCAN.

    Covers the ``dbscan`` function (feature input and precomputed distances)
    and the ``DBSCAN`` estimator through both ``fit`` and ``fit_predict``.
    """
    two_points = [[0], [1]]

    # ensure sample_weight is validated
    for wrong_length_weight in ([2], [2, 3, 4]):
        with pytest.raises(ValueError):
            dbscan(two_points, sample_weight=wrong_length_weight)

    # Each entry: (expected core sample indices, keyword arguments added to
    # ``min_samples=6``).  Entries run in order.
    core_cases = [
        # ensure sample_weight has an effect
        ([], {'sample_weight': None}),
        ([], {'sample_weight': [5, 5]}),
        ([0], {'sample_weight': [6, 5]}),
        ([0, 1], {'sample_weight': [6, 6]}),
        # points within eps of each other:
        ([0, 1], {'eps': 1.5, 'sample_weight': [5, 1]}),
        # and effect of non-positive and non-integer sample_weight:
        ([], {'sample_weight': [5, 0], 'eps': 1.5}),
        ([0, 1], {'sample_weight': [5.9, 0.1], 'eps': 1.5}),
        ([0, 1], {'sample_weight': [6, 0], 'eps': 1.5}),
        ([], {'sample_weight': [6, -1], 'eps': 1.5}),
    ]
    for expected_core, extra_kwargs in core_cases:
        result = dbscan([[0], [1]], min_samples=6, **extra_kwargs)
        assert_array_equal(expected_core, result[0])

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    weighted_core, weighted_labels = dbscan(X, sample_weight=sample_weight)
    assert len(weighted_labels) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    repeated_core, _repeated_labels = dbscan(X_repeated)
    repeated_is_core = np.zeros(X_repeated.shape[0], dtype=bool)
    repeated_is_core[repeated_core] = True
    weighted_is_core = np.zeros(X.shape[0], dtype=bool)
    weighted_is_core[weighted_core] = True
    assert_array_equal(np.repeat(weighted_is_core, sample_weight), repeated_is_core)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    precomputed_core, precomputed_labels = dbscan(
        D, sample_weight=sample_weight, metric="precomputed"
    )
    assert_array_equal(weighted_core, precomputed_core)
    assert_array_equal(weighted_labels, precomputed_labels)

    # sample_weight should work with estimator
    fitted = DBSCAN().fit(X, sample_weight=sample_weight)
    fit_core = fitted.core_sample_indices_
    fit_labels = fitted.labels_
    assert_array_equal(weighted_core, fit_core)
    assert_array_equal(weighted_labels, fit_labels)

    fresh = DBSCAN()
    predicted_labels = fresh.fit_predict(X, sample_weight=sample_weight)
    predicted_core = fresh.core_sample_indices_
    assert_array_equal(weighted_core, predicted_core)
    assert_array_equal(weighted_labels, predicted_labels)
    assert_array_equal(weighted_labels, fresh.labels_)
class LexRank(object):
    """Cluster-aware LexRank summarizer.

    ``summarize(text)`` splits the text into sentences, builds a pairwise
    similarity matrix, clusters the sentences and ranks every cluster with
    PageRank.  ``probe(k)`` then picks sentences from the ranked clusters.
    """

    # Defaults for the list-valued constructor arguments.  They are kept as
    # tuples and copied per instance so no list is shared between instances.
    _DEFAULT_USEFUL_TAGS = (
        'Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction',
        'Josa', 'PreEomi', 'Eomi', 'Suffix', 'Alpha', 'Number',
    )
    _DEFAULT_DELIMITERS = ('. ', '\n', '.\n')

    def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25,
                 clustering='dbscan', tagger='twitter', useful_tags=None,
                 delimiters=None, min_token_length=2, stopwords=stopwords_ko,
                 no_below_word_count=2, no_above_word_portion=0.85,
                 max_dictionary_size=None, min_cluster_size=2,
                 similarity_threshold=0.85, matrix_smoothing=False,
                 n_clusters=None, compactify=True, **kwargs):
        """Configure similarity, sentence splitting and clustering.

        Args:
            similarity: ``'cosine'``, ``'jaccard'`` or
                ``'normalized_cooccurrence'``.
            decay_window: index distance at which the positional decay
                reaches zero (see ``decay``).
            decay_alpha: exponent applied to the positional closeness.
            clustering: ``'birch'``, ``'dbscan'``, ``'affinity'`` or ``None``
                (every sentence in one cluster).
            tagger, useful_tags, delimiters, min_token_length, stopwords,
                **kwargs: forwarded to ``SentenceFactory``.  ``useful_tags``
                and ``delimiters`` default to the built-in lists when
                ``None``.
            no_below_word_count, no_above_word_portion, max_dictionary_size:
                forwarded to ``SentenceCorpus`` in ``summarize``.
            min_cluster_size: clusters smaller than this are dropped.
            similarity_threshold: sentences more similar than this to an
                earlier sentence of the same cluster are dropped when
                ``compactify`` is true.
            matrix_smoothing: replace each diagonal entry with the largest
                off-diagonal value of its row.
            n_clusters: passed to ``Birch`` only.
            compactify: remove near-duplicate sentences inside clusters.

        Raises:
            LexRankError: unknown ``similarity`` or ``clustering`` value.
        """
        if useful_tags is None:
            useful_tags = list(self._DEFAULT_USEFUL_TAGS)
        if delimiters is None:
            delimiters = list(self._DEFAULT_DELIMITERS)

        self.decay_window = decay_window
        self.decay_alpha = decay_alpha

        if similarity == 'cosine':  # very, very slow :(
            self.vectorizer = DictVectorizer()
            self.uniform_sim = self._sim_cosine
        elif similarity == 'jaccard':
            self.uniform_sim = self._sim_jaccard
        elif similarity == 'normalized_cooccurrence':
            self.uniform_sim = self._sim_normalized_cooccurrence
        else:
            raise LexRankError(
                "available similarity functions are: cosine, jaccard, normalized_cooccurrence"
            )

        # Final similarity = positional decay * content similarity.
        self.sim = lambda sentence1, sentence2: self.decay(
            sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)

        self.factory = SentenceFactory(tagger=tagger,
                                       useful_tags=useful_tags,
                                       delimiters=delimiters,
                                       min_token_length=min_token_length,
                                       stopwords=stopwords,
                                       **kwargs)

        # Every clusterer receives ``1 - similarity matrix`` as its input.
        if clustering == 'birch':
            self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
            self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
        elif clustering == 'dbscan':
            self._dbscan = DBSCAN()
            self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
        elif clustering == 'affinity':
            self._affinity = AffinityPropagation()
            self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
        elif clustering is None:
            self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
        else:
            raise LexRankError(
                "available clustering algorithms are: birch, dbscan, affinity, no-clustering(use `None`)"
            )

        self.no_below_word_count = no_below_word_count
        self.no_above_word_portion = no_above_word_portion
        self.max_dictionary_size = max_dictionary_size
        self.similarity_threshold = similarity_threshold
        self.min_cluster_size = min_cluster_size
        self.matrix_smoothing = matrix_smoothing
        self.compactify = compactify

    def summarize(self, text):
        """Split, vectorize, cluster and rank the sentences of ``text``.

        Fills ``self.sentences``, ``self.corpus``, ``self.model``,
        ``self.tfidfs``, ``self.matrix``, ``self.clusters`` (each cluster
        sorted by descending PageRank score) and ``self.graphs``.
        """
        self.sentences = self.factory.text2sentences(text)
        self.num_sentences = len(self.sentences)
        self.corpus = SentenceCorpus(self.sentences, self.no_below_word_count,
                                     self.no_above_word_portion,
                                     self.max_dictionary_size)
        self.model = TfidfModel(self.corpus.bows,
                                id2word=self.corpus.dictionary,
                                normalize=True)
        self.tfidfs = self.model[self.corpus.bows]
        self._inject_tfidfs()
        self._build_matrix()
        self._clustering()
        if self.compactify:
            self._compactify()
        self.graphs = []
        for i in range(self.num_clusters):
            graph = self.sentences2graph(self.clusters[i])
            pagerank = networkx.pagerank(graph, weight='weight')
            self.clusters[i] = sorted(pagerank, key=pagerank.get, reverse=True)
            self.graphs.append(graph)

    def _sim_jaccard(self, sentence1, sentence2):
        """Return multiset Jaccard similarity of the two sentences' counters."""
        if sentence1 == sentence2:
            return 1
        p = sum((sentence1.counter & sentence2.counter).values())
        q = sum((sentence1.counter | sentence2.counter).values())
        return p / q if q else 0

    def _sim_cosine(self, sentence1, sentence2):
        """Return the dot product of the two sentences' tf-idf vectors."""
        if sentence1 == sentence2:
            return 1
        sentence1_tfidf = {word_id: tfidf for word_id, tfidf in sentence1.tfidf}
        sentence2_tfidf = {word_id: tfidf for word_id, tfidf in sentence2.tfidf}
        vector1, vector2 = self.vectorizer.fit_transform(
            [sentence1_tfidf, sentence2_tfidf]).toarray()
        return vector1.dot(vector2)

    def _sim_normalized_cooccurrence(self, sentence1, sentence2):
        """Return shared-token count divided by the sum of log token counts.

        Returns 0 when the normalizer is undefined or zero (an empty token
        list, or two single-token sentences) instead of raising.
        """
        if sentence1 == sentence2:
            return 1
        tokens1, tokens2 = sentence1.tokens, sentence2.tokens
        if len(tokens1) == 0 or len(tokens2) == 0:
            # log(0) is undefined; sentences without tokens share nothing.
            return 0
        normalizer = math.log(len(tokens1)) + math.log(len(tokens2))
        if normalizer == 0:
            # Both sentences have exactly one token: log(1) + log(1) == 0.
            return 0
        return len(set(tokens1) & set(tokens2)) / normalizer

    def decay(self, sentence1, sentence2):
        """Return the positional weight for two sentences based on index distance."""
        distance = abs(sentence1.index - sentence2.index)
        closeness = max(self.decay_window - distance, 0) / self.decay_window
        return math.pow(closeness, self.decay_alpha)

    def _inject_tfidfs(self):
        """Attach the bag-of-words and tf-idf representation to each sentence."""
        for index in range(self.num_sentences):
            bow = self.corpus.bows[index]
            self.sentences[index].bow = bow
            self.sentences[index].tfidf = self.model[bow]

    def _build_matrix(self):
        """Fill ``self.matrix`` with ``self.sim`` for every ordered sentence pair."""
        self.matrix = np.zeros((self.num_sentences, self.num_sentences))
        for sentence1 in self.sentences:
            for sentence2 in self.sentences:
                self.matrix[sentence1.index, sentence2.index] = self.sim(
                    sentence1, sentence2)
        if self.matrix_smoothing:
            for index in range(self.num_sentences):
                # Zero the diagonal first so max() sees only the other entries.
                self.matrix[index, index] = 0
                self.matrix[index, index] = max(self.matrix[index])

    def sentences2graph(self, sentences):
        """Build a weighted graph over ``sentences`` from non-zero matrix entries."""
        graph = networkx.Graph()
        graph.add_nodes_from(sentences)
        for sentence1 in sentences:
            for sentence2 in sentences:
                weight = self.matrix[sentence1.index, sentence2.index]
                if weight:
                    graph.add_edge(sentence1, sentence2, weight=weight)
        return graph

    def _clustered(self):
        """Drop clusters below ``min_cluster_size`` and sort the rest by size."""
        self.clusters = [
            cluster for cluster in self.clusters
            if len(cluster) >= self.min_cluster_size
        ]
        self.num_clusters = len(self.clusters)
        self.clusters = sorted(self.clusters, key=len, reverse=True)

    def _clustering(self):
        """Group sentences by the label the configured clusterer assigns them."""
        labels = self._clusterer(self.matrix)
        bucket = {}
        for index in range(len(labels)):
            bucket.setdefault(str(labels[index]), []).append(self.sentences[index])
        self.clusters = list(bucket.values())
        self._clustered()

    def _compactify(self):
        """Remove sentences too similar to an earlier sentence of their cluster."""
        clusters = []
        for cluster in self.clusters:
            compact_cluster = []
            cluster_size = len(cluster)
            for sentence in cluster:
                sentence.duplicated = False
            for i in range(cluster_size):
                if cluster[i].duplicated:
                    continue
                for j in range(i + 1, cluster_size):
                    if cluster[j].duplicated:
                        continue
                    if self.uniform_sim(cluster[i], cluster[j]) > self.similarity_threshold:
                        cluster[j].duplicated = True
                compact_cluster.append(cluster[i])
            clusters.append(compact_cluster)
        self.clusters = clusters
        self._clustered()

    def _verbose(self):
        """Return the texts of the chosen summaries in original sentence order."""
        summaries = sorted(self.summaries, key=lambda sentence: sentence.index)
        return [sentence.text for sentence in summaries]

    def probe(self, k=None):
        """Pick ``k`` summary sentences from the ranked clusters.

        The first sentence of every cluster is taken first (largest cluster
        first); further sentences come from whichever cluster has the
        smallest consumed fraction.

        Args:
            k: number of sentences, or a float in (0, 1) interpreted as a
                fraction of all sentences.  Falsy values mean
                ``max(2, number of clusters)``.

        Returns:
            List of sentence texts in document order.  It holds fewer than
            ``k`` entries when the clusters contain fewer sentences than
            requested, and is empty when ``k`` rounds down to zero.

        Raises:
            LexRankError: ``summarize`` was not called, ``k`` is negative, or
                ``k`` exceeds the number of sentences.
        """
        if not hasattr(self, 'clusters'):
            raise LexRankError("summarize it first")
        if not k:
            k = max(2, self.num_clusters)
        if k < 0:
            raise LexRankError(
                "appropriate value for `k`: float(0 ~ 1) for compress rate, or natural number for exact number of sentences"
            )
        if k > self.num_sentences:
            raise LexRankError("this will not give a summarization")
        if k < 1:
            k = int(self.num_sentences * k)

        self.summaries = []
        ends = np.array([len(cluster) for cluster in self.clusters])
        # Filtering and compactification can leave fewer sentences in the
        # clusters than requested; never try to take more than exist.
        k = min(k, int(ends.sum()))
        if k <= 0:
            return self._verbose()

        drones = np.zeros(ends.shape)
        for i in range(self.num_clusters):
            self.summaries.append(self.clusters[i][0])
            drones[i] += 1
            if len(self.summaries) == k:
                return self._verbose()

        while True:
            # Fraction consumed after taking one more sentence; exhausted
            # clusters are masked with infinity so they can never be chosen.
            branch = np.where(drones < ends, (drones + 1) / ends, np.inf)
            leach = int(branch.argmin())
            drone = int(drones[leach])
            self.summaries.append(self.clusters[leach][drone])
            drones[leach] += 1
            if len(self.summaries) == k:
                return self._verbose()
_USAGE_LINE = 'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps]'
_HELP_HINT = 'Use -h for help.'
_TIME_FORMAT = "%Y-%m-%dT%H:%M"


def _exit_with_usage(*lines):
    """Print every line to stderr through ``eprint`` and exit with status 2."""
    for line in lines:
        eprint(line)
    sys.exit(2)


def _parse_timespan(arg):
    """Turn ``<number>[mhdw]`` into a ``timedelta``; return None if malformed."""
    match = re.match(r"(\d+)([mhdw])$", arg)
    if not match:
        return None
    digits, unit = match.groups()
    try:
        amount = int(digits)
    except ValueError:
        return None
    if unit == 'm':
        return datetime.timedelta(minutes=amount)
    if unit == 'h':
        return datetime.timedelta(hours=amount)
    if unit == 'd':
        return datetime.timedelta(days=amount)
    return datetime.timedelta(days=(7 * amount))


def _write_cluster_file(host, prediction, outputfile):
    """Group host IP addresses by predicted cluster and dump them as JSON.

    ``prediction`` is paired with ``host.values()`` in iteration order.
    """
    grouped = {}
    for entry, label in zip(host.values(), prediction):
        grouped.setdefault(label, list()).append(entry['ip_address'])
    with open(outputfile, 'w') as fp:
        json.dump({str(key): value for key, value in grouped.items()}, fp)


def main(argv):
    """Fetch traffic data from Elasticsearch, cluster hosts, write JSON files.

    Steps: parse the command line, query Elasticsearch in time windows
    (halving the window on connection timeouts), compute per-host sums and
    entropies, then run MeanShift, KMeans, AgglomerativeClustering and DBSCAN
    on the resulting sample matrix and write one output file per algorithm.

    Args:
        argv: command-line arguments without the program name.  Options:
            ``-h/--help``, ``-q/--quiet``, ``-s/--start <yyyy-mm-ddThh:mm>``,
            ``-d/--duration <number>[mhdw]``, ``-t/--steps <number>[mhdw]``.
            Start and duration must be given together; when both are
            omitted the last 24 hours are used.

    Exits (``SystemExit``):
        0 after printing help, 1 when no data was found, 2 on invalid
        options, 3 on repeated Elasticsearch timeouts or a zero bandwidth.
    """
    global chatty
    start = ""
    duration = ""
    start_dt = None
    delta = None
    step = None
    bisection = 0
    bisection_max = 10
    min_step = datetime.timedelta(minutes=5)

    try:
        opts, _args = getopt.getopt(
            argv, "hqs:d:t:",
            ["help", "quiet", "start=", "duration=", "steps="])
    except getopt.GetoptError:
        _exit_with_usage('Error: Unrecognized option!', _USAGE_LINE, _HELP_HINT)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print('categorization.py [-h] [-s <start> -d <duration>]')
            print(' -h: print this text')
            print(' -s <start>: start date and time in elasticsearch time format')
            print(' -d <duration>: duration format <number>[mhdw]')
            print(' -t <steps>: steps format <number>[mhdw]')
            print(' If Elasticsearch query isn\'t responed in time, step size will be automatically adjusted by bisection.')
            print(' -q: no output except errors')
            print(' --help: same as -h')
            print(' --start <start>: same as -s')
            print(' --duration <duration>: same as -d')
            print(' --steps <steps>: same as -t')
            print(' --quiet: same as -q')
            print('')
            print(' If start and duration are omitted, the last 24 hours will be used.')
            sys.exit(0)
        elif opt in ("-q", "--quiet"):
            chatty = False
        elif opt in ("-s", "--start"):
            try:
                start_dt = datetime.datetime.strptime(arg, _TIME_FORMAT)
            except ValueError:
                _exit_with_usage('Error: Invalid option start!', _USAGE_LINE,
                                 '<start> format: yyyy-mm-ddThh:mm', _HELP_HINT)
            start = arg
        elif opt in ("-d", "--duration"):
            delta = _parse_timespan(arg)
            if delta is None:
                _exit_with_usage('Error: Invalid option duration!', _USAGE_LINE,
                                 '<duration> format: <number>[mhdw]', _HELP_HINT)
            duration = arg
        elif opt in ("-t", "--steps"):
            step = _parse_timespan(arg)
            if step is None:
                _exit_with_usage('Error: Invalid option steps!', _USAGE_LINE,
                                 '<steps> format: <number>[mhdw]', _HELP_HINT)

    if start == "" and duration == "":
        # Default window: the 24 hours ending now.
        end_dt = datetime.datetime.now()
        delta = datetime.timedelta(days=1)
        start_dt = end_dt - delta
        start = start_dt.strftime(_TIME_FORMAT)
    elif start == "" or duration == "":
        _exit_with_usage('Error: Invalid option combination!',
                         'Start and duration must both be specified or omitted.',
                         _USAGE_LINE, _HELP_HINT)
    else:
        end_dt = start_dt + delta

    # Without -t the whole window is fetched in a single step; a step is
    # never allowed to be longer than the window itself.
    if step is None:
        step = delta
    if (end_dt - start_dt) < step:
        step = end_dt - start_dt

    es = elasticsearch.Elasticsearch(nodes)
    elasticsearch_version = get_elasticsearch_major_version(es)

    axes = [
        'entropy_peer_sessions', 'entropy_peer_packets', 'entropy_peer_bytes',
        'entropy_sport_sessions', 'entropy_sport_packets',
        'entropy_sport_bytes', 'entropy_dport_sessions',
        'entropy_dport_packets', 'entropy_dport_bytes'
    ]
    host = {}
    peer = {}
    sport = {}
    dport = {}
    # During data generation slight differences in timing may cause some hosts
    # not to be listed in all dictionaries. These hosts will be ignored.
    ignore_hosts = []

    # Each entry: display label, target dictionary, then the four field
    # arguments handed on to query_into_dictionary.
    sets = [
        ['peer (1)', peer, "src", "SourceAddress", "dst", "DestinationAddress"],
        ['peer (2)', peer, "dst", "DestinationAddress", "src", "SourceAddress"],
        ['sport (1)', sport, "src", "SourceAddress", "sport", "SourcePort"],
        ['sport (2)', sport, "dst", "DestinationAddress", "dport", "DestinationPort"],
        ['dport (1)', dport, "src", "SourceAddress", "dport", "DestinationPort"],
        ['dport (2)', dport, "dst", "DestinationAddress", "sport", "SourcePort"],
    ]

    for s in sets:
        moving_start_dt = start_dt
        moving_end_dt = min(start_dt + step, end_dt)
        while True:
            window_start = moving_start_dt.strftime("%Y-%m-%dT%H:%M:%S")
            window_end = moving_end_dt.strftime("%Y-%m-%dT%H:%M:%S")
            print_or_quiet('%s Fetching %s data ... %s - %s' %
                           (strftime('%H:%M:%S'), s[0], window_start, window_end))
            try:
                query_into_dictionary(es, elasticsearch_version, window_start,
                                      window_end, host, s[1], s[2], s[3], s[4],
                                      s[5])
            except elasticsearch.exceptions.ConnectionTimeout:
                # Retry the same window start with half the step size, up to
                # bisection_max times overall and never below min_step.
                bisection += 1
                if bisection > bisection_max:
                    eprint(
                        '%d. time Elasticsearch Connection Timeout. Exiting ...'
                        % bisection)
                    sys.exit(3)
                step = step // 2
                if step < min_step:
                    eprint(
                        'Elasticsearch Connection Timeout. Minimum timeframe reached. Exiting ...'
                    )
                    sys.exit(3)
                eprint('Elasticsearch Connection Timeout. Halving step size.')
                moving_end_dt = moving_start_dt + step
            else:
                # Window fetched: advance to the next one.
                moving_start_dt = moving_end_dt
                moving_end_dt += step
                if moving_start_dt >= end_dt:
                    break
                if moving_end_dt > end_dt:
                    moving_end_dt = end_dt

    print_or_quiet('%s Calculating sums ...' % (strftime('%H:%M:%S')))
    calculate_sums(host, 'peer', peer, ignore_hosts)
    calculate_sums(host, 'sport', sport, ignore_hosts)
    calculate_sums(host, 'dport', dport, ignore_hosts)

    print_or_quiet('%s Removing incomplete hosts ...' % (strftime('%H:%M:%S')))
    for h in ignore_hosts:
        # host may appear more than once
        host.pop(h, None)

    print_or_quiet('%s Calculating entropy ...' % (strftime('%H:%M:%S')))
    calculate_entropy(host,
                      [[peer, 'peer'], [dport, 'dport'], [sport, 'sport']])

    print_or_quiet('%s Removing dictionaries ...' % (strftime('%H:%M:%S')))
    peer.clear()
    sport.clear()
    dport.clear()

    if len(host) == 0:
        eprint("No data found. Exiting ...")
        sys.exit(1)

    print_or_quiet('%s Creating sample set ...' % (strftime('%H:%M:%S')))
    npa = create_np(host, axes)
    n_samples, n_features = npa.shape
    print_or_quiet(' samples: %d features:%d' % (n_samples, n_features))

    #########################
    # MeanShift
    #########################
    print_or_quiet('%s Calculating bandwidth ...' % (strftime('%H:%M:%S')))
    bandwidth = estimate_bandwidth(npa,
                                   quantile=bandwidth_quantile,
                                   n_samples=bandwidth_n_samples,
                                   random_state=bandwidth_random_state,
                                   n_jobs=bandwidth_n_jobs)
    if bandwidth == 0.00000000:
        eprint('Useless bandwith. Exiting ....')
        sys.exit(3)

    print_or_quiet('%s Calculating MeanShift ...' % (strftime('%H:%M:%S')))
    ms = MeanShift(bandwidth=bandwidth,
                   bin_seeding=meanshift_bin_seeding,
                   cluster_all=meanshift_cluster_all,
                   n_jobs=meanshift_n_jobs)
    ms.fit(npa, npa.shape)

    print_or_quiet('%s Getting predictions ...' % (strftime('%H:%M:%S')))
    prediction = ms.predict(npa)
    print_or_quiet('%s Writing MeanShift output file ...' %
                   (strftime('%H:%M:%S')))
    _write_cluster_file(host, prediction, meanshift_outputfile)

    #########################
    # KMeans
    #########################
    print_or_quiet('%s Calculating KMeans ...' % (strftime('%H:%M:%S')))
    km = KMeans(n_clusters=kmeans_n_clusters)
    km.fit(npa, npa.shape)

    print_or_quiet('%s Getting predictions ...' % (strftime('%H:%M:%S')))
    prediction = km.predict(npa)
    print_or_quiet('%s Writing KMeans output file ...' %
                   (strftime('%H:%M:%S')))
    _write_cluster_file(host, prediction, kmeans_outputfile)

    #########################
    # AgglomerativeClustering
    #########################
    print_or_quiet('%s Calculating Agglomerative Clustering ...' %
                   (strftime('%H:%M:%S')))
    ac = AgglomerativeClustering(n_clusters=agglomerative_n_clusters,
                                 affinity=agglomerative_affinity,
                                 linkage=agglomerative_linkage)
    prediction = ac.fit_predict(npa)
    print_or_quiet('%s Writing Agglomerative Clustering output file ...' %
                   (strftime('%H:%M:%S')))
    _write_cluster_file(host, prediction, agglomerative_outputfile)

    #########################
    # DBSCAN
    #########################
    print_or_quiet('%s Calculating DBSCAN ...' % (strftime('%H:%M:%S')))
    db = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
    prediction = db.fit_predict(npa)
    print_or_quiet('%s Writing DBSCAN output file ...' %
                   (strftime('%H:%M:%S')))
    _write_cluster_file(host, prediction, dbscan_outputfile)
df1 = pd.read_csv('./GpsData/CHILD_GPS(한수1).csv') df2 = pd.read_csv('./GpsData/CHILD_GPS(한수2).csv') df3 = pd.read_csv('./GpsData/CHILD_GPS(김규1).csv') df4 = pd.read_csv('./GpsData/CHILD_GPS(김규2).csv') ''' df = pd.read_csv('./GpsData/CHILD_GPS(한수1).csv') #Give Columns Name df.columns = ['ChildKey', 'Time', 'latitude', 'longitude'] X = df[['latitude', 'longitude']] distance_matrix = squareform(pdist(X, (lambda u, v: haversine(u, v)))) db = DBSCAN(eps=0.45, min_samples=3, metric='precomputed') y_db = db.fit_predict(distance_matrix) #Cluster's Info - '-1' is Noise Cluster in X['Cluster'] data X['cluster'] = y_db ''' X['color'] = np.where(X.cluster == -1, 'red', 'blue') X.plot(kind='scatter', x='longitude', y='latitude', s=20, c=X['color']) ''' ''' #Drop All of Noise Data for i, row in X.iterrows(): if row['cluster'] == -1:
def DBScanModel(self, X_train, X_test, y_train, y_test):
    """Cluster ``X_test`` with a default-configured DBSCAN.

    ``fit_predict`` fits the estimator on the data it is given, so a
    preceding fit on ``X_train`` would be discarded; that wasted fit is no
    longer performed.  ``X_train``, ``y_train`` and ``y_test`` are accepted
    only to keep the signature unchanged and are not used.

    Returns:
        Tuple ``(model, y_pred)``: the estimator fitted on ``X_test`` and the
        cluster label it assigned to each row of ``X_test``.
    """
    model = DBSCAN()
    y_pred = model.fit_predict(X_test)
    return model, y_pred
def dbscan(temp_list, hum_list, gas_list, label_list):
    """Flag anomalous temperature / humidity / gas readings with DBSCAN.

    Each sensor series is clustered on its own as a one-dimensional feature
    using ``DBSCAN(min_samples=10)``.  Samples that DBSCAN marks as noise
    (label ``-1``) get anomaly flag ``1``; every clustered sample gets ``0``.
    Per-sensor label counts and accuracy against the ground truth are printed.

    Args:
        temp_list: Temperature readings (anything ``np.array`` can cast to float).
        hum_list: Humidity readings.
        gas_list: Gas readings.
        label_list: Ground-truth labels; each is decoded with ``get_label``
            whose result is indexed as ``(temp, hum, gas)``.

    Returns:
        List of combined labels ``4 * temp + 2 * hum + 1 * gas`` for the
        samples from index 40 onwards.
    """
    label = []

    # Decode the combined ground-truth label into one flag per sensor.
    data_temp_y = []
    data_hum_y = []
    data_gas_y = []
    for i in label_list:
        data_label = get_label(i)
        data_temp_y.append(data_label[0])
        data_hum_y.append(data_label[1])
        data_gas_y.append(data_label[2])

    # DBSCAN expects a 2-D (n_samples, n_features) array.
    data_temp_x = np.array(temp_list).astype(np.float64).reshape(-1, 1)
    data_hum_x = np.array(hum_list).astype(np.float64).reshape(-1, 1)
    data_gas_x = np.array(gas_list).astype(np.float64).reshape(-1, 1)

    # One estimator per sensor.  Previously model_hum / model_gas were created
    # but every series was fitted through model_temp.
    model_temp = DBSCAN(min_samples=10)
    model_hum = DBSCAN(min_samples=10)
    model_gas = DBSCAN(min_samples=10)

    pred_temp_y = model_temp.fit_predict(data_temp_x)
    pred_hum_y = model_hum.fit_predict(data_hum_x)
    pred_gas_y = model_gas.fit_predict(data_gas_x)

    # Noise (-1) -> 1 (anomaly); any cluster id -> 0 (normal).
    pred_temp_y = (pred_temp_y == -1).astype(pred_temp_y.dtype)
    pred_hum_y = (pred_hum_y == -1).astype(pred_hum_y.dtype)
    pred_gas_y = (pred_gas_y == -1).astype(pred_gas_y.dtype)

    unique_temp, counts_temp = np.unique(data_temp_y, return_counts=True)
    unique_hum, counts_hum = np.unique(data_hum_y, return_counts=True)
    unique_gas, counts_gas = np.unique(data_gas_y, return_counts=True)

    unique_temp_pred, counts_temp_pred = np.unique(pred_temp_y, return_counts=True)
    unique_hum_pred, counts_hum_pred = np.unique(pred_hum_y, return_counts=True)
    unique_gas_pred, counts_gas_pred = np.unique(pred_gas_y, return_counts=True)

    print("temp:", dict(zip(unique_temp, counts_temp)),
          dict(zip(unique_temp_pred, counts_temp_pred)))
    print("hum:", dict(zip(unique_hum, counts_hum)),
          dict(zip(unique_hum_pred, counts_hum_pred)))
    print("gas:", dict(zip(unique_gas, counts_gas)),
          dict(zip(unique_gas_pred, counts_gas_pred)))

    print('temparature\'s accuracy is:', accuracy_score(data_temp_y, pred_temp_y))
    print('humidity\'s accuracy is:', accuracy_score(data_hum_y, pred_hum_y))
    print('gas\'s accuracy is:', accuracy_score(data_gas_y, pred_gas_y))

    # NOTE(review): the first 40 samples are skipped on purpose by the
    # original code; the reason is not visible here -- confirm with callers.
    for i in range(40, len(pred_temp_y)):
        temp_label = 0
        if pred_temp_y[i] == 1:
            temp_label += 4 * pred_temp_y[i]
        if pred_hum_y[i] == 1:
            temp_label += 2 * pred_hum_y[i]
        if pred_gas_y[i] == 1:
            temp_label += 1 * pred_gas_y[i]
        label.append(temp_label)

    return label
# Feature matrix: every column of df1 except the first one.
res_1 = df1.iloc[:, 1:]
# PCA reduction is currently disabled; the raw features are clustered as-is.
# pca = PCA(n_components=5)
# res = pca.fit_transform(res_1)
res = res_1

# Cluster counts for the two algorithms that need one up front.
kmean_ncluster = 8
spectral_ncluster = 8

# K-Means
kmeans = KMeans(
    n_clusters=kmean_ncluster,
    init='k-means++',
    max_iter=400,
    n_init=10,
    random_state=0,
)
pred_y = kmeans.fit_predict(res)

# DBSCAN
clustering = DBSCAN(eps=10, min_samples=2)
pred_y2 = clustering.fit_predict(res)

# Spectral clustering
spectral = SpectralClustering(
    spectral_ncluster,
    eigen_solver='arpack',
    affinity="nearest_neighbors",
)
pred_y3 = spectral.fit_predict(res)

# Report on each labelling, in the same order they were produced.
prep_data.check_result(res, pred_y)
prep_data.check_result(res, pred_y2)
prep_data.check_result(res, pred_y3)

# Side-by-side table: yara value, one column per algorithm, reference tag.
view = pd.DataFrame(df_yara.loc[:, 'yara'].copy())
view['kmeans'] = pd.DataFrame(pred_y)
view['dbscan'] = pd.DataFrame(pred_y2)
view['spectral_clustering'] = pd.DataFrame(pred_y3)
view['validation_groups'] = tmp['tag']
# Load the dataset and keep columns 3 and 4 as the two features.
dataset = pd.read_csv('Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values
X = StandardScaler().fit_transform(X)

# Raw (standardised) data: x/y positions of every customer.
plt.scatter(X[:, 0], X[:, 1], s=100, color='blue')
plt.grid()  # draw the grid on the plot
plt.show()

# eps
dbscan = DBSCAN(eps=0.4, min_samples=2, algorithm='kd_tree')
pred_y = dbscan.fit_predict(X)
max_label = np.max(pred_y)

# Noise points (label -1) are drawn in gray.
is_noise = pred_y == -1
plt.scatter(X[is_noise, 0], X[is_noise, 1], s=100, c='gray', label='Anomalies')

# One random colour per cluster id 0..max_label.
for i in range(max_label + 1):
    color = np.random.rand(3, )
    in_cluster = pred_y == i
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1],
                s=100, c=color, label='Cluster' + str(i))

plt.title('Clusters of customers')
from sklearn.cluster import DBSCAN X, v = make_blobs(n_samples=60, centers=4, cluster_std=.60, random_state=0) plt.figure(figsize=(15,10)) plt.scatter(X[:,0], X[:,1]) N = len(X) ind = np.arange(N) print(ind) for label, x, y in zip(ind,X[:,0], X[:,1]): plt.annotate(label,xy=(x,y)) clustering = DBSCAN(eps=.8, min_samples=5) # cambiar para que se vea diferente. etiqueta = clustering.fit_predict(X) plt.figure(figsize=(15,10)) plt.xlim(-5,5) plt.ylim(0,10) plt.scatter(X[:,0], X[:,1], c=etiqueta) N = len(X) ind = np.arange(N) for label, x, y in zip(ind, X[:,0], X[:,1]): plt.annotate(label,xy=(x,y)) def buscarVecinos(P,X,epsilon): N = len(X) vecinos = []
# A data point is a core sample when at least min_samples points lie within
# distance eps of it.
import matplotlib.pyplot as plt
import mglearn
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs, make_moons
from sklearn.preprocessing import StandardScaler

X, y = make_blobs(n_samples=20, random_state=0)

dbscan = DBSCAN()
clusters = dbscan.fit_predict(X)
print("Cluster membershipd:\n{0}".format(clusters))
# Output: [-1, -1, ..., -1, -1] -- every point was assigned to noise.

# Raising min_samples (top to bottom in the figure) leaves fewer core points
# and labels more points as noise.
# eps matters somewhat more because it defines what "close" means: a very
# small eps means no point is a core sample and everything may end up as
# noise, while a very large eps merges all points into a single cluster.
# DBSCAN does not take the number of clusters as a parameter, but the choice
# of eps implicitly controls how many clusters are found.
mglearn.plots.plot_dbscan()
# -*- coding: utf-8 -*-
"""
Created on Tue May 12 21:40:26 2020

@author: Dr. Taimoor
"""

import pandas as pd
from sklearn.cluster import DBSCAN

# Load the dataset and show it.
corpus = pd.read_csv('D:\\Dataset.csv')
print(corpus)

# Columns 2, 3 and 5 are the features used for outlier detection.
data = corpus.iloc[:, [2, 3, 5]].values
print('\n Data', data)

# DBSCAN has no "number of clusters" argument: decide how many clusters you
# expect beforehand, then tune eps and min_samples until you get them.
dbscan = DBSCAN(eps=500, min_samples=2)
print(dbscan)

# Cluster the data; samples labelled -1 are the multivariate outliers.
result = dbscan.fit_predict(data)
print('\n Multivariate Outliers labeled as -1', result)
def plot_results(results, samples, phenotypes, labels, outdir,
                 filter_diff_thres=.2, filter_response_thres=0, response_grad_cutoff=None,
                 stat_test=None, positive_filters_only=False, log_yscale=False,
                 group_a='group A', group_b='group B', group_names=None, tsne_ncell=10000,
                 regression=False, clustering=None, add_filter_response=False,
                 percentage_drop_cluster=.1, min_cluster_freq=0.2, show_filters=True):
    """ Plots the results of a CellCnn analysis.

    Args:
        - results :
            Dictionary containing the results of a CellCnn analysis.
        - samples :
            Samples from which to visualize the selected cell populations.
        - phenotypes :
            List of phenotypes corresponding to the provided `samples`.
        - labels :
            Names of measured markers.
        - outdir :
            Output directory where the generated plots will be stored.
        - filter_diff_thres :
            Threshold that defines which filters are most discriminative. Given an array
            ``filter_diff`` of average cell filter response differences between classes,
            sorted in decreasing order, keep a filter ``i, i > 0`` if it holds that
            ``filter_diff[i-1] - filter_diff[i] < filter_diff_thres * filter_diff[i-1]``.
            For regression problems, the array ``filter_diff`` contains Kendall's tau
            values for each filter.
        - filter_response_thres :
            Threshold for choosing a responding cell population. Default is 0.
        - response_grad_cutoff :
            Threshold on the gradient of the cell filter response CDF, might be useful for
            defining the selected cell population.
        - stat_test: None | 'ttest' | 'mannwhitneyu'
            Optionally, perform a statistical test on selected cell population frequencies
            between two groups and report the corresponding p-value on the boxplot figure
            (see plots description below). Default is None. Currently only used for binary
            classification problems.
        - group_a :
            Name of the first class.
        - group_b :
            Name of the second class.
        - group_names :
            List of names for the different phenotype classes.
        - positive_filters_only :
            If True, only consider filters associated with higher cell population frequency
            in the positive class.
        - log_yscale :
            If True, display the y-axis of the boxplot figure (see plots description below)
            in logarithmic scale.
        - clustering: None | 'dbscan' | 'louvain'
            Post-processing option for selected cell populations. Default is None.
            Any value other than None or 'louvain' selects DBSCAN.
        - tsne_ncell :
            Number of cells to include in t-SNE calculations and plots.
        - regression :
            Whether it is a regression problem.
        - add_filter_response :
            Forwarded unchanged to ``create_graph`` when ``clustering == 'louvain'``.
        - percentage_drop_cluster :
            Clusters are ranked by mean cell filter response; the next cluster is kept
            only while ``score[i-1] - score[i] < percentage_drop_cluster * score[i-1]``.
        - min_cluster_freq :
            A cluster is kept only if it contains more than
            ``int(min_cluster_freq * n_selected_cells)`` cells.
        - show_filters :
            Whether to plot learned filter weights.

    Returns:
        A list with the indices and corresponding cell filter response thresholds of
        selected discriminative filters.
        This function also produces a collection of plots for model interpretation.
        These plots are stored in `outdir`. They comprise the following:

        - clustered_filter_weights.pdf :
            Filter weight vectors from all trained networks that pass a validation accuracy
            threshold, grouped in clusters via hierarchical clustering. Each row corresponds
            to a filter. The last column(s) indicate the weight(s) connecting each filter
            to the output class(es). Indices on the y-axis indicate the filter cluster
            memberships, as a result of the hierarchical clustering procedure.
        - consensus_filter_weights.pdf :
            One representative filter per cluster is chosen (the filter with minimum distance
            to all other memebers of the cluster). We call these selected filters
            "consensus filters".
        - best_net_weights.pdf :
            Filter weight vectors of the network that achieved the highest validation accuracy.
        - filter_response_differences.pdf :
            Difference in cell filter response between classes for each consensus filter.
            To compute this difference for a filter, we first choose a filter-specific class,
            that's the class with highest output weight connection to the filter. Then we
            compute the average cell filter response (value after the pooling layer) for
            validation samples belonging to the filter-specific class (``v1``) and the average
            cell filter response for validation samples not belonging to the filter-specific
            class (``v0``). The difference is computed as ``v1 - v0``. For regression problems,
            we cannot compute a difference between classes. Instead we compute Kendall's rank
            correlation coefficient between the predictions of each individual filter (value
            after the pooling layer) and the true response values.
            This plot helps decide on a cutoff (``filter_diff_thres`` parameter) for selecting
            discriminative filters.
        - tsne_all_cells.png :
            Marker distribution overlaid on t-SNE map.

        In addition, the following plots are produced for each selected filter (e.g. filter ``i``):

        - cdf_filter_i.pdf :
            Cumulative distribution function of cell filter response for filter ``i``. This plot
            helps decide on a cutoff (``filter_response_thres`` parameter) for selecting the
            responding cell population.
        - selected_population_distribution_filter_i.pdf :
            Histograms of univariate marker expression profiles for the cell population
            selected by filter ``i`` vs all cells.
        - selected_population_frequencies_filter_i.pdf :
            Boxplot of selected cell population frequencies in samples of the different
            classes, if running a classification problem. For regression settings, a scatter
            plot of selected cell population frequencies vs response variable is generated.
        - tsne_cell_response_filter_i.png :
            Cell filter response overlaid on t-SNE map.
        - tsne_selected_cells_filter_i.png :
            Marker distribution of selected cell population overlaid on t-SNE map.
    """
    # NOTE: single-argument print(...) is used throughout so the function
    # behaves identically under the Python 2 print statement and Python 3.

    # create the output directory
    mkdir_p(outdir)

    # number of measured markers
    nmark = samples[0].shape[1]

    if results['selected_filters'] is not None:
        print('Loading the weights of consensus filters.')
        filters = results['selected_filters']
    else:
        sys.exit('Consensus filters were not found.')

    if show_filters:
        plot_filters(results, labels, outdir)
    # get discriminative filter indices in consensus matrix
    keep_idx = discriminative_filters(results, outdir, filter_diff_thres,
                                      positive_filters_only=positive_filters_only,
                                      show_filters=show_filters)

    # encode the sample and sample-phenotype for each cell
    sample_sizes = []
    per_cell_ids = []
    for i, x in enumerate(samples):
        sample_sizes.append(x.shape[0])
        per_cell_ids.append(i * np.ones(x.shape[0]))
    # for each selected filter, plot the selected cell population
    x = np.vstack(samples)
    z = np.hstack(per_cell_ids)

    if results['scaler'] is not None:
        x = results['scaler'].transform(x)

    print('Computing t-SNE projection...')
    tsne_idx = np.random.choice(x.shape[0], tsne_ncell)
    x_for_tsne = x[tsne_idx].copy()
    x_tsne = TSNE(n_components=2).fit_transform(x_for_tsne)
    # per-marker colour range: 1st to 99th percentile over all cells
    vmin, vmax = np.zeros(x.shape[1]), np.zeros(x.shape[1])
    for seq_index in range(x.shape[1]):
        vmin[seq_index] = np.percentile(x[:, seq_index], 1)
        vmax[seq_index] = np.percentile(x[:, seq_index], 99)
    fig_path = os.path.join(outdir, 'tsne_all_cells')
    plot_tsne_grid(x_tsne, x_for_tsne, fig_path, labels=labels, fig_size=(20, 20),
                   point_size=5)

    return_filters = []
    for i_filter in keep_idx:
        # filter row layout: nmark marker weights followed by the bias
        w = filters[i_filter, :nmark]
        b = filters[i_filter, nmark]
        # rectified linear cell filter response
        g = np.sum(w.reshape(1, -1) * x, axis=1) + b
        g = g * (g > 0)

        # skip a filter if it does not select any cell
        if np.max(g) <= 0:
            continue

        ecdf = sm.distributions.ECDF(g)
        gx = np.linspace(np.min(g), np.max(g))
        gy = ecdf(gx)
        plt.figure()
        sns.set_style('whitegrid')
        a = plt.step(gx, gy)
        t = filter_response_thres
        # set a threshold to the CDF gradient?
        if response_grad_cutoff is not None:
            # walk the CDF from the high-response end and stop at the first
            # step whose drop reaches the cutoff
            by = np.array(a[0].get_ydata())[::-1]
            bx = np.array(a[0].get_xdata())[::-1]
            b_diff_idx = np.where(by[:-1] - by[1:] >= response_grad_cutoff)[0]
            if len(b_diff_idx) > 0:
                t = bx[b_diff_idx[0] + 1]
        plt.plot((t, t), (np.min(gy), 1.), 'r--')
        plt.xlabel('Cell filter response')
        plt.ylabel('Cumulative distribution function (CDF)')
        sns.despine()
        plt.savefig(os.path.join(outdir, 'cdf_filter_%d.pdf' % i_filter), format='pdf')
        plt.clf()
        plt.close()

        condition = g > t
        x1 = x[condition]
        z1 = z[condition]
        g1 = g[condition]

        # skip a filter if it does not select any cell with the new cutoff threshold
        if x1.shape[0] == 0:
            continue

        # else add the filters to selected filters
        return_filters.append((i_filter, t))
        # t-SNE plots for characterizing the selected cell population
        fig_path = os.path.join(outdir, 'tsne_cell_response_filter_%d.png' % i_filter)
        plot_2D_map(x_tsne, g[tsne_idx], fig_path, s=5)
        # overlay marker values on TSNE map for selected cells
        fig_path = os.path.join(outdir, 'tsne_selected_cells_filter_%d' % i_filter)
        g_tsne = g[tsne_idx]
        x_pos = x_for_tsne[g_tsne > t]
        x_tsne_pos = x_tsne[g_tsne > t]
        plot_tsne_selection_grid(x_tsne_pos, x_pos, x_tsne, vmin, vmax,
                                 fig_path=fig_path, labels=labels, fig_size=(20, 20), s=5,
                                 suffix='png')

        if clustering is None:
            suffix = 'filter_%d' % i_filter
            plot_selected_subset(x1, z1, x, labels, sample_sizes, phenotypes,
                                 outdir, suffix, stat_test, log_yscale,
                                 group_a, group_b, group_names, regression)
        else:
            if clustering == 'louvain':
                print('Creating a k-NN graph with %d/%d cells...' % (x1.shape[0], x.shape[0]))
                k = 10
                G = create_graph(x1, k, g1, add_filter_response)
                print('Identifying cell communities...')
                cl = G.community_fastgreedy()
                clusters = np.array(cl.as_clustering().membership)
            else:
                print('Clustering using the dbscan algorithm...')
                eps = set_dbscan_eps(x1, os.path.join(outdir, 'kNN_distances.png'))
                cl = DBSCAN(eps=eps, min_samples=5, metric='l1')
                clusters = cl.fit_predict(x1)

            # discard outliers, i.e. clusters with very few cells
            c = Counter(clusters)
            cluster_ids = []
            min_cells = int(min_cluster_freq * x1.shape[0])
            for key, val in c.items():
                if (key != -1) and (val > min_cells):
                    cluster_ids.append(key)

            num_clusters = len(cluster_ids)
            if num_clusters == 0:
                # Every cluster was noise or below min_cluster_freq.  Indexing
                # sorted_idx[0] below would raise IndexError, so there is
                # simply nothing to plot for this filter.
                print('No cluster passed the size filter for filter %d.' % i_filter)
                continue

            scores = np.zeros(num_clusters)
            for j in range(num_clusters):
                cl_id = cluster_ids[j]
                scores[j] = np.mean(g1[clusters == cl_id])

            # keep the communities with high cell filter response
            sorted_idx = np.argsort(scores)[::-1]
            scores = scores[sorted_idx]
            keep_idx_comm = [sorted_idx[0]]
            for i in range(1, num_clusters):
                if (scores[i - 1] - scores[i]) < percentage_drop_cluster * scores[i - 1]:
                    keep_idx_comm.append(sorted_idx[i])
                else:
                    break

            for j in keep_idx_comm:
                cl_id = cluster_ids[j]
                xc = x1[clusters == cl_id]
                zc = z1[clusters == cl_id]
                suffix = 'filter_%d_cluster_%d' % (i_filter, cl_id)
                plot_selected_subset(xc, zc, x, labels, sample_sizes, phenotypes,
                                     outdir, suffix, stat_test, log_yscale,
                                     group_a, group_b, group_names, regression)
    print('Done.\n')
    return return_filters
# Optional extra cut, currently disabled:
# data = data[data[:,2]<1]

# Keep only rows whose columns 3 and 4 both lie strictly inside (-15, 15).
col3 = data[:, 3]
col4 = data[:, 4]
data = data[(col3 < 15) & (col3 > -15) & (col4 < 15) & (col4 > -15)]

# Standardise the first five columns; these are the clustering features.
X = np.copy(data[:, 0:5])
X = StandardScaler().fit_transform(X)
# Alternative scaling, currently disabled:
# X = MinMaxScaler().fit_transform(X)
data_zs = np.copy(X)

clt = DBSCAN(eps=0.26, min_samples=13)
datalables = clt.fit_predict(data_zs)

# Number of samples per cluster label.
r1 = pd.Series(datalables).value_counts()
print(r1)

# Append the labels as an extra column and split on it: cluster 0 versus
# noise (-1).  Column index 8 is where the label column lands.
datapro = np.column_stack((data, datalables))
label_column = datapro[:, 8]
highdata = datapro[label_column == 0]
lowdata = datapro[label_column == -1]
np.savetxt('highdata.txt', highdata)
np.savetxt('lowdata.txt', lowdata)

arcmin = 15
temp = [0] * (arcmin * 2)
class Detector(object):
    """Clusters occupied grid cells with DBSCAN and tracks them as objects.

    Each call to :meth:`update` clusters the incoming points, derives
    per-cluster properties, matches clusters against the objects already
    tracked in ``self._object_dict`` and adjusts every object's confidence.
    """

    def __init__(self, config=detector_config):
        """Read the configuration and build the clustering estimator.

        Args:
            config: Mapping that provides at least ``grid_size``,
                ``min_cluster_occupied_grids``, ``map_size``,
                ``neighborhood_size`` and ``neighborhood_min_samples``; other
                keys are read lazily by the methods below.
        """
        self.config = config
        self.grid_size = self.config["grid_size"]
        self.min_cluster_occupied_grids = self.config[
            "min_cluster_occupied_grids"]
        self.map_size = self.config["map_size"]
        # int(...) keeps the grid count usable as an array shape even where
        # round() returns a float.
        self.num_grid = int(round(self.map_size / self.grid_size))
        self.shape = (self.num_grid, self.num_grid)

        self.location_to_index = build_location_to_index(self.config)
        self.index_to_location = build_index_to_location(self.config)

        # The np.int alias was removed from NumPy; the builtin int is the
        # documented replacement and yields the same dtype.
        self._confidence_map = np.zeros(self.shape, dtype=int)
        self._object_dict = {}
        # Labels of removed objects, reused (FIFO) before new ones are minted.
        self._object_label_waiting_list = []
        self._detected_object_number = 0

        # self.runtime = 0
        # self.num_pub = 0

        self.clustering_algo = DBSCAN(
            eps=self.config["neighborhood_size"],
            min_samples=self.config["neighborhood_min_samples"]
        )  # min sample=6 should be the FAST MODE
        logging.info('Global Map Created.')

    def _compute_object_overlap(self, obj1, obj2):
        """Return the intersection-over-union of two objects' bounding boxes.

        Both arguments must support ``obj["bounding_box"]`` yielding
        ``(x_min, y_min, x_max, y_max)``.  Returns 0 when the intersection
        area is below 1e-6.
        """
        x_min1, y_min1, x_max1, y_max1 = obj1["bounding_box"]
        x_min2, y_min2, x_max2, y_max2 = obj2["bounding_box"]
        intersection = max(0, (min(x_max1, x_max2) - max(x_min1, x_min2))) * \
            max(0, (min(y_max1, y_max2) - max(y_min1, y_min2)))
        if intersection < 1e-6:
            return 0
        union = (y_max2 - y_min2) * (x_max2 - x_min2) + (y_max1 - y_min1) * (
            x_max1 - x_min1) - intersection
        return intersection / union

    def _compute_overlap(self, obj, cluster):
        """Return the weighted share of ``cluster`` that lies inside ``obj``.

        The object's ``search_range`` is used when present, otherwise its
        ``bounding_box``.
        """
        # Intersection-over-union is not appropriate here: the target is a
        # bounding box, so the IoU can be very small even when the cluster
        # lies entirely inside the box.  The "intersection over the cluster"
        # is used instead, which is still never greater than 1.
        current_points = cluster["occupied_grids"]
        current_weight = cluster["occupied_weight"]
        if "search_range" in obj:
            x_min, y_min, x_max, y_max = obj["search_range"]
        else:
            x_min, y_min, x_max, y_max = obj["bounding_box"]
        intersection_mask = np_logical_and_list(x_min <= current_points[:, 0],
                                                current_points[:, 0] <= x_max,
                                                y_min <= current_points[:, 1],
                                                current_points[:, 1] <= y_max)
        intersection = current_weight[intersection_mask].sum()
        union = current_weight.sum()
        return intersection / union

    def _remove_object(self, obj_label):
        """Drop a tracked object and queue its label for reuse (no-op if absent)."""
        if obj_label not in self._object_dict:
            return
        self._object_label_waiting_list.append(obj_label)
        self._object_dict.pop(obj_label)

    def _fit(self, points, weight):
        """Run DBSCAN on ``points`` with per-sample ``weight``.

        Returns the label array, or ``None`` when ``points`` is empty.
        """
        if len(points) == 0:
            return None
        labels = self.clustering_algo.fit_predict(points, sample_weight=weight)
        return labels

    def _find_cluster(self, points, weight, high, low, occupied_grids):
        """Cluster the points and group every input array by cluster label.

        Returns:
            ``{label: {"points", "weight", "high", "low", "mask",
            "occupied_grids"}}`` for labels ``0..max``; an empty dict when
            there are no points or every point is noise.
        """
        if len(points) < 1:
            logging.info("No point found in receptive field.")
            return {}

        labels = self._fit(points, weight)
        cluster_num = labels.max()

        # labels.max() < 0 means DBSCAN marked everything as noise (-1).
        if cluster_num < 0:
            logging.info("No cluster found in receptive field.")
            return {}

        raw_cluster_dict = {
            i: {  # cluster info
                "points": points[labels == i],
                "weight": weight[labels == i],
                "high": high[labels == i],
                "low": low[labels == i],
                "mask": labels == i,
                "occupied_grids": occupied_grids[labels == i]
            }
            for i in range(0, cluster_num + 1)
        }
        return raw_cluster_dict

    def _process_cluster(self, raw_cluster_dict):
        """Derive summary properties for each raw cluster.

        Returns:
            ``{label: properties}`` with area, length (twice the largest
            distance from the weighted centroid), centroid, density (weight
            sum), occupied count, max ``high``, min ``low`` and the occupied
            grids with their weights.
        """
        cluster_properties = {}
        for label, cluster_info in raw_cluster_dict.items():
            # calculate basic properties of each cluster
            points = cluster_info["points"]
            weight = cluster_info["weight"]
            centroid = np.dot(weight, points) / weight.sum()
            length = sqrt(np.sum(np.square(points - centroid),
                                 axis=1).max()) * 2
            num_occupied_grids = len(points)
            cluster_properties[label] = {
                "area": num_occupied_grids * self.grid_size * self.grid_size,
                "length": length,
                "centroid": centroid,
                "density": weight.sum(),
                "occupied": cluster_info["occupied_grids"].shape[0],
                "high": cluster_info["high"].max(),
                "low": cluster_info["low"].min(),
                # "label": label,
                "occupied_grids": cluster_info["occupied_grids"],
                "occupied_weight": weight,
            }
        return cluster_properties

    def _get_new_label(self):
        """Return a recycled label if one is waiting, else mint ``"Object N"``."""
        if len(self._object_label_waiting_list) == 0:
            new_label = "Object {}".format(self._detected_object_number)
            self._detected_object_number += 1
        else:
            new_label = self._object_label_waiting_list.pop(0)
        return new_label

    def _create_object(self, cluster):
        """Register a new ``DetectedObject`` for ``cluster`` and return its label."""
        label = self._get_new_label()
        self._object_dict[label] = DetectedObject(label, cluster, self.config)
        return label

    def _register_cluster(self, cluster_properties):
        """Match clusters to tracked objects, then prune the object table.

        Steps:
            1. Every object whose status is ``LOST`` takes the cluster it
               overlaps most (above ``overlap_threshold``).
            2. Every cluster not consumed in step 1 updates the not yet
               modified object it overlaps most, or creates a new object.
            3. Objects that were not modified are told they are lost and
               removed if they ask for it.
            4. Among overlapping pairs of objects the less confident one is
               removed.

        Returns:
            The internal ``{label: object}`` dictionary.
        """
        modified_objects = set()
        checked_cluster = set()

        for label, obj in self._object_dict.items():  # for each object
            possible_clusters = {}
            if obj["status"] != LOST:
                continue
            for cluster_label, cluster in cluster_properties.items():
                overlap = self._compute_overlap(obj, cluster)
                if overlap > self.config["overlap_threshold"]:
                    possible_clusters[cluster_label] = overlap
            if possible_clusters:
                # Key with the largest overlap.  A bare max(dict) would return
                # the largest *key*, not the key of the largest value.
                target_cluster_label = max(possible_clusters,
                                           key=possible_clusters.get)
                obj.update_property(cluster_properties[target_cluster_label])
                modified_objects.add(label)
                checked_cluster.add(target_cluster_label)

        for cluster_label, cluster in cluster_properties.items():  # for each cluster
            if cluster_label in checked_cluster:
                continue
            possible_objects = {}
            for label, obj in self._object_dict.items():  # for each object
                if label in modified_objects:
                    continue
                overlap = self._compute_overlap(obj, cluster)
                if overlap > self.config["overlap_threshold"]:
                    possible_objects[label] = overlap
            if not possible_objects:  # No match existing object
                label = self._create_object(cluster)
            else:
                # Key with the largest overlap (see note above).
                label = max(possible_objects, key=possible_objects.get)
                self._object_dict[label].update_property(
                    cluster)  # _update_object(label, cluster)
            modified_objects.add(label)

        all_keys = set(self._object_dict.keys())
        not_modified_objects = all_keys.difference(modified_objects)
        for obj_label in not_modified_objects:
            should_remove = self._object_dict[obj_label].lost()
            if should_remove:
                self._remove_object(obj_label)

        # Remove the weaker object of every overlapping pair.  Each unordered
        # pair is visited exactly once; pairing all_objects[:-1] with
        # all_objects[1:] also compared an object with itself (IoU == 1) and
        # could therefore delete it.
        all_objects = list(self._object_dict.items())
        for idx, (obj_label1, obj1) in enumerate(all_objects):
            if obj_label1 not in self._object_dict:
                continue
            for obj_label2, obj2 in all_objects[idx + 1:]:
                if obj_label2 not in self._object_dict:
                    continue
                overlap = self._compute_object_overlap(obj1, obj2)
                if overlap > self.config["overlap_threshold"]:
                    if obj1["confidence"] < obj2["confidence"]:
                        self._remove_object(obj_label1)
                        # obj1 is gone; it must not suppress anything else.
                        break
                    self._remove_object(obj_label2)

        return self._object_dict

    def _should_decrease_confidence(self, obj_cluster):
        """Return True if the object looks implausible or is currently lost.

        Triggers: too few occupied grids, length above ``max_object_length``,
        ``high`` above ``max_object_bottom_height``, or status ``LOST``.
        """
        should = False
        if len(obj_cluster["occupied_grids"]
               ) < self.min_cluster_occupied_grids:
            should = True
        if obj_cluster["length"] > self.config["max_object_length"]:
            should = True
        if obj_cluster["high"] > self.config["max_object_bottom_height"]:
            should = True
        if obj_cluster["status"] == LOST:
            should = True
        return should

    def _verify_objects(self):
        """Raise or lower every tracked object's confidence."""
        for obj_label, obj in self._object_dict.items():
            if self._should_decrease_confidence(obj):
                obj.decrease_confidence()
            else:
                obj.increase_confidence()

    @property
    def object_dict(self):
        """Tracked objects whose confidence exceeds ``min_detected_confidence``."""
        return {
            obj_label: obj_info
            for obj_label, obj_info in self._object_dict.items()
            if obj_info["confidence"] > self.config["min_detected_confidence"]
        }

    @property
    def availiable_obejct_keys(self):
        """Map ``ord(index)`` to label for confident objects.

        ``index`` is the part of the label after the space (``"Object 3"`` ->
        ``"3"``).  Only single-character indices can be converted with
        ``ord``; objects with a longer index are skipped instead of raising
        ``TypeError``.
        """
        ret = {}
        for obj_label, obj_info in self._object_dict.items():
            if obj_info["confidence"] > self.config["min_detected_confidence"]:
                index = obj_label.split(' ')[1]
                if len(index) != 1:
                    continue
                key = ord(str(index))
                ret[key] = obj_label
        return ret

    def update(self, input_dict):
        """Process one frame of input and return all tracked objects.

        Args:
            input_dict: Mapping with keys ``points``, ``weight``, ``high``,
                ``low`` and ``indices``.  ``points`` may be ``None``, in which
                case no clusters are produced for this frame.

        Returns:
            The internal ``{label: object}`` dictionary (not filtered by
            confidence; use :attr:`object_dict` for that).
        """
        points, weight, high, low = input_dict["points"], input_dict["weight"], input_dict["high"], \
            input_dict["low"]
        occupied_grids = input_dict["indices"]
        if points is not None:
            raw_cluster_dict = self._find_cluster(points, weight, high, low,
                                                  occupied_grids)
            prop = self._process_cluster(raw_cluster_dict)
        else:
            prop = {}
        self._register_cluster(prop)
        self._verify_objects()
        return self._object_dict
"""le=LabelEncoder() y=le.fit_transform(y) le.transform(['M','B']) X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,stratify=y,random_state=1) pipe_lr=make_pipeline(StandardScaler(),PCA(n_components=2),LogisticRegression(random_state=1)) pipe_lr.fit(X_train,y_train) y_pred=pipe_lr.predict(X_test) print(pipe_lr.score(X_test,y_test)) """ import matplotlib.pyplot as plt from sklearn.datasets import make_moons from sklearn.cluster import DBSCAN from sklearn.cluster import KMeans #X,y=make_moons(n_samples=400,noise=0.05,random_state=0) db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean') y_db = db.fit_predict(X) km = KMeans(n_clusters=2, random_state=0) y_km = km.fit_predict(X) plt.scatter(X[:, 0], X[:, 1]) plt.show() plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1], c='lightblue', edgecolor='black', marker='o', s=40, label='cluster 1') plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1],
def detect_outliers(laptimes, divider):
    """Label lap times with DBSCAN; outliers receive the label ``-1``.

    The neighbourhood radius is a fraction of the data range:
    ``eps = (max(laptimes) - min(laptimes)) / divider`` with
    ``min_samples=2``.

    Degenerate inputs are handled without calling DBSCAN, which rejects a
    non-positive ``eps``:

    * empty input returns an empty label array;
    * all-identical lap times (range 0) are one cluster (label ``0``) when
      there are at least two of them, and a single lap is noise (``-1``),
      which is what ``min_samples=2`` gives for any positive ``eps``.

    Args:
        laptimes: Sequence of numeric lap times.
        divider: Number the data range is divided by to obtain ``eps``.

    Returns:
        1-D integer array with one cluster label per lap time.
    """
    n_laps = len(laptimes)
    if n_laps == 0:
        return np.empty(0, dtype=np.intp)

    rng = max(laptimes) - min(laptimes)
    if rng == 0:
        if n_laps >= 2:
            return np.zeros(n_laps, dtype=np.intp)
        return np.full(n_laps, -1, dtype=np.intp)

    outlier_detection = DBSCAN(min_samples=2, eps=rng / divider)
    return outlier_detection.fit_predict(np.array(laptimes).reshape(-1, 1))
def create_clusters(df, cluster_eps=0.03, min_samples=30):
    """Cluster the rows of ``df`` with DBSCAN and return the labels.

    Args:
        df: Feature data handed directly to ``DBSCAN.fit_predict``.
        cluster_eps: Value passed to DBSCAN as ``eps``.
        min_samples: Value passed to DBSCAN as ``min_samples``.

    Returns:
        The label array produced by ``fit_predict``.
    """
    return DBSCAN(eps=cluster_eps, min_samples=min_samples).fit_predict(df)
# Parameters: eps (neighbourhood radius), min_samples (number of samples in a
# point's neighbourhood for it to count as a core point).
dbscan = DBSCAN(eps=0.127, min_samples=20)

# Algorithms to run, as (name, estimator) pairs.  The trailing comma matters:
# without it the outer parentheses are just grouping and the value is the
# flat pair ('DBSCAN', dbscan) rather than a tuple of pairs.
algorithms = (('DBSCAN', dbscan),)

cluster_predict = {}

# Algorithm loop
filas_tabla_res = []

print('----- Ejecutando ' + 'DBSCAN', end='')
# -----
# Time the run.
t = time.time()
# Run the algorithm and store the cluster assignments.
cluster_predict['DBSCAN'] = dbscan.fit_predict(X_normal)
tiempo = time.time() - t

# Report the results.
print(": {:.2f} segundos, ".format(tiempo), end='')
try:
    metric_CH = metrics.calinski_harabasz_score(X_normal,
                                                cluster_predict['DBSCAN'])
except Exception:
    # The score cannot be computed for every labelling; fall back to the -1
    # sentinel.  A bare "except:" would also swallow KeyboardInterrupt and
    # SystemExit.
    print("----ERROR: No podemos calcular el índice Calinski-Harabasz---")
    metric_CH = -1
else:
    print("Calinski-Harabasz Index: {:.3f}, ".format(metric_CH), end='')

# Another quality measure, less efficient: computing the Silhouette can use a
# lot of RAM.  With many samples (say more than 10k) a sample of the data can
# be used instead, e.g. 20%.
muestra_silhoutte = 0.2 if len(X) > 10000 else 1.0
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(xlst) distances, indices = nbrs.kneighbors(xlst) mean_dist = np.mean(distances) print("MEAN NEAREST NEIGHBOR DISTANCE", np.mean(distances)) clustering = DBSCAN(eps=mean_dist * 4.0, min_samples=1) xlst_cluster = xlst * 1.0 xlst_cluster[:, 0] *= 5.0 print('xlst shape', xlst.shape) cluster_labels = clustering.fit_predict(xlst_cluster) print('cl', cluster_labels) cluster2chars = {} for j in range(0, len(cluster_labels)): if not cluster_labels[j] in cluster2chars: cluster2chars[cluster_labels[j]] = [] cluster2chars[cluster_labels[j]].append(xlst[j]) cluster2color = {} for keyc in cluster2chars: randcolor = [
# Tuning constants.
epsilon_deviation = 0.33
change_width_for = {}

weight_min_width = 0.9
weight_max_width = 0.1
weight_max_deviation = 0.6
weight_min_deviation = 0.4

allowed_group_wind_speed_deviation = 0.13
min_points_in_group = 4
limit_elements_in_group_std = 1.6
limit_next_center_std = 0.5
right_diff_limit = 5

# One pass per discrete active-power level; groupby yields (key, frame) pairs.
for group in data_set.groupby('discrete_ActivePower'):
    work_group = group[1].copy()

    # Drop the samples DBSCAN marks as noise (label -1).
    dbscan_features = work_group[['scaled_WindSpeed', 'discrete_ActivePower']]
    work_group['dbscan_label'] = dbscan_alg_obj.fit_predict(dbscan_features)
    work_group = work_group[work_group['dbscan_label'] != -1].copy()

    # K-Means needs more samples than clusters.
    if work_group.shape[0] <= n_kmeans_clusters:
        continue

    kmeans_alg_obj.fit(work_group[['scaled_WindSpeed', 'discrete_ActivePower']])
    work_group['kmeans_label'] = kmeans_alg_obj.labels_

    # Cluster indices ordered by the first coordinate of their centre
    # (the scaled wind speed column), smallest first.
    center_speeds = kmeans_alg_obj.cluster_centers_[:, 0]
    label_positions = sorted(range(len(center_speeds)),
                             key=lambda position: center_speeds[position])

    # Spread between the outermost cluster centres.
    sorted_values = np.sort(kmeans_alg_obj.cluster_centers_[:, 0], axis=0)
    current_width = sorted_values[-1] - sorted_values[0]
X.head()

# Standardise, then scale every sample to unit norm.
Xscaled = StandardScaler().fit_transform(X)  # array o/p
Xn = normalize(Xscaled)  # array o/p
X = pd.DataFrame(Xn, columns=data.columns, index=data.index)
X.head()

# Project onto the first two principal components.
obj = PCA(n_components=2, random_state=123)
Xcomp = obj.fit_transform(X)
Xcomp = pd.DataFrame(Xcomp, columns=['P1', "P2"], index=X.index)
Xcomp.head()

# model
model = DBSCAN(eps=0.3, min_samples=4)
fit = model.fit(Xcomp)
labels = fit.labels_
# fit_predict(Xcomp) would cluster the same data a second time only to return
# these same labels, so reuse them.
y_pred = labels

# plot
plt.scatter(Xcomp.iloc[:, 0], Xcomp.iloc[:, 1], c=y_pred, cmap='Paired')
plt.title("DBSCAN")

# summary (label -1 is noise and does not count as a cluster)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print(' number of clusters: %d' % n_clusters_)
print(' number of noise points: %d' % n_noise_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(Xcomp, labels))

# DBSCAN Clustering function
# def dbscan(X, eps, min_samples):
# ss = StandardScaler()
def main(_argv):
    """Detect, track and group people in a video and visualise the result.

    Each frame goes through these stages: object detection with the loaded
    saved model, person tracking, DBSCAN clustering of the tracked person
    centroids, group tracking of the resulting clusters, and drawing.
    The annotated frame is shown in a window unless ``FLAGS.dont_show`` is
    set, and written to ``FLAGS.output`` when that flag is set.

    Keys while the window has focus: ``q``/ESC quit, ``y`` toggles raw
    detector boxes, ``c`` toggles cluster centroids, ``p`` toggles person
    track boxes.

    Args:
        _argv: Unused positional command-line arguments.
    """
    # Hyperparameters
    max_cosine_distance = 0.4  # Used in deep sort
    nn_budget = None  # Used in deep sort
    nms_max_overlap = 1.0
    gathering_thresh = 3  # Groups must have MORE people than this to be drawn
    pixels_to_meter = 150  # 300

    # Initialize person deep sort
    model_filename = 'model_data/mars-small128.pb'
    # Encodes the image data inside a bounding box into a feature vector
    person_encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    person_metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    person_tracker = Tracker(person_metric)
    print("Person deep sort initialized")

    # Initialize group sort
    group_metric = nn_matching.NearestNeighborDistanceMetric(
        "euclidean", max_cosine_distance, nn_budget)
    group_tracker = GroupTracker(group_metric)
    print("Group deep sort initialized")

    # Initialize DBSCAN model for clustering; min_samples=1 means every
    # centroid ends up in some cluster (no noise label).
    dbscan_model = DBSCAN(eps=pixels_to_meter, min_samples=1)
    print("DBSCAN initialized")

    # Load configuration for object detector
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    # Kept in a local so the session object stays referenced while main runs.
    session = InteractiveSession(config=config)
    STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config(FLAGS)
    input_size = FLAGS.size
    video_path = FLAGS.video

    # Load object detection model
    saved_model_loaded = tf.saved_model.load(FLAGS.weights,
                                             tags=[tag_constants.SERVING])
    infer = saved_model_loaded.signatures['serving_default']
    print("Object detection model initialized")

    # Read in all class names from config
    class_names = utils.read_class_names(cfg.YOLO.CLASSES)

    # Only allowed_classes will be tracked and drawn
    allowed_classes = ['person']  # list(class_names.values())

    # Video input and (optional) output
    video = cv2.VideoCapture(video_path)
    out = None
    if FLAGS.output:
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        source_fps = int(video.get(cv2.CAP_PROP_FPS))
        codec = cv2.VideoWriter_fourcc(*FLAGS.output_format)
        out = cv2.VideoWriter(FLAGS.output, codec, source_fps, (width, height))

    # Initialize color map
    cmap = plt.get_cmap('tab20b')
    colors = [cmap(i)[:3] for i in np.linspace(0, 1, 20)]

    display_yolo = False  # Controls whether yolo bounding boxes are drawn
    display_person_track = True
    display_centroids = False  # Controls whether cluster centroids are drawn

    cv2.namedWindow("detection", cv2.WINDOW_NORMAL)
    cv2.resizeWindow("detection", 1280, 720)

    try:
        # Main loop
        while True:
            # ---- Read frame ----
            _, frame = video.read()
            if frame is None:
                print('Video has ended, restarting video...')
                if FLAGS.output:
                    # When recording, one pass over the video is enough; the
                    # writer is released in the ``finally`` clause below.
                    print("Output video written!")
                    break

                try:
                    video.reset()
                except AttributeError:
                    # cv2.VideoCapture has no reset(): release the exhausted
                    # capture and reopen the source from the start.
                    video.release()
                    video = cv2.VideoCapture(video_path)

                person_tracker.reset_tracks()
                group_tracker.reset_tracks()
                _, frame = video.read()
                if frame is None:
                    # Nothing readable even after reopening; stop instead of
                    # crashing inside cvtColor.
                    print('Could not read a frame after restarting the video.')
                    break

            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Reformat the frame into the detector's input tensor
            image_data = cv2.resize(frame, (input_size, input_size))
            image_data = image_data / 255.
            image_data = image_data[np.newaxis, ...].astype(np.float32)
            start_time = time.time()

            # ---- Object detection ----
            batch_data = tf.constant(image_data)
            pred_bbox = infer(batch_data)
            for value in pred_bbox.values():
                boxes = value[:, :, 0:4]
                pred_conf = value[:, :, 4:]

            # Run NMS to drop duplicate bounding boxes output by the model
            boxes, scores, classes, valid_detections = \
                tf.image.combined_non_max_suppression(
                    boxes=tf.reshape(boxes, (tf.shape(boxes)[0], -1, 1, 4)),
                    scores=tf.reshape(
                        pred_conf,
                        (tf.shape(pred_conf)[0], -1, tf.shape(pred_conf)[-1])),
                    max_output_size_per_class=50,
                    max_total_size=50,
                    iou_threshold=FLAGS.iou,
                    score_threshold=FLAGS.score)

            # Convert to numpy arrays and slice out unused elements
            num_objects = valid_detections.numpy()[0]
            bboxes = boxes.numpy()[0][0:int(num_objects)]
            scores = scores.numpy()[0][0:int(num_objects)]
            classes = classes.numpy()[0][0:int(num_objects)]

            # Get usable format of bounding boxes
            original_h, original_w, _ = frame.shape
            bboxes, bboxes_xyxy = utils.format_boxes(bboxes, original_h,
                                                     original_w)

            # Draw raw detector boxes for class index 0 only
            if display_yolo:
                for j in range(0, len(bboxes_xyxy)):
                    if classes[j] != 0:
                        continue
                    box = bboxes_xyxy[j]
                    cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]),
                                  (0, 0, 255), 3)

            # Keep only detections whose class name is in allowed_classes
            names = []
            deleted_indx = []
            for i in range(num_objects):
                class_name = class_names[int(classes[i])]
                if class_name not in allowed_classes:
                    deleted_indx.append(i)
                else:
                    names.append(class_name)
            names = np.array(names)
            count = len(names)
            if FLAGS.count:
                cv2.putText(frame, "Objects being tracked: {}".format(count),
                            (5, 35), cv2.FONT_HERSHEY_COMPLEX_SMALL, 2,
                            (0, 255, 0), 2)
                print("Objects being tracked: {}".format(count))

            # Delete detections that are not in allowed_classes
            bboxes = np.delete(bboxes, deleted_indx, axis=0)
            scores = np.delete(scores, deleted_indx, axis=0)

            # ---- Person deep sort ----
            # Encode person detections to feed to person_tracker
            person_features = person_encoder(frame, bboxes)
            person_detections = [
                Detection(bbox, score, class_name, feature)
                for bbox, score, class_name, feature in zip(
                    bboxes, scores, names, person_features)
            ]

            # Run non-maxima suppression
            boxs = np.array([d.tlwh for d in person_detections])
            scores = np.array([d.confidence for d in person_detections])
            classes = np.array([d.class_name for d in person_detections])
            indices = preprocessing.non_max_suppression(
                boxs, classes, nms_max_overlap, scores)
            person_detections = [person_detections[i] for i in indices]

            # Execute the person tracker
            person_tracker.predict()
            person_tracker.update(person_detections)

            # Collect boxes/centroids of confirmed, recently updated tracks
            person_bboxes = []
            person_centroids = []
            for person_track in person_tracker.tracks:
                if (not person_track.is_confirmed()
                        or person_track.time_since_update > 3):
                    continue

                person_bboxes.append([int(x) for x in person_track.to_tlbr()])
                person_centroids.append(
                    utils.get_centroid(person_track.to_tlwh()))

                if display_person_track:
                    bbox = person_track.to_tlbr()
                    color = colors[int(person_track.track_id) % len(colors)]
                    color = [i * 255 for i in color]
                    cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                                  (int(bbox[2]), int(bbox[3])), color, 2)
                    # Filled label background sized to the label text
                    cv2.rectangle(
                        frame, (int(bbox[0]), int(bbox[1] - 30)),
                        (int(bbox[0]) +
                         (len("person") + len(str(person_track.track_id))) * 17,
                         int(bbox[1])), color, -1)
                    cv2.putText(frame,
                                "person" + "-" + str(person_track.track_id),
                                (int(bbox[0]), int(bbox[1] - 10)), 0, 0.75,
                                (255, 255, 255), 2)

            # ---- Group sort ----
            # Detect clusters of people that are too close together
            cluster_bboxes = []
            cluster_sizes = []
            if person_bboxes:
                cluster_assignments = dbscan_model.fit_predict(person_centroids)
                for cluster in np.unique(cluster_assignments):
                    # Indices of the people assigned to the current cluster
                    row_ix = np.where(cluster_assignments == cluster)
                    color = colors[random.randint(0, 10) % len(colors)]
                    color = [j * 255 for j in color]

                    # Two corner points per person box
                    point_arr = []
                    for i in row_ix[0]:
                        point_arr.append(list(person_bboxes[i][0:2]))
                        point_arr.append(list(person_bboxes[i][2:4]))
                        if display_centroids:
                            frame = cv2.circle(
                                frame,
                                (person_centroids[i][0], person_centroids[i][1]),
                                20, color, -1)

                    # Bounding rectangle that covers the whole group
                    x, y, w, h = cv2.boundingRect(np.array(point_arr))

                    # Skip bounding rectangles with a height or width of 1
                    if w != 1 and h != 1:
                        cluster_bboxes.append([x, y, w, h])
                        # Number of people in the current cluster
                        cluster_sizes.append(len(point_arr) // 2)

            # Encode cluster data and feed it to the group tracker. Each
            # feature is [group size, centroid mean (2), centroid variance (2)].
            group_features = []
            if cluster_bboxes:
                # These statistics do not depend on the cluster being encoded,
                # so compute them once per frame.
                # NOTE(review): they are taken over ALL cluster boxes of the
                # frame, not over the members of each group -- confirm that
                # this is the intended feature.
                bbox_centroids = np.apply_along_axis(utils.get_centroid, 1,
                                                     cluster_bboxes)
                centroid_mean = np.mean(bbox_centroids, axis=0)
                # Previously np.mean was used here as well, which duplicated
                # the mean in the "variance" slots of the feature.
                centroid_var = np.var(bbox_centroids, axis=0)
                for size in cluster_sizes:
                    group_features.append(
                        np.concatenate(([size], centroid_mean, centroid_var)))
            group_features = np.array(group_features)

            group_detections = [
                GroupDetection(bbox, num_people, feature)
                for bbox, num_people, feature in zip(
                    cluster_bboxes, cluster_sizes, group_features)
            ]

            # Call the group tracker
            group_tracker.predict()
            group_tracker.update(group_detections)

            # Draw tracked groups
            for group_track in group_tracker.tracks:
                # Do not draw tracks that are unconfirmed, were not updated in
                # the latest frame, or are not larger than gathering_thresh.
                # Deleting stale tracks is presumably handled by
                # group_tracker.update() -- confirm.
                if (not group_track.is_confirmed()
                        or group_track.time_since_update > 1
                        or group_track.num_people <= gathering_thresh):
                    continue

                bbox = group_track.to_tlbr()

                # Blend a filled rectangle over the group, then outline it
                blank_frame = np.zeros(frame.shape, np.uint8)
                cv2.rectangle(blank_frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 0, 0),
                              cv2.FILLED)
                frame = cv2.addWeighted(frame, 1.0, blank_frame, 0.4, 1)
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)
                cv2.putText(
                    frame,
                    f"{group_track.track_id} | Size: {group_track.num_people}",
                    tuple(group_track.get_centroid()), 0, 2, (255, 255, 255), 2)

                # TODO: issue an alert when the track age exceeds a certain
                # number of frames.

            # Frames per second of the whole per-frame pipeline
            fps = 1.0 / (time.time() - start_time)
            print("FPS: %.2f" % fps)
            result = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

            if not FLAGS.dont_show:
                cv2.imshow("detection", result)

            # If the output flag is set, save the frame to the video file
            if FLAGS.output:
                out.write(result)

            key_press = cv2.waitKey(1) & 0xFF
            if key_press == ord('q') or key_press == 27:  # ESC key
                break
            elif key_press == ord('y'):
                display_yolo = not display_yolo
            elif key_press == ord('c'):
                display_centroids = not display_centroids
            elif key_press == ord('p'):
                display_person_track = not display_person_track
    finally:
        # Release capture and writer on every exit path (end of video, key
        # press or exception) so the output file is finalised.
        video.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()
from sklearn.cluster import DBSCAN
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# Cluster credit-card clients with DBSCAN on two standardised features:
# column 1 of the file and the total billed amount over six months.
base = pd.read_csv('dataset/credit_card_clients.csv', header=1)

# Accumulate the six monthly bill columns, left to right, into one total.
bill_total = base['BILL_AMT1']
for month in range(2, 7):
    bill_total = bill_total + base['BILL_AMT{}'.format(month)]
base['BILL_TOTAL'] = bill_total

# Column 25 is the BILL_TOTAL column appended above.
x = base.iloc[:, [1, 25]].values
scaler = StandardScaler()
x = scaler.fit_transform(x)

dbscan = DBSCAN(eps=0.37, min_samples=4)
previsoes = dbscan.fit_predict(x)

# Distinct labels and how many points carry each of them.
unicos, quantidade = np.unique(previsoes, return_counts=True)

# Plot the first three clusters, one colour each.
for cluster_id, cluster_color, cluster_name in (
        (0, 'red', 'Cluster 1'),
        (1, 'orange', 'Cluster 2'),
        (2, 'green', 'Cluster 3'),
):
    in_cluster = previsoes == cluster_id
    plt.scatter(x[in_cluster, 0], x[in_cluster, 1],
                s=100, c=cluster_color, label=cluster_name)
plt.xlabel('Limite')
plt.ylabel('Gastos')
plt.legend()

# Append the label as column 26 and order the clients by it.
lista_clientes = np.column_stack((base, previsoes))
ordem = lista_clientes[:, 26].argsort()
lista_clientes = lista_clientes[ordem]
# # for eps in np.linspace(0.001, 10, 1000): # for neighbours in range(1, 10): # dbscan = DBSCAN(eps=eps, min_samples=neighbours, metric='canberra', algorithm='brute') # y_predicted = dbscan.fit_predict(frame) + 1 # # score = accuracy_score(y_predicted, train_ys) # if score > mx_score: # mx_score = score # algo = dbscan # # print(mx_score) # print(algo.get_params(True)) best_algo = DBSCAN(eps=1.0119099099099098, min_samples=9, metric='canberra', algorithm='brute') pred = best_algo.fit_predict(frame) + 1 draw_clusters(train_xs, train_ys, 'Original dataset: Wine', save_name='original.png') draw_clusters(train_xs, pred, 'Clusterized by DBSCAN dataset: Wine | Accuracy: 0.78', save_name='clusterized.png') eps_xs = [] f_scores = [] silhouette_scores = [] for eps in np.linspace(0.001, 3, 500): algo = DBSCAN(eps=eps, min_samples=4, metric='canberra', algorithm='brute') predicted = algo.fit_predict(frame) + 1 eps_xs.append(eps) f_score = accuracy_score(train_ys, predicted) f_scores.append(f_score) if len(set(algo.labels_ + 1)) == 1: