def get_cluster_medoid_positions_OPTICS(
    file_list, cgmodel, min_samples=5, xi=0.05,
    frame_start=0, frame_stride=1, frame_end=-1, output_format="pdb", output_dir="cluster_output", output_cluster_traj = False,
    plot_silhouette=True, plot_rmsd_hist=True, filter=True, filter_ratio=0.05):
    """
    Given PDB or DCD trajectory files and coarse grained model as input, this function performs OPTICS clustering on the poses in the trajectory, and returns a list of the coordinates for the medoid pose of each cluster.

    :param file_list: A list of PDB or DCD files to read and concatenate
    :type file_list: List( str )

    :param cgmodel: A CGModel() class object. If None, the first entry of file_list is passed on as topology source when writing medoids.
    :type cgmodel: class

    :param min_samples: minimum of number of samples in neighborhood of a point to be considered a core point (includes point itself)
    :type min_samples: int

    :param xi: OPTICS parameter for minimum slope on reachability plot signifying a cluster boundary
    :type xi: float

    :param frame_start: First frame in trajectory file to use for clustering.
    :type frame_start: int

    :param frame_stride: Advance by this many frames when reading trajectories.
    :type frame_stride: int

    :param frame_end: Last frame in trajectory file to use for clustering.
    :type frame_end: int

    :param output_format: file format extension to write medoid coordinates to (default="pdb"), dcd also supported
    :type output_format: str

    :param output_dir: directory to write clustering medoid and plot files (created, including parents, if missing)
    :type output_dir: str

    :param output_cluster_traj: option to also write the frames of each cluster to file (default=False)
    :type output_cluster_traj: boolean

    :param plot_silhouette: option to create silhouette plot(default=True)
    :type plot_silhouette: boolean

    :param plot_rmsd_hist: option to write a histogram of the pairwise rmsd values (default=True)
    :type plot_rmsd_hist: boolean

    :param filter: option to apply neighborhood radius filtering to remove low-density data (default=True)
    :type filter: boolean

    :param filter_ratio: fraction of data points which pass through the neighborhood radius filter (default=0.05)
    :type filter_ratio: float

    :returns:
       - medoid_positions ( np.array( float * unit.nanometer ( n_clusters x num_particles x 3 ) ) ) - A 3D numpy array of poses corresponding to the medoids of all trajectory clusters.
       - cluster_sizes ( List ( int ) ) - A list of number of members in each cluster
       - cluster_rmsd( np.array ( float ) ) - A 1D numpy array of rmsd (in cluster distance space) of samples to cluster centers
       - n_noise ( int ) - number of points classified as noise
       - silhouette_avg - ( float ) - average silhouette score across all clusters (None if it could not be computed)
       - labels ( np.array ( int ) ) - OPTICS label of every clustered frame; -1 marks noise
       - original_indices - frame indices as returned by get_rmsd_matrix (and filter_distances when filtering is on)
    """

    # makedirs(exist_ok=True) also creates missing parent directories and
    # does not race with another process creating the same directory.
    os.makedirs(output_dir, exist_ok=True)

    top_from_pdb = None
    if cgmodel is None:
        top_from_pdb = file_list[0]

    distances, traj_all, original_indices = get_rmsd_matrix(
        file_list, cgmodel, frame_start, frame_stride, frame_end,
        return_original_indices=True)

    if filter:
        # Filter distances (the achieved ratio is returned but not needed here):
        distances, dense_indices, _filter_ratio_actual, original_indices = \
            filter_distances(distances, filter_ratio=filter_ratio,
                             return_original_indices = True,
                             original_indices = original_indices)

        traj_all = traj_all[dense_indices]

    if plot_rmsd_hist:
        # Plot rmsd histogram:
        distances_row = np.reshape(distances, (distances.shape[0]*distances.shape[1],1))

        # Remove the diagonal 0 elements:
        distances_row = distances_row[distances_row != 0]

        plt.figure()
        plt.hist(distances_row, bins=1000, density=True)

        plt.xlabel('rmsd')
        plt.ylabel('probability density')
        plt.savefig(f'{output_dir}/distances_rmsd_hist.pdf')
        plt.close()

    # Cluster with sklearn OPTICS
    optic = OPTICS(min_samples=min_samples,xi=xi,cluster_method='xi',metric='precomputed').fit(distances)
    # This produces cluster labels from 0 to n_clusters-1, and assigns -1 to noise points

    # Get labels
    labels = optic.labels_

    # Number of clusters:
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    # Number of noise points:
    n_noise = list(labels).count(-1)

    # Get indices of frames in each cluster:
    cluster_indices = {}
    cluster_sizes = []

    for k in range(n_clusters):
        cluster_indices[k] = np.argwhere(labels==k)[:,0]
        cluster_sizes.append(len(cluster_indices[k]))

    # Find the structure closest to each center (medoid):
    # OPTICS/DBSCAN does not have a built-in function to transform to cluster-distance space,
    # as the centroids of the clusters are not physically meaningful in general. However, as
    # RMSD between structures is our only clustering feature, the cluster centers (regions of
    # high density) will likely be representative structures of each cluster.

    # Following the protocol outlined in MDTraj example:
    # http://mdtraj.org/1.9.3/examples/centroids.html

    # Create distance matrices within each cluster. np.ix_ selects the
    # (members x members) sub-matrix in one vectorised step.
    distances_k = {}
    for k in range(n_clusters):
        members = cluster_indices[k]
        distances_k[k] = np.array(distances[np.ix_(members, members)], dtype=float)

    # Compute medoid based on similarity scores:
    medoid_index = [] # Global index
    intra_cluster_medoid_index = [] # Index within cluster
    for k in range(n_clusters):
        intra_cluster_medoid_index.append(
            np.exp(-distances_k[k] / distances_k[k].std()).sum(axis=1).argmax()
        )

        # Here we need to use the global sample index to find the medoid structure:
        medoid_index.append(cluster_indices[k][intra_cluster_medoid_index[k]])

    medoid_xyz = np.zeros([n_clusters,traj_all.n_atoms,3])
    for k in range(n_clusters):
        medoid_xyz[k,:,:] = traj_all[medoid_index[k]].xyz[0]

    # Write medoids to file
    write_medoids_to_file(cgmodel, medoid_xyz, output_dir, output_format, top_from_pdb=top_from_pdb)
    medoid_positions = medoid_xyz * unit.nanometer

    if output_cluster_traj:
        write_clusters_to_file(labels, traj_all, output_dir, output_format)

    # Compute intra-cluster rmsd of samples to medoid based on structure rmsd
    cluster_rmsd = np.zeros(n_clusters)

    for k in range(n_clusters):
        medoid_row = distances_k[k][intra_cluster_medoid_index[k]]
        cluster_rmsd[k] = np.sqrt((medoid_row**2).sum()/len(cluster_indices[k]))

    # Get silhouette scores
    # NOTE(review): silhouette_samples is called with its default metric while
    # OPTICS above treats `distances` as precomputed - confirm this is intended.
    try:
        silhouette_sample_values = silhouette_samples(distances, labels)
        silhouette_avg = np.mean(silhouette_sample_values[labels!=-1])

        if plot_silhouette:
            # Plot silhouette analysis
            plotfile = f"{output_dir}/silhouette_optics_min_sample_{min_samples}_xi_{xi}.pdf"

            make_silhouette_plot(
                optic, silhouette_sample_values, silhouette_avg,
                n_clusters, cluster_rmsd, cluster_sizes, plotfile
                )
    except ValueError:
        print("There are either no clusters, or no noise points identified. Try adjusting OPTICS min_samples, xi parameters")
        silhouette_avg = None

    return medoid_positions, cluster_sizes, cluster_rmsd, n_noise, silhouette_avg, labels, original_indices
j += 1 flag = True else: break if flag and traj[j - 1, 0] - traj[i, 0] > tThresh: # styPt = np.mean(traj[i:j], axis=0) styPts.append(styPt) i = j else: i += 1 return np.array(styPts) if __name__ == '__main__': import util data_dir = "data_NCSU" # traces_files = [trace for trace in os.listdir(data_dir) if re.match(r'\d+\.trace',trace)] traces_files = util.get_trace_files(data_dir) locH = [] for trace_file in traces_files: trace = np.loadtxt(trace_file) locH.append(detect_staypoints(trace, 90, 10)) X = [h[:, 1:3] for h in locH] X = np.vstack(tuple(X)) clust = OPTICS(min_samples=100, xi=.05, min_cluster_size=.05) clust.fit(X) util.plt_clusters(clust, X) print("hello")
# FeatureAgglomeration did not have fit_predict and fail in this version # 'FeatureAgglomeration_100' : FeatureAgglomeration(n_clusters=100), # 'FeatureAgglomeration_150' : FeatureAgglomeration(n_clusters=150), # 'FeatureAgglomeration_200' : FeatureAgglomeration(n_clusters=200), # 'FeatureAgglomeration_250' : FeatureAgglomeration(n_clusters=250), # 'FeatureAgglomeration_300' : FeatureAgglomeration(n_clusters=300), # 'FeatureAgglomeration_350' : FeatureAgglomeration(n_clusters=350), # 'FeatureAgglomeration_400' : FeatureAgglomeration(n_clusters=400), 'MiniBatchKMeans_100': MiniBatchKMeans(n_clusters=100), 'MiniBatchKMeans_150': MiniBatchKMeans(n_clusters=150), 'MiniBatchKMeans_200': MiniBatchKMeans(n_clusters=200), 'MiniBatchKMeans_250': MiniBatchKMeans(n_clusters=250), 'MiniBatchKMeans_300': MiniBatchKMeans(n_clusters=300), # 'OPTICS_0_5' :OPTICS(eps = 0.5, min_samples = 2), 'OPTICS_1_0': OPTICS(eps=1.5, min_samples=2), # 'OPTICS_1_5' :OPTICS(eps = 2.0, min_samples = 2), # 'OPTICS_2_5' :OPTICS(eps = 2.5, min_samples = 2), # 'OPTICS_3_0' :OPTICS(eps = 3.0, min_samples = 2), 'MeanShift_1_0': MeanShift(bandwidth=1.0), 'MeanShift_1_5': MeanShift(bandwidth=1.5), 'MeanShift_2_0': MeanShift(bandwidth=2.0), 'MeanShift_2_5': MeanShift(bandwidth=2.5), 'MeanShift_3_0': MeanShift(bandwidth=3.0), } # test all combinations results = [] for model_key in models.keys(): for df_key in dfs.keys():
# db = DBSCAN().fit(data) # score=metrics.normalized_mutual_info_score(digits.target,db.labels_,average_method='arithmetic') # print(score) bench_show3(DBSCAN(),name="DBSCAN", data=data) # 光学聚类 # clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05) # # # Run the fit # clust.fit(data) # score=metrics.normalized_mutual_info_score(digits.target,clust.labels_,average_method='arithmetic') # print(score) bench_show(OPTICS(min_samples=50, xi=.05, min_cluster_size=.05),name="OPTICS", data=data) # 高斯混合模型 #gmm = mixture.GaussianMixture(n_components=n_digits, covariance_type='full').fit(data) # score=metrics.normalized_mutual_info_score(digits.target,gmm.predict(data),average_method='arithmetic') # print(score) bench_show2(mixture.GaussianMixture(n_components=n_digits, covariance_type='full'),name="Gaussian", data=data) # 桦木 # brc = Birch(branching_factor=50, n_clusters=n_digits, threshold=0.5, compute_labels=True) # brc.fit(data) # score=metrics.normalized_mutual_info_score(digits.target,brc.labels_,average_method='arithmetic') # print(score) bench_show2(Birch(branching_factor=50, n_clusters=n_digits, threshold=0.5, compute_labels=True),name="Birch", data=data)
vectors = [] for word in sentence: if word in model: vectors.append(model[word]) df_vectors = pd.DataFrame(vectors) # Wortweise Durchschnitt bilden, sodass der ganze Satz einen einzigen "Durchschnitts-Wortvektor" erhält mean_vector = df_vectors.mean(axis=0).values.tolist() entry_vectors.append(mean_vector) df['vector'] = entry_vectors # Clustering xi = .07 clust = OPTICS(min_samples=2, xi=xi) labels = clust.fit_predict(entry_vectors) df['label'] = labels pd.set_option('display.max_colwidth', -1) # Lange Strings # Spalten wählen df = df.filter(items=['label', 'feed', 'entry']) # Unkategorisierte Zeilen weglassen df = df[df['label'] >= 0] # Sortieren df = df.sort_values(by='label') print(df.to_string())
def _best_candidate(candidates):
    """Return the best ``dict(model=..., score=...)`` entry of *candidates*.

    The search starts from ``dict(model=None, score=0)``, so a candidate is
    only chosen when its score is strictly positive; ties keep the earliest.
    """
    best = dict(model=None, score=0)
    for candidate in candidates:
        if candidate['score'] > best['score']:
            best = dict(model=candidate['model'], score=candidate['score'])
    return best


def _score_clustering(model, data):
    """Fit *model* on *data* and return ``dict(model, score)`` (silhouette score).

    Returns None when the silhouette score is undefined, e.g. when every
    sample ended up with the same label.
    """
    model.fit(data)
    try:
        score = metrics.silhouette_score(data, model.labels_)
    except ValueError:
        return None
    return dict(model=model, score=score)


def _agglomerative(distance, linkage):
    """Build an AgglomerativeClustering for *distance* on old and new scikit-learn."""
    try:
        return AgglomerativeClustering(affinity=distance, linkage=linkage)
    except TypeError:
        # Newer scikit-learn releases renamed ``affinity`` to ``metric``.
        return AgglomerativeClustering(metric=distance, linkage=linkage)


def _holdout_candidate(model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training split and score it on the test split."""
    model.fit(X_train, y_train)
    return dict(model=model, score=model.score(X_test, y_test))


def _cv_candidate(model, x, y):
    """Score *model* by the average of its cross-validation scores."""
    return dict(model=model, score=np.average(cross_val_score(model, x, y)))


def generate_model(df: pd.DataFrame, label_column: Optional[str]) -> Dict:
    """Pick the best-scoring general-purpose model for *df*.

    The kind of task is derived from *label_column*:

    * ``None`` - clustering (DBSCAN, MeanShift, OPTICS, agglomerative),
      scored with the silhouette score on a PCA projection;
    * a numeric column - regression (decision tree, random forest,
      k-neighbours), scored on a 30% hold-out split;
    * a text/categorical column - classification (decision tree, random
      forest, k-neighbours, Gaussian naive Bayes), scored with
      cross-validation.

    The caller's DataFrame is not modified.

    :param df: input data.
    :param label_column: name of the target column, or None for clustering.
    :returns: ``dict(model=<estimator or None>, score=<float>)`` for the best
        candidate; ``dict(model=None, final_score=None)`` when *label_column*
        is neither None, numeric nor categorical.
    """
    num_cols = get_numeric_columns(df=df)
    cat_cols = get_text_categorical_columns(df=df)
    # Work on a private copy so preprocessing never leaks into the caller's frame.
    df = df.copy()

    # The hyper-parameter grids are intentionally small and shallow: the aim is
    # a model that is acceptable on many data sets, not tuned to one
    # ("no free lunch"). Several model families are tried so that each can win
    # on the data it suits.

    if label_column is None:
        print('clustering')
        # Preprocessing: text columns are dropped because they rarely help
        # generic clustering.
        df = df.drop(columns=list(cat_cols))

        # Drop rows with missing values (legitimate zeros are kept).
        df = df.dropna()

        # Scale everything down for PCA.
        for col in num_cols:
            df[col] = normalize_column(df_column=df[col])

        # PCA for dimensionality reduction: n_components=0.90 keeps as many
        # components as are needed to retain 90% of the variance.
        pca = PCA(n_components=0.90)
        projected = pca.fit_transform(X=df)

        def scored(models):
            # Keep only the candidates for which a silhouette score exists.
            results = (_score_clustering(model, projected) for model in models)
            return [result for result in results if result is not None]

        eps_values = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        min_sample_values = [10, 15, 20, 30]

        # DBSCAN
        best_db = _best_candidate(scored(
            DBSCAN(eps=eps, min_samples=min_samples)
            for eps in eps_values
            for min_samples in min_sample_values
        ))

        # MeanShift
        best_ms = _best_candidate(scored(
            MeanShift(bandwidth=bandwidth) for bandwidth in [2, 4, 6, 8, 10]
        ))

        # OPTICS
        best_o = _best_candidate(scored(
            OPTICS(min_samples=min_samples, max_eps=eps)
            for eps in eps_values
            for min_samples in min_sample_values
        ))

        # Hierarchical. Ward linkage is only defined for euclidean distances,
        # so the other metrics use average linkage.
        best_h = _best_candidate(scored(
            _agglomerative(distance, 'ward' if distance == 'euclidean' else 'average')
            for distance in ['euclidean', 'cosine', 'l1', 'l2', 'manhattan']
        ))

        # Return best cluster model (ties resolved in this order).
        best = np.array([
            best_db['score'], best_ms['score'], best_o['score'], best_h['score']
        ]).max()
        if best == best_h['score']:
            print(best_h)
            return best_h
        elif best == best_ms['score']:
            print(best_ms)
            return best_ms
        elif best == best_o['score']:
            print(best_o)
            return best_o
        else:
            print(best_db)
            return best_db

    elif label_column in num_cols:
        print('regressor')
        # Label encode non numeric columns.
        for col in cat_cols:
            df[col] = LabelEncoder().fit_transform(df[col])

        # Replace nans with the mean of each column.
        for col in df.columns:
            df[col] = df[col].fillna(df[col].mean())

        # Partition df into features and target.
        y = df[label_column]
        x = df.drop(label_column, axis=1)

        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30)
        split = (X_train, X_test, y_train, y_test)

        depths = [4, 6, 8, 10]
        splits = [4, 6, 8, 10]
        impurity = [0.2, 0.3, 0.4]

        # Decision Tree Regressor
        best_dt = _best_candidate([
            _holdout_candidate(
                DecisionTreeRegressor(max_depth=depth,
                                      min_impurity_decrease=decrease,
                                      min_samples_split=min_split),
                *split)
            for depth in depths
            for decrease in impurity
            for min_split in splits
        ])

        # Random Forest Regressor
        best_rf = _best_candidate([
            _holdout_candidate(
                RandomForestRegressor(n_estimators=n_estimators,
                                      max_depth=depth,
                                      min_impurity_decrease=decrease,
                                      min_samples_split=min_split),
                *split)
            for n_estimators in [50, 70, 100, 120]
            for depth in depths
            for decrease in impurity
            for min_split in splits
        ])

        # KNeighbours Regressor
        best_kn = _best_candidate([
            _holdout_candidate(KNeighborsRegressor(n_neighbors=k), *split)
            for k in [3, 6, 9, 12, 15]
        ])

        # Return best regressor (ties resolved in this order).
        best_scores = np.array(
            [best_dt['score'], best_rf['score'], best_kn['score']])
        best = best_scores.max()
        print(best_scores)
        if best == best_kn['score']:
            print(best_kn)
            return best_kn
        elif best == best_dt['score']:
            print(best_dt)
            return best_dt
        else:
            print(best_rf)
            return best_rf

    elif label_column in cat_cols:
        print('label_column is categorical!')
        # Label encode non numeric columns (this includes the label itself).
        for col in cat_cols:
            df[col] = LabelEncoder().fit_transform(df[col])

        # Replace nans with the mean of each column.
        for col in df.columns:
            df[col] = df[col].fillna(df[col].mean())

        # Partition df into features and target.
        y_encoded = df[label_column]
        x = df.drop(label_column, axis=1)

        # Every candidate is scored with k-fold cross validation
        # (sklearn.model_selection.cross_val_score) to average out several
        # fits. Usage follows the tutorial by YouTube user "codebasics",
        # Jan. 26, 2019: https://www.youtube.com/watch?v=gJo0uNL-5Qw

        depths = [4, 6, 8, 10]
        splits = [4, 6, 8, 10]
        impurity = [0.2, 0.3, 0.4]

        # Decision Tree
        best_dt = _best_candidate([
            _cv_candidate(
                DecisionTreeClassifier(max_depth=depth,
                                       min_impurity_decrease=decrease,
                                       min_samples_split=min_split),
                x, y_encoded)
            for depth in depths
            for decrease in impurity
            for min_split in splits
        ])

        # Random Forest
        best_rf = _best_candidate([
            _cv_candidate(
                RandomForestClassifier(n_estimators=n_estimators,
                                       max_depth=depth,
                                       min_impurity_decrease=decrease,
                                       min_samples_split=min_split),
                x, y_encoded)
            for n_estimators in [50, 70, 100, 120]
            for depth in depths
            for decrease in impurity
            for min_split in splits
        ])

        # K-Neighbours
        best_kn = _best_candidate([
            _cv_candidate(KNeighborsClassifier(n_neighbors=k), x, y_encoded)
            for k in [3, 6, 9, 12, 15]
        ])

        # GaussianNB has no grid, so its single result is used as is.
        best_gnb = _cv_candidate(GaussianNB(), x, y_encoded)

        # Return best classifier (ties resolved in this order).
        best = np.array([
            best_dt['score'], best_rf['score'], best_kn['score'],
            best_gnb['score']
        ]).max()
        if best == best_gnb['score']:
            print(best_gnb)
            return best_gnb
        elif best == best_kn['score']:
            print(best_kn)
            return best_kn
        elif best == best_dt['score']:
            print(best_dt)
            return best_dt
        else:
            print(best_rf)
            return best_rf

    else:
        print('Error: label_column is not valid.')
        print(
            'It must be relate to a column in df that is of categorical data type, numeric data type, '
            + 'or is simply None. Returning empty dict object.')
        return dict(model=None, final_score=None)
# Scatter the first two principal components, coloured by k-means cluster.
plt.figure(figsize=(10, 10))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally, while keywords work in every release.
# NOTE(review): `markers` only takes effect together with `style=`; confirm
# whether `marker="1"` was intended.
sns.scatterplot(x=principalDf["principal component 1"],
                y=principalDf["principal component 2"],
                hue=principalDf["kMeans"],
                markers="1",
                palette="Accent").set_title("PCA of kMeans analysis")
del (dataCopy, principalComponents, principalDf)

# OPTICS Clustering
# explanation of methods in sklearn documentation
from sklearn.cluster import OPTICS

optics = OPTICS(min_samples=5, xi=.05, min_cluster_size=5)
# Cluster on a copy without the "Gate" column so `data` stays untouched.
dataCopy = data.copy()
del (dataCopy["Gate"])
resOptics = optics.fit_predict(dataCopy)
dataCopy["optics"] = resOptics
#sns.pairplot(dataCopy, diag_kind="kde", markers="1", hue = "optics")
# Cluster sizes (only displayed when run as a notebook cell).
dataCopy["optics"].value_counts()

# DBSCAN Clustering
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=121, min_samples=10)
plt.ylabel('n')
plt.xlabel('Epsilon distance')
# from the k distance graph we see that eps = 5 or 6 is an appropriate value

############## DBSCAN ##############
from sklearn.cluster import DBSCAN
from sklearn import metrics

dbscanClustering = DBSCAN(eps=5, min_samples=6).fit(clData)
dbscanLabels = dbscanClustering.labels_
cluster = list(dbscanLabels)
plotter(x, y, cluster, clusteringPlotsPath + "DBSCAN.pdf")  # -1 cluster are noise points

############## OPTICS ##############
from sklearn.cluster import OPTICS

opticsClustering = OPTICS(min_samples=50, xi=0.05, max_eps=10)
opticsLabels = opticsClustering.fit_predict(clData)
cluster = list(opticsLabels)
plotter(x, y, cluster, clusteringPlotsPath + "OPTICS.pdf")  # -1 cluster are noise points

########### Hierarchical Clustering ##############
from sklearn.cluster import AgglomerativeClustering

HierClustering = AgglomerativeClustering().fit(clData)
# Read the labels from the model fitted on the line above (previously they
# were taken from an unrelated `clustering` name).
agglomerativeLabels = HierClustering.labels_
cluster = list(agglomerativeLabels)
plotter(x, y, cluster, clusteringPlotsPath + "Agglomerative_Clustering.pdf")
def prepdf_or_featureselection(mydata, myfeature_importances=None, prep=True):
    """Add OPTICS location clusters to *mydata*, then prepare it or tune feature selection.

    The 'Latitude'/'Longitude' columns are clustered with OPTICS and the
    cluster label is appended as one-hot columns named ``cluster<label>``.
    A fixed set of columns is dropped afterwards.

    :param mydata: DataFrame holding at least 'Latitude', 'Longitude',
        'med_rental_rate' and the columns dropped below.
    :param myfeature_importances: per-feature importances used to rank
        features; only read when ``prep`` is False.
    :param prep: True - impute and scale the features and return them.
        False - grid-search the number of top features kept in front of a
        random forest regressor.
    :returns: dict with keys ``'mydata'``, ``'mydatacolumns'`` and
        ``'myfeature_selection'``. With ``prep=True`` these are the
        transformed array, the feature column index and the marker string
        ``'onlyprep_selected'``; with ``prep=False`` the first two are the
        marker string and the last is ``best_params_`` of the grid search.
    """
    # Create location feature for data using optics clustering.
    optics_df = mydata[['Latitude', 'Longitude']].copy()
    clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    clust.fit(optics_df)
    optics_df['clust_label'] = clust.labels_
    location_max = np.max(optics_df.clust_label.unique())
    # OPTICS labels noisy samples as -1; move them to their own non-negative
    # label for successful one-hot encoding.
    optics_df['clust_label'] = optics_df['clust_label'].replace(
        -1, location_max + 1)

    # One hot encoding and combining to mydata.
    enc = OneHotEncoder(categories='auto')
    optics_df_1hot = enc.fit_transform(optics_df[['clust_label']])
    # Name the columns from the encoder's own category order. unique() yields
    # order of appearance, which need not match the encoded column order.
    location_labels = ['cluster' + str(label) for label in enc.categories_[0]]
    optics_df_1hot = pd.DataFrame(optics_df_1hot.toarray(),
                                  index=optics_df.index,
                                  columns=location_labels)

    # Part 1 done: cluster columns added.
    mydata = pd.concat([mydata, optics_df_1hot], axis=1)

    # Drop unnecessary columns in our case.
    mydata = mydata.drop([
        'city', 'Latitude', 'Longitude', 'change_hunits', 'studio_1000_1499',
        'studio_1500_more', 'studio_750_999', 'onebed_1000_1499',
        'onebed_1500_more', 'onebed_750_999', 'twobed_1000_1499',
        'twobed_1500_more', 'twobed_750_999', 'threebed_1000_1499',
        'threebed_1500_more', 'threebed_750_999'
    ], axis=1)
    feature_data = mydata.copy()
    mydata = mydata.drop('med_rental_rate', axis=1)

    # The one-hot cluster columns were appended last, so the numerical
    # attributes are the leading columns.
    num_attrbs = mydata.shape[1] - len(location_labels)
    ct_columns = list(range(num_attrbs))

    if prep:
        mydatacolumns = mydata.columns
        # Prepare data section.
        imputer = IterativeImputer(max_iter=10, random_state=22, min_value=0)
        mydata = imputer.fit_transform(mydata)
        # Scale only the numerical attributes; pass the cluster columns through.
        ct = ColumnTransformer([('scale1', RobustScaler(), ct_columns)],
                               remainder='passthrough')
        mydata = ct.fit_transform(mydata)
        myfeature_selection = 'onlyprep_selected'
    else:
        num_pipeline = Pipeline([('imputer',
                                  IterativeImputer(max_iter=10,
                                                   random_state=22,
                                                   min_value=0)),
                                 ('rob_scaler', RobustScaler())])
        # Only the numerical attributes go through the numeric pipeline.
        full_pipeline = ColumnTransformer([('num', num_pipeline, ct_columns)],
                                          remainder='passthrough')

        # Thanks to Aurelien Geron https://github.com/ageron for TopFeatureSelector
        k = 10
        feature_importances = myfeature_importances

        def indices_of_top_k(arr, k):
            # Sorted indices of the k largest entries of arr.
            return np.sort(np.argpartition(np.array(arr), -k)[-k:])

        class TopFeatureSelector(BaseEstimator, TransformerMixin):
            """Keep only the k columns with the largest feature importances."""

            def __init__(self, feature_importances, k):
                self.feature_importances = feature_importances
                self.k = k

            def fit(self, X, y=None):
                self.feature_indices_ = indices_of_top_k(
                    self.feature_importances, self.k)
                return self

            def transform(self, X):
                return X[:, self.feature_indices_]

        prepare_select_and_predict_pipeline = Pipeline([
            ('preparation', full_pipeline),
            ('feature_selection', TopFeatureSelector(feature_importances, k)),
            ('rf_reg', RandomForestRegressor(random_state=22,
                                             n_estimators=100))
        ])

        # Try every possible number of retained features.
        param_grid = [{
            'feature_selection__k':
            list(range(1,
                       len(feature_importances) + 1))
        }]

        grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline,
                                        param_grid,
                                        cv=5,
                                        scoring='neg_mean_squared_error',
                                        n_jobs=-1)
        grid_search_prep.fit(feature_data.drop('med_rental_rate', axis=1),
                             feature_data['med_rental_rate'].copy())
        myfeature_selection = grid_search_prep.best_params_
        mydata = 'onlyprep_selected'
        mydatacolumns = 'onlyprep_selected'

    return {
        'mydata': mydata,
        'mydatacolumns': mydatacolumns,
        'myfeature_selection': myfeature_selection
    }
# normalization data = pre_processing.scaling(data) # # OPTICS # Refs: # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html # https://scikit-learn.org/stable/modules/clustering.html#optics # https://scikit-learn.org/stable/auto_examples/cluster/plot_optics.html#sphx-glr-auto-examples-cluster-plot-optics-py # start_time = time.time() print("[i] OPTICS Clustering: min_samples = {} ...\n".format(min_samples)) clust = OPTICS(min_samples=min_samples, xi=0.1, min_cluster_size=0.1, n_jobs=4, algorithm="ball_tree") # Run the fit clust.fit(data) print("successfully clustered!") print("[i] Run Time: {}".format((time.time() - start_time))) # Performs DBSCAN extraction for an arbitrary epsilon. #Ref: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.cluster_optics_dbscan.html#sklearn.cluster.cluster_optics_dbscan # 0.22 eh o vertice da curva da distancia entre os vizinhos #labels_022 = cluster_optics_dbscan(reachability=clust.reachability_, core_distances=clust.core_distances_, ordering=clust.ordering_, eps=0.22) # 0.8 eh o epsilon que o indiano usou e encontrou um fscore de 96% #labels_080 = cluster_optics_dbscan(reachability=clust.reachability_, core_distances=clust.core_distances_, ordering=clust.ordering_, eps=0.8)
def similarity(x, y):
    """Return a dissimilarity in [0, 1] between two numeric arrays.

    Positions where ``x + y > 1`` are counted as "in common". That count is
    divided by the larger number of positive entries of the two arrays and
    the ratio is subtracted from 1.

    :param x: numpy array.
    :param y: numpy array that can be added to *x*.
    :returns: float; 1.0 when neither array has a positive entry.
    """
    max_len = max(np.count_nonzero(x > 0), np.count_nonzero(y > 0))
    totals = np.add(x, y)
    total_incommon = np.count_nonzero(totals > 1)
    # max() ensures no division by 0.
    return 1 - (total_incommon / max(max_len, 0.000001))


# NOTE(review): `similarity` is defined above but the estimator uses the
# "minkowski" metric - confirm which one is intended.
clust = OPTICS(min_samples=2,
               min_cluster_size=2,
               metric="minkowski",
               n_jobs=10,
               cluster_method='dbscan',
               max_eps=0.5)

# Run the fit
res = clust.fit(data)

# fit() returns the estimator itself; the per-row cluster assignments are in
# labels_ (the estimator object used to be written into the column).
result = res.labels_

d_testing = data.assign(cluster=result)
d_testing = d_testing.assign(concept=names)

csvRes = d_testing[["concept", 'cluster']].sort_values(by=['cluster', 'concept'],
                                                       ascending=False)
csvRes = pd.merge(csvRes, cuis, on='concept')
def _run_best_effort(name, z, cluster_fn):
    """Print *name*, run *cluster_fn* and measure its labels against *z*.

    A failing method is reported on stdout and skipped so the remaining
    methods still run. KeyboardInterrupt and SystemExit are not swallowed.
    """
    print(name)
    try:
        listResult = cluster_fn()
        measure_clustering_results(z, listResult)
    except Exception as exc:
        print("{} clustering failed: {!r}".format(name, exc))


def test_clustering_results(z, edgeList, args):
    '''
    Try different clustering methods without known celltypes.

    Each method is run best-effort and its labels are passed, together with
    z, to measure_clustering_results.

    :param z: data handed to every estimator and to measure_clustering_results
    :param edgeList: edge list handed to generateLouvainCluster
    :param args: object providing n_clusters (used by KMeans and Birch)
    :returns: None
    '''
    def louvain():
        # graph Louvain; the second return value (size) is not needed here
        listResult, _size = generateLouvainCluster(edgeList)
        return listResult

    def kmeans():
        clustering = KMeans(n_clusters=args.n_clusters, random_state=0).fit(z)
        return clustering.predict(z)

    def affinity_propagation():
        return AffinityPropagation().fit(z).predict(z)

    def agglomerative():
        return AgglomerativeClustering().fit(z).labels_.tolist()

    def birch():
        return Birch(n_clusters=args.n_clusters).fit(z).predict(z)

    def optics():
        return OPTICS().fit(z).labels_.tolist()

    # Disabled in the original code and still not run: SpectralClustering,
    # DBSCAN, FeatureAgglomeration, MeanShift.
    methods = [
        ("Louvain", louvain),
        ("KMeans", kmeans),
        ("AffinityPropagation", affinity_propagation),
        ("AgglomerativeClustering", agglomerative),
        ("Birch", birch),
        ("OPTICS", optics),
    ]
    for name, cluster_fn in methods:
        _run_best_effort(name, z, cluster_fn)
def class3_output(hz_fft3, num_fft3):
    """Cluster (maximum frequency, peak count) pairs with OPTICS and plot them.

    The two input sequences are paired element-wise, each column is z-scored,
    and the rows are clustered with ``OPTICS(**pram3)``.  A scatter plot
    coloured by cluster label is shown and saved under
    ``D:/opticalflow/evaluation/plt/class3/``; the file name is built from the
    module-level ``videoName`` and ``algorithm``.

    :param hz_fft3: sequence of maximum frequencies, one per sample
    :param num_fft3: sequence of peak counts, same length as ``hz_fft3``
    :returns: list of OPTICS labels, one per sample (-1 marks noise)
    """
    # Combine the maximum frequencies and the peak counts into one
    # (n_samples, 2) array.
    fft = np.column_stack((hz_fft3, num_fft3))

    # Standardise every column to zero mean and unit variance (z-score); this
    # is not a rescaling to the range [-1, 1].
    vectors = np.asarray(scipy.stats.zscore(fft))

    # Cluster the standardised points.  The number of clusters is whatever
    # OPTICS finds with the parameters in ``pram3``.
    clustering = OPTICS(**pram3).fit(vectors)
    label = clustering.labels_
    print(label)

    # figure
    fig = plt.figure(figsize=(14, 10))
    ax = fig.add_subplot(1, 1, 1)
    clist = ['gray', 'blue', 'orange', 'green', 'red', 'purple', 'brown', 'yellow']

    # Noise (-1) is drawn gray; cluster ids cycle through the remaining
    # colours, so more than seven clusters no longer raise an IndexError.
    noise_color, cluster_colors = clist[0], clist[1:]
    colors = [
        noise_color if lab < 0 else cluster_colors[lab % len(cluster_colors)]
        for lab in label
    ]
    ax.scatter(vectors[:, 0], vectors[:, 1], c=colors, s=36)

    ax.set_xlabel('vector in x', fontsize=36)
    ax.set_ylabel('vector in y', fontsize=36)
    fig.show()
    fig.savefig('D:/opticalflow/evaluation/plt/class3/' + videoName[:-4] + '_' + algorithm + '_figure.png')
    return label.tolist()
test = test.apply(lambda row: np.radians(row)) # In[112]: #Min samples is the amount of 'core' points needed to be recognized as a cluster #core points are basically points in the 'middle' (or just semi-central) of a high density zone #Metric is the way the algorithm will measure distance. Since we are working in a 'sphere' we will use haversine #(although Earth is not REALLY a sphere, but the error is acceptable, about 0.3% #Max_exps is the maximun distance the algorithm will search for neighbors for a cluster #since the haversine function returns very small values (above the distance was 0.04, which converted is about 268 km), #the value in this function is also really small. With 0.01 a many points that weren't noise were considered as such, #with 0.08 it takes WAY to long to run, I suggest to lower it to 0.05 and see it the results are good enough clust = OPTICS(min_samples = 3, metric = haversine_distance, min_cluster_size= 3, max_eps = 0.08) # In[ ]: start_time = time.time()#Line to measure the time it takes to run the algorithm clust.fit(test)#the process of clustering itself print("--- %s seconds ---" % (time.time() - start_time))#Also to measure time # In[96]: space = np.arange(test.shape[0])#array with numbers from 0 to the number of cbgs labels = np.asarray(clust.labels_)# cluster to which the data point belongs (has the same length and index that the main dataset)
delimiter=",")
# Save the train labels followed by the test labels as text, one per line.
np.savetxt(results_folderName + labels_fileName,
           np.concatenate([labels_train, labels_test], axis=0),
           delimiter=",", fmt='%s')
# NOTE(review): this fragment was recovered from whitespace-flattened source;
# both plot calls are assumed to belong to the ``if`` — confirm.
if plotFeatures:
    plot_features(X_train[3, :], nTokens)
    plot_features_compare(X_train, y_train, nTokens)
# remove outliers from train data:
# OPTICS is run separately on the samples of each class; samples it labels -1
# (noise) are dropped, and the survivors of all classes are concatenated.
X_train_inliers = []
y_train_inliers = []
for label in labels_to_class_dict.values():
    X = X_train[y_train == label, :]
    y = y_train[y_train == label]
    clusters = OPTICS(min_samples=min_samples).fit_predict(X)
    X_train_inliers.append(X[clusters != -1, :])
    y_train_inliers.append(y[clusters != -1])
X_train = np.concatenate(X_train_inliers, axis=0)
y_train = np.concatenate(y_train_inliers)
# scaling:
# The scaler is fitted on the filtered training data only and then applied to
# both the training and the test data.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# select good features:
# Keep a fraction ``nFeatures_factor`` of the columns, chosen by ReliefF.
n_features = int(X_train_scaled.shape[1] * nFeatures_factor)
n_neighbors = 10
r = ReliefF(n_features_to_select=n_features, n_neighbors=n_neighbors)
r.fit(X_train_scaled, y_train)
# Each model below is fitted on the same matrix X and its labels are stored in
# their own ``Cluster_*`` column of df.
df['Cluster_GM'] = gm.fit_predict(X)

''' birch '''
from sklearn.cluster import Birch
bh = Birch(threshold=0.01, branching_factor=100, n_clusters=9)
# fit model and predict clusters
# NOTE(review): only the Birch labels are converted to strings; the other
# columns keep the integer labels — confirm this is intended.
df['Cluster_BH'] = bh.fit_predict(X).astype(str)

''' dbscan '''
from sklearn.cluster import DBSCAN
# define the model
db = DBSCAN(eps=0.3, min_samples=100)
# fit model and predict clusters
df['Cluster_DB'] = db.fit_predict(X)

''' optics '''
from sklearn.cluster import OPTICS
# define the model
op = OPTICS(xi=0.05, min_samples=10, min_cluster_size=0.1)
# fit model and predict clusters
df['Cluster_OP'] = op.fit_predict(X)

''' spectral '''
from sklearn.cluster import SpectralClustering
# define the model
sp = SpectralClustering(n_clusters=9)
# fit model and predict clusters
df['Cluster_SP'] = sp.fit_predict(X)
# Relative size of each spectral cluster; the value is only displayed when
# this runs as the last expression of a notebook cell.
df['Cluster_SP'].value_counts(normalize=True)

''' TSNE '''
from sklearn.manifold import TSNE
tsne = TSNE( n_components=2, perplexity=100,
# NOTE(review): this fragment was recovered from whitespace-flattened source.
# The nesting below (tick handling inside the domain check, savefig after it)
# is a reconstruction — confirm against the original file.
if P["domain name"] == 'bickley_jet_domain':
    ax.tick_params(labelsize=8)
    # Only the first panel (i == 0) keeps its y tick labels.
    if i==0:
        plt.yticks(np.arange(-2000,4000,2000), np.arange(-2,4,2))
    else:
        plt.yticks([])
    # Tick positions are relabelled with values divided by 1000.
    plt.xticks(np.arange(0,25000,5000), np.arange(0,25,5))
f.savefig(P["filename"] + "_Kmeans", dpi=300)

""" OPTICS """
if P["OPTICS"]:
    # Fit OPTICS once on the embedding and keep the reachability structure, so
    # that several labelings can be extracted from it without refitting.
    optics_clustering = OPTICS(min_samples=P["MinPts"], metric="euclidean").fit(X_embedding)
    reachability = optics_clustering.reachability_
    core_distances = optics_clustering.core_distances_
    ordering = optics_clustering.ordering_
    predecessor = optics_clustering.predecessor_
    labels = []
    # Each entry of P["optics_params"] is (method, value): for "xi" the value
    # is passed as ``xi``, otherwise as the DBSCAN-style ``eps``.
    for op in P["optics_params"]:
        m, c = op[0], op[1]
        if m == "xi":
            l, _ = cluster_optics_xi(reachability, predecessor, ordering, P["MinPts"], xi=c)
        else:
            l = cluster_optics_dbscan(reachability=reachability, core_distances=core_distances, ordering=ordering, eps=c)
def do_clustering(target_csv, cluster_method):
    """Cluster the rows of a CSV file with one of eight algorithms.

    Reads ``<CONFIG.CSV_PATH>/<target_csv>.csv`` (first column as index), fits
    the selected estimator, prints its parameters, labels, timing and three
    internal validity scores, and writes the labels to
    ``<CONFIG.CSV_PATH>/clustered_<method>_<target_csv>.csv``.

    :param target_csv: CSV file name without the ``.csv`` extension
    :param cluster_method: 0 DBSCAN, 1 OPTICS, 2 agglomerative (default
        linkage), 3 agglomerative complete/cosine, 4 agglomerative
        single/cosine, 5 Birch, 6 KMeans, 7 spectral
    :raises ValueError: if ``cluster_method`` is not one of 0..7
    """
    num_cluster = 24

    # cluster_method -> (output file prefix, factory for the unfitted
    # estimator).  Factories are lazy so only the selected one is built.
    methods = {
        0: ('clustered_dbscan_',
            lambda: DBSCAN(eps=0.3, min_samples=1000)),
        1: ('clustered_optics_',
            lambda: OPTICS(min_samples=1000, metric='cosine')),
        2: ('clustered_ward_',
            lambda: AgglomerativeClustering(n_clusters=num_cluster)),
        3: ('clustered_agglo_complete_',
            lambda: AgglomerativeClustering(affinity='cosine', linkage='complete',
                                            n_clusters=num_cluster)),
        4: ('clustered_agglo_single_',
            lambda: AgglomerativeClustering(affinity='cosine', linkage='single',
                                            n_clusters=num_cluster)),
        5: ('clustered_birch_',
            lambda: Birch(n_clusters=num_cluster)),
        6: ('clustered_kmeans_',
            lambda: KMeans(n_clusters=num_cluster)),
        7: ('clustered_spectral_',
            lambda: SpectralClustering(n_clusters=num_cluster, random_state=42,
                                       assign_labels='discretize')),
    }
    # Fail fast: previously an unknown method fell through every branch and
    # crashed later with an unbound ``clustering`` after the CSV was loaded.
    if cluster_method not in methods:
        raise ValueError(
            "unknown cluster_method %r; expected one of %s"
            % (cluster_method, sorted(methods)))
    prefix, make_estimator = methods[cluster_method]

    df_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, target_csv + '.csv'),
                          index_col=0, header=0, encoding='utf-8-sig')
    df_data.index.name = 'short_code'
    print(df_data.iloc[:100])
    print(df_data.shape)

    start_time = time.time()
    clustering = make_estimator()
    clustering.fit(df_data)
    csv_name = prefix + target_csv + '.csv'
    print("time elapsed for clustering: " + str(time.time() - start_time))

    print(clustering.get_params())
    print(clustering.labels_)
    count_percentage(clustering.labels_)
    result_df = pd.DataFrame(data=clustering.labels_, index=df_data.index, columns=['cluster'])

    start_time = time.time()
    cluster_labels = result_df['cluster'].squeeze()
    print("calinski_harabasz_score: ", calinski_harabasz_score(df_data, cluster_labels))
    print("silhouette_score: ", silhouette_score(df_data, cluster_labels))
    print("davies_bouldin_score: ", davies_bouldin_score(df_data, cluster_labels))
    print("time elapsed for scoring: " + str(time.time() - start_time))

    result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name), encoding='utf-8-sig')
# Command-line values are taken from fixed positions 1, 3 and 5 of the
# argument list (after the script name); positions 0, 2 and 4 are skipped.
# NOTE(review): presumably those are flag names such as "-f path -e eps -m n"
# — confirm against the caller.
arguments = sys.argv[1:]
filePath = arguments[1]
eps = float(arguments[3])
minNumSamples = int(arguments[5])

# Read the point cloud from a CSV file, skipping its header row.
inFile = np.genfromtxt(filePath, delimiter=',', skip_header=1)

# nonGround_coords: columns 1..3 of every row are used as the coordinates.
X = inFile[:, 1:4]

startTime = time.time()
print("Running OPTICS. \n")
# NOTE(review): in current scikit-learn releases ``eps`` is documented as used
# only with cluster_method='dbscan', and the search radius is ``max_eps``;
# with the default cluster method this ``eps`` may have no effect — verify
# against the installed version.
testtree = OPTICS(eps=eps, min_samples=minNumSamples).fit(X)

"""# | eps : float, optional given the label -1. | The maximum distance between two samples for them to be considered | as in the same neighborhood. This is also the largest object size | expected within the dataset. Lower eps values can be used after | OPTICS is run the first time, with fast returns of labels. Default | value of "np.inf" will identify clusters across all scales; reducing | eps will result in shorter run times. | min_samples : int, optional | The number of samples in a neighborhood for a point to be considered | as a core point. """

# Report the wall-clock duration of the fit.
timeElapsed = time.time() - startTime
print(
    "OPTICS.fit(eps = {0}, min_samples = {1}) time elapsed: ".format(
        eps, minNumSamples), timeElapsed, "\n")
def __init__(self, **params):
    """Initialise the wrapper and build its OPTICS estimator.

    :param params: keyword arguments forwarded unchanged to ``OPTICS``
    """
    # The parent initialiser receives the fixed name 'optics'.
    super().__init__('optics')
    self.model = OPTICS(**params)
def execute_event_detection_procedure(task_id: int, task_name: str, min_x, min_y, max_x, max_y,
                                      look_back_hours: int, lang_code, min_cluster_size=10,
                                      st_clustering_max_eps=0.2, text_clustering_max_eps=0.4,
                                      verbose=True):
    """Detect tweet events inside a bounding box and time window and store them.

    Steps, as announced on stdout: read tweets, convert them to a
    GeoDataFrame, vectorise the text, cluster by text (OPTICS on cosine
    distances), identify topics, cluster every text cluster by location
    (OPTICS on Euclidean distances of x/y), link the clusters to the ones
    already stored, and save them.

    Relies on the module-level objects ``postgres_tweets``,
    ``postgres_events``, ``vectorizer``, ``languages``, ``exec_number`` and
    ``min_linking_ratio``.  ``exec_number`` is incremented on every call and
    old tweets are purged on every 100th call.

    :param task_id: identifier stored with each detected cluster
    :param task_name: name stored with each detected cluster
    :param min_x: lower x bound of the query extent
    :param min_y: lower y bound of the query extent
    :param max_x: upper x bound of the query extent
    :param max_y: upper y bound of the query extent
    :param look_back_hours: length of the time window ending now, in hours
    :param lang_code: language of the tweets; must be in ``languages``
    :param min_cluster_size: OPTICS ``min_cluster_size`` for both levels; the
        run stops unless more tweets than this were read
    :param st_clustering_max_eps: OPTICS ``max_eps`` for the location level
    :param text_clustering_max_eps: OPTICS ``max_eps`` for the text level
    :param verbose: print counts and timings
    :returns: None
    """
    global postgres_tweets, postgres_events, vectorizer, languages, exec_number
    exec_number = exec_number + 1

    end_date = datetime.now()
    start_date = end_date - timedelta(hours=int(look_back_hours))

    print("*"*60)
    print("*"*60)
    print(
        F"Process: {task_name} ({task_id}), Language: {lang_code}, Interval: {start_date} to {end_date}")
    print(F"Execution number: {exec_number}")

    if exec_number % 100 == 0:
        # Housekeeping is best effort; a failure must not stop the detection.
        try:
            postgres_tweets.delete_old_tweets()
            print('Old tweets were deleted from the database.')
        except Exception:
            print('Unable to delete old tweets.')

    if lang_code not in languages:
        print(f"The selected language ({lang_code}) is not supported.")
        print('Processing was terminated.')
        # The return was missing: processing continued after announcing that
        # it had been terminated.
        return

    # Read data from database
    print("1. Read data from database.")
    df, num = postgres_tweets.read_data_from_postgres(
        start_date=start_date,
        end_date=end_date,
        min_x=min_x,
        min_y=min_y,
        max_x=max_x,
        max_y=max_y,
        lang=lang_code)
    if num <= 0:
        print('There was no record for processing.')
        print('Processing was terminated.')
        return
    if num <= min_cluster_size:
        print('There was no enough record for processing.')
        print('Processing was terminated.')
        return
    if verbose:
        print(F"Number of retrieved tweets: {num}")

    # convert to geodataframe
    print("2. convert to GeoDataFrame")
    gdf = add_geometry(df, crs=get_wgs84_crs())

    # get location vectors (column vectors so they can be concatenated later)
    print("3. Tweet info")
    x = np.asarray(gdf.geometry.x)[:, np.newaxis]
    y = np.asarray(gdf.geometry.y)[:, np.newaxis]
    date_time = gdf.created_at.dt.to_pydatetime()
    # get tweet_id and user_id
    tweet_id = gdf.id.values
    user_id = gdf.user_id.values

    # Vectorize text
    print("4. Get text vector")
    clean_text = df.c.values
    text = df.text.values
    text_vect = vectorizer.vectorize(clean_text, lang_code)

    # Text-based clustering
    print("5. Clustering - First-level: Text-based")
    start_time = time()
    optics_ = OPTICS(
        min_cluster_size=min_cluster_size,
        max_eps=text_clustering_max_eps,
        metric='precomputed')
    text_dist = np.absolute(cosine_distances(text_vect))
    optics_.fit(text_dist)
    time_taken = time() - start_time
    txt_clust_labels = optics_.labels_
    txt_clust_label_codes = np.unique(txt_clust_labels)
    # Noise (-1) is excluded here, so this is already the cluster count; the
    # verbose message used to subtract one more and under-reported it.
    num_of_clusters = len(txt_clust_label_codes[txt_clust_label_codes >= 0])
    if verbose:
        print(F'\tNumber of text based clusters: {num_of_clusters}')
        print(F"\tTime: {math.ceil(time_taken)} seconds")
    if num_of_clusters <= 0:
        print("No first level cluster was detected.")
        print('Processing was terminated.')
        return

    # topic identification
    print("6. Identify topics")
    # TODO: We need to specify the maximum number of tweets enter into the clustering procedures
    identTopic = HDPTopicIdentification()
    identTopic.identify_topics(txt_clust_labels, clean_text)
    if verbose:
        identTopic.print_cluster_topics('\t')
    topics = identTopic.get_cluster_topics()

    clusters = []
    print("\n7. Clustering - Second-level: Spatiotemporal")
    for label in txt_clust_label_codes:
        if label < 0:
            continue
        start_time = time()
        optics_ = OPTICS(
            min_cluster_size=min_cluster_size,
            max_eps=st_clustering_max_eps,
            metric='precomputed')
        in_text_cluster = txt_clust_labels == label
        _x = x[in_text_cluster]
        _y = y[in_text_cluster]
        _tweet_id = tweet_id[in_text_cluster]
        _user_id = user_id[in_text_cluster]
        _text = text[in_text_cluster]
        _date_time = date_time[in_text_cluster]

        # Only the coordinates enter the distance; time is not part of it.
        st_vect = np.concatenate((_x, _y), axis=1)
        st_dist = euclidean_distances(st_vect)
        optics_.fit(st_dist)
        time_taken = time() - start_time
        st_clust_labels = optics_.labels_
        st_clust_label_codes = np.unique(st_clust_labels)

        for l in st_clust_label_codes[st_clust_label_codes >= 0]:
            topic = topics[label][3]
            topic_words = topics[label][4]
            in_st_cluster = st_clust_labels == l
            points_text = _text[in_st_cluster].tolist()
            points_x = _x[in_st_cluster]
            points_y = _y[in_st_cluster]
            points_tweet_id = _tweet_id[in_st_cluster]
            points_user_id = _user_id[in_st_cluster]
            points_date_time = _date_time[in_st_cluster].tolist()

            # A cluster fed by a single user is not kept.
            if len(np.unique(points_user_id)) > 1:
                clusters.append({
                    'id': None,
                    'task_id': task_id,
                    'task_name': task_name,
                    'topic': topic,
                    'topic_words': topic_words,
                    'latitude_min': np.min(points_y),
                    'latitude_max': np.max(points_y),
                    'longitude_min': np.min(points_x),
                    'longitude_max': np.max(points_x),
                    'date_time_min': min(points_date_time),
                    'date_time_max': max(points_date_time),
                    'points': [{'cluster_id': None,
                                'longitude': xx.item(),
                                'latitude': yy.item(),
                                'text': tt,
                                'date_time': dd,
                                'tweet_id': ti.item(),
                                'user_id': ui.item()}
                               for xx, yy, tt, dd, ti, ui
                               in zip(points_x, points_y, points_text,
                                      points_date_time, points_tweet_id,
                                      points_user_id)]
                })
    if verbose:
        print(F'\tNumber of spatial clusters: {len(clusters)}')
        print(F"\tTime: {math.ceil(time_taken)} seconds")

    print("8. Link clusters")
    num_new_cluster = 0
    num_updated_cluster = 0
    for cluster in clusters:
        # 8.1 Select cluster that coincide with the current time interval and extent
        db_clusters = postgres_events.get_clusters(
            cluster['latitude_min'], cluster['latitude_max'],
            cluster['longitude_min'], cluster['longitude_max'],
            cluster['date_time_min'], cluster['date_time_max']
        )

        # 8.2 / 8.3 Compare the points of the stored clusters with the new
        # cluster: ratio = shared tweet ids / all tweet ids of both.
        cluster_point_tweet_ids = np.array(
            [point['tweet_id'] for point in cluster['points']])
        coverage_id = []
        coverage_ratio = []
        for db_cluster in db_clusters:
            db_cluster_point_tweet_ids = np.array(
                postgres_events.get_cluster_point_tweet_ids(db_cluster['id']))
            numerator = len(np.intersect1d(
                db_cluster_point_tweet_ids, cluster_point_tweet_ids))
            denominator = len(np.union1d(
                db_cluster_point_tweet_ids, cluster_point_tweet_ids))
            if denominator > 0:
                coverage_id.append(db_cluster['id'])
                coverage_ratio.append(numerator / denominator)

        # 8.4 Link the clusters with higher cluster relation strength
        if len(coverage_id) > 0 and max(coverage_ratio) >= min_linking_ratio:
            best_ratio = max(coverage_ratio)
            coverage_ratio = np.array(coverage_ratio)
            coverage_id = np.array(coverage_id)
            # Ties are resolved in favour of the largest stored id.
            cluster['id'] = np.max(
                coverage_id[coverage_ratio == best_ratio]).item()

            db_cluster = None
            for db_clust in db_clusters:
                if db_clust['id'] == cluster['id']:
                    db_cluster = db_clust
            if db_cluster is not None:
                # Grow the extent and interval to cover the stored cluster too.
                for bound, pick in (('latitude_min', min), ('latitude_max', max),
                                    ('longitude_min', min), ('longitude_max', max),
                                    ('date_time_min', min), ('date_time_max', max)):
                    cluster[bound] = pick(cluster[bound], db_cluster[bound])
            for point in cluster['points']:
                point['cluster_id'] = cluster['id']
            num_updated_cluster += 1
        else:
            num_new_cluster += 1

    print(
        f'\t # updated clusters: {num_updated_cluster}, # new clusters: {num_new_cluster}')

    print("9. Save clusters")
    postgres_events.insert_clusters(clusters)

    print(F"Process {task_name} ({task_id}) finished.")
    print('*'*60)
    print("*"*60)
# Replace the categorical column ``a`` by its integer category codes.
df[a] = df[a].cat.codes

# Scale every column to [0, 1], then fill remaining gaps with the column mean.
features = df.columns
scaler = MinMaxScaler().fit(df[features])
scaled_df = pd.DataFrame(scaler.transform(df[features]))
scaled_df.columns = features
scaled_df.fillna(scaled_df.mean(), inplace=True)

# One subplot per min_samples value, laid out on a 2 x 2 grid.
fig = plt.figure(figsize=(7, 7))
index = 1
for s in [50, 60, 70, 80]:
    est = OPTICS(min_samples=s)
    est.fit(scaled_df)
    df['labels'] = est.labels_
    # Number of distinct labels; this includes the noise label if present.
    num_clusters = len(df['labels'].unique())
    sp = fig.add_subplot(2, 2, index)
    sp.set_xlabel('Ball Control')
    sp.set_ylabel('Interceptions')
    # Printed: cluster count, min_samples and silhouette coefficient.
    print("Broj klastera: %d" % num_clusters)
    print("Samples: %d " % s)
    print("Senka koeficijent: %f " % silhouette_score(scaled_df, est.labels_))
    # Label -1 is treated as noise.
    for j in range(-1, num_clusters):
        if j == -1:
            label = 'noise'
def get_clusters(self, st_arr):
    """Build the clusterer selected by ``self.name``, fit it and return labels.

    ``self.name`` picks one of DBSCAN, HDBSCAN, AFFINITYPROPAGATION, OPTICS,
    AGGLOMERATIVE, DENCLUE, BIRCH, MEANSHIFT, KMEANS or SPECTRAL; the other
    ``self.*`` attributes read below supply its parameters.

    :param st_arr: 2-D array; columns 0 and 1 are read as latitude and
        longitude by the branches that build a haversine distance matrix
    :returns: ``(clusterer.labels_, _labels_true, cluster_centers)`` on
        success.  On any exception the error and traceback are printed and the
        function falls off the end, returning ``None``.
    """
    import traceback
    import numpy as np
    from sklearn.cluster import DBSCAN, AffinityPropagation, OPTICS, MeanShift, AgglomerativeClustering, Birch
    from sklearn.cluster import KMeans, SpectralClustering
    from sklearn.neighbors import NearestNeighbors
    import hdbscan, pyamg
    # import sklearn.utils
    from sklearn.preprocessing import StandardScaler
    from sklearn.datasets import make_blobs
    from sklearn.metrics.pairwise import haversine_distances
    import sys
    # Makes the local packages imported on the next line resolvable.
    sys.path.insert(1, '../lib')
    import denclue, GDT.api, GDT.plot_tools

    try:
        if self.name == 'DBSCAN':
            clusterer = DBSCAN(eps=self.epsilon,
                               min_samples=self.minimum_samples,
                               algorithm=self.algorithm,
                               metric=self.metric)

        elif self.name == 'HDBSCAN':
            # Note: min_cluster_size is fed from minimum_samples, not from
            # minimum_cluster_size (the latter is commented out).
            clusterer = hdbscan.HDBSCAN(
                min_samples=self.minimum_samples,
                # min_cluster_size=self.minimum_cluster_size,
                min_cluster_size=self.minimum_samples,
                cluster_selection_epsilon=self.epsilon,
                metric=self.metric,
                cluster_selection_method=self.cluster_method,
                gen_min_span_tree=True,
                prediction_data=True)

        elif self.name == 'AFFINITYPROPAGATION':
            if self.metric in ['haversine', 'precomputed']:
                # Replace the coordinates by their pairwise haversine
                # distance matrix.
                lat = np.array(st_arr[:, 0])
                lon = np.array(st_arr[:, 1])
                st_coords = np.column_stack((lat, lon))
                st_arr = haversine_distances(np.radians(st_coords),
                                             np.radians(st_coords))
                # NOTE(review): ``affinity`` receives self.metric, which may
                # be 'haversine' here — confirm AffinityPropagation accepts it.
                clusterer = AffinityPropagation(
                    affinity=self.metric,
                    damping=0.5,
                    max_iter=self.maximum_iterations,
                    convergence_iter=15,
                    preference=None,
                    random_state=self.random_state,
                    #affinity='precomputed',
                )
            elif self.metric in ['euclidean']:
                clusterer = AffinityPropagation(
                    affinity=self.metric,
                    damping=0.5,
                    max_iter=self.maximum_iterations,
                    convergence_iter=15,
                    preference=None,
                    random_state=self.random_state,
                )
            else:
                raise ValueError(
                    'Invalid metric %s . Must be euclidean or havesine'
                    % self.metric)

        elif self.name == 'OPTICS':
            # epsilon is used for both max_eps and eps; min_cluster_size is
            # fed from minimum_samples.
            clusterer = OPTICS(
                min_samples=self.minimum_samples,
                # min_cluster_size=self.minimum_cluster_size,
                min_cluster_size=self.minimum_samples,
                max_eps=self.epsilon,
                eps=self.epsilon,
                metric=self.metric,
                cluster_method=self.cluster_method,
                algorithm=self.algorithm)

        elif self.name == 'AGGLOMERATIVE':
            clusterer = AgglomerativeClustering(
                distance_threshold=self.epsilon,
                affinity=self.metric,
                linkage='average',
                n_clusters=None)
            if self.metric in ['haversine', 'precomputed']:
                lat = np.array(st_arr[:, 0])
                lon = np.array(st_arr[:, 1])
                st_coords = np.column_stack((lat, lon))
                st_arr = haversine_distances(np.radians(st_coords),
                                             np.radians(st_coords))
                # clusterer = AgglomerativeClustering(distance_threshold=self.epsilon,
                #                                     affinity=self.metric,
                #                                     n_clusters=None)
            elif self.metric in ['euclidean']:
                pass
            else:
                raise ValueError(
                    'Invalid metric %s . Must be euclidean, havesine, or precomputed'
                    % self.metric)

        elif self.name == 'DENCLUE':
            clusterer = denclue.DENCLUE(
                h=None,
                eps=self.epsilon,
                min_density=self.minimum_cluster_size,
                metric=self.metric)
            if self.fit_predict:
                print(
                    'WARNING DENCLUE does does not have a fit_predict function. Switching to fit'
                )
                # Persists on the instance, so later calls also use fit().
                self.fit_predict = False

        elif self.name == 'BIRCH':
            clusterer = Birch(n_clusters=None, threshold=self.epsilon)

        elif self.name == 'MEANSHIFT':
            clusterer = MeanShift()

        elif self.name == 'KMEANS':
            # NOTE(review): scaled_features is computed but not used below;
            # the fit further down still receives st_arr.
            scaler = StandardScaler()
            scaled_features = scaler.fit_transform(st_arr)
            ''' init="random" or "k-means++" n_init=10 (Number of runs with different centroid seeds) max_iter=300 (Maximum number of iterations for a single run) random_state=5 (Determines random number generation for centroid initialization) '''
            clusterer = KMeans(
                init='k-means++',
                n_clusters=self.n_clusters,  # default=8
                n_init=self.centroid_init,
                max_iter=self.maximum_iterations,  # default=300
                random_state=self.random_state)  # default=5

        elif self.name == 'SPECTRAL':
            clusterer = SpectralClustering(
                assign_labels=self.algorithm,  # {'kmeans', 'discretize'}, default='kmeans'
                random_state=self.random_state,  # default: 0
                n_clusters=self.n_clusters,  # default=8
                # {'nearest_neighbors','rbf','precomputed','precomputed_nearest_neighbors'}
                affinity=self.metric,
                n_neighbors=self.minimum_samples,  # Number of neighbors to use; default=10
                eigen_solver=self.cluster_method  # {'arpack', 'lobpcg', 'amg'}
            )
            if self.metric in [
                    'precomputed', 'precomputed_nearest_neighbors'
            ]:
                lat = np.array(st_arr[:, 0])
                lon = np.array(st_arr[:, 1])
                st_coords = np.column_stack((lat, lon))
                st_arr = haversine_distances(np.radians(st_coords),
                                             np.radians(st_coords))

        #d elif self.name == 'NEARESTNEIGHBORS':
        #d     clusterer = NearestNeighbors(n_neighbors=self.n_clusters,
        #d                                  metric=self.metric,
        #d                                  weights='distance',
        #d                                  algorithm=self.algorithm,)
        else:
            # No clusterer is created for an unknown name, so the fit below
            # fails and is reported by the except clause.
            print(
                "Class cluster_data [get_clusters] something was not right"
            )

        # Only the second return value of make_blobs is used (returned as
        # ``_labels_true``); X is discarded.
        X, _labels_true = make_blobs(n_samples=len(st_arr),
                                     centers=st_arr,
                                     cluster_std=self.cluster_std,
                                     random_state=self.random_state)
        # NOTE(review): np.radians is applied to st_arr in every case, even
        # when a branch above replaced it by a haversine distance matrix —
        # confirm this is intended.
        if self.fit_predict:
            clusterer.fit_predict(np.radians(st_arr))
        else:
            clusterer.fit(np.radians(st_arr))
        # _core_samples_mask = np.zeros_like(clusterer.labels_, dtype=bool)
        # _core_samples_mask[clusterer.core_sample_indices_] = True
        # print(clusterer)
        cluster_centers = self.get_cluster_centers(self.name, clusterer)
        return clusterer.labels_, _labels_true, cluster_centers  #, _core_samples_mask

    except Exception as err:
        print("Class cluster_data [get_clusters] Error message:", err)
        print(traceback.format_exc())
def detect(file_path, space, deleted_features):
    """
    Detect outliers

    Loads the data set, drops ``deleted_features``, cleans it and runs the
    detector named by ``space['model']`` with ``space['params']``.

    :param file_path: location handed to ``DataLoader.load``
    :param space: dict with the keys ``'model'`` and ``'params'``
    :param deleted_features: column names to drop before detection
    :returns: numpy array with 1 for outliers and 0 for inliers (for
        ``"zscore"`` whatever ``ZScore.fit_predict`` returns); all zeros if
        the detector raised; ``None`` if the model name is not recognised
    """
    def flag_negative(raw_labels):
        # Clustering models: any negative label is an outlier.
        return [1 if value < 0 else 0 for value in raw_labels]

    def flag_minus_one(raw_labels):
        # Outlier estimators: exactly -1 marks an outlier.
        return [1 if value == -1 else 0 for value in raw_labels]

    start_time = time.time()
    print("==================================================")
    print("Outlier detection and treatment started ...")
    print("Space:", space)

    X = DataLoader.load(file_path)
    # X = pd.read_csv(file_path)
    if len(deleted_features):
        X = X.drop(deleted_features, axis=1, inplace=False)

    # Basic data cleaning
    X = data_cleaning_formatting(X)

    y_predicted = None
    params = space['params']
    error = {}
    try:
        model_name = space['model']
        if model_name == "DBSCAN":
            y_predicted = flag_negative(DBSCAN(**params).fit_predict(X))
        elif model_name == "OPTICS":
            raw_labels = OPTICS(**params).fit_predict(X)
            print(raw_labels)
            y_predicted = flag_negative(raw_labels)
        elif model_name == "EllipticEnvelope":
            y_predicted = flag_minus_one(EllipticEnvelope(**params).fit_predict(X))
        elif model_name == "IsolationForest":
            estimator = IsolationForest(**params)
            with parallel_backend('threading'):
                raw_labels = estimator.fit_predict(X)
            y_predicted = flag_minus_one(raw_labels)
        elif model_name == "OneClassSVM":
            y_predicted = flag_minus_one(OneClassSVM(**params).fit_predict(X))
        elif model_name == "LocalOutlierFactor":
            estimator = LocalOutlierFactor(**params)
            with parallel_backend('threading'):
                raw_labels = estimator.fit_predict(X)
            y_predicted = flag_minus_one(raw_labels)
        elif model_name == "zscore":
            y_predicted = ZScore(threshold=params['threshold']).fit_predict(X)
    except Exception as e:
        # A failing detector degrades to "no outliers" instead of aborting.
        print("Error:", e)
        y_predicted = [0] * X.shape[0]
        error['detect_' + str(space)] = e

    if isinstance(y_predicted, list):
        y_predicted = np.array(y_predicted)

    time_taken = time.time() - start_time
    print("Time taken:", time_taken)
    return y_predicted
import numpy as np # Generate sample data np.random.seed(0) n_points_per_cluster = 250 C1 = [-5, -2] + 0.8 * np.random.randn(n_points_per_cluster, 2) C2 = [4, -1] + 0.1 * np.random.randn(n_points_per_cluster, 2) C3 = [1, -2] + 0.2 * np.random.randn(n_points_per_cluster, 2) C4 = [-2, 3] + 0.3 * np.random.randn(n_points_per_cluster, 2) C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2) C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2) X = np.vstack((C1, C2, C3, C4, C5, C6)) clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05) # Run the fit clust.fit(X) labels_050 = cluster_optics_dbscan( reachability=clust.reachability_, core_distances=clust.core_distances_, ordering=clust.ordering_, eps=0.5, ) labels_200 = cluster_optics_dbscan( reachability=clust.reachability_, core_distances=clust.core_distances_, ordering=clust.ordering_, eps=2,
def update_chart(self):
    """Re-run OPTICS with the current inputs and redraw both charts.

    Does nothing while no results are available.  Otherwise the left axes
    show the clustered data along the two selected dimensions and the right
    axes show every component distribution coloured by cluster, with the mean
    of each cluster drawn as a dashed black line.
    """
    if self.last_results is None:
        return

    x = self.transfer(self.last_results[0].classes_φ)
    self.save_typical_button.setEnabled(True)

    optics = OPTICS(min_samples=self.min_samples_input.value(),
                    min_cluster_size=self.min_cluster_size_input.value(),
                    xi=self.xi_input.value())
    flags = optics.fit_predict(self.data_to_clustering)
    cmap = plt.get_cmap()

    def color_of(flag):
        # Unclustered samples (-1) get a fixed gray, clusters a colormap entry.
        return "#7a7374" if flag == -1 else cmap(flag)

    # --- scatter of the clustered samples -------------------------------
    scatter_axes = self.clustering_axes
    scatter_axes.clear()
    flag_set = set(flags)
    for flag in flag_set:
        members = self.data_to_clustering[np.equal(flags, flag)]
        if flag == -1:
            legend_text = self.tr("Not clustered")
        else:
            legend_text = self.tr("EM{0}").format(flag + 1)
        scatter_axes.plot(members[:, self.x_axis_combo_box.currentIndex()],
                          members[:, self.y_axis_combo_box.currentIndex()],
                          c="#ffffff00", marker=".", ms=8,
                          mfc=color_of(flag), mew=0.0,
                          zorder=flag, label=legend_text)
    # The legend is only drawn while it stays small.
    if len(flag_set) < 6:
        scatter_axes.legend(loc="upper left")
    scatter_axes.set_xlabel(self.x_axis_combo_box.currentText())
    scatter_axes.set_ylabel(self.y_axis_combo_box.currentText())
    scatter_axes.set_title(self.tr("Clustering of end-members"))

    # --- component distributions ----------------------------------------
    curve_axes = self.component_axes
    curve_axes.clear()
    if self.xlog:
        curve_axes.set_xscale("log")
    for flag in flag_set:
        line_color = color_of(flag)
        members = self.stacked_components[np.equal(flags, flag)]
        for distribution in members:
            curve_axes.plot(x, distribution, c=line_color, zorder=flag)
        if flag != -1:
            # Mean distribution of the cluster, drawn on top of everything.
            curve_axes.plot(x, np.mean(members, axis=0), c="black",
                            zorder=1e10, ls="--", linewidth=1)
    curve_axes.set_title(self.tr("Typical end-members"))
    curve_axes.set_xlabel(self.xlabel)
    curve_axes.set_ylabel(self.ylabel)
    curve_axes.set_xlim(x[0], x[-1])
    curve_axes.set_ylim(0, None)

    self.figure.tight_layout()
    self.canvas.draw()
# Drop the rows K-Means marked -1 and separate the features from the KM column.
clean_data = clean_data[clean_data.KM != -1]
data_no_km = clean_data.loc[:, clean_data.columns != 'KM']

# Plotting data without the noise: project to two principal components and
# colour by K-Means cluster.
pca = PCA(n_components=2, random_state=1).fit_transform(data_no_km)
sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=clean_data['KM'], palette='deep')
plt.show()

# Let us now attempt to apply the OPTICS algorithm to the cleaned dataset
# without noise.
# First let us remove the K-means clusters from the dataset
# clean_data.drop(['KM'], axis=1, inplace=True)
# min_samples=82 is set because the data has 41 dimensions after feature
# selection.
# eps=18.498287329980066 was determined from a k-neighbours plot and is passed
# in to increase speed.
# NOTE(review): scikit-learn documents ``eps`` as used only when
# cluster_method='dbscan'; with the default cluster method this value may have
# no effect (``max_eps`` bounds the neighbourhood search) — verify.
opt = OPTICS(min_samples=82, eps=18.498287329980066, n_jobs=-1).fit_predict(data_no_km)
clean_data['OPT'] = opt

# Author's finding: OPTICS rejected most of the data (100000 rows) as noise and
# split the remaining 30000 into many small groups of 84 to 1626 members, as
# the counts printed below show.
print(clean_data['OPT'].value_counts())

# Therefore we will discard OPTICS and keep K-Means.

# Interpreting clusters using a Random Forest and extracting the most
# important features, then showing graphically how the clusters were sorted
# based on those features.
#(0.5, 1.3)
rf = RandomForestClassifier(random_state=1, n_jobs=-1).fit(data_no_km, clean_data.KM)
selected_columns = list(
def save_typical(self, filename):
    """Cluster the components with OPTICS and export the result to Excel.

    The workbook written to ``filename`` holds a README sheet, a
    "Typical Components" sheet with the mean distribution of every cluster,
    a "Not Clustered" sheet for rows labelled -1 (only when such rows
    exist), and one "Cluster{n}" sheet per cluster listing its members.

    Labels are processed in ascending order, so "Component n" on the
    typical sheet corresponds to the "Cluster n" sheet whenever the labels
    are contiguous from 0. Iterating a bare ``set`` of labels, as before,
    gave no such ordering guarantee.

    :param filename: path of the Excel file to write
    :raises AssertionError: if ``self.last_results`` is ``None``
    :returns: ``None``; returns early without writing anything when
        ``self.last_results`` is empty
    """
    assert self.last_results is not None
    if len(self.last_results) == 0:
        return

    cluster = OPTICS(min_samples=self.min_samples_input.value(),
                     min_cluster_size=self.min_cluster_size_input.value(),
                     xi=self.xi_input.value())
    classes_μm = self.last_results[0].classes_μm
    flags = cluster.fit_predict(self.data_to_clustering)

    # Deterministic, ascending label order keeps component numbering and
    # sheet order in step with the cluster labels.
    ordered_flags = sorted(set(flags))
    # One boolean row mask per label, reused for the means and the sheets.
    masks = {flag: np.equal(flags, flag) for flag in ordered_flags}
    typicals = [
        np.mean(self.stacked_components[masks[flag]], axis=0)
        for flag in ordered_flags
        if flag != -1
    ]

    wb = openpyxl.Workbook()
    prepare_styles(wb)
    ws = wb.active
    ws.title = self.tr("README")
    description = \
        """
        This Excel file was generated by QGrain ({0}).

        Please cite:
        Liu, Y., Liu, X., Sun, Y., 2021. QGrain: An open-source and easy-to-use software for the comprehensive analysis of grain size distributions. Sedimentary Geology 423, 105980. https://doi.org/10.1016/j.sedgeo.2021.105980

        It contanins 2 + N_clusters sheets:
        1. The first sheet is the sum distributions of all component clusters.
        2. The second sheet is used to put the component distributions that not in any cluster.
        3. The left sheet is the component distributions of each cluster, separately.

        The clustering algorithm is OPTICS, implemented by scikit-learn.
        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html

        Clustering algorithm details
            min_samples={1}
            min_cluster_size={2}
            xi={3}
            others=default

        """.format(QGRAIN_VERSION,
                   self.min_samples_input.value(),
                   self.min_cluster_size_input.value(),
                   self.xi_input.value())

    def write(row, col, value, style="normal_light"):
        # `ws` is resolved at call time, so this always writes to whichever
        # sheet is current; rows and columns are 0-based here.
        cell = ws.cell(row + 1, col + 1, value=value)
        cell.style = style

    def write_header(corner_text):
        # Header row: a label in the first column, then one column per class.
        write(0, 0, corner_text, style="header")
        ws.column_dimensions[column_to_char(0)].width = 16
        for col, value in enumerate(classes_μm, 1):
            write(0, col, value, style="header")
            ws.column_dimensions[column_to_char(col)].width = 10

    def write_row(row, first_cell, values):
        # Alternate light/dark rows, then let Qt process pending events so
        # the UI stays responsive during long exports.
        style = "normal_dark" if row % 2 == 0 else "normal_light"
        write(row, 0, first_cell, style=style)
        for col, value in enumerate(values, 1):
            write(row, col, value, style=style)
        QCoreApplication.processEvents()

    # README sheet: one description line per row.
    for row, line in enumerate(description.split("\n")):
        write(row, 0, line, style="description")
    ws.column_dimensions[column_to_char(0)].width = 200

    # Mean distribution of every cluster.
    ws = wb.create_sheet(self.tr("Typical Components"))
    write_header(self.tr("Typical Component"))
    for row, distribution in enumerate(typicals, 1):
        write_row(row, self.tr("Component{0}").format(row), distribution)

    # Members of each label; unclustered rows go to the sheet at index 2.
    for flag in ordered_flags:
        if flag == -1:
            ws = wb.create_sheet(self.tr("Not Clustered"), 2)
        else:
            ws = wb.create_sheet(self.tr("Cluster{0}").format(flag + 1))
        write_header(self.tr("Index"))
        members = self.stacked_components[masks[flag]]
        for row, component in enumerate(members, 1):
            write_row(row, str(row), component)

    wb.save(filename)
    wb.close()
# Convert the reference coordinates to arrays and build the feature matrix
# that OPTICS is fitted on.
lon0 = np.array(lon0)
lat0 = np.array(lat0)
X = direct_embedding(lons, lats)
#%%
# Parameter record for this run. The redundant `P = {}` that used to precede
# this literal was a dead store and has been removed. P is not read again
# within this fragment.
P = {
    "MinPts": mins,
    "optics_params": [
        ["dbscan", 4000],
        ["xi", 0.002],
    ],
    "ylims": [200, 20000],
}
#%%
# Fit OPTICS once and keep the fitted arrays that are saved below.
optics_clustering = OPTICS(min_samples=mins, metric="euclidean").fit(X)
reachability = optics_clustering.reachability_
core_distances = optics_clustering.core_distances_
ordering = optics_clustering.ordering_
predecessor = optics_clustering.predecessor_
#%%
# Save the fitted arrays together with the reference coordinates.
# NOTE(review): the 'results' directory presumably has to exist already;
# nothing in this fragment creates it.
np.savez('results/OPTICS_sp%d_smin%d' % (sp, mins),
         reachability=reachability,
         core_distances=core_distances,
         ordering=ordering,
         predecessor=predecessor,
         lon=lon0,
         lat=lat0)
# Tail of a function whose header lies above this chunk (presumably the
# getDistanceByHaversine helper used below; confirm against the full file).
    km = EARTHRADIUS * c
    return km


def group_euclid(qw):
    """Return the largest distance of any row from the group's mean point.

    For every row the plain Euclidean distance between its
    (latitude, longitude) and the group's mean (latitude, longitude) is
    computed, and the maximum of those distances is returned.

    :param qw: object exposing 'latitude' and 'longitude' columns that
        support ``.mean()`` and arithmetic; used below as the per-group
        frame passed by ``groupby(...).apply``
    :returns: maximum distance, in the raw coordinate units (presumably
        degrees rather than kilometres; confirm)
    """
    return np.sqrt((qw['latitude'] - qw['latitude'].mean())**2 + (qw['longitude'] - qw['longitude'].mean())**2).max()


df1 = pd.read_csv('df_concat.csv')
# Every column except 'Industry' is passed to the distance function.
# NOTE(review): the column order that getDistanceByHaversine expects is not
# visible here; verify it against the CSV layout.
df2 = df1.drop(['Industry'], axis=1)
# Full pairwise distance matrix: pdist calls the haversine helper for every
# pair of rows and squareform expands the condensed result to a square matrix.
distance_matrix = squareform(
    pdist(df2, (lambda u, v: getDistanceByHaversine(u, v))))
# metric='precomputed' makes OPTICS treat the input as a distance matrix.
db = OPTICS(min_samples=5, metric='precomputed')
y_db = db.fit_predict(distance_matrix)
df1['cluster'] = y_db
# Per-cluster summaries. NOTE(review): scikit-learn labels noise as -1, so
# those rows presumably form a group of their own in every groupby below.
uf = df1[['Industry', 'cluster']].groupby('cluster')
unique_cluster = uf.nunique()  # number of distinct 'Industry' values per cluster
mean_cluster = df1[['longitude', 'latitude', 'cluster']].groupby('cluster').mean()
max_distance = df1[['longitude', 'latitude', 'cluster']].groupby('cluster').apply(group_euclid)
# Mean coordinates of the clusters that contain exactly three distinct
# 'Industry' values.
three_cluster = mean_cluster[unique_cluster['Industry'] == 3]