def dmdbscan_algorithm(data, folder):
    """
    Estimate the optimal DBSCAN distance with the DMDBSCAN approach.

    The distance from every sample to its closest neighbour is computed,
    the distances are sorted in ascending order, and the knee of the
    resulting curve is reported as the optimal distance.

    param:
        1. data - pandas DataFrame (10000, 82) or (10000, 3), where
            values are mean spendings of customers for every category
        2. folder - string path to save plot
    return:
        Float value of optimal distance
    """
    # Two neighbours are requested because the model is queried with its own
    # training data: the first hit is expected to be the sample itself, so
    # the second column holds the distance to the closest other sample.
    neighbours = sklearn.neighbors.NearestNeighbors(n_neighbors=2, n_jobs=-1)
    neighbours.fit(data)
    neighbour_distances, _ = neighbours.kneighbors(data)

    # Ascending distances to the closest neighbour
    distances = np.sort(neighbour_distances[:, 1])
    ranks = np.arange(len(distances))

    # The distances are passed as x, so the knee is itself a distance
    knee_loc = kneed.KneeLocator(
        distances,
        ranks,
        curve="concave",
        direction="increasing",
        online=False,
        interp_method="polynomial",
    )
    optimal_distance = knee_loc.knee

    # Plot distances and optimal distance
    plotting.line_plotting(
        [ranks, distances, optimal_distance],
        ["Distance", ""],
        "Optimal distance",
        folder,
    )
    return optimal_distance
def main(file):
    """
    Print the knee of the block-averaged deviation curve read from a CSV.

    The first row of ``file`` is treated as a header and skipped. For each
    remaining row, column 0 is read as an integer strain count and column 3
    as a float deviation. Rows are sorted by (strain count, deviation), the
    deviations are averaged in consecutive groups of four (an incomplete
    trailing group is discarded), and the knee of the resulting convex,
    decreasing curve is printed plus one.
    """
    with open(file) as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row, if there is one
        records = [(int(row[0]), float(row[3])) for row in reader]

    # Unzipping raises ValueError when the file holds no data rows.
    _, ordered_devs = zip(*sorted(records))
    ordered_devs = list(ordered_devs)

    # Mean of every complete block of four consecutive deviations
    block = 4
    devs_mean = []
    for start in range(0, len(ordered_devs) - block + 1, block):
        chunk = ordered_devs[start:start + block]
        devs_mean.append(sum(chunk) / len(chunk))

    positions = range(len(devs_mean))
    locator = kneed.KneeLocator(
        positions,
        devs_mean,
        curve='convex',
        direction='decreasing',
        interp_method='polynomial',
    )
    # Positions are zero-based, so one is added before printing.
    print(locator.knee + 1)
def fit(self, X):
    """
    Fit every candidate model on ``X`` and return the preferred one.

    Each model in ``self.models`` is fitted in order; its inertia is stored
    in ``self.sse`` and, from the second model on, its silhouette score in
    ``self.sil_coef``. The cluster count is then chosen either from the
    elbow of the SSE curve (``self.preferred == "SSE"``) or from the best
    silhouette score (any other value).

    NOTE(review): the indexing assumes ``self.models[i]`` is configured
    for ``i + 1`` clusters and that there are ``self.max_clusters - 1``
    models -- confirm against the constructor.

    param:
        X - samples passed unchanged to every model's ``fit``
    return:
        The fitted model from ``self.models`` for the chosen cluster count
    """
    # Start from empty score lists so that calling fit() more than once
    # does not append to the results of an earlier run, which would leave
    # the lists longer than the x ranges used below.
    self.sse = []
    self.sil_coef = []

    for k, model in enumerate(self.models, 1):
        model.fit(X)
        print(f"AutoKMeans: Finished cycle {k}")
        self.sse.append(model.inertia_)
        # The first model is skipped for the silhouette score; presumably
        # because it has a single cluster, for which the score is undefined.
        if k > 1:
            self.sil_coef.append(
                sk_metrics.silhouette_score(X, model.labels_))

    if self.do_plot:
        # Both curves are scaled by their maximum so they share one axis.
        _, ax = plt.subplots()
        ax.plot(range(1, self.max_clusters),
                np.array(self.sse) / max(self.sse),
                label="sse")
        ax.plot(range(2, self.max_clusters),
                np.array(self.sil_coef) / max(self.sil_coef),
                label="sil_coef")
        ax.legend()

    kl = kneed.KneeLocator(range(1, self.max_clusters),
                           self.sse,
                           curve="convex",
                           direction="decreasing")
    # sil_coef[0] belongs to the second model, hence the offset of two.
    best_sil_k = int(np.argmax(self.sil_coef)) + 2

    if self.preferred == "SSE":
        print(
            f"SSE detection finds {kl.elbow} clusters "
            f"(silhouette coefficients say {best_sil_k} clusters)"
        )
        return self.models[kl.elbow - 1]

    print(
        f"Silhouette coefficients find {best_sil_k} clusters"
        f"(SSE detection says {kl.elbow} clusters)")
    return self.models[best_sil_k - 1]
def perform_k_means(data, num_salesmen):
    """
    Cluster ``data`` with k-means, choosing k by the elbow of the SSE curve.

    KMeans is fitted for every k from 1 to ``num_salesmen`` inclusive and
    the inertia of each fit is recorded. The elbow of that curve is printed
    and used as the cluster count of a final KMeans fit. The SSE curve is
    also drawn on the current matplotlib figure.

    return:
        Cluster label of every sample, as returned by ``fit_predict``
    """
    candidate_ks = range(1, num_salesmen + 1)
    inertias = [KMeans(n_clusters=k).fit(data).inertia_ for k in candidate_ks]

    # TODO improve the estimation of the best number of K. Maybe through
    # tracking the slope of the curve, e.g. printing
    # m = 1 - ((sse[i + 1] / sse[i]) / 1) for every consecutive pair.

    locator = kneed.KneeLocator(
        candidate_ks,
        inertias,
        curve='convex',
        direction='decreasing',
        interp_method='interp1d',
    )
    best_k = locator.elbow
    print("Best Estimated K: ", best_k)

    final_model = KMeans(n_clusters=best_k)
    assignments = final_model.fit_predict(data)

    plt.xlabel('K')
    plt.ylabel('Sum of squared error')
    plt.plot(candidate_ks, inertias, 'bx-')
    return assignments
def main(args):
    """
    Pick the optimal strain count from an elbow curve and run desman with it.

    Steps:
        1. Change into ``args.output`` and make sure both
           ``dfreqstran_df.csv`` and ``dfreqssel_var.csv`` exist there.
        2. Read ``args.strains`` (comma separated, first line skipped),
           taking field 1 as the integer x value and field 3 as the float
           y value.
        3. Smooth the points with LOWESS, keeping one y per distinct x.
        4. Locate the knee of the smoothed convex, decreasing curve and
           write it to ``args.strains + ".strain.count"``.
        5. Run desman with the selected strain count and print its stats.

    param:
        args - namespace with the attributes ``output``, ``strains`` and
            ``desman``
    raises:
        SystemExit(-1) when either required CSV file is missing
    """
    os.chdir(args.output)

    # Both files are handed to desman below, so a single missing file is
    # already fatal. The previous check only fired when both were absent.
    required_files = ('dfreqstran_df.csv', 'dfreqssel_var.csv')
    if not all(os.path.isfile(name) for name in required_files):
        print(
            "Missing essential files dfreqstran_df.csv and dfreqssel_var.csv!")
        raise SystemExit(-1)

    # Use LOWESS to smooth data points
    x = []
    y = []
    with open(args.strains, 'r') as fh:
        next(fh)  # skip the header line
        for line in fh:
            segs = line.split(",")
            x.append(int(segs[1]))
            y.append(float(segs[3]))

    lowess = sm.nonparametric.lowess(y, x)

    # Each row of the result is an (x, smoothed y) pair. Going through a
    # dict keeps a single value per distinct x (the last one seen).
    real = {x_val: y_val for x_val, y_val in lowess}
    r_x = list(real.keys())
    r_y = list(real.values())

    # Select x value from elbow plot and use that in desman
    k = kneed.KneeLocator(r_x,
                          r_y,
                          curve='convex',
                          direction='decreasing',
                          S=1.0)
    strains = k.knee

    for i, j in zip(r_x, r_y):
        print(f'x:{i}\ty:{j}')
    print(f'Identified {strains} as the optimal strain count from the values')

    with open(args.strains + ".strain.count", 'w') as out:
        out.write(f'{args.output}\t{strains}\n')

    # Run desman on the data
    gamma, eta, tau = desmanRun(args.desman, 'dfreqssel_var.csv',
                                'dfreqstran_df.csv', int(strains))
    print(f'Desman stats: gamma = {gamma}, eta = {eta}, tau = {tau}')
def elbow_method(data, folder, max_clusters=102):
    """
    Function to find clusters number for k-means using Elbow method.

    k-means is fitted for every clusters number from 2 up to
    ``max_clusters - 1`` (``max_clusters`` itself is excluded) and the sum
    of squared distances of samples to their closest cluster center is
    used as the score. The knee of the score curve is the result.

    param:
        1. data - pandas DataFrame (10000, 82) or (10000, 3), where
            values are mean spendings of customers for every category
        2. folder - string path to save plot
        3. max_clusters - int exclusive upper bound of the clusters
            numbers to try (102 as default)
    return:
        Int value of optimal clusters number
    """
    cluster_counts = range(2, max_clusters)

    # n_jobs is intentionally not passed to KMeans: the argument was
    # deprecated in scikit-learn 0.23 and removed in 1.0, where passing it
    # raises TypeError.
    elbow_results = np.array([
        sklearn.cluster.KMeans(n_clusters=clusters_number).fit(data).inertia_
        for clusters_number in cluster_counts
    ])

    # Find the elbow (knee) on scores
    knee_loc = kneed.KneeLocator(np.arange(2, max_clusters),
                                 elbow_results,
                                 curve="convex",
                                 direction="decreasing",
                                 online=False,
                                 interp_method="polynomial")

    # Plot scores and optimal number of clusters
    plotting.line_plotting(
        [elbow_results,
         np.arange(2, max_clusters), knee_loc.knee],
        ["Clusters number", "Score"], "Elbow score", folder)
    return knee_loc.knee
def elbow_cluster_analysis(X,
                           algorithm="pc-kmeans",
                           start=2,
                           end=15,
                           **params_):
    """
    Run a cluster analysis for a range of cluster counts and keep the elbow.

    One model is built for every cluster count from ``start`` to ``end``
    inclusive. The goodness of each model is computed with
    ``goodness_of_model`` and the elbow of the goodness curve selects the
    result.

    param:
        X - data passed unchanged to ``cluster_analysis``
        algorithm - algorithm name forwarded to ``cluster_analysis``
        start, end - inclusive range of cluster counts to try
        **params_ - extra keyword arguments forwarded to
            ``cluster_analysis``; a caller-supplied ``n_clusters`` is
            overridden by the count currently being tried
    return:
        Tuple ``(labels, model)`` of the run at the elbow
    """
    candidates = range(start, end + 1)
    labels_list = []
    models = []
    for n_clusters in candidates:
        # The cluster count has to reach the model, otherwise every
        # iteration fits the same configuration and the elbow search is
        # meaningless.
        # NOTE(review): assumes cluster_analysis takes the count as the
        # keyword ``n_clusters`` -- confirm against its signature.
        run_params = {**params_, "n_clusters": n_clusters}
        with silence():
            labels, model = cluster_analysis(X,
                                             algorithm=algorithm,
                                             **run_params)
        labels_list.append(labels)
        models.append(model)
        # "\033[K" clears the line so the progress message overwrites itself.
        print(f"\033[KDid model with n_clusters={n_clusters}", end="\r")

    goodnesses = [goodness_of_model(model) for model in models]
    kl = kneed.KneeLocator(candidates,
                           goodnesses,
                           curve="convex",
                           direction="increasing")
    n_clusters = kl.elbow
    # Results are stored in the order tried, so the offset from ``start``
    # is the list index of the elbow run.
    chosen = n_clusters - start
    print(f"ELBOW-{algorithm.upper()}: Found elbow at {n_clusters} clusters "
          f"(goodness: {goodnesses[chosen]:.2f})")
    return labels_list[chosen], models[chosen]
def data_clustering(self):
    """
    Dataset is divided into clusters so similar data get trained in a
    particular model.

    KMeans is fitted for 1 to 29 clusters on ``self.df`` without the
    "SalePrice" column, the knee of the WCSS curve selects the number of
    clusters, a plot of the normalised curve is saved, the final model is
    pickled and every cluster is exported to its own CSV file.

    Any exception is caught and written to the logger instead of being
    propagated.

    Input : N/A
    Output : Clusters will be generated and dataset will be stored in a
        particular directory
    """
    try:
        self.logger.add_in_logs("chk", "Clustering process Initialized")
        self.logger.add_in_logs("chk", "finding number of clusters")
        self.wcss = []
        x = self.df.drop(["SalePrice"], axis=1)
        self.logger.add_in_logs(
            "inf", "saving a plot of wcss vs number of clusters ")

        candidate_ks = range(1, 30)
        for i in candidate_ks:
            model = KMeans(n_clusters=i)
            model.fit(x)
            self.wcss.append(model.inertia_)

        k = kneed.KneeLocator(candidate_ks,
                              self.wcss,
                              curve="convex",
                              direction="decreasing")
        self.knee = k.knee
        self.logger.add_in_logs(
            "inf",
            str(self.knee) + " number of clusters will be formed")

        # Scale the curve to a maximum of 1 for plotting.
        self.wcss = np.array(self.wcss) / max(self.wcss)
        plt.figure()
        plt.style.use("classic")
        plt.plot(candidate_ks,
                 self.wcss,
                 label="WCSS vs Inertia",
                 color="blue")
        # Vertical marker at the selected number of clusters
        plt.plot([self.knee, self.knee], [min(self.wcss), 1],
                 label="Number of cluster used",
                 color="black")
        plt.xlabel("No of cluster")
        plt.ylabel("WCSS")
        plt.legend(loc="upper right")
        plt.savefig(self.path + "/static/plots/Cluster.jpg")
        self.logger.add_in_logs("pas",
                                "finding number of clusters completed")

        self.logger.add_in_logs("chk",
                                "getting cluster number for dataset")
        # self.df has not changed since ``x`` was built, so it is reused
        # for the final fit.
        model = KMeans(n_clusters=self.knee)
        model.fit(x)
        cluster_no = model.predict(x)
        self.df["Cluster"] = cluster_no

        self.logger.add_in_logs("inf",
                                "saving clustering model in model file")
        # A context manager makes sure the pickle file is flushed and
        # closed even if dumping fails.
        with open(
                self.path +
                "/Model_files/Cluster_directory/Cluster.pickle",
                "wb") as model_file:
            pickle.dump(model, model_file)

        for i in range(0, self.knee):
            self.logger.add_in_logs(
                "inf",
                str(i) + " cluster is exported in .csv file")
            self.df[self.df["Cluster"] == i].to_csv(
                self.path + "/Input_files/Cluster_data/" + str(i) +
                "_cluster.csv",
                index=False)
        self.logger.add_in_logs("pas",
                                "Exporting Clustered dataset Completed")
        self.logger.add_in_logs("pas", "Clustering process Completed")
    except Exception as e:
        # Failures are reported through the logger only; nothing is
        # re-raised to the caller.
        self.logger.add_in_logs("ERR",
                                "data preprocessing in data clustering")
        self.logger.add_in_logs(
            "LIN", "Error on line number : {}".format(
                sys.exc_info()[-1].tb_lineno))
        self.logger.add_in_logs("TYP", str(e))
print(vWcsse) # plotting the results onto a line graph, allowing us to observe 'The elbow' print("\n*** Plot WCSSE ***") plt.figure() plt.plot(range(1, vIters), lWcsse) plt.title('The elbow method') plt.xlabel('Number of clusters') plt.ylabel('WCSSE') #within cluster sum of squares error plt.show() # programatically #!pip install kneed print("\n*** Find Best K ***") import kneed kl = kneed.KneeLocator(range(1, vIters), lWcsse, curve="convex", direction="decreasing") vBestK = kl.elbow print(vBestK) # k means cluster model print("\n*** Model Create & Train ***") model = KMeans(n_clusters=vBestK, random_state=707) model.fit(X) # result print("\n*** Model Results ***") print(model.labels_) df['PredKnn'] = model.labels_ # counts for knn print("\n*** Counts For Knn ***")
kmeans_fit = kmeans.fit(new_r_dat) inertia_vals.append(kmeans_fit.inertia_) pl.plot([*range(3, 15)], inertia_vals) pl.xlabel('Number of clusters') pl.ylabel('K-means inertia') pl.vlines([4, 7], ymin=12000, ymax=18500, linestyle='dashed') pl.show() # %% import kneed kn = kneed.KneeLocator([*range(3, 15)], inertia_vals, curve='convex', direction='decreasing', interp_method='polynomial') print(kn.knee) kn_gini = kneed.KneeLocator([*range(3, 33)], r_score_new, curve='convex', direction='decreasing', interp_method='polynomial') print(kn_gini.knee) kn_gini_old = kneed.KneeLocator([*range(3, 33)], r_score_old, curve='convex',
#finding the optimum number of clusters for k-means classification to check wcss = [] K = range(1, 11) for i in K: k = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0) y_k = k.fit(x) wcss.append(k.inertia_) print(wcss) #find the elbow point import kneed kn = kneed.KneeLocator(K, wcss, curve='convex', direction='decreasing') print("\nknee=", kn.knee) #plot sum of squared distances plt.plot(K, wcss, 'bx-') plt.xlabel('k') plt.ylabel('Sum_of_squared_distances-wcss') plt.title('Elbow Method to find Optimal k') plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed') plt.show() for i in K: k = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10,