def findrgbrange(self, image):
    """Locate the knee of the cumulative intensity distribution per channel.

    When ``image`` has three or more dimensions, channels 0-2
    (``image[:, :, i]``) are analysed one by one; otherwise the whole array is
    treated as one grayscale channel.

    Args:
        image: array-like with a ``shape`` attribute.

    Returns:
        A list with one knee value (truncated to ``int``) per analysed
        channel, or ``False`` when a knee could not be determined. On failure
        the error is printed and the user is asked to press a key.
    """
    if len(image.shape) >= 3:
        channels = [image[:, :, i] for i in range(3)]
    else:
        channels = [image]

    RGB = []
    try:
        for channel in channels:
            # presumably skimage.exposure: returns (cdf values, bin centers)
            cdf3 = exposure.cumulative_distribution(channel)
            kneedle = KneeLocator(cdf3[1], cdf3[0], curve='convex',
                                  direction='increasing')
            if kneedle.knee is None:
                # int(None) would otherwise escape as an unhandled TypeError
                raise ValueError("no knee found in the cumulative distribution")
            RGB.append(int(kneedle.knee))
    except ValueError as err:
        # Report the actual error, not the exception class.
        print(err)
        print("Can't get the range")
        input("press any key to back to processing menu...")
        return False
    return RGB
def find_epsilon(feature_file, min_samp, kupa, number_of_features, dest):
    """Estimate a DBSCAN-style epsilon from the sorted k-distance curve.

    The features are loaded with ``np.loadtxt``, standardised, and the
    distance of every sample to its ``min_samp``-th neighbour is sorted in
    increasing order. The knee of that curve is taken as epsilon, the curve is
    saved as a PNG below ``{dest}results/`` and epsilon is printed.

    Args:
        feature_file: path (or file object) readable by ``np.loadtxt``.
        min_samp: neighbour rank used for the k-distance curve.
        kupa: label used in the plot title and file name.
        number_of_features: number used in the plot title and file name.
        dest: path prefix; the figure goes to ``{dest}results/...``.

    Returns:
        The distance value at the detected knee.

    Raises:
        ValueError: if no knee is detected on the distance curve.
    """
    X = StandardScaler().fit_transform(np.loadtxt(feature_file))
    neighbors = NearestNeighbors(n_neighbors=min_samp + 1).fit(X)
    distances, _ = neighbors.kneighbors(X)
    # column ``min_samp`` is the farthest of the min_samp + 1 neighbours
    distances = np.sort(distances[:, min_samp], axis=0)
    positions = np.arange(len(distances))

    knee = KneeLocator(positions, distances, S=1, curve='convex',
                       direction='increasing', interp_method='polynomial')
    if knee.knee is None:
        # distances[None] would silently return a reshaped copy of the array
        raise ValueError("no knee found on the k-distance curve")
    epsilon = distances[knee.knee]

    fig = plt.figure(figsize=(5, 5))
    knee.plot_knee()
    plt.title(
        f"Elbow for {kupa}\nMfcc-{number_of_features}-features\nEpsilon={epsilon}"
    )
    plt.xlabel("Points")
    plt.ylabel("Distance")
    plt.savefig(
        f'{dest}results/Elbow_{kupa}_Mfcc_{number_of_features}_features.png')
    plt.close()
    # plot_knee() may have opened its own figure; make sure ours is closed too
    plt.close(fig)

    print(epsilon)
    return epsilon
def locate_knee(time, eps_fit, eps_stat):
    """Return the knee reported by ``KneeLocator`` for ``eps_fit`` over ``time``.

    Samples where ``time`` steps backwards are dropped (from both ``time`` and
    ``eps_fit``) until ``time`` is sorted. If the maximum of ``eps_fit``
    exceeds twice ``eps_stat`` the knee is searched with
    ``direction="decreasing"``, otherwise with the locator defaults.

    Returns:
        ``knee.knee_x`` of the locator, or ``-1`` when it is ``None``.
    """
    from kneed import KneeLocator

    # plain numpy arrays: works around indexing trouble with xarray/pandas
    time, eps_fit, eps_stat = (np.array(v) for v in (time, eps_fit, eps_stat))

    while not np.array_equal(time, np.sort(time)):
        backwards = np.where(np.diff(time) < 0)[0] + 1
        time = np.delete(time, backwards)
        eps_fit = np.delete(eps_fit, backwards)

    if eps_fit.max() > 2 * eps_stat:
        # log-norm + tanh
        locator = KneeLocator(time, eps_fit, direction="decreasing")
    else:
        locator = KneeLocator(time, eps_fit)

    idx = locator.knee_x
    # non-stationary case: nothing found
    return -1 if idx is None else idx
def findrgbrange(image):
    """Locate the knee of the cumulative intensity distribution per channel.

    When ``image`` has three or more dimensions, channels 0-2
    (``image[:, :, i]``) are analysed one by one; otherwise the whole array is
    treated as one grayscale channel. For every channel the knee and its
    y-value are printed, rounded to three decimals.

    Args:
        image: array-like with a ``shape`` attribute.

    Returns:
        A list with one knee value (truncated to ``int``) per analysed
        channel, or ``False`` when a knee could not be determined. On failure
        the error is printed and the user is asked to press a key.
    """
    from kneed import KneeLocator

    if len(image.shape) >= 3:
        banner = "***Colour RGB range detection***"
        channels = [image[:, :, i] for i in range(3)]
    else:
        banner = "***Grayscale range detection***"
        channels = [image]

    RGB = []
    try:
        print(banner)
        for channel in channels:
            # presumably skimage.exposure: returns (cdf values, bin centers)
            cdf3 = exposure.cumulative_distribution(channel)
            kneedle = KneeLocator(cdf3[1], cdf3[0], curve='convex',
                                  direction='increasing')
            if kneedle.knee is None:
                # round(None) would otherwise escape as an unhandled TypeError
                raise ValueError("no knee found in the cumulative distribution")
            print(round(kneedle.knee, 3))
            RGB.append(int(kneedle.knee))
            print(round(kneedle.knee_y, 3))
    except ValueError as err:
        # Report the actual error, not the exception class.
        print(err)
        print("Can't get the range")
        input("press any key to back to processing menu...")
        return False
    return RGB
def find_eps(data, k, metric, start=20, stop=500):
    """Pick an epsilon from the elbow of the descending k-distance curve.

    The distance of every sample to its ``k``-th neighbour (the sample itself
    included) is sorted in decreasing order and the elbow is searched within
    the rank window ``[start, stop)``. The normalised knee plot is drawn.

    Args:
        data: samples accepted by ``NearestNeighbors.fit``.
        k: number of neighbours queried.
        metric: distance metric passed to ``NearestNeighbors``.
        start: first rank of the search window (default 20).
        stop: rank just past the search window (default 500).

    Returns:
        The k-distance at the detected elbow.

    Raises:
        ValueError: if no elbow is detected in the window.
    """
    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(data)
    distances, _ = nbrs.kneighbors(data)
    distanceDec = sorted(distances[:, k - 1], reverse=True)

    window = distanceDec[start:stop]
    # The x-axis must be the rank within the sorted curve. The neighbour
    # indices are unrelated to the sorted distances and only coincide with
    # the rank when every sample is its own first neighbour.
    ranks = list(range(start, start + len(window)))

    knee = KneeLocator(ranks, window, direction="decreasing", curve="convex")
    knee.plot_knee_normalized()
    if knee.elbow is None:
        raise ValueError("no elbow found on the k-distance curve")
    return distanceDec[knee.elbow]
def find_eps(data, k, metric):
    """Pick an epsilon from the elbow of the descending k-distance curve.

    The distance of every sample to its ``k``-th neighbour (the sample itself
    included) is sorted in decreasing order; the elbow is searched over the
    first 90 % of that curve and the normalised knee plot is drawn.

    Args:
        data: 2-D array of samples (``data.shape[0]`` is the sample count).
        k: number of neighbours queried.
        metric: distance metric passed to ``NearestNeighbors``.

    Returns:
        Tuple ``(epsilon, knee)``: the k-distance at the elbow and the
        ``KneeLocator`` instance.

    Raises:
        ValueError: if no elbow is detected.
    """
    END = round(data.shape[0] * 0.9)
    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(data)
    distances, _ = nbrs.kneighbors(data)
    distanceDec = np.sort(distances[:, k - 1])[::-1]

    head = distanceDec[:END]
    # The x-axis must be the rank within the sorted curve. The neighbour
    # indices are unrelated to the sorted distances and only coincide with
    # the rank when every sample is its own first neighbour.
    ranks = np.arange(len(head))

    knee = KneeLocator(ranks, head, curve="convex", direction="decreasing",
                       S=1.0)
    knee.plot_knee_normalized()
    if knee.elbow is None:
        # distanceDec[None] would silently return a reshaped copy of the array
        raise ValueError("no elbow found on the k-distance curve")
    return distanceDec[knee.elbow], knee
def kneedle_cutoff(df_analysis, verbose=True):
    """Get kneedle estimates for what is a cell.

    The knee of ``umi_counts`` over ``barcode`` is searched three times: on
    the raw data (``fit=None``) and with the ``interp1d`` and ``polynomial``
    interpolation methods. For each, the sensitivity starts at 5000 and is
    lowered in steps of 100 until a knee is found.

    NOTE: estimates are much better (similar to cellranger3) when this is
    done in the original value space, not in log space.

    Args:
        df_analysis: DataFrame with ``barcode`` and ``umi_counts`` columns.
        verbose: print sensitivity, knee and elbow for every method.

    Returns:
        List of dicts with keys ``method``, ``umi_counts_cutoff``,
        ``n_cells`` and ``sensitivity``. A method for which no knee is found
        at any non-negative sensitivity is skipped with a warning.
    """
    import warnings

    barcodes = df_analysis['barcode'].values
    umi_counts = df_analysis['umi_counts'].values

    results = []
    for fit in [None, 'interp1d', 'polynomial']:
        key = 'kneedle::spline={}'.format(fit)
        extra = {'interp_method': fit} if fit else {}

        # Maximise sensitivity: start high and relax until a knee shows up.
        # Bounded at 0 so a curve without any knee cannot loop forever.
        sensitivity = 5000
        kneedle = None
        while sensitivity >= 0:
            candidate = KneeLocator(
                barcodes,
                umi_counts,
                curve='convex',
                direction='decreasing',
                S=sensitivity,
                **extra)
            if candidate.knee is not None:
                kneedle = candidate
                break
            sensitivity -= 100

        if kneedle is None:
            warnings.warn('{}: no knee found at any sensitivity'.format(key))
            continue

        if verbose:
            print('S:\t{}\nknee:\t{}\nelbow:\t{}'.format(
                sensitivity, round(kneedle.knee, 3), round(kneedle.elbow, 3)))

        n_cells = int(kneedle.knee)
        results.append({
            'method': key,
            'umi_counts_cutoff':
            df_analysis.loc[barcodes == n_cells, 'umi_counts'].values[0],
            'n_cells': n_cells,
            'sensitivity': sensitivity
        })

    return results
def feature_selection(data, target, method=c.XGB, verbose=False):
    """Select feature names either by correlation band or by XGBoost importance.

    With ``method == c.COR`` the rows of ``data.corr()`` whose correlation
    with ``target`` lies strictly between 0.2 and 0.8 are returned.
    Otherwise an XGBoost regressor is trained on all other columns, the
    normalised ``total_cover`` importances are sorted in decreasing order and
    the features before the knee of that curve are returned together with
    ``target``.

    Args:
        data: DataFrame holding the features and the target column.
        target: name of the target column.
        method: selection strategy (``c.COR`` or anything else for XGBoost).
        verbose: show heatmaps / the knee plot.

    Returns:
        List of selected column names.
    """
    if method == c.COR:
        corr_matrix = data.corr()
        if verbose:
            sns.heatmap(corr_matrix, cmap='Blues', annot=True)
            plt.show()
        target_corr = corr_matrix[target]
        in_band = (target_corr > 0.2) & (target_corr < 0.8)
        return corr_matrix.loc[in_band].index.tolist()

    booster_params = {
        'eta': 0.05,
        'max_depth': 10,
        'subsample': 1.0,
        'colsample_bytree': 0.7,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse'
    }

    # work on a copy so the caller's frame keeps its target column
    features = data.copy(deep=True)
    labels = features.pop(target)
    dtrain = xgb.DMatrix(features, labels,
                         feature_names=features.columns.values)
    booster = xgb.train(booster_params, dtrain, num_boost_round=1000)

    scores = booster.get_score(importance_type='total_cover')
    imp = pd.DataFrame(scores, index=range(1)).T
    imp.columns = ["Importance"]
    imp = imp.sort_values(by=["Importance"], ascending=False)
    imp /= imp.sum()
    if verbose:
        sns.heatmap(imp, cmap='Blues', annot=True)
        plt.show()

    imp["x"] = range(len(imp))
    # online=True is a good setting, but consider dropping it if accuracy suffers
    kneedle = KneeLocator(imp.x, imp.Importance, curve="convex",
                          direction="decreasing", online=True)
    if verbose:
        kneedle.plot_knee()

    selected = imp.iloc[:kneedle.knee].index.tolist()
    return selected + [target]
def start_GMM():
    """Cluster the global ``dfCluster`` vitals with a Gaussian mixture model.

    Every column except ``SId`` and ``AId`` is scaled and used to fit mixtures
    with 1 to 14 components. The number of components is the knee of the BIC
    curve (the AIC knee is printed for comparison). On success ``dfCluster``
    is modified in place: it gains a ``Cluster Label`` column and the
    ``SId``/``AId`` columns are moved to the end as ``int64``.

    Raises:
        ValueError: if the BIC curve has no knee. ``dfCluster`` is left
            untouched in that case.
    """
    global dfCluster

    id_columns = ['SId', 'AId']
    dfIDs = dfCluster[id_columns].astype('int64')

    # Only the vitals are fed into the clustering algorithm.
    allVitalsNorm = scale(dfCluster.drop(columns=id_columns).values)

    # As with k-means, the number of clusters has to be chosen first.
    nList = list(range(1, 15))
    models = {
        n: GaussianMixture(n, covariance_type='full',
                           random_state=0).fit(allVitalsNorm)
        for n in nList
    }
    # Score on the same (scaled) data the models were fitted on.
    bicList = [models[n].bic(allVitalsNorm) for n in nList]
    aicList = [models[n].aic(allVitalsNorm) for n in nList]

    kn = KneeLocator(nList, bicList, S=1.0, curve='convex',
                     direction='decreasing')
    kn1 = KneeLocator(nList, aicList, S=1.0, curve='convex',
                      direction='decreasing')
    print('Recommended number of clusters by BIC: {}'.format(kn.knee))
    print('Recommended number of clusters by AIC: {}'.format(kn1.knee))

    if kn.knee is None:
        raise ValueError('no knee found on the BIC curve')
    # Look the model up by its component count; nList starts at 1, so using
    # the knee as a list position would pick the model with one extra
    # component.
    model = models[int(kn.knee)]

    # Mutate the global frame only once a model has been selected.
    dfCluster.drop(id_columns, axis=1, inplace=True)
    dfCluster['Cluster Label'] = model.predict(allVitalsNorm)
    # Add both the subject id and admission id back to the cluster dataframe.
    dfCluster['SId'] = dfIDs['SId']
    dfCluster['AId'] = dfIDs['AId']
    return
def knee_loc(pipe, pipe_clusterer, X_train_trans):
    """Locate the knee for the clusterer and plot the WCSS curve.

    For 1 to 10 clusters, ``pipe_clusterer.n_clusters`` is set, ``pipe`` is
    refitted and ``pipe_clusterer.inertia_`` is recorded. The elbow of the
    resulting curve is printed and the curve is shown.

    Args:
        pipe: pipeline for the entire model (it is refitted on each step).
        pipe_clusterer: clusterer exposing ``n_clusters`` and ``inertia_``.
        X_train_trans: pre-transformed X_train (scaled and encoded).

    Returns:
        The elbow found by ``KneeLocator`` (``None`` when none is found).
    """
    cluster_counts = range(1, 11)

    wcss = []
    for n_clusters in cluster_counts:
        pipe_clusterer.n_clusters = n_clusters
        pipe.fit(X_train_trans)
        wcss.append(pipe_clusterer.inertia_)

    kl = KneeLocator(cluster_counts, wcss, curve="convex",
                     direction="decreasing")
    print(kl.elbow)

    plt.plot(cluster_counts, wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Within cluster sum of squares (WCSS)')
    plt.show()

    # The docstring always promised the knee/elbow; actually hand it back.
    return kl.elbow
def elbow_plot(self):
    """Fit KMeans for k = 1..10 on ``self.data`` and return the elbow.

    The SSE (inertia) curve is saved to ``Elbow_plot.png`` and the knee found
    by ``KneeLocator`` is logged and returned.

    Returns:
        The knee of the SSE curve (``None`` when none is found).

    Raises:
        Exception: on any failure; the original error is logged, its message
            is carried over and it is chained as the cause.
    """
    logger.info("Inside the elbow_plot function in KMeansClustering class")
    try:
        data = self.data
        k_range = range(1, 11)

        sse = []
        for k in k_range:
            km = KMeans(n_clusters=k, init="k-means++", random_state=100,
                        max_iter=1000)
            km.fit(data)
            sse.append(km.inertia_)

        # Use a dedicated figure and always close it, so repeated calls do
        # not draw over each other or pile up open figures.
        fig, ax = plt.subplots()
        try:
            ax.set_title("K vs sse(choose k value)")
            ax.set_xlabel("Number of clusters")
            ax.set_ylabel("sse")
            ax.plot(k_range, sse)
            fig.savefig("Elbow_plot.png")
        finally:
            plt.close(fig)

        kn = KneeLocator(k_range, sse, curve="convex", direction="decreasing")
        logger.info("The optimum number of clusters are: {}".format(
            kn.knee))
        return kn.knee
    except Exception as e:
        logger.warning(
            "Exception has occured in elbow_plot. Exception message: " +
            str(e))
        logger.warning("Finding the number of clusters failed.")
        raise Exception(str(e)) from e
def find_the_number_of_clusters(principal_components, limit):
    """Return the elbow of the mean point-to-centroid distance curve.

    KMeans is fitted for 1..``limit`` clusters; for each fit the mean
    Euclidean distance of every sample to its closest centroid is recorded.
    The curve is plotted with a dashed line at the elbow and shown.

    Args:
        principal_components: 2-D array of samples.
        limit: largest number of clusters to try.

    Returns:
        The elbow as ``int``.
    """
    n_samples = principal_components.shape[0]
    cluster_counts = range(1, limit + 1)

    wcss = []
    for n_clusters in cluster_counts:
        print("Fitting components {0}/{1}".format(n_clusters, limit))
        model = KMeans(n_clusters=n_clusters, init='k-means++',
                       random_state=42)
        model.fit(principal_components)
        to_closest_center = np.min(
            cdist(principal_components, model.cluster_centers_, 'euclidean'),
            axis=1)
        wcss.append(sum(to_closest_center) / n_samples)

    # Plot the curve
    plt.figure(figsize=(16, 10))
    plt.plot(cluster_counts, wcss, marker='o', linestyle='--')
    plt.xlabel("Number of clusters")
    plt.ylabel("WCSS")
    plt.title("K-Means PCA")

    # Find the elbow, see
    # https://github.com/arvkevi/kneed/blob/master/notebooks/decreasing_function_walkthrough.ipynb
    order = np.linspace(1, limit, limit)
    kn = KneeLocator(order, wcss, curve='convex', direction='decreasing')
    y_low, y_high = plt.ylim()
    plt.vlines(kn.elbow, y_low, y_high, linestyles='dashed')
    plt.show()

    print("Number of clusters: {0}".format(kn.elbow))
    return int(kn.elbow)
def k_means_elbow(self):
    """Log the elbow of the K-means SSE curve and optionally save the plot.

    KMeans is fitted for 1..10 clusters on
    ``self.scaled_data[self.use_metrics]``. The elbow is logged; when
    ``self.output_plots_location`` is set, the SSE curve is written to
    ``k_means-sse.pdf`` in that location.
    """
    cluster_counts = range(1, 11)
    kmeans_kwargs = dict(init="random", n_init=10, max_iter=300,
                         random_state=42)

    X = self.scaled_data[self.use_metrics]
    sse = [
        KMeans(n_clusters=k, **kmeans_kwargs).fit(X).inertia_
        for k in cluster_counts
    ]

    kl = KneeLocator(cluster_counts, sse, curve="convex",
                     direction="decreasing")
    logging.info('Recommended cluster number for K-means: ' + str(kl.elbow))

    if self.output_plots_location is None:
        return

    plt.close("all")
    plt.plot(cluster_counts, sse, 'bx-')
    plt.xticks(cluster_counts)
    plt.xlabel("Number of Clusters")
    plt.ylabel("SSE")
    plt.savefig(self.output_plots_location / 'k_means-sse.pdf',
                bbox_inches='tight',
                pad_inches=0)
    plt.close('all')
def give_num_clusters(matrix, min_cluster, max_cluster):
    """Choose the number of clusters from the elbow of the distortion curve.

    KMeans is fitted for every ``n`` in ``range(min_cluster, max_cluster)``
    (``max_cluster`` itself is excluded); the distortion is the mean
    Euclidean distance of each row to its closest centroid.

    Args:
        matrix: 2-D array of samples.
        min_cluster: smallest number of clusters tried.
        max_cluster: upper bound (exclusive) of the numbers tried.

    Returns:
        Dict with ``'Best_N'`` (the knee, ``None`` when none is found) and
        ``'Plot'`` (the matplotlib figure of the distortion curve).
    """
    N_clusters = range(min_cluster, max_cluster)

    distortions = []
    for n in N_clusters:
        kmeans = KMeans(init='k-means++', n_clusters=n, n_init=100)
        kmeans.fit(matrix)
        to_closest_center = np.min(
            cdist(matrix, kmeans.cluster_centers_, 'euclidean'), axis=1)
        distortions.append(sum(to_closest_center) / matrix.shape[0])

    kn = KneeLocator(list(N_clusters), distortions, S=0.1, curve='convex',
                     direction='decreasing')

    fig, ax = plt.subplots()
    ax.plot(N_clusters, distortions, 'bx-')
    ax.set_xlabel('N')
    ax.set_ylabel('Distortion')
    ax.set_title('The Elbow Method showing the optimal customer clusters')
    if kn.knee is not None:
        y_low, y_high = ax.get_ylim()
        ax.vlines(kn.knee, y_low, y_high, linestyles="--",
                  label="knee/elbow")
        ax.legend()
    fig.tight_layout()

    # tight_layout() returns None, so hand back the figure itself
    return {'Best_N': kn.knee, 'Plot': fig}
def calculate_best_k_clustering(wcss, min_k, max_k):
    """Find the best k from the elbow of the WCSS curve.

    The knee is computed for several sensitivities (all are printed) and the
    curve is shown; the knee for sensitivity 1 is returned.

    :param wcss: within-cluster sum of squares, one value per k starting at
        ``min_k``
    :param min_k: k that corresponds to ``wcss[0]``
    :param max_k: not used by this function
    :return: knee of the curve at sensitivity 1
    """
    k_values = range(min_k, min_k + len(wcss))

    locators = [
        KneeLocator(k_values, wcss, curve='convex', direction='decreasing',
                    S=s) for s in (1, 3, 5, 10, 100, 200, 400)
    ]
    knees = [locator.knee for locator in locators]

    print("knees")
    print(knees)

    plt.plot(k_values, wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()

    print("Errors: ")
    print(wcss)

    return knees[0]
def define_threshold(pydam_df, min_knee=0.5, alpha=0.05):
    """Find the kneedle point in PyDamage results.

    Counts, for every predicted-accuracy threshold from ``min_knee`` up to
    (but excluding) 1 in steps of 0.01, the rows that pass both the accuracy
    threshold and the q-value cut, then returns the knee of that curve as the
    trade-off between false positives and false negatives. The thresholds and
    counts are printed.

    Args:
        pydam_df (pandas df): pydamage results with ``predicted_accuracy``
            and ``qvalue`` columns
        min_knee (float, optional): Min predicted_accuracy threshold.
        alpha (float, optional): Alpha q-value threshold

    Returns:
        The knee threshold (``None`` when no knee is found).
    """
    thresholds = [t.round(2) for t in arange(min_knee, 1, 0.01)]
    nb_contigs = [
        len(
            pydam_df.query(
                f"predicted_accuracy >= {t} & qvalue <= {alpha}"))
        for t in thresholds
    ]

    kneedle = KneeLocator(
        thresholds,
        nb_contigs,
        S=1.0,
        curve="convex",
        direction="decreasing",
        online=True,
    )

    print(thresholds)
    print(nb_contigs)
    return kneedle.knee
def get_num_pca_comp(x, name):
    """Choose the number of PCA components from the explained-variance knee.

    A full PCA is fitted on ``x``; the knee of the cumulative explained
    variance ratio gives the number of components. The curve is saved to
    ``pca_curves/{name}.png`` and the eigenvalues are printed.

    Args:
        x: samples accepted by ``PCA.fit``.
        name: label used in the plot title and the file name.

    Returns:
        The knee rounded up to an ``int``.

    Raises:
        ValueError: if the cumulative variance curve has no knee.
    """
    import os

    pca = PCA().fit(x)
    vr = np.cumsum(pca.explained_variance_ratio_)
    print("Distribution of Eigen Values")
    print(pca.explained_variance_)

    n_components = range(1, len(vr) + 1)
    kneedle = KneeLocator(n_components, vr, S=1.0, curve='concave',
                          direction='increasing')
    if kneedle.knee is None:
        raise ValueError(
            "no knee found on the cumulative explained variance curve")
    knee = math.ceil(kneedle.knee)

    out_path = "pca_curves/{}.png".format(name)
    fig = plt.figure()
    try:
        plt.plot(n_components, vr, 'bx-')
        plt.xlabel('Number of Components')
        plt.ylabel('Cumulative Explained Variance')
        plt.axvline(x=knee, label="Selected Number of Components")
        plt.legend(loc="best")
        plt.title('N-Components vs. Explained Variance for {}'.format(name))
        # savefig does not create missing directories
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        plt.savefig(out_path)
    finally:
        # release the figure; it is only needed for the saved file
        plt.close(fig)

    return knee
def Elbow_kneeLocator(self, X, verbose):
    """Return the cluster count (1..10) with the best silhouette score.

    KMeans is fitted for every k; the inertia curve is plotted and shown.
    The elbow found by ``KneeLocator`` on that curve is only printed (when
    ``verbose`` is true); the returned value comes from the silhouette
    score. Fits that yield a single label are skipped for the silhouette.

    Returns:
        The k with the highest silhouette score above 0, or 0 if there is
        none.
    """
    k_values = range(1, 11)
    inertias = []
    best_n_clusters, best_sil = 0, 0

    for k in k_values:
        km = KMeans(n_clusters=k).fit(X)
        inertias.append(km.inertia_)
        labels = km.labels_
        if len(set(labels)) > 1:
            sil = silhouette_score(X, labels)
            if sil > best_sil:
                best_sil, best_n_clusters = sil, k

    fig, ax = plt.subplots()
    sns.lineplot(x=list(k_values), y=inertias, ax=ax)
    ax.set_title('Searching for Elbow')
    ax.set_xlabel('Clusters')
    ax.set_ylabel('Inertia')
    plt.show()

    kl = KneeLocator(k_values, inertias, curve="convex",
                     direction="decreasing")
    if verbose:
        print("\nResult finding by Knee Locator function: ")
        print(kl.elbow)
        print("\n")

    return best_n_clusters
def testk(df, data):
    """Label ``df`` with a 3-cluster KMeans fitted on ``data``.

    The SSE curve for k = 1..5 is computed and handed to ``KneeLocator``,
    but its result is not used: the number of clusters is fixed to 3.

    Args:
        df: frame that receives the ``'kmeans'`` label column (in place).
        data: samples accepted by ``KMeans.fit``.

    Returns:
        Tuple ``(df, n)`` with the labelled frame and the cluster count.
    """
    kmeans_kwargs = dict(init="random", n_init=10, max_iter=100,
                         random_state=42)
    candidate_ks = range(1, 6)

    sse = [
        KMeans(n_clusters=k, **kmeans_kwargs).fit(data).inertia_
        for k in candidate_ks
    ]

    try:
        KneeLocator(candidate_ks, sse, curve="convex",
                    direction="decreasing")
    except TypeError:
        print("hello")

    # NOTE(review): the elbow is deliberately ignored, k is hard-coded
    n = 3
    model = KMeans(n_clusters=n, **kmeans_kwargs).fit(data)
    df['kmeans'] = list(model.labels_)
    return df, n
def find_components(graph, allow_outliers=False):
    """Label every node with the rank of its connected component.

    Components come from ``nx.connected_components`` and are ordered from
    largest to smallest; label 0 is the largest one. Nodes are used directly
    as positions in the label list, so they must be the integers
    ``0 .. number_of_nodes - 1``.

    Args:
        graph: graph accepted by ``nx.connected_components``.
        allow_outliers: when true, components from the knee of the
            component-size curve onwards are labelled ``-1``. If no knee is
            found, no node is marked as an outlier.

    Returns:
        Tuple ``(labels, comps)``: a numpy array with one label per node and
        the list of components (sets of nodes), largest first.
    """
    comps = sorted(nx.connected_components(graph), key=len, reverse=True)

    labels = [0] * graph.number_of_nodes()
    for i, c in enumerate(comps):
        for n in c:
            labels[n] = i

    if allow_outliers:
        hist = [len(c) for c in comps]
        x_axis = list(range(len(hist)))
        kn = KneeLocator(x_axis, hist, S=1.0, curve='convex',
                         direction='decreasing')
        # comps[None:] would be *every* component, turning all nodes into
        # outliers when no knee exists.
        if kn.knee is not None:
            for c in comps[int(kn.knee):]:
                for n in c:
                    labels[n] = -1

    return np.array(labels), comps
def elbow_manual(n_clusters, X):
    """Compare the elbow curves of the hand-written k-means and sklearn's.

    For k = 1 .. ``n_clusters - 1`` both implementations are run on the
    imputed data and the sum of distances of every sample to its closest
    centre is recorded. Both curves are plotted, the knee of the hand-written
    curve (searched from k = 2) is printed and marked with a dashed line.

    Args:
        n_clusters: exclusive upper bound of the cluster counts tried.
        X: samples; missing values are filled by ``DataFrameImputer``.
    """
    e = 10**(-10)
    X = DataFrameImputer().fit_transform(X)

    SSE = []
    SSE1 = []
    for i in range(1, n_clusters):
        initial_centers = kpp_init_notrials(i, X)
        centers, labels = lloyd(i, X, e, initial_centers)
        centers_sk, labels_sk = kmean_sklearn(i, X)
        # using lloyd
        SSE.append(np.sum(np.min(cdist(X, centers, 'euclidean'), axis=1)))
        # using sklearn
        SSE1.append(np.sum(np.min(cdist(X, centers_sk, 'euclidean'), axis=1)))

    K = np.arange(1, n_clusters)
    plt.plot(K, SSE, label='méthode manuel', color='blue')
    plt.plot(K, SSE1, label='méthode sklearn', color='orange')
    plt.title('Comparaison Méthode du coude entre notre algorithme et sklearn')
    plt.legend()

    # At least 2 clusters are required, so skip the k = 1 entry of SSE to
    # keep x and y the same length.
    K_ = np.arange(2, n_clusters)
    kn = KneeLocator(K_, SSE[1:], curve='convex', direction='decreasing')
    print(kn.knee)

    # dashed vertical line on the knee
    if kn.knee is not None:
        y_low, y_high = plt.ylim()
        plt.vlines(kn.knee, y_low, y_high, linestyles='dashed')

    # show last so the legend and the knee line are part of the figure
    plt.show()
def make_loadings_matrix(rating_m):
    '''Takes a rating matrix and returns the loading matrix.

    A first factor analysis (10 factors, oblimin rotation) provides the
    eigenvalues; the knee of that eigenvalue curve sets the number of
    factors of the final model, which is fitted with a varimax rotation.

    Args:
        rating_m: rating matrix accepted by ``FactorAnalyzer.fit``.

    Returns:
        DataFrame of loadings rounded to 2 decimals, indexed by construct
        name (index name ``Construct``), with one column per factor named
        ``Factor i (p%)`` where ``p`` is taken from the second entry of
        ``get_factor_variance()``.

    Raises:
        ValueError: if the eigenvalue curve has no knee.
    '''
    # Fit the initial factor analysis
    fa = FactorAnalyzer(n_factors=10, rotation='oblimin')
    fa.fit(rating_m)

    fa_eigens = fa.get_eigenvalues()[1]
    # One x position per eigenvalue; a fixed range only works for one
    # particular number of constructs.
    x = list(range(1, len(fa_eigens) + 1))
    fa_matrix_knee = KneeLocator(x, fa_eigens, S=1.0, curve='convex',
                                 direction='decreasing')
    fa_knee = fa_matrix_knee.knee
    if fa_knee is None:
        raise ValueError('no knee found on the eigenvalue curve')

    # NOTE(review): the original description mentioned an oblimin rotation
    # for the final model, but the code uses varimax; confirm which is wanted.
    fa_kneed = FactorAnalyzer(n_factors=int(fa_knee),
                              rotation='varimax').fit(rating_m)

    loadings_m = pd.DataFrame(fa_kneed.loadings_.round(2))
    loadings_m.index = get_construct_names()
    loadings_m.index = loadings_m.index.rename(name='Construct')

    variance_share = fa_kneed.get_factor_variance()[1]
    loadings_m.columns = [
        'Factor {} ({:.0f}%)'.format(i + 1, variance_share[i] * 100)
        for i in loadings_m.columns
    ]
    return loadings_m
def get_elbow(mode, adj_matrix, max_modules, min_modules):
    '''Estimate the number of modules from the eigenvalue elbow.

    Inputs
        mode (str): just a string label to keep track of what matrix we're
            initializing. No computational value.
        adj_matrix (np.array (N, N)): the adjacency matrix of the network
        max_modules (int): Max number of expected labels
        min_modules (int): Min number of expected labels

    Returns
        elbow/knee (int): the knee of the eigenvalues ranked
        ``min_modules .. max_modules`` (largest first), or the midpoint of
        the two bounds when no knee is found.
    '''
    # The eigenvalues come back in no particular order, but the knee search
    # below expects a decreasing curve, so sort them largest first.
    # Only the real parts are used.
    s = np.sort(np.real(np.linalg.eigvals(adj_matrix)))[::-1]

    N = max_modules + 1
    spectrum = s[min_modules:N]
    # x locations; follows the slice length in case the matrix has fewer
    # than N eigenvalues
    ind = np.arange(min_modules, min_modules + len(spectrum))

    kn = KneeLocator(ind, spectrum, S=1.0, curve='convex',
                     direction='decreasing', online=True)

    if kn.knee is None:
        return int((max_modules + min_modules) / 2)
    return int(kn.knee)
def elbow_plot(self, data):
    """Fit KMeans for 1..10 clusters on ``data`` and return the elbow.

    The WCSS (inertia) curve is saved to ``apps/models/kmeans_elbow.png``.
    The ``KneeLocator`` is kept on ``self.kn`` and its knee is logged and
    returned.

    Args:
        data: samples accepted by ``KMeans.fit``.

    Returns:
        The knee of the WCSS curve (``None`` when none is found).

    Raises:
        Exception: on any failure; the original error is logged, its message
            is carried over and it is chained as the cause.
    """
    wcss = []  # within-cluster sum of errors, one entry per cluster count
    try:
        self.logger.info('Start of elbow plotting...')
        cluster_counts = range(1, 11)
        for i in cluster_counts:
            kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
            kmeans.fit(data)
            wcss.append(kmeans.inertia_)

        # Use a dedicated figure and always close it, so repeated calls do
        # not draw over each other or pile up open figures.
        fig, ax = plt.subplots()
        try:
            ax.plot(cluster_counts, wcss)
            ax.set_title('The Elbow Method')
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('WCSS')
            fig.savefig('apps/models/kmeans_elbow.png')
        finally:
            plt.close(fig)

        # find the optimum number of clusters programmatically
        self.kn = KneeLocator(cluster_counts, wcss, curve='convex',
                              direction='decreasing')
        self.logger.info('The optimum number of clusters is: ' +
                         str(self.kn.knee))
        self.logger.info('End of elbow plotting...')
        return self.kn.knee
    except Exception as e:
        self.logger.exception('Exception raised while elbow plotting:' +
                              str(e))
        raise Exception(str(e)) from e
def calc_top_degron_threshold(result, step=20, max_thresh=8000):
    """See how many top degron potential sequences need to be dropped before
    the bag of amino acids and position specific models provide similar
    results.

    For every threshold ``t`` the first and last ``t`` rows of ``result`` are
    dropped and the difference in ROC AUC between the
    ``'sequence position specific'`` and ``'bag of words'`` scores is
    computed. The knee of that delta-AUC curve is returned.

    Args:
        result: DataFrame with ``'y'``, ``'sequence position specific'`` and
            ``'bag of words'`` columns.
        step: spacing between candidate thresholds.
        max_thresh: exclusive upper bound of the candidate thresholds; it is
            additionally capped at half the number of rows so that the
            trimmed frame is never empty.

    Returns:
        The knee threshold (``None`` when no knee is found).

    Raises:
        ValueError: if ``result`` is too short for even one threshold.
    """
    upper = min(max_thresh, len(result) // 2)
    possible_thresh = np.arange(step, upper, step)
    if possible_thresh.size == 0:
        raise ValueError(
            'result has too few rows ({}) for a threshold step of {}'.format(
                len(result), step))

    # get the delta auc for every threshold
    output_list = []
    for thresh in possible_thresh:
        tmp = result.iloc[thresh:-thresh]
        pos_auc = metrics.roc_auc_score(tmp['y'],
                                        tmp['sequence position specific'])
        bag_auc = metrics.roc_auc_score(tmp['y'], tmp['bag of words'])
        output_list.append([thresh, pos_auc - bag_auc])

    # figure out the knee
    result_df = pd.DataFrame(output_list, columns=['threshold', 'delta auc'])
    knee_obj = KneeLocator(result_df['threshold'], result_df['delta auc'],
                           curve='convex', direction='decreasing')
    return knee_obj.knee
def getEpsilon(train_data):
    """Collect candidate epsilon values from a sorted neighbour-distance curve.

    A 4-neighbour query is run on ``train_data``; the distance matrix is
    sorted column-wise and column 1 is used as the curve. For each
    sensitivity the first elbow's y-value is taken as a candidate. A
    candidate is skipped (an empty line is printed) when the previous
    candidate minus the new one is at most 0.001. If the locator fails, the
    error is printed and a fallback of ``previous + S / 10`` (or ``S / 10``
    for the first entry) is appended instead.

    Returns:
        List of candidate epsilon values.
    """
    finder = sklearn.neighbors.NearestNeighbors(n_neighbors=4)
    distances, _ = finder.fit(train_data).kneighbors(train_data)
    y = np.sort(distances, axis=0)[:, 1]
    x = list(np.arange(0, len(y)))

    epsilons = []
    for s in (1, 3, 5, 10, 20, 40, 60, 80, 100, 120, 150, 180, 200, 250, 300,
              350, 400):
        try:
            kneedle = KneeLocator(x, y, S=s, curve='convex',
                                  direction='increasing')
            epsilon = kneedle.all_elbows_y[0]
            close_to_previous = (bool(epsilons)
                                 and epsilons[-1] - epsilon <= 0.001)
            if close_to_previous:
                print("")
            else:
                epsilons.append(epsilon)
        except Exception as e:
            print(e)
            if epsilons:
                epsilons.append(epsilons[-1] + s / 10)
            else:
                epsilons.append(s / 10)

    return epsilons
def random_Crap():
    """Print the K-means elbow for the player statistics data set.

    Reads ``datasets/reg_season_advanced.json`` and ``datasets/data.json``.
    For each person, the values from position 5 onwards of the second stats
    entry are combined with the height in inches (parsed from a
    ``feet-inches`` string). The rows are min-max scaled, KMeans is fitted
    for 1..14 clusters and the knee of the inertia curve is printed
    (``None`` when no knee is found).
    """
    # context managers so both file handles are closed again
    with open("datasets/reg_season_advanced.json") as stats_file:
        json_object = json.load(stats_file)
    with open("datasets/data.json") as bio_file:
        json_object_height = json.load(bio_file)

    data = []
    for person, values in json_object.items():
        values_to_add = values[1][5:]
        # the bio entry is a dict literal with single quotes; make it JSON
        bio = json.loads(json_object_height[person].replace("'", "\""))
        height_array = bio['height'].split('-')
        final_height_inch = int(height_array[0]) * 12 + int(height_array[1])
        values_to_add.append(final_height_inch)
        data.append(values_to_add)

    mms = MinMaxScaler()
    mms.fit(data)
    data_transformed = mms.transform(data)

    Sum_of_squared_distances = []
    K = range(1, 15)
    for k in K:
        km = KMeans(n_clusters=k).fit(data_transformed)
        Sum_of_squared_distances.append(km.inertia_)

    kneedle = KneeLocator(K, Sum_of_squared_distances, S=1.0, curve='convex',
                          direction='decreasing')
    # round(None) would raise when the curve has no knee
    knee = kneedle.knee
    print(None if knee is None else round(knee, 3))
def elbowPointLocate():
    """Return the K-means inertia curve of the global ``data`` as JSON.

    KMeans is fitted for k = 1..19 on every column of ``data`` except
    ``labeldata``. The knee of the inertia curve is printed. As a side
    effect, ``data["labeldata"]`` is set to the labels of the last fit
    (k = 19).

    Returns:
        ``jsonify({'data': [{'x': k, 'y': inertia}, ...]})``.
    """
    global data

    # Fit on the features only: the label column written by an earlier fit
    # (or an earlier call) must not become an input of the next clustering.
    features = data.drop(columns=["labeldata"], errors="ignore")

    elbowPoint = []
    kmeans = None
    for k in range(1, 20):
        kmeans = KMeans(n_clusters=k, max_iter=1000).fit(features)
        elbowPoint.append((k, kmeans.inertia_))
    # keep the original side effect: labels of the last fit end up in `data`
    data["labeldata"] = kmeans.labels_

    createElbow = pd.DataFrame(elbowPoint, columns=["x", "y"])
    kn = KneeLocator(createElbow.x, createElbow.y, curve='convex',
                     direction='decreasing')
    print("*****************ELBOW KNEE VALUE*********************")
    print(kn.knee)
    print("**********************************************")

    findElbow = {'data': createElbow.to_dict(orient='records')}
    return jsonify(findElbow)
def kmeans_elbow(points, range_, title):
    """Plot the K-means inertia curve of ``points`` and return its knee.

    The points are min-max scaled and KMeans is fitted for
    k = 1 .. ``range_ - 1``. The inertia curve is shown and its knee
    (sensitivity 2.0) is returned.

    Args:
        points: samples accepted by ``MinMaxScaler.fit_transform``.
        range_: exclusive upper bound of the cluster counts tried.
        title: text appended to the plot title.

    Returns:
        The knee of the inertia curve (``None`` when none is found).
    """
    points_scaled = MinMaxScaler().fit_transform(points)

    clusters_n = range(1, range_)
    inertia = []
    for k in clusters_n:
        kmeans = KMeans(n_clusters=k, random_state=5221)
        kmeans.fit(points_scaled)
        # No predict() here: the previous call used the unscaled points on a
        # model fitted on scaled ones, and its result was never used.
        inertia.append(kmeans.inertia_)

    plt.figure(figsize=(10, 6))
    plt.plot(clusters_n, inertia)
    plt.scatter(clusters_n, inertia, marker='x', c='r', s=100,
                label='Inertia')
    plt.legend()
    plt.xlabel('K')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k (' + title + ' )')
    plt.show()

    kn = KneeLocator(clusters_n, inertia, S=2.0, curve='convex',
                     direction='decreasing')
    return kn.knee
def distance_curve(distances, mode='show'):
    """Plot the distance curve with knee candidates, then show or save it.

    The curve uses ``distances`` on the x-axis and the position of each value
    on the y-axis. A knee is searched for several sensitivities and each one
    found is marked with a dashed vertical line.

    :param distances: sequence of distance values
    :param mode: ``'show'`` displays the figure; any other value saves it to
        ``distance_curve.png``
    :return: None
    """
    sensitivity = [1, 3, 5, 10, 100, 150]
    y = list(range(len(distances)))
    knees = [KneeLocator(distances, y, S=s).knee for s in sensitivity]

    plt.style.use('ggplot')
    fig = plt.figure(figsize=(10, 10))
    plt.plot(distances, y)

    colors = ['r', 'g', 'k', 'm', 'c', 'b', 'y']
    for k, c, s in zip(knees, colors, sensitivity):
        if k is None:
            # no knee at this sensitivity, so there is nothing to mark
            continue
        plt.vlines(k, 0, len(distances), linestyles='--', colors=c,
                   label=f'S = {s}')
    plt.legend()

    if mode == 'show':
        plt.show()
    else:
        plt.savefig("distance_curve.png")
        # the figure is only needed for the file; do not leave it open
        plt.close(fig)