def find_eps(data, k, metric):
    """Estimate a neighbourhood radius from the elbow of the k-distance curve.

    A ``NearestNeighbors`` model with ``n_neighbors=k`` is fitted on ``data``
    and queried with the same ``data``.  The distance in the last returned
    column (``k - 1``) is collected for every sample, sorted in decreasing
    order, and a convex/decreasing ``KneeLocator`` is run on ranks 20..499 of
    that curve (the window is hard-coded).  The normalized knee plot is drawn
    as a side effect.

    Parameters
    ----------
    data : array-like
        Samples passed to ``NearestNeighbors.fit`` / ``kneighbors``.
    k : int
        Number of neighbours to query; the ``k``-th returned distance is used.
    metric : str or callable
        Forwarded to ``NearestNeighbors``.

    Returns
    -------
    float
        The sorted k-distance at the detected elbow rank.

    Raises
    ------
    ValueError
        If ``KneeLocator`` finds no elbow in the inspected window.
    """
    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(data)
    distances, _ = nbrs.kneighbors(data)
    distanceDec = sorted(distances[:, k - 1], reverse=True)

    window = distanceDec[20:500]
    # The x-axis is the rank in the sorted curve.  The neighbour-index column
    # is not used for this: it only coincides with the rank when every sample
    # is returned as its own first neighbour.
    ranks = list(range(20, 20 + len(window)))

    knee = KneeLocator(ranks, window, direction="decreasing", curve="convex")
    knee.plot_knee_normalized()

    if knee.elbow is None:
        raise ValueError(
            "KneeLocator found no elbow in the k-distance curve; "
            "cannot derive eps"
        )
    # ``knee.elbow`` is an x value, i.e. a rank into the full sorted curve.
    return distanceDec[knee.elbow]
def find_eps(data, k, metric):
    """Estimate a neighbourhood radius from the elbow of the k-distance curve.

    A ``NearestNeighbors`` model with ``n_neighbors=k`` is fitted on ``data``
    and queried with the same ``data``.  The distance in the last returned
    column (``k - 1``) is collected for every sample and sorted in decreasing
    order.  A convex/decreasing ``KneeLocator`` (``S=1.0``) is run on the
    first 90% of that curve, and the normalized knee plot is drawn as a side
    effect.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Samples passed to ``NearestNeighbors.fit`` / ``kneighbors``.
    k : int
        Number of neighbours to query; the ``k``-th returned distance is used.
    metric : str or callable
        Forwarded to ``NearestNeighbors``.

    Returns
    -------
    tuple
        ``(eps, knee)`` where ``eps`` is the sorted k-distance at the elbow
        rank and ``knee`` is the fitted ``KneeLocator``.

    Raises
    ------
    ValueError
        If ``KneeLocator`` finds no elbow.  Without this check the ndarray
        would be indexed with ``None`` and silently return the whole curve
        with an extra axis instead of a scalar.
    """
    end = round(data.shape[0] * 0.9)
    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(data)
    distances, _ = nbrs.kneighbors(data)
    distanceDec = np.array(sorted(distances[:, k - 1], reverse=True))

    head = distanceDec[:end]
    # The x-axis is the rank in the sorted curve.  The neighbour-index column
    # is not used for this: it only coincides with the rank when every sample
    # is returned as its own first neighbour.
    ranks = np.arange(len(head))

    knee = KneeLocator(ranks, head, curve="convex", direction="decreasing", S=1.0)
    knee.plot_knee_normalized()

    if knee.elbow is None:
        raise ValueError(
            "KneeLocator found no elbow in the k-distance curve; "
            "cannot derive eps"
        )
    return distanceDec[knee.elbow], knee
def estimate_epsilon(X_embedded, sensitivity, plot=False):
    """Estimate an epsilon from the knee of the sorted neighbour distances.

    A ``NearestNeighbors`` model with ``n_neighbors=2`` is fitted on
    ``X_embedded`` and queried with the same points.  The distance matrix is
    sorted column-wise, the upper half of the second column is kept, and a
    convex/increasing ``KneeLocator`` (``S=1.0``, polynomial interpolation)
    is run on it.

    Parameters
    ----------
    X_embedded : array-like
        Points passed to ``NearestNeighbors.fit`` / ``kneighbors``.
    sensitivity : float
        Multiplier applied to the distance found at the knee.
    plot : bool, optional
        When true, plot the retained distances and the normalized knee curve.

    Returns
    -------
    float
        ``sensitivity`` times the retained distance at the knee position.

    Raises
    ------
    ValueError
        If ``KneeLocator`` finds no knee.  Without this check the ndarray
        would be indexed with ``None`` and the function would silently return
        a scaled copy of the whole curve instead of a scalar.
    """
    nbrs = NearestNeighbors(n_neighbors=2).fit(X_embedded)
    distances, _ = nbrs.kneighbors(X_embedded)

    distances = np.sort(distances, axis=0)
    # Keep only the upper half of the sorted second-neighbour column.
    distances = distances[distances.shape[0] // 2:, 1]
    positions = np.arange(distances.shape[0])

    # get the elbow
    kneedle = KneeLocator(
        positions,
        distances,
        S=1.0,
        curve='convex',
        direction='increasing',
        interp_method='polynomial',
    )

    if plot:
        plt.figure()
        plt.plot(distances)
        kneedle.plot_knee_normalized()

    if kneedle.knee is None:
        raise ValueError(
            "KneeLocator found no knee in the neighbour-distance curve; "
            "cannot estimate epsilon"
        )
    # ``positions`` is 0..n-1, so the knee x value doubles as an index.
    return sensitivity * distances[kneedle.knee]
def _ks_size_factor(n, m):
    """Return ``sqrt((n + m) / (n * m))`` for two sample sizes ``n`` and ``m``."""
    return np.sqrt((n + m) / (n * m))


def _draw_pvalue_threshold(adjusted_pvalue, pvals_adjusted_fractions, last_tick):
    """Draw the red dotted p-value reference on the current axes."""
    if adjusted_pvalue:
        # ``plt.plot`` takes the singular ``color`` / ``linestyle`` keywords;
        # the plural forms are only valid for ``plt.hlines``.
        plt.plot(pvals_adjusted_fractions, color='r', linestyle='dotted')
    else:
        plt.hlines(0.05, colors='r', linestyles='dotted', xmin=0.0, xmax=last_tick)


def singleDistributionTest(path_in='./data',
                           path_out='./outputs',
                           adjusted_pvalue=False,
                           plot_all=False,
                           plot_legend=False,
                           num_fractions=10,
                           min_fraction=0.1):
    """Compare every CSV data set with random subsets of itself (two-sample KS).

    Each ``.csv`` file in ``path_in`` is loaded, rows with missing values are
    dropped and only numeric columns are kept.  For ``num_fractions`` subset
    sizes between 1.0 and ``min_fraction`` a random sample of the rows is
    drawn, and every column of the full data is compared with the same column
    of the sample using ``ks_2samp``.  The KS statistic and ``1 - p-value``
    are recorded, plotted, saved as PDF files in ``path_out`` and shown.
    A ``KneeLocator`` normalized-knee plot is shown for both mean curves.

    Parameters
    ----------
    path_in : str
        Folder with input data in .csv format, './data' by default.
    path_out : str
        Folder for the output images, './outputs' by default.
    adjusted_pvalue : bool
        Draw the size-adjusted p-value curve instead of the fixed 0.05 line,
        False by default.
    plot_all : bool
        Also plot one curve per column; if False only the mean curves are
        plotted.
    plot_legend : bool
        Force a legend on the per-column plots.  A legend is always drawn
        when the data set has fewer than 10 columns.
    num_fractions : int
        Number of subset sizes to evaluate, 10 by default.
    min_fraction : float
        Smallest subset size as a fraction of the rows, 0.1 by default.

    Returns
    -------
    None
    """
    fractions = np.linspace(1.0, min_fraction, num=num_fractions)
    # Tick positions/labels follow the actual number of fractions rather than
    # a fixed 10, so non-default ``num_fractions`` values plot correctly.
    ticks = range(len(fractions))
    tick_labels = [np.round(f, 1) for f in fractions]
    last_tick = len(fractions) - 1

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    datafiles = [
        f for f in listdir(path_in)
        if isfile(join(path_in, f)) and f.endswith('.csv')
    ]
    print(datafiles)

    dfs = [
        pd.read_csv(join(path_in, name)).dropna().select_dtypes(include=numerics)
        for name in datafiles
    ]

    for name, df in zip(datafiles, dfs):
        n_rows = len(df)
        n_cols = len(df.columns)
        self_factor = _ks_size_factor(n_rows, n_rows)
        p_adjust = 1 / self_factor
        alpha = 0.05 / self_factor if n_rows < 5000 else 1.037

        stats_fractions = []
        pvals_fractions = []
        ks_vals = []
        pvals_adjusted_fractions = []
        for frac in fractions:
            df_frac = df.sample(frac=frac)
            stats = []
            pvals = []
            for c in df.columns:
                kst = ks_2samp(df[c].values, df_frac[c].values, mode='asymp')
                stats.append(kst[0])
                # Stored as one minus the KS p-value.
                pvals.append(1 - kst[1])
            size_factor = _ks_size_factor(len(df_frac), n_rows)
            stats_fractions.append(stats)
            pvals_fractions.append(pvals)
            ks_vals.append(alpha * size_factor)
            pvals_adjusted_fractions.append(0.05 * size_factor * p_adjust)
        print(pvals_adjusted_fractions)

        stats_fractions = np.asarray(stats_fractions)
        pvals_fractions = np.asarray(pvals_fractions)
        show_legend = plot_legend or n_cols < 10

        if plot_all:
            for i in range(n_cols):
                plt.plot(stats_fractions[:, i])
            plt.xticks(ticks, tick_labels)
            plt.title(name)
            if show_legend:
                plt.legend(df.columns)
            plt.plot(ks_vals, color='r', linestyle='dotted')
            plt.savefig(f"{path_out}/{name} KS stats all.pdf", bbox_inches='tight')
            plt.show()

            for i in range(n_cols):
                plt.plot(pvals_fractions[:, i])
            plt.xticks(ticks, tick_labels)
            plt.title(name)
            if show_legend:
                plt.legend(df.columns)
            _draw_pvalue_threshold(adjusted_pvalue, pvals_adjusted_fractions, last_tick)
            plt.savefig(f"{path_out}/{name} pvals all.pdf", bbox_inches='tight')
            plt.show()

        stats_mean = np.mean(stats_fractions, axis=1)
        pvals_mean = np.mean(pvals_fractions, axis=1)

        plt.plot(stats_mean)
        plt.plot(ks_vals, color='r', linestyle='dotted')
        plt.xticks(ticks, tick_labels)
        plt.title(name + ' KS stats mean')
        plt.savefig(f"{path_out}/{name} KS stats mean.pdf", bbox_inches='tight')
        plt.show()

        kneedle = KneeLocator(stats_mean, ticks, S=1.0,
                              curve='convex', direction='increasing')
        kneedle.plot_knee_normalized()
        plt.show()

        plt.plot(pvals_mean)
        plt.xticks(ticks, tick_labels)
        plt.title(name + ' p-values mean')
        _draw_pvalue_threshold(adjusted_pvalue, pvals_adjusted_fractions, last_tick)
        # Own file name: saving under "KS stats mean" would overwrite the
        # KS-statistic figure written above.
        plt.savefig(f"{path_out}/{name} pvals mean.pdf", bbox_inches='tight')
        plt.show()

        kneedle = KneeLocator(pvals_mean, ticks, S=1.0,
                              curve='convex', direction='increasing')
        kneedle.plot_knee_normalized()
        plt.show()
# Select the highest-variance rows of the table with the Kneedle elbow and
# write them to a new CSV file.
data = pd.read_csv("combat_adjusted_minimal.csv", index_col=0)

# calculate variance (per row, across columns), largest first
variance = data.var(axis=1).sort_values(ascending=False)

# x and y values: rank position against variance
x = np.arange(len(variance))
y = variance.values

# Elbow finder.  Only the S=2 fit is kept; an earlier S=50 fit was overwritten
# immediately and never used.
kneedle = KneeLocator(x, y, S=2, curve='convex', direction='decreasing')

# Plot
kneedle.plot_knee_normalized()
sns.set_style("white")
kneedle.plot_knee()

# Print results (bare expressions print nothing when run as a script)
print(kneedle.elbow)
print(kneedle.knee_y)

# Subset the genes: keep the rows ranked before the elbow.
# NOTE(review): if no elbow is found ``kneedle.elbow`` is presumably None, in
# which case this slice keeps every row - confirm that is acceptable.
mvg = variance.iloc[:kneedle.elbow].index
mvg = data.loc[mvg, :]

# write out the data
mvg.to_csv("mvg_knee.csv")
##############################################################################
print("The corresponding Within-Cluster-Sum of Squared Errors (WSS):", kneedle.knee_y)

# %%
# Plot Number of clusters against Within-Cluster-Sum of Squared Errors
kneedle.plot_knee(figsize=plt_cfg.figsize)
plt.xlabel("Number of clusters")
plt.ylabel("Within-Cluster-Sum of Squared Errors")
# One tick per integer between the smallest and largest value in list_k.
plt.xticks(np.arange(min(list_k), max(list_k) + 1))
plt.tight_layout()
plt.savefig("results/knee.png")
plt.show()

# %%
# Plot the normalized knee curves
kneedle.plot_knee_normalized(figsize=plt_cfg.figsize)
plt.tight_layout()
plt.savefig("results/knee_normalized.png")
plt.show()

# %% [markdown]
# ### The Silhouette Method
#
# The silhouette value measures how similar a point is to its own cluster (cohesion) compared to other clusters (separation).

# %%
# Mean silhouette coefficient against the number of clusters, on a new figure.
plt.subplots(figsize=plt_cfg.figsize)
ax = sns.lineplot(x="n_clusters", y="mean_sil_coeff", data=km_stat)
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Mean Silhouette Coefficient")
plt.xticks(np.arange(min(list_k), max(list_k) + 1))
plt.tight_layout()
from scipy.interpolate import interp1d

# Read the stored SSE values; the keys are converted to int cluster counts.
with open("sse_minibatch.json", "r") as f:
    sse_ = json.load(f)

n_clusters = sorted(int(key) for key in sse_)
sse = {int(key): value for key, value in sse_.items()}

# Curve to analyse: cluster count (x) against SSE (y), in ascending x order.
x = n_clusters
y = [sse[count] for count in x]

# NOTE: an interp1d-based resampling experiment used to live here and is
# currently disabled; the raw (x, y) points are passed to KneeLocator as-is.

kneedle = KneeLocator(
    x,
    y,
    S=1.0,
    curve='convex',
    direction='decreasing',
    online=True,
    interp_method="polynomial",
)
print(kneedle.knee)
print(kneedle.knee_y)

# Save both the raw and the normalized knee plots.
plt.style.use('fivethirtyeight')
kneedle.plot_knee(figsize=(18, 7))
plt.savefig("knee.png")
kneedle.plot_knee_normalized(figsize=(18, 7))
plt.savefig("knee_normal.png")