Exemplo n.º 1
0
def find_epsilon(feature_file, min_samp, kupa, number_of_features, dest):
    """Estimate epsilon as the knee of the sorted k-neighbour distance curve.

    The feature matrix is loaded with ``np.loadtxt`` and standardised, the
    distance stored in column ``min_samp`` of the neighbour table is sorted
    in ascending order, and the knee of that curve is used as epsilon. The
    knee plot is written to
    ``{dest}results/Elbow_{kupa}_Mfcc_{number_of_features}_features.png``.

    Args:
        feature_file: Path of a text file readable by ``np.loadtxt``.
        min_samp: Neighbour rank whose distance is used for the curve.
        kupa: Label used in the plot title and output file name.
        number_of_features: Feature count, used only for title/file name.
        dest: Path prefix prepended verbatim to ``results/...``.

    Returns:
        The distance value at the detected knee.

    Raises:
        ValueError: If ``KneeLocator`` does not find a knee.
    """
    X = StandardScaler().fit_transform(np.loadtxt(feature_file))

    # min_samp + 1 neighbours are requested so that column ``min_samp``
    # exists in the returned distance table.
    model = NearestNeighbors(n_neighbors=min_samp + 1).fit(X)
    distances, _ = model.kneighbors(X)
    distances = np.sort(distances[:, min_samp], axis=0)

    knee = KneeLocator(np.arange(len(distances)),
                       distances,
                       S=1,
                       curve='convex',
                       direction='increasing',
                       interp_method='polynomial')
    if knee.knee is None:
        # Indexing with None would silently return a 2-D view of the whole
        # curve instead of a scalar epsilon.
        raise ValueError(
            f"No knee found in the {min_samp}-neighbour distance curve "
            f"of {feature_file}")
    epsilon = distances[knee.knee]

    # plot_knee opens its own figure; pass the size here instead of creating
    # a separate (empty, never closed) figure beforehand.
    knee.plot_knee(figsize=(5, 5))
    plt.title(
        f"Elbow for {kupa}\nMfcc-{number_of_features}-features\nEpsilon={epsilon}"
    )
    plt.xlabel("Points")
    plt.ylabel("Distance")
    plt.savefig(
        f'{dest}results/Elbow_{kupa}_Mfcc_{number_of_features}_features.png')
    plt.close()
    print(epsilon)
    return epsilon
Exemplo n.º 2
0
def feature_selection(data, target, method=c.XGB, verbose=False):
    """Select columns of ``data`` that are informative for ``target``.

    With ``method == c.COR`` the labels of the rows of ``data.corr()`` whose
    correlation with ``target`` lies strictly between 0.2 and 0.8 are
    returned. Any other ``method`` trains an XGBoost regressor, ranks the
    features by normalised ``total_cover`` importance and keeps the ones in
    front of the knee of that ranking, followed by ``target`` itself.

    Args:
        data: DataFrame containing the feature columns and ``target``.
        target: Name of the target column.
        method: ``c.COR`` for the correlation filter, anything else for the
            XGBoost ranking.
        verbose: When true, show the heatmap (and the knee plot in the
            XGBoost branch).

    Returns:
        A list of column labels.
    """
    if method == c.COR:
        corr_matrix = data.corr()
        if verbose:
            sns.heatmap(corr_matrix, cmap='Blues', annot=True)
            plt.show()

        target_corr = corr_matrix[target]
        in_band = (target_corr > 0.2) & (target_corr < 0.8)
        return corr_matrix.loc[in_band].index.tolist()

    booster_params = {
        'eta': 0.05,
        'max_depth': 10,
        'subsample': 1.0,
        'colsample_bytree': 0.7,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse'
    }

    # Work on a copy so the caller's frame keeps its target column.
    predictors = data.copy(deep=True)
    labels = predictors.pop(target)

    dtrain = xgb.DMatrix(predictors,
                         labels,
                         feature_names=predictors.columns.values)
    booster = xgb.train(booster_params, dtrain, num_boost_round=1000)
    scores = booster.get_score(importance_type='total_cover')

    # One row per feature, sorted by importance and scaled to sum to 1.
    imp = pd.DataFrame(scores, index=range(1)).T
    imp.columns = ["Importance"]
    imp = imp.sort_values(by=["Importance"], ascending=False)
    imp /= imp.sum()

    if verbose:
        sns.heatmap(imp, cmap='Blues', annot=True)
        plt.show()

    imp["x"] = range(len(imp))

    # online=True is usually a good setting, but consider dropping it if the
    # resulting accuracy is poor.
    kneedle = KneeLocator(imp.x,
                          imp.Importance,
                          curve="convex",
                          direction="decreasing",
                          online=True)
    if verbose:
        kneedle.plot_knee()

    selected = imp.iloc[0:kneedle.knee].index.tolist()
    selected.append(target)
    return selected
def find_eps(dataset):
    """Return the knee of the sorted 6-neighbour distance curve of ``dataset``.

    Six neighbours are queried for every sample, column 5 of the distance
    table is sorted in ascending order, and the distance at the knee of that
    curve is returned. The knee plot is saved to ``Distance_curve.png``.

    Args:
        dataset: Sample matrix accepted by ``NearestNeighbors.fit``.

    Returns:
        The distance value at the detected knee.

    Raises:
        ValueError: If ``KneeLocator`` does not find a knee.
    """
    model = NearestNeighbors(n_neighbors=6).fit(dataset)
    distances, _ = model.kneighbors(dataset)
    distances = np.sort(distances[:, 5], axis=0)

    # Plain distance curve (left open; the plt.show() call was disabled).
    plt.figure(figsize=(5, 5))
    plt.plot(distances)
    plt.xlabel("Points")
    plt.ylabel("Distance")

    knee = KneeLocator(np.arange(len(distances)),
                       distances,
                       S=1,
                       curve='convex',
                       direction='increasing',
                       interp_method='polynomial')
    if knee.knee is None:
        # distances[None] would return a 2-D array rather than a scalar.
        raise ValueError("No knee found in the 6-neighbour distance curve")

    # plot_knee creates its own figure, so size it here rather than opening
    # an extra empty figure first.
    knee.plot_knee(figsize=(5, 5))
    plt.xlabel("Points")
    plt.ylabel("Distance")
    plt.savefig("Distance_curve.png", dpi=300)
    return distances[knee.knee]
Exemplo n.º 4
0
def knee_filt(data):
    """Group the entries of ``data`` whose log-gap reaches the CDF knee.

    For every ``(a, b)`` value pair the score ``log(|a - b| + 1)`` is
    computed. The knee of the cumulative 80-bin histogram of those scores
    serves as threshold; entries scoring at or above it are grouped by
    ``cal_dev(a, b)``.

    Args:
        data: Mapping from attribute to a two-element pair.

    Returns:
        ``defaultdict(list)`` mapping each ``cal_dev`` result to the list of
        attributes that produced it.
    """
    log_gaps = [
        np.log(np.abs(first - second) + 1) for first, second in data.values()
    ]
    counts, edges = np.histogram(log_gaps, 80)

    # Cumulative counts plotted against the right edge of every bin.
    locator = KneeLocator(edges[1:],
                          np.cumsum(counts),
                          S=1.0,
                          curve='concave',
                          direction='increasing',
                          online=True)
    locator.plot_knee()
    threshold = locator.knee

    grouped = defaultdict(list)
    for key, pair in data.items():
        score = np.log(np.abs(pair[0] - pair[1]) + 1)
        if score >= threshold:
            grouped[cal_dev(pair[0], pair[1])].append(key)
    return grouped
Exemplo n.º 5
0
    def Power_decrease_rate(self, Fs, cutoff, plot: bool):
        """Compute the rate at which the low-pass filtered power decreases.

        ``self.power`` is low-pass filtered (order 1) and sampled at
        ``self.valley_id``. A knee is searched in the trailing ~10 % of those
        samples, and the drop from the filtered maximum to the knee is
        divided by the time between the two.

        Side effects: sets ``self.P_max`` and ``self.P_inflection``.

        Args:
            Fs: Sampling frequency forwarded to the low-pass filter.
            cutoff: Cut-off frequency forwarded to the low-pass filter.
            plot: When true, show the knee plot.

        Returns:
            Tuple ``(Pv, delta_P, P_max)``: drop rate, drop between the
            maximum and the knee, and the filtered maximum.
        """
        valley_times = self.time[self.valley_id].values

        filtered = effectiv_trans.butter_lowpass_filter(data=self.power,
                                                        cutoff=cutoff,
                                                        Fs=Fs,
                                                        order=1)
        filtered = filtered[self.valley_id]
        filtered_series = pd.Series(filtered, index=valley_times)

        # Knee detection on the last ~10 % of the valley samples. When the
        # rounded count is 0 the slice [-0:] covers the whole array.
        kl = KneeLocator(valley_times[-round(len(valley_times) * 0.1):],
                         filtered[-round(len(filtered) * 0.1):],
                         curve="concave",
                         direction="decreasing")

        if plot:
            kl.plot_knee()
            plt.show()

        inflection_pt_id = kl.knee
        # The series is indexed by time, so idxmax is the time of the maximum.
        max_id = filtered_series.idxmax()

        self.P_max = max(filtered_series)
        self.P_inflection = kl.knee_y

        delta_P = self.P_max - self.P_inflection
        delta_t = abs(max_id - inflection_pt_id)
        Pv = delta_P / delta_t

        return Pv, delta_P, self.P_max
def perform_pca(features, datasetLabel):
    """Fit a PCA on ``features`` and plot the elbow of its explained variance.

    The elbow is printed and the knee plot is saved to
    ``plots/dr/pca/<datasetLabel>/variance_pca.png``; the current figure is
    cleared afterwards.

    Args:
        features: Two-dimensional sample matrix passed to ``PCA.fit``.
        datasetLabel: Sub-directory name used in the output path.

    Returns:
        None.
    """
    data_PCA = PCA(random_state=120)
    data_variance = data_PCA.fit(features).explained_variance_

    # One x value per fitted component. Sizing the axis from the variance
    # vector (not from features.shape[1]) keeps x and y the same length when
    # there are fewer samples than features.
    plot_features = np.arange(start=1, stop=len(data_variance) + 1)

    kl = KneeLocator(
        plot_features, data_variance, curve="convex", direction="decreasing"
    )
    print(kl.elbow)
    kl.plot_knee()
    plt.xlabel('Features')
    plt.ylabel('Variance')
    plt.title('Variance vs features')
    plt.grid(True)
    plt.savefig('plots/dr/pca/'+datasetLabel+'/variance_pca.png')
    plt.clf()
Exemplo n.º 7
0
    def set_n_pcs(self, min_n_pcs=5):
        """Choose the number of principal components from the variance knee.

        The knee is located on ``1 - variance_ratio`` with sensitivity
        ``self.pca_s``; ``self.n_pcs`` becomes the 1-based knee position,
        but never less than ``min_n_pcs``. Two figures are saved below
        ``self.out`` and the leading ``n_pcs`` columns of ``X_pca`` are
        stored in ``self.adata.obsm['X_pcs']``.

        Args:
            min_n_pcs: Lower bound for the number of components kept.
        """
        variance_ratio = np.array(self.adata.uns['pca']['variance_ratio'])
        pc_index = np.arange(len(variance_ratio))

        # Knee detection on the complemented ratios (0-based x axis).
        kneedle = KneeLocator(pc_index,
                              1 - variance_ratio,
                              S=self.pca_s,
                              curve='concave',
                              direction='increasing')
        knee_one_based = kneedle.knee + 1
        self.n_pcs = max(knee_one_based, min_n_pcs)

        # Variance-ratio curve (1-based x axis) with the chosen cut-off.
        fig, ax = plt.subplots()
        ax.plot(pc_index + 1, variance_ratio, '-')
        ax.axvline(x=self.n_pcs, color='red')
        ax.set_xlabel('PC')
        ax.set_ylabel('PCA variance ratio')
        ax.set_title('n_pcs={}, S={}'.format(self.n_pcs, self.pca_s))
        cutoff_path = os.path.join(self.out,
                                   'pca/pca_variance_ratio_cutoff.png')
        fig.savefig(cutoff_path)

        kneedle.plot_knee()
        knee_path = os.path.join(self.out, 'pca/pca_kneedle.png')
        plt.savefig(knee_path)

        # Representation restricted to the selected top components.
        top_pcs = self.adata.obsm['X_pca'][:, :self.n_pcs]
        self.adata.obsm['X_pcs'] = top_pcs
        return
Exemplo n.º 8
0
def findKneeValue(mu):
    """Estimate epsilon from the knee of the sorted ``mu``-neighbour distances.

    The features in ``globals.feature_list_q`` are fitted with a
    ``mu``-nearest-neighbour model, the distances in column ``mu - 1`` are
    sorted in ascending order, and the curve value at the knee is returned.
    The knee plot, the distance curve and a dashed line at epsilon are drawn
    on the current figure.

    Args:
        mu: Number of neighbours to query.

    Returns:
        The interpolated distance at the knee.
    """
    # NOTE(review): ``globals`` is presumably a project module that shadows
    # the builtin of the same name - confirm.
    feature_list_np = np.array(globals.feature_list_q)

    nbrs = NearestNeighbors(n_neighbors=mu).fit(feature_list_np)
    dist, _ = nbrs.kneighbors(feature_list_np, return_distance=True)
    distanceDec = sorted(dist[:, mu - 1], reverse=False)

    # 1-based sample positions, sized from the data itself rather than from
    # a directory listing, so x and y always have the same length.
    sample_points = list(range(1, len(distanceDec) + 1))

    kn = KneeLocator(sample_points,
                     distanceDec,
                     curve='convex',
                     direction='increasing')
    epsilon = np.interp(kn.knee, sample_points, distanceDec)

    kn.plot_knee()
    plt.xlabel('Sample points')
    plt.ylabel('Epsilon')
    plt.plot(sample_points, distanceDec)
    plt.hlines(epsilon, plt.xlim()[0], plt.xlim()[1], linestyles='dashed')
    print("Knee is at : {},{}".format(kn.knee,epsilon))

    return epsilon
                                                     dimensions=(1, 2, 3, 4))
plt.show()

# K-means

# Elbow method: the SSE (sum of squared errors) of each model is read from
# KMeans' inertia_ attribute.

sse = []
for k in range(1, 15):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(x_norm)
    sse.append(kmeans.inertia_)

# Locate the elbow of the SSE curve over k = 1..14 and plot it.
kl = KneeLocator(range(1, 15), sse, curve='convex', direction='decreasing')
nbr_cluster = kl.elbow
KneeLocator.plot_knee(kl)
plt.show()
print(nbr_cluster)

# Print the SSE of a model fitted with a fixed 4 clusters.
# NOTE(review): 4 is hard-coded instead of using nbr_cluster, and no
# random_state is set for this fit - confirm both are intended.

km = KMeans(n_clusters=4)
km.fit(x_norm)
print(km.inertia_)

# Plotting the cluster centers and the data points on a 2D plane

plt.scatter(ml_data['PC1'], ml_data['PC2'])
plt.scatter(km.cluster_centers_[:, 0],
            km.cluster_centers_[:, 1],
            c='red',
Exemplo n.º 10
0
# %%
# Locate the elbow of the WSS curve held in km_stat (columns "n_clusters"
# and "wss"), treating it as convex and decreasing.
kneedle = KneeLocator(km_stat["n_clusters"],
                      km_stat["wss"],
                      S=1.0,
                      curve='convex',
                      direction='decreasing',
                      online=False,
                      interp_method="interp1d")
print("The number of cluster according to elbow method:", kneedle.knee)
print("The corresponding Within-Cluster-Sum of Squared Errors (WSS):",
      kneedle.knee_y)

# %%
# Plot the number of clusters against the Within-Cluster-Sum of Squared
# Errors and save the figure before showing it.
kneedle.plot_knee(figsize=plt_cfg.figsize)
plt.xlabel("Number of clusters")
plt.ylabel("Within-Cluster-Sum of Squared Errors")
# One tick per integer between the smallest and largest value of list_k.
plt.xticks(np.arange(min(list_k), max(list_k) + 1, 1))
plt.tight_layout()
plt.savefig("results/knee.png")
plt.show()

# %%
# Plot the normalized knee curves and save the figure before showing it.
kneedle.plot_knee_normalized(figsize=plt_cfg.figsize)
plt.tight_layout()
plt.savefig("results/knee_normalized.png")
plt.show()

# %% [markdown]
Exemplo n.º 11
0
from scipy.interpolate import interp1d

# Load the SSE values, stored as a JSON object keyed by cluster count.
with open("sse_minibatch.json", "r") as f:
    sse_ = json.load(f)

# JSON keys are strings: convert them to int and order the curve by
# cluster count.
n_clusters = sorted([int(k) for k in sse_.keys()])
sse = {int(k): v for k, v in sse_.items()}
y = [sse[k] for k in n_clusters]
x = n_clusters
# Disabled experiment: interpolating the curve with interp1d before the
# knee search.
# print(x)
# f = interp1d(x, y)
# x_new = np.arange(10, max(n_clusters)+1, 5)
# print(x_new)
# y_new = f(x_new)
# plt.plot(x, y, 'o', x_new, y_new, '-')
# plt.savefig("interp1d.png")
# slope = get_1st_deriviatives(sse)
# for i, j in zip(x_new, y_new):
#     print(i,j)

# # # plt.style.use('fivethirtyeight')
# Knee of the convex, decreasing SSE curve (online mode, polynomial fit).
kneedle = KneeLocator(x, y, S=1.0, curve='convex', direction='decreasing', online=True, interp_method="polynomial")
print(kneedle.knee)
print(kneedle.knee_y)
# Save the raw and the normalized knee plots.
plt.style.use('fivethirtyeight')
kneedle.plot_knee(figsize=(18, 7))
plt.savefig("knee.png")

kneedle.plot_knee_normalized(figsize=(18, 7))
plt.savefig("knee_normal.png")
# Using df['close'] as the input array for clustering can also give
# supports and resistances.
# NOTE(review): the assignment below is commented out, so X must already be
# defined by code outside the visible lines - confirm.
#X = np.array(df['close'])
# Drop the first element of X.
X = np.delete(X, 0)

# Fit KMeans for k = 1..14 on the values as a single feature column and
# record each model's inertia.
sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(X.reshape(-1, 1))
    sum_of_squared_distances.append(km.inertia_)
# The knee of the inertia curve is used as the number of clusters.
kn = KneeLocator(K,
                 sum_of_squared_distances,
                 S=1.0,
                 curve="convex",
                 direction="decreasing")
kn.plot_knee()
#plt.plot(sum_of_squared_distances)

kmeans = KMeans(n_clusters=kn.knee).fit(X.reshape(-1, 1))
c = kmeans.predict(X.reshape(-1, 1))
# minmax[cluster] = [largest value, smallest value] seen in that cluster;
# the sentinels guarantee the first comparison always replaces them.
minmax = []
for i in range(kn.knee):
    minmax.append([-np.inf, np.inf])
for i in range(len(X)):
    cluster = c[i]
    if X[i] > minmax[cluster][0]:
        minmax[cluster][0] = X[i]
    if X[i] < minmax[cluster][1]:
        minmax[cluster][1] = X[i]
""""
for i in range(len(X)):
def GMM_clustering_R(X_method_df, method, default_cluster_num=None):
    """Check BIC and perform GMM clustering on an embedded dataset via R mclust.

    ``mclustBIC`` is evaluated for 1..11 components, the parametrization with
    the highest BIC is selected and the knee of its BIC curve gives the
    number of components, unless ``default_cluster_num`` overrides it. The
    final model is fitted with ``Mclust``.

    Args:
        X_method_df: DataFrame holding the embedded samples.
        method: Embedding class/function; only ``method.__name__`` is used,
            for plot titles.
        default_cluster_num: Optional number of components that replaces the
            knee-based choice.

    Returns:
        Tuple ``(method_clusters, method_means, method_z,
        method_uncertainty)``: classification array, DataFrame of means with
        columns ``V1..Vn``, ``z`` array and uncertainty array.

    Raises:
        ValueError: If no knee is found and ``default_cluster_num`` is None.
    """
    #First, import r packages and fix random seed:
    base = importr('base')
    mclust = importr('mclust')
    ro.r('set.seed(0)')

    #Now, check BIC and make a plot
    num_components_to_try = pd.Series(np.arange(1,
                                                12))  #try 1 to 11 components
    with localconverter(ro.default_converter + pandas2ri.converter):
        ro.r('set.seed(0)')
        BIC_method = mclust.mclustBIC(X_method_df, G=num_components_to_try)

    model_names = [
        'EII', 'VII', 'EEI', 'VEI', 'EVI', 'VVI', 'EEE', 'EVE', 'VEE', 'VVE',
        'EEV', 'VEV', 'EVV', 'VVV'
    ]
    sns.set(style="darkgrid")
    #     sns.set_palette("tab10")
    BIC_method_df = pd.DataFrame(BIC_method, columns=model_names)
    BIC_method_df = BIC_method_df.dropna(
        axis=1)  #drop parametrizations with NaNs
    #     plt.figure()
    BIC_method_df.plot(marker='o')
    plt.title('GMM BIC on ' + method.__name__)

    #Now, find the knee point of the optimal BIC plot (the best GMM parametrization)
    best_parametrization = BIC_method_df.columns[BIC_method_df.max().argmax()]
    kneedle = KneeLocator(num_components_to_try,
                          BIC_method_df[best_parametrization],
                          S=1,
                          curve='concave',
                          direction='increasing',
                          interp_method='polynomial')
    #     plt.figure()
    kneedle.plot_knee()
    plt.title('GMM BIC on ' + method.__name__ + ': Knee Point')
    plt.xlabel('num_GMM_components')
    plt.ylabel('')
    print('Elbow point: {} components with BIC {}'.format(
        kneedle.knee, kneedle.knee_y))

    #Pick the best number of GMM components (as a 0-based position in
    #num_components_to_try). The override is checked first so that a missing
    #knee does not crash a call that supplies default_cluster_num.
    if default_cluster_num is not None:
        best_num_components = default_cluster_num - 1
    elif kneedle.knee is not None:
        best_num_components = kneedle.knee - 1
    else:
        raise ValueError(
            'No knee found in the BIC curve of {}; pass default_cluster_num'.
            format(best_parametrization))
    with localconverter(ro.default_converter + pandas2ri.converter):
        ro.r('set.seed(0)')
        mc = mclust.Mclust(X_method_df,
                           G=pd.Series(
                               [num_components_to_try[best_num_components]]))
        print(base.summary(mc))
        # NOTE(review): mc[15] is presumably the uncertainty slot of the
        # Mclust result - confirm against the mclust version in use.
        print('Uncertainty quantiles:',
              np.quantile(mc[15], [0, 0.25, 0.5, 0.75, 1]))
        mc_dict = convert_to_python_dict(mc)
        method_model_name = mc_dict['modelName']
        print(method_model_name)
        param = mc_dict['parameters']
        method_means = np.array(convert_to_python_dict(param)['mean'])
        method_uncertainty = np.array(mc_dict['uncertainty'])
        method_z = np.array(convert_to_python_dict(mc)['z'])
        method_clusters = np.array(
            convert_to_python_dict(mc)['classification'])
        method_means = pd.DataFrame(
            method_means,
            columns=['V' + str(i + 1) for i in range(method_means.shape[1])])
    return method_clusters, method_means, method_z, method_uncertainty
def analysis(STATE,
             method,
             method_kwargs,
             hyperparams_to_test,
             fig,
             spec,
             row,
             precomputed=False,
             separate=False,
             two_cols=False,
             NUM_STATES=1,
             configurations=None,
             default_cluster_num=5):
    """Run the embedding, clustering and plotting pipeline for one state.

    Steps, in order: resolve paths and load the data; compute (or, when
    ``precomputed`` is true, load) the embedding results for every tested
    hyperparameter; pick the embedding at the knee of the trustworthiness
    curve over ``n_components``; save it as CSV; draw the initial 2D/3D
    visualisations; run (or load) the GMM clustering; save the per-method
    data; and add the state to ``fig``.

    Args:
        STATE: State identifier, used for paths and object/file names.
        method: Embedding class; ``method.__name__`` selects the
            visualisation helper and names the saved artefacts.
        method_kwargs: Keyword arguments forwarded to ``choose_dimension``.
        hyperparams_to_test: Mapping with at least an ``'n_components'``
            entry listing the dimensions to try.
        fig, spec, row: Forwarded to ``add_state_to_fig``.
        precomputed: When true, load saved embedding and clustering results
            instead of recomputing them.
        separate, two_cols, NUM_STATES: Forwarded to ``add_state_to_fig``;
            ``two_cols`` also replaces the cluster labels by the cosine
            colours.
        configurations: Mapping providing ``'DATA_PATH'``.
            NOTE(review): the default ``None`` is subscripted below, so a
            real mapping appears to be required - confirm.
        default_cluster_num: Forwarded to ``GMM_clustering_R``.

    Returns:
        None; the function has no return statement.
    """
    #First, define appropriate paths
    SHAPE_PATH, FIGURE_PATH, RAW_DATA_PATH, INCOME_POPULATION_PATH = define_paths(
        STATE)

    #Load the data
    covid_, X, index_X, columns_X = load_data(RAW_DATA_PATH)

    #Do dim red
    print('##################D-RED#################')
    emb_method = method
    if not precomputed:
        errors_results, embeddings_results, trustws_results = choose_dimension(
            X, emb_method, hyperparams_to_test, **method_kwargs)

        save_obj(embeddings_results,
                 STATE + '_embeddings_results' + method.__name__)
        save_obj(errors_results, STATE + '_errors_results' + method.__name__)
        save_obj(trustws_results, STATE + '_trustws_result' + method.__name__)
    if precomputed:
        embeddings_results = load_obj(STATE + '_embeddings_results' +
                                      method.__name__)
        errors_results = load_obj(STATE + '_errors_results' + method.__name__)
        trustws_results = load_obj(STATE + '_trustws_result' + method.__name__)

    # Plot the error curve only when several dimensions were tested and
    # errors are available.
    if (len(hyperparams_to_test['n_components']) >
            1) and (errors_results['n_components'][0] is not None):
        plt.plot(hyperparams_to_test['n_components'],
                 errors_results['n_components'])

    # Knee of the trustworthiness curve over the tested dimensions.
    if (len(hyperparams_to_test['n_components']) > 1):
        kneedle = KneeLocator(hyperparams_to_test['n_components'],
                              np.array(trustws_results['n_components']),
                              S=1,
                              curve='concave',
                              direction='increasing',
                              interp_method='polynomial',
                              online=False)
        kneedle.plot_knee()
        plt.title(emb_method.__name__ + ' trustworthiness')
        plt.xlabel('n_components')
        plt.ylabel('trustworhiness')
        # NOTE(review): bare expression; it has no effect when run as a
        # script (likely a notebook leftover).
        kneedle.knee, kneedle.knee_y

    #Save the dataframe with optimal dim
    if (len(hyperparams_to_test['n_components']) > 1):
        # Position of the knee value within the tested dimensions.
        # NOTE(review): the == comparison presumably relies on
        # hyperparams_to_test['n_components'] being a NumPy array - confirm.
        good_dim = int(
            np.squeeze(
                np.where(hyperparams_to_test['n_components'] == kneedle.knee)))
    else:
        good_dim = 0
    X_method = embeddings_results['n_components'][
        good_dim]  #pick the best (knee point) n_components
    X_method_df = pd.DataFrame(
        X_method,
        columns=['Mode {}'.format(i)
                 for i in range(X_method.shape[1])])  #, index = index_X)
    X_method_df.to_csv(
        os.path.join(
            configurations['DATA_PATH'], 'interim',
            method.__name__ + str(X_method.shape[1]) + 'D_' + STATE + '.csv'))
    print('Saving optimal embedding. Method: ', method.__name__, 'shape: ',
          X_method_df.shape)

    print('##################INITIAL VIZ#################')
    #Find the 2D and 3D embeddings and continuous colors based on that
    filename_initial = os.path.join(FIGURE_PATH, 'initial_' + method.__name__)
    # NOTE(review): viz stays unbound for any method name other than the
    # three handled here, which would fail at the first viz(...) call.
    if method.__name__ == 'Isomap':
        viz = viz_Isomap
    if method.__name__ == 'SpectralEmbedding':
        viz = viz_SE
    if method.__name__ == 'LocallyLinearEmbedding':
        viz = viz_LLE

    # Precomputed runs read the visualisation objects; fresh runs write them.
    if precomputed:
        load_path = os.path.join('obj', STATE)
        save_path = None
    else:
        load_path = None
        save_path = os.path.join('obj', STATE)
    X_2D_emb, X_3D_emb = viz(X,
                             colors=None,
                             filename=filename_initial,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)
    cos_colors = find_cos_similarity(X_2D_emb)
    #Color the manifold continuously
    filename_initial_colored = os.path.join(
        FIGURE_PATH, 'initial_' + method.__name__ + '_colored')
    X_2D_emb, X_3D_emb = viz(X,
                             colors=cos_colors,
                             filename=filename_initial_colored,
                             cbar=None,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)

    print('##################GMM CLUSTERING#################')
    #Import R for clustering
    base = importr('base')
    mclust = importr('mclust')
    ro.r('set.seed(1)')

    # Clusters are recomputed exactly when the embedding was recomputed.
    dontprecomputeclusters = not precomputed
    #     if not precomputed:
    if dontprecomputeclusters:
        clusters, means, z, uncertainty = GMM_clustering_R(
            X_method_df, method, default_cluster_num=default_cluster_num
        )  #could change this to 5 to be consistent across states to auto-id clust #
        clusters_block_indexed = pd.Series(clusters, index=index_X)

        avg_per_clust = create_avg_df(clusters, index_X, covid_)

        reordered_clusters, reordered_means, reordered_z, reordered_uncertainty = relabel_clusters(
            clusters.astype('int'), avg_per_clust, means, z, uncertainty)
        reordered_avg_per_clust = create_avg_df(reordered_clusters, index_X,
                                                covid_)
        #Save
        np.save(
            os.path.join('obj', STATE + '_reordered_clusters.npy'),
            reordered_clusters,
        )
        reordered_means.to_csv(
            os.path.join('obj', STATE + '_reordered_means.csv'))
        reordered_z.to_csv(os.path.join('obj', STATE + '_reordered_z.csv'))
        np.save(os.path.join('obj', STATE + '_reordered_uncertainty.npy'),
                reordered_uncertainty)

        reordered_avg_per_clust.to_csv(
            os.path.join('obj', STATE + '_reordered_avg_per_clust.csv'))


#     if precomputed:
    # Otherwise load the artefacts written by an earlier run.
    if not dontprecomputeclusters:
        reordered_clusters = np.load(
            os.path.join('obj', STATE + '_reordered_clusters.npy'))
        reordered_means = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_means.csv'),
                                      index_col=0)
        reordered_z = pd.read_csv(os.path.join('obj',
                                               STATE + '_reordered_z.csv'),
                                  index_col=0)
        reordered_uncertainty = np.load(
            os.path.join('obj', STATE + '_reordered_uncertainty.npy'))
        reordered_avg_per_clust = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_avg_per_clust.csv'),
                                              index_col=0)

    #Save the data for Dennis (for only this method)
    index_with_blocks_and_save(STATE, X_method_df, X_2D_emb, X_3D_emb,
                               reordered_clusters, reordered_z,
                               reordered_uncertainty, index_X, emb_method)

    # The two results below are not referenced again in this function.
    N_TIMESERIES = 5
    closest_to_mean_samples, closest_to_mean_block_ids = find_closest_time_series(
        X_method_df, reordered_means, covid_, index_X, n=N_TIMESERIES)

    print('##################FINAL VIZ#################')
    sns.set(style="whitegrid")
    if two_cols:
        reordered_clusters = cos_colors  #Change colors
    add_state_to_fig(STATE,
                     fig,
                     spec,
                     row,
                     NUM_STATES,
                     X,
                     reordered_clusters,
                     index_X,
                     reordered_avg_per_clust,
                     load_path=load_path,
                     save_path=save_path,
                     separate=separate,
                     two_cols=two_cols,
                     configurations=configurations)
Exemplo n.º 15
0
def _draw_level(prices, level, label_format, color):
    """Draw a dashed level line and its label at the level's first occurrence."""
    positions = np.flatnonzero(prices == level)
    if positions.size == 0:
        # No sample sits exactly on this level (e.g. an empty cluster whose
        # extreme is still the +/-inf sentinel): nothing to anchor to.
        return
    anchor = positions[0]
    plt.hlines(level,
               xmin=anchor - 10,
               xmax=anchor + 10,
               colors=color,
               linestyle="--")
    plt.text(anchor - 15, level, label_format.format(level), fontsize=13)


def clustering():
    """Cluster the global ``df["price"]`` series and plot pressure/support levels.

    KMeans is fitted for k = 1..14, the knee of the inertia curve gives the
    number of clusters, and the largest and smallest price of every cluster
    are drawn as "Pressure" and "Support" levels over a scatter of the
    prices coloured by cluster. Both figures are rendered through Streamlit.

    Reads the module-level ``df`` and ``target``; returns None.
    """
    global target, daypara, df, df2, df_4pycaret, df_temp
    st.write(df)
    X = np.array(df["price"])
    samples = X.reshape(-1, 1)

    sum_of_squared_distances = []
    K = range(1, 15)
    for k in K:
        km = KMeans(n_clusters=k).fit(samples)
        sum_of_squared_distances.append(km.inertia_)
    kn = KneeLocator(K,
                     sum_of_squared_distances,
                     S=1.0,
                     curve="convex",
                     direction="decreasing")
    kn.plot_knee(figsize=(7, 3))
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.subheader("Search Number of Regime")
    st.pyplot()
    st.subheader("Plotting in Reallity")
    with st.spinner("Loading Chart..."):
        kmeans = KMeans(n_clusters=kn.knee).fit(samples)
        labels = kmeans.predict(samples)

        # minmax[cluster] = [largest price, smallest price] of that cluster.
        minmax = [[-np.inf, np.inf] for _ in range(kn.knee)]
        for value, label in zip(X, labels):
            if value > minmax[label][0]:
                minmax[label][0] = value
            if value < minmax[label][1]:
                minmax[label][1] = value

        plt.figure(figsize=(11, 5), dpi=30)
        plt.title("Clustering Pressure/Support of {}".format(target),
                  fontsize=20)
        plt.ylabel("price")

        # Row 0: cluster maxima, row 1: cluster minima, each sorted ascending.
        a = np.sort(np.transpose(minmax))

        # One scatter call with the labels already predicted above; the
        # palette wraps around so more than 8 clusters cannot overflow it.
        colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
        point_colors = [colors[label % len(colors)] for label in labels]
        plt.scatter(np.arange(len(X)), X, c=point_colors, s=20, marker="o")

        # Anchor every level at the sample that actually holds that price,
        # so each line is drawn next to its own extreme.
        for pressure, support in zip(a[0], a[1]):
            _draw_level(X, pressure, "Pressure= {:.2f}", "red")
            _draw_level(X, support, "Support= {:.2f}", "b")
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.pyplot()
def identify_single_knee_point(x, y, plot=False):
    """Locate the knees of a convex, increasing curve with sensitivity 5.

    Args:
        x: X values of the curve.
        y: Y values of the curve.
        plot: When true, draw the knee plot.

    Returns:
        The locator's ``all_knees`` attribute (every knee found, not just
        one, despite the function name).
    """
    locator = KneeLocator(x,
                          y,
                          S=5,
                          curve='convex',
                          direction="increasing")
    if plot:
        locator.plot_knee()
    knees = locator.all_knees
    return knees
Exemplo n.º 17
0
from kneed import KneeLocator

# Load the orders table.
df = pd.read_csv("../ulabox_orders_with_categories_partials_2017.csv")

# %%
# Keep only the per-category percentage columns as clustering features.
dfp = df[["Fresh%", "Food%", "Drinks%", "Home%", "Beauty%", "Health%", "Baby%", "Pets%"]]

# Inertia (sum of squared distances) of KMeans for k = 1..10.
ssd = []
ks = range(1,11)
for k in range(1,11):
    km = KMeans(n_clusters=k)
    km = km.fit(dfp)
    ssd.append(km.inertia_)

# Knee of the convex, decreasing inertia curve.
kneedle = KneeLocator(ks, ssd, S=1.0, curve="convex", direction="decreasing")
kneedle.plot_knee()
plt.show()

# k is reused here: from this point on it holds the suggested cluster count.
k = round(kneedle.knee)

print(f"Number of clusters suggested by knee method: {k}")
# %%
# Final model on the same category columns as dfp.
kmeans = KMeans(n_clusters=k).fit(df[["Fresh%", "Food%", "Drinks%", "Home%", "Beauty%", "Health%", "Baby%", "Pets%"]])

#%%
# Distribution of total_items, stacked by cluster label.
sns.histplot(x="total_items", data=df, multiple="stack", hue=kmeans.labels_)
plt.show() 

#%%
# Cluster label against discount%.
sns.displot(x=kmeans.labels_, y="discount%", data=df, palette='rainbow')
plt.show() 
Exemplo n.º 18
0
# min_samples is set to twice the value of dim; the neighbour count used for
# the k-distance graph is one less than that.
minPts = 2 * dim
kneighbours = minPts - 1

# Build the k-distance graph to find the optimal epsilon value and use kneed
# to get the exact value.
nearest_neighbors = NearestNeighbors(n_neighbors=kneighbours)
neighbors = nearest_neighbors.fit(df_pca)
distances, indices = neighbors.kneighbors(df_pca)
# Last column of the neighbour table, sorted in ascending order.
distances = np.sort(distances[:, (kneighbours - 1)], axis=0)
i = np.arange(len(distances))
knee = KneeLocator(i,
                   distances,
                   S=1,
                   curve='convex',
                   direction='increasing',
                   interp_method='polynomial')
knee.plot_knee(figsize=[13.5, 9])
plt.xlabel("Points")
plt.ylabel("Distance")
plt.title('K-distance Graph', fontsize=15)
# The x axis is the 0-based sample position, so the knee indexes distances.
optimal_eps = distances[knee.knee]
plt.show()

print('min_samples=' + str(minPts))
print('n_neighbours=' + str(kneighbours))
print('eps=' + str(optimal_eps))

# Cluster with the estimated epsilon and score the labelling.
dbscan = DBSCAN(eps=optimal_eps, min_samples=minPts)
dbscan.fit(df_pca)
dbscan_labels = dbscan.labels_

silhouette = silhouette_score(df_pca, dbscan.labels_)