def get_knee_results(data, cluster_lims, cores, categorical):
    """Scan a range of cluster counts and pick one via knee detection.

    A ``KPrototypes`` model is fitted for every cluster count in
    ``range(*cluster_lims)`` and its ``cost_`` is recorded. The knee of the
    resulting cost curve (treated as convex and decreasing) is located with
    ``KneeLocator``.

    Side effects: the selected value is written to
    ``OUT_DIR / "n_clusters.txt"`` and the per-count costs to
    ``OUT_DIR / "knee_results.csv"`` (no header row).

    Args:
        data: Table indexed with the module-level ``cols`` to select the
            columns that are clustered.
        cluster_lims: Arguments unpacked into ``range(...)``.
        cores: Forwarded to ``KPrototypes`` as ``n_jobs``.
        categorical: Forwarded to ``KPrototypes.fit`` as ``categorical``.

    Returns:
        The ``knee`` attribute reported by ``KneeLocator``.
    """
    candidate_counts = range(*cluster_lims)

    def _fit_cost(k):
        # Fixed seed and "cao" initialisation, as configured for every fit.
        model = KPrototypes(k, init="cao", random_state=0, n_jobs=cores)
        model.fit(data[cols], categorical=categorical)
        return model.cost_

    costs = [_fit_cost(k) for k in tqdm(candidate_counts)]

    knee = KneeLocator(
        candidate_counts,
        costs,
        curve_nature="convex",
        curve_direction="decreasing",
    ).knee

    # Persist the decision first, then the raw curve it was derived from.
    with open(OUT_DIR / "n_clusters.txt", "w") as out_file:
        out_file.write(str(knee))

    cost_series = pd.Series(index=candidate_counts, data=costs)
    cost_series.to_csv(OUT_DIR / "knee_results.csv", header=False)

    return knee
def choose_from_metric(metric, k_values):
    """Choose a k from ``k_values`` based on the given metric values.

    ``chosen_k`` is not a parameter; it is read from the surrounding scope.
    When it is set, it is returned unchanged. Otherwise the knee found by
    ``KneeLocator`` (called with its default curve settings) is returned,
    and if no knee is found the k with the largest metric value is used.

    Args:
        metric: Metric values, one per entry of ``k_values``.
        k_values: Candidate k values, indexable by position.

    Returns:
        The selected k.
    """
    if chosen_k is not None:
        return chosen_k

    knee = KneeLocator(k_values, metric).knee
    if knee is not None:
        return knee

    # No knee detected: fall back to the position of the maximum.
    return k_values[numpy.argmax(metric)]
def choose_from_metric(metric, k_values):
    """Choose a k from ``k_values`` for a convex, decreasing metric curve.

    ``chosen_k`` is not a parameter; it is read from the surrounding scope.
    When it is set, it is returned unchanged. Otherwise the knee found by
    ``KneeLocator`` (configured as convex and decreasing) is returned, and
    if no knee is found the k with the smallest metric value is used.

    Args:
        metric: Metric values, one per entry of ``k_values``.
        k_values: Candidate k values, indexable by position.

    Returns:
        The selected k.
    """
    if chosen_k is not None:
        return chosen_k

    knee = KneeLocator(
        k_values,
        metric,
        curve_nature='convex',
        curve_direction='decreasing',
    ).knee
    if knee is not None:
        return knee

    # No knee detected: fall back to the position of the minimum.
    return k_values[numpy.argmin(metric)]
def run_decision_tree_dimensionality_reduction(
    name,
    features,
    classes,
    min_k=2,
    max_k=None,
    min_depth=2,
    max_depth=None,
    chosen_k=None,
    random_state=6126540,
):
    """Fit a ``DecisionTreeDimReducer`` at the depth matching the best k.

    ``get_k_depth_values`` yields ``(k, depth, score)`` triples. The best k
    is ``chosen_k`` when given; otherwise it is the knee of the score curve
    according to ``KneeLocator`` (default settings), or the k with the
    highest score when no knee is found. The scores are plotted with
    ``plot_metric`` under the label ``'f1 score'``, and a reducer is fitted
    at the depth recorded for the best k.

    Args:
        name: Passed to ``plot_metric``.
        features: Feature matrix; ``features.shape[1]`` supplies the
            defaults for ``max_k`` and ``max_depth``.
        classes: Targets passed to the scan and to the final fit.
        min_k: Accepted but not used by this function.
        max_k: Defaulted from ``features`` but otherwise not used here.
        min_depth: Lower depth bound passed to ``get_k_depth_values``.
        max_depth: Upper depth bound; defaults to ``features.shape[1]``.
        chosen_k: When not ``None``, bypasses the automatic choice. It must
            be one of the k values produced by the scan.
        random_state: Seed passed to the scan and to the reducer.

    Returns:
        The fitted ``DecisionTreeDimReducer``.
    """
    if max_k is None:
        max_k = features.shape[1] - 1
    if max_depth is None:
        max_depth = features.shape[1]

    k_values, scores = [], []
    k_value_strings = []  # "k(depth)" labels; built but not consumed below
    k_to_depth = {}

    scan = get_k_depth_values(features, classes, min_depth, max_depth, random_state)
    for k, depth, score in scan:
        k_values.append(k)
        scores.append(score)
        # A later triple with the same k overwrites the earlier depth.
        k_to_depth[k] = depth
        k_value_strings.append(f'{k}({depth})')
        print('.')  # one progress tick per evaluated depth

    best_k = chosen_k
    if best_k is None:
        knee = KneeLocator(k_values, scores).knee
        best_k = knee if knee is not None else k_values[numpy.argmax(scores)]

    plot_metric(name, 'f1 score', scores, k_values, best_k)

    transformer = DecisionTreeDimReducer(k_to_depth[best_k], random_state)
    transformer.fit(features, classes)
    return transformer
def plot_elbow(thresholds, num_features_left, auto_pick_elbow=True):
    """Plot the number of remaining features against the score threshold.

    The curve is drawn with matplotlib and saved to
    ``num_feat_vs_score_threshold_{time_produced}.jpg``. ``plt``,
    ``time_produced`` and ``non_zero_features`` are read from module scope.

    When ``auto_pick_elbow`` is true, the elbow of the curve (treated as
    convex and decreasing) is located with yellowbrick's ``KneeLocator`` and
    marked with a dashed red vertical line. Elbow handling is best-effort:
    if detection or annotation fails, the reason is printed and the plot is
    still produced. Only ``Exception`` subclasses are handled, so
    ``KeyboardInterrupt`` and ``SystemExit`` propagate.

    Args:
        thresholds: x values (score thresholds).
        num_features_left: y values, indexable by position and aligned with
            ``thresholds``.
        auto_pick_elbow: Whether to try to locate and mark the elbow.

    Returns:
        None.
    """
    # (threshold, index, feature count) once an elbow has been located;
    # None means "draw the curve without an annotation".
    elbow = None
    if auto_pick_elbow:
        try:
            from yellowbrick.utils import KneeLocator

            elbow_locator = KneeLocator(
                x=thresholds,
                y=num_features_left,
                curve_nature="convex",
                curve_direction="decreasing",
            )
            best_threshold = elbow_locator.knee
            # A knee of None means nothing was detected; skip quietly.
            if best_threshold is not None:
                best_index_at = list(thresholds).index(best_threshold)
                elbow = (
                    best_threshold,
                    best_index_at,
                    num_features_left[best_index_at],
                )
        except Exception as exc:
            print(f"Elbow detection skipped: {exc!r}")

    from matplotlib import rcParams

    plt.figure(figsize=(16, 6))
    rcParams.update({'font.size': 16})
    plt.plot(thresholds, num_features_left, '-o')

    if elbow is not None:
        best_threshold, best_index_at, best_num_feat = elbow
        try:
            elbow_label = f"elbow at the={best_threshold} index_at: [{best_index_at}], best_num_feat={best_num_feat}"
            print(elbow_label)
            plt.vlines(
                best_threshold,
                color='r',
                linestyle="--",
                label=elbow_label,
                ymin=0,
                ymax=len(non_zero_features),
            )
            plt.legend(loc="best")
        except Exception as exc:
            print(f"Elbow annotation skipped: {exc!r}")

    plt.title('Feature Importance')
    plt.ylabel('Num of Features')
    plt.xlabel('Scores threshold')

    pic_name = f'num_feat_vs_score_threshold_{time_produced}.jpg'
    plt.savefig(pic_name)
    print(f"{pic_name} was produced.")
def fit(self, X, y=None, **kwargs):
    """
    Fits the wrapped estimator once for every k in ``self.k_values_``,
    recording the score from ``self.scoring_metric`` in ``self.k_scores_``
    and the elapsed fit time in ``self.k_timers_``.

    When ``self.locate_elbow`` is set, the elbow of the score curve is
    located afterwards and stored in ``self.elbow_value_`` together with
    its score in ``self.elbow_score_``. If no elbow is found, these are set
    to ``None`` and ``0`` and a ``YellowbrickWarning`` is issued.

    ``y`` and ``**kwargs`` are accepted but not used by this method. The
    method finishes by calling ``self.draw()`` and returns ``self``.
    """
    self.k_scores_ = []
    self.k_timers_ = []
    self.kneedle = None
    self.knee_value = None

    if self.locate_elbow:
        self.elbow_value_ = None
        self.elbow_score_ = None

    for n_clusters in self.k_values_:
        # Time only the parameter update and the fit, not the scoring.
        started_at = time.time()
        self.estimator.set_params(n_clusters=n_clusters)
        self.estimator.fit(X)
        self.k_timers_.append(time.time() - started_at)

        score = self.scoring_metric(X, self.estimator.labels_)
        self.k_scores_.append(score)

    if self.locate_elbow:
        # Curve shape per metric; any other metric passes no shape hints.
        curve_shapes = {
            "distortion": ("convex", "decreasing"),
            "silhouette": ("concave", "increasing"),
            "calinski_harabasz": ("concave", "increasing"),
        }
        locator_kwargs = {}
        if self.metric in curve_shapes:
            nature, direction = curve_shapes[self.metric]
            locator_kwargs = {
                "curve_nature": nature,
                "curve_direction": direction,
            }

        knee = KneeLocator(
            self.k_values_, self.k_scores_, **locator_kwargs
        ).knee

        if knee is not None:
            self.elbow_value_ = knee
            self.elbow_score_ = self.k_scores_[
                self.k_values_.index(self.elbow_value_)
            ]
        else:
            self.elbow_value_ = None
            self.elbow_score_ = 0
            warnings.warn(
                "No 'knee' or 'elbow' point detected, "
                "pass `locate_elbow=False` to remove the warning",
                YellowbrickWarning,
            )

    self.draw()
    return self