def _get_optimal_number_of_clusters(self, correlation, asset_returns, linkage, num_reference_datasets=5,
                                    max_number_of_clusters=10):
    """
    Find the optimal number of clusters for hierarchical clustering using the Gap statistic.

    For every candidate cluster count ``k`` in ``1..max_number_of_clusters`` the gap is computed as
    the mean inertia obtained on uniformly random reference datasets minus the inertia obtained on
    the actual data. The ``k`` with the largest gap is returned.

    :param correlation: (np.array) matrix of asset correlations
    :param asset_returns: (pd.DataFrame) historical asset returns
    :param linkage: (str) the type of linkage method to use for clustering
    :param num_reference_datasets: (int) the number of reference datasets to generate for calculating
                                   expected inertia
    :param max_number_of_clusters: (int) the maximum number of clusters to check for finding the
                                   optimal value
    :return: (int) the optimal number of clusters (1-based, never 0)
    """
    # A clustering cannot contain more clusters than there are assets to cluster.
    max_number_of_clusters = min(max_number_of_clusters, asset_returns.shape[1])

    # NOTE(review): newer scikit-learn releases renamed `affinity` to `metric` -- confirm against
    # the scikit-learn version this project pins.
    cluster_func = AgglomerativeClustering(affinity='precomputed', linkage=linkage)
    original_distance_matrix = np.sqrt(2 * (1 - correlation).round(5))

    gap_values = []
    for num_clusters in range(1, max_number_of_clusters + 1):
        cluster_func.n_clusters = num_clusters

        # Calculate expected inertia from reference datasets
        reference_inertias = []
        for _ in range(num_reference_datasets):
            # Generate reference returns from uniform distribution and calculate the distance matrix.
            reference_asset_returns = pd.DataFrame(np.random.rand(*asset_returns.shape))
            reference_correlation = np.array(reference_asset_returns.corr())
            reference_distance_matrix = np.sqrt(2 * (1 - reference_correlation).round(5))

            reference_cluster_assignments = cluster_func.fit_predict(reference_distance_matrix)
            reference_inertias.append(
                self._compute_cluster_inertia(reference_cluster_assignments, reference_asset_returns.values))
        expected_inertia = np.mean(reference_inertias)

        # Calculate inertia from original data
        original_cluster_assignments = cluster_func.fit_predict(original_distance_matrix)
        inertia = self._compute_cluster_inertia(original_cluster_assignments, asset_returns.values)

        # Calculate the gap statistic
        gap_values.append(expected_inertia - inertia)

    # gap_values[i] was computed for (i + 1) clusters, so shift the 0-based argmax index by one.
    return int(np.argmax(gap_values)) + 1
def plot_agglomerative_algorithm():
    """Show agglomerative clustering of 12 synthetic points step by step.

    One subplot is drawn per step; at step ``i`` the data is clustered into
    ``12 - i`` clusters and every cluster holding more than one point is
    outlined with a single contour of a kernel density estimate fitted to
    that cluster. The figure is displayed with ``plt.show()``.
    """
    # Synthetic two-dimensional data.
    X, _ = make_blobs(random_state=0, n_samples=12)
    n_points = X.shape[0]

    agg = AgglomerativeClustering(n_clusters=n_points, compute_full_tree=True).fit(X)

    _, axes = plt.subplots(
        n_points // 5,
        5,
        subplot_kw={'xticks': (), 'yticks': ()},
        figsize=(20, 8),
    )

    # Plot window: data extent padded by half the overall standard deviation.
    eps = X.std() / 2
    x_min = X[:, 0].min() - eps
    x_max = X[:, 0].max() + eps
    y_min = X[:, 1].min() - eps
    y_max = X[:, 1].max() + eps

    xs = np.linspace(x_min, x_max, 100)
    ys = np.linspace(y_min, y_max, 100)
    xx, yy = np.meshgrid(xs, ys)
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    for step, ax in enumerate(axes.ravel()):
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        agg.n_clusters = n_points - step
        agg.fit(X)

        ax.set_title("Step %d" % step)
        ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')

        cluster_sizes = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            if cluster_sizes[cluster] <= 1:
                continue  # singletons are not outlined

            members = agg.labels_ == cluster
            points = X[members]
            other_points = X[~members]

            kde = KernelDensity(bandwidth=.5).fit(points)
            grid_scores = kde.score_samples(gridpoints)
            score_inside = kde.score_samples(points).min()
            score_outside = kde.score_samples(other_points).max()
            # Contour level sits between the weakest member and the strongest outsider.
            level = .8 * score_inside + .2 * score_outside

            ax.contour(
                xx,
                yy,
                grid_scores.reshape(100, 100),
                levels=[level],
                colors='k',
                linestyles='solid',
                linewidths=2,
            )

    axes[0, 0].set_title("Initialization")
    plt.show()
def plot_agglomerative():
    """Overlay every merge level of an agglomerative clustering on one axes.

    Twelve synthetic points are scattered and labelled with their index on the
    current axes. For cluster counts from 12 down to 2, each cluster with more
    than one member is outlined by a contour of a kernel density estimate
    fitted to that cluster, so the nested outlines show the merge hierarchy.
    """
    X, _ = make_blobs(random_state=0, n_samples=12)
    agg = AgglomerativeClustering(n_clusters=3)

    # Plot window: data extent padded by half the overall standard deviation.
    margin = X.std() / 2.
    x_min, y_min = X.min(axis=0) - margin
    x_max, y_max = X.max(axis=0) + margin

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, 100),
        np.linspace(y_min, y_max, 100),
    )
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    ax = plt.gca()
    for idx, point in enumerate(X):
        ax.text(
            point[0] + .1,
            point[1],
            "%d" % idx,
            horizontalalignment='left',
            verticalalignment='center',
        )
    ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')
    ax.set_xticks(())
    ax.set_yticks(())

    for step in range(11):
        agg.n_clusters = X.shape[0] - step
        agg.fit(X)
        labels = agg.labels_
        sizes = np.bincount(labels)

        for cluster in range(agg.n_clusters):
            if sizes[cluster] <= 1:
                continue  # nothing to outline for a single point

            inside = labels == cluster
            points = X[inside]
            other_points = X[~inside]

            kde = KernelDensity(bandwidth=.5).fit(points)
            scores = kde.score_samples(gridpoints)
            score_inside = kde.score_samples(points).min()
            score_outside = kde.score_samples(other_points).max()
            level = .8 * score_inside + .2 * score_outside

            ax.contour(
                xx,
                yy,
                scores.reshape(100, 100),
                levels=[level],
                colors='k',
                linestyles='solid',
                linewidths=1,
            )

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
def plot_agglomerative():
    """Overlay every merge level of an agglomerative clustering on one axes.

    Sixteen synthetic points (three blob centers) are scattered and labelled
    with their index on the current axes. For cluster counts from 16 down to
    2, each cluster with more than one member is outlined by a contour of a
    kernel density estimate fitted to that cluster.
    """
    # Imports are kept local so the function is self-contained; the previously
    # imported (and never used) pandas dependency has been dropped.
    from sklearn.datasets import make_blobs
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import KernelDensity
    import matplotlib.pyplot as plt
    import numpy as np

    m = 16  # number of samples
    k = 3   # number of blob centers
    X, _ = make_blobs(n_samples=m, n_features=2, centers=k, cluster_std=1.3, random_state=2255)

    # n_clusters is overwritten before every fit below; k is only the initial value.
    agg = AgglomerativeClustering(n_clusters=k)

    # Plot window: data extent padded by half the overall standard deviation.
    eps = X.std() / 2.
    x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps

    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    ax = plt.gca()
    for i, x in enumerate(X):
        ax.text(x[0] + .1, x[1], "%d" % i, horizontalalignment='left', verticalalignment='center')
    ax.scatter(X[:, 0], X[:, 1], s=20, c='grey')
    ax.set_xticks(())
    ax.set_yticks(())

    # Stop at two clusters: with a single cluster there are no "other" points to score against.
    for i in range(m - 1):
        agg.n_clusters = X.shape[0] - i
        agg.fit(X)

        bins = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            if bins[cluster] <= 1:
                continue  # singletons are not outlined

            in_cluster = agg.labels_ == cluster
            points = X[in_cluster]
            other_points = X[~in_cluster]

            kde = KernelDensity(bandwidth=0.9).fit(points)
            scores = kde.score_samples(gridpoints)
            score_inside = np.min(kde.score_samples(points))
            score_outside = np.max(kde.score_samples(other_points))
            # Contour level sits between the weakest member and the strongest outsider.
            levels = .80 * score_inside + .20 * score_outside
            ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels], colors='k',
                       linestyles='solid', linewidths=0.8)

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
def plot_agglomerative_algorithm():
    """Show agglomerative clustering of 16 synthetic points step by step.

    A 3x5 grid of subplots is created; at step ``i`` the data is clustered
    into ``16 - i`` clusters and every cluster holding more than one point is
    outlined with a single contour of a kernel density estimate fitted to
    that cluster. The figure is created but not shown; the caller decides
    when to display or save it.
    """
    # Imports are kept local so the function is self-contained; the previously
    # imported (and never used) pandas dependency has been dropped.
    from sklearn.datasets import make_blobs
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import KernelDensity
    import matplotlib.pyplot as plt
    import numpy as np

    m = 16  # number of samples
    k = 3   # number of blob centers
    X, _ = make_blobs(n_samples=m, n_features=2, centers=k, cluster_std=1.3, random_state=2255)

    agg = AgglomerativeClustering(n_clusters=X.shape[0], compute_full_tree=True).fit(X)

    _, axes = plt.subplots(X.shape[0] // 5, 5, subplot_kw={'xticks': (), 'yticks': ()},
                           figsize=(20, 8))

    # Plot window: data extent padded by a fraction of the overall standard deviation.
    eps = X.std() / 1.7
    x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps

    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    for i, ax in enumerate(axes.ravel()):
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        agg.n_clusters = X.shape[0] - i
        agg.fit(X)

        ax.set_title("Step %d" % i)
        ax.scatter(X[:, 0], X[:, 1], s=20, c='grey')

        bins = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            if bins[cluster] <= 1:
                continue  # singletons are not outlined

            in_cluster = agg.labels_ == cluster
            points = X[in_cluster]
            other_points = X[~in_cluster]

            kde = KernelDensity(bandwidth=.3).fit(points)
            scores = kde.score_samples(gridpoints)
            score_inside = np.min(kde.score_samples(points))
            score_outside = np.max(kde.score_samples(other_points))
            # Contour level sits between the weakest member and the strongest outsider.
            levels = .745 * score_inside + .255 * score_outside
            ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels], colors='k',
                       linestyles='solid', linewidths=1)

    axes[0, 0].set_title("Initialization")
def plot_agglomerative():
    """Draw nested cluster outlines for an agglomerative clustering.

    Twelve synthetic points are plotted and numbered on the current axes.
    The clustering is refitted for 12, 11, ..., 2 clusters, and at each level
    every cluster with at least two points gets one contour line taken from a
    kernel density estimate fitted to that cluster's points.
    """
    X, _ = make_blobs(random_state=0, n_samples=12)
    n_samples = X.shape[0]
    agg = AgglomerativeClustering(n_clusters=3)

    # Pad the data extent by half the overall standard deviation.
    pad = X.std() / 2.0
    col_x = X[:, 0]
    col_y = X[:, 1]
    x_min = col_x.min() - pad
    x_max = col_x.max() + pad
    y_min = col_y.min() - pad
    y_max = col_y.max() + pad

    grid_x = np.linspace(x_min, x_max, 100)
    grid_y = np.linspace(y_min, y_max, 100)
    xx, yy = np.meshgrid(grid_x, grid_y)
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    ax = plt.gca()
    for number, (px, py) in enumerate(X):
        ax.text(px + 0.1, py, "%d" % number, horizontalalignment="left", verticalalignment="center")
    ax.scatter(col_x, col_y, s=60, c="grey")
    ax.set_xticks(())
    ax.set_yticks(())

    for merged in range(11):
        agg.n_clusters = n_samples - merged
        agg.fit(X)
        counts = np.bincount(agg.labels_)

        for cluster in range(agg.n_clusters):
            if not counts[cluster] > 1:
                continue  # a lone point has no outline

            mask = agg.labels_ == cluster
            points = X[mask]
            other_points = X[~mask]

            kde = KernelDensity(bandwidth=0.5).fit(points)
            surface = kde.score_samples(gridpoints).reshape(100, 100)
            score_inside = np.min(kde.score_samples(points))
            score_outside = np.max(kde.score_samples(other_points))
            threshold = 0.8 * score_inside + 0.2 * score_outside

            ax.contour(xx, yy, surface, levels=[threshold], colors="k", linestyles="solid", linewidths=1)

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
def plot_agglomerative_algorithm():
    """Plot the first ten steps of agglomerative clustering on 12 points.

    A 2x5 grid of subplots is filled in order; panel ``i`` shows the data
    clustered into ``12 - i`` clusters, with each multi-point cluster outlined
    by one contour of a kernel density estimate fitted to its members. The
    first panel is titled "Initialization". The figure is not shown here.
    """
    # generate synthetic two-dimensional data
    X, _ = make_blobs(random_state=0, n_samples=12)
    total = X.shape[0]

    agg = AgglomerativeClustering(n_clusters=total, compute_full_tree=True).fit(X)
    _, axes = plt.subplots(total // 5, 5, subplot_kw={"xticks": (), "yticks": ()}, figsize=(20, 8))

    # Pad the data extent by half the overall standard deviation.
    pad = X.std() / 2
    first = X[:, 0]
    second = X[:, 1]
    x_min = first.min() - pad
    x_max = first.max() + pad
    y_min = second.min() - pad
    y_max = second.max() + pad

    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    for panel, ax in enumerate(axes.ravel()):
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        agg.n_clusters = total - panel
        agg.fit(X)

        ax.set_title("Step %d" % panel)
        ax.scatter(first, second, s=60, c="grey")

        counts = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            if not counts[cluster] > 1:
                continue  # a lone point has no outline

            mask = agg.labels_ == cluster
            points = X[mask]
            other_points = X[~mask]

            kde = KernelDensity(bandwidth=0.5).fit(points)
            surface = kde.score_samples(gridpoints).reshape(100, 100)
            score_inside = np.min(kde.score_samples(points))
            score_outside = np.max(kde.score_samples(other_points))
            threshold = 0.8 * score_inside + 0.2 * score_outside

            ax.contour(xx, yy, surface, levels=[threshold], colors="k", linestyles="solid", linewidths=2)

    axes[0, 0].set_title("Initialization")