import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Assumed imports: _sbd (shape-based distance) and kshape come from the kshape
# package; sklearn's silhouette_score is renamed so it does not clash with the
# local function of the same name defined below. The draw_* helpers and msu
# are project-local modules imported elsewhere.
from kshape.core import _sbd, kshape
from sklearn.metrics import silhouette_score as _silhouette_score


def process_service(path, service_name, res):
    scores = []
    for i in range(2, 8):
        centroids = []
        metrics = []
        labels = []
        for j in range(i):
            name = "%s-cluster-%d_%d.tsv" % (service_name, i, j + 1)
            cluster_path = os.path.join(path, name)
            df = pd.read_csv(cluster_path, sep="\t", index_col='time',
                             parse_dates=True)
            # the first column of each cluster file holds the centroid series
            centroids.append(df[df.columns[0]])
            for c in df.columns[1:]:
                metrics.append(df[c])
                labels.append(j)
        # pairwise shape-based distance matrix; only needed by the
        # silhouette scoring that is currently disabled below
        distances = np.zeros([len(metrics), len(metrics)])
        for idx_a, metric_a in enumerate(metrics):
            for idx_b, metric_b in enumerate(metrics):
                distances[idx_a, idx_b] = _sbd(metric_a, metric_b)[0]
        labels = np.array(labels)
        score = gap(np.array(centroids), np.array(metrics), labels)
        # if len(np.unique(labels)) == 1:
        #     score = -1
        # else:
        #     score = silhouette_score(distances, labels, metric='precomputed')
        res["name"].append(service_name)
        res["cluster"].append(i)
        res["silhouette_score"].append(score)
        res["best_cluster"].append(None)
        scores.append(score)
    best = np.argmax(scores)
    res["name"].append(service_name)
    res["cluster"].append("best")
    res["silhouette_score"].append(scores[best])
    res["best_cluster"].append(best + 2)
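
# Usage sketch, not part of the original module: `res` is assumed to be a
# dict of four parallel lists that process_service appends to, one row per
# candidate cluster count plus a final "best" row per service.
def _example_process_all(path, service_names):
    res = {"name": [], "cluster": [], "silhouette_score": [], "best_cluster": []}
    for service_name in service_names:
        process_service(path, service_name, res)
    return pd.DataFrame(res)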
def write(df, name):
    fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(20, 10))
    draw_series_combined(df, name, axes[0, 0])
    # the second panel only carries the shared legend
    axes[0, 1].axis('off')
    axes[0, 1].legend(*axes[0, 0].get_legend_handles_labels(),
                      loc='upper left', ncol=2)
    draw_series_seperate(df, axes[2, 0])
    if df.centroid.notnull().any() and df.centroid.var() != 0:
        # shape-based distance of every metric to the cluster centroid
        distances = []
        for c in df.columns[1:]:
            distances.append(_sbd(df.centroid, df[c])[0])
        draw_lag(df, axes[2, 1])
        draw_sbd_bar_plot(distances, axes[1, 0])
        draw_sbd_dist_plot(distances, axes[1, 1])
    try:
        plt.tight_layout()
        plt.savefig(name, dpi=200)
        plt.close("all")
    except Exception as e:
        print("graph %s failed %s" % (name, e))
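
# Usage sketch (hypothetical file names): `write` expects a frame whose
# 'centroid' column holds the cluster centroid and whose remaining columns
# are the member metrics, matching the per-cluster TSVs read in this module.
def _example_write(cluster_path):
    df = pd.read_csv(cluster_path, sep="\t", index_col='time',
                     parse_dates=True)
    write(df, cluster_path.replace(".tsv", ".png"))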
def best_column_of_cluster(service, filenames, path, prev_cluster_metadata=None):
    selected_columns = {}
    # representative metric, per cluster index
    rep_metrics = dict()
    for i, filename in enumerate(filenames):
        best_distance = np.inf
        best_column = None
        cluster_path = os.path.join(path, filename)
        df = pd.read_csv(cluster_path, sep="\t", index_col='time',
                         parse_dates=True)
        # pick the representative metric as the one with the shortest
        # distance to the 'centroid'
        for c in df.columns:
            if c == "centroid":
                continue
            distance = _sbd(df.centroid, df[c])[0]
            if distance < best_distance:
                best_distance = distance
                best_column = c
        # if we are deriving clusters from the previous version's assignments,
        # a representative-metric switch may happen
        if prev_cluster_metadata is not None:
            # get cluster assignments from the previous version
            s_score, cluster_metrics = msu.get_cluster_metrics(prev_cluster_metadata, service)
            for key, values in cluster_metrics.items():
                # if a cluster with the same representative metric exists, stop
                if values['rep_metric'] == best_column:
                    break
                # otherwise, look for cases in which some previous
                # representative metric is part of the current cluster and the
                # current representative metric is part of the cluster
                # represented by that previous metric
                elif values['rep_metric'] in df.columns and best_column in values['other_metrics']:
                    # in this case, switch the representative metrics, since
                    # the clusters should be 'similar'
                    best_column = values['rep_metric']
        if best_column is not None:
            selected_columns[best_column] = df[best_column]
            rep_metrics[i] = str(best_column)
    return rep_metrics, pd.DataFrame(data=selected_columns, index=df.index)
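
# Usage sketch (hypothetical names): one representative metric is chosen per
# cluster file; `rep_metrics` maps the cluster index to the chosen column and
# the returned frame holds the selected series side by side. The filename
# pattern mirrors the one used in process_service above.
def _example_best_columns(path, service, n_clusters):
    filenames = ["%s-cluster-%d_%d.tsv" % (service, n_clusters, j + 1)
                 for j in range(n_clusters)]
    return best_column_of_cluster(service, filenames, path)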
def silhouette_score(series, clusters):
    # pairwise shape-based distance matrix over all series
    distances = np.zeros((series.shape[0], series.shape[0]))
    for idx_a, metric_a in enumerate(series):
        for idx_b, metric_b in enumerate(series):
            distances[idx_a, idx_b] = _sbd(metric_a, metric_b)[0]
    labels = np.zeros(series.shape[0])
    for i, (cluster, indices) in enumerate(clusters):
        for index in indices:
            labels[index] = i
    # the silhouette score is only defined for at least 2 clusters with
    # assignments and fewer clusters than samples
    if len(np.unique(labels)) == 1 or len(np.unique(labels)) >= distances.shape[0]:
        return labels, -1
    return labels, _silhouette_score(distances, labels, metric='precomputed')
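
# Usage sketch (assumption): `clusters` has the same shape as the kshape
# output consumed in gap() below, i.e. a list of (centroid, member-indices)
# pairs; only the indices are used here, so the centroids may be None.
def _example_silhouette():
    series = np.random.random_sample((6, 100))
    clusters = [(None, [0, 1, 2]), (None, [3, 4, 5])]
    return silhouette_score(series, clusters)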
def gap(centroids, data, labels, refs=None, nrefs=20, ks=range(1, 11)):
    shape = data.shape
    if refs is None:
        # uniform reference samples spanning the observed range of each
        # feature; tops/bots/dists are reconstructed from their use below
        # (per-feature maximum, minimum, and range), as in the standard
        # gap-statistic reference distribution
        tops = data.max(axis=0)
        bots = data.min(axis=0)
        dists = tops - bots
        rands = np.random.random_sample(size=(shape[0], shape[1], nrefs))
        for i in range(nrefs):
            rands[:, :, i] = rands[:, :, i] * dists + bots
    else:
        rands = refs
    gaps = np.zeros((len(ks),))
    for (i, k) in enumerate(ks):
        # within-cluster dispersion of the real data
        disp = sum(_sbd(data[m, :], centroids[labels[m], :])[0]
                   for m in range(shape[0]))
        # within-cluster dispersion of each clustered reference sample
        refdisps = np.zeros((rands.shape[2],))
        for j in range(rands.shape[2]):
            for centroid, indices in kshape(rands[:, :, j], k):
                for index in indices:
                    refdisps[j] += _sbd(rands[index, :, j], centroid)[0]
        gaps[i] = np.log(np.mean(refdisps)) - np.log(disp)
    return gaps
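
# Usage sketch (assumption): the gap statistic compares the log within-cluster
# dispersion of the real data against clustered uniform reference data,
# Gap(k) = log(E[W*_k]) - log(W_k); the k maximizing the gap is preferred.
def _example_gap(centroids, metrics, labels, ks=range(1, 11)):
    gaps = gap(np.array(centroids), np.array(metrics), labels, ks=ks)
    return list(ks)[int(np.argmax(gaps))]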