Example #1
import os

import numpy as np
import pandas as pd


def process_service(path, service_name, res):
    # _sbd (shape-based distance) and gap (gap statistic) are helpers
    # defined elsewhere in this module.
    scores = []
    for i in range(2, 8):
        centroids = []
        metrics = []
        labels = []
        for j in range(i):
            name = "%s-cluster-%d_%d.tsv" % (service_name, i, j + 1)
            cluster_path = os.path.join(path, name)
            df = pd.read_csv(cluster_path, sep="\t", index_col='time', parse_dates=True)
            # the first column holds the cluster centroid, the rest are members
            centroids.append(df[df.columns[0]])
            for c in df.columns[1:]:
                metrics.append(df[c])
                labels.append(j)
        # pairwise shape-based distances, used by the (disabled) silhouette
        # scoring variant below
        distances = np.zeros([len(metrics), len(metrics)])
        for idx_a, metric_a in enumerate(metrics):
            for idx_b, metric_b in enumerate(metrics):
                distances[idx_a, idx_b] = _sbd(metric_a, metric_b)[0]
        labels = np.array(labels)
        # gap(centroids, data, labels, refs=None, nrefs=20, ks=range(1,11))
        score = gap(np.array(centroids), np.array(metrics), labels)
        # silhouette variant, kept for reference:
        # if len(np.unique(labels)) == 1:
        #     score = -1
        # else:
        #     score = silhouette_score(distances, labels, metric='precomputed')
        res["name"].append(service_name)
        res["cluster"].append(i)
        res["silhouette_score"].append(score)
        res["best_cluster"].append(None)
        scores.append(score)
    # cluster counts start at 2, so translate the argmax back into a count
    best = np.argmax(scores)
    res["name"].append(service_name)
    res["cluster"].append("best")
    res["silhouette_score"].append(scores[best])
    res["best_cluster"].append(best + 2)
Example #2
import matplotlib.pyplot as plt


def write(df, name):
    fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(20, 10))
    draw_series_combined(df, name, axes[0, 0])
    # the top-right panel only carries the legend for the combined plot
    axes[0, 1].axis('off')
    axes[0, 1].legend(*axes[0, 0].get_legend_handles_labels(),
                      loc='upper left',
                      ncol=2)

    draw_series_seperate(df, axes[2, 0])

    # the distance-based panels are only meaningful for a non-constant centroid
    if df.centroid.notnull().any() and df.centroid.var() != 0:
        distances = []
        for c in df.columns[1:]:
            distances.append(_sbd(df.centroid, df[c])[0])

        draw_lag(df, axes[2, 1])
        draw_sbd_bar_plot(distances, axes[1, 0])
        draw_sbd_dist_plot(distances, axes[1, 1])
    try:
        plt.tight_layout()
        plt.savefig(name, dpi=200)
        plt.close("all")
    except Exception as e:
        print("graph %s failed %s" % (name, e))
Example #3
import os

import numpy as np
import pandas as pd


def best_column_of_cluster(service, filenames, path, prev_cluster_metadata=None):
    selected_columns = {}
    # representative metric name, per cluster index
    rep_metrics = dict()

    for i, filename in enumerate(filenames):

        best_distance = np.inf
        best_column = None

        cluster_path = os.path.join(path, filename)
        df = pd.read_csv(cluster_path, sep="\t", index_col='time', parse_dates=True)
        # pick the representative metric as the one with the shortest
        # distance to the 'centroid'
        for c in df.columns:
            if c == "centroid":
                continue
            distance = _sbd(df.centroid, df[c])[0]
            if distance < best_distance:
                best_distance = distance
                best_column = c

        # if we're deriving clusters from a previous version's assignments,
        # a representative-metric switch may happen
        if prev_cluster_metadata is not None:
            # get cluster assignments from the previous version
            s_score, cluster_metrics = msu.get_cluster_metrics(prev_cluster_metadata, service)

            for key, values in cluster_metrics.items():
                # if a cluster with the same representative metric exists, stop
                if values['rep_metric'] == best_column:
                    break
                # otherwise, look for cases in which some previous
                # representative metric is part of the current cluster and the
                # current representative metric is part of the cluster
                # represented by that previous metric
                elif values['rep_metric'] in df.columns and best_column in values['other_metrics']:
                    # in this case, switch the representative metrics, since
                    # the clusters should be 'similar'
                    best_column = values['rep_metric']

        if best_column is not None:
            selected_columns[best_column] = df[best_column]
            rep_metrics[i] = str(best_column)

    # df still refers to the last cluster file read above
    return rep_metrics, pd.DataFrame(data=selected_columns, index=df.index)
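
A possible call site, assuming the file-name pattern from example #1 (the service name and cluster count here are made up):

filenames = ["auth-service-cluster-3_%d.tsv" % (j + 1) for j in range(3)]
rep_metrics, rep_df = best_column_of_cluster("auth-service", filenames, "clusters/")
# rep_metrics: cluster index -> representative metric name
# rep_df: one representative series per cluster, sharing the time index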
Example #4
import numpy as np
# sklearn's silhouette_score, aliased because this module shadows the name
from sklearn.metrics import silhouette_score as _silhouette_score


def silhouette_score(series, clusters):
    # pairwise shape-based distance matrix over all series
    distances = np.zeros((series.shape[0], series.shape[0]))
    for idx_a, metric_a in enumerate(series):
        for idx_b, metric_b in enumerate(series):
            distances[idx_a, idx_b] = _sbd(metric_a, metric_b)[0]
    # flatten the (centroid, indices) cluster structure into a label vector
    labels = np.zeros(series.shape[0])
    for i, (cluster, indices) in enumerate(clusters):
        for index in indices:
            labels[index] = i

    # the silhouette score is only defined for at least 2 and at most
    # n_samples - 1 distinct clusters
    if len(np.unique(labels)) == 1 or (len(np.unique(labels)) >=
                                       distances.shape[0]):
        return labels, -1
    else:
        return labels, _silhouette_score(distances,
                                         labels,
                                         metric='precomputed')
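
The clusters argument is an iterable of (centroid, indices) pairs, as produced by kshape in example #5. A minimal sketch with random data, just to illustrate the shapes involved:

series = np.random.random_sample((10, 50))  # 10 series, 50 samples each
clusters = kshape(series, 3)                # [(centroid, indices), ...]
labels, score = silhouette_score(series, clusters)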
Example #5
        rands = np.random.random_sample(size=(shape[0], shape[1], nrefs))
        for i in range(nrefs):
            # scale the uniform samples into the data's bounding box
            rands[:, :, i] = rands[:, :, i] * dists + bots
    else:
        rands = refs

    gaps = np.zeros((len(ks),))
    for (i, k) in enumerate(ks):
        # dispersion of the actual clustering: total shape-based distance of
        # every series to its assigned centroid
        disp = sum(_sbd(data[m, :], centroids[labels[m], :])[0] for m in range(shape[0]))

        # reference dispersions: cluster each random reference set with
        # k-Shape and accumulate its within-cluster distances
        refdisps = np.zeros((rands.shape[2],))
        for j in range(rands.shape[2]):
            for centroid, indices in kshape(rands[:, :, j], k):
                for index in indices:
                    refdisps[j] += _sbd(rands[index, :, j], centroid)[0]
        gaps[i] = np.log(np.mean(refdisps)) - np.log(disp)
    return gaps
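
Each entry gaps[i] is the gap statistic for k = ks[i]: the log of the mean dispersion across the random reference sets minus the log of the actual clustering's dispersion, so larger values indicate a clustering that is tighter than chance. Assuming the default ks=range(1,11) from the signature comment in example #1, one simple choice is the k with the largest gap:

gaps = gap(centroids, data, labels)        # as defined above
best_k = list(range(1, 11))[int(np.argmax(gaps))]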

def process_service(path, service_name, res):
    scores = []
    for i in range(2, 8):
        centroids = []
        metrics = []
        labels = []
        for j in range(i):
            name = "%s-cluster-%d_%d.tsv" % (service_name, i, j + 1)
            cluster_path = os.path.join(path, name)
            df = pd.read_csv(cluster_path, sep="\t", index_col='time', parse_dates=True)
            # the first column holds the cluster centroid, the rest are members
            centroids.append(df[df.columns[0]])
            for c in df.columns[1:]: