def read_service(srv, path, prev_cluster_metadata):
    """Load the time-series DataFrame for one service and update its metadata.

    If the service has no preferred cluster (``preferred == 0``), the raw
    preprocessed TSV is read and every column is z-scored.  Otherwise the
    representative columns of the preferred cluster are loaded via
    ``best_column_of_cluster``.

    Side effect: records ``pref_cluster`` (and, for clustered services,
    ``rep_metrics``) into the on-disk metadata for *path*.

    Returns the DataFrame with every column prefixed by the service name.

    NOTE(review): assumes ``srv`` is a dict with keys "name", "clusters",
    and "preprocessed_filename" matching the metadata schema — confirm
    against the writer of that metadata.
    """
    preferred = preferred_cluster(srv["clusters"])
    if preferred == 0:
        # no usable cluster: fall back to the full preprocessed file
        srv_path = os.path.join(path, srv["preprocessed_filename"])
        df = pd.read_csv(srv_path, sep="\t", index_col='time', parse_dates=True)
        for c in df.columns:
            df[c] = zscore(df[c])
    else:
        cluster = srv["clusters"][str(preferred)]
        rep_metrics, df = best_column_of_cluster(srv["name"], cluster["filenames"], path, prev_cluster_metadata)
    # write additional metadata about components:
    # - the preferred cluster for the component (is it really necessary?)
    # - the representative metrics for each cluster
    with metadata.update(path) as data:
        for _service in data["services"]:
            if _service["name"] == srv["name"]:
                # only write keys that are not already present, so repeated
                # runs do not overwrite earlier results
                if "pref_cluster" not in _service:
                    _service["pref_cluster"] = preferred
                if preferred == 0:
                    # no cluster -> no rep_metrics to record
                    continue
                if "rep_metrics" not in _service["clusters"][str(preferred)]:
                    _service["clusters"][str(preferred)]["rep_metrics"] = rep_metrics
    # prefix every column with the service name (unless already prefixed)
    # so columns stay unique when frames from several services are merged
    new_names = []
    for column in df.columns:
        if column.startswith(srv["name"]):
            new_names.append(column)
        else:
            new_names.append(srv["name"] + APP_METRIC_DELIMITER + column)
    df.columns = new_names
    return df
def kmeans_cluster(data, idx, idy, plot=True):
    """Cluster row-wise z-scored series into ``idx * idy`` k-means groups.

    Returns the predicted cluster label for each row; optionally renders
    the clusters on an ``idx`` x ``idy`` plot grid.
    """
    normalized = zscore(data, axis=1)
    n_clusters = idx * idy
    model = KMeans(n_clusters=n_clusters, max_iter=30000, init='k-means++')
    labels = model.fit_predict(normalized)
    if plot:
        plot_kmeans(labels, normalized, idx, idy)
    return labels
def do_kshape(name_prefix, df, cluster_size, initial_clustering=None):
    """Run k-shape over the z-scored columns of *df* and persist each cluster.

    For every cluster a DataFrame of its member series plus the centroid is
    plotted, written as a gzipped TSV, and exported as a PNG via ``graphs``.

    Returns ``(cluster_metrics, score, filenames)`` where *cluster_metrics*
    maps cluster number -> list of member column names.
    """
    columns = df.columns
    matrix = [zscore(df[c]) for c in columns]
    res = kshape(matrix, cluster_size, initial_clustering)
    labels, score = silhouette_score(np.array(matrix), res)
    # remember which metric columns landed in which cluster
    cluster_metrics = defaultdict(list)
    for idx, col in enumerate(columns):
        cluster_metrics[int(labels[idx])].append(col)
    filenames = []
    for idx, (centroid, assigned_series) in enumerate(res):
        # one frame per cluster: the member series plus the centroid curve
        frame_data = {
            columns[member]: pd.Series(matrix[member], index=df.index)
            for member in assigned_series
        }
        frame_data["centroid"] = pd.Series(centroid, index=df.index)
        cluster_df = pd.DataFrame(frame_data)
        axes = cluster_df.plot()
        axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        name = "%s_%d" % (name_prefix, (idx + 1))
        filename = name + ".tsv.gz"
        print(filename)  # progress output: one line per written cluster
        cluster_df.to_csv(filename, sep="\t", compression='gzip')
        filenames.append(os.path.basename(filename))
        graphs.write(cluster_df, name + ".png")
    return cluster_metrics, score, filenames
def kshape_cluster(data, idx, idy, plot=True):
    """Cluster row-wise z-scored series into ``idx * idy`` k-shape groups.

    Returns the predicted cluster assignment for each row; optionally
    renders the clusters on an ``idx`` x ``idy`` plot grid.
    """
    n_clusters = idx * idy
    normalized = zscore(data, axis=1)
    clusters = kshape(normalized, n_clusters)
    labels = pred(clusters, normalized)
    if plot:
        plot_kshapes(clusters, normalized, idx, idy)
    return labels
def return_best_shapelets(maxIter, minNum):
    """Search *maxIter* k-shape clusterings for the best shapelet set.

    "Best" means the fewest train-set misclassifications (false positives
    plus false negatives).  Returns ``(best_shapelets, best_error)``;
    *best_shapelets* is ``[]`` when no iteration beats the incoming
    *minNum* threshold.

    NOTE(review): relies on module-level ``X``, ``Y``, ``n_clusters``,
    ``train_data`` and ``train_label`` defined elsewhere in this file.
    """
    shapelets_tmp = []
    for i in range(maxIter):
        cluster = kshape(zscore(X), n_clusters)
        cluster_result = get_cluster_result(cluster, n_clusters)
        real_numOfcluster = len(cluster_result)
        d = Categorized(Y, cluster_result, real_numOfcluster)
        shapelets_vec = return_shapelets(real_numOfcluster, d, cluster_result)
        predict_label = test(shapelets_vec, train_data)
        # fix: compute the confusion matrix once — the original recomputed
        # it for each of the two off-diagonal entries
        cm = confusion_matrix(train_label, predict_label)
        FP_FN = cm[0][1] + cm[1][0]
        print(FP_FN)  # progress output: error count for this iteration
        if FP_FN < minNum:
            minNum = FP_FN
            shapelets_tmp = shapelets_vec
    return shapelets_tmp, minNum
# NOTE(review): this chunk begins mid-loop — the `for` that binds `temp`
# (reading the training file) starts above the visible region, so the
# indentation of the first two statements is reconstructed.
    x_total.append(temp[1:])
    y_total.append(int(temp[0]))

# Append the test-file rows as well; the first comma-separated field is
# the class label, the rest are the time-series values.
for x in x_test_file:
    temp = [float(ts) for ts in x.split(',')]
    x_total.append(temp[1:])
    y_total.append(int(temp[0]))

# Relabel the classes to a dense 0..k-1 range so that max(label)+1
# equals the number of classes below.
print "Relabeling!"
le = preprocessing.LabelEncoder()
le.fit(y_total)
y_total = le.transform(y_total)
x_total = np.array(x_total)
y_total = np.array(y_total)
print "x_total.shape is ", x_total.shape

# One cluster per class; valid because LabelEncoder made labels dense.
n_cluster = np.max(y_total) + 1
print 'n_clusters=', n_cluster

# Time the k-shape run on the z-scored data.
start = time.time()
clusters = kshape(zscore(x_total), n_cluster)
#clusters=kshape(x_total,n_cluster)

# Flatten cluster membership into a per-sample label vector:
# clusters[i][1] holds the indices of the samples assigned to cluster i.
y_pred = np.zeros(x_total.shape[0])
for i in range(len(clusters)):
    y_pred[clusters[i][1]] = i
time_kshape = time.time() - start

# External clustering quality against the true labels.
nmi_kshape = normalized_mutual_info_score(y_total, y_pred)
ri_kshape = adjusted_rand_score(y_total, y_pred)
print 'time: %.3f, nmi: %.3f, ri: %.3f' % (time_kshape, nmi_kshape, ri_kshape)
# Minimal k-shape demo: z-score four short series and partition them
# into two shape-based clusters.
from kshape import kshape, zscore

time_series = [
    [1, 2, 3, 4, 5],
    [0, 1, 2, 3, 4],
    [3, 2, 1, 0, -1],
    [1, 2, 2, 3, 3],
]
cluster_num = 2

clusters = kshape(zscore(time_series), cluster_num)
print(clusters)