def read_service(srv, path, prev_cluster_metadata):
    """Load the time series for one service and prefix its column names.

    If the service has no preferred cluster (``preferred == 0``), the raw
    preprocessed file is read and every column is z-score normalized.
    Otherwise the representative columns of the preferred cluster are
    selected via ``best_column_of_cluster``. As a side effect, the on-disk
    metadata is updated with the preferred cluster and (when applicable)
    its representative metrics.
    """
    preferred = preferred_cluster(srv["clusters"])
    if preferred == 0:
        # No preferred cluster: load the full preprocessed series.
        frame = pd.read_csv(
            os.path.join(path, srv["preprocessed_filename"]),
            sep="\t",
            index_col='time',
            parse_dates=True,
        )
        for col in frame.columns:
            frame[col] = zscore(frame[col])
    else:
        chosen = srv["clusters"][str(preferred)]
        rep_metrics, frame = best_column_of_cluster(
            srv["name"], chosen["filenames"], path, prev_cluster_metadata)

    # write additional metadata about components:
    #   - the preferred cluster for the component (is it really necessary?)
    #   - the representative metrics for each cluster
    with metadata.update(path) as data:
        for entry in data["services"]:
            if entry["name"] != srv["name"]:
                continue
            if "pref_cluster" not in entry:
                entry["pref_cluster"] = preferred
            if preferred != 0 and "rep_metrics" not in entry["clusters"][str(preferred)]:
                entry["clusters"][str(preferred)]["rep_metrics"] = rep_metrics

    # Prefix column names with the service name unless already prefixed.
    frame.columns = [
        col if col.startswith(srv["name"])
        else srv["name"] + APP_METRIC_DELIMITER + col
        for col in frame.columns
    ]
    return frame
def kmeans_cluster(data, idx, idy, plot=True):
    """Cluster the rows of `data` into idx*idy groups using k-means.

    Rows are z-score normalized (per row) before clustering. When `plot`
    is true, the resulting assignment is visualized on an idx-by-idy grid.
    Returns the per-row cluster labels from ``fit_predict``.
    """
    normalized = zscore(data, axis=1)
    model = KMeans(n_clusters=idx * idy, max_iter=30000, init='k-means++')
    labels = model.fit_predict(normalized)
    if plot:
        plot_kmeans(labels, normalized, idx, idy)
    return labels
# Exemplo n.º 3
# 0
def do_kshape(name_prefix, df, cluster_size, initial_clustering=None):
    """Run k-shape over the columns of `df` and write one plot/TSV per cluster.

    Each column is z-score normalized before clustering. For every resulting
    cluster a gzipped TSV (members plus centroid) and a PNG plot are written,
    named ``<name_prefix>_<n>``.

    Returns a tuple ``(cluster_metrics, score, filenames)`` where
    ``cluster_metrics`` maps cluster number -> list of column names,
    ``score`` is the silhouette score, and ``filenames`` holds the basenames
    of the written ``.tsv.gz`` files.
    """
    cols = df.columns
    normalized = [zscore(df[c]) for c in cols]
    result = kshape(normalized, cluster_size, initial_clustering)
    labels, score = silhouette_score(np.array(normalized), result)

    # keep a reference of which metrics are in each cluster
    # cluster_metrics[<cluster_nr>] -> [<metric_a>, <metric_b>, ...]
    cluster_metrics = defaultdict(list)
    for pos, col in enumerate(cols):
        cluster_metrics[int(labels[pos])].append(col)

    filenames = []
    for cluster_idx, (centroid, members) in enumerate(result):
        # Assemble the member series first, then the centroid (same key
        # order as the original implementation).
        series_by_name = {
            cols[member]: pd.Series(normalized[member], index=df.index)
            for member in members
        }
        series_by_name["centroid"] = pd.Series(centroid, index=df.index)
        cluster_df = pd.DataFrame(series_by_name)
        axes = cluster_df.plot()
        axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        name = "%s_%d" % (name_prefix, (cluster_idx + 1))
        filename = name + ".tsv.gz"
        print(filename)
        cluster_df.to_csv(filename, sep="\t", compression='gzip')
        filenames.append(os.path.basename(filename))
        graphs.write(cluster_df, name + ".png")
    return cluster_metrics, score, filenames
def kshape_cluster(data, idx, idy, plot=True):
    """Cluster the rows of `data` into idx*idy groups with k-shape.

    Rows are z-score normalized (per row) first. When `plot` is true the
    clusters are drawn on an idx-by-idy grid. Returns the predicted
    per-row labels.
    """
    normalized = zscore(data, axis=1)
    clusters = kshape(normalized, idx * idy)
    labels = pred(clusters, normalized)
    if plot:
        plot_kshapes(clusters, normalized, idx, idy)
    return labels
# Exemplo n.º 5
# 0
def return_best_shapelets(maxIter, minNum):
    """Search for the shapelet set with the fewest misclassifications.

    Runs k-shape clustering `maxIter` times (relying on module-level
    globals ``X``, ``Y``, ``n_clusters``, ``train_data`` and
    ``train_label``), derives shapelets from each clustering, and keeps
    the set whose confusion-matrix off-diagonal count (FP + FN) on the
    training data is lowest.

    Parameters:
        maxIter: number of random k-shape restarts to try.
        minNum: initial error threshold; only results strictly better
            than this are kept.

    Returns:
        (best_shapelets, best_error): the best shapelet vector found and
        its FP+FN count. If no iteration beats the initial `minNum`, the
        returned shapelet list is empty.
    """
    best_shapelets = []
    for _ in range(maxIter):
        cluster = kshape(zscore(X), n_clusters)
        cluster_result = get_cluster_result(cluster, n_clusters)
        real_numOfcluster = len(cluster_result)
        d = Categorized(Y, cluster_result, real_numOfcluster)
        shapelets_vec = return_shapelets(real_numOfcluster, d, cluster_result)
        predict_label = test(shapelets_vec, train_data)
        # Compute the confusion matrix once; the original called
        # confusion_matrix twice with identical arguments.
        cm = confusion_matrix(train_label, predict_label)
        FP_FN = cm[0][1] + cm[1][0]
        print(FP_FN)
        if FP_FN < minNum:
            minNum = FP_FN
            best_shapelets = shapelets_vec
    return best_shapelets, minNum
# Exemplo n.º 6
# 0
        # NOTE(review): fragment — the enclosing function and the opening
        # loop over the training file are outside this view. Python 2
        # syntax (bare `print` statements).
        x_total.append(temp[1:])
        y_total.append(int(temp[0]))
    # presumably x_test_file is an iterable of CSV lines shaped
    # "label,v1,v2,..." — TODO confirm against the (unseen) caller.
    for x in x_test_file:
        temp = [float(ts) for ts in x.split(',')]
        x_total.append(temp[1:])  # remaining fields are the series values
        y_total.append(int(temp[0]))  # first field is the class label
    # normalize:
    #preprocessing
    #relabeling
    print "Relabeling!"
    # Re-encode the raw labels as contiguous integers 0..k-1.
    le = preprocessing.LabelEncoder()
    le.fit(y_total)
    y_total = le.transform(y_total)
    x_total = np.array(x_total)
    y_total = np.array(y_total)
    print "x_total.shape is ", x_total.shape
    # Labels are 0-based after LabelEncoder, so cluster count = max label + 1.
    n_cluster = np.max(y_total) + 1
    print 'n_clusters=', n_cluster
    start = time.time()
    # Time only the k-shape fit (on z-scored rows), not the evaluation.
    clusters = kshape(zscore(x_total), n_cluster)
    #clusters=kshape(x_total,n_cluster)
    # Assign predicted label i to every row index in cluster i's member list.
    y_pred = np.zeros(x_total.shape[0])
    for i in range(len(clusters)):
        y_pred[clusters[i][1]] = i

    time_kshape = time.time() - start
    # Evaluate clustering quality against the true labels (NMI and
    # adjusted Rand index).
    nmi_kshape = normalized_mutual_info_score(y_total, y_pred)
    ri_kshape = adjusted_rand_score(y_total, y_pred)
    print 'time: %.3f, nmi: %.3f, ri: %.3f' % (time_kshape, nmi_kshape,
                                               ri_kshape)
# Exemplo n.º 7
# 0
from kshape import kshape, zscore

# Minimal k-shape usage example: partition four short series into two
# clusters after z-score normalization, then print the result.
time_series = [[1, 2, 3, 4, 5], [0, 1, 2, 3, 4], [3, 2, 1, 0, -1], [1, 2, 2, 3, 3]]
cluster_num = 2
clusters = kshape(zscore(time_series), cluster_num)
print(clusters)