# Imports needed to run this snippet (tslearn assumed installed)
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from tslearn.preprocessing import TimeSeriesScalerMinMax


def tsclusteringN(ts_data, names):
    # Clustering

    # Normalization
    ts_dataset = TimeSeriesScalerMinMax().fit_transform(ts_data)

    metric = 'dtw'
    n_clusters = [n for n in range(2, 6)]
    for n in n_clusters:
        print('Number of clusters =', n)

        # With metric 'dtw' or 'softdtw', the time series may have different lengths
        km = TimeSeriesKMeans(n_clusters=n,
                              metric=metric,
                              verbose=False,
                              random_state=1).fit(ts_dataset)

        # Clustering result
        print('Cluster labels =', km.labels_)

        # Silhouette values lie between -1 and 1. The best number of clusters has a
        # value close to 1 and, in a silhouette plot, the smallest spread between
        # clusters; here only the silhouette value itself is checked.
        print('Silhouette score =',
              silhouette_score(ts_dataset, km.labels_, metric=metric))
        print()
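A minimal usage sketch for the function above, reusing the imports shown with it (dummy sine data; the names argument is accepted but unused by the function):

import numpy as np
from tslearn.utils import to_time_series_dataset

series = [np.sin(np.linspace(0, 6, 40)) + 0.1 * np.random.randn(40)
          for _ in range(8)]
ts_data = to_time_series_dataset(series)
tsclusteringN(ts_data, names=list(range(8)))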
Example #2
def visualize_n_cluster(train_ts,
                        n_lists=[3, 4, 5, 6],
                        metric='dtw',
                        seed=2021,
                        vis=True):

    if vis:
        fig = plt.figure(figsize=(20, 5))
        plt.title('Distribution of building counts per number of clusters', fontsize=15, y=1.2)
        plt.axis('off')

    # Snapshot the raw series so that the cluster columns appended below do not
    # leak into later fits and silhouette computations
    features = train_ts.copy()

    for idx, n in enumerate(n_lists):
        ts_kmeans = TimeSeriesKMeans(n_clusters=n,
                                     metric=metric,
                                     random_state=seed)
        train_ts['cluster(n={})'.format(n)] = ts_kmeans.fit_predict(features)
        score = round(
            silhouette_score(features,
                             train_ts['cluster(n={})'.format(n)],
                             metric='euclidean'), 3)

        vc = train_ts['cluster(n={})'.format(n)].value_counts()

        if vis:
            ax = fig.add_subplot(1, len(n_lists), idx + 1)
            sns.barplot(x=vc.index, y=vc, palette='Pastel1')
            ax.set(title='n_cluster={0}\nscore:{1}'.format(n, score))
    if vis:
        plt.tight_layout()
        plt.show()

    return train_ts
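A hedged usage sketch for the function above, on dummy data; pandas/numpy plus the seaborn, matplotlib and tslearn imports used inside the function are assumed to be in scope:

import numpy as np
import pandas as pd

# rows are buildings, columns are hourly loads over one week
train_ts = pd.DataFrame(np.random.rand(60, 24 * 7))
train_ts = visualize_n_cluster(train_ts, n_lists=[3, 4])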
Example #3
def elbowmethod(df: pd.DataFrame):
    """
    Finds the best number of clusters and plots the score against the number
    of clusters. Despite the name, the score used is the silhouette value,
    not the distortion.
    Args:
        df (pd.DataFrame): columns are (country(index), year_week, value)
    Returns:
        int: best number of clusters
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score

    scores = []

    K = range(2, df.shape[0])
    for k in K:
        # Building and fitting the model
        model = TimeSeriesKMeans(n_clusters=k, metric="softdtw", max_iter=50)
        model.fit(df.values[..., np.newaxis])
        # Score the labels on the same 3D dataset the model was fitted on
        scores.append(
            silhouette_score(df.values[..., np.newaxis],
                             model.labels_,
                             metric="softdtw"))

    plt.plot(K, scores, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel('Silhouette score')
    plt.title('Choosing K by silhouette score')
    plt.show()

    # K starts at 2, so shift the argmax accordingly
    best_num_cluster = np.argmax(scores) + 2
    return best_num_cluster
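A minimal driver for elbowmethod, using dummy data in place of the weekly country series (shapes are illustrative only):

import numpy as np
import pandas as pd

# eight countries (rows), one year of weekly values (columns)
df = pd.DataFrame(np.random.rand(8, 52))
print('best number of clusters:', elbowmethod(df))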
Example #4
def visualize_n_cluster(train_ts, n_lists=[3, 4, 5, 6], metric='dtw', seed=2021, vis=True):

    # Snapshot the raw series so the appended cluster columns do not leak into later fits
    features = train_ts.copy()

    for idx, n in enumerate(n_lists):
        ts_kmeans = TimeSeriesKMeans(n_clusters=n, metric=metric, random_state=seed)
        train_ts['cluster(n={})'.format(n)] = ts_kmeans.fit_predict(features)
        score = round(silhouette_score(features, train_ts['cluster(n={})'.format(n)],
                                       metric='euclidean'), 3)

    return train_ts
Example #5
    def shape_score(self, data, labels, metric='dtw'):
        """
        Silhouette score for a clustering of time series data.

        :param data: time series dataset that was clustered
        :param labels: predicted cluster labels
        :param metric: distance metric, e.g. 'dtw'
        :return: silhouette score
        """
        score = silhouette_score(data, labels, metric)
        return score
Example #6
def cluster_time_series(ts_sample, cluster_alg, n_clusters, cluster_metric, score=False):

    # Dataframe to store cluster results
    clust_df = pd.DataFrame(ts_sample.tslist.tolist(), index=ts_sample.tslist.index).reset_index()
    clust_df.columns.values[3:] = ts_sample.sample_dates

    # Fit model
    if cluster_alg == "GAKM":
        km = clust.GlobalAlignmentKernelKMeans(n_clusters=n_clusters)

    if cluster_alg == "TSKM":
        km = clust.TimeSeriesKMeans(n_clusters=n_clusters, metric=cluster_metric)

    # Add predicted cluster labels to cluster results dataframe
    labels = km.fit_predict(ts_sample.ts_dataset)
    clust_df['cluster'] = labels

    if score:
        s = silhouette_score(ts_sample.ts_dataset, labels)
        return clust_df, s

    return clust_df
Example #7
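The snippet below starts from preexisting multivariate train/test arrays; a hedged setup sketch using tslearn's random_walks generator (the array names mirror the snippet, the sizes are arbitrary):

import random

from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from tslearn.generators import random_walks

multivariate_time_series_train = random_walks(n_ts=50, sz=32, d=2)
multivariate_time_series_test = random_walks(n_ts=20, sz=32, d=2)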
                          metric="softdtw",
                          max_iter=5,
                          max_iter_barycenter=5,
                          random_state=0).fit(multivariate_time_series_train)
km_dba.cluster_centers_.shape
# prediction on train data (note: fit_predict refits the model)
prediction_train = km_dba.fit_predict(multivariate_time_series_train, y=None)
len(prediction_train)
#prediction on test data
prediction_test = km_dba.predict(multivariate_time_series_test)
len(prediction_test)
prediction_test

# silhouette score (not accuracy) of the clustering on the train data
silhouette_score(multivariate_time_series_train,
                 prediction_train,
                 metric="softdtw")
# silhouette score of the clustering on the test data
silhouette_score(multivariate_time_series_test,
                 prediction_test,
                 metric="softdtw")

############################################ k=2 #########################################
# randomly select time series from the first cluster

cluster1 = multivariate_time_series_train[prediction_train == 0]

random.shuffle(cluster1)

sample1 = cluster1[50:65]
Example #8
def subseqeuence_clustering(sequence, changepoints, y_label='y', norm=False):
    """
    Clusters subsequences of a time series, split at the given changepoints.
    Uses the silhouette score to determine the number of clusters.
    :param sequence: np array of the time series
    :param changepoints: detected changepoints on which the subsequences are built
    :param y_label: name of the y-label in the plot
    :param norm: normalise the data using MinMaxScaler
    :return: dict mapping cluster labels to their subsequence ids
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    sub_ids = []
    x_index = []
    X = []
    i = 0
    end_p = [len(sequence) - 1]
    for cp in changepoints + end_p:
        X.append(sequence[i:cp])
        index = 'sub_' + str(i) + '_' + str(cp)
        sub_ids.append(index)
        x_index.append([x_id for x_id in range(i, cp + 1)])
        i = cp

    # Normalize the data (y = (x - min) / (max - min))
    if norm:
        X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)
    #  Find optimal # clusters by
    #  looping through different configurations for # of clusters and store the respective values for silhouette:
    sil_scores = {}
    for n in range(2, len(changepoints)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X,
                                          model_tst.predict(X),
                                          metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    print('Number of Clusters in subsequence clustering: ' + str(opt_k))
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(sub_ids, x_index, model.labels_)),
                              columns=['metric', 'x_index', 'cluster'])
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()

    print('Plotting Clusters')
    #  plot changepoints as vertical lines
    for cp in changepoints:
        plt.axvline(x=cp, ls=':', lw=2, c='0.65')
    #  preprocessing for plotting cluster based
    x_scat = []
    y_scat = []
    cluster = []
    for index, row in df_cluster.iterrows():
        x_seq = row['x_index']
        x_scat.extend(x_seq)
        y_seq = sequence[x_seq[0]:x_seq[-1] + 1]
        y_scat.extend(y_seq)
        label_seq = [row['cluster']]
        cluster.extend(label_seq * len(x_seq))
        # plt.scatter(x_seq, y_seq, label=label_seq)
    # plotting cluster based
    x_scat = np.array(x_scat)
    y_scat = np.array(y_scat)
    cluster = np.array(cluster)  # make the elementwise comparison below explicit
    for c in np.unique(cluster):
        i = np.where(cluster == c)
        plt.scatter(x_scat[i], y_scat[i], label=c)
    plt.legend()
    plt.title('Subsequence k-means Clustering')
    plt.xlabel('Time index')
    plt.ylabel(y_label)
    plt.show()

    return cluster_metrics_dict
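A hedged usage sketch for the function above on dummy data (numpy assumed imported as np; pandas and matplotlib must be installed since the function uses them):

import numpy as np

# a series with level shifts at t=75, 150 and 225
seq = np.concatenate([np.random.randn(75),
                      np.random.randn(75) + 5,
                      np.random.randn(75),
                      np.random.randn(75) + 5])
clusters = subseqeuence_clustering(seq, changepoints=[75, 150, 225])
print(clusters)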
Example #9
def main(args):

    data_dir = './Data/User Categorization/'

    if args.method == 'K':
        print('Working on K-means clustering')
        ts_dataset = []

        # Only take the first 500 unique IDs
        n_samples = 500

        for i in range(n_samples):
            csv_file = pd.read_csv(data_dir + str(i) + '.csv')
            time_series_df = csv_file[(~csv_file['f_1'].isnull())
                                      & (~csv_file['f_2'].isnull())]
            time_series_seq = list(time_series_df[['f_1', 'f_2',
                                                   'f_3']].values)
            ts_dataset.append(time_series_seq)

        #Preparing Time-series dataset
        formatted_dataset = to_time_series_dataset(ts_dataset)

        silhouette_scores = []
        n_clusters = [2, 3, 4, 5, 6]

        for cluster in n_clusters:
            km = TimeSeriesKMeans(n_clusters=cluster,
                                  metric="dtw",
                                  verbose=True,
                                  max_iter=5)
            y_pred = km.fit_predict(formatted_dataset)
            s_score = silhouette_score(formatted_dataset, y_pred, metric="dtw")
            silhouette_scores.append(s_score)

        sns.lineplot(x=n_clusters, y=silhouette_scores, sort=False)

        #Optimal clusters
        km = TimeSeriesKMeans(n_clusters=2,
                              metric="dtw",
                              verbose=True,
                              max_iter=5)
        y_pred = km.fit_predict(formatted_dataset)
        df = pd.DataFrame(data=y_pred, columns=['Cluster No.'])
        df.to_csv('./kmeans_clustering.csv', index=False)

        #Visualise Clusters
        sz = formatted_dataset.shape[1]
        plt.figure(figsize=(20, 20))

        for yi in range(2):
            plt.subplot(3, 3, 2 + yi)
            for xx in formatted_dataset[y_pred == yi]:
                plt.plot(xx.ravel(), "k-", alpha=.2)
            plt.plot(km.cluster_centers_[yi].ravel(), "r-")
            plt.xlim(0, sz)
            plt.ylim(-500000, 500000)
            plt.text(0.55,
                     0.85,
                     'Cluster %d' % (yi + 1),
                     transform=plt.gca().transAxes)
            if yi == 1:
                plt.title("DTW $k$-means")
        plt.tight_layout()
        plt.show()

    elif args.method == 'H':
        #Hierarchical clustering
        print('Working on Hierarchical clustering')
        #Build distance matrix
        manual_dist_matrix = True
        n_samples = 500

        # when False, compute the DTW distance matrix from scratch and save it
        if not manual_dist_matrix:
            distance_matrix = np.zeros(shape=(n_samples, n_samples))

            for i in range(n_samples):
                for j in range(n_samples):
                    sequence_1_df = pd.read_csv('./Data/User Categorization/' +
                                                str(i) + '.csv')
                    sequence_2_df = pd.read_csv('./Data/User Categorization/' +
                                                str(j) + '.csv')

                    seq_1 = sequence_1_df[(~sequence_1_df['f_1'].isnull())
                                          & (~sequence_1_df['f_2'].isnull())]
                    seq_2 = sequence_2_df[(~sequence_2_df['f_1'].isnull())
                                          & (~sequence_2_df['f_2'].isnull())]

                    x = seq_1[['f_1', 'f_2', 'f_3']].values
                    y = seq_2[['f_1', 'f_2', 'f_3']].values

                    distance, path = fastdtw(x, y, dist=euclidean)

                    if i != j:
                        distance_matrix[i, j] = distance

            savetxt('distance_matrix.csv', distance_matrix, delimiter=',')

        distance_matrix = np.genfromtxt('distance_matrix.csv', delimiter=',')
        linkage_matrix = hierarchical_clustering(distance_matrix)

        # select maximum number of clusters
        cluster_labels = fcluster(linkage_matrix, 4, criterion='maxclust')
        print(np.unique(cluster_labels))

        categorization_df = []
        files_list = os.listdir('./Data/User Categorization')

        for files in files_list:
            csv_file = pd.read_csv('./Data/User Categorization/' + str(files))
            unique_id = files[:-4]
            csv_file['ID'] = unique_id
            categorization_df.append(csv_file)

        df = pd.concat(categorization_df, axis=0, ignore_index=True)

        #filter out null values
        filtered_df = df[(~df['f_1'].isnull()) & (~df['f_2'].isnull())]

        df_vis = filtered_df.sort_values(by='ID')
        df_vis['ID'] = df_vis['ID'].astype('int')
        df_vis = df_vis[df_vis['ID'] <= 499].sort_values(by='ID').reset_index(
            drop=True)
        df_vis_fil = df_vis.groupby('ID')[['f_1', 'f_2',
                                           'f_3']].mean().reset_index()
        df_vis_fil['Cluster'] = cluster_labels
        df_vis_fil.to_csv('./hier_clustering.csv', index=False)

        #Plotting Visualisation 3D scatterplot
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        x = np.array(df_vis_fil['f_1'])
        y = np.array(df_vis_fil['f_2'])
        z = np.array(df_vis_fil['f_3'])

        ax.scatter(x, y, z, marker="s", c=df_vis_fil["Cluster"], cmap="RdBu")

        plt.show()

    else:
        print('Please specify the clustering method as K or H')
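main expects an argparse-style namespace with a method attribute; a minimal driver sketch:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--method', choices=['K', 'H'], default='K',
                        help='K for k-means, H for hierarchical clustering')
    main(parser.parse_args())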
Example #10
def k_means_clustering(sd_log):
    """
    k_means clustering of all features using dtw for multivariate time series
    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as key and features as values
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    # drop the waiting-time column before clustering
    data.drop(columns=[sd_log.waiting_time], inplace=True)
    X = []
    # Get data as numpy array
    for col in data.columns:
        X.append(sd_log.get_points(col))

    # Normalize the data (y = (x - min) / (max - min))
    data_norm = data.copy()
    for column in data_norm.columns:
        data_norm[column] = (data_norm[column] - data_norm[column].min()) / (
            data_norm[column].max() - data_norm[column].min())

    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    #  Find optimal # clusters by
    #  looping through different configurations for # of clusters and store the respective values for silhouette:
    sil_scores = {}
    for n in range(2, len(data.columns)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X,
                                          model_tst.predict(X),
                                          metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])

    # make some helper dictionaries and lists
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_dropped = [
        cluster for cluster in cluster_len_dict
        if cluster_len_dict[cluster] == 1
    ]
    clusters_final = [
        cluster for cluster in cluster_len_dict
        if cluster_len_dict[cluster] > 1
    ]

    print('Plotting Clusters')

    fig, axs = plt.subplots(opt_k)  # , figsize=(10, 5))
    # fig.suptitle('Clusters')
    row_i = 0
    # column_j = 0
    # For each label there is,
    # plots every series with that label
    for cluster in cluster_metrics_dict:
        for feat in cluster_metrics_dict[cluster]:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(cluster_metrics_dict[cluster]) > 1:
            # draw the cluster mean in red when the cluster holds more than one feature
            tmp = data_norm[cluster_metrics_dict[cluster]].mean(axis=1)
            axs[row_i].plot(tmp, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))
        row_i += 1
        # column_j += 1
        # if column_j % k == 0:
        #    row_i += 1
        #    column_j = 0
    plt.show()

    # return dict {cluster_id: features}
    return cluster_metrics_dict
Example #11
    return xs, ys


if __name__ == '__main__':
    TIMELINE = 'C:/Users/Vivian Imbriotis/Desktop/2018-04-26_01_CFEB106/1-3-2018-04-26_01_CFEB106_1/metadata matlab/2018-04-26_01_CFEB106_Timeline.mat'
    SPKS = 'C:/Users/Vivian Imbriotis/Desktop/2018-04-26_01_CFEB106/1-3-2018-04-26_01_CFEB106_1/npy/spks.npy'
    TIMEPOINTS = 200
    MAX_K = 6
    stamps = get_frame_times(TIMELINE)

    scores = []
    models = []
    data = np.load(SPKS)
    for i in range(2, MAX_K + 1):
        model = get_fitted_model(data,
                                 clusters=i,
                                 timepoints=TIMEPOINTS,
                                 verbose=True)
        models.append(model)
    for model in models:
        scores.append(silhouette_score(
            data[:, 0:TIMEPOINTS],
            model.labels_,
            metric='dtw',
            n_jobs=-1,
            verbose=True))
    plt.plot(list(range(2, MAX_K + 1)), scores)
    plt.title("Performance of KMeans on axonal data")
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.show()
Example #12
def test_elbow(X, dtw_value, seed):
    print(len(X))
    distortions = []
    silhouette_value = []
    dists = dtw_value
    print(dists)
    if seed == -1:
        for seed in range(0, 21):
            cur_silhouette = [seed]
            cur_distortions = [seed]
            for i in range(2, 15):
                print(i)
                km = KMedoids(n_clusters=i,
                              random_state=seed,
                              metric="precomputed",
                              init='k-medoids++',
                              max_iter=30000)
                # fit once and reuse the labels
                y_pred = km.fit_predict(dists)
                # record the sum of squared errors (inertia)
                cur_distortions.append(km.inertia_)
                np.fill_diagonal(dists, 0)
                score = silhouette_score(dists, y_pred, metric="precomputed")
                cur_silhouette.append(score)
            distortions.append(cur_distortions)
            silhouette_value.append(cur_silhouette)
        with open(r".//res//grid_distortions_destination.csv",
                  "w",
                  encoding='UTF-8',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in distortions:
                writer.writerow(row)
                print(row)
        with open(r".//res//grid_silhouette_destination.csv",
                  "w",
                  encoding='UTF-8',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in silhouette_value:
                writer.writerow(row)
                print(row)
    else:
        csv_reader = csv.reader(
            open(".//res//grid_distortions_destination.csv", encoding='UTF-8'))
        for row in csv_reader:
            distortions.append([float(item) for item in row])
        csv_reader = csv.reader(
            open(".//res//grid_silhouette_destination.csv", encoding='UTF-8'))
        for row in csv_reader:
            silhouette_value.append([float(item) for item in row])
        chosen_distortions = distortions[seed][1:]
        chosen_silhouette = silhouette_value[seed][1:]
        plt.figure(1)
        plt.plot(range(2, 15), chosen_distortions, marker='o')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        plt.savefig(r'.//res//grid_distortions_destination.png')
        plt.close()
        plt.figure(1)
        plt.bar(range(2, 15), chosen_silhouette, color='grey')
        plt.xlabel('Number of clusters')
        plt.ylabel('Silhouette score')
        plt.savefig(r'.//res//grid_silhouette_destination.png')
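test_elbow takes a precomputed pairwise distance matrix; a hedged setup sketch using tslearn's cdist_dtw on dummy random walks (the .//res// output directory, plus the csv/numpy/KMedoids/silhouette_score imports used inside the function, are assumed to exist):

from tslearn.generators import random_walks
from tslearn.metrics import cdist_dtw

X = random_walks(n_ts=30, sz=50, d=1)
dtw_value = cdist_dtw(X)           # (30, 30) matrix of pairwise DTW distances
test_elbow(X, dtw_value, seed=-1)  # seed=-1 grid-searches seeds and writes the CSVs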
Example #13
    max_cluster = 21
    silhouette_score_dict = {}
    sse_dict = {}
    label_dict = {}

    silhouette_score_dict["time-series-k-means"] = []
    sse_dict["time-series-k-means"] = []
    label_dict["time-series-k-means"] = {}
    # silhouette_score_dict["k-shape"] = []
    # silhouette_score_dict["global-alignment-kernel-k-means"] = []
    for i in range(min_cluster, max_cluster):
        print(service + "-cluster:" + str(i))
        km = TimeSeriesKMeans(n_clusters=i, verbose=True)
        label = km.fit_predict(X_train)
        silhouette_score_dict["time-series-k-means"].append(
            silhouette_score(X_train, label, metric="dtw"))
        sse_dict["time-series-k-means"].append(km.inertia_)
        label_dict["time-series-k-means"][i] = label

        # km = GlobalAlignmentKernelKMeans(n_clusters=i, verbose=True)
        # label = km.fit_predict(X_train)
        # silhouette_score_dict["global-alignment-kernel-k-means"].append(silhouette_score(X_train, label, metric="dtw"))

        # km = KShape(n_clusters=i, verbose=True)
        # label = km.fit_predict(X_train)
        # silhouette_score_dict["k-shape"].append(silhouette_score(X_train, label, metric="dtw"))

    s1 = str(silhouette_score_dict)
    s2 = str(sse_dict)

    service = service.replace("/", "-")

# --- separate snippet: clustering the columns of a datetime-indexed frame dt ---
dt['UnixTime'] = dt.index.astype(np.int64) // 10**9

dt = dt.fillna(0)

evalu = []

for k in range(10):

    km = TimeSeriesKMeans(n_clusters=k + 2,
                          verbose=True,
                          random_state=23,
                          metric="dtw")

    Y = km.fit_predict(dt.T)

    evalu.append(silhouette_score(dt.T, Y, metric="dtw"))

# 6 clusters is best

km = TimeSeriesKMeans(n_clusters=7,
                      verbose=True,
                      random_state=23,
                      metric="dtw")

Y = km.fit_predict(dt.T)

c1 = np.where(Y == 0)[0].tolist()
c2 = np.where(Y == 1)[0].tolist()
c3 = np.where(Y == 2)[0].tolist()
c4 = np.where(Y == 3)[0].tolist()
c5 = np.where(Y == 4)[0].tolist()
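The index lists above hold the positions of dt's columns in each cluster (the snippet clusters dt.T, so each column of dt is one series); a short hedged sketch mapping them back to column names:

for k, idx in enumerate([c1, c2, c3, c4, c5], start=1):
    print('cluster', k, ':', [dt.columns[i] for i in idx])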