Example #1
# his_df (a price history DataFrame with a 'Close' column) and train_df (an empty
# DataFrame that collects the windows) are assumed to be defined earlier, along with
# imports of numpy as np, pandas as pd and dtw from dtaidistance.
idx = 0
step = 1
period = 60
while True:
    split = his_df[idx:idx + period]
    split = split.reset_index()
    if idx > len(his_df) or len(split) < period: break
    # normalise the window by its last close and store it as column `idx`
    train_df[idx] = split['Close'] / split['Close'].iloc[-1] - 1
    idx += step

train_df = train_df.T  # transpose
train_np = train_df.to_numpy()

ds = dtw.distance_matrix_fast(train_np,
                              block=((0, 1), (1, len(train_np))),
                              compact=True)
ds_array = np.array(ds)

ds_array = np.delete(ds_array, range(60), axis=0)  # remove windows around the target
temp_value = []
temp_index = []
store = {}
rtn_store = {}
scale = period
bar_data = pd.DataFrame()

# get the 10 most similar windows
for i in range(10):

    temp_value.append(min(ds_array))
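The selection loop above is cut off in the example. A minimal way to finish it, assuming ds_array holds the distances from the target window to every remaining candidate window, is to rank the candidates with np.argsort:

# Hypothetical completion: indices and distances of the 10 most similar windows.
# Note the indices are relative to ds_array after the np.delete() call above.
nearest = np.argsort(ds_array)[:10]
temp_index = list(nearest)
temp_value = list(ds_array[nearest])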
Example #2
## For storing boxplot data
median_dist = []
min_dist = []
max_dist = []
q1_dist = []
q3_dist = []
## For storing violin plot data
mean_dist_all = []

## calculating the DTW distance matrix for t-SNE
## (data_all, penalty and out_folder are assumed to be defined earlier)
series_data = []
for i in data_all.index:
    val = map(int, data_all.loc[i, 'values'].split("_"))
    series_data.append(np.array(list(val), dtype=np.double))

ds = dtw.distance_matrix_fast(series_data, penalty=penalty)

# distance_matrix_fast fills only the upper triangle (the rest is inf), so mirror
# it into the lower triangle and zero the diagonal to get a full symmetric matrix.
ds[np.tril_indices(ds.shape[0], k=-1)] = ds.T[np.tril_indices(ds.shape[0],
                                                              k=-1)]
np.fill_diagonal(ds, 0)
ds = pd.DataFrame(ds)
ds.index = data_all['kmer']
ds.columns = data_all['kmer']

os.makedirs(out_folder + '/distance_matrices/', exist_ok=True)
os.makedirs(out_folder + '/raw_signal/', exist_ok=True)
for kmer_row in data_all['kmer'].unique():
    for kmer_column in data_all['kmer'].unique():

        current_ds = ds.loc[[kmer_row], [kmer_column]]
        current_ds.columns = range(0, len(current_ds.columns))
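Not part of the original snippet, but the symmetric DTW matrix built above can also be passed straight to scikit-learn's t-SNE as a precomputed metric; a minimal sketch:

from sklearn.manifold import TSNE

# Embed the k-mers in 2-D from the precomputed DTW distances.
embedding = TSNE(n_components=2, metric='precomputed',
                 init='random').fit_transform(ds.values)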
Example #3
def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
    """
        Performs a binary search to get P-values in such a way that each
        conditional Gaussian has the same perplexity.
    """

    # Initialize some variables
    print("Computing pairwise distances...")
    # https://stackoverflow.com/questions/37009647/compute-pairwise-distance-in-a-batch-without-replicating-tensor-in-tensorflow
    (n, d) = X.shape
    # sum_X = np.sum(np.square(X), 1)
    # D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
    D = dtw.distance_matrix_fast(X)
    # distance_matrix_fast fills only the upper triangle (inf elsewhere);
    # mirror it so that every row of D is usable in the loop below.
    D = np.minimum(D, D.T)

    print(D.shape)
    P = np.zeros((n, n))
    beta = np.ones((n, 1))
    logU = np.log(perplexity)

    # Loop over all datapoints
    for i in range(n):

        # Print progress
        if i % 500 == 0:
            print("Computing P-values for point %d of %d..." % (i, n))

        # Compute the Gaussian kernel and entropy for the current precision
        betamin = -np.inf
        betamax = np.inf
        Di = D[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))]
        (H, thisP) = Hbeta(Di, beta[i])

        # Evaluate whether the perplexity is within tolerance
        Hdiff = H - logU
        tries = 0
        while np.abs(Hdiff) > tol and tries < 50:

            # If not, increase or decrease precision
            if Hdiff > 0:
                betamin = beta[i].copy()
                if betamax == np.inf or betamax == -np.inf:
                    beta[i] = beta[i] * 2.
                else:
                    beta[i] = (beta[i] + betamax) / 2.
            else:
                betamax = beta[i].copy()
                if betamin == np.inf or betamin == -np.inf:
                    beta[i] = beta[i] / 2.
                else:
                    beta[i] = (beta[i] + betamin) / 2.

            # Recompute the values
            (H, thisP) = Hbeta(Di, beta[i])
            Hdiff = H - logU
            tries += 1

        # Set the final row of P
        P[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))] = thisP

    # Return final P-matrix
    print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
    return P
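x2p calls an Hbeta helper that is not shown in this example. A minimal sketch of the standard t-SNE version (entropy and P-row of a Gaussian kernel at precision beta) is:

def Hbeta(D=np.array([]), beta=1.0):
    """Compute the entropy H and the P-row for distance row D at precision beta."""
    P = np.exp(-D.copy() * beta)
    sumP = sum(P)
    H = np.log(sumP) + beta * np.sum(D * P) / sumP
    P = P / sumP
    return H, P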
Example #4
import numpy as np
from numpy import inf
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance import dtw
from dtaidistance import clustering
import sklearn
from sklearn import cluster

df = pd.read_csv("Scania_Data_Clustering.csv", header=0) # header=0 is default
head = list(df.columns.values) # get machine names
print("head", head) # print machine names

df = df.T # transpose the data
df = df.values

ds = dtw.distance_matrix_fast(df) # get dist matrix
ds[ds == inf] = 0 # replace all infinity vals in the dist matrix with 0.

pd.DataFrame(ds).to_excel("ds.xlsx") # save dist matrix to a xlsx.

# clustering starts
# Custom Hierarchical clustering
model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# Augment Hierarchical object to keep track of the full tree
model2 = clustering.HierarchicalTree(model1)
# SciPy linkage clustering
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})

cluster_idx = model3.fit(df)

# plot
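The example stops at the plotting step. One way to finish it, assuming matplotlib is available, is dtaidistance's built-in tree plotting:

# Render the linkage tree (dendrogram) to a file.
model3.plot("dendrogram.png")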
Example #5
def cluster_the_ts_curves(infile, outfolder, maturity, smoothing):
    # Assumes module-level imports (os, numpy as np, matplotlib.pyplot as plt,
    # savgol_filter from scipy.signal, dtw and clustering from dtaidistance) and
    # the helpers transform_ts and getBinnedDistribution defined elsewhere.

    series = {}
    venues = []
    indicies = [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (1, 0), (1, 1), (1, 2),
                (1, 3), (1, 4), (1, 5)]

    for ind, line in enumerate(open(infile)):
        fields = line.strip().split('\t')
        venue = fields[0]
        ts = fields[1:]
        venues.append(venue)
        #if ind == 500: break

        if smoothing == 'smooth':
            series[venue] = savgol_filter(
                np.asarray([float(fff) for fff in ts]), 5, 3)
        elif smoothing == 'notsmooth':
            series[venue] = np.asarray([float(fff) for fff in ts])
        else:
            print('Unknown smoothing option:', smoothing)

    dists = dtw.distance_matrix_fast(list(series.values()))
    model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    cluster_idx = model3.fit(list(series.values()))
    linkage_matrix = model3.linkage

    nnn = len(series)
    cluster_dict = {}

    if not os.path.exists(maturity):
        os.makedirs(maturity)

    for i in range(0, nnn - 1):

        new_cluster_id = nnn + i
        old_cluster_id_0 = linkage_matrix[i, 0]
        old_cluster_id_1 = linkage_matrix[i, 1]
        combined_ids = list()
        if old_cluster_id_0 in cluster_dict:
            combined_ids += cluster_dict[old_cluster_id_0]
            del cluster_dict[old_cluster_id_0]
        else:
            combined_ids += [old_cluster_id_0]
        if old_cluster_id_1 in cluster_dict:
            combined_ids += cluster_dict[old_cluster_id_1]
            del cluster_dict[old_cluster_id_1]
        else:
            combined_ids += [old_cluster_id_1]
        cluster_dict[new_cluster_id] = combined_ids

        nodes_included = []
        for v in cluster_dict.values():
            nodes_included += v

        nc = len(cluster_dict)
        nnodes = len(set(nodes_included))

        #for NNN in [6]:
        #for NNN in [3, 5, 6, 10]:
        for NNN in [10]:

            #NNN = 6   # 5 # 6 # 10

            figfolder = outfolder + '/' + maturity + '/figs_clusters_' + smoothing + '/' + str(NNN)
            curvefolder = outfolder + '/' + maturity + '/avg_curves_' + smoothing + '/' + str(NNN)
            vensfolder = outfolder + '/' + maturity + '/clusters_venues_' + smoothing + '/' + str(NNN)

            if not os.path.exists(figfolder): os.makedirs(figfolder)
            if not os.path.exists(curvefolder): os.makedirs(curvefolder)
            if not os.path.exists(vensfolder): os.makedirs(vensfolder)

            MINCSIZE = 100
            MAXSIZE = len(series) / 2

            cnt = [(c, len(n)) for (c, n) in cluster_dict.items()
                   if len(n) > MINCSIZE and len(n) < MAXSIZE]
            num = min(len(cnt), NNN)
            cnt = sorted(cnt, key=lambda tup: tup[1], reverse=True)[0:num]

            biggest = sum([cc[1] for cc in cnt])
            top5cluster = [c[0] for c in cnt]

            if biggest > len(series) / 2:

                f, ax = plt.subplots(2, 5, figsize=(20, 8))
                ind = 0

                for ccc, nodes in cluster_dict.items():

                    if ccc in top5cluster:

                        ttt = []
                        sss = []

                        cluster_vens = []
                        subseries = []

                        for n in nodes:

                            subseries.append(list(series.values())[int(n)])

                            sss += list(list(series.values())[int(n)])
                            ttt += transform_ts(
                                list(range(len(list(
                                    series.values())[int(n)]))), 11)

                        for n in nodes:

                            cluster_vens.append(list(series.keys())[int(n)])
                            linetotplot = list(series.values())[int(n)]
                            xlinetotplot = transform_ts(
                                list(range(len(list(
                                    series.values())[int(n)]))), 11)

                            ax[indicies[ind]].plot(xlinetotplot,
                                                   linetotplot,
                                                   linewidth=0.4,
                                                   color='grey',
                                                   alpha=0.15)

                        ffout = open(
                            vensfolder + '/venues_in_' + str(ind) + '_' +
                            str(biggest) + '_venuesnum=' +
                            str(len(subseries)) + '.dat', 'w')
                        ffout.write('\n'.join(cluster_vens))
                        ffout.close()

                        ax[indicies[ind]].set_title('Number of venues = ' +
                                                    str(len(subseries)),
                                                    fontsize=15)

                        bx, by = getBinnedDistribution(ttt, sss, 8)
                        bx = (bx[1:] + bx[:-1]) / 2

                        fout = open(
                            curvefolder + '/avg_curve_' + str(ind) + '_' +
                            str(biggest) + '_venuesnum=' +
                            str(len(subseries)) + '.dat', 'w')
                        fout.write('\t'.join([str(b) for b in bx]) + '\n')
                        fout.write('\t'.join([str(b) for b in by]) + '\n')
                        fout.close()
                        ax[indicies[ind]].plot(bx, by, linewidth=3, color='r')

                        ind += 1

                plt.savefig(figfolder + '/top_' + str(NNN) + '_clusters_' +
                            str(biggest) + '.png')
                plt.close()
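getBinnedDistribution and transform_ts are helpers that are not shown in this example. From the call sites above, a plausible sketch of getBinnedDistribution, assuming it returns bin edges plus per-bin averages, is:

def getBinnedDistribution(x, y, nbins):
    # Hypothetical reconstruction: average the y values falling into nbins
    # equal-width bins over x (assumes every bin receives at least one point).
    bins = np.linspace(min(x), max(x), nbins + 1)
    sums, _ = np.histogram(x, bins=bins, weights=y)
    counts, _ = np.histogram(x, bins=bins)
    return bins, sums / counts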
Example #6
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from dtaidistance import dtw

# `data` is assumed to be loaded earlier; data[i][3] holds the time-series matrix
# for dataset i.
ts_d1 = np.array(data[0][3])
ts_d1 = ts_d1.reshape([ts_d1.shape[0], ts_d1.shape[1]])

ts_d2 = np.array(data[1][3])
ts_d2 = ts_d2.reshape([ts_d2.shape[0], ts_d2.shape[1]])

ts_d3 = np.array(data[2][3])
ts_d3 = ts_d3.reshape([ts_d3.shape[0], ts_d3.shape[1]])

ts_d4 = np.array(data[3][3])
ts_d4 = ts_d4.reshape([ts_d4.shape[0], ts_d4.shape[1]])

num_d1 = ts_d1.shape[0]
num_d2 = ts_d2.shape[0]
num_d3 = ts_d3.shape[0]
num_d4 = ts_d4.shape[0]

ts = np.concatenate([ts_d1, ts_d2, ts_d3, ts_d4], axis=0)
ds = dtw.distance_matrix_fast(ts)

ds_d1 = ds[:num_d1, :num_d1]
ds_d1_d2 = ds[:num_d1, num_d1:num_d1 + num_d2]

ds_d1 = ds_d1.flatten()
ds_d1_d2 = ds_d1_d2.flatten()

sns.distplot(ds_d1, hist=False, rug=True, color="g")
sns.distplot(ds_d1_d2, hist=False, rug=True, color="m")

plt.show()
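Slicing the full matrix works, but when only the cross-set distances are needed the computation can be restricted up front with the block argument (the same pattern used in examples #1 and #8); a sketch:

# Compute only the distances between the num_d1 series of set 1 and the
# num_d2 series of set 2, as a 1-D compact array (one entry per pair).
ds_d1_d2_only = dtw.distance_matrix_fast(
    ts,
    block=((0, num_d1), (num_d1, num_d1 + num_d2)),
    compact=True)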
Example #7
import numpy as np
from dtaidistance import dtw
from dtaidistance import clustering

series = []
tseries = []

# (loop header was missing from the example; the file name is a placeholder)
for ind, line in enumerate(open('TIMESERIES.dat')):
    #if ind == 50: break
    series.append(np.asarray([float(fff) for fff in line.strip().split('\t')]))


for ind, line in enumerate(open('TIMESERIES_tims.dat')):
    #if ind == 50: break
    tseries.append(np.asarray([float(fff) for fff in line.strip().split('\t')]))

print(len(series))

dists = dtw.distance_matrix_fast(series)

# model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# Augment Hierarchical object to keep track of the full tree
# model2 = clustering.HierarchicalTree(model1)
# SciPy linkage clustering
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(series)
#print (dir(model3))
#print (model3.linkage)


Example #8
def distance_fast(c_series,
                  ic,
                  jc,
                  subim,
                  S,
                  m,
                  rmin,
                  cmin,
                  window=None,
                  max_dist=None,
                  max_step=None,
                  max_diff=None,
                  penalty=None,
                  psi=None):
    """This function computes the spatial-temporal distance between \
    two pixels using the dtw distance with C implementation.

    :param c_series: average time series of cluster.
    :type c_series: numpy.ndarray

    :param ic: X coordinate of cluster center.
    :type ic: int

    :param jc: Y coordinate of cluster center.
    :type jc: int

    :param subim: Block of image from the cluster under analysis.
    :type subim: numpy.ndarray

    :param S: Pattern spacing value.
    :type S: int

    :param m: Compactness value.
    :type m: float

    :param rmin: Minimum row.
    :type rmin: int

    :param cmin: Minimum column.
    :type cmin: int

    :param window: Only allow for maximal shifts from the two diagonals \
    smaller than this number. It includes the diagonal, meaning that an \
    Euclidean distance is obtained by setting window=1.

    :param max_dist: Stop if the returned values will be larger than \
    this value.

    :param max_step: Do not allow steps larger than this value.

    :param max_diff: Return infinity if the two series differ in length by \
    more than this value.

    :param penalty: Penalty to add if compression or expansion is applied.

    :param psi: Psi relaxation parameter (ignore start and end of matching).
        Useful for cyclical series.

    :returns D:  numpy.ndarray distance.
    """
    from dtaidistance import dtw

    # Normalizing factor
    m = m / 10

    # Initialize submatrix
    ds = numpy.zeros([subim.shape[1], subim.shape[2]])

    # Transpose the matrix to allow fast dtw computation with dtaidistance
    linear = subim.transpose(1, 2, 0).reshape(subim.shape[1] * subim.shape[2],
                                              subim.shape[0])
    merge = numpy.vstack((linear, c_series)).astype(numpy.double)

    # Compute dtw distances
    c = dtw.distance_matrix_fast(merge,
                                 block=((0, merge.shape[0]),
                                        (merge.shape[0] - 1, merge.shape[0])),
                                 compact=True,
                                 parallel=True,
                                 window=window,
                                 max_dist=max_dist,
                                 max_step=max_step,
                                 max_length_diff=max_diff,
                                 penalty=penalty,
                                 psi=psi)
    c1 = numpy.frombuffer(c)
    dc = c1.reshape(subim.shape[1], subim.shape[2])

    x = numpy.arange(subim.shape[1])
    y = numpy.arange(subim.shape[2])
    xx, yy = numpy.meshgrid(x, y, sparse=True, indexing='ij')

    # Calculate Spatial Distance
    ds = (((xx - ic)**2 + (yy - jc)**2)**0.5)
    # Calculate spatial-temporal distance
    D = (dc) / m + (ds / S)

    return D
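A hypothetical call with synthetic data, to illustrate the expected shapes (the values of ic, jc, S and m are made up for the illustration):

import numpy

subim = numpy.random.rand(10, 20, 20)   # 10 time steps over a 20x20 pixel block
c_series = subim[:, 5, 5]               # use the centre pixel as the cluster series
D = distance_fast(c_series, ic=5, jc=5, subim=subim, S=10, m=10.0,
                  rmin=0, cmin=0)
print(D.shape)                          # (20, 20): distance of every pixel to the centre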
Example #9
# Assumes module-level imports: numpy as np, pandas as pd, dtw from dtaidistance,
# scipy.spatial.distance as ssd, and linkage, inconsistent, dendrogram from
# scipy.cluster.hierarchy, plus a BIN_FACTOR constant.
def _cluster(df):

    flow_df = df.copy()
    sites = df['Site'].to_list()
    sites_len = len(sites)

    df = df.fillna(0).drop(columns=["Site", "Flow"])
    df = df.to_numpy()

    try:
        distance = dtw.distance_matrix_fast(df, compact=True)
    except Exception as e:
        print("Distance calculation failed, shouldn't continue:", e)
        exit(99)

    distance_ssd = ssd.squareform(distance)

    # Hierarchical clustering - linkage matrix Z
    Z = linkage(distance_ssd, "average")

    # Inconsistency matrix - mean distance and standard deviation for each linkage
    IN = inconsistent(Z)

    # Creating a temporary data-frame to extract clusters from linkage and inconsistent matrices
    cols = ['pt1', 'pt2', 'dist', 'tot_pts', 'mean_dist', 'SD_dist', 'cls_level', 'co_eff']
    temp_df = pd.DataFrame(np.hstack([Z, IN]), columns=cols)

    # Get the bins, using only the range of the first-level clustering distances.
    # Further clustering levels will increase the linkages' mean distance;
    # points that fall above the first-level mean distances are treated as outliers.
    cls_level_1_distances = temp_df.loc[temp_df['cls_level'] == 1, 'mean_dist']
    q1, q3 = np.percentile(cls_level_1_distances, [25, 75])
    IQR = q3 - q1

    # Handy formula for the bin width (a Freedman-Diaconis-style rule) - keeps the
    # bin count small while still representing the spread well
    bw = 2 * IQR / int(round(sites_len ** (1. / 3))) * BIN_FACTOR

    bins_ = np.arange(min(cls_level_1_distances) - 0.1, max(cls_level_1_distances) + bw, bw)

    # Hierarchical clustering keeps merging until a single cluster holds all points.
    # We don't need the linkage rows that represent those higher-level merges, so
    # keep only the link rows that contain leaf nodes (i.e. single-site data points).
    temp_df = temp_df[(temp_df['pt1'] < sites_len) | (temp_df['pt2'] < sites_len)]

    # apply the bins
    temp_df['bins'] = pd.cut(temp_df['mean_dist'], bins_).astype('str')

    # Map the bin intervals to integer labels, for readability
    map_dict = {str(value): counter for counter, value in enumerate(temp_df['bins'].unique()) if value != 'nan'}
    temp_df['Cluster'] = temp_df['bins'].map(map_dict)

    # NaNs are the outliers; treat each one as a singleton cluster with its own name
    total_nans = temp_df['Cluster'].isna().sum()
    temp_df.loc[temp_df['Cluster'].isna(), 'Cluster'] = ['O' + str(i) for i in range(1, total_nans + 1)]

    # Combine the linkage matrix columns into a single site-to-cluster mapping
    df1 = temp_df.loc[temp_df['pt1'] < sites_len, ['pt1', 'Cluster']].rename(columns={'pt1':'Site'}).copy()
    df2 = temp_df.loc[temp_df['pt2'] < sites_len, ['pt2', 'Cluster']].rename(columns={'pt2':'Site'}).copy()
    temp_df = pd.concat([df1, df2]).sort_values(by='Site').reset_index(drop=True)
    flow_df['Cluster'] = temp_df['Cluster']

    # # visualizing
    # sites_n = [(str(site) + '-' + str(i)) for site, i in enumerate(sites)]
    # fig, ax = plt.subplots()
    # fig.set_size_inches(20,40)
    # dend = dendrogram(Z, leaf_rotation=90, leaf_font_size=8, labels=sites_n, ax=ax)
    # plt.show()
    return flow_df
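A hypothetical call with synthetic data (six pairs of near-identical series, so the first-level linkage distances are well defined); the module-level imports noted above the function are assumed:

import numpy as np
import pandas as pd

BIN_FACTOR = 1.0  # assumed constant, not shown in the original snippet

rng = np.random.default_rng(0)
base = rng.random((6, 50))
signals = np.vstack([base, base + 0.01 * rng.standard_normal((6, 50))])  # 12 series

flows = pd.DataFrame(signals, columns=['t%d' % i for i in range(50)])
flows.insert(0, 'Flow', rng.random(12))
flows.insert(0, 'Site', ['site_%d' % i for i in range(12)])

clustered = _cluster(flows)
print(clustered[['Site', 'Cluster']])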