Example #1
 def reshape_data(self):
     ts_value = self.input_df.T.values
     ts_value = ts_value.reshape(ts_value.shape[0], ts_value.shape[1], 1)
     scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
     data_scaled = scaler.fit_transform(ts_value)
     data_scaled = np.nan_to_num(data_scaled)
     self.data_scaled = data_scaled
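A minimal standalone sketch of the same scaling step (the surrounding class is not shown; the DataFrame below and its column layout are assumptions):

import numpy as np
import pandas as pd
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Hypothetical input: each column is one time series, each row a timestamp
input_df = pd.DataFrame(np.random.randn(50, 3), columns=["a", "b", "c"])

ts_value = input_df.T.values                                      # (n_series, n_timestamps)
ts_value = ts_value.reshape(ts_value.shape[0], ts_value.shape[1], 1)
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
data_scaled = np.nan_to_num(scaler.fit_transform(ts_value))
print(data_scaled.shape)                                          # (3, 50, 1)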
Example #2
def _get_random_walk():
    numpy.random.seed(0)
    # Generate a random walk time series
    n_ts, sz, d = 1, 100, 1
    dataset = random_walks(n_ts=n_ts, sz=sz, d=d)
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    return scaler.fit_transform(dataset)
Example #3
def perform_sax(dataset, gram_number, symbols, segments):
    scaler = TimeSeriesScalerMeanVariance(
        mu=0., std=np.std(dataset))  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # SAX transform
    sax = SymbolicAggregateApproximation(n_segments=segments,
                                         alphabet_size_avg=symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))
    # print(pd.DataFrame(sax_dataset_inv[0])[0].value_counts())
    #     sax_dataset_inv = sax.fit_transform(dataset)
    #     print(len(sax_dataset_inv[0]))

    # Convert result to strings
    df_sax = pd.DataFrame(sax_dataset_inv[0])
    sax_series = df_sax[0]

    # Convert sax from numeric to characters
    sax_values = sax_series.unique()
    alphabet = 'abcdefghijklmnopqrstuvw'
    sax_dict = {x: alphabet[i] for i, x in enumerate(sax_values)}
    sax_list = [sax_dict[x] for x in sax_series]

    # Convert the list of characters to n_grams based on input parameter
    tri = n_grams(gram_number, sax_list)
    #     print(Counter(tri))
    return tri
Example #4
def test_single_value_ts_no_nan():
    X = to_time_series_dataset([[1, 1, 1, 1]])

    standard_scaler = TimeSeriesScalerMeanVariance()
    assert np.sum(np.isnan(standard_scaler.fit_transform(X))) == 0

    minmax_scaler = TimeSeriesScalerMinMax()
    assert np.sum(np.isnan(minmax_scaler.fit_transform(X))) == 0
Example #5
def normalize(df):
    df_normalized = df.copy()

    scaler = TimeSeriesScalerMeanVariance(mu=0, std=1)
    for col in df:
        # fit_transform returns an array of shape (1, sz, 1); flatten it back
        # to a 1-d column before assigning
        df_normalized[col] = scaler.fit_transform(df_normalized[col])[0].ravel()

    return df_normalized
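A brief usage sketch for normalize above (the DataFrame is a made-up example; the imports used by the function are assumed to be in scope):

import numpy as np
import pandas as pd

df = pd.DataFrame({"s1": np.arange(10.0), "s2": np.random.randn(10)})
df_norm = normalize(df)
print(df_norm.mean().round(6))        # approximately 0 for every column
print(df_norm.std(ddof=0).round(6))   # approximately 1 for every column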
Example #6
def getStdData(originData):
    n_paa_segments = 120  # split a day into 4 parts; aggregate every 6 hours into one segment
    paa_data = PiecewiseAggregateApproximation(
        n_segments=n_paa_segments).fit_transform(originData)
    # normalize to zero mean and unit variance
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dataset = scaler.fit_transform(paa_data)
    dataset = dataset.reshape(dataset.shape[0], dataset.shape[1])
    return dataset
Example #7
    def _transform(self, X, y=None):
        n_ts, sz, d = X.shape

        if d > 1:
            raise NotImplementedError("We currently don't support using "
                                      "multi-dimensional matrix profiles "
                                      "from the stumpy library.")

        output_size = sz - self.subsequence_length + 1
        X_transformed = np.empty((n_ts, output_size, 1))

        if self.implementation == "stump":
            if not STUMPY_INSTALLED:
                raise ImportError(stumpy_msg)

            for i_ts in range(n_ts):
                result = stumpy.stump(T_A=X[i_ts, :, 0].ravel(),
                                      m=self.subsequence_length)
                X_transformed[i_ts, :, 0] = result[:, 0].astype(float)

        elif self.implementation == "gpu_stump":
            if not STUMPY_INSTALLED:
                raise ImportError(stumpy_msg)

            for i_ts in range(n_ts):
                result = stumpy.gpu_stump(T_A=X[i_ts, :, 0].ravel(),
                                          m=self.subsequence_length)
                X_transformed[i_ts, :, 0] = result[:, 0].astype(float)

        elif self.implementation == "numpy":
            scaler = TimeSeriesScalerMeanVariance()
            band_width = int(np.ceil(self.subsequence_length / 4))
            for i_ts in range(n_ts):
                segments = _series_to_segments(X[i_ts],
                                               self.subsequence_length)
                if self.scale:
                    segments = scaler.fit_transform(segments)
                n_segments = segments.shape[0]
                segments_2d = segments.reshape(
                    (-1, self.subsequence_length * d))
                dists = squareform(pdist(segments_2d, "euclidean"))
                band = (np.tri(
                    n_segments, n_segments, band_width, dtype=bool
                ) & ~np.tri(
                    n_segments, n_segments, -(band_width + 1), dtype=bool))
                dists[band] = np.inf
                X_transformed[i_ts] = dists.min(axis=1, keepdims=True)

        else:
            available_implementations = ["numpy", "stump", "gpu_stump"]
            raise ValueError(
                'This "{}" matrix profile implementation is not'
                ' recognized. Available implementations are {}.'.format(
                    self.implementation, available_implementations))

        return X_transformed
Example #8
def cor(x, y):
    """
    Correlation-based distance (COR) between two multivariate time series given as arrays of shape (timesteps, dim)
    """
    scaler = TimeSeriesScalerMeanVariance()
    x_norm = scaler.fit_transform(x)
    y_norm = scaler.fit_transform(y)
    pcc = np.mean(x_norm * y_norm)  # Pearson correlation coefficient of the normalized series
    d = np.sqrt(2.0 * (1.0 - pcc + 1e-9))  # correlation-based distance
    return np.sum(d)
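A brief usage sketch for cor above (random arrays stand in for real multivariate series):

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(100, 3)          # (timesteps, dim)
y = rng.randn(100, 3)
print(cor(x, x))               # close to 0 for identical series
print(cor(x, y))               # around sqrt(2) for uncorrelated series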
Example #9
def saa_pax(dataset, title):
    """
    Show the graph of PAA and SAX of time series data
    :param dataset: time series of a stock
    :return:
    """
    n_ts, sz, d = 1, 100, 1
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(dataset))

    plt.figure()
    plt.subplot(2, 2, 1)  # First, raw time series
    plt.plot(dataset[0].ravel(), "b-")
    plt.title("Raw time series " + title)

    plt.subplot(2, 2, 2)  # Second, PAA
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(paa_dataset_inv[0].ravel(), "b-")
    plt.title("PAA " + title)

    plt.subplot(2, 2, 3)  # Then SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(sax_dataset_inv[0].ravel(), "b-")
    plt.title("SAX, %d symbols" % n_sax_symbols)

    plt.subplot(2, 2, 4)  # Finally, 1d-SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(one_d_sax_dataset_inv[0].ravel(), "b-")
    plt.title("1d-SAX, %d symbols (%dx%d)" %
              (n_sax_symbols_avg * n_sax_symbols_slope, n_sax_symbols_avg,
               n_sax_symbols_slope))

    plt.tight_layout()
    plt.show()
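A brief usage sketch for saa_pax above (a random walk stands in for real stock data; the imports used inside the function are assumed to be in scope):

from tslearn.generators import random_walks

dataset = random_walks(n_ts=1, sz=100, d=1)
saa_pax(dataset, "(random walk)")   # draws the raw, PAA, SAX and 1d-SAX panels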
Example #10
def ApplyPaa(n_paa_segments, df, ckt):
    circuito = ckt
    print("Number of PAA segments: {}".format(n_paa_segments))
    paa = PiecewiseAggregateApproximation(n_paa_segments)
    scaler = TimeSeriesScalerMeanVariance()
    dadosPaa = df
    for i in range(0, len(df)):
        dataset = scaler.fit_transform(df[i])
        dadosPaa[i] = paa.inverse_transform(paa.fit_transform(dataset))[0]
    dadosPaa = dadosPaa.T

    return dadosPaa
Example #11
def check_classifiers_classes(name, classifier_orig):
    # Case of shapelet models
    if name == 'SerializableShapeletModel':
        raise SkipTest('Skipping check_classifiers_classes for shapelets'
                       ' due to convergence issues...')
    elif name == 'ShapeletModel':
        X_multiclass, y_multiclass = _create_large_ts_dataset()
        classifier_orig = clone(classifier_orig)
        classifier_orig.max_iter = 1000
    else:
        X_multiclass, y_multiclass = _create_small_ts_dataset()

    X_multiclass, y_multiclass = shuffle(X_multiclass,
                                         y_multiclass,
                                         random_state=7)

    scaler = TimeSeriesScalerMeanVariance()
    X_multiclass = scaler.fit_transform(X_multiclass)

    X_multiclass = np.reshape(X_multiclass,
                              (X_multiclass.shape[0], X_multiclass.shape[1]))

    X_binary = X_multiclass[y_multiclass != 2]
    y_binary = y_multiclass[y_multiclass != 2]

    X_multiclass = pairwise_estimator_convert_X(X_multiclass, classifier_orig)
    X_binary = pairwise_estimator_convert_X(X_binary, classifier_orig)

    labels_multiclass = ["one", "two", "three"]
    labels_binary = ["one", "two"]

    y_names_multiclass = np.take(labels_multiclass, y_multiclass)
    y_names_binary = np.take(labels_binary, y_binary)

    problems = [(X_binary, y_binary, y_names_binary)]

    if not classifier_orig._get_tags()['binary_only']:
        problems.append((X_multiclass, y_multiclass, y_names_multiclass))

    for X, y, y_names in problems:
        for y_names_i in [y_names, y_names.astype('O')]:
            y_ = choose_check_classifiers_labels(name, y, y_names_i)
            check_classifiers_predictions(X, y_, name, classifier_orig)

    labels_binary = [-1, 1]
    y_names_binary = np.take(labels_binary, y_binary)
    y_binary = choose_check_classifiers_labels(name, y_binary, y_names_binary)
    check_classifiers_predictions(X_binary, y_binary, name, classifier_orig)
Example #12
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [9, 8, 7, 6, 5, 2],
                                [8, 7, 6, 5, 3]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    scaler = TimeSeriesScalerMeanVariance()
    clf = KNeighborsTimeSeriesClassifier(metric="sax",
                                         n_neighbors=1,
                                         metric_params={'n_segments': 2})
    X_transf = scaler.fit_transform(X)
    clf.fit(X_transf, y)
    assert_allclose(clf.predict(X_transf), [0, 0, 1, 1])
Example #13
 def _transform(self, X, y=None):
     n_ts, sz, d = X.shape
     output_size = sz - self.subsequence_length + 1
     X_transformed = numpy.empty((n_ts, output_size, 1))
     scaler = TimeSeriesScalerMeanVariance()
     for i_ts in range(n_ts):
         Xi = X[i_ts]
         elem_size = Xi.strides[0]
         segments = as_strided(
             Xi,
             strides=(elem_size, elem_size, Xi.strides[1]),
             shape=(Xi.shape[0] - self.subsequence_length + 1,
                    self.subsequence_length, d),
             writeable=False)
         if self.scale:
             segments = scaler.fit_transform(segments)
         segments_2d = segments.reshape((-1, self.subsequence_length * d))
         dists = squareform(pdist(segments_2d, "euclidean"))
         numpy.fill_diagonal(dists, numpy.inf)
         X_transformed[i_ts] = dists.min(axis=1, keepdims=True)
     return X_transformed
Example #14
def get_distance_matrix(numpy_array):
    sc = TimeSeriesScalerMeanVariance()
    X_s = sc.fit_transform(to_time_series_dataset(numpy_array))

    size = len(X_s)

    idx = [(i, j) for i in range(0, size) for j in range(i + 1, size)]

    def calc_dtw(my_idx):
        i, j = my_idx
        return dtw(X_s[i], X_s[j])

    # Note: passing a locally defined function to Pool.map relies on a fork-based
    # start method; it cannot be pickled under "spawn" (Windows, recent macOS).
    with mp.Pool(mp.cpu_count() - 1) as p:
        distances = p.map(calc_dtw, idx)

    dm = np.zeros(shape=(size, size))
    for (i, j), v in zip(idx, distances):
        dm[i, j] = v
        dm[j, i] = v

    return dm
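A brief usage sketch for get_distance_matrix above (small random series keep the pairwise DTW cheap; numpy as np, multiprocessing as mp, the tslearn imports used by the function, and a fork-based multiprocessing start method are assumed):

import numpy as np

series_list = [np.random.randn(30) for _ in range(5)]
dm = get_distance_matrix(series_list)
print(dm.shape)                     # (5, 5), symmetric, zeros on the diagonal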
Example #15
 def approximate(self,
                 series: np.ndarray,
                 window: int = 1,
                 should_fit: bool = True) -> np.ndarray:
     # series is already in batches
     debug('TSLearnApproximatorWrapper.approximate: series shape {}'.format(
         series.shape))
     debug(
         'TSLearnApproximatorWrapper.approximate: to_time_series shape {}'.
         format(series.shape))
     ts_representation = list()
     debug(
         f'TSLearnApproximatorWrapper.approximate: param series \n{series} '
     )
     for segment in series:
         if isinstance(self.transformer,
                       SymbolicAggregateApproximation) or isinstance(
                           self.transformer,
                           OneD_SymbolicAggregateApproximation):
             logger.info(
                 "Scaling the data so that they follow a standard normal "
                 "distribution.")
             scaler = TimeSeriesScalerMeanVariance(
                 mu=0., std=1.)  # Rescale time series
             segment = scaler.fit_transform(segment)
         ts_representation.append(self.transformer.fit_transform(segment))
     # debug('TSLearnApproximatorWrapper.approximate: ts_representation \n{}'.format(ts_representation))
     debug(
         'TSLearnApproximatorWrapper.approximate: ts_representation shape {}'
         .format(np.shape(ts_representation)))
     ts_representation = np.reshape(
         ts_representation,
         (np.shape(ts_representation)[0],
          np.shape(ts_representation)[1] * np.shape(ts_representation)[2]))
     debug('TSLearnApproximatorWrapper.approximate: ts_representation \n{}'.
           format(ts_representation))
     debug(
         'TSLearnApproximatorWrapper.approximate: ts_representation shape {}'
         .format(ts_representation.shape))
     return ts_representation
Example #16
def ApplyPaa(n_paa_segments, df):
    '''
    Applies PAA to the given dataframe.

    :param n_paa_segments: number of PAA segments to use for data reduction
    :param df: dataframe containing the data to which PAA should be applied
    :return: df after PAA has been applied
    '''
    df = df.values.T.tolist()
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dadosPaa = scaler.fit_transform(df)
    print("Quantidade de segmentos de PAA: {}".format(n_paa_segments))
    paa = PiecewiseAggregateApproximation(n_paa_segments)
    dadosPaa = paa.inverse_transform(paa.fit_transform(dadosPaa))

    df = pd.DataFrame()

    for i in range(len(dadosPaa.T)):
        for j in range(len(dadosPaa.T[0])):
            df[j] = dadosPaa.T[i][j]

    return df
Example #17
def train_nn(
    dataset: str, batch_size: int, depth: int, epochs: int
) -> Tuple[CNN, Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
    experiment = Experiment(project_name="cphap", auto_output_logging=False)
    experiment.add_tag(dataset)
    experiment.add_tag("NN-depth-{}".format(depth))
    (x_train, y_train), (x_test, y_test) = fetch_dataset(dataset)
    scaler = TimeSeriesScalerMeanVariance()
    x_train: np.ndarray = scaler.fit_transform(x_train)
    x_test: np.ndarray = scaler.transform(x_test)

    x_train = x_train.transpose((0, 2, 1)).astype(np.float32)
    x_test = x_test.transpose((0, 2, 1)).astype(np.float32)

    n_features = x_train.shape[1]
    n_targets = len(np.unique(y_train))

    train_ds = get_dataset(x_train, y_train)
    test_ds = get_dataset(x_test, y_test)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    model = CNN(n_features, 32, n_targets, depth=depth)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    runner = ClassificationRunner(model, optimizer, criterion, experiment)
    runner.add_loader("train", train_loader)
    runner.add_loader("test", test_loader)
    runner.train_config(epochs=epochs)
    runner.run()
    runner.quite()

    return runner.model.eval(), (x_train, x_test), (y_train, y_test)
Example #18
def normalize_series(series):
    # Rescale series to mean 0 and unit variance
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    return scaler.fit_transform(series)
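A brief usage sketch for normalize_series above (a random walk stands in for a real series):

from tslearn.generators import random_walks

series = random_walks(n_ts=1, sz=100, d=1)
scaled = normalize_series(series)
print(scaled.shape)                  # (1, 100, 1)
print(scaled.mean(), scaled.std())   # approximately 0 and 1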
Example #19
import numpy as np
from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import PiecewiseAggregateApproximation, _paa_to_symbols
from tslearn.piecewise import SymbolicAggregateApproximation, OneD_SymbolicAggregateApproximation

np.random.seed(0)
# Generate a random walk time series
# n_ts, sz, d = 1, 100, 1
# dataset = random_walks(n_ts=n_ts, sz=sz, d=d)
# scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
# dataset = scaler.fit_transform(dataset)

# load txt
data = np.loadtxt('sorted/Beef.txt', delimiter=',')
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
dataset = scaler.fit_transform(data[:, 1:])
rows, cols = data.shape
print(rows, cols)

# PAA transform (and inverse transform) of the data
n_paa_segments = 10
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

# SAX transform
n_sax_symbols = 8
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))
sax_data = sax.fit_transform(data)
print(sax_data)
Example #20
# @Time    : 2018/5/21 17:05
# @Author  : Inkky
# @Email   : [email protected]
'''
PAA DRAW FIG
'''
from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import PiecewiseAggregateApproximation
import numpy as np
import matplotlib.pyplot as plt

#draw ecg200
data = np.loadtxt('data/ecg200.txt', delimiter=',')
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
data_score = scaler.fit_transform(data)
rows, cols = data.shape
# PAA transform (and inverse transform) of the data
n_paa_segments = 1
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_dataset_inv = paa.inverse_transform(paa.fit_transform(data))
a = np.mean(paa_dataset_inv.ravel())
print(a)

plt.figure(1)
fig = plt.gcf()
fig.set_size_inches(6, 3)
plt.plot(data_score[2].ravel(), "b-", label='Raw', linewidth=2.5, alpha=0.6)
plt.plot(paa_dataset_inv[2].ravel(), 'r-', label='PAA', linewidth=2.5)
# print(data_score[2].ravel())
x_new = np.linspace(0, 50)
Example #21
                   key=lambda x: -x[1]))

        # Re-sample the train and test sets with the same sizes, stratified
        if X_test is None or len(X_test) == 0: continue
        nr_test_samples = len(X_test)
        X = np.vstack((X_train, X_test))
        y = np.vstack((np.reshape(y_train,
                                  (-1, 1)), np.reshape(y_test, (-1, 1))))
        y = pd.Series(np.reshape(y, (-1, )))
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=nr_test_samples)
        test_idx = y_test.index
        train_idx = y_train.index

        scaler = TimeSeriesScalerMeanVariance()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)

        X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1]))
        X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]))

        # Map labels to [0, .., C-1]
        map_dict = {}
        for j, c in enumerate(sorted(set(y_train))):
            map_dict[c] = j
        y_train = y_train.map(map_dict).values
        y_test = y_test.map(map_dict).values

        timestamp = int(time.time())

        pd.DataFrame(
Example #22
            ('Plane', 64), ('Car', 256), ('Beef', 128), ('Coffee', 128),
            ('OliveOil', 256)]

# We will compare the accuracies & execution times of 1-NN using:
# (i) MINDIST on SAX representations, and
# (ii) euclidean distance on raw values
knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax')
knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean')

accuracies = {}
times = {}
for dataset, w in datasets:
    X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)

    ts_scaler = TimeSeriesScalerMeanVariance()
    X_train = ts_scaler.fit_transform(X_train)
    X_test = ts_scaler.fit_transform(X_test)

    # Fit 1-NN using SAX representation & MINDIST
    metric_params = {'n_segments': w, 'alphabet_size_avg': 10}
    knn_sax = clone(knn_sax).set_params(metric_params=metric_params)
    start = time.time()
    knn_sax.fit(X_train, y_train)
    acc_sax = accuracy_score(y_test, knn_sax.predict(X_test))
    time_sax = time.time() - start

    # Fit 1-NN using euclidean distance on raw values
    start = time.time()
    knn_eucl.fit(X_train, y_train)
    acc_euclidean = accuracy_score(y_test, knn_eucl.predict(X_test))
    time_euclidean = time.time() - start
Example #23
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy
from scipy.signal import find_peaks

from tslearn import metrics
from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

numpy.random.seed(0)
n_ts, sz, d = 2, 100, 1
n_repeat = 5
dataset = random_walks(n_ts=n_ts, sz=sz, d=d)
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
dataset_scaled = scaler.fit_transform(dataset)

# We repeat the long sequence multiple times to generate multiple possible
# matches
long_sequence = numpy.tile(dataset_scaled[1], (n_repeat, 1))
short_sequence = dataset_scaled[0]

sz1 = len(long_sequence)
sz2 = len(short_sequence)

print('Shape long sequence: {}'.format(long_sequence.shape))
print('Shape short sequence: {}'.format(short_sequence.shape))

# Calculate the accumulated cost matrix
mat = metrics.subsequence_cost_matrix(short_sequence, long_sequence)
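From the accumulated cost matrix above, the best-matching subsequence can be recovered; a minimal sketch, assuming tslearn's metrics.subsequence_path and the variables defined above:

# Column index (in the long sequence) where the best alignment ends
idx_best_end = mat[-1, :].argmin()

# Optimal alignment path of the short sequence inside the long one
path = metrics.subsequence_path(mat, idx_best_end)
print("Best match ends at index", idx_best_end, "with path length", len(path))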
Example #24
# 3. Run a first, simple clustering with an algorithm such as k-means
# 4. On top of that simple clustering, run a second, more refined clustering
# 5. Predict the baseline; for each residual series, predict the oscillation amplitude

# 1. Outlier removal on the raw data: sort the raw values and drop the top 1% and the
#    bottom 1% (about 2% in total). If a point has no neighbours, delete it and fill the
#    gap by linear interpolation; otherwise clip it to the max/min.

# Time series outlier removal, method 1: the procedure above
# Time series outlier removal, method 2: normalize the data, sort by the absolute
# deviation from the mean, drop the top 5% and fill the gaps by linear interpolation

ratio = 0.05  # fraction of points treated as outliers

# Normalization
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesScalerMinMax
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
stdData = scaler.fit_transform(formatted_dataset)

# Rescale each series to the specified variance and mean (extreme values may cause
# slight deviations); this generates new, rescaled data with a common mean
DistAvg = 2000
VarAvg = 15000
for i in range(len(formatted_dataset)):
    repres = stdData[i]
    formatted_dataset[i] = repres * np.sqrt(VarAvg) + DistAvg

# For each row, sort the absolute values, drop the largest 5%, then fill the gaps
# by linear interpolation
for index, row in enumerate(stdData):
    element = abs(row)
    element = element.ravel()
    element.sort()
    maxNum = element[-1 * int(ratio * len(element))]
Example #25
def main():
    #FOR NOAA DB
    influx_url = "http://localhost:8086/query?db=" + dbname + \
                    "&epoch=ms&q=SELECT+%22water_level%22+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"

    r = requests.get(influx_url)
    json_dict = json.loads(r.content)

    data = json_dict["results"][0]["series"][0]["values"]
    print(data[0:5])
    
##    #NOTE:just for NOAA h2o_feet
    time_interval = data[2][0] - data[0][0]
    print("time interval:", time_interval)
   
    lst2 = [item[1] for item in data]
    n_segments = len(lst2)

    print(max(lst2),min(lst2))
    
    original_data_size = len(lst2)
    print("original data size:", original_data_size)
    
    alphabet_size_avg = math.ceil(max(lst2)-min(lst2))
    print("alphabet size avg:", alphabet_size_avg)


    ## a list of sample ratios.
    ## Want to select the min ratio within the similarity range.
    ratiolist = [0.025,0.05,0.1,0.15,0.2,0.3,0.4,0.5,0.6]
    sizelist = []
    distlist = []
    
    for ratio in ratiolist:
        print()
        print("ratio:",ratio)
            
        #generate sample data
        sample_size = math.floor(original_data_size * ratio)
        sizelist.append(sample_size)
        print("sample_size:",sample_size)

       #NOAA DB: h2o_feet
        sample_url = "http://localhost:8086/query?db=" + dbname + \
                    "&epoch=ms&q=SELECT+sample%28%22water_level%22%2C"+str(sample_size) + \
                    "%29+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"
        
        r2 = requests.get(sample_url)
        json_dict2 = json.loads(r2.content)
        sampled_data = json_dict2["results"][0]["series"][0]["values"] # [[time, value], ...]
        
        sample = [item[1] for item in sampled_data] #[value,...]

        #fill the sample data with a linear model
        start_x = data[0][0]
        end_x = data[-1][0]
        current_x = start_x
        current_loc = 0
        
        slope = (sampled_data[current_loc][1]-sampled_data[current_loc+2][1])\
                /(sampled_data[current_loc][0] - sampled_data[current_loc+2][0])      ##NOTE!
        intersection = sampled_data[current_loc][1]-slope*sampled_data[current_loc][0]

        sample_fit = []
        end_sample_x = sampled_data[-1][0]

        while current_x <= end_sample_x:
            if current_x >= sampled_data[current_loc+1][0] and current_loc+1 < len(sampled_data)-2:  ##NOTE: -2 !! CHANGE TO -1 LATER
                current_loc+=1
                ##NOTE: +2 was just for h2o_feet
                if (sampled_data[current_loc][0] - sampled_data[current_loc+1][0]) == 0:
    
                    slope = (sampled_data[current_loc] [1]-sampled_data[current_loc+1][1]) \
                            /(sampled_data[current_loc][0] - sampled_data[current_loc+2][0])
                else:
                    slope = (sampled_data[current_loc] [1]-sampled_data[current_loc+1][1]) \
                            /(sampled_data[current_loc][0] - sampled_data[current_loc+2][0])

                    
                intersection = sampled_data[current_loc][1] - slope*sampled_data[current_loc][0]
            
            
            sample_fit.append([current_x, slope*current_x+intersection])
            current_x += time_interval #1000ms
           
        #chop the original data to match the linear fit sample data.
        chopped_data = []
        for item in data:
            if item[0]>= sample_fit[0][0] and item[0] <= sample_fit[-1][0]:
                chopped_data.append(item)
        print("size of chopped_data:",len(chopped_data))

        chopped_lst2 = [item[1] for item in chopped_data]
        chopped_len = len(chopped_lst2)

        #build a sax model for chopped original data
        sax = SymbolicAggregateApproximation(chopped_len,alphabet_size_avg)
        scalar = TimeSeriesScalerMeanVariance(mu=0., std=1.)    
        sdb = scalar.fit_transform(chopped_lst2)
        sax_data = sax.transform(sdb)
        s3 = sax.fit_transform(sax_data)

        #build a sax model for linear-fit sampled data
        sample_fit_extract = [item[1] for item in sample_fit]
        fit_sample_data = scalar.fit_transform(sample_fit_extract)
        sax_sample_data = sax.transform(fit_sample_data)
        s4 = sax.fit_transform(sax_sample_data)

        # compute the distance between the two datasets to estimate their similarity
        dist = sax.distance_sax(s3[0], s4[0])
        print("distance:", dist)
        norm_dist = 1000*dist/chopped_len
        distlist.append(norm_dist)
        print("normalized distance: {:.4f}".format(norm_dist))

    plotdist(ratiolist,distlist)
Example #26
import numpy
import matplotlib.pyplot as plt

from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.matrix_profile import MatrixProfile

import warnings
warnings.filterwarnings('ignore')

# Set a seed to ensure determinism
numpy.random.seed(42)

# Load the Trace dataset
X_train, y_train, _, _ = CachedDatasets().load_dataset("Trace")

# Normalize the time series
scaler = TimeSeriesScalerMeanVariance()
X_train = scaler.fit_transform(X_train)

# Take the first time series
ts = X_train[0, :, :]

# We will take the spike as a segment
subseq_len = 20
start = 45
segment = ts[start:start + subseq_len]

# Create our matrix profile
matrix_profiler = MatrixProfile(subsequence_length=subseq_len, scale=True)
mp = matrix_profiler.fit_transform([ts]).flatten()

# Create a grid for our plots
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True)
Example #27
# 3. Run a first, simple clustering with an algorithm such as k-means
# 4. On top of that simple clustering, run a second, more refined clustering
# 5. Predict the baseline; for each residual series, predict the oscillation amplitude

# 1. Outlier removal on the raw data: sort the raw values and drop the top 1% and the
#    bottom 1% (about 2% in total). If a point has no neighbours, delete it and fill the
#    gap by linear interpolation; otherwise clip it to the max/min.

# Time series outlier removal, method 1: the procedure above
# Time series outlier removal, method 2: normalize the data, sort by the absolute
# deviation from the mean, drop the top 5% and fill the gaps by linear interpolation

ratio = 0.05  # fraction of points treated as outliers

# Normalization
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesScalerMinMax
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
stdData = scaler.fit_transform(formatted_dataset)

# For each row, sort the absolute values, drop the largest 5%, then fill the gaps
# by linear interpolation
for index, row in enumerate(stdData):
    element = abs(row)
    element = element.ravel()
    element.sort()
    maxNum = element[-1 * int(ratio * len(element))]
    del element
    # For special cases, clip the extreme values instead
    # Use linear interpolation whenever interpolation points can be found
    # Concretely: abnormal points at the very start or end are simply clipped
    # Do one pass over an auxiliary array, marking the deleted middle sections
    # For each deleted section, find the nearest points on both sides and interpolate linearly
    # previous is the first abnormal point
    previous = -1
Example #28
def arc_length(angle_1, angle_2, r=1.):
    """Length of the arc between two angles (in rad) on a circle of
    radius r.
    """
    # Compute the angle between the two inputs between 0 and 2*pi.
    theta = np.mod(angle_2 - angle_1, 2 * pi)
    if theta > pi:
        theta = theta - 2 * pi
    # Return the length of the arc
    L = r * np.abs(theta)
    return (L)


dataset_1 = random_walks(n_ts=n_ts, sz=sz, d=1)
scaler = TimeSeriesScalerMeanVariance(mu=0., std=pi)  # Rescale the time series
dataset_scaled_1 = scaler.fit_transform(dataset_1)

# DTW using a function as the metric argument
path_1, sim_1 = metrics.dtw_path_from_metric(dataset_scaled_1[0],
                                             dataset_scaled_1[1],
                                             metric=arc_length)

# Example 2 : Hamming distance between 2 multi-dimensional boolean time series
rw = random_walks(n_ts=n_ts, sz=sz, d=15, std=.3)
dataset_2 = np.mod(np.floor(rw), 4) == 0

# DTW using one of the options of sklearn.metrics.pairwise_distances
path_2, sim_2 = metrics.dtw_path_from_metric(dataset_2[0],
                                             dataset_2[1],
                                             metric="hamming")
Example #29
for idxname in Stock_target.iloc[0:3].index.tolist():
    pos_relatedStock.append(idxname[1])

print("Positive cov: ", pos_relatedStock)
print("Num Stock: ", len(pos_relatedStock))

# Plotting Graph
plt.figure()
graph_idx = 0

# Transform PAA, SAX, 1d-SAX,
for stockCode in pos_relatedStock:

    dataset = dfpivot['v_updownpercent'][stockCode]
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
Example #30
#No. of companies with >60 records
listnew = df_new["name"].unique().tolist()
len(listnew)
print(listnew)
df_red = df_new.set_index(['name', 'day']).dif.dropna()
print(df_red)

scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
n_paa_segments = 10
n_sax_symbols = 10
n_sax_symbols_avg = 10
n_sax_symbols_slope = 6
for i in listnew:
    records = len(df_red[[i]])
    print("stockname" + str(i))
    scaleddata = scaler.fit_transform(df_red[[i]])
    #print(scaleddata)
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(scaleddata))
    # SAX transform
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(scaleddata))
    # 1d-SAX transform
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(scaleddata))
    plt.figure()