Пример #1
0
"""
Implementation of the Deep Temporal Clustering model
Dataset loading functions

@author Florent Forest (FlorentF9)
"""

import numpy as np
from tslearn.datasets import UCR_UEA_datasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sklearn.preprocessing import LabelEncoder

ucr = UCR_UEA_datasets()
# UCR/UEA univariate and multivariate datasets.
all_ucr_datasets = ucr.list_datasets()


def load_ucr(dataset='CBF'):
    X_train, y_train, X_test, y_test = ucr.load_dataset(dataset)
    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))
    if dataset == 'HandMovementDirection':  # this one has special labels
        y = [yy[0] for yy in y]
    y = LabelEncoder().fit_transform(
        y)  # sometimes labels are strings or start from 1
    assert (y.min() == 0)  # assert labels are integers and start from 0
    # preprocess data (standardization)
    X_scaled = TimeSeriesScalerMeanVariance().fit_transform(X)
    return X_scaled, y

Пример #2
0
"""
Implementation of the Deep Temporal Clustering model
Dataset loading functions

"""

import numpy as np
from tslearn.datasets import UCR_UEA_datasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sklearn.preprocessing import LabelEncoder

ucr = UCR_UEA_datasets()
# try to use UCR/UEA univariate and multivariate datasets.
# requires forked version of tslearn from https://github.com/yichangwang/tslearn
try:
    all_ucr_datasets = ucr.list_datasets() + ucr._multivariate_dataset
except AttributeError:
    all_ucr_datasets = ucr.list_datasets()


def load_ucr(dataset='CBF'):
    X_train, y_train, X_test, y_test = ucr.load_dataset(dataset)
    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))
    if dataset == 'HandMovementDirection':  # this one has special labels
        y = [yy[0] for yy in y]
    y = LabelEncoder().fit_transform(
        y)  # sometimes labels are strings or start from 1
    assert (y.min() == 0)  # assert labels are integers and start from 0
    # preprocess data (standardization)
    X_scaled = TimeSeriesScalerMeanVariance().fit_transform(X)
Пример #3
0
class UCRDataset(torch.utils.data.Dataset):
    """
    A torch wrapper around tslearn UCR_Datasets datasets
    """
    def __init__(self,
                 name,
                 partition="train",
                 ratio=.75,
                 randomstate=None,
                 silent=True,
                 augment_data_noise=0):
        r = np.random.RandomState(seed=randomstate)

        self.name = name
        self.dataset = UCR_UEA_datasets()

        self.augment_data_noise = augment_data_noise

        if name not in self.dataset.list_datasets():
            raise ValueError("Dataset not found: Please choose from " +
                             ", ".join(self.dataset.list_datasets()))

        X_trainvalid, y_trainvalid, X_test, y_test = self.dataset.load_dataset(
            name)

        self.nclasses = len(np.unique(np.append(y_test, y_trainvalid, axis=0)))
        self.ndims = 1  # UCR datasets have one featuredimension

        train_mask = r.rand(len(X_trainvalid)) < ratio
        valid_mask = np.logical_not(train_mask)

        if partition == "train":
            self.X = X_trainvalid[train_mask]
            self.y = y_trainvalid[train_mask]
        elif partition == "valid":
            self.X = X_trainvalid[valid_mask]
            self.y = y_trainvalid[valid_mask]
        elif partition == "trainvalid":
            self.X = X_trainvalid
            self.y = y_trainvalid
        elif partition == "test":
            self.X = X_test
            self.y = y_test
        else:
            raise ValueError(
                "Invalid partition! please provide either 'train','valid', 'trainvalid', or 'test'"
            )

        # some binary datasets e.g. EGC200 or Lightning 2 have classes: -1, 1 -> clipping to 1:2
        if self.y.min() < 0:
            if not silent:
                print("Found class ids < 0 in dataset. clipping to zero!")
            self.y = np.clip(self.y, 0, None)

        # some datasets (e.g. Coffee) have classes with zero index while all other start with 1...
        if self.y.min() > 0:
            if not silent:
                print(
                    "Found class id starting from 1. reducing all class ids by one to start from zero"
                )
            self.y -= 1

        #self.classes = np.unique(self.y)
        self.sequencelength = X_trainvalid.shape[1]

        if not silent:
            msg = "Loaded dataset {}-{} T={}, classes={}: {}/{} samples"
            print(
                msg.format(name, partition, self.sequencelength, self.nclasses,
                           len(self.X),
                           len(X_trainvalid) + len(X_test)))

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):

        X = self.X[idx]

        X += np.random.rand(*X.shape) * self.augment_data_noise

        X = torch.from_numpy(X).type(torch.FloatTensor)
        y = torch.from_numpy(np.array([self.y[idx]])).type(torch.LongTensor)

        # add 1d hight and width dimensions and copy y for each time
        return X, y.expand(X.shape[0]), idx

    def __str__(self):

        str = """
UCR Dataset = {dataset}
X.shape = {Xshape}
y.shape = {yshape}
nclasses = {nclasses}
ndims = {ndims}
        """.format(dataset=self.name,
                   Xshape=self.X.shape,
                   yshape=self.y.shape,
                   nclasses=self.nclasses,
                   ndims=self.ndims)

        return str