Exemplo n.º 1
0
 def distance(instance_a, instance_b, **params):
     """Measure the distance between two nested instances.

     Each instance is converted with ``from_nested_to_2d_array`` (numpy
     output) and transposed; the two results are then passed, together
     with ``params``, to ``distance_measure``.
     """
     # TODO: convert only the specific dimension rather than the whole thing?
     array_a, array_b = (
         np.transpose(from_nested_to_2d_array(instance, return_numpy=True))
         for instance in (instance_a, instance_b)
     )
     return distance_measure(array_a, array_b, **params)
Exemplo n.º 2
0
    def fit(self, X: NumpyOrDF, y: NumpyOrDF = None):
        """
        Fit the clustering algorithm on the dataset X.

        Parameters
        ----------
        X: Numpy array or Dataframe
            Data to train the model on. A pandas DataFrame is converted
            to a numpy array with ``from_nested_to_2d_array`` first.

        y: Numpy array or Dataframe, default = None
            Not read by this method; kept in the signature for consistency.

        Returns
        -------
        self
            Fitted estimator
        """
        training_data = (
            from_nested_to_2d_array(X, return_numpy=True)
            if isinstance(X, pd.DataFrame)
            else X
        )

        self._check_params(training_data)
        self._fit(training_data)
        self._is_fitted = True

        return self
Exemplo n.º 3
0
    def transform(self, X, y=None):
        """Concatenate multivariate time series/panel data into a long
        univariate time series/panel by joining the series in time.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_features]
            Nested dataframe with time-series in cells.

        Returns
        -------
        Xt : pandas DataFrame
          Transformed pandas DataFrame with same number of rows and single
          column
        """
        self.check_is_fitted()
        X = check_X(X)

        # Flatten all columns into one 2d table with the converter that
        # matches the container type, then re-nest the table.
        to_tabular = (
            from_nested_to_2d_array
            if isinstance(X, pd.DataFrame)
            else from_3d_numpy_to_2d_array
        )
        return from_2d_array_to_nested(to_tabular(X))
Exemplo n.º 4
0
def test_from_nested_to_2d_array(n_instances, n_columns, n_timepoints):
    """Check shape and index of the tabular form of a nested frame."""
    nested, _labels = make_classification_problem(
        n_instances, n_columns, n_timepoints
    )

    tabular = from_nested_to_2d_array(nested)

    # All columns' time points end up side by side in one row per instance.
    expected_shape = (n_instances, n_columns * n_timepoints)
    assert tabular.shape == expected_shape
    assert tabular.index.equals(nested.index)
Exemplo n.º 5
0
def test_output_format_dim(len_series, n_instances, n_components):
    """Check output type, row count and component count of PCATransformer."""
    np.random.seed(42)
    raw = pd.DataFrame(data=np.random.randn(n_instances, len_series))
    X = from_2d_array_to_nested(raw)

    Xt = PCATransformer(n_components=n_components).fit_transform(X)

    # Output type and number of rows.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # The number of principal components is capped by the input width.
    n_output_columns = from_nested_to_2d_array(Xt).shape[1]
    n_input_columns = from_nested_to_2d_array(X).shape[1]
    assert n_output_columns == min(n_components, n_input_columns)
Exemplo n.º 6
0
    def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
        """
        Helper for the shape_dtw class that merges two dataframes of shape
        descriptors into a single dataframe.
        Used when the shape_descriptor_function is set to "compound".

        Every series of the first dataframe is extended with the matching
        series of the second dataframe multiplied by ``weighting_factor``.
        The entries of ``col_names`` are used both to index the combined
        array and as the labels of the output columns; each output cell
        holds the combined values as a pd.Series.
        """
        first_desc, second_desc = dataFrames[0], dataFrames[1]

        # One 2d numpy array per column of each descriptor dataframe.
        first_arrays = [
            from_nested_to_2d_array(first_desc[name], return_numpy=True)
            for name in first_desc.columns
        ]
        second_arrays = [
            from_nested_to_2d_array(second_desc[name], return_numpy=True)
            for name in second_desc.columns
        ]

        # Join the descriptors row by row. The second descriptor is looked
        # up by position, so a missing column or row raises an IndexError.
        combined = []
        for col_idx, first_block in enumerate(first_arrays):
            second_block = second_arrays[col_idx]
            combined.append(
                [
                    [*first_row, *(second_block[row_idx] * weighting_factor)]
                    for row_idx, first_row in enumerate(first_block)
                ]
            )
        combined = np.asarray(combined)

        # Re-nest: one pd.Series per instance, one column per name.
        result = pd.DataFrame()
        for col in col_names:
            result[col] = [pd.Series(inst) for inst in combined[col]]

        return result
Exemplo n.º 7
0
 def row_first(X):
     """Build a frame from the first tabular column of every column of X.

     A pd.Series input is wrapped in a DataFrame first. Each column is
     converted with ``from_nested_to_2d_array`` and only its first
     resulting column is kept; the kept columns are concatenated
     side by side.
     """
     frame = pd.DataFrame(X) if isinstance(X, pd.Series) else X
     first_columns = []
     for _, column in frame.items():
         tabular = from_nested_to_2d_array(column)
         first_columns.append(pd.Series(tabular.iloc[:, 0]))
     return pd.concat(first_columns, axis=1)
Exemplo n.º 8
0
def test_tsfresh_extractor(default_fc_parameters):
    """Compare the extractor's "__mean" features with row means of the data."""
    X, y = make_classification_problem()
    X_train, _X_test, y_train, _y_test = train_test_split(X, y)

    extractor = TSFreshFeatureExtractor(
        default_fc_parameters=default_fc_parameters,
        disable_progressbar=True,
    )
    features = extractor.fit_transform(X_train, y_train)

    mean_features = features.filter(like="__mean", axis=1)
    actual = mean_features.values.ravel()
    expected = from_nested_to_2d_array(X_train).mean(axis=1).values

    # Sanity check of the reference values before comparing the features.
    assert expected[0] == X_train.iloc[0, 0].mean()
    np.testing.assert_allclose(actual, expected)
Exemplo n.º 9
0
    def transform(self, X, y=None):
        """
        Transform a data frame of time series data into HOG1D features.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The training input samples.

        Returns
        -------
        dims: a pandas data frame of shape = [n_samples, num_dims]
            Each cell holds the result of ``_calculate_hog1ds`` for one
            series, wrapped in a pd.Series.
        """
        # Validate state and input
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

        n_instances = X.shape[0]
        column_names = X.columns
        series_length = len(X.iloc[0, 0])

        # Parameters are validated against the length of the first series
        self._check_parameters(series_length)

        result = pd.DataFrame()
        for name in column_names:
            # Numpy view of a single column of the nested dataframe
            values = from_nested_to_2d_array(
                pd.DataFrame(X[name]), return_numpy=True
            )

            # HOG1Ds of every time series in this column
            histograms = np.asarray(
                [
                    self._calculate_hog1ds(values[idx])
                    for idx in range(n_instances)
                ]
            )

            result[name] = [pd.Series(row) for row in histograms]

        return result
Exemplo n.º 10
0
def read_dataset(root_dir, dataset_name):
    """Load one dataset from the UCRArchive_2018 folder and z-normalise it.

    The archive folder is looked up under ``root_dir`` with any '-temp'
    removed. The TRAIN and TEST .ts files are loaded, converted to 2d
    numpy arrays and normalised row by row.

    Returns a dict mapping ``dataset_name`` to the tuple
    ``(x_train, y_train, x_test, y_test)`` (copies).
    """

    def _znorm(values):
        # Rows with zero standard deviation are divided by 1.0 instead.
        spread = values.std(axis=1, keepdims=True)
        spread[spread == 0] = 1.0
        return (values - values.mean(axis=1, keepdims=True)) / spread

    # For UCR
    archive_dir = root_dir.replace('-temp', '') + '/' + 'UCRArchive_2018'
    file_stem = '/'.join((archive_dir, dataset_name, dataset_name))

    # An ARFF variant of these two loads (load_from_arff_to_dataframe on
    # the _TRAIN.arff / _TEST.arff files) is left disabled.
    x_train, y_train = load_from_tsfile_to_dataframe(file_stem + '_TRAIN.ts')
    x_test, y_test = load_from_tsfile_to_dataframe(file_stem + '_TEST.ts')

    x_train = from_nested_to_2d_array(x_train, return_numpy=True)
    x_test = from_nested_to_2d_array(x_test, return_numpy=True)

    x_train = _znorm(x_train)
    x_test = _znorm(x_test)

    return {
        dataset_name: (
            x_train.copy(),
            y_train.copy(),
            x_test.copy(),
            y_test.copy(),
        )
    }
Exemplo n.º 11
0
def test_pca_results(n_components):
    """PCATransformer must reproduce sklearn's PCA on the same data."""
    np.random.seed(42)
    X = pd.DataFrame(data=np.random.randn(10, 5))

    # Reference result from sklearn
    expected = PCA(n_components=n_components).fit_transform(X)

    # Same data in nested format through the sktime transformer
    nested = from_2d_array_to_nested(X)
    transformed = PCATransformer(n_components=n_components).fit_transform(
        nested
    )
    actual = from_nested_to_2d_array(transformed)

    assert np.allclose(np.asarray(expected), np.asarray(actual))
Exemplo n.º 12
0
def test_padding_transformer():
    """Default PaddingTransformer output has 29 * 12 tabular columns."""
    dataset = "JapaneseVowels"
    X_train, _y_train = _load_dataset(dataset, split="train", return_X_y=True)
    _X_test, _y_test = _load_dataset(dataset, split="test", return_X_y=True)

    padded = PaddingTransformer().fit_transform(X_train)

    # The tabularized data has 12 dimensions, each padded to a length of 29.
    n_columns = len(from_nested_to_2d_array(padded).columns)
    assert n_columns == 29 * 12
Exemplo n.º 13
0
def test_padding_fill_value_transformer():
    """PaddingTransformer(pad_length=40) output has 40 * 12 tabular columns."""
    dataset = "JapaneseVowels"
    X_train, _y_train = _load_dataset(dataset, split="train", return_X_y=True)
    _X_test, _y_test = _load_dataset(dataset, split="test", return_X_y=True)

    transformer = PaddingTransformer(pad_length=40, fill_value=1)
    padded = transformer.fit_transform(X_train)

    # The tabularized data has 12 dimensions, each padded to pad_length=40.
    n_columns = len(from_nested_to_2d_array(padded).columns)
    assert n_columns == 40 * 12
Exemplo n.º 14
0
def test_truncation_paramterised_transformer():
    """TruncationTransformer(2, 10) output has 8 * 12 tabular columns."""
    dataset = "JapaneseVowels"
    X_train, _y_train = _load_dataset(dataset, split="train", return_X_y=True)
    _X_test, _y_test = _load_dataset(dataset, split="test", return_X_y=True)

    truncated = TruncationTransformer(2, 10).fit_transform(X_train)

    # The tabularized data has 12 dimensions, each truncated to
    # (10 - 2) time points.
    n_columns = len(from_nested_to_2d_array(truncated).columns)
    assert n_columns == 8 * 12
Exemplo n.º 15
0
    def transform(self, X, y=None):
        """
        Replace every series by the result of ``_get_gradients_of_lines``.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The training input samples.

        Returns
        -------
        df: a pandas data frame with one row per input instance and one
            column per input column; each cell is a pd.Series holding the
            values computed for that series.
        """
        # Validate state and input
        self.check_is_fitted()
        X = check_X(X, coerce_to_pandas=True)

        series_length = len(X.iloc[0, 0])
        n_instances = X.shape[0]
        column_names = X.columns

        # Parameters are validated against the length of the first series
        self._check_parameters(series_length)

        result = pd.DataFrame()
        for name in column_names:
            # Numpy view of a single column of the nested dataframe
            values = from_nested_to_2d_array(
                pd.DataFrame(X[name]), return_numpy=True
            )

            gradients = np.asarray(
                [
                    self._get_gradients_of_lines(values[idx])
                    for idx in range(n_instances)
                ]
            )

            result[name] = [pd.Series(row) for row in gradients]

        return result
Exemplo n.º 16
0
def plot_cluster_algorithm(model: BaseClusterer, predict_series: NumpyOrDF, k: int):
    """
    Plot the output of a clustering algorithm, one subplot per cluster.

    Parameters
    ----------
    model: BaseClusterer
        Clustering model to plot; its ``predict`` and ``get_centers``
        methods are called.

    predict_series: Numpy or Dataframe
        The series to predict the values for. A pandas DataFrame is
        converted with ``from_nested_to_2d_array`` first.

    k: int
        Number of centers, which is also the number of subplots drawn.
    """
    _check_soft_dependencies("matplotlib")
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches

    if isinstance(predict_series, pd.DataFrame):
        predict_series = from_nested_to_2d_array(predict_series, return_numpy=True)
    # NOTE(review): this figure is distinct from the one plt.subplots creates
    # below, so the figsize set here may not apply to the plotted clusters --
    # confirm the intended figure size before merging the two calls.
    plt.figure(figsize=(5, 10))
    plt.rcParams["figure.dpi"] = 100
    indexes = model.predict(predict_series)
    centers = model.get_centers()

    series_values = TimeSeriesLloydsPartitioning.get_cluster_values(
        indexes, predict_series, k
    )
    # squeeze=False keeps `axes` a 2D array even for k == 1; with the default
    # squeezing a single Axes object is returned, which cannot be indexed.
    _fig, axes = plt.subplots(nrows=k, ncols=1, squeeze=False)
    for i in range(k):
        _plot(series_values[i], centers[i], axes[i, 0])

    blue_patch = mpatches.Patch(color="blue", label="Series that belong to the cluster")
    red_patch = mpatches.Patch(color="red", label="Cluster centers")
    plt.legend(
        handles=[red_patch, blue_patch],
        loc="upper center",
        bbox_to_anchor=(0.5, -0.40),
        fancybox=True,
        shadow=True,
        ncol=5,
    )
    plt.tight_layout()
    plt.show()
Exemplo n.º 17
0
    def _perform_paa_along_dim(self, X):
        """Reduce every series in X to a sequence of frame averages.

        X is converted to a 2d numpy array (one row per instance). Each row
        is split into frames of length ``num_atts / self.num_intervals``,
        which may be fractional; a point lying on a frame boundary is
        shared between the two neighbouring frames in proportion.

        Returns
        -------
        dims : pandas DataFrame with a single column ``0`` whose cells are
            pd.Series holding the frame averages of one instance.
        """
        X = from_nested_to_2d_array(X, return_numpy=True)

        num_atts = X.shape[1]
        num_insts = X.shape[0]
        dims = pd.DataFrame()
        data = []

        for i in range(num_insts):
            series = X[i, :]

            frames = []
            current_frame = 0
            current_frame_size = 0
            # True division: the frame length is not necessarily an integer
            frame_length = num_atts / self.num_intervals
            frame_sum = 0

            for n in range(num_atts):
                # How much of the current frame is still unfilled
                remaining = frame_length - current_frame_size

                if remaining > 1:
                    # The whole point belongs to the current frame
                    frame_sum += series[n]
                    current_frame_size += 1
                else:
                    # Only the fraction `remaining` of this point fits
                    frame_sum += remaining * series[n]
                    current_frame_size += remaining

                # Exact float comparison; see the fallback after the loop
                if current_frame_size == frame_length:
                    frames.append(frame_sum / frame_length)
                    current_frame += 1

                    # Carry the unused part of this point into the next frame
                    frame_sum = (1 - remaining) * series[n]
                    current_frame_size = 1 - remaining

            # if the last frame was lost due to double imprecision
            if current_frame == self.num_intervals - 1:
                frames.append(frame_sum / frame_length)

            data.append(pd.Series(frames))

        # All instances go into the single output column 0
        dims[0] = data

        return dims
Exemplo n.º 18
0
    def transform(self, X, y=None):
        """
        Replace every column of X by its extracted wavelet coefficients.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The training input samples.

        Returns
        -------
        dims: a pandas data frame of shape
              = [n_samples, num_dims]
        """
        # Validate state, input and parameters
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

        self._check_parameters()

        result = pd.DataFrame()
        for name in X.columns:
            # Numpy view of a single column of the nested dataframe
            values = from_nested_to_2d_array(
                pd.DataFrame(X[name]), return_numpy=True
            )

            coefficients = np.asarray(
                self._extract_wavelet_coefficients(values)
            )

            # One pd.Series per instance
            result[name] = [pd.Series(row) for row in coefficients]

        return result
Exemplo n.º 19
0
    def transform(self, X, y=None):
        """Convert panel data into a tabular (2d) representation.

        Parameters
        ----------
        X : pandas DataFrame
            Nested dataframe with pandas series or numpy arrays in cells.
        y : array-like, optional (default=None)

        Returns
        -------
        Xt : pandas DataFrame
            Transformed dataframe with only primitives in cells.
        """
        self.check_is_fitted()
        X = check_X(X)

        # Anything that is not a DataFrame after check_X goes through the
        # 3d numpy converter.
        if not isinstance(X, pd.DataFrame):
            return from_3d_numpy_to_2d_array(X)
        return from_nested_to_2d_array(X)
Exemplo n.º 20
0
    def _transform_single_feature(self, X, feature):
        """transforms data into a specified catch22 feature

        Parameters
        ----------
        X : pandas DataFrame, input time series
        feature : int, catch22 feature id or String, catch22 feature
                  name.

        Returns
        -------
        Numpy array containing a catch22 feature for each input series
        """
        if isinstance(feature, (int, np.integer)) or isinstance(
            feature, (float, np.float)
        ):
            if feature > 21 or feature < 0:
                raise ValueError("Invalid catch22 feature ID")
        elif isinstance(feature, str):
            if feature in feature_names:
                feature = feature_names.index(feature)
            else:
                raise ValueError("Invalid catch22 feature name")
        else:
            raise ValueError("catch22 feature name or ID required")

        if isinstance(X, pd.DataFrame):
            X = from_nested_to_2d_array(X, return_numpy=True)

        n_instances = X.shape[0]
        X = np.reshape(X, (n_instances, -1))

        c22_list = Parallel(n_jobs=self.n_jobs)(
            delayed(self._transform_case_single)(
                X[i],
                feature,
            )
            for i in range(n_instances)
        )

        return np.asarray(c22_list)
Exemplo n.º 21
0
def test_dft_mft():
    """Compare SFA's ``_mft`` with its ``_discrete_fourier_transform``."""
    # load training data
    X, Y = load_gunpoint(split="train", return_X_y=True)
    X_tab = from_nested_to_2d_array(X, return_numpy=True)
    first_series = X_tab[0]

    word_length = 6
    alphabet_size = 4

    # Single DFT transformation: the window spans the whole series.
    # NOTE(review): the tolerance checks below are one-sided (no abs());
    # confirm whether that is intended.
    full_window = np.shape(X_tab)[1]
    sfa = SFA(
        word_length=word_length,
        alphabet_size=alphabet_size,
        window_size=full_window,
        binning_method="equi-depth",
    ).fit(X, Y)
    dft = sfa._discrete_fourier_transform(first_series)
    mft = sfa._mft(first_series)

    assert (mft - dft < 0.0001).all()

    # Windowed DFT transformation, with and without normalisation.
    window_size = 140
    n_windows = len(first_series) - window_size + 1
    for norm in (True, False):
        sfa = SFA(
            word_length=word_length,
            norm=norm,
            alphabet_size=alphabet_size,
            window_size=window_size,
            binning_method="equi-depth",
        ).fit(X, Y)
        mft = sfa._mft(first_series)

        for start in range(n_windows):
            window = first_series[start:start + window_size]
            dft_transformed = sfa._discrete_fourier_transform(window)
            assert (mft[start] - dft_transformed < 0.001).all()

        assert len(mft) == n_windows
        assert len(mft[0]) == word_length
Exemplo n.º 22
0
    def predict(self, X: NumpyOrDF) -> NumpyArray:
        """
        Predict cluster indexes with the already trained clustering
        algorithm.

        Parameters
        ----------
        X: Numpy array or Dataframe
            sktime data_frame or numpy array to predict
            cluster for. A pandas DataFrame is converted to a numpy
            array before prediction.

        Returns
        -------
        Numpy_Array: np.array
            Index of the cluster each sample belongs to
        """
        self.check_is_fitted()

        if not isinstance(X, pd.DataFrame):
            return self._predict(X)
        return self._predict(from_nested_to_2d_array(X, return_numpy=True))
Exemplo n.º 23
0
    def _transform_single_feature(self, X, feature):
        """transforms data into the catch22 features

        Parameters
        ----------
        X : pandas DataFrame, input time series
        feature : int, catch22 feature id or String, catch22 feature
                  name.

        Returns
        -------
        Numpy array containing a catch22 feature for each input series
        """
        if isinstance(feature, (int, np.integer)) or isinstance(
            feature, (float, np.float)
        ):
            if feature > 21 or feature < 0:
                raise ValueError("Invalid catch22 feature ID")
        elif isinstance(feature, str):
            if feature in feature_names:
                feature = feature_names.index(feature)
            else:
                raise ValueError("Invalid catch22 feature name")
        else:
            raise ValueError("catch22 feature name or ID required")

        if isinstance(X, pd.DataFrame):
            X = from_nested_to_2d_array(X, return_numpy=True)

        n_instances = X.shape[0]
        X = np.reshape(X, (n_instances, -1))

        c22_list = []
        for i in range(n_instances):
            series = X[i, :].tolist()
            c22_val = features[feature](series)
            c22_list.append(c22_val)

        return np.array(c22_list)
Exemplo n.º 24
0
    def predict(self, X: NumpyOrDF, y=None) -> NumpyArray:
        """
        Return the cluster center index for each data sample.

        Parameters
        ----------
        X: 2D np.array with shape (n_instances, n_timepoints)
           or pd.DataFrame in nested format
            panel of time series to cluster

        y: ignored, exists for API consistency reasons

        Returns
        -------
        Numpy_Array: 1D np.array of length n_instances
            Index of the cluster each sample belongs to
        """
        self.check_is_fitted()

        # Nested dataframes are converted to a numpy array first
        series = (
            from_nested_to_2d_array(X, return_numpy=True)
            if isinstance(X, pd.DataFrame)
            else X
        )

        self._check_params(series)
        return self._predict(series)
Exemplo n.º 25
0
    def fit(self, X: NumpyOrDF, y=None):
        """
        Train the clustering algorithm on the dataset X.

        Parameters
        ----------
        X: 2D np.array with shape (n_instances, n_timepoints)
           or pd.DataFrame in nested format
            panel of univariate time series to train the clustering model on

        y: ignored, exists for API consistency reasons

        Returns
        -------
        reference to self
        """
        # Nested dataframes are converted to a numpy array first
        if isinstance(X, pd.DataFrame):
            panel = from_nested_to_2d_array(X, return_numpy=True)
        else:
            panel = X

        self._check_params(panel)
        self._fit(panel)
        self._is_fitted = True

        return self
Exemplo n.º 26
0
def _univariate_nested_df_to_array(X):
    """Convert a nested DataFrame into a 2d numpy array.

    Thin wrapper around ``from_nested_to_2d_array`` that requests numpy
    output.
    """
    # The converter's keyword is ``return_numpy``; ``return_array`` is not
    # one of its parameters and would be rejected as an unexpected keyword.
    return from_nested_to_2d_array(X, return_numpy=True)
Exemplo n.º 27
0
def _univariate_nested_df_to_array(X):
    """Tabularize ``X`` via ``from_nested_to_2d_array``.

    The converter is called with ``return_numpy=False``.
    """
    tabular = from_nested_to_2d_array(X, return_numpy=False)
    return tabular