示例#1
0
 def distance(instance_a, instance_b, **params):
     """Apply ``distance_measure`` to two nested instances.

     Each instance is flattened with ``from_nested_to_2d_array``
     (``return_numpy=True``) and transposed before being passed, together
     with any extra keyword ``params``, to ``distance_measure``.
     """
     # TODO: use a specific dimension rather than the whole thing?
     array_a, array_b = (
         np.transpose(from_nested_to_2d_array(inst, return_numpy=True))
         for inst in (instance_a, instance_b)
     )
     return distance_measure(array_a, array_b, **params)
示例#2
0
def test_row_transformer_transform_inverse_transform():
    """Round-trip GunPoint through ``RowTransformer(StandardScaler())``."""
    X, y = load_gunpoint(return_X_y=True)
    transformer = RowTransformer(StandardScaler())
    X_restored = transformer.inverse_transform(transformer.fit_transform(X))

    assert X_restored.shape == X.shape

    # Series-to-series transforms must leave nested cells in the output.
    first_cell = X_restored.iloc[0, 0]
    assert isinstance(first_cell, (pd.Series, np.ndarray))

    original_values = from_nested_to_2d_array(X).values
    restored_values = from_nested_to_2d_array(X_restored).values
    np.testing.assert_array_almost_equal(
        original_values, restored_values, decimal=5
    )
示例#3
0
def test_from_nested_to_2d_array(n_instances, n_columns, n_timepoints):
    """Tabularized data has the expected shape and keeps the nested index."""
    nested, _labels = make_classification_problem(
        n_instances, n_columns, n_timepoints
    )

    tabular = from_nested_to_2d_array(nested)

    expected_shape = (n_instances, n_columns * n_timepoints)
    assert tabular.shape == expected_shape
    assert tabular.index.equals(nested.index)
示例#4
0
    def transform(self, X, y=None):
        """Concatenate multivariate panel data into univariate panel data.

        All columns are first flattened into one 2d table and that table is
        then re-nested, which joins the series of each row end to end in
        time.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_features]
            Nested dataframe with time-series in cells. Anything that is
            not a pandas DataFrame after ``check_X`` is handled by
            ``from_3d_numpy_to_2d_array``.
        y : optional (default=None)
            Not used by this method.

        Returns
        -------
        Xt : pandas DataFrame
            Result of ``from_2d_array_to_nested`` applied to the flattened
            table.
        """
        self.check_is_fitted()
        X = check_X(X)

        # Pick the flattening routine that matches the container type.
        to_tabular = (
            from_nested_to_2d_array
            if isinstance(X, pd.DataFrame)
            else from_3d_numpy_to_2d_array
        )
        return from_2d_array_to_nested(to_tabular(X))
示例#5
0
def test_output_format_dim(len_series, n_instances, n_components):
    """PCATransformer output keeps the row count and caps the components."""
    np.random.seed(42)
    raw = pd.DataFrame(data=np.random.randn(n_instances, len_series))
    X = from_2d_array_to_nested(raw)

    Xt = PCATransformer(n_components=n_components).fit_transform(X)

    # Output type and number of rows.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Number of principal components cannot exceed the input width.
    n_output_columns = from_nested_to_2d_array(Xt).shape[1]
    n_input_columns = from_nested_to_2d_array(X).shape[1]
    assert n_output_columns == min(n_components, n_input_columns)
示例#6
0
    def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
        """Combine two shape-descriptor dataframes into a single dataframe.

        Helper function for the shape_dtw class, used when the
        shape_descriptor_function is set to "compound". For every column
        and every row, the values of the first dataframe are followed by
        the values of the second dataframe multiplied by
        ``weighting_factor``.

        Parameters
        ----------
        dataFrames : sequence
            Its first two entries are the nested dataframes to combine.
            The second one is indexed column-by-column and row-by-row in
            step with the first.
        weighting_factor : scalar
            Multiplier applied to each row taken from the second dataframe.
        col_names : iterable
            Names of the output columns. They are matched to the columns
            of the first dataframe by position, so they do not have to be
            the integers ``0..n-1``.

        Returns
        -------
        df : pandas DataFrame
            One column per entry of ``col_names``; each cell is a
            ``pd.Series`` holding the combined values for that row.
        """
        first_desc = dataFrames[0]
        second_desc = dataFrames[1]

        # Convert every column of both dataframes into a 2d numpy array.
        first_desc_array = [
            from_nested_to_2d_array(first_desc[name], return_numpy=True)
            for name in first_desc.columns
        ]
        second_desc_array = [
            from_nested_to_2d_array(second_desc[name], return_numpy=True)
            for name in second_desc.columns
        ]

        # Concatenate row-wise: first descriptor, then weighted second one.
        # The second array is indexed explicitly so a shape mismatch still
        # raises instead of being silently truncated.
        res = np.asarray(
            [
                [
                    [
                        *first_row,
                        *(second_desc_array[col_idx][row_idx]
                          * weighting_factor),
                    ]
                    for row_idx, first_row in enumerate(first_block)
                ]
                for col_idx, first_block in enumerate(first_desc_array)
            ]
        )

        # Convert to pandas dataframe. ``res`` is ordered by column
        # position, so it must be indexed by position and not by label.
        df = pd.DataFrame()
        for position, col in enumerate(col_names):
            df[col] = [pd.Series(inst) for inst in res[position]]

        return df
示例#7
0
def _handle_tabularizer_args(*args, **kwargs):
    """Flatten ``X`` to 2d for the Tabularizer and pass ``y`` through.

    The Tabularizer turns a nested pd.DataFrame/3d numpy array into a 2d
    array, so its inverse goes from a 2d array back to a nested
    pd.DataFrame/3d array. ``args`` must be exactly ``(X, y)``; a truthy
    ``return_numpy`` keyword selects the 3d-numpy converter, otherwise the
    nested-DataFrame converter is used.
    """
    # TODO refactor Tabularizer as series-as-features composition
    #  meta-estimator, rather than transformer or introduce special
    #  transformer type
    X, y = args
    if kwargs.get("return_numpy"):
        return from_3d_numpy_to_2d_array(X), y
    return from_nested_to_2d_array(X), y
示例#8
0
 def row_first(X):
     """Collect the first tabularized column of every column of ``X``.

     A ``pd.Series`` input is wrapped into a single-column DataFrame first.
     Each column is flattened with ``from_nested_to_2d_array`` and only its
     first resulting column is kept; the kept columns are concatenated
     side by side.
     """
     frame = pd.DataFrame(X) if isinstance(X, pd.Series) else X
     first_columns = []
     for _, col in frame.items():
         tabular = from_nested_to_2d_array(col)
         first_columns.append(pd.Series(tabular.iloc[:, 0]))
     return pd.concat(first_columns, axis=1)
示例#9
0
def test_tsfresh_extractor(default_fc_parameters):
    """Extracted ``__mean`` features must equal the per-row series mean."""
    X, y = make_classification_problem()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    extractor = TSFreshFeatureExtractor(
        default_fc_parameters=default_fc_parameters,
        disable_progressbar=True,
    )
    features_train = extractor.fit_transform(X_train, y_train)

    mean_columns = features_train.filter(like="__mean", axis=1)
    actual = mean_columns.values.ravel()
    expected = from_nested_to_2d_array(X_train).mean(axis=1).values

    # Sanity check on the reference value itself before comparing.
    assert expected[0] == X_train.iloc[0, 0].mean()
    np.testing.assert_allclose(actual, expected)
示例#10
0
    def _apply_rowwise(self, func, X, y=None):
        """Apply ``func`` to every row (cell) of the data container.

        Used for both the transform and the inverse_transform function.
        ``y`` is not used.

        Two strategies are tried in order:

        * column-wise ``Series.apply`` - relatively fast, but each cell is
          not 2d, so it breaks if the transformer expects 2d input;
        * explicit loops over columns and rows, wrapping every cell in a
          2d DataFrame - most robust but very slow.

        An earlier approach (``X.apply`` on the whole frame) was dropped
        because it only worked on multiple columns when all of them had
        equal-length series.
        """
        self.check_is_fitted()
        X = check_X(X, coerce_to_pandas=True)

        try:
            # Fast path: apply func cell by cell, one column at a time.
            applied_cols = [pd.Series(col.apply(func)) for _, col in X.items()]
            Xt = pd.concat(applied_cols, axis=1)
        except Exception:
            # Slow path: hand every cell to func as a 2d dataframe and
            # flatten whatever comes back.
            cols_t = [
                [func(pd.DataFrame(row)).ravel() for row in X.iloc[:, c]]
                for c in range(X.shape[1])
            ]

            # If series-to-series transform, flatten transformed series
            # while concatenating the transformed columns.
            Xt = _concat_nested_arrays(cols_t)

            # Tabularise/unnest series-to-primitive transforms, i.e. cells
            # that hold a single value.
            first_cell = Xt.iloc[0, 0]
            is_nested = isinstance(first_cell, (pd.Series, np.ndarray))
            if is_nested and len(first_cell) == 1:
                Xt = from_nested_to_2d_array(Xt)
        return Xt
示例#11
0
    def transform(self, X, y=None):
        """Transform a data frame of time series data into HOG1Ds.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The input samples.
        y : optional (default=None)
            Not used by this method.

        Returns
        -------
        df : a pandas data frame with one column per column of X; every
            cell is a pd.Series built from the output of
            ``_calculate_hog1ds`` for that instance.
        """
        # Validate state and input.
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

        # Basic facts about the input; the series length is read from the
        # first cell only.
        num_insts = X.shape[0]
        col_names = X.columns
        num_atts = len(X.iloc[0, 0])

        self._check_parameters(num_atts)

        df = pd.DataFrame()
        for name in col_names:
            # One column of the dataframe as a 2d numpy array.
            arr = from_nested_to_2d_array(
                pd.DataFrame(X[name]), return_numpy=True
            )

            # HOG1Ds of every series in this column.
            hog1ds = np.asarray(
                [self._calculate_hog1ds(arr[i]) for i in range(num_insts)]
            )

            # Store the result as one nested series per instance.
            df[name] = [pd.Series(row) for row in hog1ds]

        return df
示例#12
0
def test_pca_results(n_components):
    """PCATransformer must reproduce sklearn's PCA on the same data."""
    np.random.seed(42)
    X = pd.DataFrame(data=np.random.randn(10, 5))

    # Reference result from sklearn.
    expected = PCA(n_components=n_components).fit_transform(X)

    # Same data through the sktime transformer, in nested format.
    nested = from_2d_array_to_nested(X)
    transformer = PCATransformer(n_components=n_components)
    actual = from_nested_to_2d_array(transformer.fit_transform(nested))

    assert np.allclose(np.asarray(expected), np.asarray(actual))
示例#13
0
def test_padding_transformer():
    """Default padding on JapaneseVowels yields 29 * 12 tabular columns."""
    dataset = "JapaneseVowels"
    X_train, y_train = _load_dataset(dataset, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(dataset, split="test", return_X_y=True)

    padded = PaddingTransformer().fit_transform(X_train)

    # Once tabularized, the data has 12 dimensions, each padded to a
    # length of 29.
    n_columns = len(from_nested_to_2d_array(padded).columns)
    assert n_columns == 29 * 12
示例#14
0
def test_truncation_paramterised_transformer():
    """Truncating JapaneseVowels to [2, 10) yields 8 * 12 tabular columns."""
    dataset = "JapaneseVowels"
    X_train, y_train = _load_dataset(dataset, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(dataset, split="test", return_X_y=True)

    truncated = TruncationTransformer(2, 10).fit_transform(X_train)

    # Once tabularized, the data has 12 dimensions, each truncated to
    # (10 - 2) time points.
    n_columns = len(from_nested_to_2d_array(truncated).columns)
    assert n_columns == 8 * 12
示例#15
0
def test_padding_fill_value_transformer():
    """Padding JapaneseVowels to length 40 yields 40 * 12 tabular columns."""
    dataset = "JapaneseVowels"
    X_train, y_train = _load_dataset(dataset, split="train", return_X_y=True)
    X_test, y_test = _load_dataset(dataset, split="test", return_X_y=True)

    transformer = PaddingTransformer(pad_length=40, fill_value=1)
    padded = transformer.fit_transform(X_train)

    # Once tabularized, the data has 12 dimensions, each padded to the
    # requested length of 40.
    n_columns = len(from_nested_to_2d_array(padded).columns)
    assert n_columns == 40 * 12
示例#16
0
文件: slope.py 项目: wh28325/sktime
    def transform(self, X, y=None):
        """Replace every series in X by the gradients of its fitted lines.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The input samples.
        y : optional (default=None)
            Not used by this method.

        Returns
        -------
        df : a pandas data frame with one column per column of X; every
            cell is a pd.Series built from the output of
            ``_get_gradients_of_lines`` for that instance.
        """
        # Validate state and input.
        self.check_is_fitted()
        X = check_X(X, coerce_to_pandas=True)

        # Basic facts about the input; the series length is read from the
        # first cell only.
        n_timepoints = len(X.iloc[0, 0])
        num_instances = X.shape[0]
        col_names = X.columns

        self._check_parameters(n_timepoints)

        df = pd.DataFrame()
        for name in col_names:
            # One column of the dataframe as a 2d numpy array.
            arr = from_nested_to_2d_array(
                pd.DataFrame(X[name]), return_numpy=True
            )

            # Gradients for every series in this column.
            gradients = np.asarray(
                [
                    self._get_gradients_of_lines(arr[i])
                    for i in range(num_instances)
                ]
            )

            # Store the result as one nested series per instance.
            df[name] = [pd.Series(row) for row in gradients]

        return df
示例#17
0
文件: _paa.py 项目: zerefwayne/sktime
    def _perform_paa_along_dim(self, X):
        """Average every series of one dimension over ``num_intervals`` frames.

        Parameters
        ----------
        X : nested data accepted by ``from_nested_to_2d_array``
            It is flattened to a 2d numpy array whose rows are treated as
            the individual series.

        Returns
        -------
        dims : pandas DataFrame
            Single column ``0`` holding one pd.Series of frame averages per
            row of the flattened input.
        """
        X = from_nested_to_2d_array(X, return_numpy=True)

        num_atts = X.shape[1]
        num_insts = X.shape[0]
        dims = pd.DataFrame()
        data = []

        for i in range(num_insts):
            series = X[i, :]

            frames = []
            current_frame = 0
            current_frame_size = 0
            # True division: a frame may cover a fractional number of
            # points, so one point can be shared by two adjacent frames.
            frame_length = num_atts / self.num_intervals
            frame_sum = 0

            for n in range(num_atts):
                # How much of the current frame is still unfilled.
                remaining = frame_length - current_frame_size

                if remaining > 1:
                    # The whole point fits into the current frame.
                    frame_sum += series[n]
                    current_frame_size += 1
                else:
                    # Only the fraction `remaining` of this point belongs
                    # to the current frame.
                    frame_sum += remaining * series[n]
                    current_frame_size += remaining

                # Exact float comparison.
                # NOTE(review): rounding in the additions above could make
                # this miss a full frame - the check after the loop appears
                # to compensate for the last frame only; confirm.
                if current_frame_size == frame_length:
                    frames.append(frame_sum / frame_length)
                    current_frame += 1

                    # Carry the unused fraction of this point over into
                    # the next frame.
                    frame_sum = (1 - remaining) * series[n]
                    current_frame_size = 1 - remaining

            # if the last frame was lost due to double imprecision
            if current_frame == self.num_intervals - 1:
                frames.append(frame_sum / frame_length)

            data.append(pd.Series(frames))

        # Store all per-instance series in a single column named 0.
        dims[0] = data

        return dims
示例#18
0
文件: dwt.py 项目: wh28325/sktime
    def transform(self, X, y=None):
        """Replace every series in X by its extracted wavelet coefficients.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The input samples.
        y : optional (default=None)
            Not used by this method.

        Returns
        -------
        df : a pandas data frame with one column per column of X; every
            cell is a pd.Series built from one row of the output of
            ``_extract_wavelet_coefficients``.
        """
        # Validate state, input and parameters.
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

        self._check_parameters()

        df = pd.DataFrame()
        for name in X.columns:
            # One column of the dataframe as a 2d numpy array.
            arr = from_nested_to_2d_array(
                pd.DataFrame(X[name]), return_numpy=True
            )

            coefficients = np.asarray(self._extract_wavelet_coefficients(arr))

            # Store the result as one nested series per row.
            df[name] = [pd.Series(row) for row in coefficients]

        return df
示例#19
0
    def transform(self, X, y=None):
        """Transform nested pandas dataframe into tabular dataframe.

        Parameters
        ----------
        X : pandas DataFrame
            Nested dataframe with pandas series or numpy arrays in cells.
            Anything that is not a pandas DataFrame after ``check_X`` is
            handled by ``from_3d_numpy_to_2d_array``.
        y : array-like, optional (default=None)
            Not used by this method.

        Returns
        -------
        Xt : pandas DataFrame
            Transformed dataframe with only primitives in cells.
        """
        self.check_is_fitted()
        X = check_X(X)

        if not isinstance(X, pd.DataFrame):
            return from_3d_numpy_to_2d_array(X)
        return from_nested_to_2d_array(X)
示例#20
0
def test_dft_mft():
    """Compare SFA's ``_mft`` with its ``_discrete_fourier_transform``."""
    # Training data, also as a plain 2d numpy array.
    X, Y = load_gunpoint(split="train", return_X_y=True)
    X_tab = from_nested_to_2d_array(X, return_numpy=True)

    word_length = 6
    alphabet_size = 4

    # Single DFT transformation: the window spans the whole series.
    window_size = np.shape(X_tab)[1]
    sfa_params = dict(
        word_length=word_length,
        alphabet_size=alphabet_size,
        window_size=window_size,
        binning_method="equi-depth",
    )
    p = SFA(**sfa_params).fit(X, Y)
    dft = p._discrete_fourier_transform(X_tab[0])
    mft = p._mft(X_tab[0])

    # NOTE(review): this tolerance check is one-sided (no absolute value),
    # so large negative differences would pass - confirm that is intended.
    assert (mft - dft < 0.0001).all()

    # Windowed DFT transformation, with and without normalisation.
    for norm in [True, False]:
        for window_size in [140]:
            p = SFA(
                word_length=word_length,
                norm=norm,
                alphabet_size=alphabet_size,
                window_size=window_size,
                binning_method="equi-depth",
            ).fit(X, Y)
            mft = p._mft(X_tab[0])

            n_windows = len(X_tab[0]) - window_size + 1
            for start in range(n_windows):
                window = X_tab[0, start:window_size + start]
                dft_transformed = p._discrete_fourier_transform(window)
                assert (mft[start] - dft_transformed < 0.001).all()

            assert len(mft) == n_windows
            assert len(mft[0]) == word_length
示例#21
0
    def _transform_single_feature(self, X, feature):
        """Compute one catch22 feature for every input series.

        Parameters
        ----------
        X : pandas DataFrame or array with a ``shape`` attribute
            Input time series. A DataFrame is flattened with
            ``from_nested_to_2d_array``; the data is then reshaped to one
            row per instance.
        feature : int or str
            catch22 feature id (0 to 21) or catch22 feature name as listed
            in ``feature_names``.

        Returns
        -------
        Numpy array containing a catch22 feature for each input series

        Raises
        ------
        ValueError
            If ``feature`` is an id outside 0..21, an unknown name, or
            neither an int nor a str.
        """
        # Resolve the requested feature to a position in ``features``.
        if isinstance(feature, int):
            if not 0 <= feature <= 21:
                raise ValueError("Invalid catch22 feature ID")
        elif isinstance(feature, str):
            if feature not in feature_names:
                raise ValueError("Invalid catch22 feature name")
            feature = feature_names.index(feature)
        else:
            raise ValueError("Feature name or ID required")

        if isinstance(X, pd.DataFrame):
            X = from_nested_to_2d_array(X, return_numpy=True)

        # One flat row per instance, whatever the trailing dimensions are.
        n_instances = X.shape[0]
        flat = np.reshape(X, (n_instances, -1))

        # The feature function receives each series as a plain Python list.
        return np.array(
            [features[feature](series.tolist()) for series in flat]
        )