Example #1
def test_tabularize():
    n_obs_X = 20
    n_cols_X = 3
    X = generate_df_from_array(np.random.normal(size=n_obs_X),
                               n_rows=10,
                               n_cols=n_cols_X)

    # Test single series input.
    Xt = tabularize(X.iloc[:, 0], return_array=True)
    assert Xt.shape[0] == X.shape[0]
    assert Xt.shape[1] == n_obs_X

    Xt = tabularize(X.iloc[:, 0])
    assert Xt.index.equals(X.index)

    # Test dataframe input with columns having series of different length.
    n_obs_Y = 13
    n_cols_Y = 2
    Y = generate_df_from_array(np.random.normal(size=n_obs_Y),
                               n_rows=10,
                               n_cols=n_cols_Y)
    X = pd.concat([X, Y], axis=1)

    Xt = tabularize(X, return_array=True)
    assert Xt.shape[0] == X.shape[0]
    assert Xt.shape[1] == (n_cols_X * n_obs_X) + (n_cols_Y * n_obs_Y)

    Xt = tabularize(X)
    assert Xt.index.equals(X.index)
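For reference, tabularize simply unnests a column of equal-length series into a wide 2D frame. A minimal plain-pandas sketch of that unnesting, assuming equal-length cells (the construction below is illustrative, not sktime's implementation):

import numpy as np
import pandas as pd

# Toy nested frame: 2 instances, each cell holding a length-4 series.
X = pd.DataFrame({"dim_0": [pd.Series(np.arange(4.0)),
                            pd.Series(np.arange(4.0) * 2)]})

# Unnest: every time point becomes its own column, as tabularize does.
Xt = pd.DataFrame(np.vstack(X["dim_0"].to_list()), index=X.index)
print(Xt.shape)  # (2, 4): one row per instance, one column per time point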
Example #2
def test_row_transformer_transform_inverse_transform():
    X, y = load_gunpoint(return_X_y=True)
    t = RowTransformer(StandardScaler())
    Xt = t.fit_transform(X)
    Xit = t.inverse_transform(Xt)
    assert Xit.shape == X.shape
    # check series-to-series transform
    assert isinstance(Xit.iloc[0, 0], (pd.Series, np.ndarray))
    np.testing.assert_array_almost_equal(tabularize(X).values,
                                         tabularize(Xit).values, decimal=5)
Example #3
def test_random_state():
    X = generate_df_from_array(np.random.normal(size=10))
    random_state = 1234

    for n_intervals in [0.5, 10, 'sqrt', 'random', 'log']:
        trans = RandomIntervalSegmenter(n_intervals=n_intervals, random_state=random_state)
        first_Xt = trans.fit_transform(X)
        for _ in range(N_ITER):
            trans = RandomIntervalSegmenter(n_intervals=n_intervals, random_state=random_state)
            Xt = trans.fit_transform(X)
            np.testing.assert_array_equal(tabularize(first_Xt).values, tabularize(Xt).values)
Example #4
def distance(instance_a, instance_b, **params):
    # find distance
    # TODO: use a specific dimension rather than the whole thing?
    instance_a = tabularize(instance_a, return_array=True)
    instance_b = tabularize(instance_b, return_array=True)
    instance_a = np.transpose(instance_a)
    instance_b = np.transpose(instance_b)
    return distance_measure(instance_a, instance_b, **params)
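A hedged usage sketch for the wrapper above: distance_measure is closed over from the enclosing scope, so any callable taking the transposed (series_length, n_dims) arrays works. The Euclidean stand-in below is illustrative, not the library's measure.

import numpy as np

def euclidean_distance(a, b, **params):
    # plain Euclidean distance between two (series_length, n_dims) arrays
    return float(np.sqrt(np.sum((a - b) ** 2)))

a = np.array([[1.0, 2.0, 3.0]]).T  # a univariate instance, transposed
b = np.array([[1.0, 2.0, 5.0]]).T
print(euclidean_distance(a, b))  # 2.0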
Example #5
def test_output_format_dim(len_series, n_instances, n_components):
    np.random.seed(42)
    X = detabularize(pd.DataFrame(data=np.random.randn(n_instances, len_series)))

    trans = PCATransformer(n_components=n_components)
    Xt = trans.fit_transform(X)

    # Check number of rows and output type.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Check number of principal components in the output.
    assert tabularize(Xt).shape[1] == min(n_components, tabularize(X).shape[1])
Example #6
    def transform(self, X, y=None):
        """
        Transform X, transforms univariate time-series using sklearn's PCA
        class

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, 1]
            Nested dataframe with univariate time-series in cells.

        Returns
        -------
        Xt : pandas DataFrame
          Transformed pandas DataFrame with the same number of rows and the
          (potentially reduced) PCA transformed
          column. Time indices of the original column are replaced with 0:(
          n_components - 1).
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)

        # Transform X using the fitted PCA
        Xtab = tabularize(X)
        Xpca = pd.DataFrame(data=self.pca.transform(Xtab),
                            index=Xtab.index,
                            columns=Xtab.columns[:self.pca.n_components_])

        # Back-transform into time series data format
        Xt = detabularise(Xpca, index=X.index)
        Xt.columns = X.columns
        return Xt
Example #7
    def _transform_words(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False)

        if self.use_first_order_differences:
            X = self.add_first_order_differences(X)

        bag_all_words = [dict() for _ in range(len(X))]

        # On each dimension, perform SFA
        for ind, column in enumerate(self.col_names):
            X_dim = X[column]
            X_dim = tabularize(X_dim, return_array=True)

            for i, window_size in enumerate(self.window_sizes[ind]):

                # SFA transform
                sfa_words = self.SFA_transformers[ind][i].transform(X_dim)
                bag = sfa_words[0]  # .iloc[:, 0]

                # merging bag-of-patterns of different window_sizes
                # to single bag-of-patterns with prefix indicating
                # the used window-length
                highest = np.int32(self.highest_bits[ind])
                for j in range(len(bag)):
                    for (key, value) in bag[j].items():
                        # append the prefixes to the words to distinguish
                        # between window sizes
                        word = MUSE.shift_left(key, highest, ind,
                                               self.highest_dim_bit,
                                               window_size)

                        bag_all_words[j][word] = value

        return bag_all_words
Example #8
    def fit(self, X, y=None):
        """Calculate word breakpoints using _mcb

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The class labels.

        Returns
        -------
        self : object
         """

        if self.alphabet_size < 2 or self.alphabet_size > 4:
            raise ValueError(
                "Alphabet size must be an integer between 2 and 4")

        if self.word_length < 1 or self.word_length > 16:
            raise ValueError("Word length must be an integer between 1 and 16")

        if self.igb and y is None:
            raise ValueError(
                "Class values must be provided for information gain binning")

        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        self.n_instances, self.series_length = X.shape
        self.breakpoints = self._igb(X, y) if self.igb else self._mcb(X)

        self._is_fitted = True
        return self
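The equi-depth flavour of MCB places breakpoints at empirical quantiles of each Fourier coefficient, so each symbol is roughly equally likely. A minimal sketch of that idea, which only approximates what _mcb does internally:

import numpy as np

def mcb_equi_depth(X, alphabet_size):
    # breakpoints per column = quantiles, shape (n_columns, alphabet_size - 1)
    qs = np.linspace(0, 1, alphabet_size + 1)[1:-1]
    return np.quantile(X, qs, axis=0).T

X = np.random.randn(100, 4)  # 100 instances, 4 coefficients
print(mcb_equi_depth(X, alphabet_size=4).shape)  # (4, 3)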
Example #9
    def _transform_words(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        bag_all_words = [dict() for _ in range(len(X))]
        for i, window_size in enumerate(self.window_sizes):

            # SFA transform
            sfa_words = self.SFA_transformers[i].transform(X)
            bag = sfa_words[0]  # .iloc[:, 0]

            # merging bag-of-patterns of different window_sizes
            # to single bag-of-patterns with prefix indicating
            # the used window-length
            for j in range(len(bag)):
                for (key, value) in bag[j].items():
                    # append the prefixes to the words to distinguish
                    # between window sizes
                    if isinstance(key, tuple):
                        word = (
                            ((key[0] << self.highest_bit) | key[1]) << 3
                        ) | window_size
                    else:
                        # word = ((key << self.highest_bit) << 3) | window_size
                        word = WEASEL.shift_left(key, self.highest_bit, window_size)

                    bag_all_words[j][word] = value

        return bag_all_words
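The packing in the tuple branch is plain bit concatenation: the word keys occupy the high bits and the window size the low bits. A worked toy example, with a small window size so the three low bits reserved by "<< 3" suffice:

highest_bit = 8         # assumed bit width of a single SFA word
key = (0b1010, 0b0011)  # hypothetical bigram word pair
window_size = 5         # fits in the 3 low bits for this toy example

word = (((key[0] << highest_bit) | key[1]) << 3) | window_size
print(bin(word))  # 0b101000000011101 = 1010 | 00000011 | 101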
Example #10
    def predict(self, X):
        self.check_is_fitted()

        if isinstance(X, (pd.Series, pd.DataFrame)):
            X = check_X(X, enforce_univariate=True)
            X = tabularize(X, return_array=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]  # .iloc[:, 0]

        for test_bag in test_bags:
            best_sim = -1
            nn = None

            for n, bag in enumerate(self.transformed_data):
                sim = histogram_intersection(test_bag, bag)

                if sim > best_sim or (sim == best_sim and rng.random() < 0.5):
                    best_sim = sim
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)
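The nearest-neighbour search above only needs histogram_intersection to score two bag-of-words dictionaries. A minimal sketch of such a similarity, assuming bags map word -> count (illustrative, not the library's implementation):

def histogram_intersection(bag_a, bag_b):
    # sum of the overlapping counts for the words both bags contain
    return sum(min(count, bag_b.get(word, 0))
               for word, count in bag_a.items())

print(histogram_intersection({1: 3, 2: 1}, {1: 2, 3: 4}))  # min(3, 2) = 2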
Example #11
def test_dft_mft():
    # load training data
    X, Y = load_gunpoint(split="train", return_X_y=True)
    X_tab = tabularize(X, return_array=True)

    word_length = 6
    alphabet_size = 4

    print("Single DFT transformation")
    window_size = np.shape(X_tab)[1]
    p = SFA(word_length=word_length, alphabet_size=alphabet_size,
            window_size=window_size, binning_method="equi-depth").fit(X, Y)
    dft = p._discrete_fourier_transform(X_tab[0])
    mft = p._mft(X_tab[0])

    assert (mft - dft < 0.0001).all()

    print("Windowed DFT transformation")

    for norm in [True, False]:
        for window_size in [140]:
            p = SFA(word_length=word_length, norm=norm,
                    alphabet_size=alphabet_size, window_size=window_size,
                    binning_method="equi-depth").fit(X, Y)
            mft = p._mft(X_tab[0])
            for i in range(len(X_tab[0]) - window_size + 1):
                dft_transformed = p._discrete_fourier_transform(
                    X_tab[0, i:window_size + i])
                assert (mft[i] - dft_transformed < 0.001).all()

            assert len(mft) == len(X_tab[0]) - window_size + 1
            assert len(mft[0]) == word_length
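The property the test checks is that the MFT matches a naive sliding DFT computed window by window. A self-contained sketch of that naive counterpart (normalisation details will differ from sktime's _discrete_fourier_transform):

import numpy as np

def sliding_dft(series, window_size, n_coeffs):
    # recompute the DFT of every window independently; the MFT obtains
    # the same coefficients incrementally in O(1) per window
    out = []
    for i in range(len(series) - window_size + 1):
        fft = np.fft.rfft(series[i:i + window_size])
        coeffs = np.empty(2 * len(fft))
        coeffs[0::2] = fft.real  # interleave real/imaginary parts
        coeffs[1::2] = fft.imag
        out.append(coeffs[:n_coeffs])
    return np.array(out)

series = np.sin(np.linspace(0, 4 * np.pi, 50))
print(sliding_dft(series, window_size=10, n_coeffs=6).shape)  # (41, 6)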
Example #12
    def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
        """
        Helper function for the shape_dtw class to combine two dataframes
        together into a single dataframe.
        Used when the shape_descriptor_function is set to "compound".
        """
        first_desc = dataFrames[0]
        second_desc = dataFrames[1]

        first_desc_array = []
        second_desc_array = []

        # Convert the dataframes into arrays
        for x in first_desc.columns:
            first_desc_array.append(
                tabularize(first_desc[x], return_array=True))

        for x in second_desc.columns:
            second_desc_array.append(
                tabularize(second_desc[x], return_array=True))

        # Concatenate the arrays together
        res = []
        for x in range(len(first_desc_array)):
            dim1 = []
            for y in range(len(first_desc_array[x])):
                dim2 = []
                dim2.extend(first_desc_array[x][y])
                dim2.extend(second_desc_array[x][y] * weighting_factor)
                dim1.append(dim2)
            res.append(dim1)

        res = np.asarray(res)

        # Convert to pandas dataframe
        df = pd.DataFrame()

        for i, col in enumerate(col_names):
            colToAdd = []
            for row in range(len(res[i])):
                inst = res[i][row]
                colToAdd.append(pd.Series(inst))
            df[col] = colToAdd

        return df
Example #13
    def transform(self, X, y=None):
        """

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.

        Returns
        -------
        dims : pandas DataFrame with the transformed first dimension in
            column zero
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        num_atts = X.shape[1]
        num_insts = X.shape[0]
        dims = pd.DataFrame()
        data = []

        for i in range(num_insts):
            series = X[i, :]

            frames = []
            current_frame = 0
            current_frame_size = 0
            frame_length = num_atts / self.num_intervals
            frame_sum = 0

            for n in range(num_atts):
                remaining = frame_length - current_frame_size

                if remaining > 1:
                    frame_sum += series[n]
                    current_frame_size += 1
                else:
                    frame_sum += remaining * series[n]
                    current_frame_size += remaining

                if current_frame_size == frame_length:
                    frames.append(frame_sum / frame_length)
                    current_frame += 1

                    frame_sum = (1 - remaining) * series[n]
                    current_frame_size = (1 - remaining)

            # if the last frame was lost due to double imprecision
            if current_frame == self.num_intervals - 1:
                frames.append(frame_sum / frame_length)

            data.append(pd.Series(frames))

        dims[0] = data

        return dims
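Because frame_length = num_atts / num_intervals is generally fractional, boundary points are split between adjacent frames. A worked check of that logic for 6 points and 4 frames (frame_length = 1.5), written as an explicit weighted mean:

import numpy as np

series = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])

# Each frame is a weighted mean; point 2 is split between frames 1 and 2,
# point 5 between frames 3 and 4 (weight 0.5 each).
weights = np.array([
    [1.0, 0.5, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.5, 1.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0, 0.5, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.5, 1.0],
])
frames = weights @ series / 1.5
print(frames)  # approx. [1.333 2.667 4.333 5.667]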
Example #14
    def predict_proba(self, X):
        """
        Find probability estimates for each class for all cases in X.
        Parameters
        ----------
        X : The training input samples. array-like or sparse matrix of shape
        = [n_test_instances, series_length]
            If a Pandas data frame is passed (sktime format) a check is
            performed that it only has one column.
            If not, an exception is thrown, since this classifier does not
            yet have
            multivariate capability.

        Local variables
        ----------
        n_test_instances     : int, number of cases to classify
        series_length    : int, number of attributes in X, must match
        _num_atts determined in fit

        Returns
        -------
        output : array of shape = [n_test_instances, num_classes] of
        probabilities
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        n_test_instances, series_length = X.shape
        if series_length != self.series_length:
            raise ValueError(
                "Number of attributes in the test data does not match "
                "that seen in the training data")
        sums = np.zeros((X.shape[0], self.n_classes), dtype=np.float64)
        for i in range(0, self.n_estimators):
            transformed_x = np.empty(shape=(3 * self.n_intervals,
                                            n_test_instances),
                                     dtype=np.float32)
            for j in range(0, self.n_intervals):
                means = np.mean(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]],
                    axis=1)
                std_dev = np.std(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]],
                    axis=1)
                slope = self._lsq_fit(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]])
                transformed_x[3 * j] = means
                transformed_x[3 * j + 1] = std_dev
                transformed_x[3 * j + 2] = slope
            transformed_x = transformed_x.T
            sums += self.classifiers[i].predict_proba(transformed_x)

        output = sums / (np.ones(self.n_classes) * self.n_estimators)
        return output
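Each interval contributes three features: mean, standard deviation, and least-squares slope. A plausible stand-in for the _lsq_fit helper, returning the per-row regression slope against time indices (an assumption; the real helper may normalise differently):

import numpy as np

def lsq_fit_slope(X):
    # least-squares slope of each row against time indices 0..m-1
    t = np.arange(X.shape[1])
    t_centred = t - t.mean()
    X_centred = X - X.mean(axis=1, keepdims=True)
    return X_centred @ t_centred / (t_centred @ t_centred)

X = np.array([[0.0, 1.0, 2.0],   # slope 1
              [3.0, 3.0, 3.0]])  # slope 0
print(lsq_fit_slope(X))  # [1. 0.]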
Example #15
    def transform(self, X, y=None):
        """
        Transform X, segments time-series in each column into random
        intervals using interval indices generated
        during `fit`.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_features]
            Nested dataframe with time-series in cells.

        Returns
        -------
        Xt : pandas DataFrame
          Transformed pandas DataFrame with same number of rows and one
          column for each generated interval.
        """

        # Check inputs.
        self.check_is_fitted()
        X = check_X(X)

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape[1] != self.input_shape_[1]:
            raise ValueError(
                'Number of columns of input is different from what was '
                'seen in `fit`')
        # # Input validation
        # if not all([np.array_equal(fit_idx, trans_idx)
        #             for trans_idx, fit_idx in zip(check_equal_index(X),
        #             self._time_index)]):
        #     raise ValueError('Indexes of input time-series are different
        #     from what was seen in `fit`')

        # Segment into intervals.
        # TODO generalise to non-equal-index cases
        intervals = []
        colname = X.columns[0]
        colnames = []
        # Tabularise, assuming series have equal indexes in any given column.
        arr = tabularize(X, return_array=True)
        for start, end in self.intervals_:
            interval = arr[:, start:end]
            intervals.append(interval)
            colnames.append(f"{colname}_{start}_{end}")

        # Return nested pandas DataFrame.
        Xt = pd.DataFrame(concat_nested_arrays(intervals, return_arrays=True))
        Xt.columns = colnames
        return Xt
Example #16
def test_tsfresh_extractor(default_fc_parameters):
    X, y = make_classification_problem()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    transformer = TSFreshFeatureExtractor(
        default_fc_parameters=default_fc_parameters, disable_progressbar=True)

    Xt = transformer.fit_transform(X_train, y_train)
    actual = Xt.filter(like="__mean", axis=1).values.ravel()
    expected = tabularize(X_train).mean(axis=1).values

    assert expected[0] == X_train.iloc[0, 0].mean()
    np.testing.assert_allclose(actual, expected)
Example #17
    def _apply_rowwise(self, func, X, y=None):
        """Helper function to apply transform or inverse_transform function
        on each row of data container"""
        self.check_is_fitted()
        X = check_X(X)

        # 1st attempt: apply, relatively fast but not robust
        # try and except, but sometimes breaks in other cases than excepted
        # ValueError
        # Works on single column, but on multiple columns only if columns
        # have equal-length series.
        # try:
        #     Xt = X.apply(self.transformer.fit_transform)
        #
        # # Otherwise call apply on each column separately.
        # except ValueError as e:
        #     if str(e) == "arrays must all be same length":
        #         Xt = pd.concat([pd.Series(col.apply(
        #         self.transformer.fit_transform)) for _, col in X.items()],
        #         axis=1)
        #     else:
        #         raise

        # 2nd attempt: apply but iterate over columns, still relatively fast
        # but still not very robust
        # but column is not 2d and thus breaks if transformer expects 2d input
        try:
            Xt = pd.concat([pd.Series(col.apply(func))
                            for _, col in X.items()], axis=1)

        # 3rd attempt: explicit for-loops, most robust but very slow
        except Exception:
            cols_t = []
            for c in range(X.shape[1]):  # loop over columns
                col = X.iloc[:, c]
                rows_t = []
                for row in col:  # loop over rows in each column
                    row_2d = pd.DataFrame(row)  # convert into 2d dataframe
                    row_t = func(row_2d).ravel()  # apply transform
                    rows_t.append(row_t)  # append transformed rows
                cols_t.append(rows_t)  # append transformed columns

            # if series-to-series transform, flatten transformed series
            # concatenate transformed columns
            Xt = concat_nested_arrays(cols_t)

            # tabularise/unnest series-to-primitive transforms
            xt = Xt.iloc[0, 0]
            if isinstance(xt, (pd.Series, np.ndarray)) and len(xt) == 1:
                Xt = tabularize(Xt)
        return Xt
Example #18
    def transform(self, X, y=None):
        """
        Function to transform a data frame of time series data.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The training input samples.

        Returns
        -------
        df : a pandas data frame of shape = [n_samples, num_dims]
        """

        # Check the data
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False)

        # Get information about the dataframe
        num_insts = X.shape[0]
        col_names = X.columns
        num_atts = len(X.iloc[0, 0])

        # Check the parameters are appropriate
        self._check_parameters(num_atts)

        df = pd.DataFrame()

        for x in col_names:
            # Convert one of the columns in the dataframe to a numpy array
            arr = tabularize(pd.DataFrame(X[x]), return_array=True)

            # Get the HOG1Ds of each time series
            transformedData = []
            for i in range(num_insts):
                inst = self._calculate_hog1ds(arr[i])
                transformedData.append(inst)

            # Convert to numpy array
            transformedData = np.asarray(transformedData)

            # Add it to the dataframe
            colToAdd = []
            for i in range(len(transformedData)):
                inst = transformedData[i]
                colToAdd.append(pd.Series(inst))

            df[x] = colToAdd

        return df
Example #19
def test_pca_results(n_components):
    np.random.seed(42)

    # sklearn
    X = pd.DataFrame(data=np.random.randn(10, 5))
    pca = PCA(n_components=n_components)
    Xt1 = pca.fit_transform(X)

    # sktime
    Xs = detabularize(X)
    pca_transform = PCATransformer(n_components=n_components)
    Xt2 = pca_transform.fit_transform(Xs)

    assert np.allclose(np.asarray(Xt1), np.asarray(tabularize(Xt2)))
Example #20
    def transform(self, X, y=None):
        """
        Function to perform the transformation on the time series data.

        Parameters
        ----------
        X : a pandas dataframe of shape = [n_instances, 1]
            The training input samples.

        Returns
        -------
        df : a pandas data frame of shape = [n_instances, n_timepoints]
        """
        # get the number of attributes and instances
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        n_timepoints = X.shape[1]
        n_instances = X.shape[0]

        # Check the parameters are appropriate
        self._check_parameters(n_timepoints)

        pad_amnt = math.floor(self.window_length/2)
        padded_data = np.zeros((n_instances, n_timepoints + (2*pad_amnt)))

        # Pad both ends of X
        for i in range(n_instances):
            padded_data[i] = np.pad(X[i], pad_amnt, mode='edge')

        subsequences = np.zeros((n_instances, n_timepoints,
                                 self.window_length))

        # Extract subsequences
        for i in range(n_instances):
            subsequences[i] = self._extract_subsequences(padded_data[i],
                                                         n_timepoints)

        # Convert this into a pandas data frame
        df = pd.DataFrame()
        for i in range(len(subsequences)):
            inst = subsequences[i]
            data = []
            for j in range(len(inst)):
                data.append(pd.Series(inst[j]))
            df[i] = data

        return df.transpose()
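The pad-then-window step above can be reproduced with NumPy's sliding_window_view (NumPy >= 1.20): for an odd window_length, padding by floor(window_length / 2) on each side yields exactly one window per original time point. A minimal sketch:

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

series = np.array([1.0, 2.0, 3.0, 4.0])
window_length = 3
pad = window_length // 2

padded = np.pad(series, pad, mode="edge")        # [1. 1. 2. 3. 4. 4.]
subsequences = sliding_window_view(padded, window_length)
print(subsequences.shape)  # (4, 3): one window per original time point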
Example #21
def test_padding_parameterised_transformer():
    # load data
    name = 'JapaneseVowels'
    X_train, y_train = _load_dataset(name, split='train', return_X_y=True)
    X_test, y_test = _load_dataset(name, split='test', return_X_y=True)

    # print(X_train)

    padding_transformer = PaddingTransformer(pad_length=40)
    Xt = padding_transformer.fit_transform(X_train)

    # when we tabularize the data it has 12 dimensions
    # and we've padded them all to length 40
    data = tabularize(Xt)
    assert len(data.columns) == 40 * 12
Example #22
def test_padding_transformer():
    # load data
    name = 'JapaneseVowels'
    X_train, y_train = _load_dataset(name, split='train', return_X_y=True)
    X_test, y_test = _load_dataset(name, split='test', return_X_y=True)

    # print(X_train)

    padding_transformer = PaddingTransformer()
    Xt = padding_transformer.fit_transform(X_train)

    # when we tabularize the data it has 12 dimensions
    # and we've padded them to their maximum length of 29
    data = tabularize(Xt)
    assert len(data.columns) == 29 * 12
Example #23
def test_truncation_transformer():
    # load data
    name = 'JapaneseVowels'
    X_train, y_train = _load_dataset(name, split='train', return_X_y=True)
    X_test, y_test = _load_dataset(name, split='test', return_X_y=True)

    # print(X_train)

    truncated_transformer = TruncationTransformer(5)
    Xt = truncated_transformer.fit_transform(X_train)

    # when we tabularize the data it has 12 dimensions
    # and we've truncated them all to length 5
    data = tabularize(Xt)
    assert len(data.columns) == 5 * 12
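All three tests reduce to the same invariant: after padding or truncation every nested cell has the same length, so tabularize yields length * n_dims columns. A plain-pandas sketch of forcing ragged cells to a fixed length (zero-padding is an assumption; the real transformers' fill strategy may differ):

import numpy as np
import pandas as pd

def fix_length(cell, length):
    arr = np.asarray(cell)[:length]                        # truncate
    return pd.Series(np.pad(arr, (0, length - len(arr))))  # then zero-pad

col = pd.Series([pd.Series([1.0, 2.0]),
                 pd.Series([3.0, 4.0, 5.0, 6.0])])
fixed = pd.Series([fix_length(c, 3) for c in col])
print([len(s) for s in fixed])  # [3, 3]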
Example #24
    def predict_proba(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        sums = np.zeros((X.shape[0], self.n_classes))

        for n, clf in enumerate(self.classifiers):
            preds = clf.predict(X)
            for i in range(0, X.shape[0]):
                sums[i, self.class_dictionary[preds[i]]] += self.weights[n]

        dists = sums / (np.ones(self.n_classes) * self.weight_sum)

        return dists
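The ensemble vote above is a weighted histogram of member predictions, normalised so each row sums to one. A worked toy example with hypothetical weights:

import numpy as np

n_classes = 2
weights = [0.7, 0.3]                                # hypothetical weights
preds = [np.array([0, 1, 1]), np.array([0, 0, 1])]  # two members, 3 cases

sums = np.zeros((3, n_classes))
for w, p in zip(weights, preds):
    sums[np.arange(3), p] += w                      # weighted vote

dists = sums / sum(weights)
print(dists)  # [[1.  0. ] [0.3 0.7] [0.  1. ]]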
Example #25
    def transform(self, X, y=None):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        bags = pd.DataFrame()
        dim = []

        for i in range(X.shape[0]):
            dfts = self._mft(X[i, :])
            bag = {}
            last_word = -1
            repeat_words = 0
            words = []

            for window in range(dfts.shape[0]):
                word_raw = _create_word(dfts[window], self.word_length,
                                        self.alphabet_size, self.breakpoints)
                word = _BitWord(word=word_raw)

                words.append(word)
                if self.levels > 1:
                    repeat_word = self._add_to_pyramid(
                        bag, word, last_word,
                        window - int(repeat_words / 2))
                else:
                    repeat_word = self._add_to_bag(bag, word, last_word)
                if repeat_word:
                    repeat_words += 1
                else:
                    last_word = word.word
                    repeat_words = 0

                if self.bigrams:
                    if window - self.window_size >= 0 and window > 0:
                        bigram = words[window - self.window_size] \
                            .create_bigram(word, self.word_length)
                        if self.levels > 1:
                            bigram = (bigram, 0)
                        bag[bigram] = bag.get(bigram, 0) + 1

            if self.save_words:
                self.words.append(words)

            dim.append(pd.Series(bag))

        bags[0] = dim

        return bags
Example #26
    def transform(self, X, y=None):
        """
        Parameters
        ----------
        X : a pandas dataframe of shape = [n_samples, num_dims]
            The training input samples.

        Returns
        -------
        df : a pandas data frame of shape = [n_samples, num_dims]
        """

        # Check the data
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False)

        # Get information about the dataframe
        n_timepoints = len(X.iloc[0, 0])
        num_instances = X.shape[0]
        col_names = X.columns

        self._check_parameters(n_timepoints)

        df = pd.DataFrame()

        for x in col_names:
            # Convert one of the columns in the dataframe to numpy array
            arr = tabularize(pd.DataFrame(X[x]), return_array=True)

            # Calculate gradients
            transformedData = []
            for i in range(num_instances):
                res = self._get_gradients_of_lines(arr[i])
                transformedData.append(res)

            # Convert to Numpy array
            transformedData = np.asarray(transformedData)

            # Add it to the dataframe
            colToAdd = []
            for i in range(len(transformedData)):
                inst = transformedData[i]
                colToAdd.append(pd.Series(inst))

            df[x] = colToAdd

        return df
Example #27
    def transform(self, X, y=None):
        """Transform nested pandas dataframe into tabular dataframe.

        Parameters
        ----------
        X : pandas DataFrame
            Nested dataframe with pandas series or numpy arrays in cells.
        y : array-like, optional (default=None)

        Returns
        -------
        Xt : pandas DataFrame
            Transformed dataframe with only primitives in cells.
        """
        self.check_is_fitted()
        X = check_X(X)
        return tabularize(X)
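Because this transformer is stateless apart from the fitted flag, its main use is as a glue step that lets standard sklearn estimators consume nested frames. A hedged usage sketch; the import path is an assumption and varies across sktime versions:

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# import path is version-dependent; newer sktime releases expose it as:
from sktime.transformations.panel.reduce import Tabularizer

pipeline = Pipeline([
    ("tabularize", Tabularizer()),           # nested cells -> wide table
    ("classify", RandomForestClassifier()),  # any tabular estimator
])
# pipeline.fit(X_nested, y); pipeline.predict(X_nested)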
Example #28
    def fit(self, X, y):

        if isinstance(X, (pd.Series, pd.DataFrame)):
            X, y = check_X_y(X, y, enforce_univariate=True)
            X = tabularize(X, return_array=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self
Example #29
    def _perform_paa_along_dim(self, X):
        X = tabularize(X, return_array=True)

        num_atts = X.shape[1]
        num_insts = X.shape[0]
        dims = pd.DataFrame()
        data = []

        for i in range(num_insts):
            series = X[i, :]

            frames = []
            current_frame = 0
            current_frame_size = 0
            frame_length = num_atts / self.num_intervals
            frame_sum = 0

            for n in range(num_atts):
                remaining = frame_length - current_frame_size

                if remaining > 1:
                    frame_sum += series[n]
                    current_frame_size += 1
                else:
                    frame_sum += remaining * series[n]
                    current_frame_size += remaining

                if current_frame_size == frame_length:
                    frames.append(frame_sum / frame_length)
                    current_frame += 1

                    frame_sum = (1 - remaining) * series[n]
                    current_frame_size = (1 - remaining)

            # if the last frame was lost due to double imprecision
            if current_frame == self.num_intervals - 1:
                frames.append(frame_sum / frame_length)

            data.append(pd.Series(frames))

        dims[0] = data

        return dims
Example #30
    def predict_proba(self, X):
        """
        Find probability estimates for each class for all cases in X.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_columns]
            The training input samples.  If a Pandas data frame is passed,

        Local variables
        ----------
        n_samps     : int, number of cases to classify
        n_columns    : int, number of attributes in X, must match _num_atts
        determined in fit

        Returns
        -------
        output : array of shape = [n_instances, n_classes] of probabilities
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        n_cases, n_columns = X.shape
        if n_columns != self.series_length:
            raise ValueError(
                "Number of attributes in the test data does not match "
                "that seen in the training data")
        sums = np.zeros((X.shape[0], self.n_classes), dtype=np.float64)

        for i in range(0, self.n_estimators):
            acf_x = np.empty(shape=(n_cases, self.lags[i]))
            ps_len = (self.intervals[i][1] - self.intervals[i][0]) / 2
            ps_x = np.empty(shape=(n_cases, int(ps_len)))
            for j in range(0, n_cases):
                acf_x[j] = acf(X[j, self.intervals[i][0]:self.intervals[i][1]],
                               self.lags[i])
                ps_x[j] = ps(X[j, self.intervals[i][0]:self.intervals[i][1]])
            transformed_x = np.concatenate((acf_x, ps_x), axis=1)
            sums += self.estimators_[i].predict_proba(transformed_x)

        output = sums / (np.ones(self.n_classes) * self.n_estimators)
        return output
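The per-interval features here are an autocorrelation function (acf) and a power spectrum (ps). An illustrative autocorrelation helper in that spirit; the real helper's truncation and normalisation may differ:

import numpy as np

def autocorrelation(x, max_lag):
    # lag-k autocorrelation of a single series, k = 1..max_lag
    x = x - x.mean()
    denom = np.dot(x, x)
    return np.array([np.dot(x[:-lag], x[lag:]) / denom
                     for lag in range(1, max_lag + 1)])

x = np.sin(np.linspace(0, 6 * np.pi, 60))
print(autocorrelation(x, 3).round(2))  # high positive lag-1 correlation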