Пример #1
0
    def predict_proba(self, X):
        """Predict class probabilities for X.

        Every fitted estimator in ``estimators_`` is asked for its class
        probabilities; the result is the sum of those predictions divided by
        the number of estimators, i.e. their mean.

        Parameters
        ----------
        X : input samples
            Checked with ``validate_X`` and ``check_X_is_univariate`` and
            then passed through ``self._validate_X_predict`` before being
            handed to the individual estimators.

        Returns
        -------
        p : array
            Averaged output of the estimators' ``predict_proba``. According
            to the original documentation this has shape
            [n_samples, n_classes], with classes ordered as in the attribute
            `classes_`.
        """
        check_is_fitted(self, 'estimators_')

        # Input validation
        validate_X(X)
        check_X_is_univariate(X)
        X = self._validate_X_predict(X)

        # Work out how many jobs to use for the available estimators
        n_workers, _, _ = _partition_estimators(self.n_estimators,
                                                self.n_jobs)

        # Collect one probability prediction per fitted estimator
        runner = Parallel(n_jobs=n_workers, verbose=self.verbose)
        per_tree_proba = runner(
            delayed(tree.predict_proba)(X) for tree in self.estimators_)

        # Mean over estimators
        n_trees = len(self.estimators_)
        return np.sum(per_tree_proba, axis=0) / n_trees
Пример #2
0
    def transform(self, X):
        """Compute the matrix profile of every time series in the dataset.

        Parameters
        ----------
        X : pandas.DataFrame
            Time series dataset. It must pass ``validate_X`` and
            ``check_X_is_univariate``.

        Returns
        -------
        Xt : pandas.DataFrame
            Dataframe with the same number of rows as the input; row ``i``
            holds the output of ``stomp_self`` for the i-th series with
            subsequence length ``self.m``. The number of columns is expected
            to equal the number of subsequences of that length in each
            time series.
        """
        # Input checks
        validate_X(X)
        check_X_is_univariate(X)

        n_instances = X.shape[0]

        # Convert into tabular format (one row per series)
        tabulariser = Tabulariser()
        X = tabulariser.transform(X)

        # One matrix profile per row; stomp_self receives each series as a
        # (1, series_length) array.
        profiles = [stomp_self(np.array([X.iloc[i]]), self.m)
                    for i in range(n_instances)]

        return pd.DataFrame(profiles)
Пример #3
0
    def transform(self, X, y=None):
        """Find plateaus (runs of ``self.value``) in each series of X.

        For every series in the first column of X the start positions and
        lengths of all runs of ``self.value`` that are at least
        ``self.min_length`` long are collected. The results are also kept on
        the instance as ``_starts`` and ``_lengths``; both are reset on every
        call so that repeated calls do not accumulate results.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_columns]
            Nested dataframe with time-series in cells.
        y : ignored

        Returns
        -------
        Xt : pandas DataFrame
            Two columns, ``<column>_<value>_starts`` and
            ``<column>_<value>_lengths``, with one row per input series; each
            cell holds an array of start indices / segment lengths.
        """

        # input checks
        validate_X(X)
        check_X_is_univariate(X)

        # get column name
        column_name = X.columns[0]

        # The target value does not change between series, so decide once
        # how it has to be matched (NaN never compares equal to itself).
        value_is_nan = np.isnan(self.value)
        value_is_inf = not value_is_nan and np.isinf(self.value)

        # Start from a clean state: results of a previous call must not leak
        # into this one, otherwise the output has more rows than X.
        self._starts = []
        self._lengths = []

        # find plateaus (segments of the same value)
        for x in X.iloc[:, 0]:
            x = np.asarray(x)

            # 0/1 indicator of positions holding the target value
            if value_is_nan:
                i = np.where(np.isnan(x), 1, 0)
            elif value_is_inf:
                i = np.where(np.isinf(x), 1, 0)
            else:
                i = np.where(x == self.value, 1, 0)

            # pad with zeros so that runs touching either end of the series
            # still produce a start (+1) and an end (-1) transition
            transitions = np.diff(np.hstack([0, i, 0]))

            # compute starts, ends and lengths of the segments
            starts = np.where(transitions == 1)[0]
            ends = np.where(transitions == -1)[0]
            lengths = ends - starts

            # drop segments shorter than the minimum length
            long_enough = lengths >= self.min_length
            self._starts.append(starts[long_enough])
            self._lengths.append(lengths[long_enough])

        # put into dataframe
        Xt = pd.DataFrame()
        column_prefix = "%s_%s" % (column_name,
                                   "nan" if value_is_nan else str(self.value))
        Xt["%s_starts" % column_prefix] = pd.Series(self._starts)
        Xt["%s_lengths" % column_prefix] = pd.Series(self._lengths)
        return Xt
Пример #4
0
    def _set_oob_score(self, X, y):
        """Compute the out-of-bag decision function and score.

        For every estimator the samples that were not part of its bootstrap
        draw are predicted and the class probabilities are accumulated per
        output. The normalised sums are stored in
        ``oob_decision_function_`` and the mean accuracy of the arg-max
        class over all outputs in ``oob_score_``.
        """
        validate_X_y(X, y)
        check_X_is_univariate(X)

        n_outputs = self.n_outputs_
        class_counts = self.n_classes_
        n_samples = y.shape[0]

        # One (n_samples, n_classes) accumulator per output
        vote_sums = [
            np.zeros((n_samples, class_counts[k]))
            for k in range(n_outputs)
        ]

        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                       self.max_samples)

        # Accumulate predictions on the rows each estimator did not sample
        for estimator in self.estimators_:
            oob_idx = _generate_unsampled_indices(
                estimator.random_state, n_samples, n_samples_bootstrap)
            proba = estimator.predict_proba(X.iloc[oob_idx, :])

            # Single-output estimators return a bare array; wrap it so both
            # cases can be indexed by output
            per_output = [proba] if n_outputs == 1 else proba

            for k, accumulator in enumerate(vote_sums):
                accumulator[oob_idx, :] += per_output[k]

        decisions = []
        score_total = 0.0
        for k in range(n_outputs):
            row_totals = vote_sums[k].sum(axis=1)
            if (row_totals == 0).any():
                warn("Some inputs do not have OOB scores. "
                     "This probably means too few trees were used "
                     "to compute any reliable oob estimates.")

            # Normalise accumulated votes row-wise
            decisions.append(vote_sums[k] / row_totals[:, np.newaxis])

            hits = y[:, k] == np.argmax(vote_sums[k], axis=1)
            score_total += np.mean(hits, axis=0)

        self.oob_decision_function_ = (decisions[0] if n_outputs == 1
                                       else decisions)
        self.oob_score_ = score_total / n_outputs
Пример #5
0
    def fit(self, X, y=None):
        """
        Fit transformer, generating interval indices.

        ``self.intervals`` may either be a NumPy array, which is used as
        ``intervals_`` unchanged, or an integer, in which case the time index
        of X is split into that many chunks with ``np.array_split``.

        Parameters
        ----------
        X : pandas DataFrame of shape [n_samples, n_features]
            Input data
        y : pandas Series, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        self : an instance of self.

        Raises
        ------
        ValueError
            If ``self.intervals`` is neither a NumPy array nor an integer.
        """

        validate_X(X)
        check_X_is_univariate(X)

        self.input_shape_ = X.shape

        # Retrieve time-series indexes from each column.
        self._time_index = get_time_index(X)

        intervals = self.intervals
        # np.issubdtype expects a type/dtype, not a value: passing a plain
        # Python int raised TypeError instead of taking the integer branch,
        # so test the instance directly. bool is excluded on purpose even
        # though it subclasses int.
        is_integer = (isinstance(intervals, (int, np.integer))
                      and not isinstance(intervals, bool))

        if isinstance(intervals, np.ndarray):
            self.intervals_ = intervals

        elif is_integer:
            self.intervals_ = np.array_split(self._time_index, intervals)

        else:
            raise ValueError(
                f"Intervals must be either an integer, a single array with "
                f"start and end points, but found: {self.intervals}")

        return self
Пример #6
0
    def transform(self, X, y=None):
        """
        Transform X: segment the time-series into the intervals generated
        during `fit` and extract features from each interval.

        Parameters
        ----------
        X : nested pandas.DataFrame of shape [n_samples, n_features]
            Nested dataframe with time-series in cells.
        y : ignored

        Returns
        -------
        Xt : pandas.DataFrame
          Transformed pandas DataFrame with same number of rows and one
          column for each (feature function, interval) pair, named
          ``<column>_<start>_<end>_<function name>``.

        Raises
        ------
        ValueError
            If X has a different number of columns than the data seen in
            `fit`.
        """
        # Check that fit has been called
        check_is_fitted(self, 'intervals_')
        validate_X(X)
        check_X_is_univariate(X)

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape[1] != self.input_shape_[1]:
            raise ValueError(
                'Number of columns of input is different from what was seen '
                'in `fit`')
        # TODO also verify that the time indexes of X match those seen in
        #  `fit`; this check is currently not enforced.

        n_rows = X.shape[0]
        n_features = len(self.features)
        n_intervals = len(self.intervals_)

        # Allocate output array for transformed data
        Xt = np.zeros((n_rows, n_features * n_intervals))
        columns = []
        colname = X.columns[0]

        # Tabularize each column assuming series have equal indexes in any
        # given column.
        # TODO generalise to non-equal-index cases
        arr = tabularize(X, return_array=True)

        i = 0
        for func in self.features:
            # TODO generalise to series-to-series functions and function
            #  kwargs
            for start, end in self.intervals_:
                interval = arr[:, start:end]

                # Try to use optimised computations over axis if possible,
                # otherwise iterate over rows.
                try:
                    Xt[:, i] = func(interval, axis=1)
                except TypeError as e:
                    # Only fall back when the function itself rejects the
                    # `axis` keyword. Match on the stable part of the
                    # message: the function name in front of it may be the
                    # qualified name rather than `__name__`, and newer
                    # interpreters may append a suggestion after it.
                    if ("got an unexpected keyword argument 'axis'"
                            not in str(e)):
                        raise
                    Xt[:, i] = np.apply_along_axis(func, 1, interval)
                i += 1
                columns.append(f'{colname}_{start}_{end}_{func.__name__}')

        self.columns_ = columns
        Xt = pd.DataFrame(Xt)
        Xt.columns = self.columns_
        return Xt
Пример #7
0
    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y).

        New estimators are created with ``_make_estimator`` and fitted in
        parallel via ``_parallel_build_trees``. With ``warm_start`` enabled
        and an existing ``estimators_`` list, only the missing
        ``n_estimators - len(estimators_)`` estimators are fitted and
        appended.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples. It is checked with ``validate_X_y``
            and ``check_X_is_univariate``.
            NOTE(review): the original text promised a conversion to
            ``dtype=np.float32`` / sparse ``csc_matrix``; no such conversion
            is visible in this method - confirm the accepted input type.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels in classification, real numbers in
            regression). A 1d ``y`` is reshaped to a single column.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.
            If ``_validate_y_class_weight`` returns expanded class weights,
            they are multiplied into (or used as) the sample weights.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``oob_score`` is requested without ``bootstrap``, or if
            ``n_estimators`` is smaller than the number of already fitted
            estimators when warm-starting.
        """

        # Validate or convert input data
        validate_X_y(X, y)
        check_X_is_univariate(X)

        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)

        # Remember the number of columns of the training data
        self.n_columns = X.shape[1]

        # Remap output: bring y into a 2d (n_samples, n_outputs) layout
        y = np.atleast_1d(y)
        if y.ndim == 2 and y.shape[1] == 1:
            warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        y, expanded_class_weight = self._validate_y_class_weight(y)

        # Make sure y is a contiguous array of dtype DOUBLE
        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Fold class weights into the per-sample weights
        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Check parameters
        self._validate_estimator()

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        random_state = check_random_state(self.random_state)

        if not self.warm_start or not hasattr(self, "estimators_"):
            # Free allocated memory, if any
            self.estimators_ = []

        # Number of estimators still to be fitted (all of them unless
        # warm-starting on top of existing ones)
        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'len(estimators_)=%d when warm_start==True' %
                             (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
        else:
            if self.warm_start and len(self.estimators_) > 0:
                # We draw from the random state to get the random state we
                # would have got if we hadn't used a warm_start.
                random_state.randint(MAX_INT, size=len(self.estimators_))

            # Create the new, still unfitted estimators; they are not
            # appended to `estimators_` yet (append=False)
            trees = [
                self._make_estimator(append=False, random_state=random_state)
                for _ in range(n_more_estimators)
            ]

            # Parallel loop: for standard random forests, the threading
            # backend is preferred as the Cython code for fitting the trees
            # is internally releasing the Python GIL making threading more
            # efficient than multiprocessing in that case. However, in this case,
            # for fitting pipelines in parallel, multiprocessing is more efficient.
            trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(_parallel_build_trees)(t,
                                               self,
                                               X,
                                               y,
                                               sample_weight,
                                               i,
                                               len(trees),
                                               verbose=self.verbose,
                                               class_weight=self.class_weight)
                for i, t in enumerate(trees))

            # Collect newly grown trees
            self.estimators_.extend(trees)

        # The out-of-bag score is (re)computed over all estimators, also
        # when no new ones were fitted
        if self.oob_score:
            self._set_oob_score(X, y)

        # Decapsulate classes_ attributes: for a single output expose the
        # first (only) entry instead of a one-element container
        if hasattr(self, "classes_") and self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
Пример #8
0
    def inverse_transform(self, X, y=None):
        """Inverse transform X by adding back the seasonal components.

        The seasonal components fitted in `transform` are aligned with the
        time index of X, tiled or truncated to the length of the series and
        then added to (``model == 'additive'``) or multiplied with (any other
        model) the data.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_features]
            Nested dataframe with time-series in cells. It must have the same
            number of rows as the data seen during `transform`.
        y : ignored

        Returns
        -------
        Xt : pandas DataFrame
          Nested pandas DataFrame with the same columns as X containing the
          re-seasonalised series. If the seasonal periodicity ``sp`` is 1,
          X itself is returned unchanged.

        Raises
        ------
        ValueError
            If the number of rows of X differs from the number of rows seen
            during `transform`.
        """

        if self.check_input:
            validate_X(X)
            check_X_is_univariate(X)

        # check that number of samples are the same, inverse transform depends
        # on parameters fitted in transform and hence only works on data with
        # the same (number of) rows
        if X.shape[0] != self._input_shape[0]:
            raise ValueError(f"Inverse transform only works on data with the same number samples "
                             f"as seen during transform, but found: {X.shape[0]} samples "
                             f"!= {self._input_shape[0]} samples (seen during transform)")

        # if the seasonal periodicity is 1, return unchanged X
        sp = self.sp
        if sp == 1:
            return X

        # check if seasonal decomposition model has been fitted in transform
        check_is_fitted_in_transform(self, 'seasonal_components_')

        # time index of X, compared below against the one stored on the
        # instance
        time_index = get_time_index(X)

        # align seasonal components with index of X
        if self._time_index.equals(time_index):
            # same index as used for fitting: use the fitted components as is
            seasonal_components = self.seasonal_components_
        else:
            # if time index is not aligned, make sure to align fitted seasonal
            # components to new index
            seasonal_components = self._align_seasonal_components_to_index(
                time_index)

        # expand or shorten aligned seasonal components to same size as X
        n_obs = len(time_index)
        if n_obs > sp:
            # builtin int: the `np.int` alias no longer exists in current
            # NumPy releases
            n_tiles = int(np.ceil(n_obs / sp))
            seasonal_components = np.tile(seasonal_components, n_tiles)
        seasonal_components = seasonal_components[:, :n_obs]

        # convert into tabular format
        tabulariser = Tabulariser()
        Xs = tabulariser.transform(X.iloc[:, :1])

        # inverse transform data
        if self.model == 'additive':
            Xit = Xs + seasonal_components
        else:
            Xit = Xs * seasonal_components

        # convert back into nested format
        Xit = tabulariser.inverse_transform(pd.DataFrame(Xit))
        Xit.columns = X.columns
        return Xit