Example #1
import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_almost_equal
from sklearn.utils.multiclass import class_distribution


def test_class_distribution():
    y = np.array([[1, 0, 0, 1],
                  [2, 2, 0, 1],
                  [1, 3, 0, 1],
                  [4, 2, 0, 1],
                  [2, 0, 0, 1],
                  [1, 3, 0, 1]])
    # Define the sparse matrix with a mix of implicit and explicit zeros
    data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
    indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
    indptr = np.array([0, 6, 11, 11, 17])
    y_sp = sp.csc_matrix((data, indices, indptr), shape=(6, 4))

    classes, n_classes, class_prior = class_distribution(y)
    classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
    classes_expected = [[1, 2, 4],
                        [0, 2, 3],
                        [0],
                        [1]]
    n_classes_expected = [3, 3, 1, 1]
    class_prior_expected = [[3/6, 2/6, 1/6],
                            [1/3, 1/3, 1/3],
                            [1.0],
                            [1.0]]

    for k in range(y.shape[1]):
        assert_array_almost_equal(classes[k], classes_expected[k])
        assert_array_almost_equal(n_classes[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior[k], class_prior_expected[k])

        assert_array_almost_equal(classes_sp[k], classes_expected[k])
        assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])

    # Test again with explicit sample weights
    (classes,
     n_classes,
     class_prior) = class_distribution(y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0])
    (classes_sp,
     n_classes_sp,
     class_prior_sp) = class_distribution(y_sp, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0])
    class_prior_expected = [[4/9, 3/9, 2/9],
                            [2/9, 4/9, 3/9],
                            [1.0],
                            [1.0]]

    for k in range(y.shape[1]):
        assert_array_almost_equal(classes[k], classes_expected[k])
        assert_array_almost_equal(n_classes[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior[k], class_prior_expected[k])

        assert_array_almost_equal(classes_sp[k], classes_expected[k])
        assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
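
For reference, a minimal sketch (assuming scikit-learn's sklearn.utils.multiclass.class_distribution) of the per-output values the function returns:

# Sketch: per-output return values of class_distribution.
import numpy as np
from sklearn.utils.multiclass import class_distribution

y = np.array([[1, 0],
              [2, 0],
              [1, 0]])
classes, n_classes, class_prior = class_distribution(y)
print(classes)      # [array([1, 2]), array([0])]  sorted labels per output
print(n_classes)    # [2, 1]                       class count per output
print(class_prior)  # [array([0.667, 0.333]), array([1.])] (rounded) priors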
Example #3
    def fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self
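
The class_distribution(np.asarray(y).reshape(-1, 1))[0][0] idiom recurs throughout these examples: y is reshaped into a single-output column, the first [0] selects the classes element of the returned tuple, and the second [0] selects the array for that sole output. A minimal sketch of the equivalence (for a dense 1-d label vector it matches np.unique):

import numpy as np
from sklearn.utils.multiclass import class_distribution

y = np.array([2, 1, 2, 3, 1])
classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
assert np.array_equal(classes_, np.unique(y))  # array([1, 2, 3])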
Example #4
    def fit(self, X, y, **kwargs):
        self.nb_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        for itr in range(self.nb_iterations):
            # each construction shall have a different random initialisation
            y_cur, fit_time_cur, predict_time_cur = self.load_network_probs(
                self.network_name,
                itr,
                self.res_path,
                self.dataset_name,
                self.random_seed,
            )

            if itr == 0:
                self.y_pred = y_cur
                self.fit_time = fit_time_cur
                self.predict_time = predict_time_cur
            else:
                self.y_pred = self.y_pred + y_cur
                self.fit_time = self.fit_time + fit_time_cur
                self.predict_time = self.predict_time + predict_time_cur

        self.y_pred = self.y_pred / self.nb_iterations

        # check if binary classification
        if self.y_pred.shape[1] == 1:
            # first column is probability of class 0 and second is of class 1
            self.y_pred = np.hstack([1 - self.y_pred, self.y_pred])

        self._is_fitted = True

        return self
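
The final reshape reflects a common convention for binary problems: when a model emits a single probability column it is read as P(class 1), so P(class 0) is reconstructed as its complement. A minimal sketch:

import numpy as np

p1 = np.array([[0.2], [0.9], [0.5]])   # single column: P(class 1)
probs = np.hstack([1 - p1, p1])        # columns: [P(class 0), P(class 1)]
assert np.allclose(probs.sum(axis=1), 1.0)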
Example #5
    def fit(self, X, y):
        """Perform a shapelet transform then builds a random forest.
        Contract default for ST is 5 hours
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
         """
        X, y = check_X_y(X, y, enforce_univariate=True)
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.classifier.fit(X, y)

        #        self.shapelet_transform.fit(X,y)
        #        print("Shapelet Search complete")
        #        self.st_X =self.shapelet_transform.transform(X)
        #        print("Transform complete")
        #        X = np.asarray([a.values for a in X.iloc[:, 0]])
        #        self.classifier.fit(X,y)
        #       print("Build classifier complete")
        self._is_fitted = True
        return self
Example #6
    def fit(self, X, y):
        """Fit a random catch22 feature forest classifier.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        c22 = Catch22(outlier_norm=self.outlier_norm)
        c22_list = c22.fit_transform(X)

        self.classifier = RandomForestClassifier(
            n_jobs=self.n_jobs,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )

        X_c22 = np.nan_to_num(np.array(c22_list, dtype=np.float32),
                              copy=False, nan=0, posinf=0, neginf=0)
        self.classifier.fit(X_c22, y)

        self._is_fitted = True
        return self
Example #7
    def fit(self, X, y):

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.constituent_build_times = np.zeros(len(self.classifiers))
        self.train_accs_by_classifier = np.zeros(len(self.classifiers))
        self.training_preds = np.empty((len(self.classifiers), len(y)),
                                       dtype=type(y[0]))
        self.training_probas = np.empty(
            (len(self.classifiers), len(y), len(self.classes_)))

        # build each classifier
        for c_id in range(len(self.classifiers)):
            if self.verbose > 0:
                print("Building " + self.classifier_names[c_id])
            start_time = time.time()
            self.classifiers[c_id].random_state = self.random_state
            # tune hyperparameters first if a parameter grid was supplied
            if (self.classifier_param_grids is not None
                    and self.classifier_param_grids[c_id] is not None):
                grid = GridSearchCV(
                    estimator=self.classifiers[c_id],
                    param_grid=self.classifier_param_grids[c_id],
                    scoring='accuracy', cv=self.param_cv_folds,
                    verbose=self.verbose)
                self.classifiers[c_id] = grid.fit(X, y).best_estimator_

            # out-of-fold probability estimates for the train accuracy
            self.training_probas[c_id] = cross_val_predict(
                self.classifiers[c_id], X=X, y=y, cv=self.cv_folds,
                method='predict_proba')

            end_time = time.time()

            self.training_preds[c_id] = np.array(
                [self.classes_[np.argmax(x)]
                 for x in self.training_probas[c_id]])
            self.train_accs_by_classifier[c_id] = accuracy_score(
                y, self.training_preds[c_id])
            self.constituent_build_times[c_id] = end_time - start_time
            self.classifiers[c_id].fit(X, y)
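
The cross_val_predict(..., method='predict_proba') call above returns out-of-fold probability estimates row-aligned with y, so the resulting train accuracy is not contaminated by in-sample fitting. A minimal usage sketch:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

X = np.random.default_rng(1).normal(size=(40, 3))
y = np.repeat([0, 1], 20)
proba = cross_val_predict(LogisticRegression(), X, y, cv=5,
                          method="predict_proba")  # out-of-fold estimates
acc = accuracy_score(y, proba.argmax(axis=1))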
Example #8
    def fit(self, X, y):
        """Build a pipeline containing the ROCKET transformer and RidgeClassifierCV.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y)

        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self.classifier = rocket_pipeline = make_pipeline(
            Rocket(
                num_kernels=self.num_kernels,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
            ),
            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
        )
        rocket_pipeline.fit(X, y)

        self._is_fitted = True
        return self
Example #9
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using supervised
        intervals and summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. STSF has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(
            X,
            y,
            enforce_univariate=True,
            coerce_to_numpy=True,
        )
        X = X.squeeze(1)
        n_instances, _ = X.shape

        rng = check_random_state(self.random_state)

        cls, class_counts = np.unique(y, return_counts=True)
        self.n_classes = class_counts.shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.intervals_ = [[[] for _ in range(3)]
                           for _ in range(self.n_estimators)]

        _, X_p = signal.periodogram(X)
        X_d = np.diff(X, 1)

        balance_cases = np.zeros(0, dtype=np.int32)
        average = math.floor(n_instances / self.n_classes)
        for i, c in enumerate(cls):
            if class_counts[i] < average:
                cls_idx = np.where(y == c)[0]
                balance_cases = np.concatenate(
                    (rng.choice(cls_idx, size=average - class_counts[i]),
                     balance_cases))

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_estimator)(
                X,
                X_p,
                X_d,
                y,
                np.concatenate((rng.choice(n_instances, size=n_instances),
                                balance_cases)),
                i,
            ) for i in range(self.n_estimators))

        self._is_fitted = True
        return self
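
The balance_cases loop above oversamples each minority class up to the average class size before the bagging draw, so every bootstrap contains a floor of minority examples. A minimal sketch of the idea:

import numpy as np

rng = np.random.default_rng(0)
y = np.array([0, 0, 0, 0, 1, 2])           # class 0 dominates
average = len(y) // len(np.unique(y))      # 2 cases per class on average
extra = np.concatenate([
    rng.choice(np.where(y == c)[0], size=average - n)
    for c, n in zip(*np.unique(y, return_counts=True)) if n < average
])
# extra holds indices of under-represented classes (here classes 1 and 2)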
Example #10
    def _fit(self, X, y):
        """Fit an estimator using transformed data from the Catch22 transformer.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_dims]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_classes = np.unique(y).shape[0]

        self._transformer = Catch22(outlier_norm=self.outlier_norm)
        self._estimator = _clone_estimator(
            RandomForestClassifier(n_estimators=200)
            if self.estimator is None else self.estimator,
            self.random_state,
        )

        m = getattr(self._estimator, "n_jobs", None)
        if m is not None:
            self._estimator.n_jobs = self.n_jobs

        X_t = self._transformer.fit_transform(X, y)
        X_t = np.nan_to_num(X_t, copy=False, nan=0, posinf=0, neginf=0)
        self._estimator.fit(X_t, y)

        return self
Example #11
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y).

         Uses random intervals and catch22/tsf summary features.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_dimensions,
        series_length] or shape = [n_instances, series_length]
            The training input samples.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, coerce_to_numpy=True)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        if self.base_estimator is None or self.base_estimator == "DTC":
            self.tree = DecisionTreeClassifier(criterion="entropy")
        elif self.base_estimator == "CIT":
            self.tree = ContinuousIntervalTree()
        elif isinstance(self.base_estimator, BaseEstimator):
            self.tree = self.base_estimator
        else:
            raise ValueError("DrCIF invalid base estimator given.")

        if self.n_intervals is None:
            self.__n_intervals = int(
                math.sqrt(self.series_length) * math.sqrt(self.n_dims)
            )
        else:
            self.__n_intervals = self.n_intervals
        if self.__n_intervals <= 0:
            self.__n_intervals = 1

        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        if self.max_interval is None:
            self.__max_interval = self.series_length / 2
        else:
            self.__max_interval = self.max_interval
        if self.__max_interval < self.min_interval:
            self.__max_interval = self.min_interval

        fit = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_estimator)(
                X,
                y,
                i,
            )
            for i in range(self.n_estimators)
        )

        self.classifiers, self.intervals, self.dims, self.atts = zip(*fit)

        self._is_fitted = True
        return self
Example #12
    def fit(self, X, y):
        """Fit a single TD classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, coerce_to_numpy=True)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        # select dimensions using accuracy estimate if multivariate
        if self.n_dims > 1:
            self.dims, self.transformers = self._select_dims(X, y)

            words = [defaultdict(int) for _ in range(self.n_instances)]

            for i, dim in enumerate(self.dims):
                X_dim = X[:, dim, :].reshape(self.n_instances, 1,
                                             self.series_length)
                dim_words = self.transformers[i].transform(X_dim, y)
                dim_words = dim_words[0]

                # use a separate index so the outer loop's `i` is not clobbered
                for n in range(self.n_instances):
                    for word, count in dim_words[n].items():
                        words[n][word << self.highest_dim_bit | dim] = count

            self.transformed_data = words
        else:
            self.transformers.append(
                SFA(
                    word_length=self.word_length,
                    alphabet_size=self.alphabet_size,
                    window_size=self.window_size,
                    norm=self.norm,
                    levels=self.levels,
                    binning_method="information-gain"
                    if self.igb else "equi-depth",
                    bigrams=self.bigrams,
                    remove_repeat_words=True,
                    save_words=False,
                    n_jobs=self.n_jobs,
                ))
            sfa = self.transformers[0].fit_transform(X, y)
            self.transformed_data = sfa[0]

        self._is_fitted = True
        return self
Example #13
    def fit(self, X, y):
        sfa = self.transform.fit_transform(X)
        self.transformed_data = [series.to_dict() for series in sfa.iloc[:, 0]]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index
Example #14
    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        cv_size = 10
        _, counts = np.unique(y, return_counts=True)
        min_class = np.min(counts)
        if min_class < cv_size:
            cv_size = min_class

        self.stc = ShapeletTransformClassifier(
            random_state=self.random_state,
            time_contract_in_mins=60,
        )
        self.stc.fit(X, y)
        train_preds = cross_val_predict(
            ShapeletTransformClassifier(
                random_state=self.random_state,
                time_contract_in_mins=60,
            ),
            X=X,
            y=y,
            cv=cv_size,
        )
        self.stc_weight = accuracy_score(y, train_preds)**4

        self.tsf = TimeSeriesForest(random_state=self.random_state)
        self.tsf.fit(X, y)
        train_preds = cross_val_predict(
            TimeSeriesForest(random_state=self.random_state),
            X=X,
            y=y,
            cv=cv_size,
        )
        self.tsf_weight = accuracy_score(y, train_preds)**4

        self.rise = RandomIntervalSpectralForest(
            random_state=self.random_state)
        self.rise.fit(X, y)
        train_preds = cross_val_predict(
            RandomIntervalSpectralForest(random_state=self.random_state),
            X=X,
            y=y,
            cv=cv_size,
        )
        self.rise_weight = accuracy_score(y, train_preds)**4

        self.cboss = ContractableBOSS(random_state=self.random_state)
        self.cboss.fit(X, y)
        train_probs = self.cboss._get_train_probs(X)
        train_preds = self.cboss.classes_[np.argmax(train_probs, axis=1)]
        self.cboss_weight = accuracy_score(y, train_preds)**4

        return self
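
The accuracy_score(...) ** 4 pattern is the HIVE-COTE style weighting: raising each constituent's estimated train accuracy to the fourth power exaggerates differences between members before a weighted probability vote. A sketch of how such weights would combine predictions (the probabilities and accuracies below are invented for illustration):

import numpy as np

probas = [np.array([[0.6, 0.4]]),   # constituent 1, one test case
          np.array([[0.3, 0.7]])]   # constituent 2
weights = [0.90 ** 4, 0.70 ** 4]    # accuracy ** 4 per constituent

combined = sum(w * p for w, p in zip(weights, probas))
combined /= combined.sum(axis=1, keepdims=True)  # renormalise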
Example #15
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification. TSF has no bespoke method for multivariate
            classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(
            X,
            y,
            enforce_univariate=not TimeSeriesForest.capabilities["multivariate"],
            coerce_to_numpy=True,
        )
        X = X.squeeze(1)
        n_instances, self.series_length = X.shape

        rng = check_random_state(self.random_state)

        self.n_classes = np.unique(y).shape[0]

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        self.intervals_ = [
            _get_intervals(self.n_intervals, self.min_interval, self.series_length, rng)
            for _ in range(self.n_estimators)
        ]

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(
                X,
                y,
                self.base_estimator,
                self.intervals_[i],
                self.random_state,
            )
            for i in range(self.n_estimators)
        )

        self._is_fitted = True
        return self
Example #16
    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y) using random intervals and summary measures.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samps, num_atts]
            The training input samples.  If a Pandas data frame is passed, the column _dim_to_use is extracted
        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The class labels.

        Returns
        -------
        self : object
         """

        if isinstance(X, pd.DataFrame):
            if isinstance(X.iloc[0, self.dim_to_use], pd.Series):
                X = np.asarray([a.values for a in X.iloc[:, self.dim_to_use]])
            else:
                raise TypeError(
                    "Input should either be a 2d numpy array, or a pandas "
                    "dataframe containing Series objects")
        n_samps, self.series_length = X.shape

        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.intervals = np.zeros((self.num_trees, 2), dtype=int)
        self.intervals[0][0] = 0
        self.intervals[0][1] = self.series_length
        for i in range(1, self.num_trees):
            # assumes numpy-style single-argument randint (exclusive upper
            # bound), i.e. `random` is numpy.random, not the stdlib module
            self.intervals[i][0] = random.randint(
                self.series_length - self.min_interval)
            self.intervals[i][1] = random.randint(
                self.intervals[i][0] + self.min_interval, self.series_length)
        # Check lag against global properties
        if self.acf_lag > self.series_length-self.acf_min_values:
            self.acf_lag = self.series_length - self.acf_min_values
        if self.acf_lag < 0:
            self.acf_lag = 1
        self.lags = np.zeros(self.num_trees, dtype=int)
        for i in range(0, self.num_trees):
            temp_lag = self.acf_lag
            if temp_lag > (self.intervals[i][1] - self.intervals[i][0]
                           - self.acf_min_values):
                temp_lag = (self.intervals[i][1] - self.intervals[i][0]
                            - self.acf_min_values)
            if temp_lag < 0:
                temp_lag = 1
            self.lags[i] = int(temp_lag)
            acf_x = np.empty(shape=(n_samps, self.lags[i]))
            ps_len = (self.intervals[i][1] - self.intervals[i][0]) / 2
            ps_x = np.empty(shape=(n_samps, int(ps_len)))
            for j in range(0, n_samps):
                acf_x[j] = acf(X[j, self.intervals[i][0]:self.intervals[i][1]],
                               temp_lag)
                ps_x[j] = ps(X[j, self.intervals[i][0]:self.intervals[i][1]])
            transformed_x = np.concatenate((acf_x, ps_x), axis=1)
#            transformed_x=acf_x
            tree = deepcopy(self.base_estimator)
            tree.fit(transformed_x, y)
            self.classifiers.append(tree)
        return self
Example #17
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random
        intervals and catch22/tsf summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances, n_dims,
        series_length] or shape = [n_instances, series_length]
            The training input samples.  If a Pandas data frame is passed it
            must have a single column (i.e. univariate
            classification).
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, coerce_to_numpy=True)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        if self.n_intervals is None:
            self.__n_intervals = 4 + int(
                (math.sqrt(self.series_length) * math.sqrt(self.n_dims)) / 3)
        else:
            self.__n_intervals = self.n_intervals
        if self.__n_intervals <= 0:
            self.__n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        if self.max_interval is None:
            self.__max_interval = self.series_length / 2
        else:
            self.__max_interval = self.max_interval
        if self.__max_interval < self.min_interval:
            self.__max_interval = self.min_interval

        _, X_p = signal.periodogram(X)
        X_d = np.diff(X, 1)
        self.total_intervals = self.__n_intervals * 2 + int(
            self.__n_intervals / 2)

        fit = Parallel(n_jobs=self.n_jobs)(delayed(self._fit_estimator)(
            X,
            X_p,
            X_d,
            y,
            i,
        ) for i in range(self.n_estimators))

        self.classifiers, self.intervals, self.dims, self.atts = zip(*fit)

        self._is_fitted = True
        return self
Example #18
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y) using random intervals and summary features
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
         """
        if isinstance(X, pd.DataFrame):
            if X.shape[1] > 1:
                raise TypeError("TSF cannot handle multivariate problems yet")
            elif isinstance(X.iloc[0, 0], pd.Series):
                X = np.asarray([a.values for a in X.iloc[:, 0]])
            else:
                raise TypeError(
                    "Input should either be a 2d numpy array, or a pandas "
                    "dataframe with a single column of Series objects (TSF "
                    "cannot yet handle multivariate problems)")
        n_samps, self.series_length = X.shape

        self.n_classes = np.unique(y).shape[0]

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length
        self.intervals = np.zeros((self.n_trees, 3 * self.n_intervals, 2),
                                  dtype=int)
        for i in range(0, self.n_trees):
            transformed_x = np.empty(shape=(3 * self.n_intervals, n_samps))
            # Find the random intervals for classifier i and concatenate
            # features. Assumes numpy-style single-argument randint, i.e.
            # `random` here is numpy.random rather than the stdlib module.
            for j in range(0, self.n_intervals):
                self.intervals[i][j][0] = random.randint(
                    self.series_length - self.min_interval)
                length = random.randint(
                    self.series_length - self.intervals[i][j][0] - 1)
                if length < self.min_interval:
                    length = self.min_interval
                self.intervals[i][j][1] = self.intervals[i][j][0] + length
                # Transforms here, just hard coded, so not configurable
                means = np.mean(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]],
                    axis=1)
                std_dev = np.std(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]],
                    axis=1)
                slope = self.lsq_fit(
                    X[:, self.intervals[i][j][0]:self.intervals[i][j][1]])
                transformed_x[3 * j] = means
                transformed_x[3 * j + 1] = std_dev
                transformed_x[3 * j + 2] = slope
            tree = deepcopy(self.base_estimator)
            transformed_x = transformed_x.T
            tree.fit(transformed_x, y)
            self.classifiers.append(tree)
        return self
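
Each interval contributes three summary features per series: mean, standard deviation, and a least-squares slope. A sketch of what the slope helper presumably computes (lsq_slope below is a hypothetical stand-in for the snippet's self.lsq_fit, whose body is not shown):

import numpy as np

def lsq_slope(X_interval):
    # slope of a degree-1 least-squares fit, one value per series (row)
    t = np.arange(X_interval.shape[1])
    return np.polyfit(t, X_interval.T, 1)[0]

X = np.random.default_rng(0).normal(size=(5, 30))  # 5 series, length 30
start, end = 3, 17
means = X[:, start:end].mean(axis=1)
std_dev = X[:, start:end].std(axis=1)
slope = lsq_slope(X[:, start:end])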
Example #19
    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self
Example #20
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        if self.time_limit_in_minutes > 0:
            # contracting 2/3 transform (with 1/5 of that taken away for final
            # transform), 1/3 classifier
            third = self.time_limit_in_minutes / 3
            self._classifier_limit_in_minutes = third
            self._transform_limit_in_minutes = (third * 2) / 5 * 4
        elif self.transform_limit_in_minutes > 0:
            self._transform_limit_in_minutes = self.transform_limit_in_minutes

        self._transformer = RandomShapeletTransform(
            n_shapelet_samples=self.n_shapelet_samples,
            max_shapelets=self.max_shapelets,
            max_shapelet_length=self.max_shapelet_length,
            time_limit_in_minutes=self._transform_limit_in_minutes,
            contract_max_n_shapelet_samples=self.contract_max_n_shapelet_samples,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            random_state=self.random_state,
        )

        self._estimator = _clone_estimator(
            RotationForest() if self.estimator is None else self.estimator,
            self.random_state,
        )

        if isinstance(self._estimator, RotationForest):
            self._estimator.save_transformed_data = self.save_transformed_data

        m = getattr(self._estimator, "n_jobs", None)
        if m is not None:
            self._estimator.n_jobs = self._n_jobs

        m = getattr(self._estimator, "time_limit_in_minutes", None)
        if m is not None and self.time_limit_in_minutes > 0:
            self._estimator.time_limit_in_minutes = self._classifier_limit_in_minutes

        X_t = self._transformer.fit_transform(X, y).to_numpy()

        if self.save_transformed_data:
            self.transformed_data = X_t

        self._estimator.fit(X_t, y)
Example #21
    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame
            Panel training data.
        y : np.ndarray
            The class labels.

        Returns
        -------
        self : object
            A fitted instance of the classifier.
        """
        X, y = check_X_y(
            X,
            y,
            enforce_univariate=not self.capabilities["multivariate"],
            coerce_to_numpy=True,
        )
        X = X.squeeze(1)
        n_instances, self.series_length = X.shape

        n_jobs = check_n_jobs(self.n_jobs)

        rng = check_random_state(self.random_state)

        self.n_classes = np.unique(y).shape[0]

        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length

        self.intervals_ = [
            _get_intervals(self.n_intervals, self.min_interval,
                           self.series_length, rng)
            for _ in range(self.n_estimators)
        ]

        self.estimators_ = Parallel(n_jobs=n_jobs)(
            delayed(_fit_estimator)(_clone_estimator(self.base_estimator, rng),
                                    X, y, self.intervals_[i])
            for i in range(self.n_estimators))

        self._is_fitted = True
        return self
Example #22
    def fit(self, X, y):
        """Perform a shapelet transform then builds a random forest.

        Contract default for ST is 5 hours

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,
        series_length] or shape = [n_instances, n_columns]
            The training input samples. If a Pandas data frame is passed it
            must have a single column (i.e. univariate classification);
            there is no bespoke method for multivariate classification
            as yet.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)

        # if y is a pd.series then convert to array.
        if isinstance(y, pd.Series):
            y = y.to_numpy()

        # generate pipeline in fit so that random state can be propagated properly.
        self.classifier_ = Pipeline([
            (
                "st",
                ContractedShapeletTransform(
                    time_contract_in_mins=self.transform_contract_in_mins,
                    verbose=False,
                    random_state=self.random_state,
                ),
            ),
            (
                "rf",
                RandomForestClassifier(n_estimators=self.n_estimators,
                                       random_state=self.random_state),
            ),
        ])

        self.n_classes_ = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.classifier_.fit(X, y)

        self._is_fitted = True
        return self
Example #23
    def fit(self, X, y):
        """Fit a tree on cases (X,y), where y is the target variable.

        Build an information gain based tree for continuous attributes using the
        margin gain metric for ties.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_instances,n_attributes]
        The training input samples.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
        """
        if not isinstance(X, np.ndarray) or len(X.shape) > 2:
            raise ValueError(
                "ContinuousIntervalTree is not a time series classifier. "
                "A 2d numpy array is required.")
        X, y = check_X_y(X, y)

        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        rng = check_random_state(self.random_state)
        self.root = _TreeNode(random_state=rng)

        thresholds = np.linspace(np.min(X, axis=0), np.max(X, axis=0), 20)
        distribution_cls, distribution = unique_count(y)
        e = _entropy(distribution, distribution.sum())

        self.root.build_tree(
            X,
            y,
            thresholds,
            e,
            distribution_cls,
            distribution,
            0,
            self.max_depth,
            False,
        )

        self._is_fitted = True
        return self
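
The _entropy helper is presumably the Shannon entropy of the class-count distribution at the node; a minimal sketch of that computation (the helper's name and call signature come from the snippet above, the body is an assumption):

import numpy as np

def entropy_sketch(counts, total):
    # Shannon entropy (in bits) of a class-count distribution
    p = np.asarray(counts, dtype=float) / total
    p = p[p > 0]
    return float(-(p * np.log2(p)).sum())

entropy_sketch([5, 5], 10)  # 1.0 bit: a perfectly balanced binary node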
Example #24
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self._class_vals = y
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        # select dimensions using accuracy estimate if multivariate
        if self.n_dims > 1:
            self._dims, self._transformers = self._select_dims(X, y)

            words = [defaultdict(int) for _ in range(self.n_instances)]

            for i, dim in enumerate(self._dims):
                X_dim = X[:, dim, :].reshape(self.n_instances, 1,
                                             self.series_length)
                dim_words = self._transformers[i].transform(X_dim, y)
                dim_words = dim_words[0]

                for n in range(self.n_instances):
                    for word, count in dim_words[n].items():
                        words[n][word << self._highest_dim_bit | dim] = count

            self._transformed_data = words
        else:
            self._transformers.append(
                SFA(
                    word_length=self.word_length,
                    alphabet_size=self.alphabet_size,
                    window_size=self.window_size,
                    norm=self.norm,
                    levels=self.levels,
                    binning_method="information-gain"
                    if self.igb else "equi-depth",
                    bigrams=self.bigrams,
                    remove_repeat_words=True,
                    lower_bounding=False,
                    save_words=False,
                    use_fallback_dft=True,
                    n_jobs=self._n_jobs,
                ))
            sfa = self._transformers[0].fit_transform(X, y)
            self._transformed_data = sfa[0]
Example #25
    def fit(self, X, y):
        """
        Build a single or ensemble of pipelines containing the ROCKET transformer and
        RidgeClassifierCV classifier.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y)

        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        if self.ensemble:
            for i in range(self.ensemble_size):
                rocket_pipeline = make_pipeline(
                    Rocket(num_kernels=self.num_kernels,
                           random_state=self.random_state),
                    RidgeClassifierCV(alphas=np.logspace(-3, 3, 10),
                                      normalize=True),
                )
                rocket_pipeline.fit(X, y)
                self.classifiers.append(rocket_pipeline)
                self.weights.append(rocket_pipeline.steps[1][1].best_score_)
                self.weight_sum = self.weight_sum + self.weights[i]
        else:
            rocket_pipeline = make_pipeline(
                Rocket(num_kernels=self.num_kernels,
                       random_state=self.random_state),
                RidgeClassifierCV(alphas=np.logspace(-3, 3, 10),
                                  normalize=True),
            )
            rocket_pipeline.fit(X, y)
            self.classifiers.append(rocket_pipeline)

        self._is_fitted = True
        return self
Example #26
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        if self.base_estimator == "DTC":
            self._base_estimator = DecisionTreeClassifier(criterion="entropy")
        elif self.base_estimator == "CIT":
            self._base_estimator = ContinuousIntervalTree()
        elif isinstance(self.base_estimator, BaseEstimator):
            self._base_estimator = self.base_estimator
        else:
            raise ValueError("DrCIF invalid base estimator given.")

        if self.n_intervals is None:
            self._n_intervals = int(
                math.sqrt(self.series_length) * math.sqrt(self.n_dims)
            )
        else:
            self._n_intervals = self.n_intervals
        if self._n_intervals <= 0:
            self._n_intervals = 1

        self._att_subsample_size = self.att_subsample_size
        if self.att_subsample_size > 25:
            self._att_subsample_size = 25

        self._min_interval = self.min_interval
        if self.series_length < self.min_interval:
            self._min_interval = self.series_length
        elif self.min_interval < 3:
            self._min_interval = 3

        if self.max_interval is None:
            self._max_interval = self.series_length / 2
        else:
            self._max_interval = self.max_interval
        if self._max_interval < self._min_interval:
            self._max_interval = self._min_interval

        fit = Parallel(n_jobs=self._n_jobs)(
            delayed(self._fit_estimator)(
                X,
                y,
                i,
            )
            for i in range(self.n_estimators)
        )

        self.estimators_, self.intervals, self.dims, self.atts = zip(*fit)
Example #27
    def fit(self, X, y):

        if isinstance(X, pd.Series) or isinstance(X, pd.DataFrame):
            X, y = check_X_y(X, y, enforce_univariate=True)
            X = tabularize(X, return_array=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self
Example #28
    def fit(self, X, y):
        """Fit an estimator using transformed data from the Catch22 transformer.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_dims]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y)
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        self.n_classes = np.unique(y).shape[0]

        self._transformer = (TSFreshRelevantFeatureExtractor(
            default_fc_parameters=self.default_fc_parameters,
            n_jobs=self.n_jobs,
            chunksize=self.chunksize,
        ) if self.relevant_feature_extractor else TSFreshFeatureExtractor(
            default_fc_parameters=self.default_fc_parameters,
            n_jobs=self.n_jobs,
            chunksize=self.chunksize,
        ))
        self._estimator = _clone_estimator(
            RandomForestClassifier(n_estimators=200)
            if self.estimator is None else self.estimator,
            self.random_state,
        )

        if self.verbose < 2:
            self._transformer.show_warnings = False
            if self.verbose < 1:
                self._transformer.disable_progressbar = True

        m = getattr(self._estimator, "n_jobs", None)
        if callable(m):
            self._estimator.n_jobs = self.n_jobs

        X_t = self._transformer.fit_transform(X, y)
        self._estimator.fit(X_t, y)

        self._is_fitted = True
        return self
Example #29
    def fit(self, X, y):
        """
        Build an ensemble of pipelines containing the ROCKET transformer and
        RidgeClassifierCV classifier.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y)
        n_jobs = check_n_jobs(self.n_jobs)

        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        base_estimator = _make_estimator(self.num_kernels, self.random_state)
        self.estimators_ = Parallel(n_jobs=n_jobs)(
            delayed(_fit_estimator)(_clone_estimator(base_estimator,
                                                     self.random_state), X, y)
            for _ in range(self.n_estimators))

        self.weights = []
        self.weight_sum = 0
        for rocket_pipeline in self.estimators_:
            weight = rocket_pipeline.steps[1][1].best_score_
            self.weights.append(weight)
            self.weight_sum += weight

        self._is_fitted = True
        return self
Example #30
    def fit(self, X, y):
        """Fit a random catch22 feature forest classifier

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)
        n_instances = X.shape[0]
        X = np.reshape(X, (n_instances, -1))
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        c22_list = []
        for i in range(n_instances):
            series = X[i, :]
            c22_dict = catch22_all(series)
            c22_list.append(c22_dict["values"])

        self.classifier = RandomForestClassifier(
            n_jobs=self.n_jobs,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )

        X_c22 = np.array(c22_list)
        np.nan_to_num(X_c22, copy=False, nan=0, posinf=0, neginf=0)

        self.classifier.fit(X_c22, y)

        self._is_fitted = True
        return self
Example #31
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        if self.n_parameter_samples <= self.randomly_selected_params:
            print(  # noqa
                "TDE Warning: n_parameter_samples <= randomly_selected_params, ",
                "ensemble member parameters will be fully randomly selected.",
            )

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        self.estimators_ = []
        self.weights = []
        self._prev_parameters_x = []
        self._prev_parameters_y = []

        # Window length parameter space dependent on series length
        max_window_searches = self.series_length / 4
        max_window = int(self.series_length * self.max_win_len_prop)
        win_inc = int((max_window - self.min_window) / max_window_searches)
        if win_inc < 1:
            win_inc = 1
        if self.min_window > max_window + 1:
            raise ValueError(
                f"Error in TemporalDictionaryEnsemble, min_window = "
                f"{self.min_window} is bigger than max_window = {max_window}, "
                f"series length is {self.series_length}. Try setting "
                f"min_window to be smaller than the series length in the "
                f"constructor, but the classifier may not work at all "
                f"with very short series.")

        possible_parameters = self._unique_parameters(max_window, win_inc)
        num_classifiers = 0
        subsample_size = int(self.n_instances * 0.7)
        lowest_acc = 1
        lowest_acc_idx = 0

        time_limit = self.time_limit_in_minutes * 60
        start_time = time.time()
        train_time = 0
        if time_limit > 0:
            n_parameter_samples = 0
        else:
            n_parameter_samples = self.n_parameter_samples

        rng = check_random_state(self.random_state)

        if self.bigrams is None:
            if self.n_dims > 1:
                use_bigrams = False
            else:
                use_bigrams = True
        else:
            use_bigrams = self.bigrams

        # use time limit or n_parameter_samples if limit is 0
        while (train_time < time_limit or num_classifiers < n_parameter_samples
               ) and len(possible_parameters) > 0:
            if num_classifiers < self.randomly_selected_params:
                parameters = possible_parameters.pop(
                    rng.randint(0, len(possible_parameters)))
            else:
                scaler = preprocessing.StandardScaler()
                scaler.fit(self._prev_parameters_x)
                gp = KernelRidge(kernel="poly", degree=1)
                gp.fit(scaler.transform(self._prev_parameters_x),
                       self._prev_parameters_y)
                preds = gp.predict(scaler.transform(possible_parameters))
                parameters = possible_parameters.pop(
                    rng.choice(np.flatnonzero(preds == preds.max())))

            subsample = rng.choice(self.n_instances,
                                   size=subsample_size,
                                   replace=False)
            X_subsample = X[subsample]
            y_subsample = y[subsample]

            tde = IndividualTDE(
                *parameters,
                alphabet_size=self._alphabet_size,
                bigrams=use_bigrams,
                dim_threshold=self.dim_threshold,
                max_dims=self.max_dims,
                random_state=self.random_state,
            )
            tde.fit(X_subsample, y_subsample)
            tde._subsample = subsample

            if self.save_train_predictions:
                tde._train_predictions = np.zeros(subsample_size)

            tde._accuracy = self._individual_train_acc(
                tde,
                y_subsample,
                subsample_size,
                0 if num_classifiers < self.max_ensemble_size else lowest_acc,
            )
            if tde._accuracy > 0:
                weight = math.pow(tde._accuracy, 4)
            else:
                weight = 1e-9

            if num_classifiers < self.max_ensemble_size:
                if tde._accuracy < lowest_acc:
                    lowest_acc = tde._accuracy
                    lowest_acc_idx = num_classifiers
                self.weights.append(weight)
                self.estimators_.append(tde)
            elif tde._accuracy > lowest_acc:
                self.weights[lowest_acc_idx] = weight
                self.estimators_[lowest_acc_idx] = tde
                lowest_acc, lowest_acc_idx = self._worst_ensemble_acc()

            self._prev_parameters_x.append(parameters)
            self._prev_parameters_y.append(tde._accuracy)

            num_classifiers += 1
            train_time = time.time() - start_time

        self.n_estimators = len(self.estimators_)
        self._weight_sum = np.sum(self.weights)
Example #32
    def fit(self, X, y, sample_weight=None):
        """Fit the random classifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            Target values.

        sample_weight : array-like of shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        self : object
            Returns self.
        """
        if self.strategy not in ("most_frequent", "stratified", "uniform",
                                 "constant"):
            raise ValueError("Unknown strategy type.")

        if self.strategy == "uniform" and sp.issparse(y):
            y = y.toarray()
            warnings.warn('A local copy of the target data has been converted '
                          'to a numpy array. Predicting on sparse target data '
                          'with the uniform strategy would not save memory '
                          'and would be slower.',
                          UserWarning)

        self.sparse_output_ = sp.issparse(y)

        if not self.sparse_output_:
            y = np.atleast_1d(y)

        self.output_2d_ = y.ndim == 2
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if self.strategy == "constant":
            if self.constant is None:
                raise ValueError("Constant target value has to be specified "
                                 "when the constant strategy is used.")
            else:
                constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))
                if constant.shape[0] != self.n_outputs_:
                    raise ValueError("Constant target value should have "
                                     "shape (%d, 1)." % self.n_outputs_)

        (self.classes_,
         self.n_classes_,
         self.class_prior_) = class_distribution(y, sample_weight)

        if self.strategy == "constant":
            for k in range(self.n_outputs_):
                # Checking in case of constant strategy if the constant
                # provided by the user is in y.
                if constant[k] not in self.classes_[k]:
                    raise ValueError("The constant target value must be "
                                     "present in training data")

        if self.n_outputs_ == 1 and not self.output_2d_:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]
            self.class_prior_ = self.class_prior_[0]

        return self
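
A sketch of how sample weights change the returned priors (assuming scikit-learn's class_distribution, which sums the weights per class rather than counting rows):

import numpy as np
from sklearn.utils.multiclass import class_distribution

y = np.array([0, 0, 1]).reshape(-1, 1)
_, _, prior = class_distribution(y, sample_weight=[1.0, 1.0, 2.0])
# prior[0] == array([0.5, 0.5]): class 1's single sample counts double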