Пример #1
0
def test_summary_transformer_incorrect_quantile_raises_error(quantile_arg):
    """Check that an invalid `quantiles` argument raises a ValueError."""
    msg = """`quantiles` must be int, float or a list or tuple made up of
          int and float values that are between 0 and 1.
          """
    # Both construction and fit_transform stay inside the context manager so
    # the error is caught regardless of where validation happens.
    with pytest.raises(ValueError, match=msg):
        SummaryTransformer(
            summary_function="mean", quantiles=quantile_arg
        ).fit_transform(data_to_test[0])
Пример #2
0
def test_summary_transformer_incorrect_summary_function_raises_error(
        summary_arg):
    """Check that an invalid `summary_function` argument raises a ValueError."""
    msg = rf"""`summary_function` must be str or a list or tuple made up of
          {ALLOWED_SUM_FUNCS}.
          """
    # Both construction and fit_transform stay inside the context manager so
    # the error is caught regardless of where validation happens.
    with pytest.raises(ValueError, match=re.escape(msg)):
        SummaryTransformer(
            summary_function=summary_arg, quantiles=None
        ).fit_transform(data_to_test[0])
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests.
            If no special parameters are defined for a value, the `"default"`
            set is returned. For classifiers, a "default" set of parameters
            should be provided for general testing, and a "results_comparison"
            set for comparing against previously recorded results if the
            general set does not produce suitable probabilities to compare
            against.

        Returns
        -------
        params : dict or list of dict, default={}
            Parameters to create testing instances of the class. Each dict
            contains parameters to construct an "interesting" test instance,
            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a
            valid test instance. `create_test_instance` uses the first (or
            only) dictionary in `params`.
        """
        from sklearn.ensemble import RandomForestClassifier

        from sktime.transformations.series.summarize import SummaryTransformer

        if parameter_set == "results_comparison":
            # Heavier configuration used when reproducing recorded results.
            return {
                "n_intervals": 3,
                "estimator": RandomForestClassifier(n_estimators=10),
                "interval_transformers": SummaryTransformer(
                    summary_function=("mean", "std", "min", "max"),
                    quantiles=(0.25, 0.5, 0.75),
                ),
            }
        # Small, fast configuration for general testing.
        return {
            "n_intervals": 2,
            "estimator": RandomForestClassifier(n_estimators=2),
            "interval_transformers": SummaryTransformer(
                summary_function=("mean", "min", "max"),
            ),
        }
Пример #4
0
    def _fit(self, X, y):
        """Fit the summary-feature pipeline on cases (X, y).

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        # Summary-statistics feature extractor configured from constructor args.
        self._transformer = SummaryTransformer(
            summary_function=self.summary_functions,
            quantiles=self.summary_quantiles,
        )

        # Default to a 200-tree random forest when no estimator was supplied.
        base = (
            RandomForestClassifier(n_estimators=200)
            if self.estimator is None
            else self.estimator
        )
        self._estimator = _clone_estimator(base, self.random_state)

        # Propagate the thread count when the estimator exposes n_jobs.
        if getattr(self._estimator, "n_jobs", None) is not None:
            self._estimator.n_jobs = self._threads_to_use

        features = self._transformer.fit_transform(X, y)

        # Multivariate output arrives with more rows than instances; flatten
        # it into a single feature row per instance and remember the width.
        if features.shape[0] > len(y):
            features = features.to_numpy().reshape((len(y), -1))
            self._transform_atts = features.shape[1]

        self._estimator.fit(features, y)

        return self
Пример #5
0
    def fit(self, X, y=None):
        """Fit the random interval transform.

        Selects `n_intervals` random (dimension, interval) pairs and prepares
        one cloned series-to-primitives transformer per entry of
        `self.transformers` (defaulting to a single SummaryTransformer that
        computes mean/std/min/max plus quartiles).

        Parameters
        ----------
        X : pandas DataFrame or 3d numpy array, input time series
        y : array_like, target values (optional, ignored)

        Returns
        -------
        self : reference to self
        """
        X = check_X(X, coerce_to_numpy=True)

        _, n_dims, series_length = X.shape

        # Default transformer: common summary statistics plus quartiles.
        if self.transformers is None:
            self._transformers = [
                SummaryTransformer(
                    summary_function=("mean", "std", "min", "max"),
                    quantiles=(0.25, 0.5, 0.75),
                )
            ]

        # Allow a single transformer to be passed without wrapping in a list.
        if not isinstance(self._transformers, list):
            self._transformers = [self._transformers]

        # Clone each transformer (seeded with random_state) and propagate the
        # thread count to any clone that exposes an n_jobs attribute.
        li = []
        for i in range(len(self._transformers)):
            li.append(
                _clone_estimator(
                    self._transformers[i],
                    self.random_state,
                ))

            m = getattr(li[i], "n_jobs", None)
            if m is not None:
                li[i].n_jobs = self.n_jobs
        self._transformers = li

        rng = check_random_state(self.random_state)
        # For each interval, pick the dimension it is drawn from (with
        # replacement) and its start/end positions.
        self._dims = rng.choice(n_dims, self.n_intervals, replace=True)
        self._intervals = np.zeros((self.n_intervals, 2), dtype=int)

        for i in range(0, self.n_intervals):
            # Randomly anchor either the start or the end of the interval
            # (presumably to avoid positional bias — TODO confirm intent);
            # intervals are always at least 3 points long. Note the length
            # ternaries only draw a second random number when there is room
            # for an interval longer than 3.
            if rng.random() < 0.5:
                self._intervals[i][0] = rng.randint(0, series_length - 3)
                length = (
                    rng.randint(0, series_length - self._intervals[i][0] - 3) +
                    3 if series_length - self._intervals[i][0] - 3 > 0 else 3)
                self._intervals[i][1] = self._intervals[i][0] + length
            else:
                self._intervals[i][1] = rng.randint(0, series_length - 3) + 3
                length = (rng.randint(0, self._intervals[i][1] - 3) +
                          3 if self._intervals[i][1] - 3 > 0 else 3)
                self._intervals[i][0] = self._intervals[i][1] - length

        self._is_fitted = True
        return self
Пример #6
0
def test_summary_transformer_output_type(y, summary_arg, quantile_arg):
    """Check that transform output is a DataFrame of the expected shape."""
    transformer = SummaryTransformer(
        summary_function=summary_arg, quantiles=quantile_arg
    )
    transformer.fit(y)
    yt = transformer.transform(y)

    # One output row per input series; a pd.Series is a single series.
    n_instances = 1 if isinstance(y, pd.Series) else y.shape[1]
    # One column per summary function plus one per requested quantile.
    n_summary = 1 if isinstance(summary_arg, str) else len(summary_arg)
    if quantile_arg is None:
        n_quantile = 0
    elif isinstance(quantile_arg, (int, float)):
        n_quantile = 1
    else:
        n_quantile = len(quantile_arg)

    assert isinstance(yt, pd.DataFrame)
    assert yt.shape == (n_instances, n_summary + n_quantile)
Пример #7
0
def test_random_interval_classifier_on_basic_motions():
    """Regression test of RandomIntervalClassifier on the basic motions data."""
    # Load the train/test splits of the basic motions dataset.
    X_train, y_train = load_basic_motions(split="train")
    X_test, _ = load_basic_motions(split="test")
    # Fixed seed so the same 10 cases are selected on every run.
    subset = np.random.RandomState(4).choice(len(y_train), 10, replace=False)

    # Train a seeded random interval classifier on the subset.
    classifier = RandomIntervalClassifier(
        random_state=0,
        n_intervals=5,
        interval_transformers=SummaryTransformer(
            summary_function=("mean", "std", "min", "max"),
            quantiles=(0.25, 0.5, 0.75),
        ),
        estimator=RandomForestClassifier(n_estimators=10),
    )
    classifier.fit(X_train.iloc[subset], y_train[subset])

    # Probabilities must match the previously recorded values.
    probas = classifier.predict_proba(X_test.iloc[subset])
    testing.assert_array_almost_equal(
        probas, random_interval_classifier_basic_motions_probas, decimal=2
    )
Пример #8
0
             n_estimators=10,
         )
     ),
 )
 _print_array(
     "MatrixProfileClassifier - UnitTest",
     _reproduce_classification_unit_test(MatrixProfileClassifier(random_state=0)),
 )
 _print_array(
     "RandomIntervalClassifier - UnitTest",
     _reproduce_classification_unit_test(
         RandomIntervalClassifier(
             random_state=0,
             n_intervals=5,
             interval_transformers=SummaryTransformer(
                 summary_function=("mean", "std", "min", "max"),
                 quantiles=(0.25, 0.5, 0.75),
             ),
             estimator=RandomForestClassifier(n_estimators=10),
         )
     ),
 )
 _print_array(
     "RandomIntervalClassifier - BasicMotions",
     _reproduce_classification_basic_motions(
         RandomIntervalClassifier(
             random_state=0,
             n_intervals=5,
             interval_transformers=SummaryTransformer(
                 summary_function=("mean", "std", "min", "max"),
                 quantiles=(0.25, 0.5, 0.75),
             ),
Пример #9
0
     "default_fc_parameters": "minimal",
 },
 FreshPRINCE: {
     "n_estimators": 3,
     "default_fc_parameters": "minimal",
 },
 RandomIntervals: {
     "n_intervals": 3,
 },
 RandomIntervalClassifier: {
     "n_intervals":
     3,
     "estimator":
     RandomForestClassifier(n_estimators=3),
     "interval_transformers":
     SummaryTransformer(summary_function=("mean", "min", "max"), ),
 },
 SummaryClassifier: {
     "estimator": RandomForestClassifier(n_estimators=3),
     "summary_functions": ("mean", "min", "max"),
 },
 RocketClassifier: {
     "num_kernels": 100
 },
 Arsenal: {
     "num_kernels": 50,
     "n_estimators": 3
 },
 HIVECOTEV1: {
     "stc_params": {
         "estimator": RotationForest(n_estimators=2),
Пример #10
0
def set_classifier(cls, resample_id=None, train_file=False):
    """Construct a classifier, possibly seeded.

    Basic way of creating the classifier to build using the default settings. This
    set up is to help with batch jobs for multiple problems to facilitate easy
    reproducibility for use with load_and_run_classification_experiment. You can pass a
    classifier object instead to run_classification_experiment.

    Parameters
    ----------
    cls : str
        String indicating which classifier you want.
    resample_id : int or None, default=None
        Classifier random seed.
    train_file : bool, default=False
        Whether a train file is being produced.

    Returns
    -------
    classifier : A BaseClassifier.
        The classifier matching the input classifier name.

    Raises
    ------
    ValueError
        If `cls` does not match any known classifier name.
    """
    name = cls.lower()
    # Dictionary based
    if name == "boss" or name == "bossensemble":
        return BOSSEnsemble(random_state=resample_id)
    elif name == "cboss" or name == "contractableboss":
        return ContractableBOSS(random_state=resample_id)
    elif name == "tde" or name == "temporaldictionaryensemble":
        return TemporalDictionaryEnsemble(
            random_state=resample_id, save_train_predictions=train_file
        )
    elif name == "weasel":
        return WEASEL(random_state=resample_id)
    elif name == "muse":
        return MUSE(random_state=resample_id)
    # Distance based
    elif name == "pf" or name == "proximityforest":
        return ProximityForest(random_state=resample_id)
    elif name == "pt" or name == "proximitytree":
        return ProximityTree(random_state=resample_id)
    # BUG FIX: this previously compared against "proximityStump", which can
    # never match since `name` has already been lowercased.
    elif name == "ps" or name == "proximitystump":
        return ProximityStump(random_state=resample_id)
    elif name == "dtwcv" or name == "kneighborstimeseriesclassifier":
        return KNeighborsTimeSeriesClassifier(distance="dtwcv")
    elif name == "dtw" or name == "1nn-dtw":
        return KNeighborsTimeSeriesClassifier(distance="dtw")
    elif name == "msm" or name == "1nn-msm":
        return KNeighborsTimeSeriesClassifier(distance="msm")
    elif name == "ee" or name == "elasticensemble":
        return ElasticEnsemble(random_state=resample_id)
    elif name == "shapedtw":
        return ShapeDTW()
    # Feature based
    elif name == "summary":
        return SummaryClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "summary-intervals":
        return RandomIntervalClassifier(
            random_state=resample_id,
            interval_transformers=SummaryTransformer(
                summary_function=("mean", "std", "min", "max"),
                quantiles=(0.25, 0.5, 0.75),
            ),
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "summary-catch22":
        return RandomIntervalClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "catch22":
        return Catch22Classifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "matrixprofile":
        return MatrixProfileClassifier(random_state=resample_id)
    elif name == "signature":
        return SignatureClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "tsfresh":
        return TSFreshClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "tsfresh-r":
        return TSFreshClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
            relevant_feature_extractor=True,
        )
    elif name == "freshprince":
        return FreshPRINCE(random_state=resample_id, save_transformed_data=train_file)
    # Hybrid
    elif name == "hc1" or name == "hivecotev1":
        return HIVECOTEV1(random_state=resample_id)
    elif name == "hc2" or name == "hivecotev2":
        return HIVECOTEV2(random_state=resample_id)
    # Interval based
    elif name == "rise" or name == "randomintervalspectralforest":
        return RandomIntervalSpectralForest(random_state=resample_id, n_estimators=500)
    elif name == "tsf" or name == "timeseriesforestclassifier":
        return TimeSeriesForestClassifier(random_state=resample_id, n_estimators=500)
    elif name == "cif" or name == "canonicalintervalforest":
        return CanonicalIntervalForest(random_state=resample_id, n_estimators=500)
    elif name == "stsf" or name == "supervisedtimeseriesforest":
        return SupervisedTimeSeriesForest(random_state=resample_id, n_estimators=500)
    elif name == "drcif":
        return DrCIF(
            random_state=resample_id,
            n_estimators=500,
            save_transformed_data=train_file,
        )
    # Kernel based
    elif name == "rocket":
        return RocketClassifier(random_state=resample_id)
    elif name == "mini-rocket":
        return RocketClassifier(random_state=resample_id, rocket_transform="minirocket")
    elif name == "multi-rocket":
        return RocketClassifier(random_state=resample_id, rocket_transform="multirocket")
    elif name == "arsenal":
        return Arsenal(random_state=resample_id, save_transformed_data=train_file)
    elif name == "mini-arsenal":
        return Arsenal(
            random_state=resample_id,
            save_transformed_data=train_file,
            rocket_transform="minirocket",
        )
    elif name == "multi-arsenal":
        return Arsenal(
            random_state=resample_id,
            save_transformed_data=train_file,
            rocket_transform="multirocket",
        )
    # Shapelet based
    elif name == "stc" or name == "shapelettransformclassifier":
        return ShapeletTransformClassifier(
            transform_limit_in_minutes=120,
            random_state=resample_id,
            save_transformed_data=train_file,
        )
    else:
        # ValueError is more specific than the bare Exception previously
        # raised; callers catching Exception still work.
        raise ValueError("UNKNOWN CLASSIFIER")
Пример #11
0
class SummaryClassifier(BaseClassifier):
    """Summary statistic classifier.

    Extracts summary statistics from each input series with a
    SummaryTransformer, then fits the supplied (or default) estimator on the
    resulting feature matrix.

    Parameters
    ----------
    summary_functions : str, list, tuple, default=("mean", "std", "min", "max")
        Either a string, or list or tuple of strings indicating the pandas
        summary functions that are used to summarize each column of the dataset.
        Must be one of ("mean", "min", "max", "median", "sum", "skew", "kurt",
        "var", "std", "mad", "sem", "nunique", "count").
    summary_quantiles : str, list, tuple or None, default=(0.25, 0.5, 0.75)
        Optional list of series quantiles to calculate. If None, no quantiles
        are calculated.
    estimator : sklearn classifier, default=None
        An sklearn estimator to be built using the transformed data. Defaults to a
        Random Forest with 200 trees.
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    random_state : int or None, default=None
        Seed for random, integer.

    Attributes
    ----------
    n_classes_ : int
        Number of classes. Extracted from the data.
    classes_ : ndarray of shape (n_classes)
        Holds the label for each class.

    See Also
    --------
    SummaryTransformer

    Examples
    --------
    >>> from sktime.classification.feature_based import SummaryClassifier
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sktime.datasets import load_unit_test
    >>> X_train, y_train = load_unit_test(split="train", return_X_y=True)
    >>> X_test, y_test = load_unit_test(split="test", return_X_y=True)
    >>> clf = SummaryClassifier(estimator=RandomForestClassifier(n_estimators=10))
    >>> clf.fit(X_train, y_train)
    SummaryClassifier(...)
    >>> y_pred = clf.predict(X_test)
    """

    _tags = {
        "capability:multivariate": True,
        "capability:multithreading": True,
    }

    def __init__(
        self,
        summary_functions=("mean", "std", "min", "max"),
        summary_quantiles=(0.25, 0.5, 0.75),
        estimator=None,
        n_jobs=1,
        random_state=None,
    ):
        self.summary_functions = summary_functions
        self.summary_quantiles = summary_quantiles
        self.estimator = estimator

        self.n_jobs = n_jobs
        self.random_state = random_state

        # Fitted components; populated by _fit.
        self._transformer = None
        self._estimator = None
        self._transform_atts = 0

        super().__init__()

    def _fit(self, X, y):
        """Fit a pipeline on cases (X,y), where y is the target variable.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self._transformer = SummaryTransformer(
            summary_function=self.summary_functions,
            quantiles=self.summary_quantiles,
        )

        # Default to a 200-tree random forest when no estimator is given.
        base_estimator = self.estimator
        if base_estimator is None:
            base_estimator = RandomForestClassifier(n_estimators=200)
        self._estimator = _clone_estimator(base_estimator, self.random_state)

        # Propagate the thread count if the estimator exposes n_jobs.
        if getattr(self._estimator, "n_jobs", None) is not None:
            self._estimator.n_jobs = self._threads_to_use

        features = self._transformer.fit_transform(X, y)

        # Multivariate output has more rows than instances; flatten to one
        # feature row per training case and remember the flattened width.
        if features.shape[0] > len(y):
            features = features.to_numpy().reshape((len(y), -1))
            self._transform_atts = features.shape[1]

        self._estimator.fit(features, y)

        return self

    def _predict(self, X):
        """Predict class values of n instances in X.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predictions for.

        Returns
        -------
        y : array-like, shape = [n_instances]
            Predicted class labels.
        """
        features = self._transformer.transform(X)

        # Re-apply the multivariate flattening used during fit.
        if features.shape[1] < self._transform_atts:
            features = features.to_numpy().reshape((-1, self._transform_atts))

        return self._estimator.predict(features)

    def _predict_proba(self, X):
        """Predict class probabilities for n instances in X.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predict probabilities for.

        Returns
        -------
        y : array-like, shape = [n_instances, n_classes_]
            Predicted probabilities using the ordering in classes_.
        """
        features = self._transformer.transform(X)

        # Re-apply the multivariate flattening used during fit.
        if features.shape[1] < self._transform_atts:
            features = features.to_numpy().reshape((-1, self._transform_atts))

        proba_method = getattr(self._estimator, "predict_proba", None)
        if callable(proba_method):
            return proba_method(features)

        # Estimator cannot produce probabilities: build one-hot distributions
        # from its hard predictions instead.
        dists = np.zeros((X.shape[0], self.n_classes_))
        for i, pred in enumerate(self._estimator.predict(features)):
            dists[i, self._class_dictionary[pred]] = 1
        return dists
Пример #12
0
 },
 MatrixProfileClassifier: {
     "subsequence_length": 4,
 },
 TSFreshClassifier: {
     "estimator": RandomForestClassifier(n_estimators=3),
     "default_fc_parameters": "minimal",
 },
 RandomIntervals: {
     "n_intervals": 3,
 },
 RandomIntervalClassifier: {
     "n_intervals": 3,
     "estimator": RandomForestClassifier(n_estimators=3),
     "interval_transformers": SummaryTransformer(
         summary_function=("mean", "min", "max"),
     ),
 },
 SummaryClassifier: {
     "estimator": RandomForestClassifier(n_estimators=3),
     "summary_functions": ("mean", "min", "max"),
 },
 RocketClassifier: {"num_kernels": 100},
 Arsenal: {"num_kernels": 50, "n_estimators": 3},
 HIVECOTEV1: {
     "stc_params": {
         "estimator": RotationForest(n_estimators=2),
         "max_shapelets": 5,
         "n_shapelet_samples": 20,
         "batch_size": 10,
     },