示例#1
0
    def test_fit_transform_output(self, estimator_instance, scenario):
        """Check that fit followed by transform returns the expected scitype.

        Steps, in order:

        1. run ``fit`` and ``transform`` on the estimator via the scenario
        2. confirm that the scenario's ``X_scitype`` tag matches the actual ``X``
        3. derive the expected output scitype from the input scitype and the
           estimator's ``scitype:transform-input``/``-output`` tags
        4. confirm that the transform output has that scitype
        5. for selected tag/scitype combinations, confirm that the number of
           instances or panels is the same before and after transform

        Parameters
        ----------
        estimator_instance : estimator under test, must provide ``get_tag``
        scenario : test scenario, supplies ``args``, ``get_tag`` and ``run``
        """
        X = scenario.args["transform"]["X"]
        Xt = scenario.run(
            estimator_instance, method_sequence=["fit", "transform"]
        )

        X_scitype = scenario.get_tag("X_scitype")
        trafo_input = estimator_instance.get_tag("scitype:transform-input")
        trafo_output = estimator_instance.get_tag("scitype:transform-output")

        # validate the scenario's own X_scitype tag, and collect metadata on X
        X_is_valid, _, X_metadata = check_is_scitype(
            X, scitype=X_scitype, return_metadata=True
        )
        tag_error = (
            f"error with scenario {type(scenario).__name__}, X_scitype tag "
            f'was "{X_scitype}", but check_is_scitype does not confirm this'
        )
        assert X_is_valid, tag_error

        Xt_scitype = self._expected_trafo_output_scitype(
            X_scitype, trafo_input, trafo_output
        )

        # todo 0.11.0 or 0.12.0:
        #   remove this once #2219 is merged, which adds Hierarchical support
        #   until then, skip tests if expected scitype is Hierarchical
        if Xt_scitype == "Hierarchical":
            return None

        Xt_is_valid, _, Xt_metadata = check_is_scitype(
            Xt, scitype=Xt_scitype, return_metadata=True
        )
        estimator_name = type(estimator_instance).__name__
        output_error = (
            f"{estimator_name}.transform should return an object of "
            f"scitype {Xt_scitype} when given an input of scitype {X_scitype},"
            f" but found the following return: {Xt}"
        )
        assert Xt_is_valid, output_error

        # from here on, Xt is known to be of scitype Xt_scitype

        # skip the "number of instances" test below for Aggregator, Reconciler
        #   reason: this adds "pseudo-instances" for the __total and increases the count
        #   todo: we probably want to mirror this into a "hierarchical" tag later on
        if estimator_name in ["Aggregator", "Reconciler"]:
            return None

        # if we vectorize, number of instances before/after transform should be same
        tag_pair = (trafo_input, trafo_output)
        data_pair = (X_scitype, Xt_scitype)
        same_count_pairs = [("Panel", "Panel"), ("Hierarchical", "Hierarchical")]

        if tag_pair == ("Series", "Series"):
            if data_pair == ("Series", "Series"):
                # row counts only need to agree if the time index is preserved
                if estimator_instance.get_tag("transform-returns-same-time-index"):
                    assert X.shape[0] == Xt.shape[0]
            elif data_pair in same_count_pairs:
                assert X_metadata["n_instances"] == Xt_metadata["n_instances"]
        elif tag_pair == ("Panel", "Panel"):
            if data_pair == ("Hierarchical", "Hierarchical"):
                assert X_metadata["n_panels"] == Xt_metadata["n_panels"]
    def test_classifier_output(self, estimator_instance, scenario):
        """Check type, shape and values of classifier predictions.

        ``predict`` must return a 1D ``np.ndarray`` with one entry per instance in
        the predict input, containing only labels present in the training labels.
        ``predict_proba`` must return a 2D ``np.ndarray`` of shape
        (number of instances, number of classes) whose rows sum to one.

        Parameters
        ----------
        estimator_instance : classifier under test
        scenario : test scenario, supplies ``args``, ``get_tag`` and ``run``
        """
        n_classes = scenario.get_tag("n_classes")
        X_new = scenario.args["predict"]["X"]
        y_train = scenario.args["fit"]["y"]

        # the instance count is taken from the scitype check metadata, which is
        #   more robust against different data containers than inspecting X_new
        panel_check = check_is_scitype(X_new, "Panel", return_metadata=True)
        n_instances = panel_check[2]["n_instances"]

        # fit, then predict class labels
        y_pred = scenario.run(
            estimator_instance, method_sequence=["fit", "predict"]
        )

        assert isinstance(y_pred, np.ndarray)
        assert y_pred.shape == (n_instances,)
        # every predicted label must have been seen in training
        predicted_labels = np.unique(y_pred)
        training_labels = np.unique(y_train)
        assert np.all(np.isin(predicted_labels, training_labels))

        # predict probabilities (all classifiers have predict_proba by default)
        y_proba = scenario.run(
            estimator_instance, method_sequence=["predict_proba"]
        )

        assert isinstance(y_proba, np.ndarray)
        assert y_proba.shape == (n_instances, n_classes)
        row_totals = y_proba.sum(axis=1)
        np.testing.assert_allclose(row_totals, 1)
示例#3
0
    def _pairwise_table_x_check(self, X, var_name="X"):
        """Validate Table input and convert it to the internal format.

        The input is checked against the Table scitype and, if valid, converted
        to the mtype given by the ``X_inner_mtype`` tag.

        Parameters
        ----------
        X: pd.DataFrame, pd.Series, numpy 1D or 2D, list of dicts
            sktime data container compliant with the Table scitype
            The value to be checked and coerced
        var_name: str, variable name to print in error messages

        Returns
        -------
        X: Table data container, converted to the mtype in X_inner_mtype

        Raises
        ------
        TypeError
            if X does not pass the Table scitype check
        """
        is_table = check_is_scitype(
            X, "Table", return_metadata=False, var_name=var_name
        )

        if is_table:
            inner_mtype = self.get_tag("X_inner_mtype")
            return convert_to(X, to_type=inner_mtype, as_scitype="Table")

        raise TypeError(
            "X and X2 must be in an sktime compatible format, of scitype Table, "
            "for instance a pandas.DataFrame or a 2D numpy.ndarray. "
            "See the data format tutorial examples/AA_datatypes_and_datasets.ipynb"
        )
示例#4
0
def test_series_in_series_out_supported_fit_in_transform():
    """Check that fit/transform on a Series returns a Series.

    Setting: transformer has tags
        "scitype:transform-input" = "Series"
        "scitype:transform-output" = "Series"
        "fit_is_empty" = True
        "X_inner_mtype" supports "Series"

    X input to fit/transform has Series scitype
    X output from fit/transform should be Series
    """
    # ExponentTransformer serves as the example supporting Series internally
    est = ExponentTransformer.create_test_instance()

    # sanity checks on the example itself - if any of these fail, the example
    #   no longer fits the setting (e.g., because more scitypes got implemented)
    #   and a different example should be chosen; it is not a failure of the class
    assert "Series" in inner_X_scitypes(est)
    assert est.get_class_tag("fit_is_empty")
    for tag_name in ("scitype:transform-input", "scitype:transform-output"):
        assert est.get_class_tag(tag_name) == "Series"

    # run fit, then transform, on a univariate series scenario
    scenario = TransformerFitTransformSeriesUnivariate()
    Xt = scenario.run(est, method_sequence=["fit", "transform"])

    is_series = check_is_scitype(Xt, scitype="Series", return_metadata=True)[0]
    assert is_series, "fit.transform does not return a Series when given a Series"
示例#5
0
def test_panel_in_primitives_out_supported_with_y_in_fit_but_not_transform():
    """Test that fit/transform runs and returns the correct output type.

    Setting: transformer has tags
        "scitype:transform-input" = "Series"
        "scitype:transform-output" = "Primitives"
        "fit_is_empty" = False
        "requires_y" = True
        "X_inner_mtype" supports "Panel"

    X input to fit/transform has Panel scitype
    X output from fit/transform should be Table
    """
    # one example for a transformer which supports Panel internally
    cls = TSFreshRelevantFeatureExtractor
    est = cls.create_test_instance()
    # ensure cls is a good example, if this fails, choose another example
    #   (if this changes, it may be due to implementing more scitypes)
    #   (then this is not a failure of cls, but we need to choose another example)
    assert "Panel" in inner_X_scitypes(est)
    assert not est.get_tag("fit_is_empty")
    assert est.get_tag("requires_y")
    assert est.get_tag("scitype:transform-input") == "Series"
    assert est.get_tag("scitype:transform-output") == "Primitives"

    # scenario in which a panel is passed to fit/transform, and y only to fit
    scenario = TransformerFitTransformPanelUnivariateWithClassYOnlyFit()
    Xt = scenario.run(est, method_sequence=["fit", "transform"])

    valid, _, _ = check_is_scitype(Xt, scitype="Table", return_metadata=True)
    # the input in this test is a Panel, the expected output is a Table
    assert valid, "fit.transform does not return a Table when given a Panel"
    # todo: possibly, add mtype check, use metadata return
    # length of Xt should be seven = number of samples in the scenario
    assert len(Xt) == 7
示例#6
0
文件: base.py 项目: ksachdeva/sktime
def _check_classifier_input(
    X,
    y=None,
    enforce_min_instances=1,
):
    """Check whether input X and y are valid formats with minimum data.

    Parameters
    ----------
    X : check whether conformant with any sktime Panel mtype specification
    y : check whether a pd.Series or np.array
    enforce_min_instances : int, optional (default=1)
        check there are a minimum number of instances.

    Returns
    -------
    metadata : dict with metadata for X returned by datatypes.check_is_scitype

    Raises
    ------
    TypeError
        If X does not pass the Panel scitype check
    ValueError
        If X has fewer than enforce_min_instances instances, if y is not a
        pd.Series or np.ndarray, if y is a np.ndarray that is not 1-dimensional,
        or if the number of labels in y differs from the number of cases in X
    """
    # Check X is valid input type and recover the data characteristics
    X_valid, _, X_metadata = check_is_scitype(
        X, scitype="Panel", return_metadata=True
    )
    if not X_valid:
        # the sentences are separated explicitly, adjacent literals are
        #   concatenated without any whitespace in between
        raise TypeError(
            "X is not of a supported input data type. "
            f"X must be in a supported mtype format for Panel, found {type(X)}. "
            "Use datatypes.check_is_mtype to check conformance with specifications."
        )
    n_cases = X_metadata["n_instances"]
    if n_cases < enforce_min_instances:
        raise ValueError(
            f"Minimum number of cases required is {enforce_min_instances} but X "
            f"has : {n_cases}"
        )

    # Check y if passed
    if y is not None:
        # Check y valid input
        if not isinstance(y, (pd.Series, np.ndarray)):
            raise ValueError(
                f"y must be a np.array or a pd.Series, but found type: {type(y)}"
            )
        # Check dimensionality before reading shape[0], a 0-dimensional array
        #   has an empty shape and would fail with an IndexError otherwise
        if isinstance(y, np.ndarray) and y.ndim != 1:
            raise ValueError(
                f"y must be 1-dimensional but is in fact {y.ndim} dimensional"
            )
        # Check matching number of labels
        n_labels = y.shape[0]
        if n_cases != n_labels:
            raise ValueError(
                f"Mismatch in number of cases. Number in X = {n_cases} nos in y = "
                f"{n_labels}"
            )
    return X_metadata
示例#7
0
    def _check_clusterer_input(
        self, X: TimeSeriesInstances, enforce_min_instances: int = 1
    ) -> TimeSeriesInstances:
        """Validate the input and prepare for _fit.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances,n_dimensions,series_length)) or nested pd.DataFrame (
            n_instances,n_dimensions).
            Training time series instances to cluster.
        enforce_min_instances : int, optional (default=1)
            Minimum number of instances that X must contain.

        Returns
        -------
        X : X converted to the mtype given by the "X_inner_mtype" tag,
            as Panel scitype, ready for _fit.

        Raises
        ------
        TypeError
            If X, after the initial conversion, does not pass the Panel scitype
            check.
        ValueError
            If X has fewer than enforce_min_instances instances.
        """
        X = self._initial_conversion(X)

        X_valid, _, X_metadata = check_is_scitype(
            X, scitype="Panel", return_metadata=True
        )
        if not X_valid:
            # the sentences are separated explicitly, adjacent literals are
            #   concatenated without any whitespace in between
            raise TypeError(
                "X is not of a supported input data type. "
                f"X must be of type np.ndarray or pd.DataFrame, found {type(X)}. "
                "Use datatypes.check_is_mtype to check conformance with "
                "specifications."
            )
        n_cases = X_metadata["n_instances"]
        if n_cases < enforce_min_instances:
            raise ValueError(
                f"Minimum number of cases required is {enforce_min_instances} but X "
                f"has : {n_cases}"
            )

        # data characteristics from the metadata are handed to the capability check
        missing = X_metadata["has_nans"]
        multivariate = not X_metadata["is_univariate"]
        unequal = not X_metadata["is_equal_length"]
        self._check_capabilities(missing, multivariate, unequal)

        return convert_to(
            X,
            to_type=self.get_tag("X_inner_mtype"),
            as_scitype="Panel",
        )
示例#8
0
    def _pairwise_panel_x_check(self, X, var_name="X"):
        """Check and coerce input data.

        Method used to check the input and convert Series/Panel input
            to internally used format, as defined in X_inner_mtype tag

        Parameters
        ----------
        X: List of dfs, Numpy of dfs, 3d numpy
            sktime data container compliant with the Series or Panel scitype
            The value to be checked
        var_name: str, variable name to print in error messages

        Returns
        -------
        X: Panel data container of a supported format in X_inner_mtype
            usually df-list, list of pd.DataFrame, unless overridden

        Raises
        ------
        TypeError
            if X is neither of Series nor of Panel scitype
        RuntimeError
            if the scitype check passes but reports a scitype other than
            Series or Panel
        """
        X_valid, _, metadata = check_is_scitype(
            X, ["Series", "Panel"], return_metadata=True, var_name=var_name
        )

        # validity is checked before the metadata is read, so that invalid input
        #   always results in the descriptive error below
        # NOTE(review): metadata is presumably not a populated dict when the check
        #   fails, in which case reading it first would raise instead - confirm
        if not X_valid:
            msg = (
                "X and X2 must be in an sktime compatible format, "
                "of scitype Series or Panel, "
                "for instance a pandas.DataFrame with sktime compatible time indices, "
                "or with MultiIndex and lowest level a sktime compatible time index. "
                "See the data format tutorial examples/AA_datatypes_and_datasets.ipynb"
            )
            raise TypeError(msg)

        X_scitype = metadata["scitype"]

        # if the input is a single series, convert it to a Panel
        if X_scitype == "Series":
            X = convert_Series_to_Panel(X)

        # can't be anything else if check_is_scitype is working properly
        elif X_scitype != "Panel":
            raise RuntimeError(
                "Unexpected error in check_is_scitype, check validity")

        X_inner_mtype = self.get_tag("X_inner_mtype")
        X_coerced = convert_to(X, to_type=X_inner_mtype, as_scitype="Panel")

        return X_coerced
示例#9
0
    def _pairwise_panel_x_check(self, X, var_name="X"):
        """Check and coerce input data.

        Method used to check the input and convert Series/Panel input
            to internally used format, as defined in X_inner_mtype tag

        Parameters
        ----------
        X: List of dfs, Numpy of dfs, 3d numpy
            The value to be checked
        var_name: str, variable name to print in error messages

        Returns
        -------
        X: Panel data container of a supported format in X_inner_mtype
            usually df-list, list of pd.DataFrame, unless overridden

        Raises
        ------
        TypeError
            if X is neither of Series nor of Panel scitype
        RuntimeError
            if the scitype check passes but reports a scitype other than
            Series or Panel
        """
        X_valid, _, metadata = check_is_scitype(
            X, ["Series", "Panel"], return_metadata=True, var_name=var_name
        )

        # validity is checked before the metadata is read, so that invalid input
        #   always results in the descriptive error below
        # NOTE(review): metadata is presumably not a populated dict when the check
        #   fails, in which case reading it first would raise instead - confirm
        if not X_valid:
            raise TypeError("X/X2 must be of Series or Panel scitype")

        X_scitype = metadata["scitype"]

        # if the input is a single series, convert it to a Panel
        if X_scitype == "Series":
            X = convert_Series_to_Panel(X)

        # can't be anything else if check_is_scitype is working properly
        elif X_scitype != "Panel":
            raise RuntimeError(
                "Unexpected error in check_is_scitype, check validity")

        X_inner_mtype = self.get_tag("X_inner_mtype")
        X_coerced = convert_to(X, to_type=X_inner_mtype, as_scitype="Panel")

        return X_coerced
示例#10
0
def test_hierarchical_in_hierarchical_out_not_supported_but_series_fit_in_transform(
):
    """Test that fit/transform runs and returns the correct output type.

    Setting: transformer has tags
        "scitype:transform-input" = "Series"
        "scitype:transform-output" = "Series"
        "fit_is_empty" = True
        "X_inner_mtype" supports "Series" but not "Panel" and not "Hierarchical"

    X input to fit/transform has Hierarchical scitype
    X output from fit/transform should be Hierarchical
    """
    # one example for a transformer which supports Series internally
    cls = ExponentTransformer
    est = cls.create_test_instance()
    # ensure cls is a good example, if this fails, choose another example
    #   (if this changes, it may be due to implementing more scitypes)
    #   (then this is not a failure of cls, but we need to choose another example)
    assert "Series" in inner_X_scitypes(est)
    assert "Panel" not in inner_X_scitypes(est)
    assert "Hierarchical" not in inner_X_scitypes(est)
    assert est.get_tag("fit_is_empty")
    assert est.get_tag("scitype:transform-input") == "Series"
    assert est.get_tag("scitype:transform-output") == "Series"

    # scenario in which hierarchical data is passed to fit/transform
    scenario = TransformerFitTransformHierarchicalUnivariate()
    Xt = scenario.run(est, method_sequence=["fit", "transform"])

    valid, _, _ = check_is_scitype(Xt,
                                   scitype="Hierarchical",
                                   return_metadata=True)
    # the message names the scitype that is actually checked here
    assert valid, (
        "fit.transform does not return a Hierarchical when given a Hierarchical"
    )
    # todo: possibly, add mtype check, use metadata return
    # length of Xt should be number of hierarchy levels times number of time points
    assert len(Xt) == 2 * 4 * 12
示例#11
0
    def _check_ys(self, y_true, y_pred, multioutput):
        """Validate y_pred as Proba scitype and convert it to the inner mtype.

        Parameters
        ----------
        y_true : ground truth values, passed on to ``_check_consistent_input``
        y_pred : predictions, must pass ``check_is_scitype`` for scitype "Proba"
        multioutput : multioutput setting, ``self.multioutput`` is used if None

        Returns
        -------
        y_true : as returned by ``_check_consistent_input``
        y_pred_inner : ``y_pred`` converted to the mtype in tag "scitype:y_pred"
        multioutput : as returned by ``_check_consistent_input``

        Raises
        ------
        TypeError
            if ``y_pred`` does not pass the "Proba" scitype check
        """
        multioutput = self.multioutput if multioutput is None else multioutput

        is_proba, error_msg, pred_metadata = check_is_scitype(
            y_pred, scitype="Proba", return_metadata=True, var_name="y_pred"
        )
        if not is_proba:
            raise TypeError(error_msg)

        # conversion happens before the consistency check below;
        #   the converted predictions are what is handed back to the caller
        y_pred_inner = convert(
            y_pred,
            from_type=pred_metadata["mtype"],
            to_type=self.get_tag("scitype:y_pred"),
            as_scitype="Proba",
        )

        # the y_pred returned by the consistency check is not used further
        y_true, _, multioutput = self._check_consistent_input(
            y_true, y_pred, multioutput
        )

        return y_true, y_pred_inner, multioutput
示例#12
0
def get_window(obj, window_length=None, lag=0):
    """Slice obj to the time index window with given length and lag.

    Returns time series or time series panel with time indices
        strictly greater than cutoff - lag - window_length, and
        equal or less than cutoff - lag.
    Cutoff is that of obj, as determined by get_cutoff.

    Parameters
    ----------
    obj : sktime compatible time series data container or None
        if not None, must be of one of the following mtypes:
            pd.Series, pd.DataFrame, np.ndarray, of Series scitype
            pd.multiindex, numpy3D, nested_univ, df-list, of Panel scitype
            pd_multiindex_hier, of Hierarchical scitype
    window_length : int or timedelta, optional, default=None
        must be int if obj is int indexed, timedelta if datetime indexed
        length of the window to slice to.
        None = window of infinite size, obj is returned unchanged
    lag : int or timedelta, optional, default = 0
        must be int if obj is int indexed, timedelta if datetime indexed
        lag of the latest time in the window, with respect to cutoff of obj

    Returns
    -------
    obj sub-set to time indices in the semi-open interval
        (cutoff - window_length - lag, cutoff - lag]
        obj itself, unchanged, if window_length is None
        None if obj was None

    Raises
    ------
    ValueError
        if obj is not of Series, Panel, or Hierarchical scitype
    """
    # nothing to slice - return before the function-scope import below,
    #   so that the trivial case does not depend on it
    if window_length is None or obj is None:
        return obj

    from sktime.datatypes import check_is_scitype, convert_to

    valid, _, metadata = check_is_scitype(
        obj, scitype=["Series", "Panel", "Hierarchical"], return_metadata=True)
    if not valid:
        raise ValueError(
            "obj must be of Series, Panel, or Hierarchical scitype")
    # remember the input mtype, the DataFrame branch converts back to it
    obj_in_mtype = metadata["mtype"]

    obj = convert_to(obj, GET_LATEST_WINDOW_SUPPORTED_MTYPES)

    # numpy3D (Panel) or np.npdarray (Series)
    if isinstance(obj, np.ndarray):
        # positional slicing along the first axis, using negative offsets from
        #   the end, clipped so that they never reach before the start
        # NOTE(review): for numpy3D the first axis is presumably the instance
        #   axis rather than the time axis - confirm that this is intended
        obj_len = len(obj)
        window_start = max(-window_length - lag, -obj_len)
        window_end = max(-lag, -obj_len)
        # an end offset of 0 means "up to the end", obj[a:0] would be empty
        if window_end == 0:
            return obj[window_start:]
        else:
            return obj[window_start:window_end]

    # pd.DataFrame(Series), pd-multiindex (Panel) and pd_multiindex_hier (Hierarchical)
    if isinstance(obj, pd.DataFrame):
        cutoff = get_cutoff(obj)
        win_start_excl = cutoff - window_length - lag
        win_end_incl = cutoff - lag

        # for MultiIndex, the last index level is used as the time index
        if not isinstance(obj.index, pd.MultiIndex):
            time_indices = obj.index
        else:
            time_indices = obj.index.get_level_values(-1)

        # boolean mask of rows with time index in (win_start_excl, win_end_incl]
        win_select = (time_indices > win_start_excl) & (time_indices <=
                                                        win_end_incl)
        obj_subset = obj.iloc[win_select]

        return convert_to(obj_subset, obj_in_mtype)

    raise ValueError(
        "bug in get_latest_window, unreachable condition, ifs should be exhaustive"
    )