Example #1
 def test_error_process_generic_option_1(self, value, group_name,
                                         allow_none, allow_empty):
     with pytest.raises(ValueError):
         _internal.process_generic_option(value=value,
                                          group_name=group_name,
                                          allow_none=allow_none,
                                          allow_empty=allow_empty)
Example #2
 def test_error_process_generic_option_3(self):
     with pytest.raises(TypeError):
         _internal.process_generic_option(values=[1, 2, 3],
                                          group_name="timeopt")
Example #3
    def fit(self,
            X: t.Sequence,
            y: t.Sequence,
            transform_num: bool = True,
            transform_cat: bool = True,
            rescale: t.Optional[str] = None,
            rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
            cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
            check_bool: bool = False,
            precomp_groups: t.Optional[str] = "all",
            wildcard: str = "all",
            suppress_warnings: bool = False,
            ) -> "MFE":
        """Fits dataset into an MFE model.

        Parameters
        ----------
        X : :obj:`Sequence`
            Predictive attributes of the dataset.

        y : :obj:`Sequence`
            Target attributes of the dataset, assuming that it is a supervised
            task.

        transform_num : :obj:`bool`, optional
            If True, numeric attributes are discretized using an equal-frequency
            histogram technique so they can be used alongside categorical data
            when extracting categoric-only metafeatures. Note that numeric-only
            features still use the original numeric values, not the discretized
            ones. If False, numeric attributes are ignored for categoric-only
            metafeatures.

        transform_cat : :obj:`bool`, optional
            If True, categorical attributes are binarized using a model matrix
            so they can be used alongside numerical data when extracting
            numeric-only metafeatures. Note that categoric-only features still
            use the original categoric values, not the binarized ones. If
            False, categorical attributes are ignored for numeric-only
            metafeatures.

            The formula used for this transformation is just the union (+) of
            all categoric attributes, using the formula language from the
            ``patsy`` package and removing the intercept terms:
            ``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of attributes
            and ``A_i`` is the i-th categoric attribute, 1 <= i <= n.

        rescale : :obj:`str`, optional
            If :obj:`NoneType`, the model keeps all numeric data with its
            original values. Otherwise, this argument can assume one of the
            string options below to rescale all numeric values:

                1. ``standard``: set numeric data to zero mean, unit variance.
                   Also known as ``z-score`` normalization. Check the
                   documentation of ``sklearn.preprocessing.StandardScaler``
                   for in-depth information.

                2. ``min-max``: set numeric data to interval [a, b], a < b. It
                   is possible to define values to ``a`` and ``b`` using
                   argument ``rescale_args``. The default values are a = 0.0
                   and b = 1.0. Check ``sklearn.preprocessing.MinMaxScaler``
                   documentation for more information.

                3. ``robust``: rescale data using statistics robust to the
                   presence of outliers. For in-depth information, check
                   documentation of ``sklearn.preprocessing.RobustScaler``.

        rescale_args : :obj:`dict`, optional
            Dictionary containing parameters for rescaling data. Used only if
            ``rescale`` argument is not :obj:`NoneType`. These dictionary keys
            are the parameter names as strings and the values, the
            corresponding parameter value.

        cat_cols : :obj:`Sequence` of :obj:`int` or :obj:`str`, optional
            Categorical columns of the dataset. If :obj:`NoneType` or an empty
            sequence is given, all columns are assumed to be numeric. If the
            value ``auto`` is given, automatic detection of categorical columns
            is attempted while fitting the dataset.

        check_bool : :obj:`bool`, optional
            If ``cat_cols`` is ``auto`` and this flag is True, assume that
            every column with precisely two distinct values is also a
            categorical (boolean) column, regardless of its data type.
            Otherwise, these columns may be considered numeric depending on
            their data type.

        missing_data : :obj:`str`, optional
            Defines the strategy to handle missing values in data. Still not
            implemented.

        precomp_groups : :obj:`str`, optional
            Defines which metafeature groups should have their common values
            precomputed and cached, to be shared among the various meta-feature
            extraction methods (e.g. ``classes``, or ``covariance``). This
            argument may speed up meta-feature extraction but also consumes
            more memory, so it may not be suitable for huge datasets.

        wildcard : :obj:`str`, optional
            Value used as ``select all`` for ``precomp_groups``.

        suppress_warnings : :obj:`bool`, optional
            If True, ignore all warnings invoked while fitting dataset.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the number of rows of ``X`` does not match the length of ``y``.
        TypeError
            If X or y (or both) is neither a :obj:`list` nor a :obj:`np.ndarray`
            object.

        """
        self.X, self.y = _internal.check_data(X, y)

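        # Validate the user-given "rescale" option (None is allowed).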
        rescale = _internal.process_generic_option(
            value=rescale, group_name="rescale", allow_none=True)

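        # Detect which columns are categorical and which are numeric.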
        self._fill_col_ind_by_type(cat_cols=cat_cols, check_bool=check_bool)

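        # Build the categoric-only and numeric-only views of the data,
        # applying the requested transformations and rescaling.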
        data_cat = self._set_data_categoric(transform_num=transform_num)
        data_num = self._set_data_numeric(
            transform_cat=transform_cat,
            rescale=rescale,
            rescale_args=rescale_args)

        # Custom arguments for metafeature extraction methods
        self._custom_args_ft = {
            "X": self.X,
            "N": data_num,
            "C": data_cat,
            "y": self.y,
            "folds": self.folds,
            "sample_size": self.sample_size,
            "score": self.score,
            "random_state": self.random_state,
            "cat_cols": self._attr_indexes_cat,
        }

        # Custom arguments from preprocessing methods
        self._precomp_args_ft = _internal.process_precomp_groups(
            precomp_groups=precomp_groups,
            groups=self.groups,
            wildcard=wildcard,
            suppress_warnings=suppress_warnings,
            **self._custom_args_ft)

        # Custom arguments for postprocessing methods
        self._postprocess_args_ft = {
            "inserted_group_dep": self.inserted_group_dep,
        }

        # Custom arguments for summarization methods
        self._custom_args_sum = {
            "ddof": 1,
        }

        return self
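A minimal usage sketch of the fitting step under the options documented above, run on scikit-learn's iris dataset; the choice of rescaling and the other keyword values are illustrative assumptions, not library defaults:

from sklearn.datasets import load_iris
from pymfe.mfe import MFE

data = load_iris()

model = MFE()
model.fit(data.data, data.target,
          cat_cols=None,          # iris has no categorical attributes
          rescale="standard",     # z-score rescaling of the numeric attributes
          suppress_warnings=True)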
Example #4
    def __init__(self,
                 groups: t.Union[str, t.Iterable[str]] = "all",
                 features: t.Union[str, t.Iterable[str]] = "all",
                 summary: t.Union[str, t.Iterable[str]] = ("mean", "sd"),
                 measure_time: t.Optional[str] = None,
                 wildcard: str = "all",
                 score="accuracy",
                 folds=10,
                 sample_size=1.0,
                 suppress_warnings: bool = False,
                 random_state: t.Optional[int] = None) -> None:
        """This class provides easy access for metafeature extraction from
        datasets.

        It expected that user first calls ``fit`` method after instantiation
        and then ``extract`` for effectively extract the selected metafeatures.
        Check reference [1]_ for more information.

        Parameters
        ----------
        groups : :obj:`Iterable` of :obj:`str` or :obj:`str`
            A collection or a single metafeature group name representing the
            desired group of metafeatures for extraction. The supported groups
            are:

                1. ``general``: general/simple metafeatures.
                2. ``statistical``: statistical metafeatures.
                3. ``info-theory``: information-theoretic metafeatures.
                4. ``model-based``: metafeatures based on machine learning
                   model characteristics.
                5. ``landmarking``: metafeatures representing performance
                   metrics from simple machine learning models or machine
                   learning models induced with sampled data.

            The value provided by the argument ``wildcard`` can be used to
            select all metafeature groups rapidly.

        features : :obj:`Iterable` of :obj:`str` or :obj:`str`, optional
            A collection or a single metafeature name desired for extraction.
            Keep in mind that the extraction only gathers features also in the
            selected ``groups``. Check this class ``feature`` attribute to get
            a list of available metafeatures from selected groups.

            The value provided by the argument ``wildcard`` can be used to
            select all features from all selected groups rapidly.

        summary : :obj:`Iterable` of :obj:`str` or :obj:`str`, optional
            A collection or a single summary function to summarize a group of
            metafeature measures into a fixed-length group of values, typically
            a single value. The values must be one of the following:

                1. ``mean``: Average of the values.
                2. ``sd``: Standard deviation of the values.
                3. ``count``: Computes the cardinality of the measure. Suitable
                   for variable cardinality.
                4. ``histogram``: Describes the distribution of the measured
                   values. Suitable for high cardinality.
                5. ``iq_range``: Computes the interquartile range of the
                   measured values.
                6. ``kurtosis``: Describes the shape of the distribution of the
                   measured values.
                7. ``max``: Results in the maximum value of the measure.
                8. ``median``: Results in the central value of the measure.
                9. ``min``: Results in the minimum value of the measure.
                10. ``quantiles``: Results in the minimum, first quartile,
                    median, third quartile and maximum of the measured values.
                11. ``range``: Computes the range of the measured values.
                12. ``skewness``: Describes the shape of the distribution of
                    the measured values in terms of symmetry.

            If more than one summary function is selected, then all multivalued
            extracted metafeatures are summarized with each summary function.

            The particular value provided by the argument ``wildcard`` can be
            used to select all summary functions rapidly.

        measure_time : :obj:`str`, optional
            Options for measuring the time elapsed during metafeature
            extraction. If this argument value is :obj:`NoneType`, no time
            elapsed is measured. Otherwise, this argument must be a :obj:`str`
            valued as one of the options below:

                1. ``avg``: average time for each metafeature (total time
                   divided by the feature cardinality, i.e., number of features
                   extracted by a single feature-extraction related method),
                   without summarization time.
                2. ``avg_summ``: average time for each metafeature (total time
                   of extraction divided by feature cardinality) including
                   required time for summarization.
                3. ``total``: total time for each metafeature, without
                   summarization time.
                4. ``total_summ``: total time for each metafeature including
                   the required time for summarization.

            The ``cardinality`` of the feature is the number of values
            extracted by a single calculation method.

            For example, the ``mean`` feature has cardinality equal to the
            number of numeric features in the dataset, while ``cor`` (from
            ``correlation``) has cardinality equal to N * (N - 1) / 2, where N
            is the number of numeric features in the dataset.

            The cardinality is used to divide the total execution time of that
            method if an option starting with ``avg`` is selected.

            If a summary method has cardinality higher than one (it returns
            more than one value after summarization, thus creating more than
            one entry in the result lists), as is the case of the ``histogram``
            summary method, then the corresponding time of this summary is
            inserted only in the first correspondent element of the time list.
            The remaining entries are filled with 0 to keep the sizes of all
            returned lists consistent and the indexes aligned between them.

        wildcard : :obj:`str`, optional
            Value used as ``select all`` for ``groups``, ``features`` and
            ``summary`` arguments.

        score : :obj:`str`, optional
            Score metric used to extract ``landmarking`` metafeatures.

        folds : :obj:`int`, optional
            Number of folds used in the Stratified K-Fold cross-validation that
            produces the ``landmarking`` metafeatures.

        sample_size : :obj:`float`, optional
            Sample proportion used to produce the ``landmarking`` metafeatures.
            This argument must be in the interval [0.5, 1.0] (both inclusive).

        suppress_warnings : :obj:`bool`, optional
            If True, then ignore all warnings invoked at the instantiation
            time.

        Notes
        -----
            .. [1] Rivolli et al. "Towards Reproducible Empirical Research in
               Meta-Learning". URL: https://arxiv.org/abs/1808.10406

        Examples
        --------

        Load a dataset

        >>> from sklearn.datasets import load_iris
        >>> from pymfe.mfe import MFE

        >>> data = load_iris()
        >>> y = data.target
        >>> X = data.data

        Extract all measures

        >>> mfe = MFE()
        >>> mfe.fit(X, y)
        >>> ft = mfe.extract()
        >>> print(ft)

        Extract general, statistical and information-theoretic measures

        >>> mfe = MFE(groups=["general", "statistical", "info-theory"])
        >>> mfe.fit(X, y)
        >>> ft = mfe.extract()
        >>> print(ft)

        """
        self.groups = _internal.process_generic_set(
            values=groups, group_name="groups")  # type: t.Tuple[str, ...]

        self.groups, self.inserted_group_dep = (
            _internal.solve_group_dependencies(
                groups=self.groups))

        proc_feat = _internal.process_features(
            features=features,
            groups=self.groups,
            suppress_warnings=suppress_warnings,
            wildcard=wildcard,
        )  # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt, t.Tuple[str, ...]]

        self.features, self._metadata_mtd_ft, self.groups = proc_feat
        del proc_feat

        self.summary, self._metadata_mtd_sm = _internal.process_summary(
            summary)  # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt]

        self.timeopt = _internal.process_generic_option(
            value=measure_time, group_name="timeopt",
            allow_none=True)  # type: t.Optional[str]

        self.X = None  # type: t.Optional[np.ndarray]
        self.y = None  # type: t.Optional[np.ndarray]

        self._custom_args_ft = None  # type: t.Optional[t.Dict[str, t.Any]]
        """User-independent arguments for ft. methods (e.g. ``X`` and ``y``)"""

        self._custom_args_sum = None  # type: t.Optional[t.Dict[str, t.Any]]
        """User-independent arguments for summary functions methods."""

        self._attr_indexes_num = None  # type: t.Optional[t.Tuple[int, ...]]
        """Numeric column indexes from ``X`` (independent attributes)."""

        self._attr_indexes_cat = None  # type: t.Optional[t.Tuple[int, ...]]
        """Categoric column indexes from ``X`` (independent attributes)."""

        self._precomp_args_ft = None  # type: t.Optional[t.Dict[str, t.Any]]
        """Precomputed common feature-extraction method arguments."""

        self._postprocess_args_ft = {}  # type: t.Dict[str, t.Any]
        """User-independent arguments for post-processing methods."""

        if random_state is None or isinstance(random_state, int):
            self.random_state = random_state
            np.random.seed(random_state)

        else:
            raise ValueError(
                'Invalid "random_state" argument ({0}). '
                'Expecting None or an integer.'.format(random_state))

        if isinstance(folds, int):
            self.folds = folds
        else:
            raise ValueError('Invalid "folds" argument ({0}). '
                             'Expecting an integer.'.format(random_state))

        if isinstance(sample_size, int):
            sample_size = float(sample_size)

        if isinstance(sample_size, float)\
           and 0.5 <= sample_size <= 1.0:
            self.sample_size = sample_size

        else:
            raise ValueError('Invalid "sample_size" argument ({0}). '
                             'Expecting an float [0.5, 1].'
                             .format(random_state))

        self.score = _internal.check_score(score, self.groups)
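To complement the doctest in the docstring above, a short sketch of instantiation using the time-measurement and landmarking options documented earlier; the specific values chosen here are illustrative assumptions, not library defaults:

from sklearn.datasets import load_iris
from pymfe.mfe import MFE

data = load_iris()

mfe = MFE(groups=["general", "landmarking"],
          summary=["mean", "histogram"],
          measure_time="avg_summ",  # average time per metafeature, including summarization
          folds=5,                  # Stratified K-Fold folds for the landmarking group
          sample_size=0.75,         # must lie in the interval [0.5, 1.0]
          random_state=42)
mfe.fit(data.data, data.target)
results = mfe.extract()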