    def test_has_class_filter_table(self):
        filter_ = HasClass()
        with_class = filter_(self.table)
        self.assertEqual(len(with_class), len(self.table) - self.n_missing)
        self.assertFalse(with_class.has_missing_class())

        filter_ = HasClass(negate=True)
        without_class = filter_(self.table)
        self.assertEqual(len(without_class), self.n_missing)
        self.assertTrue(without_class.has_missing_class())

    def test_has_class_filter_instance(self):
        class_missing = self.table[9]
        class_present = self.table[0]

        filter_ = HasClass()
        self.assertFalse(filter_(class_missing))
        self.assertTrue(filter_(class_present))

        filter_ = HasClass(negate=True)
        self.assertTrue(filter_(class_missing))
        self.assertFalse(filter_(class_present))
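For context, a minimal self-contained sketch of the behaviour these tests exercise; the tiny domain and table below are illustrative assumptions, not the actual test fixture:

import numpy as np
from Orange.data import ContinuousVariable, DiscreteVariable, Domain, Table
from Orange.data.filter import HasClass

# Three rows; the last one has a missing class value.
domain = Domain([ContinuousVariable("x")],
                DiscreteVariable("y", values=("a", "b")))
table = Table.from_numpy(domain,
                         X=np.array([[1.0], [2.0], [3.0]]),
                         Y=np.array([0.0, 1.0, np.nan]))

with_class = HasClass()(table)                  # keeps rows whose class is defined
without_class = HasClass(negate=True)(table)    # keeps rows whose class is missing
print(len(with_class), len(without_class))      # expected: 2 1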
Example #3
class NNTransferLearner(Learner):

    preprocessors = default_preprocessors = [
        HasClass(),
    ]

    __returns__ = NNTransferModel

    def __init__(self, original_model, preprocessors=None, fit_params=None):
        super().__init__(preprocessors=preprocessors)
        self.preprocessors.append(CorrectWavenumberRange(original_model.wavenumbers))
        self.preprocessors.append(Normalize(Normalize.Vector))
        self.original_model = original_model
        self.fit_params = fit_params if fit_params is not None else {}

    def fit(self, X, Y, W=None):
        # TODO retraining would modify the original model, should we do a copy there?
        pretrained = self.original_model.load()
        for layer in pretrained.layers:
            layer.trainable = False
        y_onehot = keras.utils.np_utils.to_categorical(Y)

        last = Dense(y_onehot.shape[1], name='classify', activation='softmax')\
            (pretrained.layers[-2].output)

        model = KerasModel(inputs=pretrained.input, outputs=last)
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        model.fit(X, y_onehot, epochs=50, **self.fit_params)

        return NNTransferModel(model)
Example #4
    def set_train_data(self, data):
        """
        Set the input training dataset.

        Parameters
        ----------
        data : Optional[Orange.data.Table]
        """
        self.Information.data_sampled.clear()
        self.Error.train_data_empty.clear()
        self.Error.class_required.clear()
        self.Error.too_many_classes.clear()
        self.Error.only_one_class_var_value.clear()
        if data is not None and not len(data):
            self.Error.train_data_empty()
            data = None
        if data:
            conds = [
                not data.domain.class_vars,
                len(data.domain.class_vars) > 1,
                data.domain.has_discrete_class
                and len(data.domain.class_var.values) == 1,
            ]
            errors = [
                self.Error.class_required,
                self.Error.too_many_classes,
                self.Error.only_one_class_var_value,
            ]
            for cond, error in zip(conds, errors):
                if cond:
                    error()
                    data = None
                    break

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.train_data_missing_vals = \
            data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = HasClass()(data)
        else:
            self.Warning.missing_data.clear()

        self.data = data
        self.closeContext()
        self._update_scorers()
        self._update_controls()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain)
            if self.fold_feature_selected and bool(self.feature_model):
                self.resampling = OWTestLearners.FeatureFold
        self._invalidate()
Example #5
    def set_train_data(self, data):
        """
        Set the input training dataset.

        Parameters
        ----------
        data : Optional[Orange.data.Table]
        """
        self.cancel()
        self.Information.data_sampled.clear()
        self.Error.train_data_error.clear()

        if data is not None:
            data_errors = [
                ("Train dataset is empty.", len(data) == 0),
                ("Train data input requires a target variable.",
                 not data.domain.class_vars),
                ("Too many target variables.",
                 len(data.domain.class_vars) > 1),
                ("Target variable has no values.", np.isnan(data.Y).all()),
                ("Target variable has only one value.",
                 data.domain.has_discrete_class and len(unique(data.Y)) < 2),
                ("Data has no features to learn from.", data.X.shape[1] == 0),
            ]

            for error_msg, cond in data_errors:
                if cond:
                    self.Error.train_data_error(error_msg)
                    data = None
                    break

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.train_data_missing_vals = \
            data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = HasClass()(data)
        else:
            self.Warning.missing_data.clear()

        self.data = data
        self.closeContext()
        self._update_scorers()
        self._update_controls()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain)
            if self.fold_feature_selected and bool(self.feature_model):
                self.resampling = OWTestAndScore.FeatureFold
        self._invalidate()
Example #6
class Scorer(_RefuseDataInConstructor, Reprable):
    feature_type = None
    class_type = None
    supports_sparse_data = None
    preprocessors = [HasClass()]

    @property
    def friendly_name(self):
        """Return type name with camel-case separated into words.
        Derived classes can provide a better property or a class attribute.
        """
        return re.sub("([a-z])([A-Z])",
                      lambda mo: mo.group(1) + " " + mo.group(2).lower(),
                      type(self).__name__)

    @staticmethod
    def _friendly_vartype_name(vartype):
        if vartype == DiscreteVariable:
            return "categorical"
        if vartype == ContinuousVariable:
            return "numeric"
        # Fallbacks
        name = vartype.__name__
        if name.endswith("Variable"):
            return name.lower()[:-8]
        return name

    def __call__(self, data, feature=None):
        if not data.domain.class_var:
            raise ValueError(
                "{} requires data with a target variable."
                .format(self.friendly_name))
        if not isinstance(data.domain.class_var, self.class_type):
            raise ValueError(
                "{} requires a {} target variable."
                .format(self.friendly_name,
                        self._friendly_vartype_name(self.class_type)))

        if feature is not None:
            f = data.domain[feature]
            data = data.transform(Domain([f], data.domain.class_vars))

        for pp in self.preprocessors:
            data = pp(data)

        for var in data.domain.attributes:
            if not isinstance(var, self.feature_type):
                raise ValueError(
                    "{} cannot score {} variables."
                    .format(self.friendly_name,
                            self._friendly_vartype_name(type(var))))

        return self.score_data(data, feature)

    def score_data(self, data, feature):
        raise NotImplementedError
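For illustration, the camel-case splitting done by friendly_name above can be reproduced in isolation; the class names below are just examples:

import re

def friendly(name):
    # Same substitution as Scorer.friendly_name: insert a space between a lowercase
    # letter and the following uppercase letter, and lowercase the latter.
    return re.sub("([a-z])([A-Z])",
                  lambda mo: mo.group(1) + " " + mo.group(2).lower(),
                  name)

print(friendly("InfoGain"))       # -> Info gain
print(friendly("GainRatio"))      # -> Gain ratio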
Example #7
    def test_has_class_multiclass(self):
        domain = Domain([DiscreteVariable("x", values="01")], [
            DiscreteVariable("y1", values="01"),
            DiscreteVariable("y2", values="01")
        ])
        table = Table(
            domain,
            [[0, 1, np.nan], [1, np.nan, 0], [1, 0, 1], [1, np.nan, np.nan]])
        table = HasClass()(table)
        self.assertTrue(not np.isnan(table).any())
        self.assertEqual(table.domain, domain)
        self.assertEqual(len(table), 1)
Example #8
    def __call__(self, data):
        """
        Remove rows that contain NaN in any class variable from the dataset
        and return the resulting data table.

        Parameters
        ----------
        data : an input dataset

        Returns
        -------
        data : dataset without rows with missing classes
        """
        return HasClass()(data)
Example #9
class XGBBase(SklLearner):
    """Base class for xgboost (classification and regression) learners """
    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
    ]

    def __init__(self, preprocessors=None, **kwargs):
        super().__init__(preprocessors=preprocessors)
        self.params = kwargs

    @SklLearner.params.setter
    def params(self, values: Dict):
        self._params = values
Example #10
    def set_test_data(self, data):
        # type: (Orange.data.Table) -> None
        """
        Set the input separate testing dataset.

        Parameters
        ----------
        data : Optional[Orange.data.Table]
        """
        self.Information.test_data_sampled.clear()
        self.Error.test_data_empty.clear()
        if data is not None and not data:
            self.Error.test_data_empty()
            data = None

        if data and not data.domain.class_vars:
            self.Error.class_required_test()
            data = None
        else:
            self.Error.class_required_test.clear()

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.test_data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.test_data_missing_vals = \
            data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = HasClass()(data)
        else:
            self.Warning.missing_data.clear()

        self.test_data = data
        if self.resampling == OWTestAndScore.TestOnTest:
            self._invalidate()
Example #11
    def test_reprs(self):
        flid = IsDefined(negate=True)
        flhc = HasClass()
        flr = Random()
        fld = FilterDiscrete(self.attr_disc, None)
        flsv = SameValue(self.attr_disc, self.value_disc, negate=True)
        flc = FilterContinuous(self.vs[0], FilterContinuous.Less, 5)
        flc2 = FilterContinuous(self.vs[1], FilterContinuous.Greater, 3)
        flv = Values([flc, flc2], conjunction=False, negate=True)
        flvf = ValueFilter(self.attr_disc)
        fls = FilterString("name", FilterString.Equal, "Aardvark", case_sensitive=False)
        flsl = FilterStringList("name", ["Aardvark"], case_sensitive=False)
        flrx = FilterRegex("name", "^c...$")

        filters = [flid, flhc, flr, fld, flsv, flc, flv, flvf, fls, flsl, flrx]

        for f in filters:
            repr_str = repr(f)
            new_f = eval(repr_str)
            self.assertEqual(repr(new_f), repr_str)
Example #12
class CatBoostLearnerRegression(CatBoostLearner, LearnerRegression):
    __wraps__ = None
    __returns__ = CatBoostModel
    supports_multiclass = True
    _params = {}

    learner_adequacy_err_msg = "Continuous class variable expected."

    preprocessors = default_preprocessors = [
        HasClass(), Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    def check_learner_adequacy(self, domain):
        return domain.has_continuous_class

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name]
                for name in spec.args[1:] if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)

        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")

        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances.
        """
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
Example #13
class CurveFitLearner(Learner):
    """
    Fit a function to data.
    It uses scipy.optimize.curve_fit to find the optimal values of the parameters.

    Parameters
    ----------
    expression : callable or str
        A modeling function.
        If callable, it must take the independent variable as the first
        argument and the parameters to fit as separate remaining arguments.
        If string, a lambda function is created,
        using the `expression`, `available_feature_names`, `functions` and `env`
        attributes.
        The expression should be a string if the model is to be pickled.
    parameters_names : list of str
        List of parameter names. Only needed when the expression
        is callable.
    features_names : list of str
        List of feature names. Only needed when the expression
        is callable.
    available_feature_names : list of str
        List of all available feature names. Only needed when the expression
        is a string. Needed to distinguish between parameters and features when
        translating the expression into the lambda.
    functions : list of str
        List of all available functions. Only needed when the expression
        is a string. Needed to distinguish between parameters and functions when
        translating the expression into the lambda.
    sanitizer : callable
        Function for sanitizing names.
    env : dict
        An environment to capture in the lambda's closure.
    p0 : list of floats, optional
        Initial guess for the parameters.
    bounds : 2-tuple of array_like, optional
        Lower and upper bounds on parameters.
    preprocessors : tuple of Orange preprocessors, optional
        The processors that will be used when data is passed to the learner.

    Examples
    --------
    >>> import numpy as np
    >>> from Orange.data import Table
    >>> from Orange.regression import CurveFitLearner
    >>> data = Table("housing")
    >>> # example with callable expression
    >>> cfun = lambda x, a, b, c: a * np.exp(-b * x[:, 0] * x[:, 1]) + c
    >>> learner = CurveFitLearner(cfun, ["a", "b", "c"], ["CRIM", "LSTAT"])
    >>> model = learner(data)
    >>> pred = model(data)
    >>> coef = model.coefficients
    >>> # example with str expression
    >>> sfun = "a * exp(-b * CRIM * LSTAT) + c"
    >>> names = [a.name for a in data.domain.attributes]
    >>> learner = CurveFitLearner(sfun, available_feature_names=names,
    ...                           functions=["exp"])
    >>> model = learner(data)
    >>> pred = model(data)
    >>> coef = model.coefficients

    """
    preprocessors = [HasClass(), RemoveNaNColumns(), Impute()]
    __returns__ = CurveFitModel
    name = "Curve Fit"

    def __init__(
            self,
            expression: Union[Callable, ast.Expression, str],
            parameters_names: Optional[List[str]] = None,
            features_names: Optional[List[str]] = None,
            available_feature_names: Optional[List[str]] = None,
            functions: Optional[List[str]] = None,
            sanitizer: Optional[Callable] = None,
            env: Optional[Dict[str, Any]] = None,
            p0: Union[List, Dict, None] = None,
            bounds: Union[Tuple, Dict] = (-np.inf, np.inf),
            preprocessors=None
    ):
        super().__init__(preprocessors)

        if callable(expression):
            if parameters_names is None:
                raise TypeError("Provide 'parameters_names' parameter.")
            if features_names is None:
                raise TypeError("Provide 'features_names' parameter.")

            args = None
            function = expression
        else:
            if available_feature_names is None:
                raise TypeError("Provide 'available_feature_names' parameter.")
            if functions is None:
                raise TypeError("Provide 'functions' parameter.")

            args = dict(expression=expression,
                        available_feature_names=available_feature_names,
                        functions=functions, sanitizer=sanitizer, env=env)
            function, parameters_names, features_names = _create_lambda(**args)

        if isinstance(p0, dict):
            p0 = [p0.get(p, 1) for p in parameters_names]
        if isinstance(bounds, dict):
            d = [-np.inf, np.inf]
            lower_bounds = [bounds.get(p, d)[0] for p in parameters_names]
            upper_bounds = [bounds.get(p, d)[1] for p in parameters_names]
            bounds = lower_bounds, upper_bounds

        self.__function = function
        self.__parameters_names = parameters_names
        self.__features_names = features_names
        self.__p0 = p0
        self.__bounds = bounds

        # needed for pickling - if the expression is a lambda function, the
        # learner is not picklable
        self.__create_lambda_args = args

    @property
    def parameters_names(self) -> List[str]:
        return self.__parameters_names

    def fit_storage(self, data: Table) -> CurveFitModel:
        domain: Domain = data.domain
        attributes = []
        for attr in domain.attributes:
            if attr.name in self.__features_names:
                if not attr.is_continuous:
                    raise ValueError("Numeric feature expected.")
                attributes.append(attr)

        new_domain = Domain(attributes, domain.class_vars, domain.metas)
        transformed = data.transform(new_domain)
        params = curve_fit(self.__function, transformed.X, transformed.Y,
                           p0=self.__p0, bounds=self.__bounds)[0]
        return CurveFitModel(new_domain, domain,
                             self.__parameters_names, params, self.__function,
                             self.__create_lambda_args)

    def __getstate__(self) -> Dict:
        if not self.__create_lambda_args:
            raise AttributeError(
                "Can't pickle/copy callable. Use str expression instead."
            )
        state = self.__create_lambda_args.copy()
        state["parameters_names"] = None
        state["features_names"] = None
        state["p0"] = self.__p0
        state["bounds"] = self.__bounds
        state["preprocessors"] = self.preprocessors
        return state

    def __setstate__(self, state: Dict):
        expression = state.pop("expression")
        self.__init__(expression, **state)
Example #14
class CatGBBaseLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to
        `[RemoveNaNClasses(), RemoveNaNColumns()]`
    """
    supports_weights = True
    __wraps__ = None
    __returns__ = CatGBModel
    _params = {}
    preprocessors = default_preprocessors = [
        HasClass(),
        RemoveNaNColumns(),
    ]

    # pylint: disable=unused-argument,too-many-arguments,too-many-locals
    def __init__(self,
                 iterations=None,
                 learning_rate=None,
                 depth=None,
                 l2_leaf_reg=None,
                 model_size_reg=None,
                 rsm=None,
                 loss_function=None,
                 border_count=None,
                 feature_border_type=None,
                 per_float_feature_quantization=None,
                 input_borders=None,
                 output_borders=None,
                 fold_permutation_block=None,
                 od_pval=None,
                 od_wait=None,
                 od_type=None,
                 nan_mode=None,
                 counter_calc_method=None,
                 leaf_estimation_iterations=None,
                 leaf_estimation_method=None,
                 thread_count=None,
                 random_seed=None,
                 use_best_model=None,
                 verbose=False,
                 logging_level=None,
                 metric_period=None,
                 ctr_leaf_count_limit=None,
                 store_all_simple_ctr=None,
                 max_ctr_complexity=None,
                 has_time=None,
                 allow_const_label=None,
                 classes_count=None,
                 class_weights=None,
                 one_hot_max_size=None,
                 random_strength=None,
                 name=None,
                 ignored_features=None,
                 train_dir=cache_dir(),
                 custom_loss=None,
                 custom_metric=None,
                 eval_metric=None,
                 bagging_temperature=None,
                 save_snapshot=None,
                 snapshot_file=None,
                 snapshot_interval=None,
                 fold_len_multiplier=None,
                 used_ram_limit=None,
                 gpu_ram_part=None,
                 allow_writing_files=False,
                 final_ctr_computation_mode=None,
                 approx_on_full_history=None,
                 boosting_type=None,
                 simple_ctr=None,
                 combinations_ctr=None,
                 per_feature_ctr=None,
                 task_type=None,
                 device_config=None,
                 devices=None,
                 bootstrap_type=None,
                 subsample=None,
                 sampling_unit=None,
                 dev_score_calc_obj_block_size=None,
                 max_depth=None,
                 n_estimators=None,
                 num_boost_round=None,
                 num_trees=None,
                 colsample_bylevel=None,
                 random_state=None,
                 reg_lambda=None,
                 objective=None,
                 eta=None,
                 max_bin=None,
                 scale_pos_weight=None,
                 gpu_cat_features_storage=None,
                 data_partition=None,
                 metadata=None,
                 early_stopping_rounds=None,
                 cat_features=None,
                 grow_policy=None,
                 min_data_in_leaf=None,
                 min_child_samples=None,
                 max_leaves=None,
                 num_leaves=None,
                 score_function=None,
                 leaf_estimation_backtracking=None,
                 ctr_history_unit=None,
                 monotone_constraints=None,
                 feature_weights=None,
                 penalties_coefficient=None,
                 first_feature_use_penalties=None,
                 model_shrink_rate=None,
                 model_shrink_mode=None,
                 langevin=None,
                 diffusion_temperature=None,
                 posterior_sampling=None,
                 boost_from_average=None,
                 text_features=None,
                 tokenizers=None,
                 dictionaries=None,
                 feature_calcers=None,
                 text_processing=None,
                 preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_wrapper_params(value)

    def _get_wrapper_params(self, values):
        spec = list(
            inspect.signature(self.__wraps__.__init__).parameters.keys())
        return {name: values[name] for name in spec[1:] if name in values}

    def __call__(self, data, progress_callback=None):
        m = super().__call__(data, progress_callback)
        m.params = self.params
        return m

    def fit_storage(self, data: Table):
        domain, X, Y, W = data.domain, data.X, data.Y.reshape(-1), None
        if self.supports_weights and data.has_weights():
            W = data.W.reshape(-1)
        # pylint: disable=not-callable
        clf = self.__wraps__(**self.params)
        cat_features = [
            i for i, attr in enumerate(domain.attributes) if attr.is_discrete
        ]
        if cat_features:
            X = X.astype(str)
        cat_model = clf.fit(X, Y, cat_features=cat_features, sample_weight=W)
        return self.__returns__(cat_model, cat_features, domain)

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
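The _get_wrapper_params helper above keeps only the entries of vars() that match keyword arguments of the wrapped class's __init__. A standalone sketch of that idea (the toy wrapped class is an assumption, not the real CatBoost API):

import inspect

class ToyBooster:
    def __init__(self, iterations=100, depth=6, learning_rate=0.1):
        pass

def get_wrapper_params(wrapped, values):
    # Parameter names of wrapped.__init__, skipping 'self'.
    spec = list(inspect.signature(wrapped.__init__).parameters.keys())
    return {name: values[name] for name in spec[1:] if name in values}

values = {"iterations": 500, "depth": 8, "verbose": True}
print(get_wrapper_params(ToyBooster, values))   # {'iterations': 500, 'depth': 8}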
Example #15
class SoftmaxRegressionLearner(Learner):
    """L2 regularized softmax regression classifier.
    Uses the L-BFGS algorithm to minimize the categorical
    cross entropy cost with L2 regularization. This model is suitable
    when dealing with a multi-class classification problem.

    When using this learner you should:

    - choose a suitable regularization parameter lambda\_,
    - consider using many logistic regression models (one for each
      value of the class variable) instead of softmax regression.

    Parameters
    ----------

    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional
        Preprocessors are applied to data before training or testing.
        Defaults to
        `[RemoveNaNClasses(), RemoveNaNColumns(), Impute(), Continuize(), Normalize()]`:

        - remove columns with all values as NaN,
        - replace NaN values with suitable values,
        - continuize all discrete attributes,
        - transform the dataset so that the columns are on a similar scale.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.
    """

    name = "softmax"
    preprocessors = [
        HasClass(),
        RemoveNaNColumns(),
        Impute(),
        Continuize(),
        Normalize(),
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, Theta_flat, X, Y):
        Theta = Theta_flat.reshape((self.num_classes, X.shape[1]))
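        # P below holds row-wise softmax probabilities of X.dot(Theta.T), computed
        # with the row maximum subtracted for numerical stability. The cost is the
        # summed cross entropy plus the L2 penalty lambda_ * ||Theta||^2 / 2, divided
        # by the number of rows, and grad is its gradient with respect to Theta.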

        M = X.dot(Theta.T)
        P = np.exp(M - np.max(M, axis=1)[:, None])
        P /= np.sum(P, axis=1)[:, None]

        cost = -np.sum(np.log(P) * Y)
        cost += self.lambda_ * Theta_flat.dot(Theta_flat) / 2.0
        cost /= X.shape[0]

        grad = X.T.dot(P - Y).T
        grad += self.lambda_ * Theta
        grad /= X.shape[0]

        return cost, grad.ravel()

    def fit(self, X, y, W):
        if len(y.shape) > 1:
            raise ValueError("Softmax regression does not support "
                             "multi-label classification")

        if np.isnan(np.sum(X)) or np.isnan(np.sum(y)):
            raise ValueError("Softmax regression does not support "
                             "unknown values")

        X = np.hstack((X, np.ones((X.shape[0], 1))))

        self.num_classes = np.unique(y).size
        Y = np.eye(self.num_classes)[y.ravel().astype(int)]

        theta = np.zeros(self.num_classes * X.shape[1])
        theta, j, ret = fmin_l_bfgs_b(self.cost_grad,
                                      theta,
                                      args=(X, Y),
                                      **self.fmin_args)
        Theta = theta.reshape((self.num_classes, X.shape[1]))

        return SoftmaxRegressionModel(Theta)
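A minimal usage sketch; the import path and the use of the iris dataset are assumptions:

from Orange.data import Table
from Orange.classification import SoftmaxRegressionLearner  # assumed import path

data = Table("iris")
learner = SoftmaxRegressionLearner(lambda_=1.0)
model = learner(data)       # the preprocessors listed above run before fitting
print(model(data)[:5])      # predicted class indices for the first five rows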
Example #16
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to
        `[RemoveNaNClasses(), Continuize(), SklImpute(), RemoveNaNColumns()]`
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = {}

    preprocessors = default_preprocessors = [
        HasClass(), Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name]
                for name in spec.args[1:] if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)

        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")

        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances.
        """
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
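The supports_weights property above inspects the wrapped estimator's fit method; a standalone sketch of the same check (the toy estimator is an assumption):

class ToyEstimator:
    def fit(self, X, y, sample_weight=None):
        return self

def supports_weights(wrapped):
    # True when the wrapped fit method takes a 'sample_weight' argument.
    return "sample_weight" in wrapped.fit.__code__.co_varnames

print(supports_weights(ToyEstimator))   # True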
Example #17
class LinearRegressionLearner(Learner):
    '''L2 regularized linear regression (a.k.a Ridge regression)

    This model uses the L-BFGS algorithm to minimize the linear least
    squares penalty with L2 regularization. When using this model you
    should:

    - Choose a suitable regularization parameter lambda_
    - Consider appending a column of ones to the dataset (intercept term)

    Parameters
    ----------

    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional
        Preprocessors are applied to data before training or testing.
        Defaults to
        `[Normalize(), Continuize(), Impute(), RemoveNaNColumns()]`:

        - transform the dataset so that the columns are on a similar scale,
        - continuize all discrete attributes,
        - remove columns with all values as NaN,
        - replace NaN values with suitable values.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.
    """

    Examples
    --------

        import numpy as np
        from Orange.data import Table
        from Orange.regression.linear_bfgs import LinearRegressionLearner

        data = Table('housing')
        data.X = np.hstack((data.X, np.ones((data.X.shape[0], 1)))) # append ones
        m = LinearRegressionLearner(lambda_=1.0)
        c = m(data) # fit
        print(c(data)) # predict
    '''

    name = "linear_bfgs"
    preprocessors = [
        HasClass(),
        Normalize(),
        Continuize(),
        Impute(),
        RemoveNaNColumns(),
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):

        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, theta, X, y):
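        # Ridge objective: cost = (||X.theta - y||^2 + lambda_ * ||theta||^2) / (2 n)
        # and grad = (X.T (X.theta - y) + lambda_ * theta) / n, with n = X.shape[0].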
        t = X.dot(theta) - y

        cost = t.dot(t)
        cost += self.lambda_ * theta.dot(theta)
        cost /= 2.0 * X.shape[0]

        grad = X.T.dot(t)
        grad += self.lambda_ * theta
        grad /= X.shape[0]

        return cost, grad

    def fit(self, X, Y, W):
        if len(Y.shape) > 1 and Y.shape[1] > 1:
            raise ValueError(
                "Linear regression does not support multi-target regression")

        if np.isnan(np.sum(X)) or np.isnan(np.sum(Y)):
            raise ValueError("Linear regression does not support unknown values")

        theta = np.zeros(X.shape[1])
        theta, cost, ret = fmin_l_bfgs_b(
            self.cost_grad, theta, args=(X, Y.ravel()), **self.fmin_args
        )

        return LinearRegressionModel(theta)
Example #18
class LRRulesLearner(Learner):
    """
    This learner first induces a set of rules using the provided
    rule learner. The learned rules are then encoded as binary attributes
    (0 - not covered, 1 - covered) and, together with the original
    attributes, form the set of attributes used for logistic regression
    learning.

    If rule_learner is not provided, this acts as ordinary
    logistic regression.

    The logistic regression fitter uses an L2-penalized loss function.
    To prevent overfitting due to attributes built from rules,
    the weights of new rule-based attributes are penalized more (see
    Možina et al., Extreme value correction in rule learning).

    TODO: weights are not supported yet.
    """
    name = 'logreg rules'
    preprocessors = [HasClass(), RemoveNaNColumns(), Impute()]

    def __init__(self,
                 preprocessors=None,
                 penalty=1,
                 opt_penalty=False,
                 rule_learner=None,
                 basic_attributes=True,
                 fit_intercept=True,
                 intercept_scaling=2,
                 penalize_rules=True):
        """
        Parameters
        ----------
        preprocessors :
            A sequence of data preprocessors to apply on data prior to
            fitting the model.
        penalty : L2 penalty in the loss function.
        rule_learner : Rule learner used to construct new attributes.
        fit_intercept : Should we add a constant column to the data?
        intercept_scaling : Value of the constant in the intercept column. Note
            that the intercept column is appended after normalization, so higher
            values are less affected by penalization.
        """
        super().__init__(preprocessors)
        self.penalty = penalty
        self.opt_penalty = opt_penalty
        self.rule_learner = rule_learner
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.basic_attributes = basic_attributes
        self.penalize_rules = penalize_rules
        # Post rule learning preprocessing should not decrease the
        # number of examples.
        self.post_rule_preprocess = [Normalize(), Continuize()]

    def fit_storage(self, data):
        if self.opt_penalty:
            self.penalty = self.tune_penalty(data)
        # learn rules
        rules = self.rule_learner(data).rule_list if self.rule_learner else []
        # preprocess data
        if not self.basic_attributes:
            domain = Domain([], data.domain.class_vars, data.domain.metas)
            data = data.from_table(domain, data)
        for pp in self.post_rule_preprocess:
            data = pp(data)
        # create data
        X, Y, W = data.X, data.Y, data.W if data.W.size else None
        # 1. add rules to X
        Xr = np.concatenate([X] +
                            [r.covered_examples[:, np.newaxis] for r in rules],
                            axis=1)
        # 2. add constant to X
        if self.fit_intercept:
            Xr = self.add_intercept(self.intercept_scaling, Xr)
        # set additional penalties that penalized rule-based attributes
        gamma = self.get_gamma(X, rules)
        # build model
        w = []
        se = []
        if len(self.domain.class_var.values) > 2:
            for cli, _ in enumerate(self.domain.class_var.values):
                # create class with domain {-1, 1}
                yc = np.ones_like(Y)
                yc[Y != cli] = -1
                # set bounds
                bounds = self.set_bounds(X, rules, cli)
                x, s = self.fit_params(Xr, yc, bounds, gamma)
                w.append(x)
                se.append(s)
        else:
            yc = np.ones_like(Y)
            yc[Y != 0] = -1
            bounds = self.set_bounds(X, rules, 0)
            x, s = self.fit_params(Xr, yc, bounds, gamma)
            w = [x, -x]
            se = [s, s]
        # remove zero weights and corresponding rules
        to_keep, final_rules = list(range(X.shape[1])), []
        for ri, r in enumerate(rules):
            if any(wi[X.shape[1] + ri] != 0 for wi in w):
                to_keep.append(X.shape[1] + ri)
                final_rules.append(r)
        if self.fit_intercept:
            to_keep.append(-1)
        w = [wi[to_keep] for wi in w]
        se = [s[to_keep] for s in se]
        return LRRulesClassifier(w, se, final_rules, self.fit_intercept,
                                 self.intercept_scaling, self.domain,
                                 data.domain)

    def tune_penalty(self, data):
        learner = LRRulesLearner(fit_intercept=self.fit_intercept,
                                 intercept_scaling=self.intercept_scaling)
        penalties = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10., 100.]
        scores = []
        for pen in penalties:
            learner.penalty = pen
            res = CrossValidation(data, [learner], k=5, random_state=1111)
            ll = LogLoss(res)
            scores.append(ll)
        return penalties[scores.index(min(scores))]

    def get_gamma(self, X, rules):
        gamma = [0] * X.shape[1]
        for r in rules:
            if self.penalize_rules:
                gamma.append(r.curr_class_dist[r.target_class] -
                             r.quality * r.curr_class_dist.sum())
            else:
                gamma.append(0)
        if self.fit_intercept:
            gamma.append(0)
        return np.array(gamma)

    @staticmethod
    def add_intercept(intercept, X):
        return np.hstack((X, intercept * np.ones((X.shape[0], 1))))

    def set_bounds(self, X, rules, cli):
        bounds = [(None, None) for _ in range(X.shape[1])]
        for r in rules:
            if r.target_class == cli:
                bounds.append((0, None))
            else:
                bounds.append((None, 0))
        if self.fit_intercept:
            bounds.append((None, None))
        return bounds

    def fit_params(self, X, y, bounds, gamma):
        w0 = np.zeros(X.shape[1])
        out = opt.minimize(self.ll,
                           w0,
                           args=(X, y, gamma),
                           method='TNC',
                           bounds=bounds,
                           jac=self.gradient)
        w = out.x
        # compute standard errors (s)
        z = self.phi(X.dot(w))
        weights = z * (1 - z)
        xwx = (X.T * weights).dot(X)
        diag = np.diag_indices(X.shape[1])
        xwx[diag] += self.penalty
        inv = np.linalg.inv(xwx)
        s = np.sqrt(inv[diag])
        return w, s

    @staticmethod
    def phi(t):
        # logistic function, returns 1 / (1 + exp(-t))
        idx = t > 0
        out = np.empty(t.size, dtype=float)
        out[idx] = 1. / (1 + np.exp(-t[idx]))
        exp_t = np.exp(t[~idx])
        out[~idx] = exp_t / (1. + exp_t)
        return out

    def ll(self, w, X, y, gamma):
        # loss function to be optimized, it's the logistic loss
        z = X.dot(w)
        yz = y * z
        idx = yz > 0
        out = np.zeros_like(yz)
        out[idx] = np.log(1 + np.exp(-yz[idx]))
        out[~idx] = (-yz[~idx] + np.log(1 + np.exp(yz[~idx])))
        out = out.sum()
        # add penalty
        out += (self.penalty * .5 * w).dot(w)
        # add second penalty (which is lasso-like and is a numpy array)
        out += gamma.dot(np.abs(w))
        return out

    def gradient(self, w, X, y, gamma):
        # gradient of the logistic loss (ll)
        z = X.dot(w)
        z = self.phi(y * z)
        z0 = (z - 1) * y
        gradll = X.T.dot(z0)
        # add penalties
        gradll += self.penalty * w
        # second penalty
        pos = w > 0
        neg = w < 0
        gradll[pos] += gamma[pos]
        gradll[neg] -= gamma[neg]
        return gradll