def test_has_class_filter_table(self):
    filter_ = HasClass()
    with_class = filter_(self.table)
    self.assertEqual(len(with_class), len(self.table) - self.n_missing)
    self.assertFalse(with_class.has_missing_class())

    filter_ = HasClass(negate=True)
    without_class = filter_(self.table)
    self.assertEqual(len(without_class), self.n_missing)
    self.assertTrue(without_class.has_missing_class())
def test_has_class_filter_instance(self):
    class_missing = self.table[9]
    class_present = self.table[0]

    filter_ = HasClass()
    self.assertFalse(filter_(class_missing))
    self.assertTrue(filter_(class_present))

    filter_ = HasClass(negate=True)
    self.assertTrue(filter_(class_missing))
    self.assertFalse(filter_(class_present))
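# A minimal standalone sketch of the HasClass API the two tests above
# exercise: called on a table it returns the filtered table, called on a
# single instance it returns a bool; negate=True inverts the selection.
# ("iris" ships with Orange and has no missing classes, so the negated
# filter yields an empty table here.)
from Orange.data import Table
from Orange.data.filter import HasClass

data = Table("iris")
with_class = HasClass()(data)                 # rows with a defined class
without_class = HasClass(negate=True)(data)   # rows with a missing class
print(len(with_class), len(without_class))    # 150 0
print(HasClass()(data[0]))                    # True for a single instance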
class NNTransferLearner(Learner):
    preprocessors = default_preprocessors = [
        HasClass(),
    ]
    __returns__ = NNTransferModel

    def __init__(self, original_model, preprocessors=None, fit_params=None):
        super().__init__(preprocessors=preprocessors)
        self.preprocessors.append(
            CorrectWavenumberRange(original_model.wavenumbers))
        self.preprocessors.append(Normalize(Normalize.Vector))
        self.original_model = original_model
        # avoid a mutable default argument; fall back to an empty dict
        self.fit_params = fit_params if fit_params is not None else {}

    def fit(self, X, Y, W=None):
        # TODO retraining would modify the original model,
        #      should we do a copy there?
        pretrained = self.original_model.load()
        for layer in pretrained.layers:
            layer.trainable = False
        y_onehot = keras.utils.np_utils.to_categorical(Y)
        last = Dense(y_onehot.shape[1], name='classify',
                     activation='softmax')(pretrained.layers[-2].output)
        model = KerasModel(inputs=pretrained.input, outputs=last)
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
                      metrics=['accuracy'])
        model.fit(X, y_onehot, epochs=50, **self.fit_params)
        return NNTransferModel(model)
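# A hedged sketch of the freeze-and-replace-head step used in fit() above,
# in plain Keras and outside the Orange wrapper. The model path
# "pretrained.h5" and the class count 3 are hypothetical placeholders.
from tensorflow import keras
from tensorflow.keras.layers import Dense

pretrained = keras.models.load_model("pretrained.h5")  # hypothetical path
for layer in pretrained.layers:
    layer.trainable = False  # freeze all pretrained weights

# new softmax head attached to the penultimate layer's output
last = Dense(3, activation="softmax", name="classify")(
    pretrained.layers[-2].output)
model = keras.Model(inputs=pretrained.input, outputs=last)
model.compile(loss="categorical_crossentropy", optimizer="rmsprop",
              metrics=["accuracy"])
# training would then fit only the new head:
# model.fit(X, keras.utils.to_categorical(y, num_classes=3), epochs=50)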
def set_train_data(self, data):
    """
    Set the input training dataset.

    Parameters
    ----------
    data : Optional[Orange.data.Table]
    """
    self.Information.data_sampled.clear()
    self.Error.train_data_empty.clear()
    self.Error.class_required.clear()
    self.Error.too_many_classes.clear()
    self.Error.only_one_class_var_value.clear()

    if data is not None and not len(data):
        self.Error.train_data_empty()
        data = None
    if data:
        conds = [
            not data.domain.class_vars,
            len(data.domain.class_vars) > 1,
            data.domain.has_discrete_class
            and len(data.domain.class_var.values) == 1
        ]
        errors = [
            self.Error.class_required,
            self.Error.too_many_classes,
            self.Error.only_one_class_var_value
        ]
        for cond, error in zip(conds, errors):
            if cond:
                error()
                data = None
                break

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.Information.data_sampled()
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.train_data_missing_vals = \
        data is not None and np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.Warning.missing_data(self._which_missing_data())
        if data:
            data = HasClass()(data)
    else:
        self.Warning.missing_data.clear()

    self.data = data
    self.closeContext()
    self._update_scorers()
    self._update_controls()
    if data is not None:
        self._update_class_selection()
        self.openContext(data.domain)
        if self.fold_feature_selected and bool(self.feature_model):
            self.resampling = OWTestLearners.FeatureFold
    self._invalidate()
def set_train_data(self, data):
    """
    Set the input training dataset.

    Parameters
    ----------
    data : Optional[Orange.data.Table]
    """
    self.cancel()
    self.Information.data_sampled.clear()
    self.Error.train_data_error.clear()

    if data is not None:
        data_errors = [
            ("Train dataset is empty.", len(data) == 0),
            ("Train data input requires a target variable.",
             not data.domain.class_vars),
            ("Too many target variables.",
             len(data.domain.class_vars) > 1),
            ("Target variable has no values.", np.isnan(data.Y).all()),
            ("Target variable has only one value.",
             data.domain.has_discrete_class and len(unique(data.Y)) < 2),
            ("Data has no features to learn from.", data.X.shape[1] == 0),
        ]

        for error_msg, cond in data_errors:
            if cond:
                self.Error.train_data_error(error_msg)
                data = None
                break

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.Information.data_sampled()
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.train_data_missing_vals = \
        data is not None and np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.Warning.missing_data(self._which_missing_data())
        if data:
            data = HasClass()(data)
    else:
        self.Warning.missing_data.clear()

    self.data = data
    self.closeContext()
    self._update_scorers()
    self._update_controls()
    if data is not None:
        self._update_class_selection()
        self.openContext(data.domain)
        if self.fold_feature_selected and bool(self.feature_model):
            self.resampling = OWTestAndScore.FeatureFold
    self._invalidate()
class Scorer(_RefuseDataInConstructor, Reprable):
    feature_type = None
    class_type = None
    supports_sparse_data = None
    preprocessors = [HasClass()]

    @property
    def friendly_name(self):
        """Return type name with camel-case separated into words.

        Derived classes can provide a better property or a class attribute.
        """
        return re.sub("([a-z])([A-Z])",
                      lambda mo: mo.group(1) + " " + mo.group(2).lower(),
                      type(self).__name__)

    @staticmethod
    def _friendly_vartype_name(vartype):
        if vartype == DiscreteVariable:
            return "categorical"
        if vartype == ContinuousVariable:
            return "numeric"
        # Fallbacks
        name = vartype.__name__
        if name.endswith("Variable"):
            return name.lower()[:-8]
        return name

    def __call__(self, data, feature=None):
        if not data.domain.class_var:
            raise ValueError(
                "{} requires data with a target variable."
                .format(self.friendly_name))
        if not isinstance(data.domain.class_var, self.class_type):
            raise ValueError(
                "{} requires a {} target variable."
                .format(self.friendly_name,
                        self._friendly_vartype_name(self.class_type)))

        if feature is not None:
            f = data.domain[feature]
            data = data.transform(Domain([f], data.domain.class_vars))

        for pp in self.preprocessors:
            data = pp(data)

        for var in data.domain.attributes:
            if not isinstance(var, self.feature_type):
                raise ValueError(
                    "{} cannot score {} variables."
                    .format(self.friendly_name,
                            self._friendly_vartype_name(type(var))))

        return self.score_data(data, feature)

    def score_data(self, data, feature):
        raise NotImplementedError
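# A hedged usage sketch: concrete Scorer subclasses such as Gini and
# InfoGain follow the __call__ contract shown above -- validate the target,
# run the preprocessors (including HasClass), then delegate to score_data.
# "titanic" ships with Orange and has discrete features and a discrete
# class, which these scorers require.
from Orange.data import Table
from Orange.preprocess.score import Gini, InfoGain

data = Table("titanic")
print(Gini()(data, data.domain.attributes[0]))  # score a single feature
print(InfoGain()(data, "sex"))                  # feature given by name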
def test_has_class_multiclass(self):
    domain = Domain([DiscreteVariable("x", values="01")],
                    [DiscreteVariable("y1", values="01"),
                     DiscreteVariable("y2", values="01")])
    table = Table(domain,
                  [[0, 1, np.nan],
                   [1, np.nan, 0],
                   [1, 0, 1],
                   [1, np.nan, np.nan]])
    table = HasClass()(table)
    self.assertTrue(not np.isnan(table).any())
    self.assertEqual(table.domain, domain)
    self.assertEqual(len(table), 1)
def __call__(self, data):
    """
    Remove rows that contain NaN in any class variable from the dataset
    and return the resulting data table.

    Parameters
    ----------
    data : an input dataset

    Returns
    -------
    data : dataset without rows with missing classes
    """
    return HasClass()(data)
class XGBBase(SklLearner):
    """Base class for xgboost (classification and regression) learners."""
    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
    ]

    def __init__(self, preprocessors=None, **kwargs):
        super().__init__(preprocessors=preprocessors)
        self.params = kwargs

    @SklLearner.params.setter
    def params(self, values: Dict):
        self._params = values
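# A hedged usage sketch: the concrete wrappers built on XGBBase (named
# XGBClassifier / XGBRegressor in Orange, assuming the xgboost extra is
# installed) forward xgboost's keyword arguments through **kwargs above.
from Orange.data import Table
from Orange.classification import XGBClassifier

data = Table("iris")
model = XGBClassifier(n_estimators=50, max_depth=3)(data)
print(model(data[:5]))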
def set_test_data(self, data):
    # type: (Orange.data.Table) -> None
    """
    Set the input separate testing dataset.

    Parameters
    ----------
    data : Optional[Orange.data.Table]
    """
    self.Information.test_data_sampled.clear()
    self.Error.test_data_empty.clear()
    if data is not None and not data:
        self.Error.test_data_empty()
        data = None
    if data and not data.domain.class_vars:
        self.Error.class_required_test()
        data = None
    else:
        self.Error.class_required_test.clear()

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.Information.test_data_sampled()
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.test_data_missing_vals = \
        data is not None and np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.Warning.missing_data(self._which_missing_data())
        if data:
            data = HasClass()(data)
    else:
        self.Warning.missing_data.clear()

    self.test_data = data
    if self.resampling == OWTestAndScore.TestOnTest:
        self._invalidate()
def test_reprs(self):
    flid = IsDefined(negate=True)
    flhc = HasClass()
    flr = Random()
    fld = FilterDiscrete(self.attr_disc, None)
    flsv = SameValue(self.attr_disc, self.value_disc, negate=True)
    flc = FilterContinuous(self.vs[0], FilterContinuous.Less, 5)
    flc2 = FilterContinuous(self.vs[1], FilterContinuous.Greater, 3)
    flv = Values([flc, flc2], conjunction=False, negate=True)
    flvf = ValueFilter(self.attr_disc)
    fls = FilterString("name", FilterString.Equal, "Aardvark",
                       case_sensitive=False)
    flsl = FilterStringList("name", ["Aardvark"], case_sensitive=False)
    flrx = FilterRegex("name", "^c...$")

    filters = [flid, flhc, flr, fld, flsv, flc, flv, flvf, fls, flsl, flrx]

    for f in filters:
        repr_str = repr(f)
        new_f = eval(repr_str)
        self.assertEqual(repr(new_f), repr_str)
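# A minimal sketch of the round-trip property the test above checks: a
# filter's repr is meant to be eval()-able back into an equivalent filter.
from Orange.data.filter import HasClass, IsDefined

for f in (HasClass(negate=True), IsDefined()):
    g = eval(repr(f))  # reconstruct the filter from its repr
    assert repr(g) == repr(f)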
class CatBoostLearnerRegression(CatBoostLearner, LearnerRegression):
    __wraps__ = None
    __returns__ = CatBoostModel
    supports_multiclass = True
    _params = {}
    learner_adequacy_err_msg = "Continuous class variable expected."
    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    def check_learner_adequacy(self, domain):
        return domain.has_continuous_class

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name] for name in spec.args[1:]
                if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support "
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances."""
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class CurveFitLearner(Learner):
    """
    Fit a function to data.
    It uses the scipy.curve_fit to find the optimal values of parameters.

    Parameters
    ----------
    expression : callable or str
        A modeling function. If callable, it must take the independent
        variable as the first argument and the parameters to fit as
        separate remaining arguments. If string, a lambda function is
        created, using `expression`, `available_feature_names`, `function`
        and `env` attributes. Should be string for pickling the model.
    parameters_names : list of str
        List of parameters names. Only needed when the expression is
        callable.
    features_names : list of str
        List of features names. Only needed when the expression is
        callable.
    available_feature_names : list of str
        List of all available features names. Only needed when the
        expression is string. Needed to distinguish between parameters and
        features when translating the expression into the lambda.
    functions : list of str
        List of all available functions. Only needed when the expression
        is string. Needed to distinguish between parameters and functions
        when translating the expression into the lambda.
    sanitizer : callable
        Function for sanitizing names.
    env : dict
        An environment to capture in the lambda's closure.
    p0 : list of floats, optional
        Initial guess for the parameters.
    bounds : 2-tuple of array_like, optional
        Lower and upper bounds on parameters.
    preprocessors : tuple of Orange preprocessors, optional
        The processors that will be used when data is passed to the
        learner.

    Examples
    --------
    >>> import numpy as np
    >>> from Orange.data import Table
    >>> from Orange.regression import CurveFitLearner
    >>> data = Table("housing")
    >>> # example with callable expression
    >>> cfun = lambda x, a, b, c: a * np.exp(-b * x[:, 0] * x[:, 1]) + c
    >>> learner = CurveFitLearner(cfun, ["a", "b", "c"], ["CRIM", "LSTAT"])
    >>> model = learner(data)
    >>> pred = model(data)
    >>> coef = model.coefficients
    >>> # example with str expression
    >>> sfun = "a * exp(-b * CRIM * LSTAT) + c"
    >>> names = [a.name for a in data.domain.attributes]
    >>> learner = CurveFitLearner(sfun, available_feature_names=names,
    ...                           functions=["exp"])
    >>> model = learner(data)
    >>> pred = model(data)
    >>> coef = model.coefficients

    """
    preprocessors = [HasClass(), RemoveNaNColumns(), Impute()]
    __returns__ = CurveFitModel
    name = "Curve Fit"

    def __init__(
            self,
            expression: Union[Callable, ast.Expression, str],
            parameters_names: Optional[List[str]] = None,
            features_names: Optional[List[str]] = None,
            available_feature_names: Optional[List[str]] = None,
            functions: Optional[List[str]] = None,
            sanitizer: Optional[Callable] = None,
            env: Optional[Dict[str, Any]] = None,
            p0: Union[List, Dict, None] = None,
            bounds: Union[Tuple, Dict] = (-np.inf, np.inf),
            preprocessors=None
    ):
        super().__init__(preprocessors)
        if callable(expression):
            if parameters_names is None:
                raise TypeError("Provide 'parameters_names' parameter.")
            if features_names is None:
                raise TypeError("Provide 'features_names' parameter.")
            args = None
            function = expression
        else:
            if available_feature_names is None:
                raise TypeError(
                    "Provide 'available_feature_names' parameter.")
            if functions is None:
                raise TypeError("Provide 'functions' parameter.")
            args = dict(expression=expression,
                        available_feature_names=available_feature_names,
                        functions=functions,
                        sanitizer=sanitizer,
                        env=env)
            function, parameters_names, features_names = \
                _create_lambda(**args)

        if isinstance(p0, dict):
            p0 = [p0.get(p, 1) for p in parameters_names]
        if isinstance(bounds, dict):
            d = [-np.inf, np.inf]
            lower_bounds = [bounds.get(p, d)[0] for p in parameters_names]
            upper_bounds = [bounds.get(p, d)[1] for p in parameters_names]
            bounds = lower_bounds, upper_bounds

        self.__function = function
        self.__parameters_names = parameters_names
        self.__features_names = features_names
        self.__p0 = p0
        self.__bounds = bounds
        # needed for pickling - if the expression is a lambda function,
        # the learner is not picklable
        self.__create_lambda_args = args

    @property
    def parameters_names(self) -> List[str]:
        return self.__parameters_names

    def fit_storage(self, data: Table) -> CurveFitModel:
        domain: Domain = data.domain
        attributes = []
        for attr in domain.attributes:
            if attr.name in self.__features_names:
                if not attr.is_continuous:
                    raise ValueError("Numeric feature expected.")
                attributes.append(attr)

        new_domain = Domain(attributes, domain.class_vars, domain.metas)
        transformed = data.transform(new_domain)
        params = curve_fit(self.__function, transformed.X, transformed.Y,
                           p0=self.__p0, bounds=self.__bounds)[0]
        return CurveFitModel(new_domain, domain,
                             self.__parameters_names, params,
                             self.__function, self.__create_lambda_args)

    def __getstate__(self) -> Dict:
        if not self.__create_lambda_args:
            raise AttributeError(
                "Can't pickle/copy callable. Use str expression instead."
            )
        state = self.__create_lambda_args.copy()
        state["parameters_names"] = None
        state["features_names"] = None
        state["p0"] = self.__p0
        state["bounds"] = self.__bounds
        state["preprocessors"] = self.preprocessors
        return state

    def __setstate__(self, state: Dict):
        expression = state.pop("expression")
        self.__init__(expression, **state)
class CatGBBaseLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to `[RemoveNaNClasses(), RemoveNaNColumns()]`
    """
    supports_weights = True
    __wraps__ = None
    __returns__ = CatGBModel
    _params = {}
    preprocessors = default_preprocessors = [
        HasClass(),
        RemoveNaNColumns(),
    ]

    # pylint: disable=unused-argument,too-many-arguments,too-many-locals
    def __init__(self, iterations=None, learning_rate=None, depth=None,
                 l2_leaf_reg=None, model_size_reg=None, rsm=None,
                 loss_function=None, border_count=None,
                 feature_border_type=None,
                 per_float_feature_quantization=None, input_borders=None,
                 output_borders=None, fold_permutation_block=None,
                 od_pval=None, od_wait=None, od_type=None, nan_mode=None,
                 counter_calc_method=None, leaf_estimation_iterations=None,
                 leaf_estimation_method=None, thread_count=None,
                 random_seed=None, use_best_model=None, verbose=False,
                 logging_level=None, metric_period=None,
                 ctr_leaf_count_limit=None, store_all_simple_ctr=None,
                 max_ctr_complexity=None, has_time=None,
                 allow_const_label=None, classes_count=None,
                 class_weights=None, one_hot_max_size=None,
                 random_strength=None, name=None, ignored_features=None,
                 train_dir=cache_dir(), custom_loss=None,
                 custom_metric=None, eval_metric=None,
                 bagging_temperature=None, save_snapshot=None,
                 snapshot_file=None, snapshot_interval=None,
                 fold_len_multiplier=None, used_ram_limit=None,
                 gpu_ram_part=None, allow_writing_files=False,
                 final_ctr_computation_mode=None,
                 approx_on_full_history=None, boosting_type=None,
                 simple_ctr=None, combinations_ctr=None,
                 per_feature_ctr=None, task_type=None, device_config=None,
                 devices=None, bootstrap_type=None, subsample=None,
                 sampling_unit=None, dev_score_calc_obj_block_size=None,
                 max_depth=None, n_estimators=None, num_boost_round=None,
                 num_trees=None, colsample_bylevel=None, random_state=None,
                 reg_lambda=None, objective=None, eta=None, max_bin=None,
                 scale_pos_weight=None, gpu_cat_features_storage=None,
                 data_partition=None, metadata=None,
                 early_stopping_rounds=None, cat_features=None,
                 grow_policy=None, min_data_in_leaf=None,
                 min_child_samples=None, max_leaves=None, num_leaves=None,
                 score_function=None, leaf_estimation_backtracking=None,
                 ctr_history_unit=None, monotone_constraints=None,
                 feature_weights=None, penalties_coefficient=None,
                 first_feature_use_penalties=None, model_shrink_rate=None,
                 model_shrink_mode=None, langevin=None,
                 diffusion_temperature=None, posterior_sampling=None,
                 boost_from_average=None, text_features=None,
                 tokenizers=None, dictionaries=None, feature_calcers=None,
                 text_processing=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_wrapper_params(value)

    def _get_wrapper_params(self, values):
        spec = list(
            inspect.signature(self.__wraps__.__init__).parameters.keys())
        return {name: values[name] for name in spec[1:] if name in values}

    def __call__(self, data, progress_callback=None):
        m = super().__call__(data, progress_callback)
        m.params = self.params
        return m

    def fit_storage(self, data: Table):
        domain, X, Y, W = data.domain, data.X, data.Y.reshape(-1), None
        if self.supports_weights and data.has_weights():
            W = data.W.reshape(-1)
        # pylint: disable=not-callable
        clf = self.__wraps__(**self.params)
        cat_features = [
            i for i, attr in enumerate(domain.attributes)
            if attr.is_discrete
        ]
        if cat_features:
            X = X.astype(str)
        cat_model = clf.fit(X, Y, cat_features=cat_features,
                            sample_weight=W)
        return self.__returns__(cat_model, cat_features, domain)

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
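# A hedged usage sketch: the concrete classification wrapper built on
# CatGBBaseLearner (named CatGBClassifier in Orange, assuming the catboost
# extra is installed) passes discrete features to catboost as categorical,
# as fit_storage above shows.
from Orange.data import Table
from Orange.classification import CatGBClassifier

data = Table("titanic")  # all-discrete features
model = CatGBClassifier(iterations=100)(data)
print(model(data[:5]))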
class SoftmaxRegressionLearner(Learner):
    """L2 regularized softmax regression classifier.

    Uses the L-BFGS algorithm to minimize the categorical cross entropy
    cost with L2 regularization. This model is suitable when dealing with
    a multi-class classification problem.

    When using this learner you should:

    - choose a suitable regularization parameter lambda\_,
    - consider using many logistic regression models (one for each value
      of the class variable) instead of softmax regression.

    Parameters
    ----------
    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting
        the data and keeping parameters small. Higher values of lambda\_
        force parameters to be smaller.

    preprocessors : list, optional
        Preprocessors are applied to data before training or testing.
        Defaults to
        `[RemoveNaNClasses(), RemoveNaNColumns(), Impute(), Continuize(), Normalize()]`:

        - remove rows with missing class values,
        - remove columns with all values as NaN,
        - replace NaN values with suitable values,
        - continuize all discrete attributes,
        - transform the dataset so that the columns are on a similar scale.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.
    """
    name = "softmax"
    preprocessors = [
        HasClass(),
        RemoveNaNColumns(),
        Impute(),
        Continuize(),
        Normalize(),
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, Theta_flat, X, Y):
        Theta = Theta_flat.reshape((self.num_classes, X.shape[1]))

        M = X.dot(Theta.T)
        P = np.exp(M - np.max(M, axis=1)[:, None])
        P /= np.sum(P, axis=1)[:, None]

        cost = -np.sum(np.log(P) * Y)
        cost += self.lambda_ * Theta_flat.dot(Theta_flat) / 2.0
        cost /= X.shape[0]

        grad = X.T.dot(P - Y).T
        grad += self.lambda_ * Theta
        grad /= X.shape[0]

        return cost, grad.ravel()

    def fit(self, X, y, W):
        if len(y.shape) > 1:
            raise ValueError("Softmax regression does not support "
                             "multi-label classification")

        if np.isnan(np.sum(X)) or np.isnan(np.sum(y)):
            raise ValueError("Softmax regression does not support "
                             "unknown values")

        X = np.hstack((X, np.ones((X.shape[0], 1))))

        self.num_classes = np.unique(y).size
        Y = np.eye(self.num_classes)[y.ravel().astype(int)]

        theta = np.zeros(self.num_classes * X.shape[1])
        theta, j, ret = fmin_l_bfgs_b(self.cost_grad, theta,
                                      args=(X, Y), **self.fmin_args)
        Theta = theta.reshape((self.num_classes, X.shape[1]))

        return SoftmaxRegressionModel(Theta)
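# A hedged usage sketch for the learner above; the module path follows the
# Orange source layout and should be treated as an assumption.
from Orange.data import Table
from Orange.classification.softmax_regression import \
    SoftmaxRegressionLearner

data = Table("iris")  # three-class problem
model = SoftmaxRegressionLearner(lambda_=1.0)(data)
print(model(data[:5]))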
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to
        `[RemoveNaNClasses(), Continuize(), SklImpute(), RemoveNaNColumns()]`
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = {}
    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name] for name in spec.args[1:]
                if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support "
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances."""
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
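# A hedged usage sketch: concrete wrappers such as LogisticRegressionLearner
# subclass SklLearner, so the HasClass -> Continuize -> RemoveNaNColumns ->
# SklImpute pipeline above runs before the wrapped sklearn estimator is fit.
# "heart_disease" ships with Orange and mixes discrete and continuous
# features, exercising the whole pipeline.
from Orange.data import Table
from Orange.classification import LogisticRegressionLearner

data = Table("heart_disease")
model = LogisticRegressionLearner(C=1.0)(data)
print(model(data[:5]))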
class LinearRegressionLearner(Learner):
    '''L2 regularized linear regression (a.k.a Ridge regression)

    This model uses the L-BFGS algorithm to minimize the linear least
    squares penalty with L2 regularization. When using this model you
    should:

    - Choose a suitable regularization parameter lambda\_
    - Consider appending a column of ones to the dataset (intercept term)

    Parameters
    ----------
    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting
        the data and keeping parameters small. Higher values of lambda\_
        force parameters to be smaller.

    preprocessors : list, optional
        Preprocessors are applied to data before training or testing.
        Default preprocessors
        `[HasClass(), Normalize(), Continuize(), Impute(), RemoveNaNColumns()]`:

        - remove rows with missing class values,
        - transform the dataset so that the columns are on a similar scale,
        - continuize all discrete attributes,
        - replace NaN values with suitable values,
        - remove columns with all values as NaN.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.

    Examples
    --------
    >>> import numpy as np
    >>> from Orange.data import Table
    >>> from Orange.regression.linear_bfgs import LinearRegressionLearner
    >>> data = Table('housing')
    >>> data.X = np.hstack((data.X,
    ...                     np.ones((data.X.shape[0], 1))))  # append ones
    >>> m = LinearRegressionLearner(lambda_=1.0)
    >>> c = m(data)  # fit
    >>> print(c(data))  # predict
    '''

    name = "linear_bfgs"
    preprocessors = [
        HasClass(),
        Normalize(),
        Continuize(),
        Impute(),
        RemoveNaNColumns(),
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, theta, X, y):
        t = X.dot(theta) - y

        cost = t.dot(t)
        cost += self.lambda_ * theta.dot(theta)
        cost /= 2.0 * X.shape[0]

        grad = X.T.dot(t)
        grad += self.lambda_ * theta
        grad /= X.shape[0]

        return cost, grad

    def fit(self, X, Y, W):
        if len(Y.shape) > 1 and Y.shape[1] > 1:
            raise ValueError("Linear regression does not support "
                             "multi-target classification")

        if np.isnan(np.sum(X)) or np.isnan(np.sum(Y)):
            raise ValueError("Linear regression does not support "
                             "unknown values")

        theta = np.zeros(X.shape[1])
        theta, cost, ret = fmin_l_bfgs_b(
            self.cost_grad, theta, args=(X, Y.ravel()), **self.fmin_args
        )

        return LinearRegressionModel(theta)
class LRRulesLearner(Learner):
    """
    Learner learns a set of rules by using the provided rule learner.
    Then, learned rules get encoded as binary attributes (0 - not covered,
    1 - covered) and together with the original set of attributes comprise
    the set of attributes used in logistic regression learning.

    If rule_learner is not provided, this acts as an ordinary logistic
    regression.

    The fitter for logistic regression uses the L2-penalized loss
    function. To prevent overfitting due to attributes built from rules,
    the weights of new rule-based attributes are penalized more (see
    Možina et al. Extreme value correction in rule learning).

    TODO: weights are not supported yet.
    """
    name = 'logreg rules'
    preprocessors = [HasClass(), RemoveNaNColumns(), Impute()]

    def __init__(self, preprocessors=None, penalty=1, opt_penalty=False,
                 rule_learner=None, basic_attributes=True,
                 fit_intercept=True, intercept_scaling=2,
                 penalize_rules=True):
        """
        Parameters
        ----------
        preprocessors :
            A sequence of data preprocessors to apply on data prior to
            fitting the model.
        penalty : L2-penalty in loss function.
        rule_learner : Rule learner used to construct new attributes.
        fit_intercept : Should we add a constant column to data?
        intercept_scaling : Value of constant in the intercept column.
            Note that the intercept column is appended after normalization,
            therefore higher values will be less affected by penalization.
        """
        super().__init__(preprocessors)
        self.penalty = penalty
        self.opt_penalty = opt_penalty
        self.rule_learner = rule_learner
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.basic_attributes = basic_attributes
        self.penalize_rules = penalize_rules
        # Post rule learning preprocessing should not decrease the
        # number of examples.
        self.post_rule_preprocess = [Normalize(), Continuize()]

    def fit_storage(self, data):
        if self.opt_penalty:
            self.penalty = self.tune_penalty(data)
        # learn rules
        rules = self.rule_learner(data).rule_list \
            if self.rule_learner else []
        # preprocess data
        if not self.basic_attributes:
            domain = Domain([], data.domain.class_vars, data.domain.metas)
            data = data.from_table(domain, data)
        for pp in self.post_rule_preprocess:
            data = pp(data)
        # create data (truth-testing a multi-element weight array is
        # ambiguous, hence the explicit .size check)
        X, Y, W = data.X, data.Y, data.W if data.W.size else None
        # 1. add rules to X
        Xr = np.concatenate(
            [X] + [r.covered_examples[:, np.newaxis] for r in rules],
            axis=1)
        # 2. add constant to X
        if self.fit_intercept:
            Xr = self.add_intercept(self.intercept_scaling, Xr)
        # set additional penalties that penalize rule-based attributes
        gamma = self.get_gamma(X, rules)
        # build model
        w = []
        se = []
        if len(self.domain.class_var.values) > 2:
            for cli, _ in enumerate(self.domain.class_var.values):
                # create class with domain {-1, 1}
                yc = np.ones_like(Y)
                yc[Y != cli] = -1
                # set bounds
                bounds = self.set_bounds(X, rules, cli)
                x, s = self.fit_params(Xr, yc, bounds, gamma)
                w.append(x)
                se.append(s)
        else:
            yc = np.ones_like(Y)
            yc[Y != 0] = -1
            bounds = self.set_bounds(X, rules, 0)
            x, s = self.fit_params(Xr, yc, bounds, gamma)
            w = [x, -x]
            se = [s, s]

        # remove zero weights and corresponding rules
        to_keep, final_rules = list(range(X.shape[1])), []
        for ri, r in enumerate(rules):
            if any(wi[X.shape[1] + ri] != 0 for wi in w):
                to_keep.append(X.shape[1] + ri)
                final_rules.append(r)
        if self.fit_intercept:
            to_keep.append(-1)
        w = [wi[to_keep] for wi in w]
        se = [s[to_keep] for s in se]

        return LRRulesClassifier(w, se, final_rules, self.fit_intercept,
                                 self.intercept_scaling, self.domain,
                                 data.domain)

    def tune_penalty(self, data):
        learner = LRRulesLearner(fit_intercept=self.fit_intercept,
                                 intercept_scaling=self.intercept_scaling)
        penalties = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10., 100.]
        scores = []
        for pen in penalties:
            learner.penalty = pen
            res = CrossValidation(data, [learner], k=5, random_state=1111)
            ll = LogLoss(res)
            scores.append(ll)
        return penalties[scores.index(min(scores))]

    def get_gamma(self, X, rules):
        gamma = [0] * X.shape[1]
        for r in rules:
            if self.penalize_rules:
                gamma.append(r.curr_class_dist[r.target_class] -
                             r.quality * r.curr_class_dist.sum())
            else:
                gamma.append(0)
        if self.fit_intercept:
            gamma.append(0)
        return np.array(gamma)

    @staticmethod
    def add_intercept(intercept, X):
        return np.hstack((X, intercept * np.ones((X.shape[0], 1))))

    def set_bounds(self, X, rules, cli):
        bounds = [(None, None) for _ in range(X.shape[1])]
        for r in rules:
            if r.target_class == cli:
                bounds.append((0, None))
            else:
                bounds.append((None, 0))
        if self.fit_intercept:
            bounds.append((None, None))
        return bounds

    def fit_params(self, X, y, bounds, gamma):
        w0 = np.zeros(X.shape[1])
        out = opt.minimize(self.ll, w0, args=(X, y, gamma), method='TNC',
                           bounds=bounds, jac=self.gradient)
        w = out.x
        # compute standard errors (s)
        z = self.phi(X.dot(w))
        weights = z * (1 - z)
        xwx = (X.T * weights).dot(X)
        diag = np.diag_indices(X.shape[1])
        xwx[diag] += self.penalty
        inv = np.linalg.inv(xwx)
        s = np.sqrt(inv[diag])
        return w, s

    @staticmethod
    def phi(t):
        # logistic function, returns 1 / (1 + exp(-t))
        idx = t > 0
        # np.float was removed from recent numpy; use the builtin float
        out = np.empty(t.size, dtype=float)
        out[idx] = 1. / (1 + np.exp(-t[idx]))
        exp_t = np.exp(t[~idx])
        out[~idx] = exp_t / (1. + exp_t)
        return out

    def ll(self, w, X, y, gamma):
        # loss function to be optimized, it's the logistic loss
        z = X.dot(w)
        yz = y * z
        idx = yz > 0
        out = np.zeros_like(yz)
        out[idx] = np.log(1 + np.exp(-yz[idx]))
        out[~idx] = (-yz[~idx] + np.log(1 + np.exp(yz[~idx])))
        out = out.sum()
        # add penalty
        out += (self.penalty * .5 * w).dot(w)
        # add second penalty (which is lasso-like and is a numpy array)
        out += gamma.dot(np.abs(w))
        return out

    def gradient(self, w, X, y, gamma):
        # gradient of the logistic loss (ll)
        z = X.dot(w)
        z = self.phi(y * z)
        z0 = (z - 1) * y
        gradll = X.T.dot(z0)
        # add penalties
        gradll += self.penalty * w
        # second penalty
        pos = w > 0
        neg = w < 0
        gradll[pos] += gamma[pos]
        gradll[neg] -= gamma[neg]
        return gradll
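# A hedged usage sketch for the class above: with no rule_learner it reduces
# to plain L2-penalized logistic regression, per its docstring. "titanic"
# ships with Orange and has a binary class, so the two-class branch of
# fit_storage is taken.
from Orange.data import Table

data = Table("titanic")
learner = LRRulesLearner(penalty=1, fit_intercept=True)
model = learner(data)
print(model(data[:5]))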