def test_column_filtering(self):
    data = Table("iris")
    data.X[:, (1, 3)] = np.nan
    new_data = RemoveNaNColumns()(data)
    self.assertEqual(len(new_data.domain.attributes),
                     len(data.domain.attributes) - 2)

    data = Table("iris")
    data.X[0, 0] = np.nan
    new_data = RemoveNaNColumns()(data)
    self.assertEqual(len(new_data.domain.attributes),
                     len(data.domain.attributes))
def test_column_filtering_sparse(self):
    data = Table("iris")
    with data.unlocked():
        data.X = csr_matrix(data.X)
    new_data = RemoveNaNColumns()(data)
    self.assertEqual(data, new_data)
def test_input_preprocessors(self):
    """Check multiple preprocessors on input"""
    pp_list = PreprocessorList([Randomize(), RemoveNaNColumns()])
    self.send_signal("Preprocessor", pp_list)
    self.widget.apply_button.button.click()
    self.assertEqual((pp_list, ), self.widget.learner.preprocessors,
                     '`PreprocessorList` was not added to preprocessors')
class NaiveBayesLearner(Learner):
    """
    Naive Bayes classifier. Works only with discrete attributes. By default,
    continuous attributes are discretized.

    Parameters
    ----------
    preprocessors : list, optional (default=[RemoveNaNColumns(), Discretize()])
        An ordered list of preprocessors applied to data before training
        or testing.
    """
    preprocessors = [RemoveNaNColumns(), Discretize()]
    name = "naive bayes"

    def fit_storage(self, table):
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        if not all(var.is_discrete for var in table.domain.variables):
            raise NotImplementedError("Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        class_freq = np.array(
            np.diag(contingency.get_contingency(table, table.domain.class_var))
        )
        # Laplace-smoothed class priors and per-attribute conditional
        # log-probabilities.
        class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))
        log_cont_prob = [
            np.log(
                (np.array(c) + 1)
                / (np.sum(np.array(c), axis=0)[None, :] + c.shape[0])
                / class_prob[:, None]
            )
            for c in cont
        ]
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
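# A minimal usage sketch (not part of the original source) for the learner
# above, using Orange's bundled all-discrete "titanic" dataset; the
# RemoveNaNColumns() and Discretize() preprocessors run automatically.
from Orange.classification import NaiveBayesLearner
from Orange.data import Table

nb_data = Table("titanic")
nb_model = NaiveBayesLearner()(nb_data)  # fit
nb_predictions = nb_model(nb_data[:5])   # predicted class indices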
def compute_distances(self):
    if self.data is None or len(self.data) == 0 \
            or self.reference is None or len(self.reference) == 0:
        self.distances = None
        return
    distance = METRICS[self.distance_index][1]
    n_ref = len(self.reference)
    all_data = Table.concatenate([self.reference, self.data], 0)
    pp_all_data = Impute()(RemoveNaNColumns()(all_data))
    pp_reference, pp_data = pp_all_data[:n_ref], pp_all_data[n_ref:]
    self.distances = distance(pp_data, pp_reference).min(axis=1)
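# A small sketch (illustrative, not from the source; the reference/data split
# is hypothetical) of the pattern used above: reference and data are
# concatenated before preprocessing so both end up in the same domain, which
# keeps the distance computation well defined.
from Orange.data import Table
from Orange.preprocess import Impute, RemoveNaNColumns

reference, rest = Table("iris")[:10], Table("iris")[10:]
combined = Table.concatenate([reference, rest], 0)
preprocessed = Impute()(RemoveNaNColumns()(combined))
pp_reference = preprocessed[:len(reference)]
pp_rest = preprocessed[len(reference):]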
class EllipticEnvelopeLearner(SklLearner):
    __wraps__ = skl_covariance.EllipticEnvelope
    __returns__ = EllipticEnvelopeClassifier
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute()]

    def __init__(self, store_precision=True, assume_centered=False,
                 support_fraction=None, contamination=0.1,
                 random_state=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()
class XGBBase(SklLearner):
    """Base class for xgboost (classification and regression) learners."""
    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
    ]

    def __init__(self, preprocessors=None, **kwargs):
        super().__init__(preprocessors=preprocessors)
        self.params = kwargs

    @SklLearner.params.setter
    def params(self, values: Dict):
        self._params = values
class PolynomialLearner(Learner):
    name = 'poly learner'
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute()]

    def __init__(self, learner, degree=1, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.degree = degree
        self.learner = learner

    def fit(self, X, Y, W):
        polyfeatures = skl_preprocessing.PolynomialFeatures(self.degree)
        X = polyfeatures.fit_transform(X)
        clf = self.learner
        if W is None or not self.supports_weights:
            model = clf.fit(X, Y, None)
        else:
            model = clf.fit(X, Y, sample_weight=W.reshape(-1))
        return PolynomialModel(model, polyfeatures)
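# A brief sketch (illustrative, not from the source) of what the
# PolynomialFeatures expansion in fit() above does to the design matrix:
import numpy as np
from sklearn import preprocessing as skl_preprocessing

X_demo = np.array([[2.0], [3.0]])
poly = skl_preprocessing.PolynomialFeatures(degree=2)
# columns become [1, x, x^2] -> [[1, 2, 4], [1, 3, 9]]
X_expanded = poly.fit_transform(X_demo)
assert np.allclose(X_expanded, [[1, 2, 4], [1, 3, 9]])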
class TreeRegressionLearner(SklLearner):
    __wraps__ = skl_tree.DecisionTreeRegressor
    __returns__ = TreeRegressor
    name = 'regression tree'
    preprocessors = [RemoveNaNColumns(), SklImpute(), Continuize()]

    def __init__(self, criterion="mse", splitter="best", max_depth=None,
                 min_samples_split=2, min_samples_leaf=1, max_features=None,
                 random_state=None, max_leaf_nodes=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()
class NaiveBayesLearner(Learner):
    """
    Naive Bayes classifier. Works only with discrete attributes. By default,
    continuous attributes are discretized.

    Parameters
    ----------
    preprocessors : list, optional (default=[RemoveNaNColumns(), Discretize()])
        An ordered list of preprocessors applied to data before training
        or testing.
    """
    preprocessors = [RemoveNaNColumns(), Discretize()]
    name = 'naive bayes'

    def fit_storage(self, table):
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        if not all(var.is_discrete for var in table.domain.variables):
            raise NotImplementedError("Only categorical variables are "
                                      "supported.")

        cont = contingency.get_contingencies(table)
        class_freq = np.array(
            np.diag(contingency.get_contingency(table,
                                                table.domain.class_var)))
        nclss = (class_freq != 0).sum()
        if not nclss:
            raise ValueError("Data has no defined target values.")

        # Laplacian smoothing considers only classes that appear in the data,
        # in part to avoid cases where the probabilities are affected by empty
        # (or completely spurious) classes that appear because of Orange's
        # reuse of variables. See GH-2943.
        # The corresponding elements of class_prob are set to zero only after
        # mock non-zero values are used in the computation of log_cont_prob,
        # to prevent division by zero.
        class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
        log_cont_prob = [
            np.log((np.array(c) + 1) /
                   (np.sum(np.array(c), axis=0)[None, :] + nclss) /
                   class_prob[:, None])
            for c in cont
        ]
        class_prob[class_freq == 0] = 0

        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
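# A worked check (illustrative, not from the source) of the Laplace smoothing
# used above: with observed class counts [3, 1] and two observed classes, the
# smoothed priors are (3 + 1) / (4 + 2) = 2/3 and (1 + 1) / (4 + 2) = 1/3.
import numpy as np

demo_class_freq = np.array([3.0, 1.0])
demo_nclss = (demo_class_freq != 0).sum()  # 2 observed classes
demo_class_prob = (demo_class_freq + 1) / (demo_class_freq.sum() + demo_nclss)
assert np.allclose(demo_class_prob, [2 / 3, 1 / 3])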
def apply(self):
    if self.data is None or self.reference is None:
        self.send("Neighbors", None)
        return
    distance = self.DISTANCES[self.distance_index]
    n_data, n_ref = len(self.data), len(self.reference)
    all_data = Table.concatenate([self.reference, self.data], 0)
    pp_all_data = Impute()(RemoveNaNColumns()(all_data))
    pp_data, pp_reference = pp_all_data[n_ref:], pp_all_data[:n_ref]
    dist = distance(np.vstack((pp_data, pp_reference)))[:n_data, n_data:]
    data = self._add_similarity(self.data, dist)
    sorted_indices = list(np.argsort(dist.flatten()))[::-1]
    indices = []
    while len(sorted_indices) > 0 and len(indices) < self.n_neighbors:
        index = int(sorted_indices.pop() / len(self.reference))
        if (self.data[index] not in self.reference
                or not self.exclude_reference) and index not in indices:
            indices.append(index)
    neighbours = data[indices]
    neighbours.attributes = self.data.attributes
    self.send("Neighbors", neighbours)
def compute_distances(self):
    self.Error.diff_domains.clear()
    if not self.data or not self.reference:
        self.distances = None
        return
    if set(self.reference.domain.attributes) != \
            set(self.data.domain.attributes):
        self.Error.diff_domains()
        self.distances = None
        return
    metric = METRICS[self.distance_index][1]
    n_ref = len(self.reference)

    # compare only attributes, without metas and class vars
    new_domain = Domain(self.data.domain.attributes)
    reference = self.reference.transform(new_domain)
    data = self.data.transform(new_domain)

    all_data = Table.concatenate([reference, data], 0)
    pp_all_data = Impute()(RemoveNaNColumns()(all_data))
    pp_reference, pp_data = pp_all_data[:n_ref], pp_all_data[n_ref:]
    self.distances = metric(pp_data, pp_reference).min(axis=1)
class SoftmaxLearner(Learner):
    """
    Implementation of softmax regression with k*(n+1) parameters, trained
    using L-BFGS optimization.
    """
    name = 'softmax'
    preprocessors = [
        RemoveNaNClasses(), Normalize(), Continuize(), Impute(),
        RemoveNaNColumns()
    ]

    def __init__(self, preprocessors=None):
        super().__init__(preprocessors=preprocessors)

    def mysigma(self, x):
        """
        Softmax function. Expects one example per row. The row-wise maximum
        is subtracted before exponentiation to prevent overflow; this may
        cause underflow, which is harmless here.
        """
        tmpx = np.exp(x - np.max(x, axis=1)[:, None])
        return tmpx / np.sum(tmpx, axis=1)[:, None]

    def cost(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            float: The value of the cost function evaluated with the
                given parameters.
        """
        # Reshape theta from a flat vector into matrix form, then build the
        # one-hot indicator matrix for the targets.
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(np.sum(indicator * np.log(self.mysigma(X.dot(theta.T)))))

    def grad(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            np.ndarray: Gradients wrt. all model parameters, of shape
                [n_classes * n_features]
        """
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(X.T.dot(
            indicator - self.mysigma(X.dot(theta.T)))).T.flatten()

    def approx_grad(self, theta, X, y, eps=1e-5):
        """
        Estimates the gradient numerically, with central differences.

        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            eps (float): value offset for gradient estimation

        Returns:
            np.ndarray: Gradients wrt. all model parameters, of shape
                [n_classes * n_features]
        """
        result = []
        for i in range(len(theta)):
            crr = np.zeros(len(theta))
            crr[i] = 1
            result.append((self.cost(theta + (crr * eps), X, y) -
                           self.cost(theta - (crr * eps), X, y)) / (2 * eps))
        return np.array(result)

    def fit(self, X, y, W=None):
        """
        Args:
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            W (np.ndarray): Orange weights - ignored for this exercise

        Returns:
            SoftmaxModel: Orange's classification model
        """
        num_classes = len(np.unique(y))  # we assume all classes are present
        X = np.column_stack((np.ones(X.shape[0]), X))
        theta = np.ones(num_classes * X.shape[1]) * 1e-9
        result = fmin_l_bfgs_b(self.cost, theta, self.grad, args=(X, y))[0]
        return SoftmaxModel(result.reshape((-1, X.shape[1])))
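# A small self-check sketch (illustrative, not from the source) using the
# class above: the analytic gradient should agree with the central-difference
# estimate that approx_grad provides.
import numpy as np

sm_learner = SoftmaxLearner()
rng = np.random.RandomState(0)
X_chk = np.column_stack((np.ones(6), rng.rand(6, 2)))  # bias column included
y_chk = np.array([0, 1, 2, 0, 1, 2])
theta_chk = rng.rand(3 * X_chk.shape[1])
assert np.allclose(sm_learner.grad(theta_chk, X_chk, y_chk),
                   sm_learner.approx_grad(theta_chk, X_chk, y_chk),
                   atol=1e-4)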
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to
        `[HasClass(), Continuize(), RemoveNaNColumns(), SklImpute()]`
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = {}

    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name] for name in spec.args[1:]
                if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support "
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances."""
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class CatBoostLearnerRegression(CatBoostLearner, LearnerRegression):
    __wraps__ = None
    __returns__ = CatBoostModel
    supports_multiclass = True
    _params = {}
    learner_adequacy_err_msg = "Continuous class variable expected."

    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    def check_learner_adequacy(self, domain):
        return domain.has_continuous_class

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name] for name in spec.args[1:]
                if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support "
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances."""
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class CurveFitLearner(Learner):
    """
    Fit a function to data. It uses the scipy.curve_fit to find the
    optimal values of parameters.

    Parameters
    ----------
    expression : callable or str
        A modeling function. If callable, it must take the independent
        variable as the first argument and the parameters to fit as
        separate remaining arguments. If string, a lambda function is
        created, using `expression`, `available_feature_names`, `function`
        and `env` attributes. Should be string for pickling the model.
    parameters_names : list of str
        List of parameters names. Only needed when the expression
        is callable.
    features_names : list of str
        List of features names. Only needed when the expression
        is callable.
    available_feature_names : list of str
        List of all available features names. Only needed when the
        expression is string. Needed to distinguish between parameters and
        features when translating the expression into the lambda.
    functions : list of str
        List of all available functions. Only needed when the expression
        is string. Needed to distinguish between parameters and functions
        when translating the expression into the lambda.
    sanitizer : callable
        Function for sanitizing names.
    env : dict
        An environment to capture in the lambda's closure.
    p0 : list of floats, optional
        Initial guess for the parameters.
    bounds : 2-tuple of array_like, optional
        Lower and upper bounds on parameters.
    preprocessors : tuple of Orange preprocessors, optional
        The processors that will be used when data is passed to the learner.

    Examples
    --------
    >>> import numpy as np
    >>> from Orange.data import Table
    >>> from Orange.regression import CurveFitLearner
    >>> data = Table("housing")
    >>> # example with callable expression
    >>> cfun = lambda x, a, b, c: a * np.exp(-b * x[:, 0] * x[:, 1]) + c
    >>> learner = CurveFitLearner(cfun, ["a", "b", "c"], ["CRIM", "LSTAT"])
    >>> model = learner(data)
    >>> pred = model(data)
    >>> coef = model.coefficients
    >>> # example with str expression
    >>> sfun = "a * exp(-b * CRIM * LSTAT) + c"
    >>> names = [a.name for a in data.domain.attributes]
    >>> learner = CurveFitLearner(sfun, available_feature_names=names,
    ...                           functions=["exp"])
    >>> model = learner(data)
    >>> pred = model(data)
    >>> coef = model.coefficients

    """
    preprocessors = [HasClass(), RemoveNaNColumns(), Impute()]
    __returns__ = CurveFitModel
    name = "Curve Fit"

    def __init__(
            self,
            expression: Union[Callable, ast.Expression, str],
            parameters_names: Optional[List[str]] = None,
            features_names: Optional[List[str]] = None,
            available_feature_names: Optional[List[str]] = None,
            functions: Optional[List[str]] = None,
            sanitizer: Optional[Callable] = None,
            env: Optional[Dict[str, Any]] = None,
            p0: Union[List, Dict, None] = None,
            bounds: Union[Tuple, Dict] = (-np.inf, np.inf),
            preprocessors=None
    ):
        super().__init__(preprocessors)
        if callable(expression):
            if parameters_names is None:
                raise TypeError("Provide 'parameters_names' parameter.")
            if features_names is None:
                raise TypeError("Provide 'features_names' parameter.")
            args = None
            function = expression
        else:
            if available_feature_names is None:
                raise TypeError("Provide 'available_feature_names' "
                                "parameter.")
            if functions is None:
                raise TypeError("Provide 'functions' parameter.")
            args = dict(expression=expression,
                        available_feature_names=available_feature_names,
                        functions=functions,
                        sanitizer=sanitizer,
                        env=env)
            function, parameters_names, features_names = \
                _create_lambda(**args)

        if isinstance(p0, dict):
            p0 = [p0.get(p, 1) for p in parameters_names]
        if isinstance(bounds, dict):
            d = [-np.inf, np.inf]
            lower_bounds = [bounds.get(p, d)[0] for p in parameters_names]
            upper_bounds = [bounds.get(p, d)[1] for p in parameters_names]
            bounds = lower_bounds, upper_bounds

        self.__function = function
        self.__parameters_names = parameters_names
        self.__features_names = features_names
        self.__p0 = p0
        self.__bounds = bounds
        # needed for pickling - if the expression is a lambda function,
        # the learner is not picklable
        self.__create_lambda_args = args

    @property
    def parameters_names(self) -> List[str]:
        return self.__parameters_names

    def fit_storage(self, data: Table) -> CurveFitModel:
        domain: Domain = data.domain
        attributes = []
        for attr in domain.attributes:
            if attr.name in self.__features_names:
                if not attr.is_continuous:
                    raise ValueError("Numeric feature expected.")
                attributes.append(attr)

        new_domain = Domain(attributes, domain.class_vars, domain.metas)
        transformed = data.transform(new_domain)
        params = curve_fit(self.__function, transformed.X, transformed.Y,
                           p0=self.__p0, bounds=self.__bounds)[0]
        return CurveFitModel(new_domain, domain, self.__parameters_names,
                             params, self.__function,
                             self.__create_lambda_args)

    def __getstate__(self) -> Dict:
        if not self.__create_lambda_args:
            raise AttributeError(
                "Can't pickle/copy callable. Use str expression instead."
            )
        state = self.__create_lambda_args.copy()
        state["parameters_names"] = None
        state["features_names"] = None
        state["p0"] = self.__p0
        state["bounds"] = self.__bounds
        state["preprocessors"] = self.preprocessors
        return state

    def __setstate__(self, state: Dict):
        expression = state.pop("expression")
        self.__init__(expression, **state)
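# A hedged sketch (not from the source, reusing the feature names from the
# docstring example): learners built from a string expression round-trip
# through pickle via __getstate__/__setstate__ above, while callable
# expressions raise AttributeError instead.
import pickle

picklable_learner = CurveFitLearner(
    "a * exp(-b * CRIM * LSTAT) + c",
    available_feature_names=["CRIM", "LSTAT"],
    functions=["exp"])
restored_learner = pickle.loads(pickle.dumps(picklable_learner))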
class LinearRegressionLearner(Learner):
    '''L2 regularized linear regression (a.k.a Ridge regression)

    This model uses the L-BFGS algorithm to minimize the linear least
    squares penalty with L2 regularization. When using this model you
    should:

    - Choose a suitable regularization parameter lambda_
    - Consider appending a column of ones to the dataset (intercept term)

    Parameters
    ----------
    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional
        Preprocessors are applied to data before training or testing. Default
        preprocessors
        `[HasClass(), Normalize(), Continuize(), Impute(), RemoveNaNColumns()]`:

        - transform the dataset so that the columns are on a similar scale,
        - continuize all discrete attributes,
        - replace NaN values with suitable values,
        - remove columns with all values as NaN.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.

    Examples
    --------
    >>> import numpy as np
    >>> from Orange.data import Table
    >>> from Orange.regression.linear_bfgs import LinearRegressionLearner
    >>> data = Table('housing')
    >>> data.X = np.hstack((data.X, np.ones((data.X.shape[0], 1))))  # append ones
    >>> m = LinearRegressionLearner(lambda_=1.0)
    >>> c = m(data)  # fit
    >>> print(c(data))  # predict
    '''

    name = "linear_bfgs"
    preprocessors = [
        HasClass(),
        Normalize(),
        Continuize(),
        Impute(),
        RemoveNaNColumns(),
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, theta, X, y):
        t = X.dot(theta) - y

        cost = t.dot(t)
        cost += self.lambda_ * theta.dot(theta)
        cost /= 2.0 * X.shape[0]

        grad = X.T.dot(t)
        grad += self.lambda_ * theta
        grad /= X.shape[0]

        return cost, grad

    def fit(self, X, Y, W):
        if len(Y.shape) > 1 and Y.shape[1] > 1:
            raise ValueError("Linear regression does not support "
                             "multi-target regression")

        if np.isnan(np.sum(X)) or np.isnan(np.sum(Y)):
            raise ValueError("Linear regression does not support "
                             "unknown values")

        theta = np.zeros(X.shape[1])
        theta, cost, ret = fmin_l_bfgs_b(
            self.cost_grad, theta, args=(X, Y.ravel()), **self.fmin_args
        )

        return LinearRegressionModel(theta)
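# A consistency sketch (illustrative, not from the source): cost_grad above
# is the ridge objective (||X theta - y||^2 + lambda ||theta||^2) / (2n), so
# its minimizer should match the closed-form ridge solution
# theta = (X^T X + lambda I)^(-1) X^T y.
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

rng = np.random.RandomState(0)
X_r = rng.rand(50, 3)
y_r = X_r.dot([1.0, -2.0, 0.5]) + 0.01 * rng.randn(50)
ridge_lambda = 1.0

def ridge_cost_grad(theta):
    t = X_r.dot(theta) - y_r
    cost = (t.dot(t) + ridge_lambda * theta.dot(theta)) / (2.0 * X_r.shape[0])
    grad = (X_r.T.dot(t) + ridge_lambda * theta) / X_r.shape[0]
    return cost, grad

theta_lbfgs = fmin_l_bfgs_b(ridge_cost_grad, np.zeros(3))[0]
theta_closed = np.linalg.solve(X_r.T @ X_r + ridge_lambda * np.eye(3),
                               X_r.T @ y_r)
assert np.allclose(theta_lbfgs, theta_closed, atol=1e-3)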
class CatGBBaseLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to `[HasClass(), RemoveNaNColumns()]`
    """
    supports_weights = True
    __wraps__ = None
    __returns__ = CatGBModel
    _params = {}
    preprocessors = default_preprocessors = [
        HasClass(),
        RemoveNaNColumns(),
    ]

    # pylint: disable=unused-argument,too-many-arguments,too-many-locals
    def __init__(self,
                 iterations=None, learning_rate=None, depth=None,
                 l2_leaf_reg=None, model_size_reg=None, rsm=None,
                 loss_function=None, border_count=None,
                 feature_border_type=None,
                 per_float_feature_quantization=None,
                 input_borders=None, output_borders=None,
                 fold_permutation_block=None, od_pval=None, od_wait=None,
                 od_type=None, nan_mode=None, counter_calc_method=None,
                 leaf_estimation_iterations=None,
                 leaf_estimation_method=None, thread_count=None,
                 random_seed=None, use_best_model=None, verbose=False,
                 logging_level=None, metric_period=None,
                 ctr_leaf_count_limit=None, store_all_simple_ctr=None,
                 max_ctr_complexity=None, has_time=None,
                 allow_const_label=None, classes_count=None,
                 class_weights=None, one_hot_max_size=None,
                 random_strength=None, name=None, ignored_features=None,
                 train_dir=cache_dir(), custom_loss=None,
                 custom_metric=None, eval_metric=None,
                 bagging_temperature=None, save_snapshot=None,
                 snapshot_file=None, snapshot_interval=None,
                 fold_len_multiplier=None, used_ram_limit=None,
                 gpu_ram_part=None, allow_writing_files=False,
                 final_ctr_computation_mode=None,
                 approx_on_full_history=None, boosting_type=None,
                 simple_ctr=None, combinations_ctr=None,
                 per_feature_ctr=None, task_type=None, device_config=None,
                 devices=None, bootstrap_type=None, subsample=None,
                 sampling_unit=None, dev_score_calc_obj_block_size=None,
                 max_depth=None, n_estimators=None, num_boost_round=None,
                 num_trees=None, colsample_bylevel=None, random_state=None,
                 reg_lambda=None, objective=None, eta=None, max_bin=None,
                 scale_pos_weight=None, gpu_cat_features_storage=None,
                 data_partition=None, metadata=None,
                 early_stopping_rounds=None, cat_features=None,
                 grow_policy=None, min_data_in_leaf=None,
                 min_child_samples=None, max_leaves=None, num_leaves=None,
                 score_function=None, leaf_estimation_backtracking=None,
                 ctr_history_unit=None, monotone_constraints=None,
                 feature_weights=None, penalties_coefficient=None,
                 first_feature_use_penalties=None, model_shrink_rate=None,
                 model_shrink_mode=None, langevin=None,
                 diffusion_temperature=None, posterior_sampling=None,
                 boost_from_average=None, text_features=None,
                 tokenizers=None, dictionaries=None, feature_calcers=None,
                 text_processing=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_wrapper_params(value)

    def _get_wrapper_params(self, values):
        spec = list(
            inspect.signature(self.__wraps__.__init__).parameters.keys())
        return {name: values[name] for name in spec[1:] if name in values}

    def __call__(self, data, progress_callback=None):
        m = super().__call__(data, progress_callback)
        m.params = self.params
        return m

    def fit_storage(self, data: Table):
        domain, X, Y, W = data.domain, data.X, data.Y.reshape(-1), None
        if self.supports_weights and data.has_weights():
            W = data.W.reshape(-1)
        # pylint: disable=not-callable
        clf = self.__wraps__(**self.params)
        cat_features = [
            i for i, attr in enumerate(domain.attributes) if attr.is_discrete
        ]
        if cat_features:
            X = X.astype(str)
        cat_model = clf.fit(X, Y, cat_features=cat_features, sample_weight=W)
        return self.__returns__(cat_model, cat_features, domain)

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        (default=[Continuize(), RemoveNaNColumns(), SklImpute(force=False)])
        An ordered list of preprocessors applied to data before training
        or testing.
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = None

    name = 'skl learner'
    preprocessors = [Continuize(),
                     RemoveNaNColumns(),
                     SklImpute(force=False)]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {name: values[name] for name in spec.args[1:]
                      if name in values}
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support "
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.used_vals = [np.unique(y) for y in data.Y[:, None].T]
        m.params = self.params
        return m

    def fit(self, X, Y, W):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    def __repr__(self):
        return '{} {}'.format(self.name, self.params)
class LRRulesLearner(Learner):
    """
    Learns a set of rules by using the provided rule learner. The learned
    rules are then encoded as binary attributes (0 - not covered,
    1 - covered) and, together with the original attributes, comprise the
    attribute set used in logistic regression learning.

    If rule_learner is not provided, this acts as an ordinary logistic
    regression.

    The fitter for logistic regression uses the L2-penalized loss function.
    To prevent overfitting due to attributes built from rules, the weights
    of new rule-based attributes are penalized more (see Možina et al.,
    Extreme value correction in rule learning).

    TODO: weights are not supported yet.
    """
    name = 'logreg rules'
    preprocessors = [HasClass(), RemoveNaNColumns(), Impute()]

    def __init__(self, preprocessors=None, penalty=1, opt_penalty=False,
                 rule_learner=None, basic_attributes=True,
                 fit_intercept=True, intercept_scaling=2,
                 penalize_rules=True):
        """
        Parameters
        ----------
        preprocessors :
            A sequence of data preprocessors to apply on data prior to
            fitting the model.
        penalty :
            L2-penalty in the loss function.
        rule_learner :
            Rule learner used to construct new attributes.
        fit_intercept :
            Should we add a constant column to data?
        intercept_scaling :
            Value of the constant in the intercept column. Note that the
            intercept column is appended after normalization, therefore
            higher values will be less affected by penalization.
        """
        super().__init__(preprocessors)
        self.penalty = penalty
        self.opt_penalty = opt_penalty
        self.rule_learner = rule_learner
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.basic_attributes = basic_attributes
        self.penalize_rules = penalize_rules
        # Post rule learning preprocessing should not decrease the
        # number of examples.
        self.post_rule_preprocess = [Normalize(), Continuize()]

    def fit_storage(self, data):
        if self.opt_penalty:
            self.penalty = self.tune_penalty(data)
        # learn rules
        rules = self.rule_learner(data).rule_list if self.rule_learner else []
        # preprocess data
        if not self.basic_attributes:
            domain = Domain([], data.domain.class_vars, data.domain.metas)
            data = data.from_table(domain, data)
        for pp in self.post_rule_preprocess:
            data = pp(data)
        # create data
        X, Y, W = data.X, data.Y, data.W if data.W.size else None
        # 1. add rules to X
        Xr = np.concatenate([X] + [r.covered_examples[:, np.newaxis]
                                   for r in rules], axis=1)
        # 2. add constant to X
        if self.fit_intercept:
            Xr = self.add_intercept(self.intercept_scaling, Xr)
        # set additional penalties that penalize rule-based attributes
        gamma = self.get_gamma(X, rules)
        # build model
        w = []
        se = []
        if len(self.domain.class_var.values) > 2:
            for cli, _ in enumerate(self.domain.class_var.values):
                # create class with domain {-1, 1}
                yc = np.ones_like(Y)
                yc[Y != cli] = -1
                # set bounds
                bounds = self.set_bounds(X, rules, cli)
                x, s = self.fit_params(Xr, yc, bounds, gamma)
                w.append(x)
                se.append(s)
        else:
            yc = np.ones_like(Y)
            yc[Y != 0] = -1
            bounds = self.set_bounds(X, rules, 0)
            x, s = self.fit_params(Xr, yc, bounds, gamma)
            w = [x, -x]
            se = [s, s]

        # remove zero weights and corresponding rules
        to_keep, final_rules = list(range(X.shape[1])), []
        for ri, r in enumerate(rules):
            if any(wi[X.shape[1] + ri] != 0 for wi in w):
                to_keep.append(X.shape[1] + ri)
                final_rules.append(r)
        if self.fit_intercept:
            to_keep.append(-1)
        w = [wi[to_keep] for wi in w]
        se = [s[to_keep] for s in se]

        return LRRulesClassifier(w, se, final_rules, self.fit_intercept,
                                 self.intercept_scaling, self.domain,
                                 data.domain)

    def tune_penalty(self, data):
        learner = LRRulesLearner(fit_intercept=self.fit_intercept,
                                 intercept_scaling=self.intercept_scaling)
        penalties = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10., 100.]
        scores = []
        for pen in penalties:
            learner.penalty = pen
            res = CrossValidation(data, [learner], k=5, random_state=1111)
            ll = LogLoss(res)
            scores.append(ll)
        return penalties[scores.index(min(scores))]

    def get_gamma(self, X, rules):
        gamma = [0] * X.shape[1]
        for r in rules:
            if self.penalize_rules:
                gamma.append(r.curr_class_dist[r.target_class] -
                             r.quality * r.curr_class_dist.sum())
            else:
                gamma.append(0)
        if self.fit_intercept:
            gamma.append(0)
        return np.array(gamma)

    @staticmethod
    def add_intercept(intercept, X):
        return np.hstack((X, intercept * np.ones((X.shape[0], 1))))

    def set_bounds(self, X, rules, cli):
        bounds = [(None, None) for _ in range(X.shape[1])]
        for r in rules:
            if r.target_class == cli:
                bounds.append((0, None))
            else:
                bounds.append((None, 0))
        if self.fit_intercept:
            bounds.append((None, None))
        return bounds

    def fit_params(self, X, y, bounds, gamma):
        w0 = np.zeros(X.shape[1])
        out = opt.minimize(self.ll, w0, args=(X, y, gamma), method='TNC',
                           bounds=bounds, jac=self.gradient)
        w = out.x
        # compute standard errors (s)
        z = self.phi(X.dot(w))
        weights = z * (1 - z)
        xwx = (X.T * weights).dot(X)
        diag = np.diag_indices(X.shape[1])
        xwx[diag] += self.penalty
        inv = np.linalg.inv(xwx)
        s = np.sqrt(inv[diag])
        return w, s

    @staticmethod
    def phi(t):
        # logistic function, returns 1 / (1 + exp(-t));
        # split on the sign of t for numerical stability
        idx = t > 0
        out = np.empty(t.size, dtype=float)
        out[idx] = 1. / (1 + np.exp(-t[idx]))
        exp_t = np.exp(t[~idx])
        out[~idx] = exp_t / (1. + exp_t)
        return out

    def ll(self, w, X, y, gamma):
        # loss function to be optimized, it's the logistic loss
        z = X.dot(w)
        yz = y * z
        idx = yz > 0
        out = np.zeros_like(yz)
        out[idx] = np.log(1 + np.exp(-yz[idx]))
        out[~idx] = (-yz[~idx] + np.log(1 + np.exp(yz[~idx])))
        out = out.sum()
        # add penalty
        out += (self.penalty * .5 * w).dot(w)
        # add second penalty (which is lasso-like and is a numpy array)
        out += gamma.dot(np.abs(w))
        return out

    def gradient(self, w, X, y, gamma):
        # gradient of the logistic loss (ll)
        z = X.dot(w)
        z = self.phi(y * z)
        z0 = (z - 1) * y
        gradll = X.T.dot(z0)
        # add penalties
        gradll += self.penalty * w
        # second penalty
        pos = w > 0
        neg = w < 0
        gradll[pos] += gamma[pos]
        gradll[neg] -= gamma[neg]
        return gradll
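# A quick numerical check (illustrative, not from the source) of the stable
# logistic function phi above: the sign-split form avoids the overflow that
# the naive 1 / (1 + exp(-t)) hits for large negative t.
import numpy as np

t_demo = np.array([-1000.0, 0.0, 1000.0])
assert np.allclose(LRRulesLearner.phi(t_demo), [0.0, 0.5, 1.0])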
class SoftmaxRegressionLearner(Learner):
    """L2 regularized softmax regression classifier.

    Uses the L-BFGS algorithm to minimize the categorical cross entropy cost
    with L2 regularization. This model is suitable when dealing with a
    multi-class classification problem.

    When using this learner you should:

    - choose a suitable regularization parameter lambda\_,
    - consider using many logistic regression models (one for each
      value of the class variable) instead of softmax regression.

    Parameters
    ----------
    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional
        Preprocessors are applied to data before training or testing.
        Defaults to `[HasClass(), RemoveNaNColumns(), Impute(), Continuize(),
        Normalize()]`:

        - remove rows with an undefined class value,
        - remove columns with all values as NaN,
        - replace NaN values with suitable values,
        - continuize all discrete attributes,
        - transform the dataset so that the columns are on a similar scale.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.
    """

    name = "softmax"
    preprocessors = [
        HasClass(),
        RemoveNaNColumns(),
        Impute(),
        Continuize(),
        Normalize(),
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, Theta_flat, X, Y):
        Theta = Theta_flat.reshape((self.num_classes, X.shape[1]))

        M = X.dot(Theta.T)
        P = np.exp(M - np.max(M, axis=1)[:, None])
        P /= np.sum(P, axis=1)[:, None]

        cost = -np.sum(np.log(P) * Y)
        cost += self.lambda_ * Theta_flat.dot(Theta_flat) / 2.0
        cost /= X.shape[0]

        grad = X.T.dot(P - Y).T
        grad += self.lambda_ * Theta
        grad /= X.shape[0]

        return cost, grad.ravel()

    def fit(self, X, y, W):
        if len(y.shape) > 1:
            raise ValueError("Softmax regression does not support "
                             "multi-label classification")

        if np.isnan(np.sum(X)) or np.isnan(np.sum(y)):
            raise ValueError("Softmax regression does not support "
                             "unknown values")

        X = np.hstack((X, np.ones((X.shape[0], 1))))

        self.num_classes = np.unique(y).size
        Y = np.eye(self.num_classes)[y.ravel().astype(int)]

        theta = np.zeros(self.num_classes * X.shape[1])
        theta, j, ret = fmin_l_bfgs_b(self.cost_grad, theta,
                                      args=(X, Y), **self.fmin_args)
        Theta = theta.reshape((self.num_classes, X.shape[1]))

        return SoftmaxRegressionModel(Theta)
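# A minimal usage sketch (not part of the original source), fitting the
# learner above on Orange's bundled "iris" dataset; the preprocessor chain
# handles missing classes, NaN columns, imputation, and scaling.
from Orange.classification import SoftmaxRegressionLearner
from Orange.data import Table

iris = Table("iris")
softmax_model = SoftmaxRegressionLearner(lambda_=1.0)(iris)
iris_predictions = softmax_model(iris[:5])  # predicted class indices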