def test_column_filtering(self):
    data = Table("iris")
    data.X[:, (1, 3)] = np.nan
    new_data = RemoveNaNColumns()(data)
    self.assertEqual(len(new_data.domain.attributes),
                     len(data.domain.attributes) - 2)

    data = Table("iris")
    data.X[0, 0] = np.nan
    new_data = RemoveNaNColumns()(data)
    self.assertEqual(len(new_data.domain.attributes),
                     len(data.domain.attributes))
def test_column_filtering_sparse(self):
    data = Table("iris")
    with data.unlocked():
        data.X = csr_matrix(data.X)
    new_data = RemoveNaNColumns()(data)
    self.assertEqual(data, new_data)
def test_input_preprocessors(self):
    """Check multiple preprocessors on input"""
    pp_list = PreprocessorList([Randomize(), RemoveNaNColumns()])
    self.send_signal("Preprocessor", pp_list)
    self.widget.apply_button.button.click()
    self.assertEqual((pp_list, ), self.widget.learner.preprocessors,
                     '`PreprocessorList` was not added to preprocessors')
class NaiveBayesLearner(Learner):
    """
    Naive Bayes classifier. Works only with discrete attributes. By default,
    continuous attributes are discretized.

    Parameters
    ----------
    preprocessors : list, optional (default=[RemoveNaNColumns(), Discretize()])
        An ordered list of preprocessors applied to data before training
        or testing.
    """
    preprocessors = [RemoveNaNColumns(), Discretize()]
    name = "naive bayes"

    def fit_storage(self, table):
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        if not all(var.is_discrete for var in table.domain.variables):
            raise NotImplementedError("Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        class_freq = np.array(
            np.diag(contingency.get_contingency(table, table.domain.class_var))
        )
        # Laplace-smoothed class priors and per-attribute conditional
        # log-probabilities.
        class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))
        log_cont_prob = [
            np.log(
                (np.array(c) + 1)
                / (np.sum(np.array(c), axis=0)[None, :] + c.shape[0])
                / class_prob[:, None]
            )
            for c in cont
        ]
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
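# A minimal usage sketch (not part of the original source) for the learner
# above, using Orange's bundled all-discrete "titanic" dataset; the
# RemoveNaNColumns() and Discretize() preprocessors run automatically.
from Orange.classification import NaiveBayesLearner
from Orange.data import Table

nb_data = Table("titanic")
nb_model = NaiveBayesLearner()(nb_data)  # fit
nb_predictions = nb_model(nb_data[:5])   # predicted class indices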
def compute_distances(self):
    if self.data is None or len(self.data) == 0 \
            or self.reference is None or len(self.reference) == 0:
        self.distances = None
        return
    distance = METRICS[self.distance_index][1]
    n_ref = len(self.reference)
    all_data = Table.concatenate([self.reference, self.data], 0)
    pp_all_data = Impute()(RemoveNaNColumns()(all_data))
    pp_reference, pp_data = pp_all_data[:n_ref], pp_all_data[n_ref:]
    self.distances = distance(pp_data, pp_reference).min(axis=1)
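# A small sketch (illustrative, not from the source; the reference/data split
# is hypothetical) of the pattern used above: reference and data are
# concatenated before preprocessing so both end up in the same domain, which
# keeps the distance computation well defined.
from Orange.data import Table
from Orange.preprocess import Impute, RemoveNaNColumns

reference, rest = Table("iris")[:10], Table("iris")[10:]
combined = Table.concatenate([reference, rest], 0)
preprocessed = Impute()(RemoveNaNColumns()(combined))
pp_reference = preprocessed[:len(reference)]
pp_rest = preprocessed[len(reference):]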
class EllipticEnvelopeLearner(SklLearner):
    __wraps__ = skl_covariance.EllipticEnvelope
    __returns__ = EllipticEnvelopeClassifier
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute()]

    def __init__(self, store_precision=True, assume_centered=False,
                 support_fraction=None, contamination=0.1,
                 random_state=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()
class XGBBase(SklLearner):
    """Base class for xgboost (classification and regression) learners."""
    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
    ]

    def __init__(self, preprocessors=None, **kwargs):
        super().__init__(preprocessors=preprocessors)
        self.params = kwargs

    @SklLearner.params.setter
    def params(self, values: Dict):
        self._params = values
class PolynomialLearner(Learner):
    name = 'poly learner'
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute()]

    def __init__(self, learner, degree=1, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.degree = degree
        self.learner = learner

    def fit(self, X, Y, W):
        polyfeatures = skl_preprocessing.PolynomialFeatures(self.degree)
        X = polyfeatures.fit_transform(X)
        clf = self.learner
        if W is None or not self.supports_weights:
            model = clf.fit(X, Y, None)
        else:
            model = clf.fit(X, Y, sample_weight=W.reshape(-1))
        return PolynomialModel(model, polyfeatures)
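# A brief sketch (illustrative, not from the source) of what the
# PolynomialFeatures expansion in fit() above does to the design matrix:
import numpy as np
from sklearn import preprocessing as skl_preprocessing

X_demo = np.array([[2.0], [3.0]])
poly = skl_preprocessing.PolynomialFeatures(degree=2)
# columns become [1, x, x^2] -> [[1, 2, 4], [1, 3, 9]]
X_expanded = poly.fit_transform(X_demo)
assert np.allclose(X_expanded, [[1, 2, 4], [1, 3, 9]])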
class TreeRegressionLearner(SklLearner):
    __wraps__ = skl_tree.DecisionTreeRegressor
    __returns__ = TreeRegressor
    name = 'regression tree'
    preprocessors = [RemoveNaNColumns(), SklImpute(), Continuize()]

    def __init__(self, criterion="mse", splitter="best", max_depth=None,
                 min_samples_split=2, min_samples_leaf=1, max_features=None,
                 random_state=None, max_leaf_nodes=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()
class NaiveBayesLearner(Learner):
    """
    Naive Bayes classifier. Works only with discrete attributes. By default,
    continuous attributes are discretized.

    Parameters
    ----------
    preprocessors : list, optional (default=[RemoveNaNColumns(), Discretize()])
        An ordered list of preprocessors applied to data before training
        or testing.
    """
    preprocessors = [RemoveNaNColumns(), Discretize()]
    name = 'naive bayes'

    def fit_storage(self, table):
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        if not all(var.is_discrete for var in table.domain.variables):
            raise NotImplementedError("Only categorical variables are "
                                      "supported.")

        cont = contingency.get_contingencies(table)
        class_freq = np.array(
            np.diag(contingency.get_contingency(table,
                                                table.domain.class_var)))
        nclss = (class_freq != 0).sum()
        if not nclss:
            raise ValueError("Data has no defined target values.")

        # Laplacian smoothing considers only classes that appear in the data,
        # in part to avoid cases where the probabilities are affected by empty
        # (or completely spurious) classes that appear because of Orange's
        # reuse of variables. See GH-2943.
        # The corresponding elements of class_prob are set to zero only after
        # mock non-zero values are used in the computation of log_cont_prob,
        # to prevent division by zero.
        class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
        log_cont_prob = [
            np.log((np.array(c) + 1) /
                   (np.sum(np.array(c), axis=0)[None, :] + nclss) /
                   class_prob[:, None])
            for c in cont
        ]
        class_prob[class_freq == 0] = 0

        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
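# A worked check (illustrative, not from the source) of the Laplace smoothing
# used above: with observed class counts [3, 1] and two observed classes, the
# smoothed priors are (3 + 1) / (4 + 2) = 2/3 and (1 + 1) / (4 + 2) = 1/3.
import numpy as np

demo_class_freq = np.array([3.0, 1.0])
demo_nclss = (demo_class_freq != 0).sum()  # 2 observed classes
demo_class_prob = (demo_class_freq + 1) / (demo_class_freq.sum() + demo_nclss)
assert np.allclose(demo_class_prob, [2 / 3, 1 / 3])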
def apply(self):
    if self.data is None or self.reference is None:
        self.send("Neighbors", None)
        return
    distance = self.DISTANCES[self.distance_index]
    n_data, n_ref = len(self.data), len(self.reference)
    all_data = Table.concatenate([self.reference, self.data], 0)
    pp_all_data = Impute()(RemoveNaNColumns()(all_data))
    pp_data, pp_reference = pp_all_data[n_ref:], pp_all_data[:n_ref]
    dist = distance(np.vstack((pp_data, pp_reference)))[:n_data, n_data:]
    data = self._add_similarity(self.data, dist)
    sorted_indices = list(np.argsort(dist.flatten()))[::-1]
    indices = []
    while len(sorted_indices) > 0 and len(indices) < self.n_neighbors:
        index = int(sorted_indices.pop() / len(self.reference))
        if (self.data[index] not in self.reference
                or not self.exclude_reference) and index not in indices:
            indices.append(index)
    neighbours = data[indices]
    neighbours.attributes = self.data.attributes
    self.send("Neighbors", neighbours)
def compute_distances(self):
    self.Error.diff_domains.clear()
    if not self.data or not self.reference:
        self.distances = None
        return
    if set(self.reference.domain.attributes) != \
            set(self.data.domain.attributes):
        self.Error.diff_domains()
        self.distances = None
        return
    metric = METRICS[self.distance_index][1]
    n_ref = len(self.reference)

    # compare only attributes, without metas and class vars
    new_domain = Domain(self.data.domain.attributes)
    reference = self.reference.transform(new_domain)
    data = self.data.transform(new_domain)

    all_data = Table.concatenate([reference, data], 0)
    pp_all_data = Impute()(RemoveNaNColumns()(all_data))
    pp_reference, pp_data = pp_all_data[:n_ref], pp_all_data[n_ref:]
    self.distances = metric(pp_data, pp_reference).min(axis=1)
class SoftmaxLearner(Learner):
    """
    Implementation of softmax regression with k*(n+1) parameters, trained
    using L-BFGS optimization.
    """
    name = 'softmax'
    preprocessors = [
        RemoveNaNClasses(), Normalize(), Continuize(), Impute(),
        RemoveNaNColumns()
    ]

    def __init__(self, preprocessors=None):
        super().__init__(preprocessors=preprocessors)

    def mysigma(self, x):
        """
        Softmax function. Expects one example per row. The row-wise maximum
        is subtracted before exponentiation to prevent overflow; this may
        cause underflow, which is harmless here.
        """
        tmpx = np.exp(x - np.max(x, axis=1)[:, None])
        return tmpx / np.sum(tmpx, axis=1)[:, None]

    def cost(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            float: The value of the cost function evaluated with the
                given parameters.
        """
        # Reshape theta from a flat vector into matrix form, then build the
        # one-hot indicator matrix for the targets.
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(np.sum(indicator * np.log(self.mysigma(X.dot(theta.T)))))

    def grad(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            np.ndarray: Gradients wrt. all model parameters, of shape
                [n_classes * n_features]
        """
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(X.T.dot(
            indicator - self.mysigma(X.dot(theta.T)))).T.flatten()

    def approx_grad(self, theta, X, y, eps=1e-5):
        """
        Estimates the gradient numerically, with central differences.

        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            eps (float): value offset for gradient estimation

        Returns:
            np.ndarray: Gradients wrt. all model parameters, of shape
                [n_classes * n_features]
        """
        result = []
        for i in range(len(theta)):
            crr = np.zeros(len(theta))
            crr[i] = 1
            result.append((self.cost(theta + (crr * eps), X, y) -
                           self.cost(theta - (crr * eps), X, y)) / (2 * eps))
        return np.array(result)

    def fit(self, X, y, W=None):
        """
        Args:
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            W (np.ndarray): Orange weights - ignored for this exercise

        Returns:
            SoftmaxModel: Orange's classification model
        """
        num_classes = len(np.unique(y))  # we assume all classes are present
        X = np.column_stack((np.ones(X.shape[0]), X))
        theta = np.ones(num_classes * X.shape[1]) * 1e-9
        result = fmin_l_bfgs_b(self.cost, theta, self.grad, args=(X, y))[0]
        return SoftmaxModel(result.reshape((-1, X.shape[1])))
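# A small self-check sketch (illustrative, not from the source) using the
# class above: the analytic gradient should agree with the central-difference
# estimate that approx_grad provides.
import numpy as np

sm_learner = SoftmaxLearner()
rng = np.random.RandomState(0)
X_chk = np.column_stack((np.ones(6), rng.rand(6, 2)))  # bias column included
y_chk = np.array([0, 1, 2, 0, 1, 2])
theta_chk = rng.rand(3 * X_chk.shape[1])
assert np.allclose(sm_learner.grad(theta_chk, X_chk, y_chk),
                   sm_learner.approx_grad(theta_chk, X_chk, y_chk),
                   atol=1e-4)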
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to
        `[HasClass(), Continuize(), RemoveNaNColumns(), SklImpute()]`
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = {}

    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name] for name in spec.args[1:]
                if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support "
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances."""
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class CatBoostLearnerRegression(CatBoostLearner, LearnerRegression):
    __wraps__ = None
    __returns__ = CatBoostModel
    supports_multiclass = True
    _params = {}
    learner_adequacy_err_msg = "Continuous class variable expected."

    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    def check_learner_adequacy(self, domain):
        return domain.has_continuous_class

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name] for name in spec.args[1:]
                if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support "
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances."""
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class CurveFitLearner(Learner):
    """
    Fit a function to data. It uses the scipy.curve_fit to find the
    optimal values of parameters.

    Parameters
    ----------
    expression : callable or str
        A modeling function. If callable, it must take the independent
        variable as the first argument and the parameters to fit as
        separate remaining arguments. If string, a lambda function is
        created, using `expression`, `available_feature_names`, `function`
        and `env` attributes. Should be string for pickling the model.
    parameters_names : list of str
        List of parameters names. Only needed when the expression
        is callable.
    features_names : list of str
        List of features names. Only needed when the expression
        is callable.
    available_feature_names : list of str
        List of all available features names. Only needed when the
        expression is string. Needed to distinguish between parameters and
        features when translating the expression into the lambda.
    functions : list of str
        List of all available functions. Only needed when the expression
        is string. Needed to distinguish between parameters and functions
        when translating the expression into the lambda.
    sanitizer : callable
        Function for sanitizing names.
    env : dict
        An environment to capture in the lambda's closure.
    p0 : list of floats, optional
        Initial guess for the parameters.
    bounds : 2-tuple of array_like, optional
        Lower and upper bounds on parameters.
    preprocessors : tuple of Orange preprocessors, optional
        The processors that will be used when data is passed to the learner.

    Examples
    --------
    >>> import numpy as np
    >>> from Orange.data import Table
    >>> from Orange.regression import CurveFitLearner
    >>> data = Table("housing")
    >>> # example with callable expression
    >>> cfun = lambda x, a, b, c: a * np.exp(-b * x[:, 0] * x[:, 1]) + c
    >>> learner = CurveFitLearner(cfun, ["a", "b", "c"], ["CRIM", "LSTAT"])
    >>> model = learner(data)
    >>> pred = model(data)
    >>> coef = model.coefficients
    >>> # example with str expression
    >>> sfun = "a * exp(-b * CRIM * LSTAT) + c"
    >>> names = [a.name for a in data.domain.attributes]
    >>> learner = CurveFitLearner(sfun, available_feature_names=names,
    ...                           functions=["exp"])
    >>> model = learner(data)
    >>> pred = model(data)
    >>> coef = model.coefficients

    """
    preprocessors = [HasClass(), RemoveNaNColumns(), Impute()]
    __returns__ = CurveFitModel
    name = "Curve Fit"

    def __init__(
            self,
            expression: Union[Callable, ast.Expression, str],
            parameters_names: Optional[List[str]] = None,
            features_names: Optional[List[str]] = None,
            available_feature_names: Optional[List[str]] = None,
            functions: Optional[List[str]] = None,
            sanitizer: Optional[Callable] = None,
            env: Optional[Dict[str, Any]] = None,
            p0: Union[List, Dict, None] = None,
            bounds: Union[Tuple, Dict] = (-np.inf, np.inf),
            preprocessors=None
    ):
        super().__init__(preprocessors)
        if callable(expression):
            if parameters_names is None:
                raise TypeError("Provide 'parameters_names' parameter.")
            if features_names is None:
                raise TypeError("Provide 'features_names' parameter.")
            args = None
            function = expression
        else:
            if available_feature_names is None:
                raise TypeError("Provide 'available_feature_names' "
                                "parameter.")
            if functions is None:
                raise TypeError("Provide 'functions' parameter.")
            args = dict(expression=expression,
                        available_feature_names=available_feature_names,
                        functions=functions,
                        sanitizer=sanitizer,
                        env=env)
            function, parameters_names, features_names = \
                _create_lambda(**args)

        if isinstance(p0, dict):
            p0 = [p0.get(p, 1) for p in parameters_names]
        if isinstance(bounds, dict):
            d = [-np.inf, np.inf]
            lower_bounds = [bounds.get(p, d)[0] for p in parameters_names]
            upper_bounds = [bounds.get(p, d)[1] for p in parameters_names]
            bounds = lower_bounds, upper_bounds

        self.__function = function
        self.__parameters_names = parameters_names
        self.__features_names = features_names
        self.__p0 = p0
        self.__bounds = bounds
        # needed for pickling - if the expression is a lambda function,
        # the learner is not picklable
        self.__create_lambda_args = args

    @property
    def parameters_names(self) -> List[str]:
        return self.__parameters_names

    def fit_storage(self, data: Table) -> CurveFitModel:
        domain: Domain = data.domain
        attributes = []
        for attr in domain.attributes:
            if attr.name in self.__features_names:
                if not attr.is_continuous:
                    raise ValueError("Numeric feature expected.")
                attributes.append(attr)

        new_domain = Domain(attributes, domain.class_vars, domain.metas)
        transformed = data.transform(new_domain)
        params = curve_fit(self.__function, transformed.X, transformed.Y,
                           p0=self.__p0, bounds=self.__bounds)[0]
        return CurveFitModel(new_domain, domain, self.__parameters_names,
                             params, self.__function,
                             self.__create_lambda_args)

    def __getstate__(self) -> Dict:
        if not self.__create_lambda_args:
            raise AttributeError(
                "Can't pickle/copy callable. Use str expression instead."
            )
        state = self.__create_lambda_args.copy()
        state["parameters_names"] = None
        state["features_names"] = None
        state["p0"] = self.__p0
        state["bounds"] = self.__bounds
        state["preprocessors"] = self.preprocessors
        return state

    def __setstate__(self, state: Dict):
        expression = state.pop("expression")
        self.__init__(expression, **state)
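# A hedged sketch (not from the source, reusing the feature names from the
# docstring example): learners built from a string expression round-trip
# through pickle via __getstate__/__setstate__ above, while callable
# expressions raise AttributeError instead.
import pickle

picklable_learner = CurveFitLearner(
    "a * exp(-b * CRIM * LSTAT) + c",
    available_feature_names=["CRIM", "LSTAT"],
    functions=["exp"])
restored_learner = pickle.loads(pickle.dumps(picklable_learner))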
class LinearRegressionLearner(Learner):
    '''L2 regularized linear regression (a.k.a Ridge regression)

    This model uses the L-BFGS algorithm to minimize the linear least
    squares penalty with L2 regularization. When using this model you
    should:

    - Choose a suitable regularization parameter lambda_
    - Consider appending a column of ones to the dataset (intercept term)

    Parameters
    ----------
    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional
        Preprocessors are applied to data before training or testing. Default
        preprocessors
        `[HasClass(), Normalize(), Continuize(), Impute(), RemoveNaNColumns()]`:

        - transform the dataset so that the columns are on a similar scale,
        - continuize all discrete attributes,
        - replace NaN values with suitable values,
        - remove columns with all values as NaN.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.

    Examples
    --------
    >>> import numpy as np
    >>> from Orange.data import Table
    >>> from Orange.regression.linear_bfgs import LinearRegressionLearner
    >>> data = Table('housing')
    >>> data.X = np.hstack((data.X, np.ones((data.X.shape[0], 1))))  # append ones
    >>> m = LinearRegressionLearner(lambda_=1.0)
    >>> c = m(data)  # fit
    >>> print(c(data))  # predict
    '''

    name = "linear_bfgs"
    preprocessors = [
        HasClass(),
        Normalize(),
        Continuize(),
        Impute(),
        RemoveNaNColumns(),
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, theta, X, y):
        t = X.dot(theta) - y

        cost = t.dot(t)
        cost += self.lambda_ * theta.dot(theta)
        cost /= 2.0 * X.shape[0]

        grad = X.T.dot(t)
        grad += self.lambda_ * theta
        grad /= X.shape[0]

        return cost, grad

    def fit(self, X, Y, W):
        if len(Y.shape) > 1 and Y.shape[1] > 1:
            raise ValueError("Linear regression does not support "
                             "multi-target regression")

        if np.isnan(np.sum(X)) or np.isnan(np.sum(Y)):
            raise ValueError("Linear regression does not support "
                             "unknown values")

        theta = np.zeros(X.shape[1])
        theta, cost, ret = fmin_l_bfgs_b(
            self.cost_grad, theta, args=(X, Y.ravel()), **self.fmin_args
        )

        return LinearRegressionModel(theta)
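# A consistency sketch (illustrative, not from the source): cost_grad above
# is the ridge objective (||X theta - y||^2 + lambda ||theta||^2) / (2n), so
# its minimizer should match the closed-form ridge solution
# theta = (X^T X + lambda I)^(-1) X^T y.
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

rng = np.random.RandomState(0)
X_r = rng.rand(50, 3)
y_r = X_r.dot([1.0, -2.0, 0.5]) + 0.01 * rng.randn(50)
ridge_lambda = 1.0

def ridge_cost_grad(theta):
    t = X_r.dot(theta) - y_r
    cost = (t.dot(t) + ridge_lambda * theta.dot(theta)) / (2.0 * X_r.shape[0])
    grad = (X_r.T.dot(t) + ridge_lambda * theta) / X_r.shape[0]
    return cost, grad

theta_lbfgs = fmin_l_bfgs_b(ridge_cost_grad, np.zeros(3))[0]
theta_closed = np.linalg.solve(X_r.T @ X_r + ridge_lambda * np.eye(3),
                               X_r.T @ y_r)
assert np.allclose(theta_lbfgs, theta_closed, atol=1e-3)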
class CatGBBaseLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to `[HasClass(), RemoveNaNColumns()]`
    """
    supports_weights = True
    __wraps__ = None
    __returns__ = CatGBModel
    _params = {}
    preprocessors = default_preprocessors = [
        HasClass(),
        RemoveNaNColumns(),
    ]

    # pylint: disable=unused-argument,too-many-arguments,too-many-locals
    def __init__(self,
                 iterations=None, learning_rate=None, depth=None,
                 l2_leaf_reg=None, model_size_reg=None, rsm=None,
                 loss_function=None, border_count=None,
                 feature_border_type=None,
                 per_float_feature_quantization=None,
                 input_borders=None, output_borders=None,
                 fold_permutation_block=None, od_pval=None, od_wait=None,
                 od_type=None, nan_mode=None, counter_calc_method=None,
                 leaf_estimation_iterations=None,
                 leaf_estimation_method=None, thread_count=None,
                 random_seed=None, use_best_model=None, verbose=False,
                 logging_level=None, metric_period=None,
                 ctr_leaf_count_limit=None, store_all_simple_ctr=None,
                 max_ctr_complexity=None, has_time=None,
                 allow_const_label=None, classes_count=None,
                 class_weights=None, one_hot_max_size=None,
                 random_strength=None, name=None, ignored_features=None,
                 train_dir=cache_dir(), custom_loss=None,
                 custom_metric=None, eval_metric=None,
                 bagging_temperature=None, save_snapshot=None,
                 snapshot_file=None, snapshot_interval=None,
                 fold_len_multiplier=None, used_ram_limit=None,
                 gpu_ram_part=None, allow_writing_files=False,
                 final_ctr_computation_mode=None,
                 approx_on_full_history=None, boosting_type=None,
                 simple_ctr=None, combinations_ctr=None,
                 per_feature_ctr=None, task_type=None, device_config=None,
                 devices=None, bootstrap_type=None, subsample=None,
                 sampling_unit=None, dev_score_calc_obj_block_size=None,
                 max_depth=None, n_estimators=None, num_boost_round=None,
                 num_trees=None, colsample_bylevel=None, random_state=None,
                 reg_lambda=None, objective=None, eta=None, max_bin=None,
                 scale_pos_weight=None, gpu_cat_features_storage=None,
                 data_partition=None, metadata=None,
                 early_stopping_rounds=None, cat_features=None,
                 grow_policy=None, min_data_in_leaf=None,
                 min_child_samples=None, max_leaves=None, num_leaves=None,
                 score_function=None, leaf_estimation_backtracking=None,
                 ctr_history_unit=None, monotone_constraints=None,
                 feature_weights=None, penalties_coefficient=None,
                 first_feature_use_penalties=None, model_shrink_rate=None,
                 model_shrink_mode=None, langevin=None,
                 diffusion_temperature=None, posterior_sampling=None,
                 boost_from_average=None, text_features=None,
                 tokenizers=None, dictionaries=None, feature_calcers=None,
                 text_processing=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_wrapper_params(value)

    def _get_wrapper_params(self, values):
        spec = list(
            inspect.signature(self.__wraps__.__init__).parameters.keys())
        return {name: values[name] for name in spec[1:] if name in values}

    def __call__(self, data, progress_callback=None):
        m = super().__call__(data, progress_callback)
        m.params = self.params
        return m

    def fit_storage(self, data: Table):
        domain, X, Y, W = data.domain, data.X, data.Y.reshape(-1), None
        if self.supports_weights and data.has_weights():
            W = data.W.reshape(-1)
        # pylint: disable=not-callable
        clf = self.__wraps__(**self.params)
        cat_features = [
            i for i, attr in enumerate(domain.attributes) if attr.is_discrete
        ]
        if cat_features:
            X = X.astype(str)
        cat_model = clf.fit(X, Y, cat_features=cat_features, sample_weight=W)
        return self.__returns__(cat_model, cat_features, domain)

    def __getattr__(self, item):
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        (default=[Continuize(), RemoveNaNColumns(), SklImpute(force=False)])
        An ordered list of preprocessors applied to data before training
        or testing.
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = None

    name = 'skl learner'
    preprocessors = [Continuize(),
                     RemoveNaNColumns(),
                     SklImpute(force=False)]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {name: values[name] for name in spec.args[1:]
                      if name in values}
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support "
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.used_vals = [np.unique(y) for y in data.Y[:, None].T]
        m.params = self.params
        return m

    def fit(self, X, Y, W):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    def __repr__(self):
        return '{} {}'.format(self.name, self.params)
class LRRulesLearner(Learner):
    """
    Learns a set of rules by using the provided rule learner. The learned
    rules are then encoded as binary attributes (0 - not covered,
    1 - covered) and, together with the original attributes, comprise the
    attribute set used in logistic regression learning.

    If rule_learner is not provided, this acts as an ordinary logistic
    regression.

    The fitter for logistic regression uses the L2-penalized loss function.
    To prevent overfitting due to attributes built from rules, the weights
    of new rule-based attributes are penalized more (see Možina et al.,
    Extreme value correction in rule learning).

    TODO: weights are not supported yet.
    """
    name = 'logreg rules'
    preprocessors = [HasClass(), RemoveNaNColumns(), Impute()]

    def __init__(self, preprocessors=None, penalty=1, opt_penalty=False,
                 rule_learner=None, basic_attributes=True,
                 fit_intercept=True, intercept_scaling=2,
                 penalize_rules=True):
        """
        Parameters
        ----------
        preprocessors :
            A sequence of data preprocessors to apply on data prior to
            fitting the model.
        penalty :
            L2-penalty in the loss function.
        rule_learner :
            Rule learner used to construct new attributes.
        fit_intercept :
            Should we add a constant column to data?
        intercept_scaling :
            Value of the constant in the intercept column. Note that the
            intercept column is appended after normalization, therefore
            higher values will be less affected by penalization.
        """
        super().__init__(preprocessors)
        self.penalty = penalty
        self.opt_penalty = opt_penalty
        self.rule_learner = rule_learner
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.basic_attributes = basic_attributes
        self.penalize_rules = penalize_rules
        # Post rule learning preprocessing should not decrease the
        # number of examples.
        self.post_rule_preprocess = [Normalize(), Continuize()]

    def fit_storage(self, data):
        if self.opt_penalty:
            self.penalty = self.tune_penalty(data)
        # learn rules
        rules = self.rule_learner(data).rule_list if self.rule_learner else []
        # preprocess data
        if not self.basic_attributes:
            domain = Domain([], data.domain.class_vars, data.domain.metas)
            data = data.from_table(domain, data)
        for pp in self.post_rule_preprocess:
            data = pp(data)
        # create data
        X, Y, W = data.X, data.Y, data.W if data.W.size else None
        # 1. add rules to X
        Xr = np.concatenate([X] + [r.covered_examples[:, np.newaxis]
                                   for r in rules], axis=1)
        # 2. add constant to X
        if self.fit_intercept:
            Xr = self.add_intercept(self.intercept_scaling, Xr)
        # set additional penalties that penalize rule-based attributes
        gamma = self.get_gamma(X, rules)
        # build model
        w = []
        se = []
        if len(self.domain.class_var.values) > 2:
            for cli, _ in enumerate(self.domain.class_var.values):
                # create class with domain {-1, 1}
                yc = np.ones_like(Y)
                yc[Y != cli] = -1
                # set bounds
                bounds = self.set_bounds(X, rules, cli)
                x, s = self.fit_params(Xr, yc, bounds, gamma)
                w.append(x)
                se.append(s)
        else:
            yc = np.ones_like(Y)
            yc[Y != 0] = -1
            bounds = self.set_bounds(X, rules, 0)
            x, s = self.fit_params(Xr, yc, bounds, gamma)
            w = [x, -x]
            se = [s, s]

        # remove zero weights and corresponding rules
        to_keep, final_rules = list(range(X.shape[1])), []
        for ri, r in enumerate(rules):
            if any(wi[X.shape[1] + ri] != 0 for wi in w):
                to_keep.append(X.shape[1] + ri)
                final_rules.append(r)
        if self.fit_intercept:
            to_keep.append(-1)
        w = [wi[to_keep] for wi in w]
        se = [s[to_keep] for s in se]

        return LRRulesClassifier(w, se, final_rules, self.fit_intercept,
                                 self.intercept_scaling, self.domain,
                                 data.domain)

    def tune_penalty(self, data):
        learner = LRRulesLearner(fit_intercept=self.fit_intercept,
                                 intercept_scaling=self.intercept_scaling)
        penalties = [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10., 100.]
        scores = []
        for pen in penalties:
            learner.penalty = pen
            res = CrossValidation(data, [learner], k=5, random_state=1111)
            ll = LogLoss(res)
            scores.append(ll)
        return penalties[scores.index(min(scores))]

    def get_gamma(self, X, rules):
        gamma = [0] * X.shape[1]
        for r in rules:
            if self.penalize_rules:
                gamma.append(r.curr_class_dist[r.target_class] -
                             r.quality * r.curr_class_dist.sum())
            else:
                gamma.append(0)
        if self.fit_intercept:
            gamma.append(0)
        return np.array(gamma)

    @staticmethod
    def add_intercept(intercept, X):
        return np.hstack((X, intercept * np.ones((X.shape[0], 1))))

    def set_bounds(self, X, rules, cli):
        bounds = [(None, None) for _ in range(X.shape[1])]
        for r in rules:
            if r.target_class == cli:
                bounds.append((0, None))
            else:
                bounds.append((None, 0))
        if self.fit_intercept:
            bounds.append((None, None))
        return bounds

    def fit_params(self, X, y, bounds, gamma):
        w0 = np.zeros(X.shape[1])
        out = opt.minimize(self.ll, w0, args=(X, y, gamma), method='TNC',
                           bounds=bounds, jac=self.gradient)
        w = out.x
        # compute standard errors (s)
        z = self.phi(X.dot(w))
        weights = z * (1 - z)
        xwx = (X.T * weights).dot(X)
        diag = np.diag_indices(X.shape[1])
        xwx[diag] += self.penalty
        inv = np.linalg.inv(xwx)
        s = np.sqrt(inv[diag])
        return w, s

    @staticmethod
    def phi(t):
        # logistic function, returns 1 / (1 + exp(-t));
        # split on the sign of t for numerical stability
        idx = t > 0
        out = np.empty(t.size, dtype=float)
        out[idx] = 1. / (1 + np.exp(-t[idx]))
        exp_t = np.exp(t[~idx])
        out[~idx] = exp_t / (1. + exp_t)
        return out

    def ll(self, w, X, y, gamma):
        # loss function to be optimized, it's the logistic loss
        z = X.dot(w)
        yz = y * z
        idx = yz > 0
        out = np.zeros_like(yz)
        out[idx] = np.log(1 + np.exp(-yz[idx]))
        out[~idx] = (-yz[~idx] + np.log(1 + np.exp(yz[~idx])))
        out = out.sum()
        # add penalty
        out += (self.penalty * .5 * w).dot(w)
        # add second penalty (which is lasso-like and is a numpy array)
        out += gamma.dot(np.abs(w))
        return out

    def gradient(self, w, X, y, gamma):
        # gradient of the logistic loss (ll)
        z = X.dot(w)
        z = self.phi(y * z)
        z0 = (z - 1) * y
        gradll = X.T.dot(z0)
        # add penalties
        gradll += self.penalty * w
        # second penalty
        pos = w > 0
        neg = w < 0
        gradll[pos] += gamma[pos]
        gradll[neg] -= gamma[neg]
        return gradll
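# A quick numerical check (illustrative, not from the source) of the stable
# logistic function phi above: the sign-split form avoids the overflow that
# the naive 1 / (1 + exp(-t)) hits for large negative t.
import numpy as np

t_demo = np.array([-1000.0, 0.0, 1000.0])
assert np.allclose(LRRulesLearner.phi(t_demo), [0.0, 0.5, 1.0])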
class SoftmaxRegressionLearner(Learner):
    """L2 regularized softmax regression classifier.

    Uses the L-BFGS algorithm to minimize the categorical cross entropy cost
    with L2 regularization. This model is suitable when dealing with a
    multi-class classification problem.

    When using this learner you should:

    - choose a suitable regularization parameter lambda\_,
    - consider using many logistic regression models (one for each
      value of the class variable) instead of softmax regression.

    Parameters
    ----------
    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional
        Preprocessors are applied to data before training or testing.
        Defaults to `[HasClass(), RemoveNaNColumns(), Impute(), Continuize(),
        Normalize()]`:

        - remove rows with an undefined class value,
        - remove columns with all values as NaN,
        - replace NaN values with suitable values,
        - continuize all discrete attributes,
        - transform the dataset so that the columns are on a similar scale.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.
    """

    name = "softmax"
    preprocessors = [
        HasClass(),
        RemoveNaNColumns(),
        Impute(),
        Continuize(),
        Normalize(),
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, Theta_flat, X, Y):
        Theta = Theta_flat.reshape((self.num_classes, X.shape[1]))

        M = X.dot(Theta.T)
        P = np.exp(M - np.max(M, axis=1)[:, None])
        P /= np.sum(P, axis=1)[:, None]

        cost = -np.sum(np.log(P) * Y)
        cost += self.lambda_ * Theta_flat.dot(Theta_flat) / 2.0
        cost /= X.shape[0]

        grad = X.T.dot(P - Y).T
        grad += self.lambda_ * Theta
        grad /= X.shape[0]

        return cost, grad.ravel()

    def fit(self, X, y, W):
        if len(y.shape) > 1:
            raise ValueError("Softmax regression does not support "
                             "multi-label classification")

        if np.isnan(np.sum(X)) or np.isnan(np.sum(y)):
            raise ValueError("Softmax regression does not support "
                             "unknown values")

        X = np.hstack((X, np.ones((X.shape[0], 1))))

        self.num_classes = np.unique(y).size
        Y = np.eye(self.num_classes)[y.ravel().astype(int)]

        theta = np.zeros(self.num_classes * X.shape[1])
        theta, j, ret = fmin_l_bfgs_b(self.cost_grad, theta,
                                      args=(X, Y), **self.fmin_args)
        Theta = theta.reshape((self.num_classes, X.shape[1]))

        return SoftmaxRegressionModel(Theta)
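# A minimal usage sketch (not part of the original source), fitting the
# learner above on Orange's bundled "iris" dataset; the preprocessor chain
# handles missing classes, NaN columns, imputation, and scaling.
from Orange.classification import SoftmaxRegressionLearner
from Orange.data import Table

iris = Table("iris")
softmax_model = SoftmaxRegressionLearner(lambda_=1.0)(iris)
iris_predictions = softmax_model(iris[:5])  # predicted class indices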