Example #1
    def decision_function(self, X):
        '''
        Computes the distance to the separating hyperplane between classes.
        The larger the absolute value of the decision function, the further
        the data point is from the decision boundary.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        decision: numpy array of size (n_samples_test,)
           Distance to decision boundary
        '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=False, dtype=np.float64)
        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))
        decision = [self._decision_function_active(X[:, active], coef, active, bias)
                    for coef, active, bias
                    in zip(self.coef_, self.active_, self.intercept_)]
        decision = np.asarray(decision).squeeze().T
        return decision
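
Every example in this collection opens with the same guard: check_is_fitted raises sklearn's NotFittedError when the named attribute is absent, and returns silently otherwise. A minimal, self-contained sketch of the pattern (TinyModel is a hypothetical estimator):

from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

class TinyModel(BaseEstimator):
    def fit(self, X=None, y=None):
        self.coef_ = [0.0]   # learned attributes conventionally end in "_"
        return self

try:
    check_is_fitted(TinyModel(), 'coef_')    # not fitted yet
except NotFittedError:
    print("NotFittedError raised before fit")

check_is_fitted(TinyModel().fit(), 'coef_')  # passes silently after fit
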
Example #2
    def staged_predict(self, X):
        """Return staged predictions for X.

        The predicted regression value of an input sample is computed
        as the weighted median prediction of the classifiers in the ensemble.

        This generator method yields the ensemble prediction after each
        iteration of boosting and therefore allows monitoring, such as to
        determine the prediction on a test set after each boost.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. DOK and LIL are converted to CSR.

        Returns
        -------
        y : generator of array, shape = [n_samples]
            The predicted regression values.
        """
        check_is_fitted(self, "estimator_weights_")
        X = self._validate_X_predict(X)

        for i, _ in enumerate(self.estimators_, 1):
            yield self._get_median_predict(X, limit=i)
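
Because staged_predict is a generator, tracking test error per boosting round costs a single pass. A usage sketch with scikit-learn's AdaBoostRegressor (names real, data synthetic):

from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, noise=1.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
reg = AdaBoostRegressor(n_estimators=20, random_state=0).fit(X_tr, y_tr)

# one array of predictions per boosting iteration
errors = [mean_squared_error(y_te, y_stage)
          for y_stage in reg.staged_predict(X_te)]
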
Example #3
    def predict(self, X):
        """Predict multi-output variable using a model
         trained for each target variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        Returns
        -------
        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets predicted across multiple predictors.
            Note: Separate models are generated for each predictor.
        """
        check_is_fitted(self, 'estimators_')
        if not hasattr(self.estimator, "predict"):
            raise ValueError("The base estimator should implement a predict method")

        X = check_array(X, accept_sparse=True)

        y = Parallel(n_jobs=self.n_jobs)(delayed(parallel_helper)(e, 'predict', X)
                                         for e in self.estimators_)

        return np.asarray(y).T
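
The joblib Parallel/delayed pair used above fans the independent per-estimator predict calls out over n_jobs workers. The pattern in isolation:

from joblib import Parallel, delayed

def square(v):
    return v * v

# three independent calls, dispatched across two workers
out = Parallel(n_jobs=2)(delayed(square)(v) for v in [1, 2, 3])
# out == [1, 4, 9]
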
Example #4
    def _kernel_decision_function(self, X):
        ''' Computes kernel and decision function based on kernel '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        K = get_kernel(X, self.relevant_vectors_, self.gamma, self.degree,
                       self.coef0, self.kernel, self.kernel_params)
        return K, np.dot(K, self.coef_[self.active_]) + self.intercept_
Example #5
    def transform(self, X):
        """Transform a dataframe given the fit imputer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to transform.

        Returns
        -------

        X : pd.DataFrame or np.ndarray
            The imputed matrix
        """

        check_is_fitted(self, 'fills_')
        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)
        cols = self.cols if self.cols is not None else X.columns.values

        # get the fills
        modes = self.fills_

        # if it's a single int, easy:
        if isinstance(modes, int):
            X[cols] = X[cols].fillna(modes)
        else:
            # it's a dict
            for nm in cols:
                X[nm] = X[nm].fillna(modes[nm])

        return X if self.as_df else X.values
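
Both fillna branches are plain pandas behavior: a scalar fills every column the same way, while per-column fills are applied one column at a time from a dict of values. A standalone illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, 2.0, 2.0]})
fills = {'a': 1.0, 'b': 2.0}          # e.g. per-column modes
for col in df.columns:
    df[col] = df[col].fillna(fills[col])
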
Example #6
    def score(self, X, lengths=None):
        """Compute the log probability under the model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        lengths : array-like of integers, shape (n_sequences, ), optional
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.

        Returns
        -------
        logprob : float
            Log likelihood of ``X``.

        See Also
        --------
        score_samples : Compute the log probability under the model and
            posteriors.
        decode : Find most likely state sequence corresponding to ``X``.
        """
        check_is_fitted(self, "startprob_")
        self._check()

        X = check_array(X)
        # XXX we can unroll forward pass for speed and memory efficiency.
        logprob = 0
        for i, j in iter_from_X_lengths(X, lengths):
            framelogprob = self._compute_log_likelihood(X[i:j])
            logprobij, _fwdlattice = self._do_forward_pass(framelogprob)
            logprob += logprobij
        return logprob
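
iter_from_X_lengths is an hmmlearn helper; it presumably yields consecutive (start, end) pairs that carve X into its individual sequences. An equivalent loop under that assumption:

import numpy as np

X = np.arange(10).reshape(5, 2)   # 5 samples forming two sequences
lengths = np.array([3, 2])
ends = np.cumsum(lengths)         # [3, 5]
starts = ends - lengths           # [0, 3]
for i, j in zip(starts, ends):
    segment = X[i:j]              # one sequence at a time
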
Example #7
    def _check_vocabulary(self):
        """Check if vocabulary is empty or missing (not fitted)"""
        msg = "%(name)s - Vocabulary wasn't fitted."
        check_is_fitted(self, 'vocabulary_', msg=msg)

        if len(self.vocabulary_) == 0:
            raise ValueError("Vocabulary is empty")
Example #8
    def predict(self, X):
        """Predict with Vowpal Wabbit model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features or 1)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            If not using convert_to_vw, X is expected to be a list of
            VW-formatted feature vector strings with labels.

        Returns
        -------
        y : array-like, shape (n_samples,)
            Output vector relative to X.
        """

        check_is_fitted(self, 'fit_')

        try:
            num_samples = X.shape[0] if X.ndim > 1 else len(X)
        except AttributeError:
            num_samples = len(X)

        if self.convert_to_vw_:
            X = tovw(X)

        model = self.get_vw()
        label_type = model.get_label_type()

        y = np.empty([num_samples])
        # add test examples to model
        for idx, x in enumerate(X):
            y[idx] = model.predict(ec=x, labelType=label_type)

        return y
Example #9
    def score(self, X, y=None):
        """Return the average log-likelihood of all samples.
        This calls sklearn.decomposition.PCA's score method
        on the specified columns [1].

        Parameters
        ----------

        X: Pandas ``DataFrame``, shape=(n_samples, n_features)
            The data to score.

        y: None
            Passthrough for pipeline/gridsearch


        Returns
        -------

        ll: float
            Average log-likelihood of the samples under the fit
            PCA model (`self.pca_`)


        References
        ----------

        .. [1] Bishop, C.  "Pattern Recognition and Machine Learning"
               12.2.1 p. 574 http://www.miketipping.com/papers/met-mppca.pdf
        """
        check_is_fitted(self, 'pca_')
        X, _ = validate_is_pd(X, self.cols)
        cols = X.columns if not self.cols else self.cols

        ll = self.pca_.score(X[cols].values, _as_numpy(y))
        return ll
Example #10
    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.


        Returns
        -------

        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'sq_nms_')

        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)
        sq_nms_ = self.sq_nms_

        # scale by norms
        for nm, the_norm in six.iteritems(sq_nms_):
            X[nm] /= the_norm

        return X if self.as_df else X.values
Example #11
    def predict(self, X):
        """Predict if a particular sample is an outlier or not.
        Calling xgboost `predict` function.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """

        check_is_fitted(self, ['clf_', 'decision_scores_',
                               'labels_', '_scalar'])

        X = check_array(X)

        # construct the new feature space
        X_add = self._generate_new_features(X)
        X_new = np.concatenate((X, X_add), axis=1)

        pred_scores = self.clf_.predict(X_new)
        return pred_scores.ravel()
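
The augmented feature space is a plain column-wise concatenation of the raw features with the generated ones (X_add below is a stand-in for the output of _generate_new_features):

import numpy as np

X = np.ones((4, 3))               # raw features
X_add = np.zeros((4, 2))          # stand-in for generated features
X_new = np.concatenate((X, X_add), axis=1)
# X_new.shape == (4, 5)
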
Example #12
    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.


        Returns
        -------

        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'is_fit_')
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # apply the function
        # TODO: do we want to change the behavior to where the function
        # should accept an entire frame and not a series?
        X[cols] = X[cols].apply(lambda x: self.fun(x, **self.kwargs))
        return X
Example #13
    def decision_function(self, X):
        '''
        Computes the distance to the separating hyperplane between classes.
        The larger the absolute value of the decision function, the further
        the data point is from the decision boundary.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        decision: numpy array of size (n_samples_test,)
           Distance to decision boundary
        '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=False, dtype=np.float64)
        n_features = self.relevant_vectors_[0].shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))
        kernel = lambda rvs: get_kernel(X, rvs, self.gamma, self.degree,
                                        self.coef0, self.kernel, self.kernel_params)
        decision = []
        for rv, cf, act, b in zip(self.relevant_vectors_, self.coef_,
                                  self.active_, self.intercept_):
            # if there are no relevant vectors => use intercept only
            if rv.shape[0] == 0:
                decision.append(np.ones(X.shape[0]) * b)
            else:
                decision.append(self._decision_function_active(kernel(rv), cf, act, b))
        decision = np.asarray(decision).squeeze().T
        return decision
Example #14
    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        # Invert outlier scores. Outliers come with higher outlier scores.
        # noinspection PyProtectedMember
        if _sklearn_version_20():
            return invert_order(self.detector_._score_samples(X))
        else:
            return invert_order(self.detector_._decision_function(X))
Example #15
    def predict(self, X):
        """ Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        maj : array-like, shape = [n_samples]
            Predicted class labels.
        """

        check_is_fitted(self, 'estimators_')
        if self.voting == 'soft':
            maj = np.argmax(self.predict_proba(X), axis=1)

        else:  # 'hard' voting
            predictions = self._predict(X)
            maj = np.apply_along_axis(lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions.astype('int'))

        maj = self.le_.inverse_transform(maj)

        return maj
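
The hard-voting branch is a weighted mode along each row of the prediction matrix; a small numpy check of the same expression:

import numpy as np

# rows = samples, columns = votes from three classifiers
predictions = np.array([[0, 1, 1],
                        [2, 0, 2]])
weights = [1.0, 1.0, 2.0]
maj = np.apply_along_axis(
    lambda x: np.argmax(np.bincount(x, weights=weights)),
    axis=1, arr=predictions)
# maj == array([1, 2])
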
Example #16
    def decision_function(self, X):
        """Compute the decision function of ``X``.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        score : array, shape = [n_samples, k]
            The decision function of the input samples. The order of
            outputs is the same as that of the `classes_` attribute.
            Binary classification is a special case with ``k == 1``,
            otherwise ``k == n_classes``. For binary classification,
            values closer to -1 or 1 mean more like the first or second
            class in ``classes_``, respectively.
        """
        check_is_fitted(self, "n_classes_")
        X = np.asarray(X)

        pred = None

        for estimator in self.estimators_:
            # The weights are all 1. for LogitBoost
            current_pred = estimator.predict(X)

            if pred is None:
                pred = current_pred
            else:
                pred += current_pred

        return pred
Example #17
    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding labels of `X_resampled`.

        """

        # Check the consistency of X and y
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

        check_is_fitted(self, 'ratio_')
        self._check_X_y(X, y)

        return self._sample(X, y)
Example #18
    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['model_', 'history_'])
        X = check_array(X)

        if self.preprocessing:
            X_norm = self.scaler_.transform(X)
        else:
            X_norm = np.copy(X)

        # Predict on X and return the reconstruction errors
        pred_scores = self.model_.predict(X_norm)
        return pairwise_distances_no_broadcast(X_norm, pred_scores)
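
pairwise_distances_no_broadcast is a pyod utility; assuming it computes the row-wise Euclidean distance between the input and its reconstruction, the score reduces to:

import numpy as np

X_norm = np.array([[0.0, 0.0], [1.0, 1.0]])
pred = np.array([[3.0, 4.0], [1.0, 1.0]])
scores = np.sqrt(((X_norm - pred) ** 2).sum(axis=1))
# scores == array([5., 0.])
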
Example #19
    def predict(self, ys, paradigm, use_beta=True):
        """
        """
        check_is_fitted(self, "hrf_")
        names, onsets, durations, modulation = check_paradigm(paradigm)
        frame_times = np.arange(0, onsets.max() + self.time_offset, self.t_r)
        f_hrf = interp1d(self.hx_, self.hrf_)

        dm = make_design_matrix_hrf(frame_times, paradigm,
                                    hrf_length=self.hrf_length,
                                    t_r=self.t_r, time_offset=self.time_offset,
                                    drift_model=self.drift_model,
                                    period_cut=self.period_cut,
                                    drift_order=self.drift_order,
                                    f_hrf=f_hrf)
        # Least squares estimation
        if use_beta:
            beta_values = self.beta
        else:
            beta_values = np.linalg.pinv(dm.values).dot(ys)
        ys_fit = dm.values[:, :len(beta_values)].dot(beta_values)
        #ys -= drifts.dot(np.linalg.pinv(drifts).dot(ys))

        ress = ys - ys_fit

        return ys_fit, dm, beta_values, ress
Example #20
    def transform(self, column):
        check_is_fitted(self, 'encoder_')
        column = h2o_col_to_numpy(column)

        # transform--
        # I don't like that we have to re-upload... but we do... for now...
        return H2OFrame.from_python(self.encoder_.transform(column).reshape(column.shape[0], 1))
Example #21
    def _predict_rank(self, X, normalized=False):
        """Predict the outlyingness rank of a sample by a fitted model. The
        method is for outlier detector score combination.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        normalized : bool, optional (default=False)
            If set to True, all ranks are normalized to [0,1].

        Returns
        -------
        ranks : array, shape (n_samples,)
            Outlying rank of a sample according to the training data.

        """

        check_is_fitted(self, ['decision_scores_'])

        test_scores = self.decision_function(X)
        train_scores = self.decision_scores_

        sorted_train_scores = np.sort(train_scores)
        ranks = np.searchsorted(sorted_train_scores, test_scores)

        if normalized:
            # return normalized ranks
            ranks = ranks / ranks.max()
        return ranks
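
np.searchsorted against the sorted training scores returns, for each test score, how many training scores fall below it, which is exactly the outlyingness rank being computed:

import numpy as np

train_scores = np.array([0.1, 0.5, 0.9, 1.3])
test_scores = np.array([0.0, 0.6, 2.0])
ranks = np.searchsorted(np.sort(train_scores), test_scores)
# ranks == array([0, 2, 4]); ranks / ranks.max() -> [0., 0.5, 1.]
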
Example #22
    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.


        Returns
        -------

        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'lambda_')
        # check on state of X and cols
        X, cols = validate_is_pd(X, self.cols, assert_all_finite=True)  # creates a copy -- we need all to be finite
        cols = _cols_if_none(X, self.cols)

        lambdas_ = self.lambda_

        # do transformations
        for nm in cols:
            X[nm] = _yj_transform_y(X[nm], lambdas_[nm])

        return X if self.as_df else X.values
Example #23
    def score(self, frame):
        """Predict and score on a new frame. Note that this method
        will not store performance metrics in the report that ``report_score``
        generates.

        Parameters
        ----------

        frame : H2OFrame, shape=(n_samples, n_features)
            The test frame on which to predict and score performance.

        Returns
        -------

        scor : float
            The score on the testing frame
        """
        check_is_fitted(self, 'best_estimator_')
        e, l, p = self.extra_names_['expo'], self.extra_names_['loss'], self.extra_names_['prem']

        kwargs = {
            'expo': frame[e],
            'loss': frame[l],
            'prem': frame[p] if p is not None else None
        }

        y_truth = frame[self.target_feature]
        pred = self.best_estimator_.predict(frame)['predict']
        scor = self.scoring_class_.score_no_store(y_truth, pred, **kwargs)

        return scor
Example #24
    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Compute the probability estimates for each single sample in X
        and each possible outcome seen during training (categorical
        distribution).

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        probabilities : array, shape = [n_samples, n_classes]
            Normalized probability distributions across
            class labels
        """
        check_is_fitted(self, 'X_')

        X_2d = check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
                                             'bsr', 'lil', 'dia'])
        weight_matrices = self._get_kernel(self.X_, X_2d)
        if self.kernel == 'knn':
            probabilities = []
            for weight_matrix in weight_matrices:
                ine = np.sum(self.label_distributions_[weight_matrix], axis=0)
                probabilities.append(ine)
            probabilities = np.array(probabilities)
        else:
            weight_matrices = weight_matrices.T
            probabilities = np.dot(weight_matrices, self.label_distributions_)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        probabilities /= normalizer
        return probabilities
Example #25
    def predict_dist(self, X):
        '''
        Computes the predictive distribution for the test set.
        The predictive distribution for each data point is a one-dimensional
        Gaussian, characterised by its mean and standard deviation.

        Parameters
        ----------
        X: {array-like, sparse matrix} of size [n_samples_test, n_features]
           Matrix of explanatory variables (test set)

        Returns
        -------
        y_hat: array of size [n_samples_test]
           Estimated values of targets on test set (mean of predictive distribution)

        std_hat: array of size [n_samples_test]
           Error bounds (standard deviation of predictive distribution)
        '''
        check_is_fitted(self, "coef_")
        # mean of predictive distribution
        K = get_kernel(X, self.relevant_vectors_, self.gamma, self.degree,
                       self.coef0, self.kernel, self.kernel_params)
        y_hat = decision_function(self, self.coef_[self.active_], X,
                                  self.intercept_, self.relevant_vectors_,
                                  self.gamma, self.degree, self.coef0,
                                  self.kernel, self.kernel_params)
        K = (K - self._x_mean_[self.active_])
        var_hat = self.alpha_
        var_hat += np.sum(np.dot(K, self.sigma_) * K, axis=1)
        std_hat = np.sqrt(var_hat)
        return y_hat, std_hat
Example #26
    def transform(self, X, y=None):
        """Transform data to polynomial features
        Parameters
        ----------
        X : array with shape [n_samples, n_features]
            The data to transform, row by row.
        Returns
        -------
        XP : np.ndarray shape [n_samples, NP]
            The matrix of features, where NP is the number of polynomial
            features generated from the combination of inputs.
        """
        check_is_fitted(self, ['n_input_features_', 'n_output_features_'])

        X = check_array(X)
        n_samples, n_features = X.shape

        if n_features != self.n_input_features_:
            raise ValueError("X shape does not match training shape")

        # allocate output data
        XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype)

        combinations = self._combinations(n_features, self.degree,
                                          self.interaction_only,
                                          self.include_bias)
        for i, c in enumerate(combinations):
            # Change prod to sum
            XP[:, i] = X[:, c].sum(1)

        return XP
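
With prod changed to sum, each output column is the sum of one combination of input features. The same idea with itertools.combinations standing in for the private _combinations helper (degree 2, interactions only, no bias):

import numpy as np
from itertools import combinations

X = np.array([[1.0, 2.0, 3.0]])
# degree-2, interaction-only combinations: (0, 1), (0, 2), (1, 2)
XP = np.column_stack([X[:, c].sum(axis=1)
                      for c in combinations(range(X.shape[1]), 2)])
# XP == array([[3., 4., 5.]])
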
Example #27
    def pairwise_kernel(self, X, Y):
        """Function to use with :func:`sklearn.metrics.pairwise.pairwise_kernels`

        Parameters
        ----------
        X : array, shape = [n_features]

        Y : array, shape = [n_features]

        Returns
        -------
        similarity : float
            Similarities are normalized to be within [0, 1]
        """
        check_is_fitted(self, 'X_fit_')
        if X.shape[0] != Y.shape[0]:
            raise ValueError('X and Y have different number of features')

        val = pairwise_continuous_ordinal_kernel(X[self._numeric_columns], Y[self._numeric_columns],
                                                 self._numeric_ranges)
        if len(self._nominal_columns) > 0:
            val += pairwise_nominal_kernel(X[self._nominal_columns].astype(numpy.int8),
                                           Y[self._nominal_columns].astype(numpy.int8))

        val /= X.shape[0]

        return val
Example #28
    def transform(self, X):
        """ A reference implementation of a transform function.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_transformed : array of shape = [n_samples, n_features]
            The array containing the element-wise square roots of the values
            in `X`.
        """
        # Check if fit had been called
        check_is_fitted(self, ['input_shape_'])

        # Input validation
        X = check_array(X)

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape != self.input_shape_:
            raise ValueError('Shape of input is different from what was seen '
                             'in `fit`')
        return np.sqrt(X)
Example #29
    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        check_is_fitted(self, 'std_')

        copy = copy if copy is not None else self.copy
        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot uncenter sparse matrices: pass `with_mean=False` "
                    "instead See docstring for motivation and alternatives.")
            if not sparse.isspmatrix_csr(X):
                X = X.tocsr()
                copy = False
            if copy:
                X = X.copy()
            if self.std_ is not None:
                inplace_column_scale(X, self.std_)
        else:
            X = np.asarray(X)
            if copy:
                X = X.copy()
            if self.with_std:
                X *= self.std_
            if self.with_mean:
                X += self.mean_
        return X
Example #30
    def transform(self, X, y=None, copy=None):
        """Perform standardization by centering and scaling

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        check_is_fitted(self, 'std_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, accept_sparse="csc", ensure_2d=False)
        if warn_if_not_float(X, estimator=self):
            X = X.astype(np.float64)
        if sparse.issparse(X):
            if self.center_sparse:
                for i in range(X.shape[1]):
                    X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]

            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")

            else:
                pass

            if self.std_ is not None:
                inplace_column_scale(X, 1 / self.std_)
        else:
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.std_
        return X
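
On dense input, this transform and the inverse_transform of the previous example reduce to the familiar centering and scaling identities:

import numpy as np

X = np.array([[1.0, 10.0], [3.0, 30.0]])
mean_, std_ = X.mean(axis=0), X.std(axis=0)
X_scaled = (X - mean_) / std_      # what transform does on dense input
X_back = X_scaled * std_ + mean_   # what inverse_transform undoes
assert np.allclose(X_back, X)
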
Example #31
    def predict_proba(self, X):
        check_is_fitted(self, "has_fitted_")
        prob = EBMUtils.classifier_predict_proba(X, self)
        return prob
Example #32
    def predict(self, X):
        check_is_fitted(self, "has_fitted_")
        return EBMUtils.regressor_predict(X, self)
Example #33
    def staged_fit_interactions(self, X, y, inter_indices=[]):
        check_is_fitted(self, "has_fitted_")

        self.inter_episode_idx_ = 0
        if len(inter_indices) == 0:
            log.info("No interactions to train")
            return self

        log.info("Training interactions")

        # Split data into train/val
        X_train, X_val, y_train, y_val = train_test_split(
            X,
            y,
            test_size=self.holdout_split,
            random_state=self.random_state,
            stratify=y if is_classifier(self) else None,
        )
        if is_classifier(self):
            model_type = "classification"
        else:
            model_type = "regression"

        # Discard initial interactions
        new_attribute_set_models = []
        new_attribute_sets = []
        for i, attribute_set in enumerate(self.attribute_sets_):
            if attribute_set["n_attributes"] != 1:
                continue
            new_attribute_set_models.append(self.attribute_set_models_[i])
            new_attribute_sets.append(self.attribute_sets_[i])
        self.attribute_set_models_ = new_attribute_set_models
        self.attribute_sets_ = new_attribute_sets

        # Fix main, train interactions
        training_scores = self.decision_function(X_train)
        validation_scores = self.decision_function(X_val)
        inter_attr_sets = EBMUtils.gen_attribute_sets(inter_indices)
        with closing(
            NativeEBM(
                self.attributes_,
                inter_attr_sets,
                X_train,
                y_train,
                X_val,
                y_val,
                num_inner_bags=self.feature_step_n_inner_bags,
                num_classification_states=self.n_classes_,
                model_type=model_type,
                training_scores=training_scores,
                validation_scores=validation_scores,
                random_state=self.random_state,
            )
        ) as native_ebm:
            log.info("Train interactions")
            self.current_metric_, self.inter_episode_idx_ = self._cyclic_gradient_boost(
                native_ebm, inter_attr_sets, "Pair"
            )
            log.debug("Interaction Metric: {0}".format(self.current_metric_))

            for index, attr_set in enumerate(inter_attr_sets):
                self.attribute_set_models_.append(native_ebm.get_best_model(index))
                self.attribute_sets_.append(attr_set)

        return self
Example #34
    def predict(self, X):
        check_is_fitted(self, "has_fitted_")
        return EBMUtils.classifier_predict(X, self)
Example #35
    def predict_proba(self, X):
        check_is_fitted(self, "has_fitted_")
        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        X = self.preprocessor_.transform(X)
        prob = EBMUtils.classifier_predict_proba(X, self)
        return prob
Example #36
    def decision_function(self, X):
        check_is_fitted(self, "has_fitted_")

        return EBMUtils.decision_function(
            X, self.attribute_sets_, self.attribute_set_models_, 0
        )
Example #37
    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers are returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of self.kernel_.theta is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean([
                    estimator.log_marginal_likelihood(theta)
                    for i, estimator in enumerate(estimators)
                ])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean([
                    estimator.log_marginal_likelihood(theta[n_dims * i:n_dims *
                                                            (i + 1)])
                    for i, estimator in enumerate(estimators)
                ])
            else:
                raise ValueError(
                    "Shape of theta must be either %d or %d. "
                    "Obtained theta with shape %d." %
                    (n_dims, n_dims * self.classes_.shape[0], theta.shape[0]))
Example #38
    def predict(self, X):
        check_is_fitted(self, "has_fitted_")
        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        X = self.preprocessor_.transform(X)
        return EBMUtils.regressor_predict(X, self)
Example #39
    def _run_with_single_dim_single_value_preset(self, i, preset_i, n_tries=10):
        """Run the method once for one restricted feature.

        Parameters
        ----------
        i:
            restricted feature
        preset_i:
            restricted range of feature i (set before optimization = preset)
        n_tries:
            number of allowed relaxation steps for the L1 constraint in case
            the LP is infeasible
        """
        X = self.X_
        y = self.y_
        # Do we have intervals?
        check_is_fitted(self, "interval_")
        interval = self.unmod_interval_
        d = len(interval)

        constrained_ranges_diff = np.zeros((d, 2))

        # Init empty preset
        preset = np.empty(shape=(d, 2))
        preset.fill(np.nan)

        # Add correct sign of this coef
        signed_preset_i = np.sign(self._svm_coef[0][i]) * preset_i
        preset[i] = signed_preset_i

        # Calculate all bounds with feature i set to min_i
        l1 = self.optim_L1_
        loss = self.optim_loss_

        for j in range(n_tries):
            # try several times in case the problem is too stringent
            try:
                kwargs = {"verbose": False, "solver": "ECOS"}
                rangevector, _, _ = self._main_opt(X, y, loss, l1,
                                                   self.random_state,
                                                   presetModel=preset,
                                                   solverargs=kwargs)
            except NotFeasibleForParameters:
                # constrained run failed; swap the sign and retry
                preset[i] *= -1
                continue
            else:
                # problem was solvable
                break
        else:
            raise NotFeasibleForParameters("Community detection failed.",
                                           "dim {}".format(i))

        # Get differences of constrained intervals to normal intervals
        constrained_ranges_diff = self.unmod_interval_ - rangevector

        # Current dimension is not constrained, so these values are set accordingly
        rangevector[i] = preset_i
        constrained_ranges_diff[i] = 0

        return rangevector, constrained_ranges_diff
Example #40
    def fit(self, X, y, *, sensitive_features, **kwargs):
        """Fit the model.

        The fit is based on training features and labels, sensitive features,
        as well as the fairness-unaware predictor or estimator. If an estimator
        was passed in the constructor, this fit method will call
        `fit(X, y, **kwargs)` on said estimator.

        Parameters
        ----------
        X : numpy.ndarray or pandas.DataFrame
            The feature matrix
        y : numpy.ndarray, pandas.DataFrame, pandas.Series, or list
            The label vector
        sensitive_features : numpy.ndarray, list, pandas.DataFrame, or pandas.Series
            sensitive features to identify groups by
        """
        if self.estimator is None:
            raise ValueError(BASE_ESTIMATOR_NONE_ERROR_MESSAGE)

        if self.constraints in SIMPLE_CONSTRAINTS:
            if self.objective not in OBJECTIVES_FOR_SIMPLE_CONSTRAINTS:
                raise ValueError(
                    NOT_SUPPORTED_OBJECTIVES_FOR_SIMPLE_CONSTRAINTS_ERROR_MESSAGE
                    .format(self.constraints))
        elif self.constraints == "equalized_odds":
            if self.objective not in OBJECTIVES_FOR_EQUALIZED_ODDS:
                raise ValueError(
                    NOT_SUPPORTED_OBJECTIVES_FOR_EQUALIZED_ODDS_ERROR_MESSAGE)
        else:
            raise ValueError(NOT_SUPPORTED_CONSTRAINTS_ERROR_MESSAGE)

        if self.predict_method == "deprecated":
            warn(
                "'predict_method' default value is changed from 'predict' to "
                "'auto'. Explicitly pass `predict_method='predict'` to "
                "replicate the old behavior, or pass `predict_method='auto'` "
                "or other valid values to silence this warning.",
                FutureWarning,
            )
            self._predict_method = "predict"
        else:
            self._predict_method = self.predict_method

        if kwargs.get(_KW_CONTROL_FEATURES) is not None:
            raise ValueError(NO_CONTROL_FEATURES)

        _, _, sensitive_feature_vector, _ = _validate_and_reformat_input(
            X,
            y,
            sensitive_features=sensitive_features,
            enforce_binary_labels=True,
        )

        # postprocessing can't handle 0/1 as floating point numbers, so this
        # converts it to int
        if type(y) in [np.ndarray, pd.DataFrame, pd.Series]:
            y = y.astype(int)
        else:
            y = [int(y_val) for y_val in y]

        if not self.prefit:
            # Following is on two lines due to issue when estimator comes from
            # TensorFlow
            self.estimator_ = clone(self.estimator)
            self.estimator_.fit(X, y, **kwargs)
        else:
            try:
                check_is_fitted(self.estimator)
            except NotFittedError:
                warn(
                    BASE_ESTIMATOR_NOT_FITTED_WARNING.format(
                        type(self).__name__))
            self.estimator_ = self.estimator

        scores = _get_soft_predictions(self.estimator_, X,
                                       self._predict_method)
        if self.constraints == "equalized_odds":
            self.x_metric_ = "false_positive_rate"
            self.y_metric_ = "true_positive_rate"
            threshold_optimization_method = (
                self._threshold_optimization_for_equalized_odds)
        else:
            self.x_metric_ = SIMPLE_CONSTRAINTS[self.constraints]
            self.y_metric_ = self.objective
            threshold_optimization_method = (
                self._threshold_optimization_for_simple_constraints)

        self.interpolated_thresholder_ = threshold_optimization_method(
            sensitive_feature_vector, y, scores)
        return self
Example #41
    def check_is_fitted(self, label_name: str):
        if self.label.name == label_name or label_name is None:
            return check_is_fitted(
                self.model,
                ["estimators_", "coef_", "estimator", "_fit_X", "dual_coef_"],
                all_or_any=any)
Example #42
    def _transform_strategy_validator(self):
        """Private method to prep for prediction."""
        check_is_fitted(self, "statistics_")
Example #43
    def impute(self, X):
        """Generate imputations using predictions from the fit bayesian model.

        The transform method returns the values for imputation. Missing values
        in a given dataset are replaced with the random selection from the PMM
        process. Again, PMM imputes actually observed values, and the observed
        values are selected by finding the closest least squares predictions
        to a given prediction from the bayesian model.

        Args:
            X (pd.DataFrame): predictors to determine imputed values.

        Returns:
            np.array: imputed dataset.
        """
        # check if fitted then predict with least squares
        check_is_fitted(self, "statistics_")
        model = self.statistics_["param"]["model"]
        df = self.statistics_["param"]["y_obs"]
        df = df.reset_index(drop=True)

        # generate posterior distribution for alpha, beta coefficients
        with model:
            tr = pm.sample(
                draws=self.sample,
                tune=self.tune,
                init=self.init,
            )
        self.trace_ = tr

        # sample random alpha from alpha posterior distribution
        # get the mean and covariance of the multivariate betas
        # betas assumed multivariate normal by linear reg rules
        # sample beta w/ cov structure to create realistic variability
        alpha_bayes = np.random.choice(tr["alpha"])
        beta_means = tr["beta"].mean(0)
        beta_cov = np.cov(tr["beta"].T)
        beta_bayes = np.array(multivariate_normal(beta_means, beta_cov).rvs())

        # predictions for missing y, using bayes alpha + coeff samples
        # use these preds for nearest neighbor search from reg results
        # neighbors are nearest from prediction model fit on observed
        # imputed values are actual y vals corresponding to nearest neighbors
        # therefore, this is a form of "hot-deck" imputation
        y_pred_bayes = alpha_bayes + beta_bayes.dot(X.T)
        n_ = self.neighbors
        if X.columns.size == 1:
            y_pred_bayes = y_pred_bayes[0]
        if self.fill_value == "mean":
            imp = [_neighbors(x, n_, df, np.mean) for x in y_pred_bayes]
        elif self.fill_value == "random":
            choice = np.random.choice
            imp = [_neighbors(x, n_, df, choice) for x in y_pred_bayes]
        else:
            err = f"{self.fill_value} must be `mean` or `random`."
            raise ValueError(err)

        # finally, set last class values and return imputations
        self.y_pred = y_pred_bayes
        self.alphas = alpha_bayes
        self.betas = beta_bayes
        return imp
Example #44
    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        return np.ones(X.shape[0])
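
This last example uses the newer attribute-less form: since scikit-learn 0.22, check_is_fitted(estimator) with no attribute list passes as soon as the estimator carries any attribute ending (or starting) with an underscore:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_is_fitted

est = LinearRegression().fit(np.array([[0.0], [1.0]]), [0.0, 1.0])
check_is_fitted(est)   # passes: coef_, intercept_, etc. now exist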