Пример #1
0
def _daal_dbscan(X, eps=0.5, min_samples=5, sample_weight=None):
    """Run DBSCAN through daal4py and return core indices and labels.

    Parameters
    ----------
    X : array-like
        Input data; validated with ``check_array`` as float64 or float32.
    eps : float, default=0.5
        Neighbourhood radius, forwarded as ``epsilon``. Must be positive.
    min_samples : int, default=5
        Forwarded as ``minObservations``.
    sample_weight : array-like or None, default=None
        Optional per-sample weights; must have the same length as ``X``.

    Returns
    -------
    core_ind : ndarray
        Flattened core-sample indices (empty ``np.intc`` array when the
        backend reports none).
    assignments : ndarray
        Flattened cluster assignments reported by the backend.

    Raises
    ------
    ValueError
        If ``eps`` is not strictly positive.
    """
    if eps <= 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, dtype=[np.float64, np.float32])
    if sample_weight is None:
        ww = None
    else:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)
        ww = make2d(sample_weight)

    XX = make2d(X)

    alg = daal4py.dbscan(method='defaultDense',
                         fptype=getFPType(XX),
                         epsilon=float(eps),
                         minObservations=int(min_samples),
                         memorySavingMode=False,
                         resultsToCompute="computeCoreIndices")

    daal_res = alg.compute(XX, ww)
    assignments = daal_res.assignments.ravel()

    # The backend may return no core-index table at all; normalise that
    # case to an empty integer array so callers always get an ndarray.
    core_table = daal_res.coreIndices
    if core_table is None:
        core_ind = np.array([], dtype=np.intc)
    else:
        core_ind = core_table.ravel()

    return (core_ind, assignments)
Пример #2
0
def evaluate_print(clf_name, y, y_pred):
    """Print the ROC AUC and precision @ rank n of a detector.

    Parameters
    ----------
    clf_name : str
        Label printed in front of the metrics.

    y : list or numpy array of shape (n_samples,)
        Ground-truth labels. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        Raw outlier scores as returned by a fitted model.

    """
    # Flatten both inputs and make sure they describe the same samples.
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    roc = np.round(roc_auc_score(y, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y, y_pred), decimals=4)

    template = '{clf_name} ROC:{roc}, precision @ rank n:{prn}'
    print(template.format(clf_name=clf_name, roc=roc, prn=prn))
Пример #3
0
def _check_x_y(x, y):
    """Validate ``x`` and ``y`` as finite arrays of equal length.

    Both inputs go through ``check_array`` without the 2-D requirement
    and with finiteness enforced; the validated arrays are returned.
    """
    x = check_array(x, ensure_2d=False, force_all_finite=True)
    y = check_array(y, ensure_2d=False, force_all_finite=True)
    validated = (x, y)

    check_consistent_length(*validated)

    return validated
Пример #4
0
def evaluate_print(clf_name, y, y_pred):
    """Print accuracy, ROC AUC and F1 score for an estimator.

    Parameters
    ----------
    clf_name : str
        Label printed in front of the metrics.

    y : list or numpy array of shape (n_samples,)
        Ground-truth values.

    y_pred : list or numpy array of shape (n_samples,)
        Scores as returned by a fitted model.

    """
    # Flatten both inputs and make sure they describe the same samples.
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    acc = np.round(accuracy_score(y, y_pred), decimals=4)
    roc = np.round(roc_auc_score(y, y_pred), decimals=4)
    f1 = np.round(f1_score(y, y_pred), decimals=4)

    template = '{clf_name} Accuracy:{acc}, ROC:{roc}, F1:{f1}'
    print(template.format(clf_name=clf_name, acc=acc, roc=roc, f1=f1))
Пример #5
0
    def _init_fit(self, X, y, lipschitz):
        """Validate the inputs and set up the fitted-state attributes.

        Stores the random state, subsampler, group masks, regularisation
        vector, loss history and (unless warm-starting from existing
        coefficients) zero-initialised ``coef_`` / ``intercept_``.
        """
        self.random_state_ = check_random_state(self.random_state)

        check_consistent_length(X, y)
        X, X_means, y, lipschitz = self._prepare_dataset(X, y, lipschitz)

        # One column of the prepared matrix is not a coefficient column.
        n_coef_rows = X.shape[1] - 1

        self.subsampler_ = Subsampler(
            X.shape[0], self.subsampling_scheme, self.random_state_
        )

        groups = self.groups
        # Without explicit groups every coefficient row is its own group.
        groups = np.arange(n_coef_rows) if groups is None else groups
        self.group_ids_ = np.array(_parse_group_iterable(groups))

        # Boolean masks, one per non-negative group id.
        masks = []
        for group_id in np.unique(self.group_ids_):
            if group_id >= 0:
                masks.append(self.group_ids_ == group_id)
        self.groups_ = masks
        self.group_reg_vector_ = self._get_reg_vector(self.group_reg)

        self.losses_ = []

        reuse_coefficients = self.warm_start and hasattr(self, "coef_")
        if not reuse_coefficients:
            self.coef_ = np.zeros((n_coef_rows, y.shape[1]))
            self.intercept_ = np.zeros((1, self.coef_.shape[1]))

        self._check_valid_parameters()
        self.X_aug_ = X
        self.y_ = y
        self.lipschitz_ = lipschitz
        self._X_means_ = X_means
        if not (self.old_regularisation or self.supress_warning):
            warnings.warn(_OLD_REG_WARNING)
Пример #6
0
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    """Fit one detector, print its metrics and record its test scores.

    Parameters
    ----------
    method : str
        Detector name: 'KNN', 'CBLOF' or 'PCA'; any other value selects
        ``IForest``.
    total_roc, total_prn : list
        Accumulators; the rounded test ROC AUC and precision @ n are
        appended to them in place.
    x_train, x_test : array-like
        Training and test samples.
    y_train, y_test : array-like
        Ground-truth labels for the training and test samples.
    """
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector on x_train

    # Outlier scores on the training data (higher means more abnormal).
    y_train_scores = clf.decision_scores_
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # Use the fitted detector to score the unseen data.
    y_test_scores = clf.decision_function(x_test)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    # NOTE: no trailing comma here -- it would turn ``roc`` into a
    # 1-tuple and pollute ``total_roc`` with tuples instead of scalars.
    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
Пример #7
0
def score_predictor_report(y_true, y_pred, disp=True):
    """
    Summarise how well predicted scores match the true scores.

    The report contains:

    * mean absolute error
    * root mean squared error
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, the report is printed as JSON to standard error

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    def _moments(values):
        # Mean / standard deviation pair for one score vector.
        return {'mean': np.mean(values), 'stdev': np.std(values)}

    # validate inputs: finite values, float arrays, equal lengths
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    mae = skm.mean_absolute_error(y_true, y_pred)
    # Clamp at zero before the square root.
    rmse = np.sqrt(np.maximum(skm.mean_squared_error(y_true, y_pred), 0.))

    stats = {
        'mean absolute error': mae,
        'root mean squared error': rmse,
        'n_samples': y_true.size,
        'true': _moments(y_true),
        'predicted': _moments(y_pred),
    }

    if disp:
        report = json.dumps(stats,
                            sort_keys=True,
                            indent=4,
                            separators=(',', ': '),
                            ensure_ascii=False)
        print(report, file=sys.stderr)

    return stats
Пример #8
0
    def _preprocess_data_for_fit(self, X, Y, copy):
        """Check if data conforms to expectations and possibly center-scale it.

        Side effect: stores ``x_mean_``, ``y_mean_``, ``x_std_`` and
        ``y_std_`` on the instance.

        Parameters
        ----------
        X : np.ndarray (n_samples, n_X_features)
            data matrix X
        Y : np.ndarray (n_samples, n_Y_features)
            data matrix Y; a 1-D array is reshaped to a single column
        copy : bool
            whether a copy of the data is returned

        Returns
        -------
        prepared_X : np.ndarray (n_samples, n_X_features)
            data matrix X
        prepared_Y : np.ndarray (n_samples, n_Y_features)
            data matrix Y
        """

        # Pass the arrays as separate arguments: a single list argument
        # would be treated as one array and the check would never fail.
        check_consistent_length(X, Y)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)
        X = check_array(X, dtype=np.float64, copy=copy, ensure_min_samples=2)
        Y = check_array(Y, dtype=np.float64, copy=copy, ensure_min_samples=2)

        # Scale (in place)
        X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = \
            _center_scale_xy(X, Y, scale=self.scale, ddof=self.std_ddof)

        return X, Y
Пример #9
0
def mean_absolute_error(y_true, y_pred):
    """
    Mean and standard deviation of the absolute errors.

    If you need only mean absolute error, use
    :func:`sklearn.metrics.mean_absolute_error`

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    mean : float
        mean of absolute errors
    stdev : float
        standard deviation of absolute errors
    """

    # validate inputs: finite values, float arrays, equal lengths
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # element-wise absolute deviations and their summary statistics
    abs_dev = np.abs(y_true - y_pred)

    return np.nanmean(abs_dev), np.nanstd(abs_dev)
Пример #10
0
def obs_fuzziness(y_true, p_values):
    """**Classification** - Calculate the Observed Fuzziness (OF)

    For every sample the p-values of all labels except the true one are
    summed; the result is the average of those sums over the samples.
    Significance independent metric, smaller is better.

    Parameters
    ----------
    y_true : 1D numpy array, list or pandas Series
        True labels

    p_values : 2D numpy array or DataFrame
        The predicted p-values, first column for the class 0, second for class 1, ..

    Returns
    -------
    obs_fuzz : float
        Observed fuzziness
    """
    p_values = to_numpy2D(p_values,'p_values')
    y_true = to_numpy1D_int(y_true, 'y_true')
    check_consistent_length(y_true, p_values)

    total = 0
    for row, true_label in zip(p_values, y_true):
        # Hide the p-value of the true label, then add up what is left.
        others = np.ma.array(row, mask=False)
        others.mask[true_label] = True
        total += others.sum()

    return total / len(y_true)
Пример #11
0
    def _check_consistent_input(self, y_true, y_pred, multioutput):
        """Validate targets, predictions and the ``multioutput`` option.

        Parameters
        ----------
        y_true : array-like
            Ground-truth values; a 1-D input is reshaped to one column.
        y_pred : pandas.DataFrame
            Predictions; every column must have float dtype.
        multioutput : str, array-like or None
            Either one of "raw_values", "uniform_average",
            "variance_weighted", or custom per-output weights, or None.

        Returns
        -------
        y_true : ndarray
            Validated 2-D targets.
        y_pred : pandas.DataFrame
            The predictions, unchanged.
        multioutput : str, ndarray or None
            The validated ``multioutput`` argument.

        Raises
        ------
        ValueError
            If ``y_pred`` is not a DataFrame of floats, if ``multioutput``
            is an unknown string, or if custom weights do not match the
            number of outputs.
        """
        check_consistent_length(y_true, y_pred)

        y_true = check_array(y_true, ensure_2d=False)

        # These exceptions must actually be raised; merely constructing
        # them lets invalid predictions pass through silently.
        if not isinstance(y_pred, pd.DataFrame):
            raise ValueError("y_pred should be a dataframe.")

        if not all(y_pred.dtypes == float):
            raise ValueError("Data should be numeric.")

        if y_true.ndim == 1:
            y_true = y_true.reshape((-1, 1))

        n_outputs = y_true.shape[1]

        allowed_multioutput_str = ("raw_values", "uniform_average",
                                   "variance_weighted")
        if isinstance(multioutput, str):
            if multioutput not in allowed_multioutput_str:
                raise ValueError("Allowed 'multioutput' string values are {}. "
                                 "You provided multioutput={!r}".format(
                                     allowed_multioutput_str, multioutput))
        elif multioutput is not None:
            multioutput = check_array(multioutput, ensure_2d=False)
            if n_outputs == 1:
                raise ValueError(
                    "Custom weights are useful only in multi-output case.")
            elif n_outputs != len(multioutput):
                raise ValueError(
                    "There must be equally many custom weights (%d) as outputs (%d)."
                    % (len(multioutput), n_outputs))

        return y_true, y_pred, multioutput
def check_arrays_survival(X, y, force_all_finite=True):
    """Validate survival data and split the target into its two parts.

    Parameters
    ----------
    X : array-like
        Data matrix containing feature vectors.

    y : structured array with two fields
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    force_all_finite : boolean (default=True)
        Whether to raise an error on np.inf and np.nan in X.

    Returns
    -------
    X : array, shape=[n_samples, n_features]
        Feature vectors.

    event : array, shape=[n_samples,], dtype=bool
        Binary event indicator.

    time : array, shape=[n_samples,], dtype=float
        Time of event or censoring.
    """
    # The target is unpacked first so that a malformed ``y`` is reported
    # before ``X`` is inspected.
    event, time = check_y_survival(y)
    X = check_array(
        X,
        dtype=float,
        ensure_min_samples=2,
        force_all_finite=force_all_finite,
    )
    check_consistent_length(X, event, time)
    return X, event, time
Пример #13
0
Файл: cm.py Проект: chnlyi/i2b2
def _check_targets(y_true, y_pred):
    """Determine the common target type of ``y_true`` and ``y_pred``.

    Returns
    -------
    y_type : str
        One of "binary", "multiclass" or "multilabel-indicator".
    y_true, y_pred :
        1-D arrays for binary/multiclass targets, CSR matrices for
        multilabel targets.

    Raises
    ------
    ValueError
        If the two inputs have incompatible target types or the common
        type is not one of the supported ones.
    """
    check_consistent_length(y_true, y_pred)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    kinds = {type_true, type_pred}
    # A binary/multiclass pair is handled as multiclass.
    if kinds == {"binary", "multiclass"}:
        kinds = {"multiclass"}

    if len(kinds) > 1:
        raise ValueError("Classification metrics can't handle a mix of {0} "
                         "and {1} targets".format(type_true, type_pred))

    # Exactly one entry is left at this point.
    y_type = kinds.pop()

    # No metrics support "multiclass-multioutput" format
    supported = ("binary", "multiclass", "multilabel-indicator")
    if y_type not in supported:
        raise ValueError("{0} is not supported".format(y_type))

    if y_type == "multilabel-indicator":
        y_true = csr_matrix(y_true)
        y_pred = csr_matrix(y_pred)
    else:
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)
        # Two "binary" inputs can still carry more than two distinct
        # labels between them.
        if y_type == "binary" and len(np.union1d(y_true, y_pred)) > 2:
            y_type = "multiclass"

    return y_type, y_true, y_pred
Пример #14
0
def check_consistent_shape(X_train, y_train, X_test, y_test, y_train_pred,
                           y_test_pred):
    """Internal function to check input data shapes are consistent.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : list or array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : list or array of shape (n_samples,)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
        The validated inputs, in the same order as the parameters. The
        prediction vectors are returned as 1-D arrays.

    Raises
    ------
    ValueError
        If the training and test samples have a different number of
        features.
    """

    # validate each (samples, labels) pair
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    # flatten the predictions
    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)

    # predictions must line up with their ground truth
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
        check_consistent_length(truth, predicted)

    same_width = X_train.shape[1] == X_test.shape[1]
    if not same_width:
        raise ValueError("X_train {0} and X_test {1} have different number "
                         "of features.".format(X_train.shape, X_test.shape))

    return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
Пример #15
0
    def _validate_X_y_sample_weight(self, X, y, sample_weight):
        """Validate X, y and sample_weight and check they have equal length.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples used to fit the classifier.
        y : array-like of shape (n_samples)
            Labels of the input samples 'X'. There may be missing labels.

        sample_weight : array-like of shape (n_samples,) (default=None)
            Sample weights for X, used to fit the clf.

        Returns
        -------
        X : array-like of shape (n_samples, n_features)
            Checked input samples (untouched when X or y is None).
        y : array-like of shape (n_samples)
            Labels converted to a numpy array (untouched when X or y is
            None).
        sample_weight : numpy array or None
            Sample weights converted to a numpy array, or None.
        """
        if sample_weight is not None:
            sample_weight = np.array(sample_weight)
            check_consistent_length(sample_weight, y)

        # Nothing more to validate unless both samples and labels exist.
        if X is None or y is None:
            return X, y, sample_weight

        X = check_array(X)
        y = np.array(y)
        check_consistent_length(X, y)
        return X, y, sample_weight
Пример #16
0
    def fit(self, X, y):
        """ Fit model with specified loss.

        The two class labels found in ``y`` are stored in ``classes_`` and
        re-encoded as -1 (first class) and 1 (second class) before being
        handed to the solver.

        Parameters
        ----------
        X : scipy.sparse.csc_matrix, (n_samples, n_features)

        y : float | ndarray, shape = (n_samples, )

                the targets; exactly two distinct classes are required.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two classes.
        """
        y = _validate_class_labels(y)
        self.classes_ = np.unique(y)
        if len(self.classes_) != 2:
            raise ValueError("This solver only supports binary classification"
                             " but the data contains"
                             " class: %r" % self.classes_)

        # fastFM-core expects labels to be in {-1,1}
        y_train = y.copy()
        i_class1 = (y_train == self.classes_[0])
        y_train[i_class1] = -1
        # Boolean masks are inverted with ``~``; unary minus on a boolean
        # array is rejected by NumPy.
        y_train[~i_class1] = 1

        check_consistent_length(X, y)
        # Hand the encoded labels (not the raw ones) to the solver.
        y_train = y_train.astype(np.float64)
        # The solver is called with the transposed design matrix.
        X = X.T
        X = check_array(X, accept_sparse="csc", dtype=np.float64)

        self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y_train)
        return self
Пример #17
0
    def fit(self, X, y):
        """ Fit model with specified loss.

        The two class labels found in ``y`` are stored in ``classes_`` and
        re-encoded as -1 (first class) and 1 (second class) before being
        handed to the solver.

        Parameters
        ----------
        X : scipy.sparse.csc_matrix, (n_samples, n_features)

        y : float | ndarray, shape = (n_samples, )

                the targets; exactly two distinct classes are required.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two classes.
        """
        y = _validate_class_labels(y)
        self.classes_ = np.unique(y)
        if len(self.classes_) != 2:
            raise ValueError("This solver only supports binary classification"
                             " but the data contains"
                             " class: %r" % self.classes_)

        # fastFM-core expects labels to be in {-1,1}
        y_train = y.copy()
        i_class1 = (y_train == self.classes_[0])
        y_train[i_class1] = -1
        # Boolean masks are inverted with ``~``; unary minus on a boolean
        # array is rejected by NumPy.
        y_train[~i_class1] = 1

        check_consistent_length(X, y)
        # Hand the encoded labels (not the raw ones) to the solver.
        y_train = y_train.astype(np.float64)
        # The solver is called with the transposed design matrix.
        X = X.T
        X = check_array(X, accept_sparse="csc", dtype=np.float64)

        self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y_train)
        return self
Пример #18
0
    def fit(self, X, y, strata):
        """Fit one independent model per stratum.

        Args:
            X: numpy matrix of predictors.
            y: numpy array.
            strata: numpy array of strata. It's sensible to envision the need of a
                multi-column matrix strata, but there is no such need for now.

        Returns:
            self, with ``models_`` (stratum value -> fitted model),
            ``_residues`` (sum of the per-model ``_residues``) and
            ``_n_obs`` (number of rows in X) set.
        """
        X, y = check_X_y(
            X, y, ensure_2d=True, copy=False, y_numeric=True,
            multi_output=False,
        )
        strata = check_array(strata, ensure_2d=False)
        check_consistent_length(y, strata)

        fitted = {}
        total_residues = 0.0
        for stratum in np.unique(strata):
            # Row indices belonging to the current stratum.
            rows = np.where(strata == stratum)[0]
            sub_model = self.model_class(**self.model_kwargs)
            sub_model.fit(X[rows, :], y[rows])
            fitted[stratum] = sub_model
            total_residues += sub_model._residues

        self.models_ = fitted
        self._residues = total_residues
        self._n_obs = X.shape[0]

        return self
Пример #19
0
def check_inputs(X, y, sample_weight=None, ensure_2d=True):
    """Input validation for debiasing algorithms.

    Optionally enforces a 2D ``X``, flattens ``y`` and ``sample_weight``,
    substitutes unit weights when none are given, and verifies that all
    three have the same length.

    Args:
        X (array-like): Input data.
        y (array-like, shape = (n_samples,)): Target values.
        sample_weight (array-like, optional): Sample weights.
        ensure_2d (bool, optional): Whether to raise a ValueError if X is not
            2D.

    Returns:
        tuple:

            * **X** (`array-like`) -- Validated X. Unchanged.

            * **y** (`array-like`) -- Validated y. Possibly converted to 1D if
              not a :class:`pandas.Series`.
            * **sample_weight** (`array-like`) -- Validated sample_weight. If no
              sample_weight is provided, returns a consistent-length array of
              ones.

    Raises:
        ValueError: If ``ensure_2d`` is set and ``X`` is not 2D.
    """
    if ensure_2d and X.ndim != 2:
        message = "Expected X to be 2D, got ndim == {} instead."
        raise ValueError(message.format(X.ndim))

    # A Series is kept as-is so that its index survives.
    if not isinstance(y, pd.Series):
        y = column_or_1d(y)

    if sample_weight is None:
        sample_weight = np.ones(X.shape[0])
    else:
        sample_weight = column_or_1d(sample_weight)

    check_consistent_length(X, y, sample_weight)
    return X, y, sample_weight
Пример #20
0
def _binary_clf_curve(y_true, y_score):
    """Count false and true positives at every distinct score threshold.

    Samples whose label equals 1 are treated as positives.

    Returns
    -------
    fps : array
        False-positive counts, one per threshold.
    tps : array
        True-positive counts, one per threshold.
    thresholds : array
        The distinct score values, in decreasing order.
    """
    check_consistent_length(y_true, y_score, None)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    # boolean mask of the positive samples
    is_positive = (y_true == 1)

    # order samples by decreasing score (stable sort)
    order = np.argsort(y_score, kind="mergesort")[::-1]
    ranked_scores = y_score[order]
    ranked_positive = is_positive[order]

    # Tied scores collapse into one threshold: keep the last index of
    # each run of equal values, plus the final index to close the curve.
    run_ends = np.where(np.diff(ranked_scores))[0]
    cut_points = np.r_[run_ends, ranked_positive.size - 1]

    # cumulative positives seen down to each threshold
    tps = stable_cumsum(ranked_positive)[cut_points]
    fps = 1 + cut_points - tps

    return fps, tps, ranked_scores[cut_points]
Пример #21
0
 def _prepare_dataset(self, X, y):
     """Validate X and y and return y as a 2-D column array.

     X and y must have the same length. A one-dimensional ``y`` is
     reshaped to shape (n_samples, 1); X is returned unchanged.
     """
     check_consistent_length(X, y)
     check_array(X)
     # Allow 1-D targets here: with the default ``ensure_2d=True`` the
     # validation would reject them before they can be reshaped below.
     check_array(y, ensure_2d=False)
     if len(y.shape) == 1:
         y = y.reshape(-1, 1)
     return X, y
Пример #22
0
def _check_data(X, y, solver=None):
    """Validate X and y and report their dimensions.

    The 'sag' solver only accepts CSR sparse input and requests a
    C-ordered float64 X and an F-ordered float64 y; every other solver
    accepts CSR/CSC/COO input.

    Returns
    -------
    X, y : validated arrays (y always 2-D)
    n_samples, n_features, n_targets : int
    ravel : bool
        True when the original y was one-dimensional.

    Raises
    ------
    ValueError
        If y has more than two dimensions or a different number of
        samples than X.
    """
    if solver == 'sag':
        x_options = dict(accept_sparse=['csr'], dtype=np.float64, order='C')
        y_options = dict(dtype=np.float64, ensure_2d=False, order='F')
    else:
        x_options = dict(accept_sparse=['csr', 'csc', 'coo'],
                         dtype=np.float64)
        y_options = dict(dtype='numeric', ensure_2d=False)
    X = check_array(X, **x_options)
    y = check_array(y, **y_options)

    check_consistent_length(X, y)

    n_samples, n_features = X.shape

    if y.ndim > 2:
        raise ValueError("Target y has the wrong shape %s" % str(y.shape))

    # Remember whether the caller passed a flat target vector.
    ravel = y.ndim == 1
    if ravel:
        y = y.reshape(-1, 1)

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))
    return X, y, n_samples, n_features, n_targets, ravel
Пример #23
0
 def fit(self, X, y, trt, n_trt=None):
     """Fit one base estimator per treatment value.

     For each model index ``i`` the rows where ``trt == i`` are selected,
     the estimator is fitted on them, and its ``coef_`` is then replaced
     by a least-squares solution computed on data shifted by the output
     of ``GaussianMatrix``.

     NOTE(review): the meaning of the two components returned by
     ``GaussianMatrix`` is not visible here -- confirm against its
     definition.

     Returns
     -------
     self
     """
     X, y = check_X_y(X, y, accept_sparse="csr")
     self.trt_, self.n_trt_ = check_trt(trt, n_trt)
     check_consistent_length(X, y, self.trt_)
     self.n_models_ = self.n_trt_ + 1
     self.models_ = self._check_base_estimator(self.n_models_)
     self.n_ = np.empty(self.n_models_, dtype=int)
     self.p = X.shape[1]
     self.sigma = np.empty(self.n_models_)
     for i in range(self.n_models_):
         estimator = self.models_[i][1]
         # Rows assigned to treatment i.
         selected = (trt == i)
         self.n_[i] = selected.sum()
         Xi = X[selected]
         yi = y[selected]
         M = GaussianMatrix(Xi, yi, 3 * np.eye(self.p))
         estimator.fit(Xi, yi)
         # Overwrite the fitted coefficients with the normal-equation
         # solution on the shifted data.
         X_shifted = Xi - M[0]
         y_shifted = yi - M[1]
         estimator.coef_ = np.linalg.inv(
             X_shifted.T @ X_shifted) @ X_shifted.T @ y_shifted
     return self
Пример #24
0
    def _prepare_dataset(self, X, y, lipschitz):
        """Validate the inputs, encode the labels and augment X.

        Returns the design matrix with the intercept column added, the
        encoded targets, and the Lipschitz bound (computed here when the
        caller passed None).
        """
        binarizer = LabelBinarizer()
        self.label_binarizer_ = binarizer
        binarizer.fit(y)
        y = self._encode(y)
        check_consistent_length(X, y)
        X = check_array(X, accept_sparse="csr")
        check_array(y, ensure_2d=False)
        observed_values = set(np.unique(y))
        if observed_values != {0, 1}:
            raise ValueError(
                "The target array must either be a 2D dummy encoded (binary)"
                "array or a 1D array with class labels as array elements.")

        # The intercept column is always added, but it only takes part in
        # the Lipschitz computation when an intercept is actually fitted.
        if self.fit_intercept:
            X = check_array(_add_intercept_col(X), accept_sparse="csr")

        if lipschitz is None:
            lipschitz = self._compute_lipschitz(X, y)

        if not self.fit_intercept:
            X = check_array(_add_intercept_col(X), accept_sparse="csr")

        return X, y, lipschitz
Пример #25
0
    def __call__(self, y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
        """Compute the log loss of ``y_pred`` against ``y_true``.

        The label binarizer is fitted on the first call and reused on
        later calls. Predictions are clipped to ``[eps, 1 - eps]`` and
        renormalised per row before the loss is taken; the per-sample
        losses are combined by ``_weighted_sum``.
        """
        if self.lb_ is not None:
            T = self.lb_.transform(y_true)
        else:
            self.lb_ = LabelBinarizer()
            T = self.lb_.fit_transform(y_true)

        # A single indicator column is expanded to two complementary ones.
        if T.shape[1] == 1:
            T = np.append(1 - T, T, axis=1)

        proba = np.clip(y_pred, eps, 1 - eps)

        if not isinstance(proba, np.ndarray):
            raise ValueError("y_pred should be an array of floats.")

        if proba.ndim == 1:
            proba = proba[:, np.newaxis]
        if proba.shape[1] == 1:
            proba = np.append(1 - proba, proba, axis=1)

        check_consistent_length(T, proba)
        T = check_array(T)
        proba = check_array(proba)
        n_true_classes, n_pred_classes = T.shape[1], proba.shape[1]
        if n_true_classes != n_pred_classes:
            raise ValueError("y_true and y_pred have different number of classes " "%d, %d" % (n_true_classes, n_pred_classes))

        # Renormalise each row to sum to one (in place).
        proba /= proba.sum(axis=1)[:, np.newaxis]
        per_sample_loss = -(T * np.log(proba)).sum(axis=1)

        return _weighted_sum(per_sample_loss, sample_weight, normalize)
Пример #26
0
def evaluate_print(clf_name, y, y_pred):
    """Print the ROC AUC and precision @ rank n of a detector.

    Parameters
    ----------
    clf_name : str
        Label printed in front of the metrics.

    y : list or numpy array of shape (n_samples,)
        Ground-truth labels. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        Raw outlier scores as returned by a fitted model.

    """
    # Flatten both inputs and make sure they describe the same samples.
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    roc = np.round(roc_auc_score(y, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y, y_pred), decimals=4)

    template = '{clf_name} ROC:{roc}, precision @ rank n:{prn}'
    print(template.format(clf_name=clf_name, roc=roc, prn=prn))
Пример #27
0
def check_consistent_shape(X_train, y_train, X_test, y_test, y_train_pred,
                           y_test_pred):
    """Internal function to check input data shapes are consistent.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : list or array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : list or array of shape (n_samples,)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
        The validated inputs, in the same order as the parameters. The
        prediction vectors are returned as 1-D arrays.

    Raises
    ------
    ValueError
        If the training and test samples have a different number of
        features.
    """

    # validate each (samples, labels) pair
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    # flatten the predictions
    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)

    # predictions must line up with their ground truth
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
        check_consistent_length(truth, predicted)

    same_width = X_train.shape[1] == X_test.shape[1]
    if not same_width:
        raise ValueError("X_train {0} and X_test {1} have different number "
                         "of features.".format(X_train.shape, X_test.shape))

    return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
Пример #28
0
    def fit(self, X_train, y_train, n_more_iter=0):
        """ Fit model with specified loss.

        Parameters
        ----------
        X_train : scipy.sparse.csc_matrix, (n_samples, n_features)

        y_train : float | ndarray, shape = (n_samples, )

        n_more_iter : int
                Number of iterations to continue from the current Coefficients.

        Returns
        -------
        self
        """

        check_consistent_length(X_train, y_train)
        y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)
        X_train = check_array(
            X_train, accept_sparse="csc", dtype=np.float64, order="F"
        )

        self.n_iter = self.n_iter + n_more_iter

        # A positive n_more_iter means: continue from the current state.
        continue_training = n_more_iter > 0
        if continue_training:
            _check_warm_start(self, X_train)
            self.warm_start = True

        self.w0_, self.w_, self.V_ = ffm.ffm_als_fit(self, X_train, y_train)

        if self.iter_count == 0:
            self.iter_count = self.n_iter
        else:
            self.iter_count = self.iter_count + n_more_iter

        # warm start is only ever enabled for the duration of this call
        self.warm_start = False
        return self
Пример #29
0
def check_arrays_survival(X, y, **kwargs):
    """Validate survival data and split the target into its two parts.

    Parameters
    ----------
    X : array-like
        Data matrix containing feature vectors.

    y : structured array with two fields
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    kwargs : dict
        Additional arguments passed to :func:`sklearn.utils.check_array`.
        ``dtype`` defaults to float64 when not given.

    Returns
    -------
    X : array, shape=[n_samples, n_features]
        Feature vectors.

    event : array, shape=[n_samples,], dtype=bool
        Binary event indicator.

    time : array, shape=[n_samples,], dtype=float
        Time of event or censoring.
    """
    event, time = check_y_survival(y)

    # Only fill in the dtype when the caller did not choose one.
    if "dtype" not in kwargs:
        kwargs["dtype"] = numpy.float64

    X = check_array(X, ensure_min_samples=2, **kwargs)
    check_consistent_length(X, event, time)
    return X, event, time
Пример #30
0
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    """Compute the per-sample logarithmic (cross-entropy) loss.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; they are one-hot encoded with a
        ``LabelBinarizer``.

    y_pred : array-like of float
        Predicted probabilities. A 1-D input, or a single column, is
        treated as the probability of the second class of a binary problem.

    eps : float, default=1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` before the log.

    normalize : bool, default=True
        Accepted but not used by this implementation.

    sample_weight : array-like, optional
        Accepted but not used by this implementation.

    Returns
    -------
    loss : ndarray, shape (n_samples,)
        One loss value per sample (no averaging is performed here).

    Raises
    ------
    ValueError
        If clipping ``y_pred`` does not yield an ndarray, or if the number
        of classes in ``y_true`` and ``y_pred`` differ.
    """
    truth = LabelBinarizer().fit_transform(y_true)
    if truth.shape[1] == 1:
        # Single indicator column: add the complementary column in front.
        truth = np.append(1 - truth, truth, axis=1)

    proba = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(proba, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # A single vector/column of probabilities is expanded to two columns.
    if proba.ndim == 1:
        proba = proba[:, None]
    if proba.shape[1] == 1:
        proba = np.append(1 - proba, proba, axis=1)

    check_consistent_length(truth, proba)
    truth = check_array(truth)
    proba = check_array(proba)

    n_true_classes, n_pred_classes = truth.shape[1], proba.shape[1]
    if n_true_classes != n_pred_classes:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (n_true_classes, n_pred_classes))

    # Renormalise each row so that the clipped probabilities sum to one.
    proba /= proba.sum(axis=1, keepdims=True)
    return -(truth * np.log(proba)).sum(axis=1)
Пример #31
0
 def _check_X_y(X, y):
     """Validate ``X`` and ``y`` and report whether ``y`` must be binarized.

     Returns the validated ``X``, the validated ``y`` and the flag produced
     by ``check_target_type(..., indicate_one_vs_all=True)``.
     """
     y, binarize_y = check_target_type(y, indicate_one_vs_all=True)

     # Both arrays keep their dtype and may be CSR/CSC sparse.
     shared = dict(accept_sparse=['csr', 'csc'], dtype=None)
     X = check_array(X, **shared)
     y = check_array(y, ensure_2d=False, **shared)

     check_consistent_length(X, y)
     return X, y, binarize_y
Пример #32
0
def _process_graphs(graphs, inner_hier_labels, outer_hier_labels, transform,
                    sort_nodes):
    """Transform and sort a collection of graphs prior to plotting.

    Each graph is length-checked against both label arrays, passed through
    ``_transform`` and finally through ``_sort_graph``. When no inner labels
    are given, both label arrays are replaced by arrays of ones sized from
    the first graph; when only the outer labels are missing they become
    ``np.ones_like`` of the inner labels.

    Returns the list of processed graphs.
    """
    for graph in graphs:
        check_consistent_length(graph, inner_hier_labels, outer_hier_labels)

    transformed = [_transform(graph, transform) for graph in graphs]

    if inner_hier_labels is None:
        # No grouping supplied: every node gets the same label.
        inner_hier_labels = np.ones(transformed[0].shape[0], dtype=int)
        outer_hier_labels = np.ones_like(inner_hier_labels)
    else:
        inner_hier_labels = np.array(inner_hier_labels)
        outer_hier_labels = (
            np.ones_like(inner_hier_labels)
            if outer_hier_labels is None
            else np.array(outer_hier_labels)
        )

    return [
        _sort_graph(graph, inner_hier_labels, outer_hier_labels, sort_nodes)
        for graph in transformed
    ]
Пример #33
0
    def _check_X_y(self, X, y):
        """Validate ``X`` and ``y``, remembering pandas metadata if present.

        When ``X`` / ``y`` expose a ``loc`` attribute (treated here as the
        marker of a pandas object), their column names / dtypes, or name /
        dtype, are stored on the instance; otherwise the corresponding
        attributes are set to ``None``.

        Returns the validated ``X``, the validated ``y`` and the flag
        produced by ``check_target_type(..., indicate_one_vs_all=True)``.
        """
        # Information needed to rebuild a dataframe later on.
        x_is_frame = hasattr(X, "loc")
        self._X_columns = X.columns if x_is_frame else None
        self._X_dtypes = X.dtypes if x_is_frame else None

        # Information needed to rebuild a series later on.
        y_is_series = hasattr(y, "loc")
        self._y_name = y.name if y_is_series else None
        self._y_dtype = y.dtype if y_is_series else None

        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)

        sparse_formats = ["csr", "csc"]
        # Non-finite values are tolerated in X but not in y.
        X = check_array(X, accept_sparse=sparse_formats, dtype=None,
                        force_all_finite=False)
        y = check_array(y, accept_sparse=sparse_formats, dtype=None,
                        ensure_2d=False)

        check_consistent_length(X, y)
        return X, y, binarize_y
Пример #34
0
def nonnegative_regression(X, y, sample_weight=None):
    r"""Solve the nonnegative least squares regression problem.

    Solves :math:`\underset{x}{\text{argmin}} \| Ax - b \|_2^2` subject to
    :math:`x \geq 0`; the actual solve is delegated to ``_solve_nnls``
    (see `scipy.optimize.nnls <https://docs.scipy.org/doc/scipy/reference/
    generated/scipy.optimize.nnls.html>`_).

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        Training data.

    y : array, shape = (n_samples,) or (n_samples, n_targets)
        Target values.

    sample_weight : float or array-like, shape (n_samples,), optional
        Individual weights for each sample. When given, ``X`` and ``y`` are
        passed through ``_rescale_data`` before solving.

    Returns
    -------
    coef : array
        Weight vector(s) from ``_solve_nnls``; flattened with ``ravel``
        when ``y`` was 1-D. For 2-D ``y`` the shape is whatever
        ``_solve_nnls`` produces (presumably (n_targets, n_features) --
        TODO confirm).

    res : float
        The residual, :math:`\| Ax - y \|_2`, as returned by
        ``_solve_nnls``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` disagree on the number of samples, or if
        ``sample_weight`` has more than one dimension.
    """
    # TODO accept_sparse=['csr', 'csc', 'coo']? check sopt.nnls
    # TODO order='F'?
    X = check_array(X)
    y = check_array(y, ensure_2d=False)
    check_consistent_length(X, y)

    n_samples = X.shape[0]

    # Work on a column matrix internally; remember to undo it on return.
    flatten_coef = y.ndim == 1
    if flatten_coef:
        y = y.reshape(-1, 1)

    n_target_rows = y.shape[0]
    if n_samples != n_target_rows:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_target_rows))

    if sample_weight is not None:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")
        X, y = _rescale_data(X, y, sample_weight)

    coef, res = _solve_nnls(X, y)

    # When y was passed as 1d-array, we flatten the coefficients
    return (coef.ravel() if flatten_coef else coef), res
Пример #35
0
def _check_targets(y_true, y_pred):
    """Check that y_true and y_pred describe the same classification task.

    Binary and multiclass targets are converted to 1-d arrays; multilabel
    targets are returned as CSR sparse label indicators. A ``ValueError`` is
    raised for targets of different lengths, for a mix of target types
    (other than binary + multiclass, which is treated as multiclass), and
    for any target type other than binary, multiclass or
    multilabel-indicator.

    Parameters
    ----------
    y_true : array-like

    y_pred : array-like

    Returns
    -------
    type_true : one of {'multilabel-indicator', 'multiclass', 'binary'}
        The common target type, derived from
        ``utils.multiclass.type_of_target``

    y_true : array or indicator matrix

    y_pred : array or indicator matrix
    """
    check_consistent_length(y_true, y_pred)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    kinds = {type_true, type_pred}
    if kinds == {"binary", "multiclass"}:
        kinds = {"multiclass"}

    if len(kinds) > 1:
        raise ValueError("Classification metrics can't handle a mix of {0} "
                         "and {1} targets".format(type_true, type_pred))

    # Exactly one element is left at this point.
    y_type = kinds.pop()

    # No metrics support "multiclass-multioutput" format
    if y_type not in ("binary", "multiclass", "multilabel-indicator"):
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ("binary", "multiclass"):
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)
        # Two "binary" arrays may still hold more than two labels together.
        if y_type == "binary" and len(np.union1d(y_true, y_pred)) > 2:
            y_type = "multiclass"
    else:
        # Only the multilabel indicator format can reach this branch.
        y_true = csr_matrix(y_true)
        y_pred = csr_matrix(y_pred)
        y_type = 'multilabel-indicator'

    return y_type, y_true, y_pred
Пример #36
0
 def _evaluate_pointwise_score(y_true, y_pred, score_func):
     """Return the mean and standard error of per-sample scores.

     ``score_func(y_true, y_pred)`` must return one score per sample; its
     length is checked against ``y_true``. The standard error is computed
     as ``np.std(scores) / sqrt(n - 1)``.
     """
     scores = score_func(y_true, y_pred)
     check_consistent_length(scores, y_true)

     average = np.mean(scores)
     n_scores = scores.shape[0]
     return average, np.std(scores) / np.sqrt(n_scores - 1)
Пример #37
0
def _dump_df_excel(obj, file, **kwargs):
    '''Dump one or more 2-D datasets to the sheets of an Excel file.

    obj:
        2d array like data, or a (possibly nested) collection of them; it is
        flattened with ``get_flat_list`` and every item is written to its
        own sheet.
    file:
        str or file obj: target passed to ``pd.ExcelWriter``.
    kwargs:
        ``sheet_name`` (optional) gives the sheet names, one per item;
        default names are ``sheet1``, ``sheet2``, ... ``index`` defaults to
        False. The remaining keywords are passed through ``get_kwargs``
        (presumably keeping those ``DataFrame.to_excel`` accepts -- confirm
        against that helper).

    A sheet that fails to be written is reported on stdout and skipped, so
    one bad item does not prevent the others from being saved.
    '''
    # Validate everything before the writer is opened, so an invalid call
    # does not leave a half-initialised file handle behind.
    obj = get_flat_list(obj)

    sheet_name = kwargs.get('sheet_name')
    if sheet_name is None:
        sheet_name = ['sheet' + str(i + 1) for i in range(len(obj))]
    else:
        sheet_name = get_flat_list(sheet_name)
        check_consistent_length(obj, sheet_name)

    # The context manager saves and closes the workbook on exit; the
    # explicit ``writer.save()`` call no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(file) as writer:
        for data, name in zip(obj, sheet_name):
            try:
                data = pd.DataFrame(data)
                kw = get_kwargs(data.to_excel, **kwargs)
                kw.update({
                    'sheet_name': name,
                    'index': kwargs.get('index', False)
                })
                data.to_excel(writer, **kw)
            except Exception as e:
                # Best effort: report the failure and go on with the rest.
                print(repr(e))
                continue
Пример #38
0
def _check_targets_hmc(y_true, y_pred):
    """Validate a pair of multiclass target arrays.

    Both arrays must have the same length and, taken together, be of
    multiclass type (a binary/multiclass mix is accepted as multiclass).
    Anything else raises ``ValueError``. Returns both arrays as 1-d.
    """
    check_consistent_length(y_true, y_pred)

    y_type = {type_of_target(y_true), type_of_target(y_pred)}
    if y_type == {"binary", "multiclass"}:
        y_type = {"multiclass"}
    if y_type != {"multiclass"}:
        raise ValueError("{0} is not supported".format(y_type))

    return column_or_1d(y_true), column_or_1d(y_pred)
Пример #39
0
def _validate_mcmc_fit_input(X_train, y_train, X_test):
    """Validate the training and test inputs of an MCMC fit.

    ``y_train`` must be finite and match ``X_train`` in length; it is
    returned as a float64 array. ``X_train`` and ``X_test`` must have the
    same number of columns (checked with an ``assert``) and are returned as
    float64, Fortran-ordered arrays or CSC sparse matrices.
    """
    check_consistent_length(X_train, y_train)
    assert_all_finite(y_train)
    y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)

    assert X_train.shape[1] == X_test.shape[1]

    # Same conversion rules for both design matrices.
    design_opts = dict(accept_sparse="csc", dtype=np.float64, order="F")
    X_train = check_array(X_train, **design_opts)
    X_test = check_array(X_test, **design_opts)
    return X_train, y_train, X_test
Пример #40
0
def item_finder_statistics(y_true, y_pred):
    """
    Full statistics of prediction performance

    * n_samples
    * true: mean, stdev
    * predicted: mean, stdev
    * area under the curve (only when ``is_binary_score`` accepts
      ``y_true`` with ``allow_uniform=False``)

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores; must be binary
    y_pred : array, shape=(n_samples,)
        Predicted scores

    Returns
    -------
    stats : dict
        Full statistics of prediction performance

    Raises
    ------
    ValueError
        If ``y_true`` is not binary according to ``is_binary_score``.
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # dataset size and descriptive statistics of both score arrays
    stats = {
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)},
    }

    # AUC (area under the curve) is only added when the stricter
    # is_binary_score check (allow_uniform=False) passes
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    return stats
Пример #41
0
def item_finder_report(y_true, y_pred, disp=True):
    """
    Report a brief summary of prediction performance

    * AUC (only when ``is_binary_score`` accepts ``y_true`` with
      ``allow_uniform=False``)
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores; must be binary
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print the report as JSON to standard error

    Returns
    -------
    stats : dict
        brief summary of prediction performance

    Raises
    ------
    ValueError
        If ``y_true`` is not binary according to ``is_binary_score``.
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}
    stats['n_samples'] = y_true.size
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    # AUC is only added when the stricter is_binary_score check passes
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    # display statistics
    if disp:
        report = json.dumps(
            stats, sort_keys=True, indent=4, separators=(',', ': '),
            ensure_ascii=False)
        print(report, file=sys.stderr)

    return stats
Пример #42
0
def score_predictor_report(y_true, y_pred, disp=True):
    """
    Report a brief summary of prediction performance

    * mean absolute error
    * root mean squared error
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print the report as JSON to standard error

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}
    stats['mean absolute error'] = skm.mean_absolute_error(y_true, y_pred)
    mse = skm.mean_squared_error(y_true, y_pred)
    # clamp at zero before taking the root
    stats['root mean squared error'] = np.sqrt(np.maximum(mse, 0.))
    stats['n_samples'] = y_true.size
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    # display statistics
    if disp:
        report = json.dumps(
            stats, sort_keys=True, indent=4, separators=(',', ': '),
            ensure_ascii=False)
        print(report, file=sys.stderr)

    return stats
    def __init__(self, x, y, status, time=None):
        """Validate and store the inputs.

        Parameters
        ----------
        x : array-like
            Feature matrix; validated together with ``y`` by ``check_X_y``.

        y : array-like of integers
            Integer-typed vector whose minimum element must be 0.

        status : array-like
            Converted to a boolean array; must have as many entries as
            ``x`` has samples.

        time : array-like, optional
            When given, it is validated, stored as ``self.time`` and
            length-checked against ``x`` and ``status``. ``self.time`` is
            not assigned when ``time`` is None.

        Raises
        ------
        AssertionError
            If ``y`` is not of integer type or its minimum is not 0.
        """
        self.x, self.y = check_X_y(x, y)

        # Inspect the validated array rather than the raw argument, so that
        # list-like ``y`` (which has no ``dtype``/``min``) is handled too.
        assert numpy.issubdtype(self.y.dtype, numpy.integer), \
            "y vector must have integer type, but was {0}".format(self.y.dtype)
        assert self.y.min() == 0, "minimum element of y vector must be 0"

        self.status = check_array(status, dtype=bool, ensure_2d=False)
        if time is None:
            check_consistent_length(self.x, self.status)
        else:
            self.time = check_array(time, ensure_2d=False)
            check_consistent_length(self.x, self.status, self.time)

        # Machine epsilon of the feature dtype.
        self.eps = numpy.finfo(self.x.dtype).eps
Пример #44
0
def get_label_n(y, y_pred, n=None):
    """Turn raw outlier scores into binary labels by flagging the scores
    above a percentile threshold derived from the number of outliers.

    Parameters
    ----------
    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    n : int, optional (default=None)
        The number of outliers. If not given, it is inferred as the number
        of non-zero entries in ``y``.

    Returns
    -------
    labels : numpy array of shape (n_samples,)
        binary labels 0: normal points and 1: outliers

    Examples
    --------
    >>> from pyod.utils.utility import get_label_n
    >>> y = [0, 1, 1, 0, 0]
    >>> y_pred = [0.1, 0.5, 0.3, 0.2, 0.7]
    >>> get_label_n(y, y_pred)
    array([0, 1, 0, 0, 1])

    """

    # enforce formats of inputs
    truth = column_or_1d(y)
    scores = column_or_1d(y_pred)
    check_consistent_length(truth, scores)

    # fraction of samples that should be labelled as outliers
    n_outliers = np.count_nonzero(truth) if n is None else n
    outliers_fraction = n_outliers / len(truth)

    # scores strictly above the (1 - fraction) percentile become outliers
    cutoff = scoreatpercentile(scores, 100 * (1 - outliers_fraction))
    return (scores > cutoff).astype('int')
Пример #45
0
    def fit(self, X, y):
        """ Fit the model by delegating to ``ffm.ffm_sgd_fit``.

        Parameters
        ----------
        X : scipy.sparse.csc_matrix, (n_samples, n_features)

        y : float | ndarray, shape = (n_samples, )

        Returns
        -------
        self
            With ``w0_``, ``w_`` and ``V_`` set from the solver's result.

        """

        check_consistent_length(X, y)
        targets = check_array(y, ensure_2d=False, dtype=np.float64)

        # The solver is handed the transposed design matrix.
        design_t = check_array(X.T, accept_sparse="csc", dtype=np.float64)

        fitted = ffm.ffm_sgd_fit(self, design_t, targets)
        self.w0_, self.w_, self.V_ = fitted
        return self
Пример #46
0
def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None):
    """Compute the per-sample hinge loss.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        True labels.

    pred_decision : array-like
        Decision values: one value per sample when ``y_true`` holds at most
        two distinct labels, otherwise one column per class.

    labels : array-like, optional
        All class labels, used to encode ``y_true`` when it holds more than
        two distinct labels. Defaults to the labels found in ``y_true``.

    sample_weight : array-like, optional
        Only length-checked against the other inputs; it is not applied to
        the losses here.

    Returns
    -------
    losses : ndarray, shape (n_samples,)
        ``max(0, 1 - margin)`` for every sample (no averaging).

    Raises
    ------
    ValueError
        If ``labels`` is omitted and the number of distinct labels in
        ``y_true`` differs from the number of columns of ``pred_decision``.
    TypeError
        If the binary margin cannot be computed from ``pred_decision``.
    """
    check_consistent_length(y_true, pred_decision, sample_weight)
    pred_decision = check_array(pred_decision, ensure_2d=False)
    y_true = column_or_1d(y_true)
    classes_seen = np.unique(y_true)

    if classes_seen.size <= 2:
        # Handles binary class case
        # this code assumes that positive and negative labels
        # are encoded as +1 and -1 respectively
        pred_decision = np.ravel(column_or_1d(pred_decision))

        binarizer = LabelBinarizer(neg_label=-1)
        y_true = binarizer.fit_transform(y_true)[:, 0]

        try:
            margin = y_true * pred_decision
        except TypeError:
            raise TypeError("pred_decision should be an array of floats.")
    else:
        if labels is None:
            if (pred_decision.ndim > 1 and
                    classes_seen.size != pred_decision.shape[1]):
                raise ValueError("Please include all labels in y_true "
                                 "or pass labels as third argument")
            labels = classes_seen

        encoder = LabelEncoder()
        encoder.fit(labels)
        y_true = encoder.transform(y_true)

        n_rows = y_true.shape[0]
        # True everywhere except at each sample's true-class column.
        is_other = np.ones_like(pred_decision, dtype=bool)
        is_other[np.arange(n_rows), y_true] = False

        # Margin: true-class score minus the best competing score.
        margin = pred_decision[~is_other]
        margin -= np.max(pred_decision[is_other].reshape(n_rows, -1),
                         axis=1)

    losses = 1 - margin
    # The hinge_loss doesn't penalize good enough predictions.
    losses[losses <= 0] = 0
    return losses
Пример #47
0
def mean_squared_error(y_true, y_pred):
    """
    Root mean squared error, mean squared error, and the standard deviation
    of the squared errors.

    If you need only the mean squared error, use
    :func:`sklearn.metrics.mean_squared_error`

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    rmse : float
        root mean squared error
    mean : float
        mean of squared errors
    stdev : float
        standard deviation of squared errors
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # squared residuals and their NaN-ignoring summary statistics
    squared_errors = (y_true - y_pred) ** 2
    mse = np.nanmean(squared_errors)
    spread = np.nanstd(squared_errors)

    # clamp at zero before taking the root
    return np.sqrt(np.maximum(mse, 0.)), mse, spread
Пример #48
0
    def fit(self, X, y, qids, sample_weight=None, monitor=None, ):
        """Fit lambdamart onto a dataset.

        Unless ``self.warm_start`` is set, any previous state is cleared
        first. When the estimator is already initialized, fitting resumes
        after the existing stages and the per-stage arrays are resized to
        ``n_estimators``.

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array_like, shape = [n_samples]
            Target values (integers in classification, real numbers in
            regression)
            For classification, labels must correspond to classes.
        qids : array_like, shape = [n_samples]
            Query ids for each sample. Samples must be grouped by query such
            that all queries with the same qid appear in one contiguous block.
        sample_weight : optional
            Accepted for signature compatibility; it is not referenced in
            this method's body.
        monitor : callable, optional
            The monitor is called after each iteration with the current
            iteration, a reference to the estimator and the local variables of
            ``_fit_stages`` as keyword arguments ``callable(i, self,
            locals())``. If the callable returns ``True`` the fitting procedure
            is stopped. The monitor can be used for various things such as
            computing held-out estimates, early stopping, model introspecting,
            and snapshotting.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the estimator is already initialized and ``n_estimators`` is
            smaller than the number of existing estimators.

        """

        # A cold start discards whatever was fitted before.
        if not self.warm_start:
            self._clear_state()

        X, y = check_X_y(X, y, dtype=DTYPE)
        n_samples, self.n_features = X.shape

        check_consistent_length(X, y, qids)
        # Object-dtype targets are converted to float64.
        if y.dtype.kind == 'O':
            y = y.astype(np.float64)

        random_state = check_random_state(self.random_state)
        self._check_params()

        if not self._is_initialized():
            # Fresh fit: start at stage 0 with all-zero predictions.
            self._init_state()
            begin_at_stage = 0
            y_pred = np.zeros(y.shape[0])
        else:
            # Continue from the existing estimators.
            if self.n_estimators < self.estimators_.shape[0]:
                raise ValueError('n_estimators=%d must be larger or equal to '
                                 'estimators_.shape[0]=%d when '
                                 'warm_start==True'
                                 % (self.n_estimators,
                                    self.estimators_.shape[0]))
            begin_at_stage = self.estimators_.shape[0]
            self.estimators_fitted_ = begin_at_stage
            # Resize the per-stage arrays to n_estimators entries.
            self.estimators_.resize((self.n_estimators, 1))
            self.train_score_.resize(self.n_estimators)
            if self.query_subsample < 1.0:
                self.oob_improvement_.resize(self.n_estimators)
            # Resume from the predictions of the current model.
            y_pred = self.predict(X)

        n_stages = self._fit_stages(X, y, qids, y_pred,
                                    random_state, begin_at_stage, monitor)

        # Fewer stages than allocated were fitted: trim to n_stages.
        if n_stages < self.estimators_.shape[0]:
            self.trim(n_stages)

        return self
Пример #49
0
def score_predictor_statistics(y_true, y_pred, score_domain=(1, 5, 1)):
    """
    Full statistics of prediction performance

    * n_samples
    * score levels
    * mean absolute error: mean, stdev
    * mean squared error: rmse, mean, stdev
    * true: mean, stdev, histogram, histogram density
    * predicted: mean, stdev, histogram, histogram density

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores
    score_domain : array, shape=(3,)
        Domain of scores, represented by a triple: start, end, and stride
        default=(1, 5, 1).

    Returns
    -------
    stats : dict
        Full statistics of prediction performance
    """

    def _describe(scores):
        # mean/stdev plus the histogram over the score domain
        summary = {'mean': np.mean(scores), 'stdev': np.std(scores)}
        hist, _ = score_histogram(scores, score_domain=score_domain)
        summary['histogram'] = hist
        summary['histogram density'] = (hist / hist.sum())
        return summary

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # dataset size
    stats = {'n_samples': y_true.size}

    # a list of possible score levels: the strided range plus the end point
    levels = np.arange(score_domain[0], score_domain[1], score_domain[2],
                       dtype=float)
    stats['score levels'] = np.hstack([levels, score_domain[1]])

    # mean absolute error (this module's variant, returning mean and stdev)
    mae_mean, mae_stdev = mean_absolute_error(y_true, y_pred)
    stats['mean absolute error'] = {'mean': mae_mean, 'stdev': mae_stdev}

    # root mean squared error
    rmse, mse_mean, mse_stdev = mean_squared_error(y_true, y_pred)
    stats['mean squared error'] = {
        'rmse': rmse, 'mean': mse_mean, 'stdev': mse_stdev}

    # descriptive statistics of ground truth and predicted scores
    stats['true'] = _describe(y_true)
    stats['predicted'] = _describe(y_pred)

    return stats
Пример #50
0
    def fit(self, X, y):
        """
        The Gaussian Process model fitting method.

        Validates the inputs, optionally normalizes them, fits the
        correlation model, builds the regression matrix and then either
        estimates the autocorrelation parameters (when both ``thetaL`` and
        ``thetaU`` are set) or uses ``theta0`` as given.

        Parameters
        ----------
        X : double array_like
            An array with shape (n_samples, n_features) with the input at which
            observations were made.

        y : double array_like
            An array with shape (n_samples, ) or shape (n_samples, n_targets)
            with the observations of the output to be predicted.

        Returns
        -------
        gp : self
            A fitted Gaussian Process model object awaiting data to perform
            predictions.

        Raises
        ------
        Exception
            If the regression matrix does not match ``X`` in its number of
            rows, if the regression model has more columns than there are
            samples, if ``beta0`` does not match the regression matrix, or
            if the posterior function value is infinite.
        """
        # Run input checks
        self._check_params()

        self.random_state = check_random_state(self.random_state)

        # Force data to 2D numpy.array
        X = check_array(X)
        y = np.asarray(y)
        # Remember the original dimensionality of y before reshaping it.
        self.y_ndim_ = y.ndim
        if y.ndim == 1:
            y = y[:, np.newaxis]
        check_consistent_length(X, y)

        # Check shapes of DOE & observations
        n_samples, n_features = X.shape
        _, n_targets = y.shape

        # Run input checks (second pass, this time with the sample count)
        self._check_params(n_samples)

        # Normalize data or don't
        if self.normalize:
            X_mean = np.mean(X, axis=0)
            X_std = np.std(X, axis=0)
            y_mean = np.mean(y, axis=0)
            y_std = np.std(y, axis=0)
            # Constant columns: use 1 to avoid dividing by zero below.
            X_std[X_std == 0.] = 1.
            y_std[y_std == 0.] = 1.
            # center and scale X if necessary
            X = (X - X_mean) / X_std
            y = (y - y_mean) / y_std
        else:
            # Neutral mean/std values when no normalization is applied.
            X_mean = np.zeros(1)
            X_std = np.ones(1)
            y_mean = np.zeros(1)
            y_std = np.ones(1)

        # Fit correlation model
        self.corr.fit(X, self.nugget)

        # Regression matrix and parameters
        F = self.regr(X)
        n_samples_F = F.shape[0]
        # p is the number of columns of F (1 when F is one-dimensional).
        if F.ndim > 1:
            p = F.shape[1]
        else:
            p = 1
        if n_samples_F != n_samples:
            raise Exception("Number of rows in F and X do not match. Most "
                            "likely something is going wrong with the "
                            "regression model.")
        if p > n_samples_F:
            raise Exception(("Ordinary least squares problem is undetermined "
                             "n_samples=%d must be greater than the "
                             "regression model size p=%d.") % (n_samples, p))
        if self.beta0 is not None:
            if self.beta0.shape[0] != p:
                raise Exception("Shapes of beta0 and F do not match.")

        # Set attributes
        self.X = X
        self.y = y
        self.F = F
        self.X_mean, self.X_std = X_mean, X_std
        self.y_mean, self.y_std = y_mean, y_std

        # Determine Gaussian Process model parameters
        if self.thetaL is not None and self.thetaU is not None:
            # Maximum a Posterior estimation of the parameters
            if self.verbose:
                print("Performing Maximum a Posterior estimation of the "
                      "autocorrelation parameters...")
            self.theta_, self.posterior_function_value_, par = \
                self._arg_max_posterior()
            # compute reduced_likelihood_function_value_ for backward
            # compatibility
            self.reduced_likelihood_function_value_, _ = \
                self.reduced_likelihood_function()
            if np.isinf(self.posterior_function_value_):
                raise Exception("Bad parameter region. "
                                "Try increasing upper bound")

        else:
            # Given parameters
            if self.verbose:
                print("Given autocorrelation parameters. "
                      "Computing Gaussian Process model parameters...")
            self.theta_ = self.theta0
            self.reduced_likelihood_function_value_, par = \
                self.reduced_likelihood_function()
            # Posterior value = reduced likelihood + log prior of theta_.
            self.posterior_function_value_ = \
                self.reduced_likelihood_function_value_ \
                + self.corr.log_prior(self.theta_)
            if np.isinf(self.posterior_function_value_):
                raise Exception("Bad point. Try increasing theta0.")

        # Copy the fitted parameters out of the returned dictionary.
        self.beta = par['beta']
        self.gamma = par['gamma']
        self.sigma2 = par['sigma2']
        self.C = par['C']
        self.Ft = par['Ft']
        self.G = par['G']

        if self.storage_mode == 'light':
            # Delete heavy data (it will be computed again if required)
            # (it is required only when MSE is wanted in self.predict)
            if self.verbose:
                print("Light storage mode specified. "
                      "Flushing autocorrelation matrix...")
            self.F = None
            self.C = None
            self.Ft = None
            self.G = None

        return self
def concordance_index_censored(event_indicator, event_time, estimate):
    """Concordance index for right-censored data

    The concordance index is defined as the proportion of all comparable pairs
    in which the predictions and outcomes are concordant.

    Samples are comparable if for at least one of them an event occurred.
    A pair is concordant if the sample that experienced the event first has
    the strictly larger estimated risk, i.e. a higher risk must go together
    with a shorter time to event.
    If an event occurred for one sample and the other is known to be
    event-free at least until the time of event of the first, the second
    sample is assumed to *outlive* the first.
    When predicted risks are identical for a pair, 0.5 rather than 1 is added
    to the count of concordant pairs.
    A pair is not comparable if an event occurred for both of them at the same
    time or an event occurred for one of them but the time of censoring is
    smaller than the time of event of the first one.

    Parameters
    ----------
    event_indicator : array-like, shape = [n_samples,]
        Boolean array denotes whether an event occurred

    event_time : array-like, shape = [n_samples,]
        Array containing the time of an event or time of censoring

    estimate : array-like, shape = [n_samples,]
        Estimated risk of experiencing an event

    Returns
    -------
    cindex : float
        Concordance index

    concordant : int
        Number of concordant pairs

    discordant : int
        Number of discordant pairs

    tied_risk : int
        Number of pairs having tied estimated risks

    tied_time : int
        Number of pairs having an event at the same time

    Raises
    ------
    ValueError
        If `event_indicator` is not boolean, if fewer than two samples are
        given, if all samples are censored, or if the data contains no
        comparable pair (the index is undefined in that case).

    References
    ----------
    .. [1] Harrell, F.E., Califf, R.M., Pryor, D.B., Lee, K.L., Rosati, R.A,
           "Multivariable prognostic models: issues in developing models,
           evaluating assumptions and adequacy, and measuring and reducing errors",
           Statistics in Medicine, 15(4), 361-87, 1996.
    """
    check_consistent_length(event_indicator, event_time, estimate)
    event_indicator = check_array(event_indicator, ensure_2d=False)
    event_time = check_array(event_time, ensure_2d=False)
    estimate = check_array(estimate, ensure_2d=False)

    if not numpy.issubdtype(event_indicator.dtype, numpy.bool_):
        raise ValueError(
            'only boolean arrays are supported as class labels for survival analysis, got {0}'.format(
                event_indicator.dtype))

    n_samples = len(event_time)
    if n_samples < 2:
        raise ValueError("Need a minimum of two samples")

    if not event_indicator.any():
        raise ValueError("All samples are censored")

    # Positions below refer to the samples sorted by ascending time.
    order = numpy.argsort(event_time)

    concordant = 0
    discordant = 0
    tied_risk = 0
    tied_time = 0
    for i in range(n_samples - 1):
        inext = i + 1
        # [inext, j) is the run of later-sorted samples sharing sample i's time.
        j = inext
        time_i = event_time[order[i]]
        while j < n_samples and event_time[order[j]] == time_i:
            j += 1

        event_i = event_indicator[order[i]]
        if event_i:
            # Every later sample is comparable to an event ...
            mask = numpy.zeros(n_samples, dtype=bool)
            mask[inext:] = True
            if j - i > 1:
                # ... except coinciding events, which only count as time ties.
                event_at_same_time = event_indicator[order[inext:j]]
                mask[inext:j] = numpy.logical_not(event_at_same_time)
                tied_time += event_at_same_time.sum()
        elif j - i > 1:
            # A censored sample is only comparable to events at the same time.
            mask = numpy.zeros(n_samples, dtype=bool)
            mask[inext:j] = event_indicator[order[inext:j]]
        else:
            # Censored sample with a unique time: nothing to compare against.
            continue

        # Score the pairs right away instead of storing one mask per sample,
        # which would need O(n_samples ** 2) memory.
        est_i = estimate[order[i]]
        est = estimate[order[mask]]

        if event_i:
            # an event should have a higher score
            con = (est < est_i).sum()
        else:
            # a non-event should have a lower score
            con = (est > est_i).sum()
        concordant += con

        tie = (est == est_i).sum()
        tied_risk += tie

        discordant += est.size - con - tie

    n_comparable = concordant + discordant + tied_risk
    if n_comparable == 0:
        # Without this guard the division below fails with ZeroDivisionError
        # or silently yields nan, depending on the integer types involved.
        raise ValueError(
            "Data has no comparable pairs, cannot estimate concordance index.")

    cindex = (concordant + 0.5 * tied_risk) / n_comparable
    return cindex, concordant, discordant, tied_risk, tied_time
    def fit(self, X1, y1, X2, y2, left_right_bounds=None):
        """Fit estimator using RANSAC algorithm.

        Several candidate (left, right) lane fits are produced and the pair
        with the highest summed score is kept:

        1. left lane first, then the right lane constrained by the left fit;
        2. right lane first, then the left lane constrained by the right fit;
        3. the first reference fits ``self.w_refs_left[0]`` and
           ``self.w_refs_right[0]``, when both reference sets are non-empty.

        If ``self.l2_scales`` is not None every candidate is additionally
        refined with ``m_regression_exp`` and re-scored. The selected pair is
        finally blended with the first reference fits using
        ``self.smoothing``, when references are available.

        Parameters
        ----------
        X1, y1 : arrays
            Left lane points (supposedly).
        X2, y2 : arrays
            Right lane points (supposedly).
        left_right_bounds : array
            Validity bounds linking the two lanes. It is required despite the
            ``None`` default; it is indexed as ``[0, 0, 0]`` and
            ``[0, 0, 1]`` and the mean of these two values is used as the
            left-to-right offset passed to ``lane_translate``.

        Returns
        -------
        self : object
            With ``w1_``, ``w2_``, ``estimator1_``, ``estimator2_``,
            ``inlier_mask1_`` and ``inlier_mask2_`` set. The per-candidate
            lists ``w_fits``, ``w_fits_l2``, ``n_inliers``, ``inliers_masks``
            and ``score_fits`` are also stored.

        Raises
        ------
        ValueError
            If there are too few samples, if ``stop_probability`` is outside
            [0, 1], or if ``left_right_bounds`` is None.
        """
        check_consistent_length(X1, y1)
        check_consistent_length(X2, y2)

        # Assume linear model by default
        min_samples = X1.shape[1] + 1
        if min_samples > X1.shape[0] or min_samples > X2.shape[0]:
            raise ValueError("`min_samples` may not be larger than number "
                             "of samples ``X1-2.shape[0]``.")

        # Check additional parameters...
        if self.stop_probability < 0 or self.stop_probability > 1:
            raise ValueError("`stop_probability` must be in range [0, 1].")
        if left_right_bounds is None:
            # The bounds are indexed and stacked below; fail with a clear
            # message rather than an opaque error on None.
            raise ValueError("`left_right_bounds` must be provided.")
        if self.residual_threshold is None:
            # Default threshold: median absolute deviation of the targets,
            # pooled over both lanes since one threshold serves both.
            y_all = np.concatenate((np.ravel(y1), np.ravel(y2)))
            residual_threshold = np.median(np.abs(y_all - np.median(y_all)))
        else:
            residual_threshold = self.residual_threshold
        delta_left_right = (left_right_bounds[0, 0, 1] + left_right_bounds[0, 0, 0]) / 2.

        # Set up lambdas for computing score: the first coefficient is
        # normalised by the total number of target values.
        score_lambdas = np.copy(self.score_lambdas)
        score_lambdas[0] = score_lambdas[0] / (y1.size + y2.size)

        # Collections, one entry per candidate (left, right) pair.
        self.w_fits = []
        self.w_fits_l2 = []
        self.inliers_masks = []
        self.n_inliers = []
        self.score_fits = []

        # === Left lane, and then, right lane === #
        w_left_prefits = lanes_ransac_prefit(X1, y1,
                                             self.n_prefits,
                                             self.max_trials,
                                             self.w_refs_left,
                                             self.is_valid_bounds_left)
        (w_left1, in_mask_left1, score_left1) = \
            lanes_ransac_select_best(X1, y1,
                                     w_left_prefits, residual_threshold,
                                     self.w_refs_left, score_lambdas)
        n_inliers_left1 = np.sum(in_mask_left1)

        # The fresh left fit becomes an extra reference for the right lane.
        w_refs = np.vstack((self.w_refs_right, np.reshape(w_left1, (1, 3))))
        is_valid_bounds = np.vstack((self.is_valid_bounds_right, left_right_bounds))
        w_right_prefits = lanes_ransac_prefit(X2, y2,
                                              self.n_prefits,
                                              self.max_trials,
                                              w_refs,
                                              is_valid_bounds)
        # Also consider the left fit translated by the expected offset.
        w0 = lane_translate(w_left1, delta_left_right)
        w_right_prefits = np.vstack((w0, w_right_prefits))

        (w_right1, in_mask_right1, score_right1) = \
            lanes_ransac_select_best(X2, y2,
                                     w_right_prefits, residual_threshold,
                                     self.w_refs_right, score_lambdas)
        n_inliers_right1 = np.sum(in_mask_right1)
        n_inliers1 = n_inliers_right1 + n_inliers_left1

        self.w_fits.append((w_left1, w_right1))
        self.n_inliers.append(n_inliers1)
        self.inliers_masks.append((in_mask_left1, in_mask_right1))
        self.score_fits.append((score_left1, score_right1))

        # === Right lane and then left lane === #
        w_right_prefits = lanes_ransac_prefit(X2, y2,
                                              self.n_prefits,
                                              self.max_trials,
                                              self.w_refs_right,
                                              self.is_valid_bounds_right)
        (w_right2, in_mask_right2, score_right2) = \
            lanes_ransac_select_best(X2, y2,
                                     w_right_prefits, residual_threshold,
                                     self.w_refs_right, score_lambdas)
        n_inliers_right2 = np.sum(in_mask_right2)

        # Mirror of the previous stage: right fit constrains the left lane.
        w_refs = np.vstack((self.w_refs_left, np.reshape(w_right2, (1, 3))))
        is_valid_bounds = np.vstack((self.is_valid_bounds_left, left_right_bounds))
        w_left_prefits = lanes_ransac_prefit(X1, y1,
                                             self.n_prefits,
                                             self.max_trials,
                                             w_refs,
                                             is_valid_bounds)
        w0 = lane_translate(w_right2, -delta_left_right)
        w_left_prefits = np.vstack((w0, w_left_prefits))

        (w_left2, in_mask_left2, score_left2) = \
            lanes_ransac_select_best(X1, y1,
                                     w_left_prefits, residual_threshold,
                                     self.w_refs_left, score_lambdas)
        n_inliers_left2 = np.sum(in_mask_left2)
        n_inliers2 = n_inliers_right2 + n_inliers_left2

        self.w_fits.append((w_left2, w_right2))
        self.n_inliers.append(n_inliers2)
        self.inliers_masks.append((in_mask_left2, in_mask_right2))
        self.score_fits.append((score_left2, score_right2))

        # === First reference fits as a third candidate === #
        # NOTE(review): presumably the fits of the previous frame - confirm.
        if self.w_refs_left.size > 0 and self.w_refs_right.size > 0:
            in_mask_left3 = lanes_inliers(X1, y1, self.w_refs_left[0], residual_threshold)
            in_mask_right3 = lanes_inliers(X2, y2, self.w_refs_right[0], residual_threshold)
            n_inliers3 = np.sum(in_mask_left3) + np.sum(in_mask_right3)
            score_left3 = lane_score(np.sum(in_mask_left3),
                                     self.w_refs_left[0],
                                     self.w_refs_left,
                                     score_lambdas)
            score_right3 = lane_score(np.sum(in_mask_right3),
                                      self.w_refs_right[0],
                                      self.w_refs_right,
                                      score_lambdas)

            self.w_fits.append((self.w_refs_left[0], self.w_refs_right[0]))
            self.n_inliers.append(n_inliers3)
            self.inliers_masks.append((in_mask_left3, in_mask_right3))
            self.score_fits.append((score_left3, score_right3))

        # L2 regression regularisation of fits. `w_fits` keeps the raw
        # candidates; the refined ones go to `w_fits_l2`.
        self.w_fits_l2 = copy.deepcopy(self.w_fits)
        if self.l2_scales is not None:
            for i, (w1, w2) in enumerate(self.w_fits):
                # Best effort: when the regression fails (e.g. matrix
                # inversion error) the unrefined fit is kept.
                try:
                    w_left = m_regression_exp(X1, y1, w1, self.l2_scales)
                except Exception:
                    w_left = w1
                try:
                    w_right = m_regression_exp(X2, y2, w2, self.l2_scales)
                except Exception:
                    w_right = w2

                in_mask_left = lanes_inliers(X1, y1, w_left, residual_threshold)
                in_mask_right = lanes_inliers(X2, y2, w_right, residual_threshold)
                n_inliers = np.sum(in_mask_left) + np.sum(in_mask_right)
                score_left = lane_score(np.sum(in_mask_left),
                                        w_left,
                                        self.w_refs_left,
                                        score_lambdas)
                score_right = lane_score(np.sum(in_mask_right),
                                         w_right,
                                         self.w_refs_right,
                                         score_lambdas)

                self.w_fits_l2[i] = (w_left, w_right)
                self.n_inliers[i] = n_inliers
                self.inliers_masks[i] = (in_mask_left, in_mask_right)
                self.score_fits[i] = (score_left, score_right)

        # Best fit: candidate with the highest summed left + right score.
        scores = [s1 + s2 for (s1, s2) in self.score_fits]
        idx = np.argmax(scores)
        w_left, w_right = self.w_fits_l2[idx]
        in_mask_left, in_mask_right = self.inliers_masks[idx]

        # Smoothing: blend the selected fit with the first reference fits.
        smoothing = self.smoothing
        if self.w_refs_left.size > 0 and self.w_refs_right.size > 0:
            w_left = smoothing * w_left + (1. - smoothing) * self.w_refs_left[0]
            w_right = smoothing * w_right + (1. - smoothing) * self.w_refs_right[0]

        self.w1_ = w_left
        self.w2_ = w_right

        # Set regression parameters directly; the estimators are not fitted.
        base_estimator1 = LinearRegression(fit_intercept=False)
        base_estimator1.coef_ = w_left
        base_estimator1.intercept_ = 0.0
        base_estimator2 = LinearRegression(fit_intercept=False)
        base_estimator2.coef_ = w_right
        base_estimator2.intercept_ = 0.0

        # Save final model parameters.
        self.estimator1_ = base_estimator1
        self.estimator2_ = base_estimator2

        self.inlier_mask1_ = in_mask_left
        self.inlier_mask2_ = in_mask_right

        return self
    def fit(self, X1, y1, X2, y2):
        """Fit estimator using RANSAC algorithm.

        Namely, the fit is done into two main steps:
        - pre-fitting: quickly select n_prefits configurations which seem
        suitable given topological constraints.
        - finding best fit: select the best pre-fit using the full dataset.

        Parameters
        ----------
        X1, y1 : arrays
            Left lane points (supposedly).
        X2, y2 : arrays
            Right lane points (supposedly).

        Returns
        -------
        self : object
            With ``w1_``, ``w2_``, ``estimator1_``, ``estimator2_``,
            ``inlier_mask1_`` and ``inlier_mask2_`` set.

        Raises
        ------
        ValueError
            If there are too few samples or if ``stop_probability`` is
            outside [0, 1].
        """
        check_consistent_length(X1, y1)
        check_consistent_length(X2, y2)

        # Assume linear model by default
        min_samples = X1.shape[1] + 1
        if min_samples > X1.shape[0] or min_samples > X2.shape[0]:
            raise ValueError("`min_samples` may not be larger than number "
                             "of samples ``X1-2.shape[0]``.")

        # Check additional parameters...
        if self.stop_probability < 0 or self.stop_probability > 1:
            raise ValueError("`stop_probability` must be in range [0, 1].")
        if self.residual_threshold is None:
            # Default threshold: median absolute deviation of the targets,
            # pooled over both lanes since one threshold serves both.
            y_all = np.concatenate((np.ravel(y1), np.ravel(y2)))
            residual_threshold = np.median(np.abs(y_all - np.median(y_all)))
        else:
            residual_threshold = self.residual_threshold

        # === Pre-fit with small subsets (4 points) === #
        # Allows to quickly pre-select some good configurations.
        w1_prefits, w2_prefits = lanes_ransac_prefit(X1, y1, X2, y2,
                                                     self.n_prefits,
                                                     self.max_trials,
                                                     self.is_valid_diffs,
                                                     self.is_valid_bounds)

        # === Select best pre-fit, using the full dataset === #
        # NOTE(review): 0 presumably disables a post-fit step - confirm
        # against lanes_ransac_select_best.
        post_fit = 0
        (w1,
         w2,
         inlier_mask1,
         inlier_mask2) = lanes_ransac_select_best(X1, y1, X2, y2,
                                                  w1_prefits, w2_prefits,
                                                  residual_threshold,
                                                  post_fit)
        self.w1_ = w1
        self.w2_ = w2

        # Set regression parameters directly; the estimators are not fitted.
        base_estimator1 = LinearRegression(fit_intercept=False)
        base_estimator1.coef_ = w1
        base_estimator1.intercept_ = 0.0
        base_estimator2 = LinearRegression(fit_intercept=False)
        base_estimator2.coef_ = w2
        base_estimator2.intercept_ = 0.0

        # Save final model parameters.
        self.estimator1_ = base_estimator1
        self.estimator2_ = base_estimator2

        self.inlier_mask1_ = inlier_mask1
        self.inlier_mask2_ = inlier_mask2

        return self