Example #1
def test__check_reg_targets_exception():
    invalid_multioutput = 'this_value_is_not_valid'
    expected_message = ("Allowed 'multioutput' string values are.+"
                        "You provided multioutput={!r}".format(
                            invalid_multioutput))
    with pytest.raises(ValueError, match=expected_message):
        _check_reg_targets([1, 2, 3], [[1], [2], [3]], invalid_multioutput)
Example #2
def test__check_reg_targets():
    # All of length 3
    EXAMPLES = [
        ("continuous", [1, 2, 3], 1),
        ("continuous", [[1], [2], [3]], 1),
        ("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2),
        ("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2),
        ("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3),
    ]

    for (type1, y1, n_out1), (type2, y2, n_out2) in product(EXAMPLES,
                                                            repeat=2):

        if type1 == type2 and n_out1 == n_out2:
            y_type, y_check1, y_check2, multioutput = _check_reg_targets(
                y1, y2, None)
            assert type1 == y_type
            if type1 == 'continuous':
                assert_array_equal(y_check1, np.reshape(y1, (-1, 1)))
                assert_array_equal(y_check2, np.reshape(y2, (-1, 1)))
            else:
                assert_array_equal(y_check1, y1)
                assert_array_equal(y_check2, y2)
        else:
            with pytest.raises(ValueError):
                _check_reg_targets(y1, y2, None)
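For reference, a minimal interactive sketch of what the test above exercises; it assumes the private helper can be imported from sklearn.metrics._regression, which may move between scikit-learn versions.

import numpy as np
from sklearn.metrics._regression import _check_reg_targets

# 1-D targets are promoted to column vectors and typed as "continuous".
y_type, y1, y2, multioutput = _check_reg_targets([1, 2, 3], [[1], [2], [3]], None)
print(y_type)                   # continuous
print(y1.shape)                 # (3, 1), i.e. np.reshape([1, 2, 3], (-1, 1))
print(np.array_equal(y1, y2))   # True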
Example #3
def mean_absolute_percentage_error(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
    multioutput: str = "uniform_average",
) -> float:
    """
    Return the MAPE (Mean Absolute Percentage Error) of a prediction.

    The formula is np.mean(np.abs((y_true - y_pred) / y_true)).

    Parameters
    ----------
    y_true : np.ndarray
        Observed values.

    y_pred : np.ndarray
        Predicted values.

    sample_weight : Optional[np.ndarray], default=None
        Individual weights for each sample.

    multioutput : {"raw_values", "uniform_average"} or array-like
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        If input is list then the shape must be (n_outputs,).

        - "raw_values": Returns a full set of errors in case of multioutput input.
        - "uniform_average": Errors of all outputs are averaged with uniform weight.

    Returns
    -------
    float
        MAPE value of the input.

    Examples
    --------
    >>> import numpy as np
    >>> y_true = np.array([1, 2, 4])
    >>> y_pred = np.array([1, 1, 2])
    >>> mean_absolute_percentage_error(y_true, y_pred)
    0.3333333333333333
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)
    eps = np.finfo(np.float64).eps

    # axis=0 keeps one error per output so the multioutput handling below works
    output_errors = np.average(
        np.abs((y_true - y_pred) / np.maximum(y_true, eps)),
        weights=sample_weight, axis=0)

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
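As a sanity check on the docstring example above, the same value follows from plain numpy arithmetic (nothing beyond numpy is assumed):

import numpy as np

y_true = np.array([1, 2, 4])
y_pred = np.array([1, 1, 2])
# Per-sample terms: |1-1|/1 = 0, |2-1|/2 = 0.5, |4-2|/4 = 0.5 -> mean = 1/3
print(np.mean(np.abs((y_true - y_pred) / y_true)))  # 0.3333333333333333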
Example #4
def group_mean_log_mae(
    y_true: Union[pd.DataFrame, pd.Series, Sequence, np.ndarray],
    y_pred: Union[pd.DataFrame, pd.Series, Sequence, np.ndarray],
    groups: Union[Sequence, np.ndarray, pd.Series],
    floor: float = 1e-9,
) -> float:
    """Calculates the Group Mean Log Mean Absolute Error. Used in a Kaggle competition.

    Parameters
    ----------
    y_true: list or array-like
        The true (expected) values of our problem.
    y_pred: list or array-like
        The predicted values of our problem.
    groups: list or array-like
        The labels our data is grouped by.
    floor: float, default=1e-9
        The minimum value any per-group MAE is clipped to before taking the log (since log(0) is undefined).

    Returns
    -------
    Our Group Mean Log MAE score
    """
    y_problem, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")
    y_true = pd.Series([i[0] for i in y_true])
    y_pred = pd.Series([i[0] for i in y_pred])
    maes = (y_true - y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()
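A hypothetical usage sketch for the function above (the groups and values are made up; it assumes the function and its imports are in scope):

import pandas as pd

y_true = pd.Series([1.0, 2.0, 3.0, 4.0])
y_pred = pd.Series([1.5, 2.0, 2.0, 5.0])
groups = pd.Series(["a", "a", "b", "b"])

# Per-group MAEs: "a" -> (0.5 + 0.0) / 2 = 0.25, "b" -> (1.0 + 1.0) / 2 = 1.0
# Score: mean of log(0.25) and log(1.0) ≈ -0.693
print(group_mean_log_mae(y_true, y_pred, groups))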
Example #5
    def load_data(self, y_true: 'DataFrame', y_pred: 'DataFrame', *args,
                  **kwargs):
        """TODO: Rename class to clarify that it receives DataFrames and casts them to numpy arrays.

        Args:
            y_true (DataFrame): ground truth
            y_pred (DataFrame): predicted probabilities for the positive class
        """
        y_type, y_true, y_pred, multioutput = _check_reg_targets(
            y_true, y_pred, multioutput='uniform_average')
        self.y_true = y_true
        self.y_pred = y_pred > self.probability_threshold
Example #6
def _symmetric_mean_absolute_percentage_error(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
    multioutput: str = "uniform_average",
):
    r"""Symmetric mean absolute percentage error regression loss (SMAPE_):

    .. math:: \text{SMAPE} = \frac{2}{n}\sum_{i=1}^{n} \frac{|y_i - \hat{y}_i|}{\max(|y_i| + |\hat{y}_i|, \epsilon)}

    Where :math:`y` is a tensor of target values, and :math:`\hat{y}` is a tensor of predictions.

    Args:
        y_true: array-like of shape (n_samples,) or (n_samples, n_outputs)
            Ground truth (correct) target values.
        y_pred: array-like of shape (n_samples,) or (n_samples, n_outputs)
            Estimated target values.
        sample_weight: array-like of shape (n_samples,), default=None
            Sample weights.
        multioutput: {'raw_values', 'uniform_average'} or array-like
            Defines aggregating of multiple output values.
            Array-like value defines weights used to average errors.
            If input is list then the shape must be (n_outputs,).

                - 'raw_values': Returns a full set of errors in case of multioutput input.
                - 'uniform_average': Errors of all outputs are averaged with uniform weight.

    Returns:
        loss: float or ndarray of floats in the range [0, 2]
            If multioutput is 'raw_values', then symmetric mean absolute percentage error
            is returned for each output separately.
            If multioutput is 'uniform_average' or an ndarray of weights, then the
            weighted average of all output errors is returned.
            SMAPE output is non-negative floating point. The best value is 0.0.
            Bad predictions can still produce errors up to 2.0, and the epsilon in the
            denominator avoids division by zero when both y_true and y_pred are zero.
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)
    epsilon = np.finfo(np.float64).eps
    smape = 2 * np.abs(y_pred - y_true) / np.maximum(
        np.abs(y_true) + np.abs(y_pred), epsilon)
    output_errors = np.average(smape, weights=sample_weight, axis=0)
    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        # pass None as weights to np.average: uniform mean
        multioutput = None

    return np.average(output_errors, weights=multioutput)
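A small worked check of the SMAPE formula above on tiny made-up data (plain numpy):

import numpy as np

y_true = np.array([1.0, 2.0, 4.0])
y_pred = np.array([1.0, 1.0, 2.0])
# Per-sample terms: 2*0/(1+1) = 0, 2*1/(2+1) ≈ 0.667, 2*2/(4+2) ≈ 0.667 -> mean ≈ 0.444
print(np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))))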
Example #7
def mean_absolute_percentage_error(y_true,
                                   y_pred,
                                   sample_weight=None,
                                   multioutput='uniform_average'):
    """Mean absolute percentage error regression loss.
    Note here that we do not represent the output as a percentage in range
    [0, 100]. Instead, we represent it in range [0, 1/eps]. Read more in the
    :ref:`User Guide <mean_absolute_percentage_error>`.
    .. versionadded:: 0.24
    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.
    multioutput : {'raw_values', 'uniform_average'} or array-like
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        If input is list then the shape must be (n_outputs,).
        'raw_values' :
            Returns a full set of errors in case of multioutput input.
        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.
    Returns
    -------
    loss : float or ndarray of floats in the range [0, 1/eps]
        If multioutput is 'raw_values', then mean absolute percentage error
        is returned for each output separately.
        If multioutput is 'uniform_average' or an ndarray of weights, then the
        weighted average of all output errors is returned.
        MAPE output is non-negative floating point. The best value is 0.0.
        But note the fact that bad predictions can lead to arbitarily large
        MAPE values, especially if some y_true values are very close to zero.
        Note that we return a large value instead of `inf` when y_true is zero.
    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)
    epsilon = np.finfo(np.float64).eps
    mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon)
    output_errors = np.average(mape, weights=sample_weight, axis=0)
    if isinstance(multioutput, str):
        if multioutput == 'raw_values':
            return output_errors
        elif multioutput == 'uniform_average':
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
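The snippet above mirrors the metric that ships publicly in scikit-learn >= 0.24, so the same behaviour can be exercised through the public API (the data below is a small illustrative example):

from sklearn.metrics import mean_absolute_percentage_error

y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
# (0.5/3 + 0.5/0.5 + 0/2 + 1/7) / 4 ≈ 0.327
print(mean_absolute_percentage_error(y_true, y_pred))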
Example #8
def adjusted_explained_variance_score(
    y_true: Union[Sequence[float], np.ndarray, pd.Series],
    y_pred: Union[Sequence[float], np.ndarray, pd.Series],
    features_vector: Optional[Union[Sequence[str], np.ndarray,
                                    pd.Series]] = None,
    num_features: Optional[int] = None,
) -> float:
    """Calculates an adjusted explained_variance_score that penalizes models that use too many superfluous features

    Parameters
    ----------
    y_true: list or array like
        The true, or the expected, values of our problem
    y_pred: list or array like
        The predicted values of our problem
    features_vector: list or array like, default=None
        A list of all features used for our model.
    num_features: int, default=None
        The number of features used for our model. Used if features_vector is None

    Returns
    -------
    Our adjusted explained_variance_score.
    """
    y_problem, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")
    if features_vector is not None:
        if len(features_vector) >= len(y_true) - 1:
            raise Exception(
                "Number of features must be smaller than the number of rows minus 1 degree of freedom"
            )
        if len(features_vector) < 1:
            raise Exception("Cannot have fewer than one feature")
        p = len(features_vector)
        n = len(y_true)
    elif num_features is not None:
        if num_features >= len(y_true) - 1:
            raise Exception(
                "Number of features must be smaller than the number of rows minus 1 degree of freedom"
            )
        if num_features < 1:
            raise Exception("Cannot have fewer than one feature")
        p = num_features
        n = len(y_true)
    else:
        raise Exception("No features available to calculate adjusted score")
    evs = (explained_variance_score(y_true, y_pred)
           if explained_variance_score(y_true, y_pred) > 0 else 0)
    return 1 - (1 - evs) * (n - 1) / (n - p - 1)
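A short worked example of the adjustment applied above; n, p and the raw score are made-up numbers, purely to illustrate the penalty:

# With n = 100 rows, p = 5 features and a raw explained variance of 0.80:
n, p, evs = 100, 5, 0.80
adjusted = 1 - (1 - evs) * (n - 1) / (n - p - 1)
print(adjusted)  # 1 - 0.20 * 99 / 94 ≈ 0.789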
Example #9
        def mean_absolute_percentage_error(
            y_true, y_pred, sample_weight=None, multioutput="uniform_average"
        ):
            y_type, y_true, y_pred, multioutput = _check_reg_targets(
                y_true, y_pred, multioutput
            )
            check_consistent_length(y_true, y_pred, sample_weight)
            mask = y_true != 0
            y_true = y_true[mask]
            y_pred = y_pred[mask]
            mape = np.abs(y_pred - y_true) / np.abs(y_true)
            output_errors = np.average(mape, weights=sample_weight, axis=0)
            if isinstance(multioutput, str):
                if multioutput == "raw_values":
                    return output_errors
                elif multioutput == "uniform_average":
                    # pass None as weights to np.average: uniform mean
                    multioutput = None

            return np.average(output_errors, weights=multioutput)
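Note that this variant simply drops samples where y_true is exactly zero before computing the ratio; a minimal sketch of that masking step with made-up data (plain numpy):

import numpy as np

y_true = np.array([0.0, 2.0, 4.0])
y_pred = np.array([1.0, 1.0, 2.0])
mask = y_true != 0
# Only the last two samples survive: |1-2|/2 = 0.5 and |2-4|/4 = 0.5 -> mean = 0.5
print(np.mean(np.abs(y_pred[mask] - y_true[mask]) / np.abs(y_true[mask])))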
Example #10
def mean_max10_error(y_true,
                     y_pred,
                     sample_weight=None,
                     multioutput='uniform_average'):
    '''
    Computes the mean of the 10 largest absolute errors.
    '''
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    max10 = heapq.nlargest(10, np.abs(y_true - y_pred))
    output_errors = np.average(max10, weights=sample_weight, axis=0)

    if isinstance(multioutput, str):
        if multioutput == 'raw_values':
            return output_errors
        elif multioutput == 'uniform_average':
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
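The docstring (translated above) describes the mean of the 10 largest absolute errors; a minimal sketch of that idea on made-up 1-D data, using heapq directly:

import heapq
import numpy as np

errors = np.abs(np.arange(15.0))      # absolute errors 0, 1, ..., 14
top10 = heapq.nlargest(10, errors)    # the 10 largest: 14, 13, ..., 5
print(np.mean(top10))                 # (5 + 6 + ... + 14) / 10 = 9.5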
Example #11
def root_mean_squared_error(
    y_true: Union[Sequence[float], np.ndarray, pd.Series],
    y_pred: Union[Sequence[float], np.ndarray, pd.Series],
) -> float:
    """Calculates the Root Mean Squared Error for regression problems.

    Parameters
    ----------
    y_true: list or array like
        The true, or the expected, values of our problem
    y_pred: list or array like
        The predicted values of our problem

    Returns
    -------
    Our RMSE score
    """
    y_problem, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")
    n = len(y_true)
    return math.sqrt(np.sum((y_true - y_pred)**2) / n)
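A quick numeric cross-check of the function above on tiny made-up data; depending on the scikit-learn version, the same quantity is also exposed as mean_squared_error(..., squared=False) or sklearn.metrics.root_mean_squared_error:

import numpy as np

y_true = np.array([1.0, 2.0, 4.0])
y_pred = np.array([1.0, 1.0, 2.0])
# sqrt((0**2 + 1**2 + 2**2) / 3) = sqrt(5/3) ≈ 1.291
print(np.sqrt(np.mean((y_true - y_pred) ** 2)))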
Example #12
def standard_absolute_error(y_true,
                            y_pred,
                            sample_weight=None,
                            multioutput='uniform_average'):
    '''
    Standard deviation of the absolute error.
    '''
    #return np.std(np.abs(y_true - y_pred))

    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    std_errors = np.nanstd(np.abs(y_pred - y_true), axis=0)
    output_errors = np.average(std_errors, weights=sample_weight, axis=0)

    if isinstance(multioutput, str):
        if multioutput == 'raw_values':
            return output_errors
        elif multioutput == 'uniform_average':
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
Example #13
def mape_score(
    y_true: Union[Sequence[float], np.ndarray, pd.Series],
    y_pred: Union[Sequence[float], np.ndarray, pd.Series],
) -> float:
    """Calculates the Mean Absolute Percentage Error, a common metric used for Time Series Problems

    Parameters
    ----------
    y_true: list or array like
        The true, or the expected, values of our problem
    y_pred: list or array like
        The predicted values of our problem

    Returns
    -------
    Our MAPE score
    """
    y_problem, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")
    if 0 in y_true:
        raise Exception("Cannot divide by zero")
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
Example #14
def smape_score(
    y_true: Union[Sequence[float], np.ndarray, pd.Series],
    y_pred: Union[Sequence[float], np.ndarray, pd.Series],
) -> float:
    """Calculates the Symmetric Mean Absolute Percentage Error. Used when there are zeros in our y_true that would cause
    MAPE to be undefined.

    Parameters
    ----------
    y_true: list or array like
        The true, or the expected, values of our problem
    y_pred: list or array like
        The predicted values of our problem

    Returns
    -------
    Our SMAPE score
    """
    y_problem, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")
    error = np.abs(y_true - y_pred)
    total = np.abs(y_true) + np.abs(y_pred)
    return 100 * np.sum(error / total) / len(error)
Example #15
def mean_directional_accuracy(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
    multioutput: str = "uniform_average",
) -> float:
    """
    Return the MDA (Mean Directional Accuracy) of a prediction.

    This is the mean of the vector 1_{sgn(y_true - y_true_lag_1) = sgn(y_pred - y_true_lag_1)}.
    In plain words, it computes how often the model got the direction of the time series movement right.

    Parameters
    ----------
    y_true : np.ndarray
        Observed values.

    y_pred : np.ndarray
        Predicted values.

    sample_weight : Optional[np.ndarray], default=None
        Individual weights for each sample. The first entry is ignored since the MDA loss term consists
        of len(y_true) - 1 summands.

    multioutput : {"raw_values", "uniform_average"} or array-like
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        If input is list then the shape must be (n_outputs,).

        - "raw_values": Returns a full set of errors in case of multioutput input.
        - "uniform_average": Errors of all outputs are averaged with uniform weight.

    Returns
    -------
    float
        MDA value of the input.

    Examples
    --------
    >>> import numpy as np
    >>> y_true = np.array([1, 2, 4])
    >>> y_pred = np.array([1, 1, 3])
    >>> mean_directional_accuracy(y_true, y_pred)
    0.5
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)

    output_errors = np.average(
        np.sign(y_true[1:] - y_true[:-1]) == np.sign(y_pred[1:] - y_true[:-1]),
        weights=sample_weight[1:] if sample_weight is not None else None,
    )

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
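A worked check of the docstring example above (plain numpy): the prediction matches the direction of the second movement but not the first.

import numpy as np

y_true = np.array([1, 2, 4])
y_pred = np.array([1, 1, 3])
true_dir = np.sign(y_true[1:] - y_true[:-1])   # [1, 1]
pred_dir = np.sign(y_pred[1:] - y_true[:-1])   # [0, 1]
print(np.mean(true_dir == pred_dir))           # 0.5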
Example #16
def mean_log_quotient(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
    multioutput: str = "uniform_average",
) -> float:
    """
    Return the MLQ (Mean Log Quotient) of a prediction.

    This is np.mean(np.log(y_pred / y_true)**2).

    Parameters
    ----------
    y_true : np.ndarray (non-negative numbers)
        Observed values.

    y_pred : np.ndarray
        Predicted values.

    sample_weight : Optional[np.ndarray], default=None
        Individual weights for each sample.

    multioutput : {"raw_values", "uniform_average"} or array-like
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        If input is list then the shape must be (n_outputs,).

        - "raw_values": Returns a full set of errors in case of multioutput input.
        - "uniform_average": Errors of all outputs are averaged with uniform weight.

    Returns
    -------
    float
        MLQ value of the input.

    Examples
    --------
    >>> import numpy as np
    >>> y_true = np.array([1, 2, 4])
    >>> y_pred = np.array([1, 1, 3])
    >>> mean_log_quotient(y_true, y_pred)
    0.1877379962427844

    Notes
    -----
    See Tofallis, C (2015) "A Better Measure of Relative Prediction Accuracy for Model Selection and Model Estimation",
    Journal of the Operational Research Society, 66(8), 1352-1362.
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)

    output_errors = np.average(np.log(y_pred / y_true)**2,
                               weights=sample_weight)

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
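A worked check of the docstring example above using plain numpy arithmetic:

import numpy as np

y_true = np.array([1.0, 2.0, 4.0])
y_pred = np.array([1.0, 1.0, 3.0])
# Per-sample terms: log(1/1)**2 = 0, log(1/2)**2 ≈ 0.480, log(3/4)**2 ≈ 0.083 -> mean ≈ 0.188
print(np.mean(np.log(y_pred / y_true) ** 2))  # 0.1877379962427844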
Example #17
    def load_data(self, y_true, y_pred, *args, **kwargs):
        y_type, y_true, y_pred, multioutput = _check_reg_targets(
            y_true, y_pred, multioutput='uniform_average')
        self.y_true = y_true
        self.y_pred = y_pred