def test__check_reg_targets_exception():
    """An unrecognised ``multioutput`` string must make the checker raise."""
    bad_value = 'this_value_is_not_valid'
    # The message is matched as a regex: the "allowed values" sentence,
    # anything in between, then the repr of the rejected value.
    echoed_value = "You provided multioutput={!r}".format(bad_value)
    pattern = "Allowed 'multioutput' string values are.+" + echoed_value

    flat_targets = [1, 2, 3]
    column_targets = [[1], [2], [3]]
    with pytest.raises(ValueError, match=pattern):
        _check_reg_targets(flat_targets, column_targets, bad_value)
def test__check_reg_targets():
    """Exercise ``_check_reg_targets`` on every ordered pair of sample targets.

    Pairs that agree on target type and output count must be accepted and
    returned as 2-D arrays; every other pair must raise ``ValueError``.
    """
    # Each case is (expected target type, y, number of outputs); all have
    # three samples.
    cases = [
        ("continuous", [1, 2, 3], 1),
        ("continuous", [[1], [2], [3]], 1),
        ("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2),
        ("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2),
        ("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3),
    ]

    for (kind_a, y_a, width_a), (kind_b, y_b, width_b) in product(
            cases, repeat=2):
        if kind_a != kind_b or width_a != width_b:
            # Incompatible targets are rejected outright.
            with pytest.raises(ValueError):
                _check_reg_targets(y_a, y_b, None)
            continue

        detected, checked_a, checked_b, _ = _check_reg_targets(
            y_a, y_b, None)
        assert kind_a == detected

        if kind_a == 'continuous':
            # Single-output targets come back as one column.
            assert_array_equal(checked_a, np.reshape(y_a, (-1, 1)))
            assert_array_equal(checked_b, np.reshape(y_b, (-1, 1)))
        else:
            assert_array_equal(checked_a, y_a)
            assert_array_equal(checked_b, y_b)
def mean_absolute_percentage_error(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
    multioutput: str = "uniform_average",
) -> float:
    """
    Return the MAPE (Mean Absolute Percentage Error) of a prediction.

    The formula is ``np.mean(np.abs((y_true - y_pred) / y_true))``, computed
    separately for each output column. The result is a ratio, not a value
    scaled to 0-100.

    Parameters
    ----------
    y_true : np.ndarray
        Observed values.
    y_pred : np.ndarray
        Predicted values.
    sample_weight : Optional[np.ndarray], default=None
        Individual weights for each sample.
    multioutput : {"raw_values", "uniform_average"} or array-like
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        If input is list then the shape must be (n_outputs,).

        - "raw_values": Returns a full set of errors in case of
          multioutput input.
        - "uniform_average": Errors of all outputs are averaged with
          uniform weight.

    Returns
    -------
    float
        MAPE value of the input. With ``multioutput="raw_values"`` an array
        holding one value per output is returned instead.

    Notes
    -----
    The magnitude of ``y_true`` is clamped from below at machine epsilon, so
    a zero target gives a very large finite error rather than ``inf``.

    Examples
    --------
    >>> import numpy as np
    >>> y_true = np.array([1, 2, 4])
    >>> y_pred = np.array([1, 1, 2])
    >>> mean_absolute_percentage_error(y_true, y_pred)
    0.3333333333333333
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)

    eps = np.finfo(np.float64).eps
    # Clamp |y_true|, not y_true: clamping the signed value would replace
    # every negative target by eps and blow the error up.
    relative_errors = (
        np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps))
    # axis=0 averages over samples only. Without it numpy rejects a 1-D
    # sample_weight against the 2-D targets and merges all outputs together.
    output_errors = np.average(
        relative_errors, weights=sample_weight, axis=0)

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
def group_mean_log_mae(
    y_true: Union[pd.DataFrame, pd.Series, Sequence, np.ndarray],
    y_pred: Union[pd.DataFrame, pd.Series, Sequence, np.ndarray],
    groups: Union[Sequence, np.ndarray, pd.Series],
    floor: float = 1e-9,
) -> float:
    """Calculates the Group Mean Log Mean Absolute Error.

    The mean absolute error is computed inside each group, clamped from
    below at ``floor``, log-transformed, and the logs are averaged over the
    groups. Used in a Kaggle competition.

    Parameters
    ----------
    y_true: list or array-like
        The true, or the expected, values of our problem. Only the first
        output column is used.

    y_pred: list or array-like
        The predicted values of our problem. Only the first output column
        is used.

    groups: list or array like
        Group label of each sample, in the same order as ``y_true``. Labels
        are matched to samples by position.

    floor: float, default=1e-9
        Lower bound applied to each group's MAE before the log is taken
        (log(0) is undefined).

    Returns
    -------
    Our Group Mean Log MAE score
    """
    _, y_true, y_pred, _ = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")

    if isinstance(groups, pd.Series):
        # groupby aligns a Series key on its index. The errors below carry a
        # fresh 0..n-1 index, so a differently indexed Series would be
        # matched to the wrong rows; drop the index to match by position.
        groups = np.asarray(groups)

    abs_errors = pd.Series(np.abs(y_true[:, 0] - y_pred[:, 0]))
    group_maes = abs_errors.groupby(groups).mean()
    return np.log(group_maes.clip(lower=floor)).mean()
def load_data(self, y_true: 'DataFrame', y_pred: 'DataFrame', *args, **kwargs):
    """Validate the inputs, then store the truth and thresholded predictions.

    TODO: Rename class to clarify it receives a dataframe and casts it to a
    numpy array.

    Args:
        y_true (DataFrame): ground truth
        y_pred (DataFrame): predicted probabilities for positive class
    """
    checked = _check_reg_targets(
        y_true, y_pred, multioutput='uniform_average')
    # Only the two validated arrays are needed; the detected target type and
    # the multioutput value are discarded.
    _, truth, scores, _ = checked

    self.y_true = truth
    # A prediction counts as positive when it exceeds the threshold.
    self.y_pred = scores > self.probability_threshold
def _symmetric_mean_absolute_percentage_error(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
    multioutput: str = "uniform_average",
):
    r"""Symmetric mean absolute percentage error regression loss (SMAPE_):

    .. math:: \text{SMAPE} = \frac{2}{n}\sum_{i=1}^n
        \frac{| y_i - \hat{y_i} |}{\max(| y_i | + | \hat{y_i} |, \epsilon)}

    Where :math:`y` is a tensor of target values, :math:`\hat{y}` is a tensor
    of predictions, and :math:`\epsilon` is the float64 machine epsilon.

    Args:
        y_true: array-like of shape (n_samples,) or (n_samples, n_outputs)
            Ground truth (correct) target values.
        y_pred: array-like of shape (n_samples,) or (n_samples, n_outputs)
            Estimated target values.
        sample_weight: array-like of shape (n_samples,), default=None
            Sample weights.
        multioutput: {'raw_values', 'uniform_average'} or array-like
            Defines aggregating of multiple output values.
            Array-like value defines weights used to average errors.
            If input is list then the shape must be (n_outputs,).

            - 'raw_values': Returns a full set of errors in case of
              multioutput input.
            - 'uniform_average': Errors of all outputs are averaged with
              uniform weight.

    Returns:
        loss: float or ndarray of floats in the range [0, 2]
            If multioutput is 'raw_values', then symmetric mean absolute
            percentage error is returned for each output separately.
            If multioutput is 'uniform_average' or an ndarray of weights,
            then the weighted average of all output errors is returned.

            SMAPE output is non-negative floating point. The best value is
            0.0. When both the target and the prediction are zero the
            denominator is clamped to epsilon and the sample contributes 0.
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)

    tiny = np.finfo(np.float64).eps
    doubled_gap = 2 * np.abs(y_pred - y_true)
    # Clamp the denominator so an all-zero pair does not divide by zero.
    magnitude = np.maximum(np.abs(y_true) + np.abs(y_pred), tiny)
    per_output = np.average(
        doubled_gap / magnitude, weights=sample_weight, axis=0)

    if not isinstance(multioutput, str):
        # Array-like: explicit per-output weights.
        return np.average(per_output, weights=multioutput)
    if multioutput == "raw_values":
        return per_output
    # Any other string: uniform mean over the outputs.
    return np.average(per_output, weights=None)
def mean_absolute_percentage_error(y_true, y_pred,
                                   sample_weight=None,
                                   multioutput='uniform_average'):
    """Mean absolute percentage error regression loss.

    The result is a plain ratio and is not scaled to the range [0, 100];
    it lies in the range [0, 1/eps]. Read more in the
    :ref:`User Guide <mean_absolute_percentage_error>`.

    .. versionadded:: 0.24

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    multioutput : {'raw_values', 'uniform_average'} or array-like
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        If input is list then the shape must be (n_outputs,).

        'raw_values' :
            Returns a full set of errors in case of multioutput input.

        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.

    Returns
    -------
    loss : float or ndarray of floats in the range [0, 1/eps]
        If multioutput is 'raw_values', then mean absolute percentage error
        is returned for each output separately.
        If multioutput is 'uniform_average' or an ndarray of weights, then
        the weighted average of all output errors is returned.

        MAPE output is non-negative floating point. The best value is 0.0.
        Bad predictions can lead to arbitrarily large MAPE values,
        especially when some y_true values are very close to zero.
        A large finite value is returned instead of `inf` when y_true is
        zero.
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)

    tiny = np.finfo(np.float64).eps
    gap = np.abs(y_pred - y_true)
    # Clamp the denominator so zero targets do not divide by zero.
    scale = np.maximum(np.abs(y_true), tiny)
    output_errors = np.average(gap / scale, weights=sample_weight, axis=0)

    if isinstance(multioutput, str):
        if multioutput == 'raw_values':
            return output_errors
        if multioutput == 'uniform_average':
            # None as weights makes np.average take the uniform mean.
            multioutput = None

    return np.average(output_errors, weights=multioutput)
def adjusted_explained_variance_score(
    y_true: Union[Sequence[float], np.ndarray, pd.Series],
    y_pred: Union[Sequence[float], np.ndarray, pd.Series],
    features_vector: Optional[Union[Sequence[str], np.ndarray,
                                    pd.Series]] = None,
    num_features: Optional[int] = None,
) -> float:
    """Calculates an adjusted explained_variance_score that penalizes models
    that use too many superfluous features.

    The explained variance score is floored at 0 and then adjusted as
    ``1 - (1 - evs) * (n - 1) / (n - p - 1)``, where ``n`` is the number of
    samples and ``p`` the number of features.

    Parameters
    ----------
    y_true: list or array like
        The true, or the expected, values of our problem

    y_pred: list or array like
        The predicted values of our problem

    features_vector: list or array like, default=None
        A list of all features used for our model. Takes precedence over
        ``num_features`` when it is non-empty.

    num_features: int, default=None
        The number of features used for our model. Used if features_vector
        is None or empty.

    Returns
    -------
    Our adjusted explained_variance_score.

    Raises
    ------
    Exception
        If no feature count is available, if it is below one, or if it is
        not smaller than ``n - 1``.
    """
    _, y_true, y_pred, _ = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")
    n = len(y_true)

    # Test by length, not truthiness: bool() of a multi-element ndarray or
    # Series raises "truth value is ambiguous".
    if features_vector is not None and len(features_vector) > 0:
        p = len(features_vector)
    elif num_features:
        p = num_features
    else:
        raise Exception("No features available to calculate adjusted score")

    if p >= n - 1:
        raise Exception(
            "Number of features is greater than number of rows and "
            "1 degree of freedom"
        )
    if p < 1:
        raise Exception("Cannot have less than one feature")

    # Score once; anything that is not strictly positive is floored at 0.
    evs = explained_variance_score(y_true, y_pred)
    evs = evs if evs > 0 else 0
    return 1 - (1 - evs) * (n - 1) / (n - p - 1)
def mean_absolute_percentage_error(
    y_true, y_pred, sample_weight=None, multioutput="uniform_average"
):
    """Mean absolute percentage error that skips samples whose target is zero.

    For each output, ``|y_pred - y_true| / |y_true|`` is averaged over the
    samples with a non-zero ``y_true``; zero targets are left out instead of
    producing a division by zero.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. Weights of skipped samples are ignored.
    multioutput : {'raw_values', 'uniform_average'} or array-like
        'raw_values' returns one error per output, 'uniform_average' the
        plain mean of the per-output errors, and an array-like the weighted
        mean of the per-output errors.

    Returns
    -------
    loss : float or ndarray of floats
        The aggregated error, or one error per output for 'raw_values'.
        An output whose targets are all zero yields ``nan``.
    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput
    )
    check_consistent_length(y_true, y_pred, sample_weight)

    # Keep the arrays 2-D and exclude zero targets through zero weights.
    # Boolean indexing would flatten the arrays, mix the outputs together
    # and leave sample_weight out of step with the surviving rows.
    nonzero = y_true != 0
    safe_denominator = np.where(nonzero, np.abs(y_true), 1)
    mape = np.abs(y_pred - y_true) / safe_denominator

    weights = nonzero.astype(float)
    if sample_weight is not None:
        weights = weights * np.asarray(
            sample_weight, dtype=float).reshape(-1, 1)

    # Manual weighted mean so an all-zero output gives nan, not an error.
    with np.errstate(invalid="ignore", divide="ignore"):
        output_errors = (mape * weights).sum(axis=0) / weights.sum(axis=0)

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
def mean_max10_error(y_true, y_pred,
                     sample_weight=None,
                     multioutput='uniform_average'):
    """Mean of the 10 largest absolute errors.

    For each output the 10 largest values of ``|y_true - y_pred|`` are
    averaged; all errors are used when there are fewer than 10 samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. Only the weights of the selected samples enter the
        average.
    multioutput : {'raw_values', 'uniform_average'} or array-like
        'raw_values' returns one value per output, 'uniform_average' the
        plain mean over outputs, and an array-like the weighted mean over
        outputs.

    Returns
    -------
    loss : float or ndarray of floats
        The aggregated value, or one value per output for 'raw_values'.
    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)

    abs_errors = np.abs(y_true - y_pred)
    # Row indices of the largest errors, chosen per output column.
    # heapq.nlargest compared whole rows, which fails with several outputs.
    largest_rows = np.argsort(abs_errors, axis=0)[::-1][:10]
    columns = np.arange(abs_errors.shape[1])
    max10 = abs_errors[largest_rows, columns]

    if sample_weight is None:
        output_errors = np.average(max10, axis=0)
    else:
        # Take the weights of the selected samples; the full-length weight
        # vector cannot be applied to at most 10 values.
        selected_weights = np.asarray(sample_weight)[largest_rows]
        output_errors = np.average(
            max10, weights=selected_weights, axis=0)

    if isinstance(multioutput, str):
        if multioutput == 'raw_values':
            return output_errors
        elif multioutput == 'uniform_average':
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
def root_mean_squared_error(
    y_true: Union[Sequence[float], np.ndarray, pd.Series],
    y_pred: Union[Sequence[float], np.ndarray, pd.Series],
) -> float:
    """Calculates the Root Mean Squared Error for regression problems.

    Parameters
    ----------
    y_true: list or array like
        The true, or the expected, values of our problem

    y_pred: list or array like
        The predicted values of our problem

    Returns
    -------
    Our RMSE score: the square root of the summed squared residuals divided
    by the number of samples.
    """
    _, actual, predicted, _ = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")

    residuals = actual - predicted
    mean_square = np.sum(residuals ** 2) / len(actual)
    return math.sqrt(mean_square)
def standard_absolute_error(y_true, y_pred,
                            sample_weight=None,
                            multioutput='uniform_average'):
    """Standard deviation of the absolute error.

    For each output, the population standard deviation (``ddof=0``) of
    ``|y_pred - y_true|`` is computed over the samples, ignoring NaN
    entries.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. When given, the weighted standard deviation is
        computed.
    multioutput : {'raw_values', 'uniform_average'} or array-like
        'raw_values' returns one value per output, 'uniform_average' the
        plain mean over outputs, and an array-like the weighted mean over
        outputs.

    Returns
    -------
    loss : float or ndarray of floats
        The aggregated value, or one value per output for 'raw_values'.
    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)

    abs_errors = np.abs(y_pred - y_true)

    if sample_weight is None:
        output_errors = np.nanstd(abs_errors, axis=0)
    else:
        # Sample weights belong inside the per-sample statistic. They
        # cannot weight the per-output std vector, whose length is
        # n_outputs rather than n_samples.
        valid = ~np.isnan(abs_errors)
        weights = np.asarray(
            sample_weight, dtype=float).reshape(-1, 1) * valid
        weight_totals = weights.sum(axis=0)
        filled = np.where(valid, abs_errors, 0.0)
        weighted_mean = (weights * filled).sum(axis=0) / weight_totals
        deviations = np.where(valid, abs_errors - weighted_mean, 0.0)
        output_errors = np.sqrt(
            (weights * deviations ** 2).sum(axis=0) / weight_totals)

    if isinstance(multioutput, str):
        if multioutput == 'raw_values':
            return output_errors
        elif multioutput == 'uniform_average':
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
def mape_score(
    y_true: Union[Sequence[float], np.ndarray, pd.Series],
    y_pred: Union[Sequence[float], np.ndarray, pd.Series],
) -> float:
    """Calculates the Mean Absolute Percentage Error, a common metric used
    for Time Series Problems.

    Parameters
    ----------
    y_true: list or array like
        The true, or the expected, values of our problem

    y_pred: list or array like
        The predicted values of our problem

    Returns
    -------
    Our MAPE score, scaled by 100.

    Raises
    ------
    Exception
        If any true value is zero.
    """
    _, actual, predicted, _ = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")

    # A zero target would make the relative error undefined.
    if np.any(actual == 0):
        raise Exception("Cannot divide by zero")

    relative_gap = (actual - predicted) / actual
    return np.mean(np.abs(relative_gap)) * 100
def smape_score(
    y_true: Union[Sequence[float], np.ndarray, pd.Series],
    y_pred: Union[Sequence[float], np.ndarray, pd.Series],
) -> float:
    """Calculates the Symmetric Mean Absolute Percentage Error.

    Used when there are zeros in our y_true that would cause MAPE to be
    undefined. Each sample contributes ``|y_true - y_pred| / (|y_true| +
    |y_pred|)``; a sample where both values are zero is a perfect
    prediction and contributes 0.

    Parameters
    ----------
    y_true: list or array like
        The true, or the expected, values of our problem

    y_pred: list or array like
        The predicted values of our problem

    Returns
    -------
    Our SMAPE score, scaled by 100.
    """
    _, y_true, y_pred, _ = _check_reg_targets(
        y_true, y_pred, multioutput="raw_values")

    error = np.abs(y_true - y_pred)
    total = np.abs(y_true) + np.abs(y_pred)

    # total == 0 means both values are zero, so the error is zero too.
    # Dividing there gives 0/0 = nan, which would turn the whole score
    # into nan; those entries stay at 0 instead.
    nonzero = total != 0
    ratios = np.zeros(error.shape, dtype=float)
    ratios[nonzero] = error[nonzero] / total[nonzero]

    return 100 * np.sum(ratios) / len(error)
def mean_directional_accuracy(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
    multioutput: str = "uniform_average",
) -> float:
    """
    Return the MDA (Mean Directional Accuracy) of a prediction.

    This is the mean of the vector
    1_{sgn(y_true - y_true_lag_1) = sgn(y_pred - y_true_lag_1)}.
    In plain words, it computes how often the model got the direction of the
    time series movement right. It is computed separately for each output
    column.

    Parameters
    ----------
    y_true : np.ndarray (non-negative numbers)
        Observed values.
    y_pred : np.ndarray
        Predicted values.
    sample_weight : Optional[np.ndarray], default=None
        Individual weights for each sample. The first entry is ignored
        since the MDA loss term consists of len(y_true) - 1 summands.
    multioutput : {"raw_values", "uniform_average"} or array-like
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        If input is list then the shape must be (n_outputs,).

        - "raw_values": Returns a full set of errors in case of
          multioutput input.
        - "uniform_average": Errors of all outputs are averaged with
          uniform weight.

    Returns
    -------
    float
        MDA value of the input. With ``multioutput="raw_values"`` an array
        holding one value per output is returned instead.

    Examples
    --------
    >>> import numpy as np
    >>> y_true = np.array([1, 2, 4])
    >>> y_pred = np.array([1, 1, 3])
    >>> mean_directional_accuracy(y_true, y_pred)
    0.5
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)

    previous = y_true[:-1]
    actual_direction = np.sign(y_true[1:] - previous)
    predicted_direction = np.sign(y_pred[1:] - previous)

    # The first sample has no predecessor, so its weight is dropped.
    step_weights = (
        None if sample_weight is None else np.asarray(sample_weight)[1:])
    # axis=0 averages over time steps only. Without it numpy rejects 1-D
    # weights against the 2-D targets and merges all outputs together.
    output_errors = np.average(
        actual_direction == predicted_direction,
        weights=step_weights,
        axis=0,
    )

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
def mean_log_quotient(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    sample_weight: Optional[np.ndarray] = None,
    multioutput: str = "uniform_average",
) -> float:
    """
    Return the MLQ (Mean Log Quotient) of a prediction.

    This is np.mean(np.log(y_pred / y_true)**2), computed separately for
    each output column.

    Parameters
    ----------
    y_true : np.ndarray (non-negative numbers)
        Observed values.
    y_pred : np.ndarray
        Predicted values.
    sample_weight : Optional[np.ndarray], default=None
        Individual weights for each sample.
    multioutput : {"raw_values", "uniform_average"} or array-like
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
        If input is list then the shape must be (n_outputs,).

        - "raw_values": Returns a full set of errors in case of
          multioutput input.
        - "uniform_average": Errors of all outputs are averaged with
          uniform weight.

    Returns
    -------
    float
        MLQ value of the input. With ``multioutput="raw_values"`` an array
        holding one value per output is returned instead.

    Examples
    --------
    >>> import numpy as np
    >>> y_true = np.array([1, 2, 4])
    >>> y_pred = np.array([1, 1, 3])
    >>> mean_log_quotient(y_true, y_pred)
    0.1877379962427844

    Notes
    -----
    See Tofallis, C (2015) "A Better Measure of Relative Prediction Accuracy
    for Model Selection and Model Estimation", Journal of the Operational
    Research Society, 66(8),1352-1362.
    """
    _, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)

    squared_log_quotients = np.log(y_pred / y_true) ** 2
    # axis=0 averages over samples only. Without it numpy rejects a 1-D
    # sample_weight against the 2-D targets and merges all outputs together.
    output_errors = np.average(
        squared_log_quotients, weights=sample_weight, axis=0)

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)
def load_data(self, y_true, y_pred, *args, **kwargs):
    """Validate the targets and predictions, then store them on the instance.

    Args:
        y_true: ground-truth values.
        y_pred: predicted values.
    """
    checked = _check_reg_targets(
        y_true, y_pred, multioutput='uniform_average')
    # Positions 1 and 2 hold the validated arrays; the detected target type
    # and the multioutput value are not needed here.
    self.y_true, self.y_pred = checked[1], checked[2]