def load_metrics():
    """Yield one instance of every metric class found in `river.metrics`.

    The classes are discovered with `inspect.getmembers` on the imported
    `river.metrics` module. The class named `Metric` is skipped. Classes that
    wrap another metric are built around an `MSE` instance; every other class
    is instantiated from its own signature.

    Yields
    ------
    An instance of each discovered metric class.

    """
    metrics_module = importlib.import_module('river.metrics')

    for name, obj in inspect.getmembers(metrics_module, inspect.isclass):

        if name == 'Metric':
            continue

        # Wrapper metrics need an inner metric to be instantiated.
        if issubclass(obj, metrics.Rolling):
            yield obj(metric=metrics.MSE(), window_size=42)
            continue
        elif name == 'RegressionMultiOutput':
            yield obj(metric=metrics.MSE())
            continue

        try:
            sig = inspect.signature(obj)
            yield obj(
                **{
                    # `empty` is a sentinel: compare by identity so that defaults
                    # with a custom `__ne__` cannot be mistaken for "no default".
                    # Parameters without a default receive the placeholder 5.
                    param.name: param.default if param.default is not param.empty else 5
                    for param in sig.parameters.values()
                })
        except ValueError:
            # Fall back to a bare construction when the signature-driven
            # attempt raises a ValueError.
            yield obj()
def test_compose():
    """Check which metric combinations can be added together.

    Adding two regression metrics or two classification metrics must work,
    whereas any sum mixing `MSE`/`MAE` with `LogLoss` must raise a
    `ValueError`.

    """
    valid_sums = (
        lambda: metrics.MAE() + metrics.MSE(),
        lambda: metrics.Accuracy() + metrics.LogLoss(),
    )
    invalid_sums = (
        lambda: metrics.MSE() + metrics.LogLoss(),
        lambda: metrics.MSE() + metrics.MAE() + metrics.LogLoss(),
    )

    # These must simply not raise.
    for build in valid_sums:
        build()

    # Each of these must be rejected.
    for build in invalid_sums:
        with pytest.raises(ValueError):
            _ = build()
def reset(self):
    """Rebuild the rolling MAE, MSE and R2 trackers and clear the counters.

    Every tracker is a fresh `metrics.Rolling` wrapper sized with
    `self.window_size`. The sample count goes back to zero and the last
    observed label and prediction are forgotten.

    """
    def _windowed(metric):
        # Wrap a metric so that it is computed over `self.window_size`.
        return metrics.Rolling(metric, window_size=self.window_size)

    self.mae = _windowed(metrics.MAE())
    self.mse = _windowed(metrics.MSE())
    self.r2 = _windowed(metrics.R2())

    self.sample_count = 0
    self.last_true_label = self.last_prediction = None
def __init__(self):
    """Create fresh MAE, MSE and R2 metrics with no stored observation."""
    super().__init__()

    for attr_name, metric_cls in (
        ("mae", metrics.MAE),
        ("mse", metrics.MSE),
        ("r2", metrics.R2),
    ):
        setattr(self, attr_name, metric_cls())

    # Nothing has been observed yet.
    self.last_true_label = self.last_prediction = None
def __init__( self, # Forest parameters n_models: int = 10, max_features="sqrt", aggregation_method: str = "median", lambda_value: int = 6, metric: metrics.RegressionMetric = metrics.MSE(), disable_weighted_vote=True, drift_detector: base.DriftDetector = ADWIN(0.001), warning_detector: base.DriftDetector = ADWIN(0.01), # Tree parameters grace_period: int = 50, max_depth: int = None, split_confidence: float = 0.01, tie_threshold: float = 0.05, leaf_prediction: str = "model", leaf_model: base.Regressor = None, model_selector_decay: float = 0.95, nominal_attributes: list = None, splitter: Splitter = None, min_samples_split: int = 5, binary_split: bool = False, max_size: int = 500, memory_estimate_period: int = 2_000_000, stop_mem_management: bool = False,
def __init__(self, window_size=200):
    """Create rolling MAE, MSE and R2 trackers over `window_size`.

    Parameters
    ----------
    window_size
        Window size handed to every `metrics.Rolling` wrapper.

    """
    super().__init__()
    self.window_size = window_size

    for attr_name, metric_cls in (
        ("mae", metrics.MAE),
        ("mse", metrics.MSE),
        ("r2", metrics.R2),
    ):
        rolling = metrics.Rolling(metric_cls(), window_size=self.window_size)
        setattr(self, attr_name, rolling)

    self.sample_count = 0
    # Nothing has been observed yet.
    self.last_true_label = None
    self.last_prediction = None
def __init__(
    self,
    # Forest parameters
    n_models: int = 10,
    max_features="sqrt",
    aggregation_method: str = "median",
    lambda_value: int = 6,
    metric: metrics.RegressionMetric = metrics.MSE(),
    disable_weighted_vote=True,
    drift_detector: base.DriftDetector = ADWIN(0.001),
    warning_detector: base.DriftDetector = ADWIN(0.01),
    # Tree parameters
    grace_period: int = 50,
    max_depth: int = None,
    split_confidence: float = 0.01,
    tie_threshold: float = 0.05,
    leaf_prediction: str = "model",
    leaf_model: base.Regressor = None,
    model_selector_decay: float = 0.95,
    nominal_attributes: list = None,
    splitter: Splitter = None,
    min_samples_split: int = 5,
    max_size: int = 100,
    memory_estimate_period: int = 2000000,
    seed: int = None,
    **kwargs,
):
    """Configure the forest and store the hyper-parameters of its trees.

    The forest-level arguments are forwarded to the parent constructor; the
    tree-level arguments and any extra keyword arguments are stored on the
    instance as given.

    Raises
    ------
    ValueError
        If `aggregation_method` is not one of `_VALID_AGGREGATION_METHOD`.

    """
    forest_params = dict(
        n_models=n_models,
        max_features=max_features,
        lambda_value=lambda_value,
        metric=metric,
        disable_weighted_vote=disable_weighted_vote,
        drift_detector=drift_detector,
        warning_detector=warning_detector,
        seed=seed,
    )
    super().__init__(**forest_params)

    self._n_samples_seen = 0
    self._base_member_class = ForestMemberRegressor

    # Tree parameters: growth control
    self.grace_period = grace_period
    self.max_depth = max_depth
    self.split_confidence = split_confidence
    self.tie_threshold = tie_threshold
    self.min_samples_split = min_samples_split

    # Tree parameters: leaves and attribute handling
    self.leaf_prediction = leaf_prediction
    self.leaf_model = leaf_model
    self.model_selector_decay = model_selector_decay
    self.nominal_attributes = nominal_attributes
    self.splitter = splitter

    # Tree parameters: memory management
    self.max_size = max_size
    self.memory_estimate_period = memory_estimate_period

    self.kwargs = kwargs

    if aggregation_method not in self._VALID_AGGREGATION_METHOD:
        raise ValueError(
            f"Invalid aggregation_method: {aggregation_method}.\n"
            f"Valid values are: {self._VALID_AGGREGATION_METHOD}"
        )
    self.aggregation_method = aggregation_method
(metrics.WeightedRecall(), partial(sk_metrics.recall_score, average='weighted')), (metrics.FBeta(beta=.5), partial(sk_metrics.fbeta_score, beta=.5)), (metrics.MacroFBeta(beta=.5), partial(sk_metrics.fbeta_score, beta=.5, average='macro')), (metrics.MicroFBeta(beta=.5), partial(sk_metrics.fbeta_score, beta=.5, average='micro')), (metrics.WeightedFBeta(beta=.5), partial(sk_metrics.fbeta_score, beta=.5, average='weighted')), (metrics.F1(), sk_metrics.f1_score), (metrics.MacroF1(), partial(sk_metrics.f1_score, average='macro')), (metrics.MicroF1(), partial(sk_metrics.f1_score, average='micro')), (metrics.WeightedF1(), partial(sk_metrics.f1_score, average='weighted')), (metrics.MCC(), sk_metrics.matthews_corrcoef), (metrics.MAE(), sk_metrics.mean_absolute_error), (metrics.MSE(), sk_metrics.mean_squared_error), ] @pytest.mark.parametrize('metric, sk_metric', [ pytest.param(metric, sk_metric, id=f'{metric.__class__.__name__}') for metric, sk_metric in TEST_CASES ]) @pytest.mark.filterwarnings('ignore::RuntimeWarning') @pytest.mark.filterwarnings( 'ignore::sklearn.metrics.classification.UndefinedMetricWarning') def test_metric(metric, sk_metric): # Check str works str(metric)
class AdaptiveRandomForestRegressor(BaseForest, base.Regressor):
    r"""Adaptive Random Forest regressor.

    The 3 most important aspects of Adaptive Random Forest [^1] are:

    1. inducing diversity through re-sampling

    2. inducing diversity through randomly selecting subsets of features for
       node splits

    3. drift detectors per base tree, which cause selective resets in response
       to drifts

    Notice that this implementation is slightly different from the original
    algorithm proposed in [^2]. The `HoeffdingTreeRegressor` is used as base
    learner, instead of `FIMT-DD`. It also adds a new strategy to monitor the
    predictions and check for concept drifts. The deviations of the predictions
    to the target are monitored and normalized in the [0, 1] range to fulfill
    ADWIN's requirements. We assume that the data subjected to the normalization
    follows a normal distribution, and thus, lies within the interval of the
    mean $\pm3\sigma$.

    Parameters
    ----------
    n_models
        Number of trees in the ensemble.
    max_features
        Max number of attributes for each node split.<br/>
        - If `int`, then consider `max_features` at each split.<br/>
        - If `float`, then `max_features` is a percentage and
          `int(max_features * n_features)` features are considered per split.<br/>
        - If "sqrt", then `max_features=sqrt(n_features)`.<br/>
        - If "log2", then `max_features=log2(n_features)`.<br/>
        - If None, then ``max_features=n_features``.
    lambda_value
        The lambda value for bagging (lambda=6 corresponds to Leveraging Bagging).
    metric
        Metric used to track trees performance within the ensemble. Depending
        on the configuration, this metric is also used to weight predictions
        from the members of the ensemble.
    aggregation_method
        The method to use to aggregate predictions in the ensemble.<br/>
        - 'mean'<br/>
        - 'median' - If selected will disable the weighted vote.
    disable_weighted_vote
        If `True`, disables the weighted vote prediction, i.e. does not assign
        weights to individual tree's predictions and uses the arithmetic mean
        instead. Otherwise will use the `metric` value to weight predictions.
    drift_detector
        Drift Detection method. Set to None to disable Drift detection.
    warning_detector
        Warning Detection method. Set to None to disable warning detection.
    grace_period
        [*Tree parameter*] Number of instances a leaf should observe between
        split attempts.
    max_depth
        [*Tree parameter*] The maximum depth a tree can reach. If `None`, the
        tree will grow indefinitely.
    split_confidence
        [*Tree parameter*] Allowed error in split decision, a value closer to 0
        takes longer to decide.
    tie_threshold
        [*Tree parameter*] Threshold below which a split will be forced to break
        ties.
    leaf_prediction
        [*Tree parameter*] Prediction mechanism used at leaves.<br/>
        - 'mean' - Target mean<br/>
        - 'model' - Uses the model defined in `leaf_model`<br/>
        - 'adaptive' - Chooses between 'mean' and 'model' dynamically<br/>
    leaf_model
        [*Tree parameter*] The regression model used to provide responses if
        `leaf_prediction='model'`. If not provided, an instance of
        `river.linear_model.LinearRegression` with the default hyperparameters
        is used.
    model_selector_decay
        [*Tree parameter*] The exponential decaying factor applied to the
        learning models' squared errors, that are monitored if
        `leaf_prediction='adaptive'`. Must be between `0` and `1`. The closer
        to `1`, the more importance is going to be given to past observations.
        On the other hand, if its value approaches `0`, the recent observed
        errors are going to have more influence on the final decision.
    nominal_attributes
        [*Tree parameter*] List of Nominal attributes. If empty, then assume
        that all attributes are numerical.
    attr_obs
        [*Tree parameter*] The attribute observer (AO) used to monitor the
        target statistics of numeric features and perform splits. Parameters
        can be passed to the AOs (when supported) by using `attr_obs_params`.
        Valid options are:<br/>
        - `'e-bst'`: Extended Binary Search Tree (E-BST). This AO has no
          parameters.<br/>
        See notes for more information about the supported AOs.
    attr_obs_params
        [*Tree parameter*] Parameters passed to the numeric AOs. See `attr_obs`
        for more information.
    min_samples_split
        [*Tree parameter*] The minimum number of samples every branch resulting
        from a split candidate must have to be considered valid.
    max_size
        [*Tree parameter*] Maximum memory (MB) consumed by the tree.
    memory_estimate_period
        [*Tree parameter*] Number of instances between memory consumption checks.
    seed
        If `int`, `seed` is used to seed the random number generator;
        If `RandomState`, `seed` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance
        used by `np.random`.
    kwargs
        Other parameters passed to `river.tree.BaseHoeffdingTree`.

    Notes
    -----
    Hoeffding trees rely on Attribute Observer (AO) algorithms to monitor input
    features and perform splits. Nominal features can be easily dealt with,
    since the partitions are well-defined. Numerical features, however, require
    more sophisticated solutions. Currently, only one AO is supported in `river`
    for regression trees:

    - The Extended Binary Search Tree (E-BST) uses an exhaustive algorithm to
      find split candidates, similarly to batch decision tree algorithms. It
      ends up storing all observations between split attempts. However, E-BST
      automatically removes bad split points periodically from its structure
      and, thus, alleviates the memory and time costs involved in its usage.

    References
    ----------
    [^1]: Gomes, H.M., Bifet, A., Read, J., Barddal, J.P., Enembreck, F.,
          Pfahringer, B., Holmes, G. and Abdessalem, T., 2017. Adaptive random
          forests for evolving data stream classification. Machine Learning,
          106(9-10), pp.1469-1495.

    [^2]: Gomes, H.M., Barddal, J.P., Boiko, L.E., Bifet, A., 2018.
          Adaptive random forests for data stream regression. ESANN 2018.

    Examples
    --------
    >>> from river import datasets
    >>> from river import evaluate
    >>> from river import metrics
    >>> from river import ensemble
    >>> from river import preprocessing

    >>> dataset = datasets.TrumpApproval()

    >>> model = (
    ...     preprocessing.StandardScaler() |
    ...     ensemble.AdaptiveRandomForestRegressor(n_models=3, seed=42)
    ... )

    >>> metric = metrics.MAE()

    >>> evaluate.progressive_val_score(dataset, model, metric)
    MAE: 1.870913

    """

    _MEAN = "mean"
    _MEDIAN = "median"
    _VALID_AGGREGATION_METHOD = [_MEAN, _MEDIAN]

    def __init__(
        self,
        # Forest parameters
        n_models: int = 10,
        max_features="sqrt",
        aggregation_method: str = "median",
        lambda_value: int = 6,
        metric: metrics.RegressionMetric = metrics.MSE(),
        disable_weighted_vote=True,
        drift_detector: base.DriftDetector = ADWIN(0.001),
        warning_detector: base.DriftDetector = ADWIN(0.01),
        # Tree parameters
        grace_period: int = 50,
        max_depth: int = None,
        split_confidence: float = 0.01,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "model",
        leaf_model: base.Regressor = None,
        model_selector_decay: float = 0.95,
        nominal_attributes: list = None,
        attr_obs: str = "e-bst",
        attr_obs_params: dict = None,
        min_samples_split: int = 5,
        max_size: int = 100,
        memory_estimate_period: int = 2000000,
        seed: int = None,
        **kwargs,
    ):
        super().__init__(
            n_models=n_models,
            max_features=max_features,
            lambda_value=lambda_value,
            metric=metric,
            disable_weighted_vote=disable_weighted_vote,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            seed=seed,
        )

        self._n_samples_seen = 0
        self._base_member_class = ForestMemberRegressor

        # Tree parameters
        self.grace_period = grace_period
        self.max_depth = max_depth
        self.split_confidence = split_confidence
        self.tie_threshold = tie_threshold
        self.leaf_prediction = leaf_prediction
        self.leaf_model = leaf_model
        self.model_selector_decay = model_selector_decay
        self.nominal_attributes = nominal_attributes
        self.attr_obs = attr_obs
        self.attr_obs_params = attr_obs_params
        self.min_samples_split = min_samples_split
        self.max_size = max_size
        self.memory_estimate_period = memory_estimate_period
        self.kwargs = kwargs

        # Reject unknown aggregation strategies once everything else is stored.
        if aggregation_method in self._VALID_AGGREGATION_METHOD:
            self.aggregation_method = aggregation_method
        else:
            raise ValueError(
                f"Invalid aggregation_method: {aggregation_method}.\n"
                f"Valid values are: {self._VALID_AGGREGATION_METHOD}"
            )
def reset(self):
    """Replace the MAE, MSE and R2 metrics with fresh ones and forget the
    last observed label and prediction."""
    self.mae, self.mse, self.r2 = metrics.MAE(), metrics.MSE(), metrics.R2()
    self.last_true_label = self.last_prediction = None
def build_model_4snarimax(self):
    """Load or build the SNARIMAX model, replay missing days and back it up.

    If `self.pck_filename` exists, the pickled backup is loaded, its model,
    metric and SNARIMAX parameters are copied onto `self`, and the update
    starts at the last index of the backup's data. Otherwise a new pipeline
    is built from `self.snarimax_para` and the update starts at the first
    index of `self.data`.

    When the start date precedes the last index of `self.data`, the model is
    updated one day at a time (forecast one step, learn the observation,
    update the metric). The results are then stored on `self` and `self` is
    pickled to `self.pck_filename`.

    Returns
    -------
    None

    """
    if os.path.exists(self.pck_filename):
        # A backup exists: load it and update the model from start1 to start2.
        # SECURITY NOTE: unpickling executes arbitrary code. Only load backup
        # files written by this application, never files from untrusted sources.
        # The context manager guarantees that the file handle is closed.
        with open(self.pck_filename, 'rb') as fh:
            src_bck = pickle.load(fh)
        model = src_bck.snarimax_model
        metric = src_bck.snarimax_metric
        self.snarimax_para = src_bck.snarimax_para
        self.snarimax_model = model
        self.snarimax_metric = metric
        start1 = src_bck.data.index[-1]
        start2 = self.data.index[-1]  # self.data.index[-self.data.index[-1].weekday()]
    else:
        # No backup exists: rebuild the model from the start.
        p, d, q, m, sp, sd, sq = self.snarimax_para
        extract_features = compose.TransformerUnion(get_ordinal_date)
        model = (
            extract_features |
            time_series.SNARIMAX(
                p=p,
                d=d,
                q=q,
                m=m,
                sp=sp,
                sd=sd,
                sq=sq,
                regressor=(
                    # preprocessing.Normalizer() |
                    preprocessing.AdaptiveStandardScaler(alpha=0.1) |
                    preprocessing.StandardScaler() |
                    # preprocessing.RobustScaler(with_scaling=True) |
                    linear_model.LinearRegression(
                        intercept_init=0,
                        optimizer=optim.SGD(0.0001),  # important parameter
                        # optimizer=optim.AdaDelta(0.8,0.00001),  # important parameter
                        # optimizer=optim.AMSGrad(lr=0.01,beta_1=0.8,beta_2=0.1),
                        intercept_lr=0.001))))
        metric = metrics.Rolling(metrics.MSE(), self.dd_historic)
        # metric = metrics.MSE()
        start1 = self.data.index[0]
        start2 = self.data.index[-1]  # self.data.index[-self.data.index[-1].weekday()]

    if start1 < start2:
        for t in pd.date_range(start1, start2, freq='D'):
            x, y = self.snarimax_data.loc[t][['ds', 'temp']].values
            # Forecast before learning so that the metric scores a genuine
            # one-step-ahead prediction.
            y_pred = model.forecast(horizon=1, xs=[x])
            # print(x, y, y_pred[0], y - y_pred[0])
            # NOTE(review): relies on learn_one/update returning the updated
            # object - confirm against the installed library version.
            model = model.learn_one(x, y)
            metric = metric.update(y, y_pred[0])
        self.snarimax_model = model
        self.snarimax_metric = metric
        with open(self.pck_filename, 'wb') as fh:
            pickle.dump(self, fh)

    # for t in pd.date_range(start1, start2):
    #     x = self.snarimax_data.loc[pd.date_range(t-timedelta(self.dd_historic),t)][['ds']].values
    #     y = self.snarimax_data.loc[pd.date_range(t-timedelta(self.dd_historic),t)][['temp']].values
    #     x = np.hstack(x)
    #     y = np.hstack(y)
    #     y_pred = model.forecast(horizon=self.dd_historic+1, xs=x)
    #     for i in range(0,self.dd_historic):
    #         model = model.learn_one(x[i], y[i])
    #         metric = metric.update(y[i], y_pred[i])
    return