Example #1
 def __init__(
     self,
     # Forest parameters
     n_models: int = 10,
     max_features="sqrt",
     aggregation_method: str = "median",
     lambda_value: int = 6,
     metric: metrics.RegressionMetric = metrics.MSE(),
     disable_weighted_vote=True,
     drift_detector: base.DriftDetector = ADWIN(0.001),
     warning_detector: base.DriftDetector = ADWIN(0.01),
     # Tree parameters
     grace_period: int = 50,
     max_depth: int = None,
     split_confidence: float = 0.01,
     tie_threshold: float = 0.05,
     leaf_prediction: str = "model",
     leaf_model: base.Regressor = None,
     model_selector_decay: float = 0.95,
     nominal_attributes: list = None,
     splitter: Splitter = None,
     min_samples_split: int = 5,
     binary_split: bool = False,
     max_size: int = 500,
     memory_estimate_period: int = 2_000_000,
     stop_mem_management: bool = False,
Example #2
 def __init__(self, stats, depth, attr_obs, attr_obs_params, adwin_delta,
              seed):
     super().__init__(stats, depth, attr_obs, attr_obs_params)
     self.adwin_delta = adwin_delta
     self._adwin = ADWIN(delta=self.adwin_delta)
     self.error_change = False
     self._rng = check_random_state(seed)
Example #3
 def __init__(
     self,
     n_models: int = 10,
     max_features: typing.Union[bool, str, int] = "sqrt",
     lambda_value: int = 6,
     metric: metrics.MultiClassMetric = metrics.Accuracy(),
     disable_weighted_vote=False,
     drift_detector: typing.Union[base.DriftDetector,
                                  None] = ADWIN(delta=0.001),
     warning_detector: typing.Union[base.DriftDetector,
                                    None] = ADWIN(delta=0.01),
     # Tree parameters
     grace_period: int = 50,
     max_depth: int = None,
     split_criterion: str = "info_gain",
     split_confidence: float = 0.01,
     tie_threshold: float = 0.05,
     leaf_prediction: str = "nba",
     nb_threshold: int = 0,
     nominal_attributes: list = None,
     splitter: Splitter = None,
     binary_split: bool = False,
     max_size: int = 32,
     memory_estimate_period: int = 2_000_000,
     stop_mem_management: bool = False,
Example #4
    def __init__(self,
                 model=HoeffdingTreeClassifier(grace_period=50,
                                               split_confidence=0.01),
                 n_models: int = 100,
                 subspace_size: typing.Union[int, float, str] = .6,
                 training_method: str = "patches",
                 lam: float = 6.0,
                 drift_detector: typing.Union[base.DriftDetector,
                                              None] = ADWIN(delta=1e-5),
                 warning_detector: base.DriftDetector = ADWIN(delta=1e-4),
                 disable_weighted_vote: bool = False,
                 nominal_attributes=None,
                 seed=None,
                 metric: MultiClassMetric = Accuracy()):
        super().__init__([None])  # List of models is properly initialized later
        self.models = []
        self.model = model  # Not restricted to a specific base estimator.
        self.n_models = n_models
        self.subspace_size = subspace_size
        self.training_method = training_method
        self.lam = lam
        self.drift_detector = drift_detector
        self.warning_detector = warning_detector
        self.disable_weighted_vote = disable_weighted_vote
        self.metric = metric
        self.nominal_attributes = nominal_attributes if nominal_attributes else []
        self.seed = seed
        self._rng = check_random_state(self.seed)

        self._n_samples_seen = 0
        self._subspaces = None

        self._base_learner_class = StreamingRandomPatchesBaseLearner
Example #5
    def __init__(self, split_test, stats, depth, adwin_delta, seed):
        super().__init__(split_test, stats, depth)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._alternate_tree = None
        self._error_change = False

        self._rng = check_random_state(seed)
Example #6
    def __init__(self, stats, *children, adwin_delta, seed, **attributes):
        super().__init__(stats, *children, **attributes)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._alternate_tree = None
        self._error_change = False

        self._rng = check_random_state(seed)
Example #7
    def __init__(
            self,
            n_models: int = 10,
            max_features: typing.Union[bool, str, int] = 'sqrt',
            lambda_value: int = 6,
            metric: MultiClassMetric = Accuracy(),
            disable_weighted_vote=False,
            drift_detector: typing.Union[base.DriftDetector,
                                         None] = ADWIN(delta=0.001),
            warning_detector: typing.Union[base.DriftDetector,
                                           None] = ADWIN(delta=0.01),
            # Tree parameters
            max_size: int = 32,
            memory_estimate_period: int = 2000000,
            grace_period: int = 50,
            split_criterion: str = 'info_gain',
            split_confidence: float = 0.01,
            tie_threshold: float = 0.05,
            binary_split=False,
            stop_mem_management=False,
            remove_poor_attrs=False,
            merit_preprune=True,
            leaf_prediction: str = 'nba',
            nb_threshold: int = 0,
            nominal_attributes: list = None,
            attr_obs: str = 'gaussian',
            attr_obs_params: dict = None,
            max_depth: int = None,
            seed=None):
        super().__init__(n_models=n_models,
                         max_features=max_features,
                         lambda_value=lambda_value,
                         metric=metric,
                         disable_weighted_vote=disable_weighted_vote,
                         drift_detector=drift_detector,
                         warning_detector=warning_detector,
                         seed=seed)

        self._n_samples_seen = 0
        self._base_member_class = ForestMemberClassifier

        # Tree parameters
        self.max_size = max_size
        self.memory_estimate_period = memory_estimate_period
        self.grace_period = grace_period
        self.split_criterion = split_criterion
        self.split_confidence = split_confidence
        self.tie_threshold = tie_threshold
        self.binary_split = binary_split
        self.stop_mem_management = stop_mem_management
        self.remove_poor_attrs = remove_poor_attrs
        self.merit_preprune = merit_preprune
        self.leaf_prediction = leaf_prediction
        self.nb_threshold = nb_threshold
        self.nominal_attributes = nominal_attributes
        self.attr_obs = attr_obs
        self.attr_obs_params = attr_obs_params
        self.max_depth = max_depth
Example #8
    def __init__(
        self,
        n_models: int = 10,
        max_features: typing.Union[bool, str, int] = "sqrt",
        lambda_value: int = 6,
        metric: metrics.MultiClassMetric = metrics.Accuracy(),
        disable_weighted_vote=False,
        drift_detector: typing.Union[base.DriftDetector,
                                     None] = ADWIN(delta=0.001),
        warning_detector: typing.Union[base.DriftDetector,
                                       None] = ADWIN(delta=0.01),
        # Tree parameters
        grace_period: int = 50,
        max_depth: int = None,
        split_criterion: str = "info_gain",
        split_confidence: float = 0.01,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "nba",
        nb_threshold: int = 0,
        nominal_attributes: list = None,
        attr_obs: str = "gaussian",
        attr_obs_params: dict = None,
        max_size: int = 32,
        memory_estimate_period: int = 2000000,
        seed: int = None,
        **kwargs,
    ):
        super().__init__(
            n_models=n_models,
            max_features=max_features,
            lambda_value=lambda_value,
            metric=metric,
            disable_weighted_vote=disable_weighted_vote,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            seed=seed,
        )

        self._n_samples_seen = 0
        self._base_member_class = ForestMemberClassifier

        # Tree parameters
        self.grace_period = grace_period
        self.max_depth = max_depth
        self.split_criterion = split_criterion
        self.split_confidence = split_confidence
        self.tie_threshold = tie_threshold
        self.leaf_prediction = leaf_prediction
        self.nb_threshold = nb_threshold
        self.nominal_attributes = nominal_attributes
        self.attr_obs = attr_obs
        self.attr_obs_params = attr_obs_params
        self.max_size = max_size
        self.memory_estimate_period = memory_estimate_period
        self.kwargs = kwargs
Example #9
    def __init__(
        self,
        model: base.Estimator = None,
        n_models: int = 10,
        subspace_size: typing.Union[int, float, str] = 0.6,
        training_method: str = "patches",
        lam: float = 6.0,
        drift_detector: base.DriftDetector = None,
        warning_detector: base.DriftDetector = None,
        disable_detector: str = "off",
        disable_weighted_vote: bool = False,
        seed=None,
        metric: Metric = None,
    ):
        if model is None:
            model = HoeffdingTreeClassifier(grace_period=50,
                                            split_confidence=0.01)

        if drift_detector is None:
            drift_detector = ADWIN(delta=1e-5)

        if warning_detector is None:
            warning_detector = ADWIN(delta=1e-4)

        if disable_detector == "off":
            pass
        elif disable_detector == "drift":
            drift_detector = None
            warning_detector = None
        elif disable_detector == "warning":
            warning_detector = None
        else:
            raise AttributeError(
                f"{disable_detector} is not a valid value for disable_detector.\n"
                f"Valid options are: 'off', 'drift', 'warning'")

        if metric is None:
            metric = Accuracy()

        super().__init__(
            model=model,
            n_models=n_models,
            subspace_size=subspace_size,
            training_method=training_method,
            lam=lam,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            disable_detector=disable_detector,
            disable_weighted_vote=disable_weighted_vote,
            seed=seed,
            metric=metric,
        )

        self._base_learner_class = BaseSRPClassifier
Example #10
def adwin(data):
    """Run ADWIN over a stream of rows and collect the detected drifts."""
    detector = ADWIN()
    drifts = []
    for row in data:
        in_drift, in_warning = detector.update(row['count'])
        if in_drift:
            print(f"Change detected at index {row['date']}, input value: {row['count']}")
            drifts.append({'date': row['date'], 'count': row['count']})
    return drifts
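A minimal usage sketch for the helper above (not from the original source): the row shape with 'date' and 'count' keys and the sample values are assumptions for illustration.

from river.drift import ADWIN  # detector used by the helper above

# Hypothetical stream: counts that jump to a new level halfway through.
data = [{'date': i, 'count': 5} for i in range(100)]
data += [{'date': 100 + i, 'count': 50} for i in range(100)]

drifts = adwin(data)  # should flag a change shortly after the level shift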
Example #11
    def learn_one(self,
                  x,
                  y,
                  *,
                  sample_weight=1.0,
                  tree=None,
                  parent=None,
                  parent_branch=None):
        if tree.bootstrap_sampling:
            # Perform bootstrap-sampling
            k = self._rng.poisson(1.0)
            if k > 0:
                sample_weight = sample_weight * k

        aux = self.prediction(x, tree=tree)
        class_prediction = max(aux, key=aux.get) if aux else None

        is_correct = y == class_prediction

        if self._adwin is None:
            self._adwin = ADWIN(delta=self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self._error_change, _ = self._adwin.update(int(not is_correct))

        # Error is decreasing
        if self._error_change and old_error > self.error_estimation:
            self._error_change = False

        # Update statistics
        super().learn_one(x, y, sample_weight=sample_weight, tree=tree)

        weight_seen = self.total_weight

        if weight_seen - self.last_split_attempt_at >= tree.grace_period:
            if self.depth >= tree.max_depth:
                # Depth-based pre-pruning
                self.deactivate()
                tree._n_inactive_leaves += 1
                tree._n_active_leaves -= 1
            elif self.is_active():
                tree._attempt_to_split(
                    self,
                    parent,
                    parent_branch,
                    adwin_delta=tree.adwin_confidence,
                    seed=tree.seed,
                )
                self.last_split_attempt_at = weight_seen
Example #12
    def __init__(self,
                 model: base.Classifier,
                 n_models: int = 10,
                 w: float = 6,
                 adwin_delta: float = 0.002,
                 bagging_method: str = 'bag',
                 seed: int = None):
        super().__init__(model=model, n_models=n_models, seed=seed)
        self.n_detected_changes = 0
        self.w = w
        self.adwin_delta = adwin_delta
        self.bagging_method = bagging_method
        self._drift_detectors = [
            copy.deepcopy(ADWIN(delta=self.adwin_delta))
            for _ in range(self.n_models)
        ]

        # Set bagging function
        if bagging_method == 'bag':
            self._bagging_fct = self._leveraging_bag
        elif bagging_method == 'me':
            self._bagging_fct = self._leveraging_bag_me
        elif bagging_method == 'half':
            self._bagging_fct = self._leveraging_bag_half
        elif bagging_method == 'wt':
            self._bagging_fct = self._leveraging_bag_wt
        elif bagging_method == 'subag':
            self._bagging_fct = self._leveraging_subag
        else:
            raise ValueError(f"Invalid bagging_method: {bagging_method}\n"
                             f"Valid options: {self._BAGGING_METHODS}")
Example #13
    def learn_one(self, x, y):
        change_detected = False
        for i, model in enumerate(self):
            k = self._bagging_fct(x=x, y=y, model_idx=i)

            for _ in range(k):
                model.learn_one(x, y)

            y_pred = self.models[i].predict_one(x)
            if y_pred is not None:
                incorrectly_classifies = int(y_pred != y)
                error = self._drift_detectors[i].estimation
                self._drift_detectors[i].update(incorrectly_classifies)
                if self._drift_detectors[i].change_detected:
                    if self._drift_detectors[i].estimation > error:
                        change_detected = True

        if change_detected:
            self.n_detected_changes += 1
            max_error_idx = max(
                range(len(self._drift_detectors)),
                key=lambda j: self._drift_detectors[j].estimation)
            self.models[max_error_idx] = copy.deepcopy(self.model)
            self._drift_detectors[max_error_idx] = ADWIN(
                delta=self.adwin_delta)

        return self
Example #14
    def learn_one(self, x, y):

        change_detected = False
        for i, model in enumerate(self):
            for _ in range(self._rng.poisson(1)):
                model.learn_one(x, y)

            try:
                y_pred = model.predict_one(x)
                error_estimation = self._drift_detectors[i].estimation
                self._drift_detectors[i].update(int(y_pred == y))
                if self._drift_detectors[i].change_detected:
                    if self._drift_detectors[i].estimation > error_estimation:
                        change_detected = True
            except ValueError:
                change_detected = False

        if change_detected:
            max_error_idx = max(
                range(len(self._drift_detectors)),
                key=lambda j: self._drift_detectors[j].estimation)
            self.models[max_error_idx] = copy.deepcopy(self.model)
            self._drift_detectors[max_error_idx] = ADWIN()

        return self
Example #15
 def __adjust_ensemble_size(self):
     if len(self.classes) != len(self.ensemble):
         if len(self.classes) > len(self.ensemble):
             for i in range(len(self.ensemble), len(self.classes)):
                 self.ensemble.append(cp.deepcopy(self.base_estimator))
                 self.actual_n_estimators += 1
                 self.adwin_ensemble.append(ADWIN())
Example #16
    def __init__(
        self,
        model: base.Classifier,
        param_grid,
        population_size=10,
        sampling_size=1,
        metric=metrics.Accuracy,
        sampling_rate=1000,
        w: float = 6,
        adwin_delta: float = 0.002,
        bagging_method: str = "bag",
        seed: int = None,
    ):

        param_iter = ParameterSampler(param_grid, population_size)
        param_list = list(param_iter)
        param_list = [dict((k, v) for (k, v) in d.items()) for d in param_list]
        super().__init__(
            self._initialize_model(model=model, params=params)
            for params in param_list)
        self.param_grid = param_grid
        self.population_size = population_size
        self.sampling_size = sampling_size
        self.metric = metric
        self.sampling_rate = sampling_rate
        self.n_models = population_size
        self.model = model
        self.seed = seed
        self._rng = np.random.RandomState(seed)
        self._i = 0
        self._population_metrics = [
            copy.deepcopy(metric()) for _ in range(self.n_models)
        ]
        self._drift_detectors = [
            copy.deepcopy(ADWIN(delta=adwin_delta))
            for _ in range(self.n_models)
        ]
        self.n_detected_changes = 0
        self.w = w
        self.adwin_delta = adwin_delta
        self.bagging_method = bagging_method

        # Set bagging function
        if bagging_method == "bag":
            self._bagging_fct = self._leveraging_bag
        elif bagging_method == "me":
            self._bagging_fct = self._leveraging_bag_me
        elif bagging_method == "half":
            self._bagging_fct = self._leveraging_bag_half
        elif bagging_method == "wt":
            self._bagging_fct = self._leveraging_bag_wt
        elif bagging_method == "subag":
            self._bagging_fct = self._leveraging_subag
        else:
            raise ValueError(f"Invalid bagging_method: {bagging_method}\n"
                             f"Valid options: {self._BAGGING_METHODS}")
Example #17
 def __configure(self):
     if hasattr(self.base_estimator, "reset"):
         self.base_estimator.reset()
     self.actual_n_estimators = self.n_estimators
     self.ensemble = [cp.deepcopy(self.base_estimator) for _ in range(self.actual_n_estimators)]
     self.adwin_ensemble = [ADWIN(self.delta) for _ in range(self.actual_n_estimators)]
     self._random_state = check_random_state(self.random_state)
     self.n_detected_changes = 0
     self.classes = None
     self.init_matrix_codes = True
Example #18
 def __adjust_ensemble_size(self):
     if len(self.classes) != len(self.ensemble):
         if len(self.classes) > len(self.ensemble):
             for i in range(len(self.ensemble), len(self.classes)):
                 self.ensemble.append(cp.deepcopy(self.base_estimator))
                 self.actual_n_estimators += 1
                 self.adwin_ensemble.append(ADWIN())
             self.lam_sc = np.zeros(self.actual_n_estimators)
             self.lam_pos = np.zeros(self.actual_n_estimators)
             self.lam_neg = np.zeros(self.actual_n_estimators)
             self.lam_sw = np.zeros(self.actual_n_estimators)
             self.epsilon = np.zeros(self.actual_n_estimators)
Example #19
def demo():
    """ _test_adwin

    In this demo, an ADWIN object evaluates a sequence of numbers corresponding to 2 distributions.
    The ADWIN object indicates the indices where change is detected.

    The first half of the data is a sequence of randomly generated 0's and 1's.
    The second half of the data is a normal distribution of integers from 0 to 7.

    """
    adwin = ADWIN()
    size = 2000
    change_start = 999
    np.random.seed(1)
    data_stream = np.random.randint(2, size=size)
    data_stream[change_start:] = np.random.randint(8, size=size - change_start)

    for i in range(size):
        change_detected, _ = adwin.update(data_stream[i])
        if change_detected:
            print(f'Change has been detected in data: {data_stream[i]} - at index: {i}')
Example #20
    def __configure(self):
        if hasattr(self.base_estimator, "reset"):
            self.base_estimator.reset()

        self.actual_n_estimators = self.n_estimators
        self.adwin_ensemble = []
        for i in range(self.actual_n_estimators):
            self.adwin_ensemble.append(ADWIN())
        self.ensemble = [
            cp.deepcopy(self.base_estimator)
            for _ in range(self.actual_n_estimators)
        ]
        self._random_state = check_random_state(self.random_state)
Example #21
    def __configure(self):
        if hasattr(self.base_estimator, "reset"):
            self.base_estimator.reset()

        self.actual_n_estimators = self.n_estimators
        self.adwin_ensemble = []
        for i in range(self.actual_n_estimators):
            self.adwin_ensemble.append(ADWIN())
        self.ensemble = [
            cp.deepcopy(self.base_estimator)
            for _ in range(self.actual_n_estimators)
        ]
        self._random_state = check_random_state(self.random_state)
        self.lam_sc = np.zeros(self.actual_n_estimators)
        self.lam_pos = np.zeros(self.actual_n_estimators)
        self.lam_neg = np.zeros(self.actual_n_estimators)
        self.lam_sw = np.zeros(self.actual_n_estimators)
        self.epsilon = np.zeros(self.actual_n_estimators)
        self.n_pos = 0
        self.n_neg = 0
Example #22
    def learn_one(self, x: dict, y: base.typing.ClfTarget, **kwargs):
        # Check if the population needs to be updated
        if self._i % self.sampling_rate == 0:
            scores = [be.get() for be in self._population_metrics]
            idx_best = scores.index(max(scores))
            idx_worst = scores.index(min(scores))
            child = self._mutate_estimator(estimator=self[idx_best])
            self.models[idx_worst] = child
            #self.population_metrics[idx_worst] = copy.deepcopy(self.metric())

        change_detected = False
        for i, model in enumerate(self):
            self._population_metrics[i].update(y_true=y,
                                               y_pred=model.predict_one(x))
            k = self._bagging_fct(x=x, y=y, model_idx=i)

            for _ in range(k):
                model.learn_one(x, y)

            y_pred = self.models[i].predict_one(x)
            if y_pred is not None:
                incorrectly_classifies = int(y_pred != y)
                error = self._drift_detectors[i].estimation
                self._drift_detectors[i].update(incorrectly_classifies)
                if self._drift_detectors[i].change_detected:
                    if self._drift_detectors[i].estimation > error:
                        change_detected = True

        if change_detected:
            self.n_detected_changes += 1
            max_error_idx = max(
                range(len(self._drift_detectors)),
                key=lambda j: self._drift_detectors[j].estimation,
            )
            self.models[max_error_idx] = copy.deepcopy(self.model)
            self._drift_detectors[max_error_idx] = ADWIN(
                delta=self.adwin_delta)

        return self
Example #23
class AdaLearningNodeClassifier(LearningNodeNBA, AdaNode):
    """Learning node for Hoeffding Adaptive Tree.

    Parameters
    ----------
    stats
        Initial class observations.
    depth
        The depth of the learning node in the tree.
    attr_obs
        The numeric attribute observer algorithm used to monitor target statistics
        and perform split attempts.
    attr_obs_params
        The parameters passed to the numeric attribute observer algorithm.
    adwin_delta
        The delta parameter of ADWIN.
    seed
        Seed to control the generation of random numbers and support reproducibility.
    """
    def __init__(self, stats, depth, attr_obs, attr_obs_params, adwin_delta, seed):
        super().__init__(stats, depth, attr_obs, attr_obs_params)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self.error_change = False
        self._rng = check_random_state(seed)

    @property
    def n_leaves(self):
        return 1

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        return self._adwin.width

    def error_is_null(self):
        return self._adwin is None

    def kill_tree_children(self, hat):
        pass

    def learn_one(self, x, y, *, sample_weight=1., tree=None, parent=None, parent_branch=-1):
        if tree.bootstrap_sampling:
            # Perform bootstrap-sampling
            k = self._rng.poisson(1.0)
            if k > 0:
                sample_weight = sample_weight * k

        aux = self.leaf_prediction(x, tree=tree)
        class_prediction = max(aux, key=aux.get) if aux else None

        is_correct = (y == class_prediction)

        if self._adwin is None:
            self._adwin = ADWIN(delta=self.adwin_delta)

        old_error = self.error_estimation

        # Update ADWIN
        self.error_change, _ = self._adwin.update(int(not is_correct))

        # Error is decreasing
        if self.error_change and old_error > self.error_estimation:
            self.error_change = False

        # Update statistics
        super().learn_one(x, y, sample_weight=sample_weight, tree=tree)

        weight_seen = self.total_weight

        if weight_seen - self.last_split_attempt_at >= tree.grace_period:
            if self.depth >= tree.max_depth:
                # Depth-based pre-pruning
                self.deactivate()
                tree._n_inactive_leaves += 1
                tree._n_active_leaves -= 1
            else:
                tree._attempt_to_split(self, parent, parent_branch)
                self.last_split_attempt_at = weight_seen

    # Override LearningNodeNBA
    def leaf_prediction(self, x, *, tree=None):
        if not self.stats:
            return

        prediction_option = tree.leaf_prediction
        if not self.is_active() or prediction_option == tree._MAJORITY_CLASS:
            dist = normalize_values_in_dict(self.stats, inplace=False)
        elif prediction_option == tree._NAIVE_BAYES:
            if self.total_weight >= tree.nb_threshold:
                dist = do_naive_bayes_prediction(x, self.stats, self.attribute_observers)
            else:  # Use majority class
                dist = normalize_values_in_dict(self.stats, inplace=False)
        else:  # Naive Bayes Adaptive
            dist = super().leaf_prediction(x, tree=tree)

        dist_sum = sum(dist.values())
        normalization_factor = dist_sum * self.error_estimation * self.error_estimation

        # Weight node's responses accordingly to the estimated error monitored by ADWIN
        # Useful if both the predictions of the alternate tree and the ones from the main tree
        # are combined -> give preference to the most accurate one
        dist = normalize_values_in_dict(dist, normalization_factor, inplace=False)

        return dist

    # Override AdaNode: enable option vote (query potentially more than one leaf for responses)
    def filter_instance_to_leaves(self, x, parent, parent_branch, found_nodes):
        found_nodes.append(FoundNode(self, parent, parent_branch))
Example #24
    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially fits the model, based on the X and y matrix.

        Since it's an ensemble learner, if X and y matrix of more than one
        sample are passed, the algorithm will partial fit the model one sample
        at a time.

        Each sample is used to train each classifier a total of K times, where
        K is drawn from a :math:`Poisson(\lambda)` distribution. :math:`\lambda` is
        updated after every example using :math:`\lambda_{sc}` if the estimator
        correctly classifies the example, or :math:`\lambda_{sw}` otherwise.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            The features to train the model.

        y: numpy.ndarray of shape (n_samples)
            An array-like with the class labels of all samples in X.

        classes: numpy.ndarray, optional (default=None)
            Array with all possible/known class labels. This is an optional parameter, except
            for the first partial_fit call where it is compulsory.

        sample_weight: Array-like
            Instance weight. If not provided, uniform weights are assumed. Usage varies depending on the base estimator.

        Raises
        ------
        ValueError: A ValueError is raised if the 'classes' parameter is not
        passed in the first partial_fit call, or if the classes passed in
        later calls differ from those passed initially.

        Returns
        -------
        self

        """
        if self.ensemble is None:
            self.__configure()

        if self.classes is None:
            if classes is None:
                raise ValueError(
                    "The first partial_fit call should pass all the classes.")
            else:
                self.classes = classes

        if self.classes is not None and classes is not None:
            if set(self.classes) == set(classes):
                pass
            else:
                raise ValueError(
                    "The classes passed to the partial_fit function differ from those passed earlier."
                )

        self.__adjust_ensemble_size()

        r, _ = get_dimensions(X)
        for j in range(r):
            change_detected = False
            lam = 1
            for i in range(self.actual_n_estimators):
                a = (i + 1) / self.actual_n_estimators
                if y[j] == 1:
                    self.pos_samples.append(X[j])
                    lam = a * self.sampling_rate
                    lam_smote = (1 - a) * self.sampling_rate
                    k = self._random_state.poisson(lam)
                    if k > 0:
                        for b in range(k):
                            self.ensemble[i].partial_fit([X[j]], [y[j]],
                                                         classes,
                                                         sample_weight)
                    k_smote = self._random_state.poisson(lam_smote)

                    if k_smote > 0:
                        for b in range(k_smote):
                            x_smote = self.online_smote()
                            self.ensemble[i].partial_fit([x_smote], [y[j]],
                                                         classes,
                                                         sample_weight)
                else:
                    k = self._random_state.poisson(lam)
                    if k > 0:
                        for b in range(k):
                            self.ensemble[i].partial_fit([X[j]], [y[j]],
                                                         classes,
                                                         sample_weight)

                if self.drift_detection:
                    try:
                        pred = self.ensemble[i].predict(X)
                        error_estimation = self.adwin_ensemble[i].estimation
                        for k in range(r):
                            if pred[k] is not None:
                                self.adwin_ensemble[i].update(
                                    int(pred[k] == y[k]))
                        if self.adwin_ensemble[i].change_detected:
                            if self.adwin_ensemble[
                                    i].estimation > error_estimation:
                                change_detected = True
                    except ValueError:
                        change_detected = False

            if change_detected and self.drift_detection:
                max_threshold = 0.0
                i_max = -1
                for i in range(self.actual_n_estimators):
                    if max_threshold < self.adwin_ensemble[i].estimation:
                        max_threshold = self.adwin_ensemble[i].estimation
                        i_max = i
                if i_max != -1:
                    self.ensemble[i_max].reset()
                    self.adwin_ensemble[i_max] = ADWIN()

        return self
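The core resampling step in `partial_fit` above draws k ~ Poisson(lambda) and trains the estimator on the sample k times. Below is a standalone sketch of that step, assuming a scikit-multiflow-style estimator with `partial_fit`; the function name and `rng` are illustrative, not part of the original source.

import numpy as np

rng = np.random.RandomState(7)

def poisson_resample_fit(estimator, x_row, y_row, lam=1.0):
    # Train the estimator on one sample k ~ Poisson(lam) times,
    # mirroring the online bagging step used above.
    k = rng.poisson(lam)
    for _ in range(k):
        estimator.partial_fit([x_row], [y_row])
    return k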
Example #25
    def __partial_fit(self, X, y):
        if self.init_matrix_codes and self.enable_code_matrix:
            self.__init_output_codes()

        change_detected = False
        for i in range(self.actual_n_estimators):

            # leveraging_bag - Leveraging Bagging
            if self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[0]:
                k = self._random_state.poisson(self.w)

            # leveraging_bag_me - Misclassification Error
            elif self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[1]:
                error = self.adwin_ensemble[i].estimation
                pred = self.ensemble[i].predict(np.asarray([X]))
                if pred is None:
                    k = 1.0
                elif pred[0] != y:
                    k = 1.0
                elif (error != 1.0 and
                      self._random_state.rand() < (error / (1.0 - error))):
                    k = 1.0
                else:
                    k = 0.0

            # leveraging_bag_half - Resampling without replacement for
            #                       half of the instances
            elif self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[2]:
                w = 1.0
                k = 0.0 if (self._random_state.randint(2) == 1) else w

            # leveraging_bag_wt - Without taking out all instances
            elif self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[3]:
                w = 1.0
                k = 1.0 + self._random_state.poisson(w)

            # leveraging_subag - Resampling without replacement
            elif self.leverage_algorithm == self._LEVERAGE_ALGORITHMS[4]:
                w = 1.0
                k = self._random_state.poisson(1)
                k = w if k > 0 else 0

            else:
                raise RuntimeError("Invalid option for leverage_algorithm: '{}'\n"
                                   "Valid options are: {}".format(self.leverage_algorithm,
                                                                  self._LEVERAGE_ALGORITHMS))

            y_coded = cp.deepcopy(y)
            if k > 0:
                classes = self.classes
                if self.enable_code_matrix:
                    y_coded = self.matrix_codes[i][int(y)]
                    classes = [0, 1]
                for _ in range(int(k)):
                    self.ensemble[i].partial_fit(X=np.asarray([X]), y=np.asarray([y_coded]),
                                                 classes=classes)

            pred = self.ensemble[i].predict(np.asarray([X]))
            if pred is not None:
                add = 0 if (pred[0] == y_coded) else 1
                error = self.adwin_ensemble[i].estimation
                self.adwin_ensemble[i].update(add)
                if self.adwin_ensemble[i].change_detected:
                    if self.adwin_ensemble[i].estimation > error:
                        change_detected = True

        if change_detected:
            self.n_detected_changes += 1
            max_threshold = 0.0
            i_max = -1
            for i in range(self.actual_n_estimators):
                if max_threshold < self.adwin_ensemble[i].estimation:
                    max_threshold = self.adwin_ensemble[i].estimation
                    i_max = i
            if i_max != -1:
                self.ensemble[i_max].reset()
                self.adwin_ensemble[i_max] = ADWIN(self.delta)
        return self
Example #26
def test_adwin():
    expected_indices = [1055, 1087, 1151]
    detected_indices = perform_test(ADWIN(), data_stream_1)

    assert detected_indices == expected_indices
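`perform_test` and `data_stream_1` are not shown in this excerpt. A plausible sketch of the helper, assuming the tuple-returning `update` API used by the other examples here; this is a guess at the fixture, not the project's actual code.

def perform_test(drift_detector, data_stream):
    # Feed the stream into the detector and collect the indices
    # at which a change is flagged.
    detected_indices = []
    for i, val in enumerate(data_stream):
        in_drift, _ = drift_detector.update(val)
        if in_drift:
            detected_indices.append(i)
    return detected_indices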
Example #27
    def __init__(
        self,
        # Forest parameters
        n_models: int = 10,
        max_features="sqrt",
        aggregation_method: str = "median",
        lambda_value: int = 6,
        metric: metrics.RegressionMetric = metrics.MSE(),
        disable_weighted_vote=True,
        drift_detector: base.DriftDetector = ADWIN(0.001),
        warning_detector: base.DriftDetector = ADWIN(0.01),
        # Tree parameters
        grace_period: int = 50,
        max_depth: int = None,
        split_confidence: float = 0.01,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "model",
        leaf_model: base.Regressor = None,
        model_selector_decay: float = 0.95,
        nominal_attributes: list = None,
        splitter: Splitter = None,
        min_samples_split: int = 5,
        max_size: int = 100,
        memory_estimate_period: int = 2000000,
        seed: int = None,
        **kwargs,
    ):
        super().__init__(
            n_models=n_models,
            max_features=max_features,
            lambda_value=lambda_value,
            metric=metric,
            disable_weighted_vote=disable_weighted_vote,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            seed=seed,
        )

        self._n_samples_seen = 0
        self._base_member_class = ForestMemberRegressor

        # Tree parameters
        self.grace_period = grace_period
        self.max_depth = max_depth
        self.split_confidence = split_confidence
        self.tie_threshold = tie_threshold
        self.leaf_prediction = leaf_prediction
        self.leaf_model = leaf_model
        self.model_selector_decay = model_selector_decay
        self.nominal_attributes = nominal_attributes
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.max_size = max_size
        self.memory_estimate_period = memory_estimate_period
        self.kwargs = kwargs

        if aggregation_method in self._VALID_AGGREGATION_METHOD:
            self.aggregation_method = aggregation_method
        else:
            raise ValueError(
                f"Invalid aggregation_method: {aggregation_method}.\n"
                f"Valid values are: {self._VALID_AGGREGATION_METHOD}"
            )
Example #28
 def __init__(self, model: base.Classifier, n_models=10, seed: int = None):
     super().__init__(model=model, n_models=n_models, seed=seed)
     self._drift_detectors = [
         copy.deepcopy(ADWIN()) for _ in range(self.n_models)
     ]
Example #29
# ADWIN

import numpy as np
from river.drift import ADWIN
np.random.seed(12345)

adwin = ADWIN()

# Simulate a data stream composed by two data distributions
data_stream = np.concatenate(
    (np.random.randint(2, size=1000), np.random.randint(4, high=8, size=1000)))

# Update drift detector and verify if change is detected
for i, val in enumerate(data_stream):
    in_drift, in_warning = adwin.update(val)
    if in_drift:
        print(f"Change detected at index {i}, input value: {val}")
Example #30
class AdaptiveRandomForestRegressor(BaseForest, base.Regressor):
    r"""Adaptive Random Forest regressor.

    The 3 most important aspects of Adaptive Random Forest [^1] are:

    1. inducing diversity through re-sampling

    2. inducing diversity through randomly selecting subsets of features for
       node splits

    3. drift detectors per base tree, which cause selective resets in response
       to drifts

    Notice that this implementation is slightly different from the original
    algorithm proposed in [^2]. The `HoeffdingTreeRegressor` is used as the base
    learner instead of `FIMT-DD`. It also adds a new strategy to monitor the
    predictions and check for concept drifts. The deviations of the predictions
    from the target are monitored and normalized to the [0, 1] range to fulfill
    ADWIN's requirements. We assume that the data subjected to the normalization
    follows a normal distribution and thus lies within the interval mean $\pm 3\sigma$.

    Parameters
    ----------
    n_models
        Number of trees in the ensemble.
    max_features
        Max number of attributes for each node split.<br/>
        - If `int`, then consider `max_features` at each split.<br/>
        - If `float`, then `max_features` is a percentage and
          `int(max_features * n_features)` features are considered per split.<br/>
        - If "sqrt", then `max_features=sqrt(n_features)`.<br/>
        - If "log2", then `max_features=log2(n_features)`.<br/>
        - If None, then `max_features=n_features`.
    lambda_value
        The lambda value for bagging (lambda=6 corresponds to Leveraging Bagging).
    metric
        Metric used to track each tree's performance within the ensemble.
        Depending on the configuration, this metric is also used to weight
        predictions from the members of the ensemble.
    aggregation_method
        The method used to aggregate predictions in the ensemble.<br/>
        - 'mean'<br/>
        - 'median' - If selected, the weighted vote is disabled.
    disable_weighted_vote
        If `True`, disables the weighted vote prediction, i.e. does not assign
        weights to individual trees' predictions and uses the arithmetic mean
        instead. Otherwise, the `metric` value is used to weight predictions.
    drift_detector
        Drift Detection method. Set to None to disable Drift detection.
    warning_detector
        Warning Detection method. Set to None to disable warning detection.
    grace_period
        [*Tree parameter*] Number of instances a leaf should observe between
        split attempts.
    max_depth
        [*Tree parameter*] The maximum depth a tree can reach. If `None`, the
        tree will grow indefinitely.
    split_confidence
        [*Tree parameter*] Allowed error in split decision, a value closer to 0
        takes longer to decide.
    tie_threshold
        [*Tree parameter*] Threshold below which a split will be forced to break
        ties.
    leaf_prediction
        [*Tree parameter*] Prediction mechanism used at leaves.<br/>
        - 'mean' - Target mean<br/>
        - 'model' - Uses the model defined in `leaf_model`<br/>
        - 'adaptive' - Chooses between 'mean' and 'model' dynamically
    leaf_model
        [*Tree parameter*] The regression model used to provide responses if
        `leaf_prediction='model'`. If not provided, an instance of
        `river.linear_model.LinearRegression` with the default hyperparameters
        is used.
    model_selector_decay
        The exponential decay factor applied to the learning models' squared
        errors, which are monitored when `leaf_prediction='adaptive'`. Must be
        between `0` and `1`. The closer to `1`, the more importance past
        observations receive; conversely, values approaching `0` give the most
        recently observed errors more influence on the final decision.
    nominal_attributes
        [*Tree parameter*] List of Nominal attributes. If empty, then assume that
        all attributes are numerical.
    attr_obs
        [*Tree parameter*] The attribute observer (AO) used to monitor the target
        statistics of numeric features and perform splits. Parameters can be passed to the
        AOs (when supported) by using `attr_obs_params`. Valid options are:</br>
        - `'e-bst'`: Extended Binary Search Tree (E-BST). This AO has no parameters.</br>
        See notes for more information about the supported AOs.
    attr_obs_params
        [*Tree parameter*] Parameters passed to the numeric AOs. See `attr_obs`
        for more information.
    min_samples_split
        [*Tree parameter*] The minimum number of samples every branch resulting from a split
        candidate must have to be considered valid.
    max_size
        [*Tree parameter*] Maximum memory (MB) consumed by the tree.
    memory_estimate_period
        [*Tree parameter*] Number of instances between memory consumption checks.
    seed
        If `int`, `seed` is used to seed the random number generator;
        If `RandomState`, `seed` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance
        used by `np.random`.
    kwargs
        Other parameters passed to `river.tree.BaseHoeffdingTree`.

    Notes
    -----
    Hoeffding trees rely on Attribute Observer (AO) algorithms to monitor input features
    and perform splits. Nominal features can be easily dealt with, since the partitions
    are well-defined. Numerical features, however, require more sophisticated solutions.
    Currently, only one AO is supported in `river` for regression trees:

    - The Extended Binary Search Tree (E-BST) uses an exhaustive algorithm to find split
    candidates, similar to batch decision tree algorithms. It ends up storing all
    observations between split attempts. However, E-BST periodically removes bad split
    points from its structure, thus alleviating the memory and time costs involved
    in its usage.

    References
    ----------
    [^1]: Gomes, H.M., Bifet, A., Read, J., Barddal, J.P., Enembreck, F.,
          Pfharinger, B., Holmes, G. and Abdessalem, T., 2017. Adaptive random
          forests for evolving data stream classification. Machine Learning,
          106(9-10), pp.1469-1495.

    [^2]: Gomes, H.M., Barddal, J.P., Boiko, L.E., Bifet, A., 2018.
          Adaptive random forests for data stream regression. ESANN 2018.

    Examples
    --------
    >>> from river import datasets
    >>> from river import evaluate
    >>> from river import metrics
    >>> from river import ensemble
    >>> from river import preprocessing

    >>> dataset = datasets.TrumpApproval()

    >>> model = (
    ...     preprocessing.StandardScaler() |
    ...     ensemble.AdaptiveRandomForestRegressor(n_models=3, seed=42)
    ... )

    >>> metric = metrics.MAE()

    >>> evaluate.progressive_val_score(dataset, model, metric)
    MAE: 1.870913

    """

    _MEAN = "mean"
    _MEDIAN = "median"
    _VALID_AGGREGATION_METHOD = [_MEAN, _MEDIAN]

    def __init__(
        self,
        # Forest parameters
        n_models: int = 10,
        max_features="sqrt",
        aggregation_method: str = "median",
        lambda_value: int = 6,
        metric: metrics.RegressionMetric = metrics.MSE(),
        disable_weighted_vote=True,
        drift_detector: base.DriftDetector = ADWIN(0.001),
        warning_detector: base.DriftDetector = ADWIN(0.01),
        # Tree parameters
        grace_period: int = 50,
        max_depth: int = None,
        split_confidence: float = 0.01,
        tie_threshold: float = 0.05,
        leaf_prediction: str = "model",
        leaf_model: base.Regressor = None,
        model_selector_decay: float = 0.95,
        nominal_attributes: list = None,
        attr_obs: str = "e-bst",
        attr_obs_params: dict = None,
        min_samples_split: int = 5,
        max_size: int = 100,
        memory_estimate_period: int = 2000000,
        seed: int = None,
        **kwargs,
    ):
        super().__init__(
            n_models=n_models,
            max_features=max_features,
            lambda_value=lambda_value,
            metric=metric,
            disable_weighted_vote=disable_weighted_vote,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            seed=seed,
        )

        self._n_samples_seen = 0
        self._base_member_class = ForestMemberRegressor

        # Tree parameters
        self.grace_period = grace_period
        self.max_depth = max_depth
        self.split_confidence = split_confidence
        self.tie_threshold = tie_threshold
        self.leaf_prediction = leaf_prediction
        self.leaf_model = leaf_model
        self.model_selector_decay = model_selector_decay
        self.nominal_attributes = nominal_attributes
        self.attr_obs = attr_obs
        self.attr_obs_params = attr_obs_params
        self.min_samples_split = min_samples_split
        self.max_size = max_size
        self.memory_estimate_period = memory_estimate_period
        self.kwargs = kwargs

        if aggregation_method in self._VALID_AGGREGATION_METHOD:
            self.aggregation_method = aggregation_method
        else:
            raise ValueError(
                f"Invalid aggregation_method: {aggregation_method}.\n"
                f"Valid values are: {self._VALID_AGGREGATION_METHOD}"
            )
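The class docstring above describes normalizing prediction deviations to [0, 1] (assuming they stay within the running mean ± 3σ) before feeding them to ADWIN. Below is a sketch of that strategy using `river.stats` running estimators; it illustrates the described idea and is not the library's exact code.

from river import stats
from river.drift import ADWIN

adwin = ADWIN()
dev_mean, dev_var = stats.Mean(), stats.Var()

def monitor_drift(y_true: float, y_pred: float) -> bool:
    # Normalize the absolute deviation to [0, 1] via the running
    # mean +/- 3*sigma bounds, then update ADWIN with the result.
    deviation = abs(y_true - y_pred)
    dev_mean.update(deviation)
    dev_var.update(deviation)
    sigma = dev_var.get() ** 0.5
    lo, hi = dev_mean.get() - 3 * sigma, dev_mean.get() + 3 * sigma
    clipped = min(max(deviation, lo), hi)
    normalized = (clipped - lo) / (hi - lo) if hi > lo else 0.0
    in_drift, _ = adwin.update(normalized)
    return in_drift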