# Example #1
# 0
class ADWINChangeDetector(BaseDriftDetector):
    """Drift detection method based on ADWIN.

    Wraps an ``ADWIN`` instance and exposes its outcome through the
    ``in_concept_change`` / ``in_warning_zone`` flags and the ``estimation``
    attribute set on this object. A change reported by the wrapped detector
    is only flagged as a concept change when its estimation increased with
    respect to the value held before the new element was added.

    Parameters
    ----------
    delta : float (default=0.002)
        The delta parameter for the ADWIN algorithm.

    Notes
    -----
    ADWIN [1]_ (ADaptive WINdowing) is an adaptive sliding window algorithm
    for detecting change and keeping updated statistics about a data stream.
    ADWIN allows algorithms that were not designed for drifting data to be
    resistant to this phenomenon.

    The general idea is to keep statistics from a window of variable size
    while detecting concept drift.

    The algorithm decides the size of the window by cutting the statistics'
    window at different points and analysing the average of some statistic
    over the two resulting sub-windows. If the absolute value of the
    difference between the two averages surpasses a pre-defined threshold,
    change is detected at that point and all data before that time is
    discarded.

    References
    ----------
    .. [1] Bifet, Albert, and Ricard Gavalda. "Learning from time-changing data with adaptive windowing."
       In Proceedings of the 2007 SIAM international conference on data mining, pp. 443-448.
       Society for Industrial and Applied Mathematics, 2007.

    Examples
    --------
    >>> # Imports
    >>> import numpy as np
    >>> from skmultiflow.drift_detection import ADWINChangeDetector
    >>> adwin_change_detector = ADWINChangeDetector()
    >>> # Simulating a data stream of random 0's and 1's
    >>> data_stream = np.random.randint(2, size=2000)
    >>> # Changing the data concept from index 999 to 1999
    >>> for i in range(999, 2000):
    ...     data_stream[i] = np.random.randint(4, high=8)
    >>> # Adding stream elements to ADWIN and verifying if drift occurred
    >>> for i in range(2000):
    ...     adwin_change_detector.add_element(data_stream[i])
    ...     if adwin_change_detector.detected_change():
    ...         print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i))

    """

    def __init__(self, delta=.002):
        super().__init__()
        self.adwin = ADWIN(delta=delta)
        # Explicitly use the base-class reset so the freshly created
        # ``self.adwin`` is left untouched.
        super().reset()

    def add_element(self, input_value):
        """Feed one value to the wrapped ADWIN and refresh the drift flags.

        Parameters
        ----------
        input_value
            The new stream element, forwarded as-is to ``ADWIN.add_element``.
        """
        # Snapshot taken before the update: a change only counts as drift
        # when the estimation went up compared to this value.
        previous_estimation = self.adwin.estimation
        self.adwin.add_element(input_value)
        change_reported = self.adwin.detected_change()

        self.in_concept_change = False
        self.in_warning_zone = True if self.adwin.detected_warning_zone() else False

        if change_reported and self.adwin.estimation > previous_estimation:
            # A confirmed drift supersedes any warning.
            self.in_concept_change = True
            self.in_warning_zone = False

        self.estimation = self.adwin.estimation
# Example #2
# 0
class DeepNNPytorch(BaseSKMObject, ClassifierMixin):
    """Ensemble of ``ANN`` learners with background learners and ADWIN monitoring.

    The classifier keeps two lists of networks, both built from module-level
    configuration tables (``foreground_net_config`` and
    ``background_net_config``):

    * *foreground* nets are trained on every ``partial_fit`` call and are the
      only ones used for prediction;
    * *background* nets are trained in threads started every
      ``background_training_after`` samples. When those threads are joined,
      the best background net replaces the worst foreground net if its
      average loss is lower.

    Two ``ADWIN`` detectors are fed with the 0/1 correctness of the ensemble
    prediction; a drift is reported once more than three warnings were
    counted and the drift detector signals a change.

    Parameters
    ----------
    class_labels : list (default=['0', '1'])
        Class labels; the position of a label in the list is the class index
        it is mapped from. The default list is only read, never mutated.
    use_cpu : bool (default=True)
        Stored as configuration; not read anywhere in this class.
    process_as_a_batch : bool (default=False)
        Stored as configuration; not read anywhere in this class.
    use_threads : bool (default=False)
        If True, foreground nets are trained in one thread per net.
    background_training_after : int (default=4)
        Period (in samples seen) of the background training cycle.
    """

    # Key under which per-net average losses are stored in the result dicts
    # handed to ``net_train``.
    _AVG_LOSS_KEY = 'avg_loss_since_last_detected_drift_by_parent'

    def __init__(
            self,
            class_labels=['0', '1'],  # {'up':0,'down':1}
            use_cpu=True,
            process_as_a_batch=False,
            use_threads=False,
            background_training_after=4):
        # configuration variables (which have the same name as init
        # parameters); every init parameter is stored so that parameter
        # introspection can read it back.
        self.class_labels = class_labels
        self.use_cpu = use_cpu
        self.process_as_a_batch = process_as_a_batch
        self.use_threads = use_threads
        self.background_training_after = background_training_after

        super().__init__()

        # status variables
        self.class_to_label = {}
        self.foreground_nets = []  # type: List[ANN]
        self.background_nets = []  # type: List[ANN]
        self.drift_detection_method = None
        self.warning_detection_method = None
        self.detected_warnings = 0
        self.samples_seen = 0
        self.last_detected_drift_around = 0
        self.background_learner_threads = []
        self.background_train_results = None
        self.foreground_train_results = None

        self.init_status_values()

    def init_status_values(self):
        """(Re)build the nets, the detectors and every status counter.

        The net lists are rebuilt from scratch so that calling this method
        more than once does not accumulate duplicate networks. The resulting
        object is printed at the end.
        """
        # class index -> class label
        self.class_to_label = dict(enumerate(self.class_labels))

        # The configuration tables are only assumed to support ``len()`` and
        # integer indexing, hence the index-based access.
        self.foreground_nets = [
            ANN(learning_rate=foreground_net_config[i]['l_rate'],
                optimizer_type=foreground_net_config[i]['optimizer_type'],
                class_labels=self.class_labels)
            for i in range(len(foreground_net_config))
        ]
        # Background nets take BOTH hyper-parameters from the background
        # table.
        self.background_nets = [
            ANN(learning_rate=background_net_config[i]['l_rate'],
                optimizer_type=background_net_config[i]['optimizer_type'],
                class_labels=self.class_labels)
            for i in range(len(background_net_config))
        ]

        self.drift_detection_method = ADWIN(delta=1e-3,
                                            direction=ADWIN.DETECT_DOWN)
        self.warning_detection_method = ADWIN(delta=1e-8,
                                              direction=ADWIN.DETECT_DOWN)

        self.detected_warnings = 0
        self.samples_seen = 0
        self.last_detected_drift_around = 0
        self.background_learner_threads = []
        self.background_train_results = None
        self.foreground_train_results = None
        print(self)

    @staticmethod
    def _new_train_results(n_nets):
        """Return an empty result container with one slot per net."""
        return {
            'probas': [None] * n_nets,
            'y_hats': [None] * n_nets,
            'avg_loss_since_last_detected_drift_by_parent': [0] * n_nets,
        }

    def _make_training_threads(self, nets, X, r, c, y, results):
        """Create (without starting) one ``net_train`` thread per net."""
        return [
            threading.Thread(target=net_train,
                             args=(
                                 net,
                                 X,
                                 r,
                                 c,
                                 y,
                                 results,
                                 i,
                                 self.last_detected_drift_around,
                             ))
            for i, net in enumerate(nets)
        ]

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Train the ensemble on ``X``/``y`` and update the drift detectors.

        Parameters
        ----------
        X
            Samples; their dimensions are obtained with ``get_dimensions``.
        y
            Target label(s) for ``X``.
        classes
            Unused.
        sample_weight
            Unused.

        Returns
        -------
        self
        """
        r, c = get_dimensions(X)
        self.samples_seen += r

        if not self.background_learner_threads:
            # No background training in flight: start a new round every
            # ``background_training_after`` samples.
            if self.samples_seen % self.background_training_after == 0:
                self.background_train_results = self._new_train_results(
                    len(self.background_nets))
                self.background_learner_threads = self._make_training_threads(
                    self.background_nets, X, r, c, y,
                    self.background_train_results)
                for thread in self.background_learner_threads:
                    thread.start()
        # there are live background learner threads:
        # wait for self.background_training_after instances to join them
        elif self.samples_seen % self.background_training_after \
                == self.background_training_after - 1:
            # TODO: in CPython the global interpreter lock lets only one
            #  thread execute Python bytecode at a time
            #  (https://docs.python.org/3/library/threading.html), so these
            #  threads may not train truly in parallel.
            #  NOTE(review): presumably still beneficial if the training
            #  code releases the lock in native code - confirm.
            #  Multiprocessing is an alternative:
            #  https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing
            for thread in self.background_learner_threads:
                thread.join()
            self.background_learner_threads = []

            if self.foreground_train_results is not None:
                back_losses = self.background_train_results[
                    self._AVG_LOSS_KEY]
                fore_losses = self.foreground_train_results[
                    self._AVG_LOSS_KEY]
                min_back = np.argmin(back_losses, axis=0)
                max_fore = np.argmax(fore_losses, axis=0)
                # Promote the best background net when it beats the worst
                # foreground net; the demoted net becomes a background net.
                if back_losses[min_back] < fore_losses[max_fore]:
                    self.foreground_nets[max_fore], self.background_nets[min_back] = \
                        self.background_nets[min_back], self.foreground_nets[max_fore]

        self.foreground_train_results = self._new_train_results(
            len(self.foreground_nets))
        if self.use_threads:
            threads = self._make_training_threads(
                self.foreground_nets, X, r, c, y,
                self.foreground_train_results)
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
        else:
            for i, net in enumerate(self.foreground_nets):
                net_train(net, X, r, c, y, self.foreground_train_results, i,
                          self.last_detected_drift_around)

        if self.drift_detection_method is not None:
            # get predicted class (average of the foreground nets'
            # probabilities) and compare with actual class label
            mean_probas = np.sum(self.foreground_train_results['probas'],
                                 axis=0) / len(self.foreground_nets)
            predicted_label = vectorized_map_class_to_label(
                np.argmax(mean_probas, axis=1),
                class_to_label_map=self.class_to_label)
            # TODO: we may have to have a special case for batch processing;
            #  the truth test below only works for a single comparison
            #  result.
            predicted_matches_actual = predicted_label == y
            hit = 1 if predicted_matches_actual else 0

            self.drift_detection_method.add_element(hit)

            # Check if there was a warning
            if self.warning_detection_method is not None:
                self.warning_detection_method.add_element(hit)
                if self.warning_detection_method.detected_change():
                    self.detected_warnings += 1
            else:  # warning detector is None, hence drift detector has warning detection capability.
                if self.drift_detection_method.detected_warning_zone():
                    self.detected_warnings += 1

            # Check if there was a change; 3 is the warning threshold level.
            # The drift detector is only queried once the threshold is
            # exceeded (short-circuit).
            if self.detected_warnings > 3 \
                    and self.drift_detection_method.detected_change():
                print('Drift detected by {} around {} th sample.'.format(
                    self.drift_detection_method, self.samples_seen))
                self.detected_warnings = 0
                self.last_detected_drift_around = self.samples_seen

        return self

    def predict(self, X):
        """Predict the label for ``X`` from the averaged foreground output.

        The per-net probabilities returned by ``predict_proba`` are summed
        over the nets, divided by the number of foreground nets, and the
        index of the largest value is mapped to its class label.
        """
        y_proba = self.predict_proba(X)
        pred_sum_per_class = np.sum(y_proba, axis=0)
        pred_avgsum_per_class = np.divide(pred_sum_per_class,
                                          len(self.foreground_nets))
        y_pred = np.argmax(pred_avgsum_per_class, axis=0)
        return vectorized_map_class_to_label(
            np.asarray([y_pred]), class_to_label_map=self.class_to_label)

    def predict_proba(self, X):
        """Return an array with one row per foreground net and one column per class.

        The array is allocated as zeros and handed to ``net_predict_proba``
        together with each net and its row index; nets are queried
        sequentially.
        """
        r, c = get_dimensions(X)
        probas = np.zeros([len(self.foreground_nets), len(self.class_labels)])
        for i, net in enumerate(self.foreground_nets):
            net_predict_proba(net, X, r, c, probas, i)

        return probas

    def reset(self):
        """Reset every foreground net and return ``self``.

        Only the foreground nets are reset here; configuration variables
        (which have the same name as init parameters) should be copied by
        the caller function.
        """
        for net in self.foreground_nets:
            net.reset()
        return self

    def __str__(self):
        return str(self.__class__) + ": " + str(self.__dict__)

    @staticmethod
    def _print_net_stats(nets):
        """Print one CSV line of optimizer, learning rate and average losses per net.

        NOTE(review): the averages divide by the nets' sample counters;
        presumably those are non-zero once training happened - confirm.
        """
        for net in nets:
            print('{},{},{},{}'.format(
                net.optimizer_type,
                net.learning_rate,
                net.accumulated_loss / net.samples_seen,
                net.accumulated_loss_since_last_detected_drift_by_parent /
                net.samples_seen_after_last_detected_drift_by_parent))

    def stream_ended(self):
        """Print the configuration and the loss statistics of every net."""
        print('\nNetwork configuration:\n'
              '{}\n'
              '=======================================\n'
              'Foreground Nets\n'.format(self))
        print(
            'optimizer_type,learning_rate,accumulated_loss,accumulated_loss_since_last_detected_drift_by_parent'
        )
        self._print_net_stats(self.foreground_nets)
        print('\n' 'Background Nets\n')
        self._print_net_stats(self.background_nets)
        print('\n')