Example #1
    def reset(self):
        self.total_square_error = 0.0
        self.average_error = 0.0
        self.last_true_label = None
        self.last_prediction = None
        self.total_square_error_correction = FastBuffer(self.window_size)
        self.average_error_correction = FastBuffer(self.window_size)
Example #2
    def __init__(self, window_size=200):
        super().__init__()
        self.total_square_error = 0.0
        self.average_error = 0.0
        self.last_true_label = None
        self.last_prediction = None
        self.total_square_error_correction = FastBuffer(window_size)
        self.average_error_correction = FastBuffer(window_size)
        self.window_size = window_size
Example #3
    def reset(self, targets=None):
        if targets is not None:
            self.n_targets = len(targets)
        else:
            self.n_targets = 0
        self.majority_classifier = 0
        self.correct_no_change = 0
        self.confusion_matrix.restart(self.n_targets)
        self.majority_classifier_correction = FastBuffer(self.window_size)
        self.correct_no_change_correction = FastBuffer(self.window_size)
Example #4
    def __configure(self, missing_value, strategy, window_size, new_value=1):
        if hasattr(missing_value, 'append'):
            self.missing_value = missing_value
        else:
            self.missing_value = [missing_value]
        self.strategy = strategy
        self.window_size = window_size
        self.new_value = new_value

        if strategy in ['mean', 'median', 'mode']:
            self.window = FastBuffer(max_size=window_size)
Example #5
    def __init__(self, targets=None, dtype=np.int64, window_size=200):
        super().__init__()
        if targets is not None:
            self.n_targets = len(targets)
        else:
            self.n_targets = 0
        self.confusion_matrix = ConfusionMatrix(self.n_targets, dtype)
        self.last_class = None

        self.targets = targets
        self.window_size = window_size
        self.true_labels = FastBuffer(window_size)
        self.predictions = FastBuffer(window_size)
        self.temp = 0
        self.last_prediction = None
        self.last_true_label = None

        self.majority_classifier = 0
        self.correct_no_change = 0
        self.majority_classifier_correction = FastBuffer(window_size)
        self.correct_no_change_correction = FastBuffer(window_size)
Example #6
class WindowClassificationMeasurements(BaseObject):
    """ WindowClassificationMeasurements
    
    This class maintains a fixed-size window with the most recent information
    about one classifier. On request, it can provide any of the relevant
    current metrics about the classifier, measured inside the window.

    To keep track of statistics inside a window, the class uses a
    ConfusionMatrix object, alongside FastBuffers, to simulate fixed-size
    windows over the classifier's relevant attributes.

    Its functionality is similar to that of the ClassificationMeasurements
    class. The difference is that the statistics kept by this class are
    local, or partial, while the statistics kept by the
    ClassificationMeasurements class are global.

    At any given moment, it can compute the following statistics:
    performance, kappa, kappa_t, kappa_m, majority_class and error rate.
    
    Parameters
    ----------
    targets: list
        A list containing the possible labels.
    
    dtype: data type (Default: numpy.int64)
        The data type of the existing labels.
        
    window_size: int (Default: 200)
        The width of the window. Determines how many samples the object 
        can see.
    
    Examples
    --------
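    A minimal usage sketch (assuming the class has been imported; the labels
    below are made up, and only the last window_size results are counted):

    >>> wcm = WindowClassificationMeasurements(window_size=3)
    >>> for true_label, prediction in [(0, 0), (1, 0), (1, 1), (1, 1)]:
    ...     wcm.add_result(true_label, prediction)
    >>> float(wcm.get_performance())  # accuracy over the last 3 results
    0.6666666666666666
    >>> wcm.get_majority_class()  # index of the window majority class
    1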
    
    """
    def __init__(self, targets=None, dtype=np.int64, window_size=200):
        super().__init__()
        if targets is not None:
            self.n_targets = len(targets)
        else:
            self.n_targets = 0
        self.confusion_matrix = ConfusionMatrix(self.n_targets, dtype)
        self.last_class = None

        self.targets = targets
        self.window_size = window_size
        self.true_labels = FastBuffer(window_size)
        self.predictions = FastBuffer(window_size)
        self.temp = 0
        self.last_prediction = None
        self.last_true_label = None

        self.majority_classifier = 0
        self.correct_no_change = 0
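        # The two counters above are the hit counts of the kappa_m
        # (majority-class) and kappa_t (no-change) baselines; the correction
        # buffers below record, per sample, the value needed to undo those
        # increments once the sample leaves the window.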
        self.majority_classifier_correction = FastBuffer(window_size)
        self.correct_no_change_correction = FastBuffer(window_size)

    def reset(self, targets=None):
        if targets is not None:
            self.n_targets = len(targets)
        else:
            self.n_targets = 0
        self.majority_classifier = 0
        self.correct_no_change = 0
        self.confusion_matrix.restart(self.n_targets)
        self.majority_classifier_correction = FastBuffer(self.window_size)
        self.correct_no_change_correction = FastBuffer(self.window_size)

    def add_result(self, sample, prediction):
        """ add_result

        Updates the statistics with the result of a prediction. If needed,
        the oldest sample is removed from the observation window.

        Parameters
        ----------
        sample: int
            The true label.

        prediction: int
            The classifier's prediction

        """
        true_y = self._get_target_index(sample, True)
        pred = self._get_target_index(prediction, True)
        old_true = self.true_labels.add_element(np.array([sample]))
        old_predict = self.predictions.add_element(np.array([prediction]))

        # If the buffers were full, the oldest (true label, prediction) pair
        # was evicted and its count must be removed from the confusion matrix
        if (old_true is not None) and (old_predict is not None):
            self.temp += 1
            error = self.confusion_matrix.remove(
                self._get_target_index(old_true[0]),
                self._get_target_index(old_predict[0]))
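            # peek() returns the correction recorded when the evicted sample
            # arrived: -1 if the corresponding baseline counter was
            # incremented for that sample, 0 otherwise. Adding it back undoes
            # the increment now that the sample has left the window.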
            self.correct_no_change += self.correct_no_change_correction.peek()
            self.majority_classifier += self.majority_classifier_correction.peek()

        # Majority-class baseline: count a hit if the current window majority
        # class matches the true label, and record the correction to apply
        # when this sample later leaves the window
        majority_class = self.get_majority_class()
        if (majority_class is not None) and (majority_class == sample):
            self.majority_classifier += 1
            self.majority_classifier_correction.add_element([-1])
        else:
            self.majority_classifier_correction.add_element([0])

        # No-change baseline: count a hit if the true label repeats the
        # previous one, and record the corresponding correction
        if (self.last_true_label is not None) and (self.last_true_label == sample):
            self.correct_no_change += 1
            self.correct_no_change_correction.add_element([-1])
        else:
            self.correct_no_change_correction.add_element([0])

        self.confusion_matrix.update(true_y, pred)

        self.last_true_label = sample
        self.last_prediction = prediction

    def get_last(self):
        return self.last_true_label, self.last_prediction

    def get_majority_class(self):
        """ get_majority_class
         
        Computes the window/local true majority class.
         
        Returns
        -------
        int
            The index, in the targets list, of the window/local majority class.
        
        """
        if (self.n_targets is None) or (self.n_targets == 0):
            return None
        majority_class = 0
        max_prob = 0.0
        for i in range(self.n_targets):
            # Relative frequency of class i among the true labels in the window
            class_prob = 0.0
            for j in range(self.n_targets):
                class_prob += self.confusion_matrix.value_at(i, j)
            class_prob = class_prob / self.true_labels.get_current_size()
            if class_prob > max_prob:
                max_prob = class_prob
                majority_class = i

        return majority_class

    def get_performance(self):
        """ get_performance

        Computes the window/local performance.

        Returns
        -------
        float
            Returns the window/local performance.

        """
        sum_value = 0.0
        n, _ = self.confusion_matrix.shape()
        for i in range(n):
            sum_value += self.confusion_matrix.value_at(i, i)
        try:
            return sum_value / self.true_labels.get_current_size()
        except ZeroDivisionError:
            return 0.0

    def get_incorrectly_classified_ratio(self):
        return 1.0 - self.get_performance()

    def _get_target_index(self, target, add=False):
        """ _get_target_index

        Computes the index of an element in the self.targets list.
        If add is True, newly found targets are added to the list and the
        ConfusionMatrix is reshaped accordingly.

        Parameters
        ----------
        target: int
            A class label.

        add: bool
            Whether to add newly found labels to the targets list.

        Returns
        -------
        int
            The target index in the self.targets list.

        """
        if (self.targets is None) and add:
            self.targets = []
            self.targets.append(target)
            self.n_targets = len(self.targets)
            self.confusion_matrix.reshape(len(self.targets), len(self.targets))
        elif (self.targets is None) and (not add):
            return None
        if ((target not in self.targets) and (add)):
            self.targets.append(target)
            self.n_targets = len(self.targets)
            self.confusion_matrix.reshape(len(self.targets), len(self.targets))
        for i in range(len(self.targets)):
            if self.targets[i] == target:
                return i
        return None

    def get_kappa(self):
        """ get_kappa

        Computes the window/local Cohen's kappa coefficient.

        Returns
        -------
        float
            Returns the window/local Cohen's kappa coefficient.

        """
        p0 = self.get_performance()
        # pc: agreement expected by chance, estimated from the window's
        # marginal (row and column) label frequencies
        pc = 0.0
        n, _ = self.confusion_matrix.shape()
        for i in range(n):
            row = self.confusion_matrix.row(i)
            column = self.confusion_matrix.column(i)

            sum_row = np.sum(row) / self.true_labels.get_current_size()
            sum_column = np.sum(column) / self.true_labels.get_current_size()

            pc += sum_row * sum_column

        if pc == 1:
            return 1
        return (p0 - pc) / (1.0 - pc)

    def get_kappa_t(self):
        """ get_kappa_t

        Computes the window/local Cohen's kappa T coefficient, which measures
        the temporal correlation between samples by comparing the classifier
        against a no-change baseline that always predicts the previous label.

        Returns
        -------
        float
            Returns the window/local Cohen's kappa T coefficient.

        """
        p0 = self.get_performance()
        if self._sample_count != 0:
            pc = self.correct_no_change / self._sample_count
        else:
            pc = 0
        if pc == 1:
            return 1
        return (p0 - pc) / (1.0 - pc)

    def get_kappa_m(self):
        """ get_kappa_t

        Computes the window/local Cohen's kappa M coefficient. 

        Returns
        -------
        float
            Returns the window/local Cohen's kappa M coefficient.

        """
        p0 = self.get_performance()
        if self._sample_count != 0:
            pc = self.majority_classifier / self._sample_count
        else:
            pc = 0
        if pc == 1:
            return 1
        return (p0 - pc) / (1.0 - pc)

    @property
    def _matrix(self):
        return self.confusion_matrix._matrix

    @property
    def _sample_count(self):
        return self.true_labels.get_current_size()

    def get_class_type(self):
        return 'collection'

    def get_info(self):
        return 'WindowClassificationMeasurements: targets: ' + str(self.targets) + \
               ' - sample_count: ' + str(self._sample_count) + \
               ' - window_size: ' + str(self.window_size) + \
               ' - performance: ' + str(self.get_performance()) + \
               ' - kappa: ' + str(self.get_kappa()) + \
               ' - kappa_t: ' + str(self.get_kappa_t()) + \
               ' - kappa_m: ' + str(self.get_kappa_m()) + \
               ' - majority_class: ' + str(self.get_majority_class())
Example #7
class WindowRegressionMeasurements(BaseObject):
    """ WindowRegressionMeasurements
    
    This class keeps updated statistics about a regression learner inside a
    fixed-size window. It uses FastBuffer objects to simulate the fixed-size
    windows.

    It keeps track of partial metrics that can be provided at any moment.
    The relevant metrics kept by an instance of this class are: MSE (mean
    square error) and MAE (mean absolute error).
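
    Examples
    --------
    A minimal usage sketch (assuming the class has been imported; the values
    below are made up):

    >>> wrm = WindowRegressionMeasurements(window_size=3)
    >>> wrm.add_result(1.0, 1.5)
    >>> wrm.add_result(2.0, 1.0)
    >>> wrm.get_mean_square_error()
    0.625
    >>> float(wrm.get_average_error())
    0.75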
    
    """
    def __init__(self, window_size=200):
        super().__init__()
        self.total_square_error = 0.0
        self.average_error = 0.0
        self.last_true_label = None
        self.last_prediction = None
        self.total_square_error_correction = FastBuffer(window_size)
        self.average_error_correction = FastBuffer(window_size)
        self.window_size = window_size

    def reset(self):
        self.total_square_error = 0.0
        self.average_error = 0.0
        self.last_true_label = None
        self.last_prediction = None
        self.total_square_error_correction = FastBuffer(self.window_size)
        self.average_error_correction = FastBuffer(self.window_size)

    def add_result(self, sample, prediction):
        """ add_result

        Uses the true value and the prediction to update the statistics.

        Parameters
        ----------
        sample: float
            The true value.

        prediction: float
            The regressor's prediction

        """
        self.last_true_label = sample
        self.last_prediction = prediction
        self.total_square_error += (sample - prediction) * (sample -
                                                            prediction)
        self.average_error += np.absolute(sample - prediction)

        old_square = self.total_square_error_correction.add_element(
            np.array([-1 * ((sample - prediction) * (sample - prediction))]))
        old_average = self.average_error_correction.add_element(
            np.array([-1 * (np.absolute(sample - prediction))]))
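        # The correction buffers store the negated contribution of each
        # sample; once the window is full, add_element returns the oldest
        # stored value, and adding it back (below) cancels that old sample's
        # contribution to the running totals.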

        if (old_square is not None) and (old_average is not None):
            self.total_square_error += old_square[0]
            self.average_error += old_average[0]

    def get_mean_square_error(self):
        """ get_mean_square_error

        Computes the window/local mean square error.

        Returns
        -------
        float
            Returns the window/local mean square error.

        """
        if self._sample_count == 0:
            return 0.0
        else:
            return self.total_square_error / self._sample_count

    def get_average_error(self):
        """ get_average_error

        Computes the window/local mean absolute error.

        Returns
        -------
        float
            Returns the window/local mean absolute error.

        """
        if self._sample_count == 0:
            return 0.0
        else:
            return self.average_error / self._sample_count

    def get_last(self):
        return self.last_true_label, self.last_prediction

    @property
    def _sample_count(self):
        return self.total_square_error_correction.get_current_size()

    def get_class_type(self):
        return 'collection'

    def get_info(self):
        return 'WindowRegressionMeasurements: sample_count: ' + str(self._sample_count) + \
               ' - mean_square_error: ' + str(self.get_mean_square_error()) + \
               ' - mean_absolute_error: ' + str(self.get_average_error())
Example #8
class MissingValuesCleaner(BaseTransform):
    """ MissingValuesCleaner
    
    This is a transform object. It provides a simple way to replace missing
    values in samples with another value, chosen through one of a set of
    replacement strategies.

    A missing value in a sample can be coded in many different ways, but the
    most common one is numpy's NaN; for that reason it is the default value
    of the missing_value parameter.

    The user should choose the substitution strategy best suited to their use
    case, as each strategy has its pros and cons. The strategy can be chosen
    from a set of predefined strategies, which are: 'zero', 'mean', 'median',
    'mode', 'custom'.
    
    Parameters
    ----------
    missing_value: int, char or list (Default: numpy.nan)
        The value (or list of values) that codes a missing entry in the
        matrices to be transformed.
    
    strategy: string (Default: 'zero')
        The strategy adopted to find the missing value replacement. It can 
        be one of the following: 'zero', 'mean', 'median', 'mode', 'custom'.
    
    window_size: int (Default: 200)
        Defines the window size for the 'mean', 'median' and 'mode' strategies.
    
    new_value: int (Default: 1)
        This is the replacement value in case the chosen strategy is 'custom'.
        
    Examples
    --------
    >>> # Imports
    >>> import numpy as np
    >>> from skmultiflow.options.file_option import FileOption
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.filtering.base_filters import MissingValuesCleaner
    >>> # Setting up a stream
    >>> opt = FileOption('FILE', 'OPT_NAME', 'skmultiflow/datasets/covtype.csv', 'csv', False)
    >>> stream = FileStream(opt, -1, 1)
    >>> stream.prepare_for_use()
    >>> # Setting up the filter to substitute values -47 by the median of the
    >>> # last 10 samples
    >>> cleaner = MissingValuesCleaner(-47, 'median', 10)
    >>> X, y = stream.next_instance(10)
    >>> X[9, 0] = -47
    >>> # We will use this list to keep track of the first feature's values
    >>> first_feature = []
    >>> # Iterate over the first 9 samples, to build a sample window
    >>> for i in range(9):
    ...     X_transf = cleaner.partial_fit_transform([X[i].tolist()])
    ...     first_feature.append(X_transf[0][0])
    ...     print(X_transf)
    >>>
    >>> # Transform the last sample. The first feature should be replaced by
    >>> # the median of the values collected above
    >>> X_transf = cleaner.partial_fit_transform([X[9].tolist()])
    >>> print(X_transf)
    >>> np.median(first_feature)
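
    A simpler, self-contained sketch (illustrative values; it uses the 'mean'
    strategy, and NaN detection relies on passing the same np.nan object used
    as the missing_value marker):

    >>> import numpy as np
    >>> cleaner = MissingValuesCleaner(np.nan, 'mean', window_size=5)
    >>> cleaner.partial_fit_transform([[1.0, 4.0]])
    [[1.0, 4.0]]
    >>> cleaner.partial_fit_transform([[3.0, 6.0]])
    [[3.0, 6.0]]
    >>> X_new = cleaner.partial_fit_transform([[np.nan, 8.0]])
    >>> float(X_new[0][0])  # NaN replaced by the mean of the first feature
    2.0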
    
    """
    def __init__(self,
                 missing_value=np.nan,
                 strategy='zero',
                 window_size=200,
                 new_value=1):
        super().__init__()
        # Default values (overwritten by __configure)
        self.missing_value = np.nan
        self.strategy = 'zero'
        self.window_size = 200
        self.window = None
        self.new_value = 1

        self.__configure(missing_value, strategy, window_size, new_value)

    def __configure(self, missing_value, strategy, window_size, new_value=1):
        if hasattr(missing_value, 'append'):
            self.missing_value = missing_value
        else:
            self.missing_value = [missing_value]
        self.strategy = strategy
        self.window_size = window_size
        self.new_value = new_value

        # Only the window-based strategies need a buffer of past samples
        if strategy in ['mean', 'median', 'mode']:
            self.window = FastBuffer(max_size=window_size)

    def transform(self, X):
        """ transform
        
        Replaces the missing values found in the samples in X.

        Parameters
        ----------
        X: numpy.ndarray or list of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        Returns
        -------
        numpy.ndarray or list of shape (n_samples, n_features)
            The transformed data, with missing values replaced.

        """
        r, c = get_dimensions(X)
        for i in range(r):
            for j in range(c):
                if X[i][j] in self.missing_value:
                    X[i][j] = self._get_substitute(j)

        return X

    def _get_substitute(self, column_index):
        """ _get_substitute
        
        Computes the replacement for a missing value.
        
        Parameters
        ----------
        column_index: int
            The index from the column where the missing value was found.
            
        Returns
        -------
        int or float
            The replacement.
        
        """
        if self.strategy == 'zero':
            return 0
        elif self.strategy == 'custom':
            return self.new_value
        elif self.strategy in ['mean', 'median', 'mode']:
            if self.window.isempty():
                # No samples in the window yet: fall back to new_value
                return self.new_value
            # All values seen so far (inside the window) for that feature
            column = np.array(self.window.get_queue())[:, column_index].flatten()
            if self.strategy == 'mean':
                return np.mean(column)
            elif self.strategy == 'median':
                return np.median(column)
            else:
                # stats.mode returns a ModeResult; extract the scalar mode
                # value (assuming the ModeResult layout of older SciPy)
                return stats.mode(column)[0][0]

    def partial_fit_transform(self, X, y=None):
        """ partial_fit_transform
        
        Partially fits the model and then applies the transform to the data.
        
        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.
            
        y: Array-like
            The true labels.
         
        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.
        
        """
        X = self.transform(X)
        if self.strategy in ['mean', 'median', 'mode']:
            self.window.add_element(X)

        return X

    def partial_fit(self, X, y=None):
        """ partial_fit
        
        Partially fits the model.
        
        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.
            
        y: Array-like
            The true labels.
        
        Returns
        -------
        MissingValuesCleaner
            self
        
        """
        X = np.asarray(X)
        if self.strategy in ['mean', 'median', 'mode']:
            self.window.add_element(X)
        return self

    def get_info(self):
        return 'MissingValuesCleaner: missing_value: ' + str(self.missing_value) + \
               ' - strategy: ' + self.strategy + \
               ' - window_size: ' + str(self.window_size) + \
               ' - new_value: ' + str(self.new_value)