class MissingValuesCleaner(StreamTransform):
    """ Fill missing values with some defined value.

    Provides a simple way to replace missing values in data samples with some value. The imputation value
    can be set via a set of imputation strategies.

    Parameters
    ----------
    missing_value: int, float or list (Default: numpy.nan)
        Missing value (or list of values) to replace. NaN entries are always
        treated as missing, whatever is passed here.

    strategy: string (Default: 'zero')
        The strategy adopted to find the missing value replacement. It can
        be one of the following: 'zero', 'mean', 'median', 'mode', 'custom'.

    window_size: int (Default: 200)
        Defines the window size for the 'mean', 'median' and 'mode' strategies.

    new_value: int (Default: 1)
        This is the replacement value in case the chosen strategy is 'custom'.
        It is also the fallback for the windowed strategies while the window
        holds no observed (non-missing) value for the requested feature.

    Raises
    ------
    ValueError
        If `strategy` is not one of the supported strategies.

    Examples
    --------
    >>> # Imports
    >>> import numpy as np
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.transform.missing_values_cleaner import MissingValuesCleaner
    >>> # Setting up a stream
    >>> stream = FileStream('skmultiflow/data/datasets/covtype.csv', -1, 1)
    >>> # Setting up the filter to substitute values -47 by the median of the
    >>> # last 10 samples
    >>> cleaner = MissingValuesCleaner(-47, 'median', 10)
    >>> X, y = stream.next_sample(10)
    >>> X[9, 0] = -47
    >>> # We will use this list to keep track of values
    >>> data = []
    >>> # Iterate over the first 9 samples, to build a sample window
    >>> for i in range(9):
    ...     X_transf = cleaner.partial_fit_transform([X[i].tolist()])
    ...     data.append(X_transf[0][0])
    >>>
    >>> # Transform last sample. The first feature should be replaced by the list's
    >>> # median value, i.e. X_transf[0][0] equals np.median(data)
    >>> X_transf = cleaner.partial_fit_transform([X[9].tolist()])
    >>> np.median(data)

    Notes
    -----
    A missing value in a sample can be coded in many different ways, but the
    most common one is to use numpy's NaN, that's why that is the default
    missing value parameter.

    The user should choose the correct substitution strategy for their use
    case, as each strategy has its pros and cons. The strategy can be chosen
    from a set of predefined strategies, which are: 'zero', 'mean', 'median',
    'mode', 'custom'.

    Samples are stored in the window with their missing entries masked as
    NaN, so the windowed statistics are computed from observed values only.

    Notice that `MissingValuesCleaner` can actually be used to replace arbitrary
    values.

    """
    # Strategies whose replacement is computed from a window of recent samples.
    _WINDOWED_STRATEGIES = ('mean', 'median', 'mode')
    # Every strategy accepted by the constructor.
    _STRATEGIES = ('zero', 'mean', 'median', 'mode', 'custom')

    def __init__(self,
                 missing_value=np.nan,
                 strategy='zero',
                 window_size=200,
                 new_value=1):
        super().__init__()
        # Internally the missing markers are always handled as a list.
        if isinstance(missing_value, list):
            self.missing_value = missing_value
        else:
            self.missing_value = [missing_value]
        self.strategy = strategy
        self.window_size = window_size
        self.window = None
        self.new_value = new_value

        self.__configure()

    def __configure(self):
        # Fail early instead of silently substituting None later on.
        if self.strategy not in self._STRATEGIES:
            raise ValueError("Unknown strategy '{}'. Valid strategies are: {}."
                             .format(self.strategy, ', '.join(self._STRATEGIES)))
        if self.strategy in self._WINDOWED_STRATEGIES:
            self.window = FastBuffer(max_size=self.window_size)

    def _is_missing(self, value):
        """ Tell whether a single entry must be replaced.

        An entry is missing when it equals one of the configured missing
        values or when it is NaN.
        """
        return value in self.missing_value or np.isnan(value)

    def _as_window_row(self, sample):
        """ Build the copy of a sample that is stored in the window.

        The copy is a float array in which every configured missing value is
        masked as NaN. Storing a copy keeps the window independent from later
        in-place edits of the sample, and the masking keeps sentinel values
        (e.g. -47) out of the windowed statistics.
        """
        row = np.array(sample, dtype=float)
        row[np.isin(row, self.missing_value)] = np.nan
        return row

    def _observed_values(self, column_index):
        """ Return the non-missing values stored in the window for a feature. """
        if self.window.is_empty():
            return np.empty(0)
        column = np.array(self.window.get_queue(), dtype=float)[:, column_index]
        return column[~np.isnan(column)]

    def transform(self, X):
        """ transform

        Does the transformation process in the samples in X. Missing entries
        are replaced in place.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The same object that was passed in, with missing entries replaced.

        """
        r, c = get_dimensions(X)
        uses_window = self.strategy in self._WINDOWED_STRATEGIES
        for i in range(r):
            if uses_window:
                # The sample joins the window before it is cleaned, so its
                # observed entries already count for the statistics.
                self.window.add_element([self._as_window_row(X[i])])
            for j in range(c):
                if self._is_missing(X[i][j]):
                    X[i][j] = self._get_substitute(j)

        return X

    def _get_substitute(self, column_index):
        """ _get_substitute

        Computes the replacement for a missing value.

        Parameters
        ----------
        column_index: int
            The index from the column where the missing value was found.

        Returns
        -------
        int or float
            The replacement.

        Raises
        ------
        ValueError
            If the strategy was changed to an unsupported value after
            construction.

        """
        if self.strategy == 'zero':
            return 0
        if self.strategy == 'custom':
            return self.new_value
        if self.strategy not in self._WINDOWED_STRATEGIES:
            raise ValueError("Unknown strategy '{}'. Valid strategies are: {}."
                             .format(self.strategy, ', '.join(self._STRATEGIES)))

        observed = self._observed_values(column_index)
        if observed.size == 0:
            # Nothing observed yet for this feature: fall back to new_value.
            return self.new_value
        if self.strategy == 'mean':
            return np.mean(observed)
        if self.strategy == 'median':
            return np.median(observed)
        # 'mode': most frequent value; np.unique sorts its output, so ties
        # resolve to the smallest value.
        values, counts = np.unique(observed, return_counts=True)
        return values[np.argmax(counts)]

    def partial_fit_transform(self, X, y=None):
        """ partial_fit_transform

        Partially fits the model and then apply the transform to the data.
        `transform` already adds the samples to the window, so it is the only
        call needed here.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: Array-like
            The true labels.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.

        """
        return self.transform(X)

    def partial_fit(self, X, y=None):
        """ partial_fit

        Partial fits the model: adds the samples in X to the window used by
        the 'mean', 'median' and 'mode' strategies. It is a no-op for the
        other strategies.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: Array-like
            The true labels.

        Returns
        -------
        MissingValuesCleaner
            self

        """
        if self.strategy in self._WINDOWED_STRATEGIES:
            # atleast_2d makes a single 1-D sample count as one row instead
            # of being split into scalar window elements.
            samples = np.atleast_2d(np.asarray(X))
            self.window.add_element([self._as_window_row(sample) for sample in samples])
        return self
# Example no. 2
# 0
class MyKNNClassifier(KNNClassifier):
    """ k-Nearest Neighbors classifier with optional distance-weighted voting
    and optional windowed standardization of the features.

    Parameters
    ----------
    n_neighbors: int (Default: 5)
        Forwarded to `KNNClassifier`.

    max_window_size: int (Default: 1000)
        Forwarded to `KNNClassifier`; also the size of the window of raw
        samples used to compute the standardization statistics.

    leaf_size: int (Default: 30)
        Forwarded to `KNNClassifier`.

    metric: string (Default: 'euclidean')
        Forwarded to `KNNClassifier`.

    weighted_vote: bool (Default: False)
        If True, each neighbor votes with a weight proportional to the
        inverse of its distance to the query. Otherwise all neighbors have
        the same weight.

    standardize: bool (Default: False)
        If True, every sample is standardized with the per-feature mean and
        standard deviation of the statistics window before being stored or
        used for prediction.

    """
    def __init__(self, n_neighbors=5, max_window_size=1000, leaf_size=30, metric='euclidean', weighted_vote=False,
                 standardize=False):
        self.weighted_vote = weighted_vote
        self.standardize = standardize
        super().__init__(n_neighbors=n_neighbors, max_window_size=max_window_size, leaf_size=leaf_size, metric=metric)
        self.window_size = max_window_size
        self.window = None

        self.__configure()

    def __configure(self):
        # Window of raw (non-standardized) samples behind the statistics.
        self.window = FastBuffer(max_size=self.window_size)

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Store the samples in X, with targets y, in the model's data window.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples to store.

        y: Array-like
            The target of each sample.

        classes: list, optional (Default: None)
            Known class labels; merged into `self.classes`.

        sample_weight: Not used.

        Returns
        -------
        MyKNNClassifier
            self

        """
        if self.standardize:
            # Work on a float copy so the caller's data is left untouched.
            # transform_vector records the raw samples in the statistics
            # window itself, so they must not be added a second time here.
            X = self.transform_vector(np.array(X, dtype=float))
        r, c = get_dimensions(X)

        if classes is not None:
            self.classes = list(set().union(self.classes, classes))

        for i in range(r):
            self.data_window.add_sample(X[i], y[i])

        return self

    def standardization(self, X):
        """ Standardize X as a batch with a freshly fitted `StandardScaler`.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples to standardize.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The standardized samples.

        """
        # fit_transform already fits the scaler; no separate fit is needed.
        return StandardScaler().fit_transform(X)

    def predict_proba(self, X):
        """ Estimate the class votes for the samples in X.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples to classify.

        Returns
        -------
        numpy.ndarray
            One row per sample and one column per class index from 0 to
            max(self.classes); each row sums to 1. While the model holds
            fewer than `n_neighbors` samples an array of zeros with shape
            (n_samples, 1) is returned instead.

        """
        if self.standardize:
            # Predicting must not change the standardization statistics, so
            # the window is left untouched here.
            X = self.transform_vector(np.array(X, dtype=float), update_window=False)

        r, c = get_dimensions(X)

        if self.data_window is None or self.data_window.size < self.n_neighbors:
            # The model is empty, defaulting to zero
            return np.zeros(shape=(r, 1))

        # The builtin int replaces the np.int alias, which NumPy removed.
        self.classes = list(set().union(self.classes, np.unique(self.data_window.targets_buffer.astype(int))))
        new_dist, new_ind = self._get_neighbors(X)

        n_slots = int(max(self.classes) + 1)
        proba = []
        for i in range(r):
            neighbors = new_ind[i]
            if self.weighted_vote:
                weights = self._distance_weights(new_dist[i])
            else:
                weights = np.full(len(neighbors), 1. / len(neighbors))

            votes = [0.0] * n_slots
            # Pair each neighbor with its own weight, row by row.
            for index, weight in zip(neighbors, weights):
                votes[int(self.data_window.targets_buffer[index])] += weight
            proba.append(votes)

        return np.asarray(proba)

    @staticmethod
    def _distance_weights(distances):
        """ Turn the neighbor distances of one query into voting weights.

        Weights are proportional to the inverse distance and sum to 1. When
        one or more neighbors are at distance zero, only those neighbors
        vote (with equal weight), which avoids a division by zero.
        """
        distances = np.asarray(distances, dtype=float)
        exact_match = distances == 0.
        if exact_match.any():
            weights = exact_match.astype(float)
        else:
            weights = 1. / distances
        return weights / weights.sum()

    def _window_stats(self, n_features):
        """ Return the per-feature (means, standard deviations) of the window.

        An empty window yields means of 0 and deviations of 1; a deviation of
        0 is replaced by 1 so it can safely be used as a divisor.
        """
        if self.window.is_empty():
            return np.zeros(n_features), np.ones(n_features)
        history = np.array(self.window.get_queue(), dtype=float)
        means = np.nanmean(history, axis=0)
        stds = np.nanstd(history, axis=0)
        stds[stds == 0.] = 1.
        return means, stds

    def calculate_mean(self, column_index):
        """ Mean of one feature over the statistics window (0 if it is empty). """
        mean = 0.
        if not self.window.is_empty():
            mean = np.nanmean(np.array(self.window.get_queue())[:, column_index])
        return mean

    def calculate_stddev(self, column_index):
        """ Standard deviation of one feature over the statistics window.

        Returns 1 when the window is empty or the deviation is 0.
        """
        std = 1.
        if not self.window.is_empty():
            std = np.nanstd(np.array(self.window.get_queue())[:, column_index])
        if std == 0.:
            std = 1.
        return std

    def transform_vector(self, X, update_window=True):
        """ Standardize the samples in X with the statistics of the window.

        Each sample is standardized with the statistics of the samples seen
        before it. Float arrays and lists are modified in place.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples to standardize.

        update_window: bool (Default: True)
            Whether the raw samples are added to the statistics window after
            being standardized.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The standardized samples.

        """
        if isinstance(X, np.ndarray) and not np.issubdtype(X.dtype, np.floating):
            # A non-float array would truncate the standardized values on
            # assignment, so continue on a float copy.
            X = X.astype(float)
        r, c = get_dimensions(X)
        for i in range(r):
            # Raw copy taken before the row is overwritten below.
            raw_row = np.array(X[i], dtype=float)
            means, stds = self._window_stats(c)
            standardized = (raw_row - means) / stds
            for j in range(c):
                X[i][j] = standardized[j]
            if update_window:
                self.window.add_element([raw_row])
        return X
class WindowedMinmaxScaler(StreamTransform):
    """ Transform features by scaling each feature to a given range.
    This estimator scales and translates each feature individually such
    that it is in the given range on the training set, e.g. between zero and one.
    For the training set we consider a window of a given length.

    Parameters
    ----------
    window_size: int (Default: 200)
        Defines the window size to compute min and max values.

    feature_range: tuple (min, max) (Default: (0, 1))
        Desired range of the transformed data.

    Raises
    ------
    ValueError
        If the lower bound of `feature_range` is not smaller than its upper
        bound.

    Notes
    -----
    Each sample is scaled with the minimum and maximum of the samples seen
    before it, so a value outside the bounds observed so far is mapped
    outside `feature_range`. While the window is empty the minimum and the
    maximum default to 0 and 1.

    """

    def __init__(self, window_size=200, feature_range=(0., 1.)):
        super().__init__()
        self.window_size = window_size
        self.feature_range = feature_range
        self.window = None

        self.__configure()

    def __configure(self):
        low, high = self.feature_range
        if low >= high:
            raise ValueError("Minimum of desired feature range must be smaller than maximum. "
                             "Got {}.".format(self.feature_range))
        self.window = FastBuffer(max_size=self.window_size)

    def _window_bounds(self, n_features):
        """ Return the per-feature (minimums, maximums) of the window.

        An empty window yields minimums of 0 and maximums of 1.
        """
        if self.window.is_empty():
            return np.zeros(n_features), np.ones(n_features)
        history = np.array(self.window.get_queue(), dtype=float)
        return np.nanmin(history, axis=0), np.nanmax(history, axis=0)

    def transform(self, X):
        """ Does the transformation process in the samples in X.

        Float arrays and lists are modified in place; any other array is
        converted to a float copy first, since it could not hold the scaled
        values.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.

        """
        if isinstance(X, np.ndarray) and not np.issubdtype(X.dtype, np.floating):
            X = X.astype(float)
        low, high = self.feature_range
        r, c = get_dimensions(X)
        for i in range(r):
            # Raw copy taken before the row is overwritten below; this is
            # what is remembered in the window.
            raw_row = np.array(X[i], dtype=float)
            # Bounds are computed once per sample, from the previous samples.
            mins, maxs = self._window_bounds(c)
            for j in range(c):
                span = maxs[j] - mins[j]
                if span == 0:
                    # Constant feature: map it to the lower bound.
                    X[i][j] = low
                else:
                    # Rescale to [0, 1] over the window span, then map onto
                    # the target range (not back onto the data range).
                    X[i][j] = (raw_row[j] - mins[j]) / span * (high - low) + low
            self.window.add_element([raw_row])
        return X

    def _get_min(self, column_index):
        """ Minimum of one feature over the window (0 if it is empty). """
        min_val = 0.
        if not self.window.is_empty():
            min_val = np.nanmin(np.array(self.window.get_queue())[:, column_index])
        return min_val

    def _get_max(self, column_index):
        """ Maximum of one feature over the window (1 if it is empty). """
        max_val = 1.
        if not self.window.is_empty():
            max_val = np.nanmax(np.array(self.window.get_queue())[:, column_index])
        return max_val

    def partial_fit_transform(self, X, y=None):
        """ Partially fits the model and then apply the transform to the data.
        `transform` already adds the samples to the window, so it is the only
        call needed here.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: numpy.ndarray (optional, default=None)
            The target values.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.

        """
        return self.transform(X)

    def partial_fit(self, X, y=None):
        """ Partial fits the model: adds the samples in X to the window.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: numpy.ndarray (optional, default=None)
            The target values.

        Returns
        -------
        WindowedMinmaxScaler
            self

        """
        # atleast_2d makes a single 1-D sample count as one row instead of
        # being split into scalar window elements.
        samples = np.atleast_2d(np.asarray(X, dtype=float))
        self.window.add_element(list(samples))
        return self