Exemplo n.º 1
0
class FillNan(BaseTransformer):
    """Transformer step that replaces NaN entries using a ``SimpleFill`` imputer."""

    def __init__(self, fill_method='zero', fill_missing=True, **kwargs):
        """Create the underlying imputer and remember whether it is enabled.

        Args:
            fill_method: Strategy passed to ``SimpleFill``. Possible values:
                'mean', 'zero', 'median', 'min', 'random'.
            fill_missing: If True, ``transform`` imputes NaN values with the
                chosen method; if False, ``transform`` returns its input as is.
            **kwargs: Accepted for call compatibility; not used here.
        """
        super().__init__()
        self.filler = SimpleFill(fill_method)
        self.fill_missing = fill_missing

    def transform(self, X):
        """Impute NaN values in ``X`` when filling is enabled.

        Args:
            X: DataFrame with NaN's.

        Returns:
            Dictionary with a single key, 'X'. Its value is the output of the
            imputer's ``complete`` call when ``fill_missing`` is truthy, and
            the unmodified input otherwise.
        """
        if not self.fill_missing:
            # Filling disabled: pass the data through unchanged.
            return {'X': X}
        return {'X': self.filler.complete(X)}

    def load(self, filepath):
        """Replace the imputer with one read from ``filepath``; return ``self``.

        Only ``self.filler`` is restored; ``fill_missing`` keeps its current
        value. NOTE(review): ``joblib.load`` unpickles the file, so it should
        only be pointed at trusted files.
        """
        restored = joblib.load(filepath)
        self.filler = restored
        return self

    def persist(self, filepath):
        """Write the imputer (``self.filler`` only) to ``filepath`` via joblib."""
        joblib.dump(self.filler, filepath)
Exemplo n.º 2
0
    def impute_using_statistics(df, method='min'):
        """
        Fill the missing entries of a dataframe using a per-column statistic

        The raw values of ``df`` are handed to a ``SimpleFill`` imputer and the
        result is wrapped in a new dataframe that reuses the index and columns
        of the input.

        :param df: The input dataframe that contains missing values
        :param method: The imputation method (min by default)
            "zero": fill missing entries with zeros
            "mean": fill with column means
            "median" : fill with column medians
            "min": fill with min value per column
            "random": fill with gaussian noise according to mean/std of column
        :return: a new dataframe holding the imputed values
        """
        filled_values = SimpleFill(method).complete(df.values)
        return pd.DataFrame(filled_values, index=df.index, columns=df.columns)
Exemplo n.º 3
0
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, SimpleFill

# Demo: build a synthetic low-rank matrix, hide some entries, and fill them
# back in with several fancyimpute solvers.
# NOTE(review): `np` is not imported in the visible part of this file --
# confirm that `import numpy as np` exists above.

# Matrix dimensions: X is (n x m) and, being the product of an
# (n x inner_rank) and an (inner_rank x m) Gaussian factor, has rank at most
# `inner_rank`.
n = 200
m = 20
inner_rank = 4
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X ** 2).mean())

# X is the complete data matrix; each entry is independently marked as
# missing when its uniform random draw falls below 0.1.
missing_mask = np.random.rand(*X.shape) < 0.1
# Work on a copy so the fully observed X stays available.
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

# Simple fill using the "mean" strategy
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.complete(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding.
# The solver is only constructed here; it is not applied to the data within
# the visible part of the script.
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods