class FillNan(BaseTransformer):
    """Transformer step that replaces NaN entries using a ``SimpleFill`` imputer."""

    def __init__(self, fill_method='zero', fill_missing=True, **kwargs):
        """Create the underlying imputer.

        Args:
            fill_method: Name of the filling strategy handed to ``SimpleFill``.
                Possible values: 'mean', 'zero', 'median', 'min', 'random'.
            fill_missing: When True, ``transform`` imputes NaN values; when
                False, ``transform`` hands its input back untouched.
            **kwargs: Extra keyword arguments are accepted and ignored.
        """
        super().__init__()
        self.fill_missing = fill_missing
        self.filler = SimpleFill(fill_method)

    def transform(self, X):
        """Impute the NaN values of ``X`` if filling is enabled.

        Args:
            X: DataFrame with NaN's.

        Returns:
            Dictionary with a single key, 'X'. Its value is the result of the
            filler's ``complete`` call, or ``X`` itself when ``fill_missing``
            is falsy.
        """
        # Guard clause: filling switched off, pass the input straight through.
        if not self.fill_missing:
            return {'X': X}
        return {'X': self.filler.complete(X)}

    def load(self, filepath):
        """Replace the filler with the object read from ``filepath`` by joblib.

        Returns:
            This transformer, so calls can be chained.
        """
        self.filler = joblib.load(filepath)
        return self

    def persist(self, filepath):
        """Write the filler to ``filepath`` with joblib.

        Only ``self.filler`` is dumped; ``fill_missing`` is not stored.
        """
        joblib.dump(self.filler, filepath)
def impute_using_statistics(df, method='min'):
    """
    Fill the missing values of a dataframe using a statistic of each column.

    :param df: input dataframe that contains missing values
    :param method: name of the fill strategy passed to ``SimpleFill``
        ('min' by default)
        "zero": fill missing entries with zeros
        "mean": fill with column means
        "median": fill with column medians
        "min": fill with min value per column
        "random": fill with gaussian noise according to mean/std of column
    :return: a new dataframe built from the imputed values, carrying the
        index and columns of ``df``
    """
    # The imputer is given the raw values, so the labels are re-attached below.
    filled_values = SimpleFill(method).complete(df.values)
    return pd.DataFrame(filled_values, index=df.index, columns=df.columns)
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, SimpleFill

# Demo: build a synthetic low-rank matrix, knock out a random subset of its
# entries, then fill them back in with several fancyimpute solvers.
# NOTE(review): `np` is used below but no numpy import is visible in this
# part of the file -- confirm it is imported elsewhere.

n = 200  # number of rows of X
m = 20  # number of columns of X
inner_rank = 4  # shared inner dimension of the two random factors
# Product of an (n x inner_rank) and an (inner_rank x m) Gaussian matrix, so X
# has shape (n, m) and rank at most inner_rank.
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X ** 2).mean())

# X is a data matrix which we're going to randomly drop entries from.
# Each entry is selected independently with probability 0.1.
missing_mask = np.random.rand(*X.shape) < 0.1
# Work on a copy so the complete matrix X stays available.
X_incomplete = X.copy()
# Missing entries are indicated with NaN.
X_incomplete[missing_mask] = np.nan

# Baseline: simple fill using the "mean" strategy.
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.complete(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features.
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# Matrix completion using convex optimization to find a low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Rather than solving the nuclear norm objective directly, induce sparsity
# using singular value thresholding.
softImpute = SoftImpute()

# Simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods.