def describe(data):
    """ Print a summary of the missingness in the input data.

    Parameters
    ----------
    data: numpy.ndarray
        The data you want to get a description of.

    Returns
    -------
    dict
        null_xy: list of tuples
            Indices of all null points
        null_n: int
            Total number of null values
        pmissing_n: float
            Percentage of missing values in the dataset
    """
    null_xy = find_null(data)
    null_n = len(null_xy)
    pmissing_n = float(null_n) / len(data.flatten())
    description = {"null_xy": null_xy,
                   "null_n": null_n,
                   "pmissing_n": pmissing_n}
    return description
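# Usage sketch for `describe`, assuming the module-level `find_null`
# helper is equivalent to `np.argwhere(np.isnan(data))`:
#
#     >>> raw = np.array([[1., np.nan], [3., 4.]])
#     >>> describe(raw)
#     {'null_xy': array([[0, 1]]), 'null_n': 1, 'pmissing_n': 0.25}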
def arima(data, p, d, q, axis=0):
    """Autoregressive Integrated Moving Average Imputation

    Stationary model.

    Parameters
    ----------
    data: numpy.ndarray
        The matrix with missing values that you want to impute.
    p: int
        Number of autoregressive terms. Ex (p,d,q)=(1,0,0).
    d: int
        Number of nonseasonal differences needed for stationarity.
    q: int
        Number of lagged forecast errors in the prediction equation.
    axis: boolean (optional)
        0 if time series is in row format (Ex. data[0][:] is 1st data point).
        1 if time series is in col format (Ex. data[:][0] is 1st data point).

    Returns
    -------
    numpy.ndarray
    """
    assert isinstance(p, int), "Parameter `p` must be an integer"
    assert isinstance(d, int), "Parameter `d` must be an integer"
    assert isinstance(q, int), "Parameter `q` must be an integer"
    null_xy = find_null(data)
    for x, y in null_xy:
        # TODO: fit an ARIMA(p, d, q) model to the series containing
        # (x, y) and replace the NaN with its prediction; the model fit
        # is not implemented yet, so values are currently left as-is.
        pass
    return data
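# The body above only validates its inputs; the per-series model fit is
# still a placeholder. A minimal sketch of one way it could be completed,
# assuming statsmodels is available (it is not a dependency of this
# module): fit one ARIMA(p, d, q) per series and fill the gaps with the
# model's in-sample predictions.

def _arima_fill_series(series, p, d, q):
    """Hypothetical helper: fill NaNs in a 1-D series via ARIMA(p, d, q)."""
    from statsmodels.tsa.arima.model import ARIMA  # assumed available
    series = np.asarray(series, dtype=float)
    mask = np.isnan(series)
    # statsmodels' state-space ARIMA tolerates NaNs in the endog array
    result = ARIMA(series, order=(p, d, q)).fit()
    filled = series.copy()
    filled[mask] = result.predict(start=0, end=len(series) - 1)[mask]
    return filled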
def mode(data):
    """ Substitute missing values with the mode of that column (the most
    frequent value). In the case of a tie (multiple equally frequent
    values in a column), randomly pick one of them.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    null_xy = find_null(data)
    modes = []
    for y_i in range(np.shape(data)[1]):
        # Count frequencies over the observed (non-NaN) values only, so
        # NaN itself can never be selected as a column's mode
        col = data[:, y_i]
        col = col[~np.isnan(col)]
        uniques, counts = np.unique(col, return_counts=True)
        max_count = np.max(counts)
        modes.append(uniques[counts == max_count])  # all tied modes
    for x_i, y_i in null_xy:
        # Break ties by picking one of the column's modes at random
        data[x_i][y_i] = np.random.choice(modes[y_i])
    return data
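# Usage sketch for `mode`. In column 1 below, the values 2. and 5. tie
# at two occurrences each, so the NaN is replaced by one of them at
# random:
#
#     >>> raw = np.array([[1., 2.], [1., 2.], [3., 5.], [3., 5.], [3., np.nan]])
#     >>> mode(raw)[4][1] in (2., 5.)
#     True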
def count_missing(data):
    """ Calculate the total percentage of missing values and the
    percentage in each column.

    Parameters
    ----------
    data: np.ndarray
        Data to impute.

    Returns
    -------
    dict
        Percentage of missing values in total and in each column.
    """
    size = len(data.flatten())
    null_xy = find_null(data)
    # One counter per column that contains at least one missing value
    counter = {y: 0. for y in np.unique(null_xy.T[1])}
    change_in_percentage = 1. / size
    for _, y in null_xy:
        counter[y] += change_in_percentage
    total_missing = len(null_xy) / size
    counter["total"] = total_missing
    return counter
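# Usage sketch for `count_missing`. Keys are the indices of columns
# containing NaNs plus a "total" entry; values are fractions of the
# whole array, so a single NaN in a 2x2 array reports 0.25 in both:
#
#     >>> raw = np.array([[1., np.nan], [3., 4.]])
#     >>> counts = count_missing(raw)
#     >>> counts["total"], counts[1]
#     (0.25, 0.25)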
def locf(data, axis=0):
    """ Last Observation Carried Forward

    For each missing index, use the value of the row before it (same
    column). If the missing value is in the first row, look ahead
    instead: keep moving down the column until a non-NaN value is found,
    then fill all the rows before it with that value.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: boolean (optional)
        0 if time series is in row format (Ex. data[0][:] is 1st data point).
        1 if time series is in col format (Ex. data[:][0] is 1st data point).

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if axis == 0:
        # Work column-wise internally; transposed back before returning
        data = np.transpose(data)
    elif axis == 1:
        pass
    else:
        raise BadInputError("Error: Axis value is invalid, please use "
                            "either 0 (row format) or 1 (column format)")
    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        if x_i - 1 > -1:
            # Simplest scenario: carry the previous row's value forward
            data[x_i][y_i] = data[x_i - 1][y_i]
        else:
            # Missing value in the first row: look n rows forward
            x_residuals = np.shape(data)[0] - x_i - 1  # n data points left
            val_found = False
            for i in range(1, x_residuals + 1):
                if not np.isnan(data[x_i + i][y_i]):
                    val_found = True
                    break
            if val_found:
                # pylint: disable=undefined-loop-variable
                for x_nan in range(i):
                    data[x_i + x_nan][y_i] = data[x_i + i][y_i]
            else:
                raise Exception("Error: Entire Column is NaN")
    if axis == 0:
        data = np.transpose(data)
    return data
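# Usage sketch for `locf` in the default row format (axis=0), where each
# row is one time series; the last observed value is carried across the
# gap:
#
#     >>> raw = np.array([[1., np.nan, np.nan, 4.],
#     ...                 [5., 6., np.nan, 8.]])
#     >>> locf(raw)
#     array([[1., 1., 1., 4.],
#            [5., 6., 6., 8.]])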
def em(data, loops=50):
    """ Imputes given data using expectation maximization.

    E-step: Calculates the expected complete data log likelihood ratio.
    M-step: Finds the parameters that maximize the log likelihood of
    the complete data.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    loops: int
        Number of em iterations to run before breaking.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        col = data[:, int(y_i)]
        mu = col[~np.isnan(col)].mean()
        std = col[~np.isnan(col)].std()
        col[x_i] = np.random.normal(loc=mu, scale=std)
        previous, i = 1, 1
        for i in range(loops):
            # Expectation
            mu = col[~np.isnan(col)].mean()
            std = col[~np.isnan(col)].std()
            # Maximization
            col[x_i] = np.random.normal(loc=mu, scale=std)
            # Break out of the loop if the value hasn't changed by at
            # least 10% and the loop has run at least 5 times
            delta = np.abs(col[x_i] - previous) / np.abs(previous)
            if i > 5 and delta < 0.1:
                data[x_i][y_i] = col[x_i]
                break
            data[x_i][y_i] = col[x_i]
            previous = col[x_i]
    return data
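# Usage sketch for `em`; each NaN is repeatedly redrawn from a normal
# distribution fit to its column's observed values, so the filled-in
# value is random but centred near the column mean:
#
#     >>> raw = np.array([[1., 2.], [3., 4.], [np.nan, 6.]])
#     >>> filled = em(raw)
#     >>> np.isnan(filled).any()
#     False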
def impute_SOM(data, n):
    """ Impute missing values using a self-organising map (SOM).

    NOTE: this relies on a `SOM` class and sklearn's `SimpleImputer`
    being importable at module level; the SOM is assumed to expose
    `train`, `train_result`, `getneighbor` and a weight matrix `W`.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    n: int
        Side length of the (n x n) SOM grid.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    # Zero-fill a working copy so the SOM can be trained on complete data
    imputer = SimpleImputer(missing_values=np.nan, strategy='constant',
                            fill_value=0)
    data_fill0 = imputer.fit_transform(data)
    data_miss = data.copy()
    som = SOM(data_fill0, (n, n), 3, 300)
    som.train()
    res = np.array(som.train_result()).reshape(-1, 1)
    null_set = find_null(data)
    for i1, i2 in null_set:
        # Collect the winning node's neighbourhood and average the
        # corresponding weight vectors to fill the missing entry
        neighbor = som.getneighbor(res[i1][0], 1)
        activation_group = neighbor[0]
        for j in neighbor:
            activation_group = activation_group | j
        activation_group = np.array([som.W.T[p] for p in activation_group])
        data_miss[i1, i2] = activation_group.mean(axis=0)[i2]
    return data_miss
def mean(data):
    """ Substitute missing values with the mean of that column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        # Column `y_i` with its NaN entries removed
        col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col_wo_nan)
        data[x_i][y_i] = new_value
    return data
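# Usage sketch for `mean`; the NaN in column 1 becomes the mean of the
# observed values 2. and 8.:
#
#     >>> raw = np.array([[1., 2.], [3., np.nan], [5., 8.]])
#     >>> mean(raw)
#     array([[1., 2.],
#            [3., 5.],
#            [5., 8.]])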
def random(data):
    """ Fill missing values with a randomly selected value from the same
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    null_xy = find_null(data)
    for x, y in null_xy:
        # Draw from the unique observed (non-NaN) values of column `y`
        uniques = np.unique(data[:, y])
        uniques = uniques[~np.isnan(uniques)]
        data[x][y] = np.random.choice(uniques)
    return data
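# Usage sketch for `random`; the replacement is always one of the
# observed values of the same column:
#
#     >>> raw = np.array([[1., 2.], [3., np.nan], [5., 8.]])
#     >>> random(raw)[1][1] in (2., 8.)
#     True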
def fast_knn(data, k=3, **kwargs):
    """ Impute using a variant of the nearest neighbours approach

    Basic idea: Impute array and then use the resulting complete array
    to construct a KDTree. Use this KDTree to compute nearest
    neighbours. After finding `k` nearest neighbours, take the weighted
    average of them.

    This approach is much, much faster than the other implementation
    (fit+transform for each subset) which is almost prohibitively
    expensive.

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    k: int, optional
        Number of nearest neighbours to average over.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    null_xy = find_null(data)
    data_c = mean(data)
    kdtree = KDTree(data_c)
    for x_i, y_i in null_xy:
        distances, indices = kdtree.query(data_c[x_i], k=k + 1)
        # Will always return itself in the first index. Delete it.
        distances, indices = distances[1:], indices[1:]
        weights = (np.sum(distances) - distances) / np.sum(distances)
        # Make weights sum to 1
        weights_unit = weights / np.sum(weights)
        # Assign the missing value the weighted average of its `k` nearest
        # neighbours' values in the same column
        data[x_i][y_i] = np.dot(weights_unit,
                                [data_c[ind][y_i] for ind in indices])
    return data
def median(data):
    """ Substitute missing values with the median (middle value) of that
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    null_xy = find_null(data)
    cols_missing = set(null_xy.T[1])
    # Compute each affected column's median once, then reuse it
    medians = {}
    for y_i in cols_missing:
        cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        median_y = np.median(cols_wo_nan)
        medians[str(y_i)] = median_y
    for x_i, y_i in null_xy:
        data[x_i][y_i] = medians[str(y_i)]
    return data
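# Usage sketch for `median`; the median of the observed column values
# [2., 8., 10.] is 8.:
#
#     >>> raw = np.array([[1., 2.], [3., np.nan], [5., 8.], [7., 10.]])
#     >>> median(raw)[1][1]
#     8.0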
def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
             **kwargs):
    """ Impute using a variant of the nearest neighbours approach

    Basic idea: Impute array with a basic mean impute and then use the
    resulting complete array to construct a KDTree. Use this KDTree to
    compute nearest neighbours. After finding `k` nearest neighbours,
    take the weighted average of them. Basically, find the nearest row
    in terms of distance

    This approach is much, much faster than the other implementation
    (fit+transform for each subset) which is almost prohibitively
    expensive.

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    k: int, optional
        Parameter used for method querying the KDTree class object.
        Number of neighbours used in the KNN query. Refer to the docs for
        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    eps: nonnegative float, optional
        Parameter used for method querying the KDTree class object.
        From the SciPy docs: "Return approximate nearest neighbors; the
        kth returned value is guaranteed to be no further than (1+eps)
        times the distance to the real kth nearest neighbor". Refer to
        the docs for
        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    p : float, 1<=p<=infinity, optional
        Parameter used for method querying the KDTree class object.
        Straight from the SciPy docs: "Which Minkowski p-norm to use. 1
        is the sum-of-absolute-values Manhattan distance, 2 is the usual
        Euclidean distance, infinity is the maximum-coordinate-difference
        distance". Refer to the docs for
        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    distance_upper_bound : nonnegative float, optional
        Parameter used for method querying the KDTree class object.
        Straight from the SciPy docs: "Return only neighbors within this
        distance. This is used to prune tree searches, so if you are
        doing a series of nearest-neighbor queries, it may help to
        supply the distance to the nearest neighbor of the most recent
        point." Refer to the docs for
        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    leafsize: int, optional
        Parameter used for construction of the `KDTree` class object.
        Straight from the SciPy docs: "The number of points at which the
        algorithm switches over to brute-force. Has to be positive".
        Refer to the docs for
        [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
        for more information.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Examples
    --------
    >>> data = np.arange(25).reshape((5, 5)).astype(float)
    >>> data[0][2] = np.nan
    >>> data
    array([[ 0.,  1., nan,  3.,  4.],
           [ 5.,  6.,  7.,  8.,  9.],
           [10., 11., 12., 13., 14.],
           [15., 16., 17., 18., 19.],
           [20., 21., 22., 23., 24.]])

    >> fast_knn(data, k=1) # Weighted average (by distance) of nearest 1 neighbour
    array([[ 0.,  1.,  7.,  3.,  4.],
           [ 5.,  6.,  7.,  8.,  9.],
           [10., 11., 12., 13., 14.],
           [15., 16., 17., 18., 19.],
           [20., 21., 22., 23., 24.]])

    >> fast_knn(data, k=2) # Weighted average of nearest 2 neighbours
    array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
           [10.        , 11.        , 12.        , 13.        , 14.        ],
           [15.        , 16.        , 17.        , 18.        , 19.        ],
           [20.        , 21.        , 22.        , 23.        , 24.        ]])

    >> fast_knn(data, k=3)
    array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
           [10.        , 11.        , 12.        , 13.        , 14.        ],
           [15.        , 16.        , 17.        , 18.        , 19.        ],
           [20.        , 21.        , 22.        , 23.        , 24.        ]])

    >> fast_knn(data, k=5) # There are at most only 4 neighbours. Raises error
    ...
    IndexError: index 5 is out of bounds for axis 0 with size 5
    """
    null_xy = find_null(data)
    data_c = mean(data)
    kdtree = KDTree(data_c, leafsize=leafsize)
    for x_i, y_i in null_xy:
        distances, indices = kdtree.query(
            data_c[x_i], k=k + 1, eps=eps, p=p,
            distance_upper_bound=distance_upper_bound)
        # Will always return itself in the first index. Delete it.
        distances, indices = distances[1:], indices[1:]
        # Weights are proportional to raw distance; the example outputs
        # above were generated with this weighting
        weights = distances / np.sum(distances)
        # Assign missing value the weighted average of `k` nearest neighbours
        data[x_i][y_i] = np.dot(weights,
                                [data_c[ind][y_i] for ind in indices])
    return data
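# Note: the weighting above gives *larger* weights to farther neighbours,
# which is what the docstring's example outputs reflect. If
# inverse-distance weighting is wanted instead, a common alternative (an
# assumption, not part of this module) is Shepard's method:

def _shepard_weights(distances, power=2):
    """Hypothetical helper: weights inversely proportional to distance.

    Assumes all distances are strictly positive (an exact-duplicate row
    at distance 0 would need special handling)."""
    inv = 1.0 / np.power(distances, power)
    return inv / np.sum(inv)  # normalise so the weights sum to 1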
def buck_iterative(data):
    """ Iterative variant of Buck's method

    - Variable to regress on is chosen at random.
    - EM-type iterative regression loop stops after the change in
      prediction from the previous prediction is < 10% for all columns
      with missing values

    A Method of Estimation of Missing Values in Multivariate Data
    Suitable for use with an Electronic Computer
    S. F. Buck
    Journal of the Royal Statistical Society. Series B (Methodological)
    Vol. 22, No. 2 (1960), pp. 302-306

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    null_xy = find_null(data)
    # Add a column of zeros to the index values
    null_xyv = np.append(null_xy, np.zeros((np.shape(null_xy)[0], 1)), axis=1)
    null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
    temp = []
    cols_missing = {y for _, y, _ in null_xyv}
    # Step 1: Simple imputation; these are just placeholders
    for x_i, y_i, value in null_xyv:
        # Column containing the nan value, without the nan value
        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col)
        data[x_i][y_i] = new_value
        temp.append([x_i, y_i, new_value])
    null_xyv = temp
    # Step 5: Repeat steps 2 - 4 until convergence
    converged = [False] * len(null_xyv)
    while not all(converged):
        # Step 2: Placeholders are set back to missing for one variable/column
        dependent_col = int(np.random.choice(list(cols_missing)))
        missing_xs = [int(x) for x, y, value in null_xyv
                      if y == dependent_col]
        # Step 3: Perform linear regression using the other variables
        x_train, y_train = [], []
        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)
        # Step 4: Missing values for the missing variable/column are replaced
        # with predictions from our new linear regression model
        # For null indices with the dependent column that was randomly chosen
        for i, z in enumerate(null_xyv):
            x_i, y_i = z[0], z[1]
            value = data[x_i, y_i]
            if y_i == dependent_col:
                # Row `x_i` without the value in the dependent column
                new_value = model.predict(
                    [np.delete(data[x_i], dependent_col)])[0]
                data[x_i][y_i] = new_value
                if value == 0.0:
                    # Guard against division by zero in the relative delta
                    delta = (new_value - value) / 0.01
                else:
                    delta = (new_value - value) / value
                converged[i] = abs(delta) < 0.1
    return data
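# Usage sketch for `buck_iterative` (assumes `LinearRegression` is
# sklearn.linear_model.LinearRegression, imported at module level):
#
#     >>> raw = np.random.rand(20, 4)
#     >>> raw[3][2] = np.nan
#     >>> np.isnan(buck_iterative(raw)).any()
#     False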
def test_missing_values_present():
    """ Check that the dataset is corrupted (missing values present)"""
    assert find_null(data).size != 0
def mice(data, **kwargs):
    """Multivariate Imputation by Chained Equations

    Reference:
        Buuren, S. V., & Groothuis-Oudshoorn, K. (2011). Mice: Multivariate
        Imputation by Chained Equations in R. Journal of Statistical
        Software, 45(3). doi:10.18637/jss.v045.i03

    Implementation follows the main idea from the paper above. Differs in
    the decision of which variable to regress on (here, it is chosen at
    random). Also differs in the stopping criterion (here the model stops
    after the change in prediction from the previous prediction is less
    than 10%).

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    null_xy = find_null(data)
    # Add a column of zeros to the index values
    null_xyv = np.append(null_xy, np.zeros((np.shape(null_xy)[0], 1)), axis=1)
    null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
    temp = []
    cols_missing = set([y for _, y, _ in null_xyv])
    # Step 1: Simple imputation; these are just placeholders
    for x_i, y_i, value in null_xyv:
        # Column containing the nan value, without the nan value
        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col)
        data[x_i][y_i] = new_value
        temp.append([x_i, y_i, new_value])
    null_xyv = temp
    # Step 5: Repeat steps 2 - 4 until convergence
    converged = [False] * len(null_xyv)
    while not all(converged):
        # Step 2: Placeholders are set back to missing for one variable/column
        dependent_col = int(np.random.choice(list(cols_missing)))
        missing_xs = [int(x) for x, y, value in null_xyv
                      if y == dependent_col]
        # Step 3: Perform linear regression using the other variables
        x_train, y_train = [], []
        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)
        # Step 4: Missing values for the missing variable/column are replaced
        # with predictions from our new linear regression model
        # For null indices with the dependent column that was randomly chosen
        for i, (x_i, y_i, value) in enumerate(null_xyv):
            if y_i == dependent_col:
                # Row `x_i` without the value in the dependent column
                new_value = model.predict(
                    [np.delete(data[x_i], dependent_col)])[0]
                data[x_i][y_i] = new_value
                null_xyv[i] = [x_i, y_i, new_value]
                if value == 0.0:
                    # Guard against division by zero in the relative delta
                    delta = (new_value - value) / 0.01
                else:
                    delta = (new_value - value) / value
                converged[i] = abs(delta) < 0.1
    return data
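# Usage sketch for `mice` (same module-level sklearn LinearRegression
# assumption as `buck_iterative`):
#
#     >>> raw = np.random.rand(30, 4)
#     >>> raw[0][1] = np.nan
#     >>> raw[5][3] = np.nan
#     >>> np.isnan(mice(raw)).any()
#     False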
def test_missing_values_present(self):
    """ Check that the dataset is corrupted (missing values present)"""
    self.assertTrue(find_null(self.data).size != 0)
def _nan_exists(data):
    """ True if there is at least one np.nan in the array"""
    null_xy = find_null(data)
    return len(null_xy) > 0
def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
                  inplace=False, **kwargs):
    """ Interpolate the missing values based on nearby values.

    For example, with an array like this:

        array([[-1.24940, -1.38673, -0.03214945,  0.08255145, -0.007415],
               [ 2.14662,  0.32758 , -0.82601414,  1.78124027,  0.873998],
               [-0.41400, -0.977629,         nan, -1.39255344,  1.680435],
               [ 0.40975,  1.067599,  0.29152388, -1.70160145, -0.565226],
               [-0.54592, -1.126187,  2.04004377,  0.16664863, -0.010677]])

    Using a `k` or window size of 3, the one missing value would be set
    to -1.18509122 (the mean of its two non-NaN neighbours). The window
    operates on the horizontal axis.

    Usage
    -----
    The parameters default the function to a moving mean. You may want to
    change the default window size:

        moving_window(data, wsize=10)

    To only look at past data (null value is at the rightmost index in
    the window):

        moving_window(data, nindex=-1)

    To use a custom function:

        moving_window(data, func=np.median)

    You can also do something like take 1.5x the max of previous values
    in the window:

        moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    nindex: int
        Null index. Index of the null value inside the moving average
        window. Use cases: Say you wanted to make a value skewed toward
        the left or right side. 0 would only take the average of values
        from the right and -1 would only take the average of values from
        the left.
    wsize: int
        Window size. Size of the moving average window/area of values
        being used for each local imputation. This number includes the
        missing value.
    errors: {"raise", "coerce", "ignore"}
        Errors will occur with the indexing of the windows - for example
        if there is a nan at data[x][0] and `nindex` is set to -1 or
        there is a nan at data[x][-1] and `nindex` is set to 0. `"raise"`
        will raise an error, `"coerce"` will try again using an nindex
        set to the middle and `"ignore"` will just leave it as a nan.
    inplace: {True, False}
        Whether to return a copy or run on the passed-in array.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if errors == "ignore":
        raise Exception("`errors` value `ignore` not implemented yet. Sorry!")

    if not inplace:
        data = data.copy()

    if nindex is None:  # If using equal window side lengths
        assert wsize % 2 == 1, "The parameter `wsize` should not be even "\
            "if the value `nindex` is not set since it defaults to the "\
            "midpoint and an even `wsize` makes the midpoint ambiguous"
        wside_left = wsize // 2
        wside_right = wsize // 2
    else:  # If using custom window side lengths
        assert nindex < wsize, "The null index must be smaller than the "\
            "window size"
        if nindex == -1:
            wside_left = wsize - 1
            wside_right = 0
        else:
            wside_left = nindex
            wside_right = wsize - nindex - 1

    while True:
        null_xy = find_null(data)
        n_null_prev = len(null_xy)
        for x_i, y_i in null_xy:
            # Clamp the window to the bounds of the row
            left_i = max(0, y_i - wside_left)
            right_i = min(np.shape(data)[1], y_i + wside_right + 1)
            window = data[x_i, left_i:right_i]
            window_not_null = window[~np.isnan(window)]

            if len(window_not_null) > 0:
                try:
                    data[x_i][y_i] = func(window_not_null)
                    continue
                except Exception as e:
                    if errors == "raise":
                        raise e

            # Aggregate function didn't work for some reason; retry with
            # the null value at the midpoint of the window
            if errors == "coerce":
                mid_side = wsize // 2
                left_i = max(0, y_i - mid_side)
                right_i = min(np.shape(data)[1], y_i + mid_side + 1)
                window = data[x_i, left_i:right_i]
                window_not_null = window[~np.isnan(window)]
                try:
                    data[x_i][y_i] = func(window_not_null)
                except Exception:
                    pass
        if n_null_prev == len(find_null(data)):
            break
    return data
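# Usage sketch for `moving_window` with a past-only window (nindex=-1)
# of size 3, i.e. the NaN plus the two values before it:
#
#     >>> raw = np.array([[1., 2., 3., np.nan, 5.]])
#     >>> moving_window(raw, nindex=-1, wsize=3)
#     array([[1. , 2. , 3. , 2.5, 5. ]])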