def test_1d(self):
    """ Check 1d array, should return false"""
    _redirect_stdout(True)
    arr = np.array([np.nan, 2])
    output = checks(arr)
    _redirect_stdout(False)
    self.assertFalse(output)
def arima(data, p, d, q):
    """Autoregressive Integrated Moving Average Imputation

    Parameters
    ----------
    data: numpy.ndarray
        The matrix with missing values that you want to impute.
    p: int
        Number of autoregressive terms.
    d: int
        Number of nonseasonal differences needed for stationarity.
    q: int
        Number of lagged forecast errors in the prediction equation.

    Returns
    -------
    numpy.ndarray
    """
    # Verify inputs
    if not checks(data):
        raise Exception("Checks failed")
    if not isinstance(data, np.ndarray):
        raise Exception("data must be a numpy.ndarray")
    try:
        p, d, q = int(p), int(d), int(q)
    except (ValueError, TypeError):
        raise Exception("p, d and q must be castable to int")
    # Arima: the model-fitting step is not implemented yet; for now,
    # locate each missing entry and report its coordinates
    null_xy = find_null(data)
    for x, y in null_xy:
        print(x, y)
    return data
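# A hypothetical usage sketch for `arima` (the `_demo_arima` helper and its
# data are illustrative, not part of the module). Since the model-fitting
# step above is still a stub, this only exercises input validation and the
# discovery of missing entries.
def _demo_arima():
    data = np.array([[1., 2.], [np.nan, 4.], [5., 6.]])
    # Prints "1 0", the (row, col) index of the single NaN, and returns
    # the matrix otherwise unchanged
    return arima(data, p=1, d=0, q=1)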
def test_3d_false(self):
    """ Check 3d array without setting allow_3d=True, should return false"""
    _redirect_stdout(True)
    arr = np.ones((5, 5, 3))
    arr[0][0][0] = np.nan
    output = checks(arr)
    _redirect_stdout(False)
    self.assertFalse(output)
def test_correct_input(self):
    """ Test that an array that satisfies all checks returns True"""
    # Integer np.ndarray (check: `_is_ndarray`, `_shape_2d`)
    arr = np.array([[1, 2], [3, 4]])
    # Cast the integer array to float (check: `_dtype_float`); note that
    # `astype` casts the values, whereas assigning to `arr.dtype` would
    # reinterpret the raw buffer
    arr = arr.astype(np.float64)
    # Add a nan value to the array (check: `_nan_exists`)
    arr[0][0] = np.nan
    output = checks(arr)
    self.assertTrue(output)
def locf(data, axis=0):
    """ Last Observation Carried Forward

    For each set of missing indices, use the value of one row before (same
    column). In the case that the missing value is in the first row, look
    one row ahead instead. If this next row is also NaN, look to the next
    row. Repeat until you find a row in this column that's not NaN. All the
    rows before will be filled with this value.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: int (optional)
        0 if time series is in row format (Ex. data[0][:] is 1st data point).
        1 if time series is in col format (Ex. data[:][0] is 1st data point).

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if not checks(data):
        raise Exception("Checks failed")
    if axis == 0:
        data = np.transpose(data)
    elif axis == 1:
        pass
    else:
        raise Exception("Axis should be either 0 or 1")
    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        # Simplest scenario, look one row back
        if x_i - 1 > -1:
            data[x_i][y_i] = data[x_i - 1][y_i]
        # Look n rows forward
        else:
            x_residuals = np.shape(data)[0] - x_i - 1  # n datapoints left
            val_found = False
            # Range must include the last row, hence x_residuals + 1
            for i in range(1, x_residuals + 1):
                if not np.isnan(data[x_i + i][y_i]):
                    val_found = True
                    break
            if val_found:
                # pylint: disable=undefined-loop-variable
                for x_nan in range(i):
                    data[x_i + x_nan][y_i] = data[x_i + i][y_i]
            else:
                raise Exception("Entire column is NaN")
    # Undo the earlier transpose so the output shape matches the input
    if axis == 0:
        data = np.transpose(data)
    return data
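# A minimal usage sketch for `locf` (the `_demo_locf` helper and its data
# are hypothetical). It assumes this module's `checks` and `find_null`
# helpers are available, as in the functions above.
def _demo_locf():
    # Each row is one time series; axis=0 tells locf to read them that way
    data = np.array([[1., np.nan, 3.],
                     [np.nan, 5., 6.]])
    # The NaN at t=1 of series 0 is carried forward from t=0 (-> 1.0);
    # the NaN at t=0 of series 1 is back-filled from t=1 (-> 5.0)
    return locf(data, axis=0)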
def em(data, loops=50, dtype="cont"):
    """ Imputes given data using expectation maximization.

    E-step: Calculates the expected complete data log likelihood ratio.
    M-step: Finds the parameters that maximize the log likelihood of
    the complete data.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    loops: int
        Number of em iterations to run before breaking.
    dtype: ("cont", "disc")
        Indicates whether the possible values come from a continuous
        range or a categorical set.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if not checks(data):
        raise Exception("Checks failed")
    if dtype == "cont":
        null_xy = find_null(data)
        for x_i, y_i in null_xy:
            col = data[:, int(y_i)]
            mu = col[~np.isnan(col)].mean()
            std = col[~np.isnan(col)].std()
            col[x_i] = random.gauss(mu, std)
            previous = 1
            for i in range(loops):
                # Expectation
                mu = col[~np.isnan(col)].mean()
                std = col[~np.isnan(col)].std()
                # Maximization
                col[x_i] = random.gauss(mu, std)
                # Break out of the loop if the estimate changes less than
                # 10% and the loop has run at least 5 times; use the
                # absolute change so negative deltas don't break too early
                delta = abs((col[x_i] - previous) / previous)
                if i > 5 and delta < 0.1:
                    data[x_i][y_i] = col[x_i]
                    break
                data[x_i][y_i] = col[x_i]
                previous = col[x_i]
        return data
    else:
        raise Exception("Other dtypes not supported yet.")
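# A minimal usage sketch for `em` on hypothetical data. Each NaN is
# repeatedly re-drawn from a Gaussian fitted to its column until the
# estimate settles, so the imputed value varies from run to run.
def _demo_em():
    data = np.array([[1., 10.], [2., np.nan], [3., 30.]])
    # The NaN in column 1 is replaced with a draw centered near the
    # column's observed mean (20.0)
    return em(data, loops=50)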
def mean_imputation(data):
    """ Substitute missing values with the mean of that column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        # Column containing the nan value, with the nan values removed
        col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col_wo_nan)
        data[x_i][y_i] = new_value
    return data
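# A minimal usage sketch for `mean_imputation` on hypothetical data.
def _demo_mean_imputation():
    data = np.array([[1., np.nan], [2., 4.], [3., 8.]])
    # The NaN is replaced with the mean of its column: (4 + 8) / 2 = 6.0
    return mean_imputation(data)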
def random_imputation(data):
    """ Fill missing values with a randomly selected value from the same
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)
    for x, y in null_xy:
        # Unique non-nan values observed in this column
        uniques = np.unique(data[:, y])
        uniques = uniques[~np.isnan(uniques)]
        data[x][y] = np.random.choice(uniques)
    return data
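# A minimal usage sketch for `random_imputation` on hypothetical data.
# Seeding numpy's global RNG first makes the random choice reproducible.
def _demo_random_imputation():
    np.random.seed(0)
    data = np.array([[1., np.nan], [2., 4.], [3., 8.]])
    # The NaN is replaced with either 4.0 or 8.0, the unique non-NaN
    # values observed in its column
    return random_imputation(data)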
def test_not_nparray(self):
    """ If not an np.array, should return false"""
    _redirect_stdout(True)
    output = checks([[np.nan, 2.], [3, 4]])
    _redirect_stdout(False)
    self.assertFalse(output)
def test_return_type(self):
    """ Check return type, should return a boolean"""
    _redirect_stdout(True)
    output = checks(np.array([[1., 2.], [3, 4]]))
    _redirect_stdout(False)
    self.assertIsInstance(output, bool)
def test_2d(self):
    """ Check 2d array, should return true"""
    arr = np.array([[1., 2.], [3, 4]])
    arr[0][0] = np.nan
    output = checks(arr)
    self.assertTrue(output)
def test_3d_true(self):
    """ Check 3d array setting allow_3d=True, should return true"""
    arr = np.ones((5, 5, 3))
    arr[0][0][0] = np.nan
    output = checks(arr, allow_3d=True)
    self.assertTrue(output)
def mice(data):
    """Multivariate Imputation by Chained Equations

    Reference:
        Buuren, S. V., & Groothuis-Oudshoorn, K. (2011). Mice: Multivariate
        Imputation by Chained Equations in R. Journal of Statistical
        Software, 45(3). doi:10.18637/jss.v045.i03

    Implementation follows the main idea from the paper above. It differs
    in the choice of which variable to regress on (here it is chosen at
    random) and in the stopping criterion (here the model stops once the
    change from the previous prediction is less than 10%).

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)
    # Add a column of zeros to the index values
    null_xyv = np.append(null_xy, np.zeros((np.shape(null_xy)[0], 1)), axis=1)
    null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
    cols_missing = set([y for _, y, _ in null_xyv])

    # Step 1: Simple imputation with the column mean; these are placeholders
    for i, (x_i, y_i, value) in enumerate(null_xyv):
        # Column containing the nan value, with the nan values removed
        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col)
        data[x_i][y_i] = new_value
        null_xyv[i][2] = new_value

    # Step 5: Repeat steps 2 - 4 until every missing entry has converged
    converged = [False] * len(null_xyv)
    while not all(converged):
        # Step 2: Placeholders are set back to missing for one
        # variable/column, chosen at random
        dependent_col = int(np.random.choice(list(cols_missing)))
        missing_xs = [x for x, y, value in null_xyv if y == dependent_col]

        # Step 3: Perform linear regression using the other variables
        x_train, y_train = [], []
        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Step 4: Missing values for the chosen variable/column are replaced
        # with predictions from the new linear regression model
        for i, (x_i, y_i, value) in enumerate(null_xyv):
            if y_i != dependent_col:
                continue
            # Row 'x_i' without the dependent column, shaped as one sample
            row = np.delete(data[x_i], dependent_col).reshape(1, -1)
            new_value = model.predict(row)[0]
            data[x_i][y_i] = new_value
            null_xyv[i][2] = new_value
            # Mark as converged once the relative change drops below 10%;
            # guard against dividing by a zero-valued previous prediction
            delta = abs((new_value - value) / value) if value != 0 else 1.0
            if delta < 0.1:
                converged[i] = True
    return data
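# A minimal usage sketch for `mice` on hypothetical data, assuming sklearn's
# `LinearRegression` is imported at module level. Column 1 is exactly twice
# column 0, so the chained regressions should recover the missing entry as 6.
def _demo_mice():
    data = np.array([[1., 2.],
                     [2., 4.],
                     [3., np.nan],
                     [4., 8.]])
    return mice(data)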
def test_nan_exists(self):
    """ If no NaN, should return false"""
    _redirect_stdout(True)
    output = checks(np.array([[1., 2.], [3, 4]]))
    _redirect_stdout(False)
    self.assertFalse(output)