def test_missforest_zero_part2():
    """Imputing with ``missing_values=0`` must match imputing with NaN.

    Two matrices are generated with identical arguments except for the
    missing-value marker, then imputed by identically seeded forests.
    """
    seed = 1337
    zero_coded = gen_array(min_val=1, missing_values=0)
    nan_coded = gen_array(min_val=1, missing_values=np.nan)
    expected_means = np.nanmean(nan_coded, axis=0)

    forest_for_zero = MissForest(missing_values=0, random_state=seed)
    forest_for_nan = MissForest(missing_values=np.nan, random_state=seed)

    imputed_from_zero = forest_for_zero.fit_transform(zero_coded)
    imputed_from_nan = forest_for_nan.fit_transform(nan_coded)

    assert_array_equal(imputed_from_zero, imputed_from_nan)
    assert_array_equal(forest_for_zero.statistics_.get("col_means"),
                       expected_means)
def _fill_zeros_from_imputer(imputer, final, times_open, col):
    """Impute ``final[col]`` with *imputer* and copy the result into the zero cells of ``times_open[col]``."""
    val = imputer.fit_transform(pd.DataFrame(final[col]))[:, 0]
    other = pd.DataFrame(index=final.index, data=val, columns=[col])
    mask = (times_open[col] == 0)
    times_open.loc[mask, col] = other


def impute_times(final, times_open, times_closed, columns, imputation_method="mean"):
    """
    Impute open work items times with different methods.

    Zeros in ``times_open`` are treated as missing. ``times_open`` is modified
    in place and is also the return value.

    :param final: Complete preprocessed dataframe (data source for 'KNN' and 'forest')
    :param times_open: Dataframe of work items that are not closed
    :param times_closed: Dataframe of work items that are closed (data source for 'mean')
    :param columns: Columns to impute
    :param imputation_method: Choose between 'mean', 'KNN', 'forest'; any other
        value leaves ``times_open`` untouched
    :return: Dataframe of open work items with imputed values
    """
    if imputation_method == "mean":
        for col in columns:
            closed_mean = times_closed[col].mean()
            is_missing = (times_open[col] == 0)
            # Assign the masked column back to the frame. Calling
            # ``mask(..., inplace=True)`` on ``times_open[col]`` only edits a
            # temporary Series and is a no-op under pandas copy-on-write.
            times_open[col] = times_open[col].mask(is_missing, closed_mean)
    elif imputation_method in ("KNN", "forest"):
        if imputation_method == "KNN":
            imputer = KNNImputer(missing_values=0, col_max_missing=0.9)
        else:
            imputer = MissForest(missing_values=0)
        for col in columns:
            try:
                _fill_zeros_from_imputer(imputer, final, times_open, col)
            except ValueError:
                # Switch to a more permissive KNN imputer for this and all
                # following columns, and retry the column that just failed
                # (previously it was silently left unimputed).
                imputer = KNNImputer(missing_values=0, col_max_missing=0.99)
                try:
                    _fill_zeros_from_imputer(imputer, final, times_open, col)
                except ValueError:
                    # Still not imputable: keep the column as it is.
                    continue
    return times_open
def imputer(self, _steps, _answers, train_dataset, _X_train, _y_train,
            test_dataset, _X_test, _y_test, _headers):
    """Run the configured imputation step and rebuild train/test frames.

    For every entry of ``_steps`` equal to ``'imputer'`` the matching entry
    of ``_answers`` selects the model (``'Miss Forest'`` or
    ``'KNN Miss Values'``). The model is fitted on the training features and
    applied to both the training and the test features.

    Args:
        _steps: sequence of step names.
        _answers: sequence parallel to ``_steps``; ``_answers[i][step]`` is
            the option chosen for step ``i``.
        train_dataset: unused.
        _X_train, _y_train: training features and target.
        test_dataset: unused.
        _X_test, _y_test: test features and target.
        _headers: column names; all but the last name the features, the last
            names the target.

    Returns:
        Tuple ``(new_train_dataset, new_test_dataset)`` of DataFrames that
        hold the (possibly imputed) features followed by the target column.

    Raises:
        ValueError: if an ``'imputer'`` step carries an unknown option.
    """
    self.steps = _steps
    self.answers = _answers
    self.X_train = _X_train
    self.y_train = _y_train
    self.X_test = _X_test
    self.y_test = _y_test
    self.headers = _headers
    self.train_pipe_steps = []

    for i, s in enumerate(self.steps):
        if s != 'imputer':
            continue
        choice = self.answers[i][s]
        if choice == 'Miss Forest':
            model = MissForest()
        elif choice == 'KNN Miss Values':
            model = KNNImputer(n_neighbors=2)
        else:
            # Fail with a clear message instead of hitting an unbound local
            # (or silently reusing the model of an earlier step).
            raise ValueError('Unknown imputer option: {!r}'.format(choice))
        model.fit(self.X_train, self.y_train)
        self.X_train = model.transform(self.X_train)
        self.X_test = model.transform(self.X_test)

    feature_names = self.headers[:-1]
    target_name = self.headers[-1]
    self.new_train_dataset = pd.DataFrame(self.X_train, columns=feature_names)
    self.new_train_dataset[target_name] = self.y_train
    self.new_test_dataset = pd.DataFrame(self.X_test, columns=feature_names)
    self.new_test_dataset[target_name] = self.y_test
    return self.new_train_dataset, self.new_test_dataset
def deploy(file_name):
    """Impute the ``wdir`` and ``gph`` columns of a CSV file and export them.

    Reads ``<file_name>.csv``, keeps the last 30000 rows, treats ``-9999`` as
    missing, imputes with MissForest and writes the imputed table to
    ``1filmiss.xls``. The original row index is carried along as the first
    column (``reset_index``) and therefore also takes part in the imputation.

    Args:
        file_name: path of the input file without the ``.csv`` extension.
    """
    df = pd.read_csv(file_name + '.csv')
    df = df.tail(30000)
    df = df.replace(to_replace=-9999, value=np.nan)

    df_new = pd.DataFrame()
    df_new['wdir_new'] = df['wdir']
    df_new['gph'] = df['gph']
    df_new.reset_index(inplace=True)
    print(df_new.head())

    imputer = MissForest()
    df_new = pd.DataFrame(imputer.fit_transform(df_new))
    # ``rename`` returns a new frame; keep it so the columns really are
    # renamed in the printout and in the exported file.
    df_new = df_new.rename(columns={0: 'a', 1: 'b', 2: 'c'})
    print(df_new.columns)
    print(df_new.head())
    df_new.to_excel("1filmiss.xls")
def impute_values(df: pd.DataFrame, method: str = 'mean', **kwargs):
    """
    Fill the missing entries (np.nan or None) of a DataFrame.

    Args:
        * df: pd.DataFrame of (samples x features)
        * method: imputation strategy
            ** 'mean': column-mean imputation (``kwargs`` are not used)
            ** 'knn': K-NN imputation (see missingpy.KNNImputer)
            ** 'rf': random forest imputation (see missingpy.MissForest)
        * kwargs: forwarded to the imputer constructor for 'knn' and 'rf'

    Returns:
        * pd.DataFrame: imputed values (samples x features)
    """
    assert method in ('mean','knn','rf'), '{} not yet implemented.'.format(method)

    if method == 'mean':
        column_means = df.mean(0)
        return df.fillna(column_means)

    if method == 'knn':
        model = KNNImputer(**kwargs)
    elif method == 'rf':
        model = MissForest(**kwargs)
    else:
        # Only reachable when assertions are disabled.
        return None

    filled = model.fit_transform(df.values)
    return pd.DataFrame(filled, index=df.index, columns=df.columns)
def Impute_Data_RF(X_train, y_train, X_test, y_test, vals_mask, cols):
    """Impute features and target jointly with MissForest, then post-process.

    The target is appended to the features as a last column so the forest can
    use it as a predictor. The imputer is fitted on the training matrix and
    applied to the test matrix. Afterwards the imputed target is binarised
    (``>= 5`` becomes 1) and every feature is rounded: columns flagged
    ``'cat'`` in the module-level ``var`` table are rounded to integers and
    clipped to the module-level ``min_vals`` / ``max_vals``; all other
    columns are rounded to one decimal.

    Args:
        X_train, X_test: 2-D feature arrays (may contain NaN).
        y_train, y_test: 1-D target arrays (may contain NaN).
        vals_mask: unused.
        cols: unused.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp)``.
    """
    # Take the split position from the input itself rather than from a
    # module-level ``data`` frame: the target is by construction the column
    # right after the last feature of ``X_train``.
    n_features = X_train.shape[1]

    XY_incomplete_train = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    XY_incomplete_test = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    imputer = MissForest(random_state=1, n_jobs=-1)
    XY_completed_train = imputer.fit_transform(XY_incomplete_train)
    XY_completed_test = imputer.transform(XY_incomplete_test)

    X_train_imp = XY_completed_train[:, :n_features]
    y_train_imp = np.array(XY_completed_train[:, n_features] >= 5, dtype="int16")
    X_test_imp = XY_completed_test[:, :n_features]
    y_test_imp = np.array(XY_completed_test[:, n_features] >= 5, dtype="int16")

    for j in range(n_features):
        if var.iloc[j]['type'] == 'cat':
            X_train_imp[:, j] = np.clip(np.round(X_train_imp[:, j]), min_vals[j], max_vals[j])
            X_test_imp[:, j] = np.clip(np.round(X_test_imp[:, j]), min_vals[j], max_vals[j])
        else:
            X_train_imp[:, j] = np.round(X_train_imp[:, j], decimals=1)
            X_test_imp[:, j] = np.round(X_test_imp[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp)
def reconstruct(dataset, mask):
    """Reconstruct masked-out entries of ``dataset`` with MissForest.

    Args:
        dataset: 2-D DataFrame whose row labels include ``0 .. n_rows - 1``.
        mask: DataFrame with the same shape and row labels; entries equal to
            0 mark corrupted cells, every other entry multiplies the
            corresponding data value.

    Returns:
        DataFrame (default index and columns) with the imputed values.
    """
    print('Reconstructing using MissForest...')
    (datasetLen, dim) = np.shape(dataset)
    row_labels = list(range(datasetLen))

    # MissForest needs corrupted entries marked as NaN. Work on float copies
    # so the caller's mask is never modified and integer or boolean masks can
    # hold NaN.
    mask_values = mask.loc[row_labels].values.astype(float)
    mask_values[mask_values == 0] = np.nan
    data_values = dataset.loc[row_labels].values.astype(float)
    incomplete_dataset = pd.DataFrame(data_values * mask_values)

    imputer = MissForest(max_iter=5, verbose=0)
    reconstructed_dataset = imputer.fit_transform(incomplete_dataset)
    print(np.shape(reconstructed_dataset))
    print(reconstructed_dataset)
    return pd.DataFrame(reconstructed_dataset)
def test_statstics_fit_transform():
    """``statistics_`` must come from the fit() data, not the transform() data."""
    fit_matrix = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    expected_means = np.nanmean(fit_matrix, axis=0)

    # A different matrix is passed to transform().
    transform_matrix = np.array([
        [0, 0, 0, 0],
        [2, 2, 2, 1],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [9, 9, 8, 8],
        [16, 15, 18, 19],
    ])

    forest = MissForest()
    fitted = forest.fit(fit_matrix)
    fitted.transform(transform_matrix)
    assert_array_equal(forest.statistics_.get('col_means'), expected_means)
def test_default_with_invalid_input():
    """MissForest must reject all-missing columns and infinite values."""
    all_missing_msg = "One or more columns have all rows missing."
    inf_msg = "+/- inf values are not supported."
    inf_rows = [
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ]

    # Every row of the first column is missing.
    empty_first_column = np.array([
        [np.nan, 0, 0, 1],
        [np.nan, 1, 2, np.nan],
        [np.nan, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
    ])
    seeded_forest = MissForest(random_state=1337)
    assert_raise_message(ValueError, all_missing_msg,
                         seeded_forest.fit, empty_first_column)

    # inf in the matrix passed to fit()
    assert_raise_message(ValueError, inf_msg,
                         MissForest().fit, np.array(inf_rows))

    # inf only in the matrix passed to transform(); the fit() matrix is finite
    finite_matrix = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    fitted_forest = MissForest().fit(finite_matrix)
    assert_raise_message(ValueError, inf_msg,
                         fitted_forest.transform, np.array(inf_rows))
def test_missforest_imputation_shape():
    """The imputed matrix must have the same shape as the input matrix."""
    expected_shape = (10, 2)
    generated = gen_array(*expected_shape)
    completed = MissForest().fit_transform(generated)
    assert_equal(completed.shape, expected_shape)
def rf_imputing(data):
    """Impute every column except ``VALUE_PER_UNIT`` with a verbose MissForest.

    The dropped column is not re-attached; the return value is whatever
    ``MissForest.fit_transform`` yields for the remaining columns.
    """
    features = data.drop(columns='VALUE_PER_UNIT')
    forest = MissForest(verbose=True)
    return forest.fit_transform(features)
def test_missforest_categorical_multiple():
    """Two missing categorical values, imputed over several iterations.

    The MissForest result is compared with a hand-written loop that starts
    from the column modes and refits a RandomForestClassifier per column for
    the same number of iterations.
    """
    df = np.array([
        [0, 0, np.nan, 1],
        [0, 1, 1, 2],
        [0, 2, 1, 2],
        [np.nan, 4, 1, 5],
        [1, 7, 0, 7],
        [1, 8, 0, 8],
        [1, 15, 0, 19],
        [1, 18, 0, 17],
    ])
    cat_vars = [0, 2]
    # Older SciPy returns the modes with a leading axis of length 1, newer
    # SciPy (keepdims=False by default) returns them flat; accept both.
    mode_values = mode(df, axis=0, nan_policy='omit').mode
    statistics_mode = mode_values[0] if mode_values.ndim == 2 else mode_values
    n_rows, n_cols = df.shape

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0], statistics_mode[cat_vars])
def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42):
    """Impute a loaded dataset with MissForest and report the error.

    Seeds NumPy's global generator, loads the data through ``load_data``,
    imputes ``xmiss`` and scores the result with ``mse_own``.

    Returns:
        Tuple ``(x_filled, mse)``.
    """
    np.random.seed(rand_seed)

    # load_data yields seven values; only three are needed here.
    _, _, xmiss, _, mask, data_x, _ = load_data(
        p_miss,
        dataset=dataset,
        mode=mode,
        para=para,
        train=train,
        rand_seed=rand_seed,
    )

    forest = MissForest(decreasing=True, random_state=rand_seed, verbose=True)
    x_filled = forest.fit_transform(xmiss)

    mse = mse_own(x_filled, data_x, mask)
    print("MSE for MissForest: ", mse)
    return x_filled, mse
def Missforest_Imputation(self, train_index, test_index, final):
    """Impute ``self.full_miss_data`` with MissForest.

    With ``final`` truthy the whole table is fitted and transformed at once.
    Otherwise the forest is fitted on the ``train_index`` rows and applied to
    the ``test_index`` rows. Columns named in ``miss_info["obj_col"]`` are
    passed to MissForest as categorical variables.

    Side effects: sets ``self.numMI`` (every path except final-with-object-
    columns) and ``self.MI_pd`` (final-with-object-columns only).

    Returns:
        The imputed sample: an array, or a DataFrame for the
        final-with-object-columns path.
    """
    info = self.miss_info
    object_columns = deepcopy(info["obj_col"])
    column_names = info["original_column"]
    categorical_idx = [
        position for position, name in enumerate(column_names)
        if name in object_columns
    ]
    numeric_only = object_columns == []

    if final:
        if numeric_only:
            self.numMI = MissForest(max_depth=5).fit_transform(
                X=self.full_miss_data.values)
            return self.numMI
        completed = MissForest(verbose=0, n_jobs=-1, max_depth=5).fit_transform(
            X=self.full_miss_data.values, cat_vars=categorical_idx)
        self.MI_pd = pd.DataFrame(completed, columns=column_names)
        return self.MI_pd

    if numeric_only:
        fitted = MissForest(max_depth=5).fit(
            X=self.full_miss_data.iloc[train_index, :].values)
        self.numMI = fitted.transform(
            X=self.full_miss_data.iloc[test_index, :].values)
        return self.numMI

    fitted = MissForest(verbose=0, n_jobs=-1, max_depth=5).fit(
        X=self.full_miss_data.iloc[train_index, :].values,
        cat_vars=categorical_idx)
    completed = fitted.transform(self.full_miss_data.iloc[test_index, :].values)
    completed_frame = pd.DataFrame(completed, columns=column_names)
    self.numMI = completed_frame[self.notobj].values
    return completed_frame.values
def mf_impute(inp, subject=None, cols=None, categorical_variables=None):
    """Impute missing values with MissForest and return a completed copy.

    Args:
        inp: input DataFrame; it is deep-copied and never modified.
        subject: iterable of subject identifiers to restrict the imputation
            to (looked up with ``get_subject`` on the first column); ``None``
            uses every row.
        cols: columns to impute; ``None`` uses every column but the first.
        categorical_variables: names of the categorical columns. Those with
            object dtype are label-encoded before imputation and decoded
            afterwards.

    Returns:
        Copy of ``inp`` in which the selected rows/columns are imputed.

    Raises:
        Exception: if fewer than two columns are selected.
    """
    data = copy.deepcopy(inp)

    # If cols is None, perform for all columns (except the first column).
    if cols is None:
        cols = data.columns[1:]

    # If subject is None, perform for all subjects.
    if subject is None:
        inp = data[cols]
    else:
        # Collect the per-subject frames and concatenate once;
        # ``DataFrame.append`` no longer exists in pandas >= 2.0.
        frames = [
            get_subject(data, s, data.columns[0]).loc[:, cols]
            for s in subject
        ]
        inp = pandas.concat(frames) if frames else pandas.DataFrame()

    if len(inp.columns) < 2:
        raise Exception("Multiple variables must be given as input")

    # Encode string columns. Only categorical variables are encoded.
    labels = {}
    if categorical_variables is not None:
        for col in categorical_variables:
            if inp[col].dtype == np.dtype(object):
                encoded, _mapping, label = label_encode(inp[col])
                inp[col] = encoded
                labels[col] = label

    # MissForest expects the positions of the categorical columns.
    imputer = MissForest()
    cat_vars = None
    if categorical_variables is not None:
        column_names = list(inp.columns)
        cat_vars = [column_names.index(name) for name in categorical_variables]

    # Fit and transform the input.
    res = imputer.fit_transform(inp.values, cat_vars=cat_vars)
    res = pandas.DataFrame(res, index=inp.index, columns=inp.columns)

    # Convert encoded columns back to strings.
    for col in labels:
        res[col] = labels[col].inverse_transform(res[col].astype(int))

    data.loc[res.index, res.columns] = res
    return data
def super_fillna(pre_tr_x, pre_te_x, target_col, how="mean"):
    """Fill missing values in copies of the train and test frames.

    The fill statistics (or the fitted forest) always come from the training
    frame and are then applied to both frames.

    Args:
        pre_tr_x: training DataFrame (left untouched).
        pre_te_x: test DataFrame (left untouched).
        target_col: column selection the statistic or forest is computed on.
        how: ``"mean"``, ``"median"`` or ``"rf"`` (MissForest); any other
            value returns plain copies.

    Returns:
        Tuple ``(tr_x, te_x)`` of filled copies.
    """
    tr_x, te_x = pre_tr_x.copy(), pre_te_x.copy()

    if how in ("mean", "median"):
        selected = tr_x[target_col]
        fill_value = selected.mean() if how == "mean" else selected.median()
        tr_x = tr_x.fillna(fill_value)
        te_x = te_x.fillna(fill_value)
    elif how == "rf":
        forest = MissForest()
        tr_x[target_col] = forest.fit_transform(tr_x[target_col])
        te_x[target_col] = forest.transform(te_x[target_col])

    return tr_x, te_x
def define_imputer(self,impute_type):
    '''Create the imputer used for every iteration and store it on the instance.

    Input:
        impute_type: string, one of
            'simple'    -> SimpleImputer (median strategy)
            'iterative' -> IterativeImputer (median initial strategy)
            'forest'    -> MissForest
            Any other value leaves the instance unchanged.

    Output:
        None; the imputer is stored in self.imputer.
    '''
    if impute_type == 'forest':
        self.imputer = MissForest(random_state=self.random_state, n_jobs=-2)
        return

    if impute_type not in ('simple', 'iterative'):
        return

    with_indicator = self.model_args['add_missing_indicator']
    if impute_type == 'simple':
        self.imputer = SimpleImputer(
            missing_values=np.nan,
            strategy='median',
            add_indicator=with_indicator,
        )
    else:
        self.imputer = IterativeImputer(
            missing_values=np.nan,
            initial_strategy='median',
            add_indicator=with_indicator,
        )
def missforest_imputer(pd_data, random_state=None):
    """
    Fill missing values with the MissForest imputer.

    Inputs:
        pd_data: (DataFrame) Data containing missing values.
        random_state: (int, optional) Seed of the pseudo random number
            generator to use.

    Returns:
        pd_imputed: (DataFrame) Imputed data carrying the index and the
            columns of ``pd_data``.
    """
    forest = MissForest(random_state=random_state)
    filled = forest.fit_transform(pd_data)
    return pd.DataFrame(filled, index=pd_data.index, columns=pd_data.columns)
def test_missforest_zero():
    """Invalid input must be rejected when ``missing_values == 0``."""
    zero_forest = MissForest(missing_values=0, random_state=0)

    # Test with missing_values=0 when NaN present
    generated = gen_array(min_val=0)
    nan_msg = ("Input contains NaN, infinity or a value too large for %r."
               % generated.dtype)
    assert_raise_message(ValueError, nan_msg, zero_forest.fit, generated)

    # Test with all zeroes in a column (the third one)
    zero_column = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    all_missing_msg = "One or more columns have all rows missing."
    assert_raise_message(ValueError, all_missing_msg,
                         zero_forest.fit, zero_column)
def fit(self, dataset):
    """Train the standard imputation model on the static features.

    Nothing happens when ``dataset.static_feature`` is None. Otherwise the
    model named by ``self.imputation_model_name`` ('mice', 'missforest' or
    'knn') is created and fitted; for any other name the model already stored
    in ``self.imputation_model`` is fitted.

    Args:
        - dataset: incomplete dataset
    """
    if dataset.static_feature is None:
        return

    # Factories are wrapped in lambdas so only the selected class is touched.
    factories = {
        'mice': lambda: IterativeImputer(),
        'missforest': lambda: MissForest(),
        'knn': lambda: KNNImputer(),
    }
    make_model = factories.get(self.imputation_model_name)
    if make_model is not None:
        self.imputation_model = make_model()

    self.imputation_model.fit(dataset.static_feature)
    return
def test_missforest_numerical_single():
    """Default imputation of a single missing numerical value.

    The expected value is the prediction of a RandomForestRegressor fitted on
    the complete rows, using the same estimator count and seed as the imputer.
    """
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # Expected result: the input with only the missing cell replaced.
    # ``pred_val`` is a 1-element array; nesting it inside a list literal
    # makes a ragged sequence that recent NumPy refuses to convert.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
def fit(self, dataset):
    """Train the standard imputation model on the static features.

    Nothing happens when ``dataset.static_feature`` is None. 'missforest' and
    'knn' create a fresh model before fitting; for any other name except
    'mice' the model already stored in ``self.imputation_model`` is fitted.

    Args:
        - dataset: incomplete dataset

    Raises:
        NotImplementedError: when the configured model is 'mice'.
    """
    if dataset.static_feature is None:
        return

    model_name = self.imputation_model_name
    if model_name == "mice":
        # TODO: Resolve the below (IterativeImputer is currently unavailable).
        raise NotImplementedError(
            "IterativeImputer not implemented due to versioning issues with fancyimpute"
        )
    if model_name == "missforest":
        self.imputation_model = MissForest()
    elif model_name == "knn":
        self.imputation_model = KNNImputer()

    self.imputation_model.fit(dataset.static_feature)
    return
def test_missforest_categorical_single():
    """Default imputation of a single missing categorical value.

    The expected value is the prediction of a RandomForestClassifier fitted
    on the complete rows, using the same estimator count and seed as the
    imputer. ``cat_vars`` is accepted both as an int and as a list.
    """
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # Expected result: the input with only the missing cell replaced.
    # ``pred_val`` is a 1-element array; nesting it inside a list literal
    # makes a ragged sequence that recent NumPy refuses to convert.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
def test_missforest_mixed_multiple():
    """Mixed data: one categorical and one numerical value are missing.

    The MissForest result is compared with a hand-written loop that starts
    from the column mode / mean and refits a classifier (categorical column)
    or regressor (numerical column) for the same number of iterations.
    """
    df = np.array([
        [np.nan, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [1, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, np.nan],
    ])
    n_rows, n_cols = df.shape
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    # Older SciPy returns the modes with a leading axis of length 1, newer
    # SciPy (keepdims=False by default) returns them flat; accept both.
    mode_values = mode(df, axis=0, nan_policy='omit').mode
    statistics_mode = mode_values[0] if mode_values.ndim == 2 else mode_values
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100,
                                           random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0], statistics_mode[cat_vars])
def __init__(self):
    """Create the MissForest imputer (constructed with ``verbose=0``) and store it as ``self.imputer``."""
    self.imputer = MissForest(verbose=0)
# Select an image at random from the MNIST dataset.
SelectedImage = showImagesRandomImages(3)
missingPercentage = 0.2  # missing rate
# Insert missing values into the original image.
missingImage = generateMissingFig(SelectedImage, missingPercentage)

# Every imputation below is scored against the original image. The
# ``*_RMSE`` names promise a root-mean-squared error, so the square root of
# ``mean_squared_error`` (which returns the plain MSE) is taken.

# K-nearest-neighbours imputation
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_by_KNN = imputer.fit_transform(missingImage)
KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN) ** 0.5

# MissForest imputation
imputer = MissForest()
MissForest_imputed = imputer.fit_transform(missingImage)
MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed) ** 0.5

# MICE (IterativeImputer) imputation
imputer = IterativeImputer()
MICE_imputed = imputer.fit_transform(missingImage)
MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed) ** 0.5

# Probabilistic PCA imputation
# NOTE(review): the model is fitted on the complete image, not on
# ``missingImage`` - confirm this is intended.
ppca = PPCA()
ppca.fit(data=SelectedImage, d=100, verbose=True)
PPCA_imputed = ppca.transform(missingImage)
PPCA_RMSE = mean_squared_error(SelectedImage, PPCA_imputed) ** 0.5
def prepare_data(data, data_idxs, outcome, convert_categorical=True,
                 keep_cols=None, scaler=None, imputer=None, verbose=False,
                 seed=None):
    """Build the feature matrix and the outcome table for one outcome.

    Steps: take all but the last six columns as features, drop the columns
    listed in ``EXCLUDE_VARS``, optionally one-hot encode ``ethnicity`` and
    ``gender``, keep only samples with a positive event time, align the
    columns with ``keep_cols`` (or drop all-zero columns), impute missing
    values with MissForest and standardise ``NUMERICAL_VARS``.

    Args:
        data: DataFrame with the features first and the outcome columns
            ``censor_or_<outcome>_days`` / ``<outcome>_indicator``.
        data_idxs: one identifier per row of ``data``.
        outcome: outcome name used to build the two outcome column names.
        convert_categorical: one-hot encode ``ethnicity`` and ``gender``.
        keep_cols: columns (and order) to keep; missing ones are created and
            filled with 0.0. ``None`` drops the all-zero columns instead.
        scaler: fitted scaler to reuse; ``None`` fits a new StandardScaler.
        imputer: fitted imputer to reuse; ``None`` fits a new MissForest
            when missing values are present.
        verbose: print shapes, columns and summary statistics.
        seed: random state for a newly created MissForest.

    Returns:
        Tuple ``(X, y, scaler, imputer, data_idxs)``; ``imputer`` stays
        ``None`` when none was passed in and nothing had to be imputed.
    """
    X = data.iloc[:, 0:-6]  # TODO: get rid of magic number

    # remove excluded variables
    for v in EXCLUDE_VARS:
        if v in X.columns:
            print('dropped {} column...'.format(v))
            X = X.drop([v], axis=1)

    # convert categorical variables
    if convert_categorical:
        X = pd.concat([X, pd.get_dummies(X['ethnicity'])], axis=1)
        X = pd.concat([X, pd.get_dummies(X['gender'])], axis=1)
        X = X.drop(['ethnicity', 'gender'], axis=1)
        X = X.drop(['Other', 'Female'], axis=1)  # to avoid colinearity

    # extract outcomes
    names = {
        'time': 'censor_or_{}_days'.format(outcome),
        'event': '{}_indicator'.format(outcome),
    }
    y = data[[names['time'], names['event']]]

    # filter for appropriate samples
    prev_ct = len(y)
    pos_events = y.iloc[:, 0] > 0  # event times > 0
    X = X.loc[pos_events]
    y = y.loc[pos_events]
    data_idxs = [i for (i, inc) in zip(data_idxs, pos_events.tolist()) if inc]
    print('filtered out {} events with times < 0'.format(prev_ct - len(y)))

    if keep_cols is None:
        X = X.loc[:, (X != 0).any(axis=0)]  # drop columns w/ all zero
    else:
        for vr in keep_cols:
            if vr not in X.columns:
                X[vr] = 0.0  # impute with zero by default
        X = X[keep_cols]

    # check for nulls and impute
    x_null = np.sum(pd.isnull(X))
    y_null = np.sum(pd.isnull(y))
    if (x_null.sum() > 0) or (y_null.sum() > 0):
        print('Will impute...')
        print('NULL (X, y):', x_null, y_null)
        if imputer is None:
            print('Fitting MissForest...')
            imputer = MissForest(random_state=seed)
            X_data = imputer.fit_transform(X)
            print('Fitted.')
        else:
            X_data = imputer.transform(X)
        # Keep the row labels: without ``index=`` X would get a fresh
        # 0..n-1 index and no longer line up with ``y``.
        X = pd.DataFrame(data=X_data, index=X.index, columns=X.columns)

    # scale numerical values
    if scaler is None:
        scaler = StandardScaler()
        X[NUMERICAL_VARS] = scaler.fit_transform(X[NUMERICAL_VARS])
    else:
        X[NUMERICAL_VARS] = scaler.transform(X[NUMERICAL_VARS])

    if verbose:
        print('X.shape: {}, y.shape: {}'.format(X.shape, y.shape))
        print('Columns: {}'.format(X.columns))
        print('---------------- X ----------------\n{}'.format(X.describe()))
        print('---------------- y ----------------\n{}'.format(y.describe()))

    return X, y, scaler, imputer, data_idxs
def panel_data(train, years_ahead=1):
    """
    Impute the feature series with MissForest, then fit one panel model per target.

    The long-format input is reshaped into a (country, year) x series-code
    panel. The feature part of that panel is imputed with a random forest
    (MissForest), shifted by ``years_ahead`` rows and used as regressors in
    nine PanelOLS models with entity effects (targets ``y1`` .. ``y9``). Rows
    with ``year < 2007`` are used for fitting, rows with ``year == 2007`` for
    prediction. The RMSE of the predictions is printed.

    Args:
        train: DataFrame containing 'Country Name' and 'Series Code' columns;
            its first 36 columns are used as the yearly value columns.
        years_ahead: number of rows the imputed feature table is shifted
            before it is merged with the targets.

    Returns:
        Ypred: DataFrame of predictions for ``y1`` .. ``y9``.
    """
    # Wide -> long: one row per (country, series code, year).
    train_melt = pd.melt(train.iloc[:, 0:38],
                         id_vars=['Country Name', 'Series Code'],
                         value_vars=train.columns[0:36],
                         var_name='year',
                         value_name='value')
    # Year labels start with the four-digit year.
    train_melt['year'] = train_melt['year'].str[:4].astype(int)
    # Long -> panel: rows are (country, year), one column per series code.
    panel = train_melt.groupby(['Country Name', 'year', 'Series Code'])['value'].mean().unstack()

    # From column 9 onwards (features), keep only the series whose largest
    # per-country count of missing years is at most 18.
    left_feature = panel.iloc[:, 9:].isna().groupby('Country Name').sum().max(
        axis=0) <= 18
    pred = panel.iloc[:, 9:].iloc[:, left_feature.values]

    # construct matrix of features across countries: the per-country blocks
    # (in sorted country order) are placed side by side
    df = []
    ct_list = list(set(pred.index.get_level_values(0)))
    ct_list = sorted(ct_list)
    for i in ct_list:
        df.append(pred.loc[i])
    predictors = pd.concat(df, axis=1)

    # random forest imputation
    imputer = MissForest()
    predictors_imputed = imputer.fit_transform(predictors)

    panel.reset_index(inplace=True)
    # NOTE(review): the hard-coded ranges assume 9 target columns followed by
    # 1296 feature columns - confirm against the input data.
    panel.columns = ['Country Name', 'year'] + [
        'y' + str(i) for i in range(1, 10)
    ] + ['x' + str(i) for i in range(1, 1297)]

    # Undo the side-by-side layout: cut the imputed matrix into blocks of
    # ``nfeature`` columns and stack the blocks vertically.
    # NOTE(review): 214 is presumably the number of countries - confirm.
    nfeature = int(predictors.shape[1] / 214)
    split = list(range(nfeature, predictors_imputed.shape[1], nfeature))
    _ = np.split(predictors_imputed, split, 1)
    predictors_new = pd.DataFrame(np.vstack(_))
    predictors_new['year'] = panel.year
    predictors_new['Country Name'] = panel['Country Name']
    predictors_new.columns = [
        'x' + str(i) for i in range(1, pred.shape[1] + 1)
    ] + ['year', 'Country Name']

    # combine the updated feature matrix and responses; only columns without
    # any missing value are kept
    feature = predictors_new.isna().sum() <= 0  # change to 1
    panel_left = predictors_new.iloc[:, feature.values]
    # NOTE(review): ``shift`` moves every column of ``panel_left`` down,
    # including 'year' and 'Country Name' - confirm this is the intended lag.
    panel_comb = pd.merge(panel.iloc[:, 0:11], panel_left.shift(years_ahead))

    # Split prediction and target
    panel_train = panel_comb.loc[panel_comb.year < 2007]
    panel_train = panel_train.set_index(['Country Name', 'year'])
    panel_test = panel_comb.loc[panel_comb.year == 2007]
    panel_test = panel_test.set_index(['Country Name', 'year'])

    # panel data model (all warnings are silenced while the models are fitted)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        Ypred = pd.DataFrame()
        for i in range(1, 10):
            # NOTE(review): ``columns[11:]`` is taken after ``set_index``
            # removed two columns - confirm the offset selects the intended
            # regressors.
            formula = 'y' + str(i) + '~1+' + '+'.join(
                panel_train.columns[11:].values) + '+EntityEffects'
            mod = PanelOLS.from_formula(formula, panel_train)
            res = mod.fit(cov_type='clustered', cluster_entity=True)
            Ypred['y' + str(i)] = res.predict(data=panel_test).predictions

    # Eval: RMSE over all targets, ignoring missing entries
    Yval = panel_test.iloc[:, :9]
    rmse = np.sqrt(np.nanmean(np.power(Ypred - Yval, 2)))
    print(rmse)
    return Ypred
def _random_forest(self,df):
    """Impute ``df`` with MissForest (``random_state=10``).

    Returns a new DataFrame with the column labels of ``df`` and a default
    index.
    """
    forest = MissForest(random_state=10)
    return pd.DataFrame(forest.fit_transform(df), columns=df.columns)
# Build the test feature frame: select the raw columns, extract the titles,
# then dummy-encode.
my_test_data1 = test.loc[:, ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
test_data = dummy_encode(title_extract(my_test_data1), 2, 6, 0, 7)

# Feature scaling (Age)
# NOTE(review): the scaler is re-fitted on each frame, so the CV and test
# ages are standardised with their own statistics - confirm this is intended.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
CV_data[['Age']] = sc.fit_transform(CV_data[['Age']])
test_data[['Age']] = sc.fit_transform(test_data[['Age']])

from missingpy import MissForest

# Fit the imputer on the training features only, then apply it to the
# cross-validation and test sets.
imputer = MissForest(random_state=0)
my_imp = imputer.fit(train_data.drop(columns=['Survived', 'Weight']))

# The 'Survived' column is left out of the imputation and put back in front.
CV_data_missforest = pd.concat(
    [
        CV_data.Survived,
        pd.DataFrame(
            imputer.transform(CV_data.drop(columns='Survived')),
            columns=CV_data.columns[1:],
        ),
    ],
    axis=1,
)
test_data_missforest = pd.DataFrame(
    imputer.transform(test_data),
    columns=test_data.columns,
)

## The individuals in the training set now have their new weights, and the
## missing values in the cross-validation and test sets have been imputed
## with MissForest. The weighted logistic model is fitted in R, so the three
## tables are exported here.
train_data.to_excel(r'train_data.xlsx', index=False)
CV_data_missforest.to_excel(r'CV_data.xlsx', index=False)
test_data_missforest.to_excel(r'test_data.xlsx', index=False)