def deploy(file_name):
    """Impute the ``wdir`` and ``gph`` columns of ``<file_name>.csv``.

    The last 30000 rows of the CSV are kept, the sentinel ``-9999`` is
    treated as missing, and the selected columns are imputed with
    MissForest.  The imputed table is written to ``1filmiss.xls`` in the
    current working directory.

    Args:
        file_name: Path of the CSV file *without* the ``.csv`` suffix.

    Returns:
        None.  The result is only written to disk.
    """
    df = pd.read_csv(file_name + '.csv')
    df = df.tail(30000)
    df = df.replace(to_replace=-9999, value=np.nan)

    df_new = pd.DataFrame()
    df_new['wdir_new'] = df['wdir']
    df_new['gph'] = df['gph']
    # reset_index() keeps the old row labels as an extra leading column, so
    # three columns (old index, wdir_new, gph) are handed to the imputer.
    df_new.reset_index(inplace=True)
    print(df_new.head())

    imputer = MissForest()
    df_new = pd.DataFrame(imputer.fit_transform(df_new))
    # rename() returns a new frame; the original discarded the result so the
    # columns were still called 0, 1, 2 when written out.
    df_new = df_new.rename(columns={0: 'a', 1: 'b', 2: 'c'})
    print(df_new.columns)
    print(df_new.head())

    # NOTE(review): writing the legacy ``.xls`` format depends on an Excel
    # writer engine that recent pandas releases no longer ship - confirm the
    # target environment or switch the extension deliberately.
    df_new.to_excel("1filmiss.xls")
def reconstruct(dataset, mask):
    """Rebuild masked-out entries of ``dataset`` with MissForest.

    Args:
        dataset: 2-D table of values (DataFrame or array-like).
        mask: Table of the same shape.  Entries equal to ``0`` mark values
            to be treated as missing; every other entry multiplies the
            corresponding value (so ``1`` keeps it unchanged).

    Returns:
        pandas.DataFrame holding the imputed matrix (default integer
        index and columns).
    """
    print('Reconstructing using MissForest...')

    values = np.asarray(dataset, dtype=float)
    # Work on a float copy: the caller's mask is left untouched, and integer
    # or boolean masks can hold NaN after the conversion.
    weights = np.array(mask, dtype=float)
    # The imputer identifies corrupted entries by NaN, so every zero of the
    # mask becomes NaN before it is applied to the data.
    weights[weights == 0] = np.nan
    incomplete_dataset = pd.DataFrame(values * weights)

    imputer = MissForest(max_iter=5, verbose=0)
    reconstructed_dataset = imputer.fit_transform(incomplete_dataset)
    print(np.shape(reconstructed_dataset))
    print(reconstructed_dataset)
    return pd.DataFrame(reconstructed_dataset)
def Impute_Data_RF(X_train, y_train, X_test, y_test, vals_mask, cols):
    """Impute features and target jointly with MissForest.

    The target is appended to the features as a last column, the imputer is
    fitted on the training matrix and applied to the test matrix, and the
    result is split back into features and a binarised target.

    Args:
        X_train, X_test: 2-D feature matrices.
        y_train, y_test: 1-D target arrays (must support ``reshape``).
        vals_mask: Unused; kept for interface compatibility.
        cols: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp)``.  The
        targets are ``int16`` arrays that are 1 where the imputed target is
        ``>= 5`` and 0 otherwise.

    Note:
        Reads the names ``var``, ``min_vals`` and ``max_vals`` from the
        enclosing scope; they are not defined in this function.
    """
    XY_incomplete_train = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    XY_incomplete_test = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)
    # The target is always the last column of the stacked matrix, so derive
    # the split point from the matrix itself rather than from a global.
    n_features = XY_incomplete_train.shape[1] - 1

    imputer = MissForest(random_state=1, n_jobs=-1)
    XY_completed_train = imputer.fit_transform(XY_incomplete_train)
    XY_completed_test = imputer.transform(XY_incomplete_test)

    X_train_imp = XY_completed_train[:, :n_features]
    y_train_imp = np.array(XY_completed_train[:, n_features] >= 5, dtype="int16")
    X_test_imp = XY_completed_test[:, :n_features]
    y_test_imp = np.array(XY_completed_test[:, n_features] >= 5, dtype="int16")

    for j in range(n_features):
        if var.iloc[j]['type'] == 'cat':
            # Categorical columns: snap to whole numbers inside the known range.
            X_train_imp[:, j] = np.clip(np.round(X_train_imp[:, j]), min_vals[j], max_vals[j])
            X_test_imp[:, j] = np.clip(np.round(X_test_imp[:, j]), min_vals[j], max_vals[j])
        else:
            # Everything else is rounded to one decimal.
            X_train_imp[:, j] = np.round(X_train_imp[:, j], decimals=1)
            X_test_imp[:, j] = np.round(X_test_imp[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp)
def test_statstics_fit_transform():
    """statistics_ must describe the fit() data even if transform() sees other data."""
    fit_data = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    expected_means = np.nanmean(fit_data, axis=0)

    transform_data = np.array([
        [0, 0, 0, 0],
        [2, 2, 2, 1],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [9, 9, 8, 8],
        [16, 15, 18, 19],
    ])

    forest = MissForest()
    fitted = forest.fit(fit_data)
    fitted.transform(transform_data)

    # Column means recorded by the imputer come from the fit() matrix only.
    observed_means = forest.statistics_.get('col_means')
    assert_array_equal(observed_means, expected_means)
def test_missforest_imputation_shape():
    """The imputed matrix has the same shape as the input matrix."""
    expected_shape = (10, 2)
    data = gen_array(expected_shape[0], expected_shape[1])
    result = MissForest().fit_transform(data)
    assert_equal(result.shape, expected_shape)
def rf_imputing(data):
    """Impute every column except ``VALUE_PER_UNIT`` with MissForest.

    Returns whatever ``MissForest.fit_transform`` returns for the remaining
    columns; the ``VALUE_PER_UNIT`` column is not re-attached.
    """
    features = data.drop(columns='VALUE_PER_UNIT')
    forest = MissForest(verbose=True)
    return forest.fit_transform(features)
def test_missforest_categorical_multiple():
    """Compare MissForest with a hand-written loop on two categorical gaps.

    The reference loop starts from the column modes and refits a
    RandomForestClassifier per incomplete column for as many iterations as
    the imputer reports in ``iter_count_``.
    """
    df = np.array([
        [0, 0, np.nan, 1],
        [0, 1, 1, 2],
        [0, 2, 1, 2],
        [np.nan, 4, 1, 5],
        [1, 7, 0, 7],
        [1, 8, 0, 8],
        [1, 15, 0, 19],
        [1, 18, 0, 17],
    ])
    cat_vars = [0, 2]
    # np.ravel yields one mode per column whether or not scipy returns the
    # result with a leading singleton axis (the default differs by release).
    statistics_mode = np.ravel(mode(df, axis=0, nan_policy='omit').mode)
    n_cols = df.shape[1]

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Number of iterations the imputer actually ran
    max_iter = imputer.iter_count_

    # Positions of the missing cells
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Initial guess: the mode of the respective column
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Repeat the per-column refit for the same number of iterations
    for _ in range(max_iter):
        for c in nan_cols:
            # All other columns act as predictors
            not_c = np.setdiff1d(np.arange(n_cols), c)
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            # Rows observed / missing in column c
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0],
        statistics_mode[cat_vars])
def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42):
    """Load a dataset with missing values, impute it and report the error.

    Seeds NumPy's global RNG with ``rand_seed``, obtains the data through
    ``load_data``, imputes with MissForest and scores the result with
    ``mse_own``.

    Returns:
        Tuple ``(x_filled, mse)``.
    """
    np.random.seed(rand_seed)

    # load_data returns seven values; only three are needed here.
    _, _, xmiss, _, mask, data_x, _ = load_data(
        p_miss,
        dataset=dataset,
        mode=mode,
        para=para,
        train=train,
        rand_seed=rand_seed,
    )

    forest = MissForest(decreasing=True, random_state=rand_seed, verbose=True)
    x_filled = forest.fit_transform(xmiss)

    mse = mse_own(x_filled, data_x, mask)
    print("MSE for MissForest: ", mse)
    return x_filled, mse
def mf_impute(inp, subject=None, cols=None, categorical_variables=None):
    """Impute selected columns of ``inp`` with MissForest.

    Args:
        inp: DataFrame to impute.  It is deep-copied, never modified.
        subject: Iterable of subject identifiers to restrict the imputation
            to (looked up through ``get_subject`` on the first column), or
            ``None`` for all rows.
        cols: Columns to impute.  Defaults to every column but the first.
        categorical_variables: Names of columns to treat as categorical.
            Object-typed ones are label-encoded before imputation and
            decoded afterwards.

    Returns:
        A copy of ``inp`` with the selected rows/columns replaced by the
        imputed values.

    Raises:
        ValueError: If fewer than two columns are selected.
    """
    data = copy.deepcopy(inp)

    # Default: every column except the first one
    if cols is None:
        cols = data.columns[1:]

    if subject is None:
        # Copy so the label codes below are not written into a slice of `data`.
        inp = data[cols].copy()
    else:
        # Collect the rows of all selected subjects and concatenate once;
        # DataFrame.append no longer exists in current pandas.
        frames = [
            get_subject(data, s, data.columns[0]).loc[:, cols]
            for s in subject
        ]
        inp = pandas.concat(frames) if frames else pandas.DataFrame()

    if len(inp.columns) < 2:
        raise ValueError("Multiple variables must be given as input")

    # Encode string columns (only the categorical ones are encoded)
    labels = {}
    if categorical_variables is not None:
        for col in categorical_variables:
            if inp[col].dtype == np.dtype(object):
                encoded, mapping, label = label_encode(inp[col])
                inp[col] = encoded
                labels[col] = label

    # MissForest expects positional indices for the categorical columns
    imputer = MissForest()
    cat_vars = None
    if categorical_variables is not None:
        column_names = list(inp.columns)
        cat_vars = [column_names.index(name) for name in categorical_variables]

    res = imputer.fit_transform(inp.values, cat_vars=cat_vars)
    res = pandas.DataFrame(res, index=inp.index, columns=inp.columns)

    # Convert encoded columns back to their original labels
    for col, encoder in labels.items():
        res[col] = encoder.inverse_transform(res[col].astype(int))

    data.loc[res.index, res.columns] = res
    return data
def impute_times(final, times_open, times_closed, columns, imputation_method="mean"):
    """
    Impute open work items times with different methods

    A value of 0 in ``times_open`` marks a time that has to be imputed.
    ``times_open`` is updated in place and also returned.  Any other
    ``imputation_method`` than the three listed leaves it unchanged.

    :param final: Complete preprocessed dataframe
    :param times_open: Dataframe of work items that are not closed
    :param times_closed: Dataframe of work items that are closed
    :param columns: Columns to impute
    :param imputation_method: Choose between 'mean', 'KNN', 'forest'
    :return: Dataframe of open work items with imputed values
    """
    if imputation_method == "mean":
        for col in columns:
            mean = times_closed[col].mean()
            mask = (times_open[col] == 0)
            # Assign the column back instead of mutating `times_open[col]`
            # in place: the in-place form acts on a temporary Series and is
            # a no-op under pandas copy-on-write.
            times_open[col] = times_open[col].mask(mask, mean)

    if imputation_method in ["KNN", "forest"]:
        if imputation_method == "KNN":
            imputer = KNNImputer(missing_values=0, col_max_missing=0.9)
        if imputation_method == "forest":
            imputer = MissForest(missing_values=0)

        for col in columns:
            try:
                val = imputer.fit_transform(pd.DataFrame(final[col]))[:, 0]
                other = pd.DataFrame(index=final.index, data=val, columns=[col])
                mask = (times_open[col] == 0)
                times_open.loc[mask, col] = other
            except ValueError:
                # NOTE(review): the failing column is skipped, not retried;
                # only the *following* columns use this more tolerant
                # imputer - confirm this is intended.
                imputer = KNNImputer(missing_values=0, col_max_missing=0.99)

    return times_open
class MissForestImputer(object):
    """MissForest wrapper that accepts DataFrames with non-numeric columns.

    Non-numeric columns are ordinal-encoded, imputed as categorical
    variables and decoded back to their original categories.
    """

    def __init__(self):
        self.imputer = MissForest(verbose=0)

    def encode_cat(self, X_c):
        """Ordinal-encode the non-null entries of one column.

        Returns:
            Tuple ``(encoded, encoder)``: a copy of ``X_c`` whose non-null
            entries are replaced by their ordinal codes (nulls are kept),
            and the fitted ``OrdinalEncoder``.
        """
        data = X_c.copy()
        present = data.notnull()
        observed = data[present].values.reshape(-1, 1)
        encoder = OrdinalEncoder()
        codes = encoder.fit_transform(observed)
        data.loc[present] = codes.ravel()
        return data, encoder

    def decode_cat(self, X_c, encoder):
        """Map (possibly fractional) ordinal codes back to categories.

        Codes are rounded and clipped to the valid range of ``encoder``
        before the inverse transform.
        """
        present = X_c.notnull()
        codes = X_c[present].values.reshape(-1, 1)
        n_cat = len(encoder.categories_[0])
        codes = np.round(codes).clip(0, n_cat - 1)
        categories = encoder.inverse_transform(codes)
        # Switch to object dtype first: writing category labels straight
        # into the float column relies on implicit upcasting, which pandas
        # has deprecated.
        data = X_c.astype(object)
        data.loc[present] = categories.ravel()
        return data

    def fit_transform(self, X):
        """Impute ``X`` and return a DataFrame with the same index and columns."""
        num_X = X.select_dtypes(include='number')
        cat_X = X.select_dtypes(exclude='number')
        has_categorical = cat_X.shape[1] > 0

        # encode the categorical columns to numeric columns
        if has_categorical:
            cat_encoders = {}
            encoded_columns = []
            for c in cat_X.columns:
                X_c_enc, encoder = self.encode_cat(cat_X[c])
                encoded_columns.append(X_c_enc)
                cat_encoders[c] = encoder
            cat_X_enc = pd.concat(encoded_columns, axis=1)
            X_enc = pd.concat([num_X, cat_X_enc], axis=1)
            cat_columns = cat_X.columns
            # MissForest wants positional indices of the categorical columns
            cat_indices = [
                i for i, c in enumerate(X_enc.columns) if c in cat_columns
            ]
        else:
            X_enc = X
            cat_indices = None

        X_imp = self.imputer.fit_transform(X_enc.values.astype(float),
                                           cat_vars=cat_indices)
        # Keep the caller's row labels so the result aligns with the input.
        X_imp = pd.DataFrame(X_imp, index=X_enc.index, columns=X_enc.columns)

        if has_categorical:
            num_X_imp = X_imp[num_X.columns]
            decoded_columns = [
                self.decode_cat(X_imp[c], cat_encoders[c])
                for c in cat_X.columns
            ]
            cat_X_dec = pd.concat(decoded_columns, axis=1)
            X_imp = pd.concat([num_X_imp, cat_X_dec], axis=1)
            # Restore the original column order
            X_imp = X_imp[X.columns]

        return X_imp
def impute_values(df: pd.DataFrame, method: str = 'mean', **kwargs):
    """
    Fill in the missing entries (np.nan or None) of a DataFrame.
    ------------------------
    Args:
        * df: pd.DataFrame of (samples x features)
        * method: name of the imputation strategy
            ** 'mean': replace by the column mean
            ** 'knn': K-NN imputation (see missingpy.KNNImputer)
            ** 'rf': random forest imputation (see missingpy.MissForest)
        * kwargs: forwarded to the imputer constructor ('knn' and 'rf' only)

    Returns:
        * pd.DataFrame: imputed values (samples x features)
    """
    assert method in ('mean','knn','rf'), '{} not yet implemented.'.format(method)

    if method == 'mean':
        column_means = df.mean(0)
        return df.fillna(column_means)

    if method == 'knn':
        imputer = KNNImputer(**kwargs)
    elif method == 'rf':
        imputer = MissForest(**kwargs)
    else:
        # Only reachable when assertions are disabled.
        return None

    filled = imputer.fit_transform(df.values)
    return pd.DataFrame(filled, index=df.index, columns=df.columns)
def super_fillna(pre_tr_x, pre_te_x, target_col, how="mean"):
    """Fill missing values in copies of the train and test frames.

    The fill statistic is always derived from the training frame.

    Args:
        pre_tr_x: Training DataFrame (not modified).
        pre_te_x: Test DataFrame (not modified).
        target_col: Column selection the statistic / imputer is computed on.
        how: ``"mean"``, ``"median"`` or ``"rf"`` (MissForest fitted on the
            training selection).  Any other value returns plain copies.

    Returns:
        Tuple ``(tr_x, te_x)`` of filled copies.
    """
    tr_x, te_x = pre_tr_x.copy(), pre_te_x.copy()

    if how in ("mean", "median"):
        # Series.mean / Series.median (or the DataFrame variants), by name
        fill_value = getattr(tr_x[target_col], how)()
        tr_x = tr_x.fillna(fill_value)
        te_x = te_x.fillna(fill_value)
    elif how == "rf":
        forest = MissForest()
        tr_x[target_col] = forest.fit_transform(tr_x[target_col])
        te_x[target_col] = forest.transform(te_x[target_col])

    return tr_x, te_x
def test_default_with_invalid_input():
    """Default MissForest rejects all-missing columns and infinite values."""
    # A column in which every row is missing
    all_missing_column = np.array([
        [np.nan, 0, 0, 1],
        [np.nan, 1, 2, np.nan],
        [np.nan, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
    ])
    forest = MissForest(random_state=1337)
    assert_raise_message(
        ValueError,
        "One or more columns have all rows missing.",
        forest.fit,
        all_missing_column,
    )

    # inf in the matrix passed to fit()
    with_inf = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    assert_raise_message(
        ValueError,
        "+/- inf values are not supported.",
        MissForest().fit,
        with_inf,
    )

    # inf in the matrix passed to transform() after a clean fit()
    with_inf_for_transform = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    clean_for_fit = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    fitted = MissForest().fit(clean_for_fit)
    assert_raise_message(
        ValueError,
        "+/- inf values are not supported.",
        fitted.transform,
        with_inf_for_transform,
    )
def imputer(self, _steps, _answers, train_dataset, _X_train, _y_train, test_dataset, _X_test, _y_test, _headers):
    """Apply every 'imputer' pipeline step and rebuild the datasets.

    Each imputer is fitted on the current training features and applied to
    both training and test features.

    Args:
        _steps: Sequence of step names; only ``'imputer'`` steps act here.
        _answers: Per-step mappings; ``_answers[i]['imputer']`` selects
            ``'Miss Forest'`` or ``'KNN Miss Values'``.
        train_dataset, test_dataset: Unused; kept for interface
            compatibility.
        _X_train, _y_train, _X_test, _y_test: Features and targets.
        _headers: Column names; the last entry names the target column.

    Returns:
        Tuple ``(new_train_dataset, new_test_dataset)`` of DataFrames.

    Raises:
        ValueError: If an 'imputer' step names an unknown imputer.
    """
    self.steps = _steps
    self.answers = _answers
    self.X_train = _X_train
    self.y_train = _y_train
    self.X_test = _X_test
    self.y_test = _y_test
    self.headers = _headers
    self.train_pipe_steps = []

    for i, s in enumerate(self.steps):
        if s != 'imputer':
            continue
        choice = self.answers[i][s]
        if choice == 'Miss Forest':
            model = MissForest()
        elif choice == 'KNN Miss Values':
            model = KNNImputer(n_neighbors=2)
        else:
            # Previously this fell through with no imputer bound (or with
            # the one left over from an earlier step).
            raise ValueError("Unknown imputer: {!r}".format(choice))
        model.fit(self.X_train, self.y_train)
        self.X_train = model.transform(self.X_train)
        self.X_test = model.transform(self.X_test)

    feature_names = self.headers[:-1]
    target_name = self.headers[-1]
    self.new_train_dataset = pd.DataFrame(self.X_train, columns=feature_names)
    self.new_train_dataset[target_name] = self.y_train
    self.new_test_dataset = pd.DataFrame(self.X_test, columns=feature_names)
    self.new_test_dataset[target_name] = self.y_test
    return self.new_train_dataset, self.new_test_dataset
def missforest_imputer(pd_data, random_state=None):
    """
    Fill the missing values of a DataFrame with MissForest.

    Inputs:
        pd_data: (DataFrame) Data containing missing values.
        random_state: (int, optional) Seed of the pseudo random number generator to use.

    Returns:
        pd_imputed: (DataFrame) Imputed data carrying the index and
            columns of ``pd_data``.
    """
    forest = MissForest(random_state=random_state)
    filled = forest.fit_transform(pd_data)
    return pd.DataFrame(filled, index=pd_data.index, columns=pd_data.columns)
def test_missforest_numerical_single():
    """Single numerical gap: MissForest must match a direct RF regression.

    Uses default parameter values apart from ``n_estimators`` and
    ``random_state``.
    """
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)

    # Predict column 0 from the remaining columns
    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # Write the prediction into a copy of the input.  Embedding the length-1
    # prediction array in a nested list literal relies on implicit
    # array-to-scalar conversion, which NumPy has deprecated.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
def test_missforest_categorical_single():
    """Single categorical gap: MissForest must match a direct RF classifier.

    ``cat_vars`` is checked both as a bare int and as a one-element list.
    """
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    # Predict column 0 from the remaining columns
    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # Write the prediction into a copy of the input.  Embedding the length-1
    # prediction array in a nested list literal relies on implicit
    # array-to-scalar conversion, which NumPy has deprecated.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
def define_imputer(self,impute_type):
    '''Create the imputer used for every iteration and store it on self.imputer.

    Input:
        impute_type: string, one of
            'simple'    -> SimpleImputer (median),
            'iterative' -> IterativeImputer (median initialisation),
            'forest'    -> MissForest.
            Any other value leaves self.imputer untouched.
    Output:
        None; the imputer object is available as self.imputer.
    '''
    if impute_type == 'forest':
        self.imputer = MissForest(random_state=self.random_state,n_jobs=-2)
        return

    if impute_type not in ('simple', 'iterative'):
        return

    # Both sklearn-style imputers share the missing-indicator setting.
    add_indicator = self.model_args['add_missing_indicator']
    if impute_type == 'simple':
        self.imputer = SimpleImputer(missing_values=np.nan,
                                     strategy='median',
                                     add_indicator=add_indicator)
    else:
        self.imputer = IterativeImputer(missing_values=np.nan,
                                        initial_strategy='median',
                                        add_indicator=add_indicator)
def test_missforest_zero_part2():
    """Imputing with missing_values=0 must match imputing with NaN markers."""
    zero_marked = gen_array(min_val=1, missing_values=0)
    nan_marked = gen_array(min_val=1, missing_values=np.nan)
    expected_means = np.nanmean(nan_marked, axis=0)

    forest_zero = MissForest(missing_values=0, random_state=1337)
    forest_nan = MissForest(missing_values=np.nan, random_state=1337)

    imputed_zero = forest_zero.fit_transform(zero_marked)
    imputed_nan = forest_nan.fit_transform(nan_marked)

    assert_array_equal(imputed_zero, imputed_nan)
    assert_array_equal(forest_zero.statistics_.get("col_means"),
                       expected_means)
def test_missforest_zero():
    """Error handling when 0 is the missing-value marker."""
    forest = MissForest(missing_values=0, random_state=0)

    # NaN is not allowed once 0 is declared as the missing marker
    with_nan = gen_array(min_val=0)
    nan_message = "Input contains NaN, infinity or a value too large for %r." % with_nan.dtype
    assert_raise_message(ValueError, nan_message, forest.fit, with_nan)

    # A column made up of zeroes only counts as entirely missing
    zero_column = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    assert_raise_message(
        ValueError,
        "One or more columns have all rows missing.",
        forest.fit,
        zero_column,
    )
def fit(self, dataset):
    """Fit the configured standard imputation model on the static features.

    Nothing happens when ``dataset.static_feature`` is ``None``.

    Args:
        - dataset: incomplete dataset
    """
    static = dataset.static_feature
    if static is None:
        return

    name = self.imputation_model_name
    if name == 'mice':
        # MICE
        self.imputation_model = IterativeImputer()
    elif name == 'missforest':
        # MissForest
        self.imputation_model = MissForest()
    elif name == 'knn':
        # KNN
        self.imputation_model = KNNImputer()

    self.imputation_model.fit(static)
    return
def Missforest_Imputation(self, train_index, test_index, final):
    """Impute ``self.full_miss_data`` with MissForest (``max_depth=5``).

    Args:
        train_index: Row positions used for fitting when ``final`` is falsy.
        test_index: Row positions that are imputed when ``final`` is falsy.
        final: If truthy, fit and transform the complete data set instead.

    Returns:
        The imputed sample.  As side effects ``self.numMI`` and/or
        ``self.MI_pd`` are updated, depending on the branch taken.
    """
    miss_info = self.miss_info
    column_names = miss_info["original_column"]
    obj_col = deepcopy(miss_info["obj_col"])
    # Positions of the object-typed columns, treated as categorical
    cat_var = [
        position for position, name in enumerate(column_names)
        if name in obj_col
    ]

    if final:
        if obj_col == []:
            self.numMI = MissForest(max_depth=5).fit_transform(
                X=self.full_miss_data.values)
            return self.numMI
        imputed = MissForest(verbose=0, n_jobs=-1, max_depth=5).fit_transform(
            X=self.full_miss_data.values, cat_vars=cat_var)
        self.MI_pd = pd.DataFrame(imputed, columns=column_names)
        return self.MI_pd

    if obj_col == []:
        fitted = MissForest(max_depth=5).fit(
            X=self.full_miss_data.iloc[train_index, :].values)
        self.numMI = fitted.transform(
            X=self.full_miss_data.iloc[test_index, :].values)
        return self.numMI

    fitted = MissForest(verbose=0, n_jobs=-1, max_depth=5).fit(
        X=self.full_miss_data.iloc[train_index, :].values, cat_vars=cat_var)
    imputed = fitted.transform(self.full_miss_data.iloc[test_index, :].values)
    imputed_frame = pd.DataFrame(imputed, columns=column_names)
    # Only the columns listed in self.notobj are kept on the instance
    self.numMI = imputed_frame[self.notobj].values
    return imputed_frame.values
def fit(self, dataset):
    """Fit the configured standard imputation model on the static features.

    Nothing happens when ``dataset.static_feature`` is ``None``.

    Args:
        - dataset: incomplete dataset

    Raises:
        NotImplementedError: if the configured model is "mice".
    """
    static = dataset.static_feature
    if static is None:
        return

    name = self.imputation_model_name
    if name == "mice":
        # MICE
        # TODO: Resolve the below:
        raise NotImplementedError(
            "IterativeImputer not implemented due to versioning issues with fancyimpute"
        )
    if name == "missforest":
        # MissForest
        self.imputation_model = MissForest()
    elif name == "knn":
        # KNN
        self.imputation_model = KNNImputer()

    self.imputation_model.fit(static)
    return
SelectedImage = showImagesRandomImages( 3) #select and image randomly from MNSIT dataset missingPercentage = 0.2 # missing rate percentage missingImage = generateMissingFig( SelectedImage, missingPercentage) #inserting missing values to the original image imputer = KNNImputer(n_neighbors=2, weights="uniform") imputed_by_KNN = imputer.fit_transform(missingImage) KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN) #plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1) #plt.show() imputer = MissForest() MissForest_imputed = imputer.fit_transform(missingImage) MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed) #plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1) #plt.show() imputer = IterativeImputer() MICE_imputed = imputer.fit_transform(missingImage) MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed) #plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1) #plt.show() ppca = PPCA() ppca.fit(data=SelectedImage, d=100, verbose=True) PPCA_imputed = ppca.transform(missingImage) PPCA_RMSE = mean_squared_error(SelectedImage, PPCA_imputed)
msno.matrix(df) #histotams and density plots dataset['horseLevel'].plot.hist(bins=10, alpha=0.5) dataset['sireLevel'].plot.hist(bins=10, alpha=0.5) dataset['damLevel'].plot.hist(bins=10, alpha=0.5) dataset['sireOfdamLevel'].plot.hist(bins=10, alpha=0.5) sns.distplot(dataset['horseLevel'], hist=False, kde=True, bins=int(180/5), color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4}) #random forrest imputation imputer = MissForest() imputedData = imputer.fit_transform(df) imputedData = pd.DataFrame(imputedData, columns = df.columns) #create train/test df msk = np.random.rand(len(imputedData)) < 0.8 train = imputedData[msk] test = imputedData[~msk] #OLS train['const'] = 1 reg1 = sm.OLS(endog=train['horseLevel'], exog=train[['damLevel', 'sireLevel', 'sireOfdamLevel']], missing='drop') results1 = reg1.fit()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : qichun tang
# @Contact : [email protected]
"""Label-encode selected columns of a CSV and impute it with MissForest."""
from copy import deepcopy

import numpy as np
import pandas as pd
from missingpy import MissForest
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("train_classification.csv")
df_ce = deepcopy(df)

# Replace each listed column by integer label codes, keeping missing entries
# missing so the imputer can fill them afterwards.
for name in ["Name", "Sex", "Ticket", "Fare", "Cabin", "Embarked"]:
    present = df_ce[name].notna()
    # Build the encoded column separately and assign it back as a whole:
    # writing through `col = df_ce[name]; col[mask] = ...` is chained
    # assignment and is not guaranteed to reach df_ce.
    encoded = pd.Series(np.nan, index=df_ce.index, dtype="float")
    encoded[present] = LabelEncoder().fit_transform(df_ce.loc[present, name])
    df_ce[name] = encoded

imputer = MissForest()
# The imputed matrix is not stored; bind the result to a name if it is needed.
imputer.fit_transform(df_ce.values.astype("float"))
def _random_forest(self,df):
    """Impute ``df`` with MissForest (``random_state=10``).

    Args:
        df: DataFrame containing missing values.

    Returns:
        DataFrame of imputed values with the index and columns of ``df``.
    """
    imputer = MissForest(random_state=10)
    # Carry the row labels over as well, so the result stays aligned with
    # the input frame instead of being renumbered from 0.
    imputed_values = pd.DataFrame(imputer.fit_transform(df),
                                  index=df.index,
                                  columns=df.columns)
    return imputed_values
# Expose the index of the missing-value summary as a regular 'Name' column so
# it can be used as the x axis below.
miss_sum.index.names = ['Name']
miss_sum['Name'] = miss_sum.index

# Plot the missing value count per name
sns.set(style="whitegrid", color_codes=True)
sns.barplot(x='Name', y='count', data=miss_sum)
plt.xticks(rotation=90)
plt.show()

# Change the Period variable: characters 4..13 are removed, keeping the first
# four characters (plus anything from position 14 onwards).
train_data['Period'] = train_data['Period'].str.slice_replace(4, 14, '')
test_data['Period'] = test_data['Period'].str.slice_replace(4, 14, '')

# Impute missing values
from missingpy import MissForest
imputer = MissForest()
train_data_imputed = imputer.fit_transform(train_data)
# Wrap the imputed array in a DataFrame with a fresh 0..n-1 index and the
# column names held in train_data_columns.
train_data_imputed = pd.DataFrame(
    data=train_data_imputed[0:, 0:],
    index=[i for i in range(train_data_imputed.shape[0])],
    columns=train_data_columns)

# The bare expressions below only display something in an interactive
# session; they have no effect when run as a script.
train_data_imputed.columns
#train_data_imputed.reset_index(drop=True).reset_index(drop=True)
type(train_data_imputed)
train_data_imputed.head(10)

# Write the imputed data to an Excel workbook (no index column)
train_data_imputed.to_excel('train_data_imputed.xlsx', index=False)
def test_missforest_mixed_multiple():
    """Compare MissForest with a hand-written loop on mixed data types.

    Column 0 is categorical (classifier, initialised with the mode); the
    other columns are numerical (regressor, initialised with the mean).
    """
    df = np.array([
        [np.nan, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [1, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, np.nan],
    ])
    n_cols = df.shape[1]
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    # np.ravel yields one mode per column whether or not scipy returns the
    # result with a leading singleton axis (the default differs by release).
    statistics_mode = np.ravel(mode(df, axis=0, nan_policy='omit').mode)
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Number of iterations the imputer actually ran
    max_iter = imputer.iter_count_

    # Positions of the missing cells
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Initial guess: mode for the categorical gap, mean for the numerical one
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Repeat the per-column refit for the same number of iterations
    for _ in range(max_iter):
        for c in nan_cols:
            # All other columns act as predictors
            not_c = np.setdiff1d(np.arange(n_cols), c)
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            # Rows observed / missing in column c
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Classifier for categorical columns, regressor otherwise
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100,
                                           random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0],
        statistics_mode[cat_vars])