def test_iterative_imputer_truncated_normal_posterior():
    # test that the values that are imputed using `sample_posterior=True`
    # with boundaries (`min_value` and `max_value` are not None) are drawn
    # from a distribution that looks gaussian via the Kolmogorov-Smirnov test.
    # note that starting from the wrong random seed will make this test fail
    # because random sampling doesn't occur at all when the imputation
    # is outside of the (min_value, max_value) range
    pytest.importorskip("scipy", minversion="0.17.0")
    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)
    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    # guard against a zero standard deviation before normalizing
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject null hypothesis
    # null hypothesis: distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does not appear to be normal"
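# The test above relies on the fact that, once fitted with
# `sample_posterior=True`, every call to `transform()` draws a fresh value
# from the (truncated) posterior. A minimal sketch of using this to obtain
# several plausible imputations outside a test; the toy data, bounds and the
# number of draws (20) are illustrative assumptions, not part of the original.
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(20, 4))
X_demo[0, 0] = np.nan

sampler = IterativeImputer(sample_posterior=True, min_value=-3, max_value=3,
                           random_state=0)
sampler.fit(X_demo)
# each transform() call yields a different draw for the missing entry
draws = np.array([sampler.transform(X_demo)[0, 0] for _ in range(20)])
print(draws.mean(), draws.std())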
def test_iterative_imputer_early_stopping(): rng = np.random.RandomState(0) n = 50 d = 5 A = rng.rand(n, 1) B = rng.rand(1, d) X = np.dot(A, B) nan_mask = rng.rand(n, d) < 0.5 X_missing = X.copy() X_missing[nan_mask] = np.nan imputer = IterativeImputer(max_iter=100, tol=1e-3, sample_posterior=False, verbose=1, random_state=rng) X_filled_100 = imputer.fit_transform(X_missing) assert len(imputer.imputation_sequence_) == d * imputer.n_iter_ imputer = IterativeImputer(max_iter=imputer.n_iter_, sample_posterior=False, verbose=1, random_state=rng) X_filled_early = imputer.fit_transform(X_missing) assert_allclose(X_filled_100, X_filled_early, atol=1e-7) imputer = IterativeImputer(max_iter=100, tol=0, sample_posterior=False, verbose=1, random_state=rng) imputer.fit(X_missing) assert imputer.n_iter_ == imputer.max_iter
def test_iterative_imputer_imputation_order(imputation_order): rng = np.random.RandomState(0) n = 100 d = 10 max_iter = 2 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() X[:, 0] = 1 # this column should not be discarded by IterativeImputer imputer = IterativeImputer(missing_values=0, max_iter=max_iter, n_nearest_features=5, sample_posterior=False, min_value=0, max_value=1, verbose=1, imputation_order=imputation_order, random_state=rng) imputer.fit_transform(X) ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] assert (len(ordered_idx) // imputer.n_iter_ == imputer.n_features_with_missing_) if imputation_order == 'roman': assert np.all(ordered_idx[:d-1] == np.arange(1, d)) elif imputation_order == 'arabic': assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1)) elif imputation_order == 'random': ordered_idx_round_1 = ordered_idx[:d-1] ordered_idx_round_2 = ordered_idx[d-1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: assert len(ordered_idx) == max_iter * (d - 1)
def test_iterative_imputer_all_missing(): n = 100 d = 3 X = np.zeros((n, d)) imputer = IterativeImputer(missing_values=0, max_iter=1) X_imputed = imputer.fit_transform(X) assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))
def test_iterative_imputer_no_missing(): rng = np.random.RandomState(0) X = rng.rand(100, 100) X[:, 0] = np.nan m1 = IterativeImputer(max_iter=10, random_state=rng) m2 = IterativeImputer(max_iter=10, random_state=rng) pred1 = m1.fit(X).transform(X) pred2 = m2.fit_transform(X) # should exclude the first column entirely assert_allclose(X[:, 1:], pred1) # fit and fit_transform should both be identical assert_allclose(pred1, pred2)
def test_iterative_imputer_estimators(estimator): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() imputer = IterativeImputer(missing_values=0, max_iter=1, estimator=estimator, random_state=rng) imputer.fit_transform(X) # check that types are correct for estimators hashes = [] for triplet in imputer.imputation_sequence_: expected_type = (type(estimator) if estimator is not None else type(BayesianRidge())) assert isinstance(triplet.estimator, expected_type) hashes.append(id(triplet.estimator)) # check that each estimator is unique assert len(set(hashes)) == len(hashes)
def test_iterative_imputer_rank_one(): rng = np.random.RandomState(0) d = 100 A = rng.rand(d, 1) B = rng.rand(1, d) X = np.dot(A, B) nan_mask = rng.rand(d, d) < 0.5 X_missing = X.copy() X_missing[nan_mask] = np.nan imputer = IterativeImputer(max_iter=5, verbose=1, random_state=rng) X_filled = imputer.fit_transform(X_missing) assert_allclose(X_filled, X, atol=0.01)
def test_imputation_shape(): # Verify the shapes of the imputed matrix for different strategies. X = np.random.randn(10, 2) X[::2] = np.nan for strategy in ['mean', 'median', 'most_frequent', "constant"]: imputer = SimpleImputer(strategy=strategy) X_imputed = imputer.fit_transform(sparse.csr_matrix(X)) assert X_imputed.shape == (10, 2) X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) iterative_imputer = IterativeImputer(initial_strategy=strategy) X_imputed = iterative_imputer.fit_transform(X) assert X_imputed.shape == (10, 2)
def test_iterative_imputer_clip(): rng = np.random.RandomState(0) n = 100 d = 10 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() imputer = IterativeImputer(missing_values=0, max_iter=1, min_value=0.1, max_value=0.2, random_state=rng) Xt = imputer.fit_transform(X) assert_allclose(np.min(Xt[X == 0]), 0.1) assert_allclose(np.max(Xt[X == 0]), 0.2) assert_allclose(Xt[X != 0], X[X != 0])
def exp_mi(xmiss, w, y, regularize, m=10, nuisance=False): res_tau_dr = [] res_tau_ols = [] res_tau_ols_ps = [] res_tau_resid = [] res_ps = np.empty([len(w), 1]) res_y0 = np.empty([len(y), 1]) res_y1 = np.empty([len(y), 1]) for i in range(m): imp = IterativeImputer(sample_posterior=True, random_state=i) x_imp_mice = imp.fit_transform(xmiss) if nuisance: tau_tmp, nu_tmp = compute_estimates(x_imp_mice, w, y, regularize, nuisance) res_ps = np.concatenate( (res_ps, nu_tmp['ps_hat'].reshape([len(w), 1])), axis=1) res_y0 = np.concatenate( (res_y0, nu_tmp['y0_hat'].reshape([len(y), 1])), axis=1) res_y1 = np.concatenate( (res_y1, nu_tmp['y1_hat'].reshape([len(y), 1])), axis=1) else: tau_tmp = compute_estimates(x_imp_mice, w, y, regularize) res_tau_dr.append(tau_tmp['tau_dr']) res_tau_ols.append(tau_tmp['tau_ols']) res_tau_ols_ps.append(tau_tmp['tau_ols_ps']) res_tau_resid.append(tau_tmp['tau_resid']) if nuisance: return { 'tau_dr': np.mean(res_tau_dr), 'tau_ols': np.mean(res_tau_ols), 'tau_ols_ps': np.mean(res_tau_ols_ps), 'tau_resid': np.mean(res_tau_resid), }, { 'ps_hat': np.mean(res_ps[:, 1:], axis=1), 'y0_hat': np.mean(res_y0[:, 1:], axis=1), 'y1_hat': np.mean(res_y1[:, 1:], axis=1), } return { 'tau_dr': np.mean(res_tau_dr), 'tau_ols': np.mean(res_tau_ols), 'tau_ols_ps': np.mean(res_tau_ols_ps), 'tau_resid': np.mean(res_tau_resid), }
def RF_imputation(df, fast=True): '''Returns the dataframe where missing values are imputed using Random Forest Imputation (sklearn) ExtraTreesRegressor is used for increased speed. Parameters: ----------- df: pd.DataFrame fast: boolean, if set to True, ExtraTreesRegressor is used in preference of RandomForestRegressor Returns: -------- df_result: pd.DataFrame where the missing values are imputed using Random Forest (MissForest) ''' df_new = df.copy() df_new = make_missing_np_nan(df_new) missing, unique = imputation_heuristic_column(df, 0.99) df_new = delete_cols(df_new, missing) df_new = delete_cols(df_new, unique) #categorical and datetime columns cannot be imputed, so are removed from the imputation dataframe cat_cols, date_cols, num_cols = type_cols(df_new) df_new = df_new[num_cols] columns = df_new.columns if fast: imputer = IterativeImputer(random_state=0, estimator=ExtraTreesRegressor( n_estimators=10, random_state=0)) else: imputer = IterativeImputer(random_state=0, estimator=RandomForestRegressor( n_estimators=10, random_state=0)) imputed = imputer.fit_transform(df_new) df_imputed = pd.DataFrame(imputed, columns=columns) #categorical and datetime columns are added back not_imputed_cols = cat_cols + date_cols df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1) return df_result
def embedding(file_path): kbs_mini = pd.read_csv(file_path, encoding='utf-8') del kbs_mini['end_date'] del kbs_mini['pd'] del kbs_mini['writer'] del kbs_mini['actor1'] del kbs_mini['actor2'] del kbs_mini['actor3'] del kbs_mini['actor4'] del kbs_mini['actor5'] del kbs_mini['avg_rate'] del kbs_mini['rate_25'] del kbs_mini['prev'] start_timestamp = pd.to_datetime(kbs_mini['start_date'], format='%Y-%m-%d').astype(int) / 10**11 kbs_mini['start_date'] = start_timestamp time_to_datetime = pd.to_datetime(kbs_mini['time'], format='%H:%M:%S') kbs_mini['time'] = time_to_datetime.dt.hour + (time_to_datetime.dt.minute / 60) day_one_enc = pd.get_dummies(kbs_mini, columns=['day']) kbs_mini = day_one_enc # # print(kbs_mini['kbs']) kbs_mini = pd.get_dummies(kbs_mini, columns=['kbs']) del kbs_mini['title'] # column_names = kbs_mini.columns.values.tolist() imp_mean = IterativeImputer(missing_values=np.nan, skip_complete=True, random_state=0) imputed_prev_25 = imp_mean.fit_transform(kbs_mini.to_numpy())[:, 4] kbs_mini['prev_25_imputed'] = imputed_prev_25 # kbs_mini = pd.DataFrame(imp_mean.fit_transform(kbs_mini.to_numpy()), columns=column_names) del kbs_mini['prev_25'] # print(kbs_mini.loc[:,['prev_25','prev_25_imputed']]) return kbs_mini # print(kbs_mini) # embedding('../kbs_mini.csv')
def get_results_single_imputation(X_train, X_test, y_train, y_test):
    # Apply imputation (`max_iter` is the IterativeImputer parameter;
    # `n_iter` is not accepted)
    imputer = IterativeImputer(max_iter=100, sample_posterior=True,
                               random_state=0)
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Standardize data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    # Perform estimation and prediction
    estimator = LinearRegression()
    estimator.fit(X_train_scaled, y_train)
    y_predict = estimator.predict(X_test_scaled)
    mse_single = mse(y_test, y_predict)
    return mse_single
def missingValues(data, opt='check', **kwargs): n, m = data.shape # Check for missing values data = data.replace([np.inf, -np.inf], np.nan) if opt == 'check': missing = data.isna().sum().sum() print("Missing values: ", round(missing * 100 / (n * m), 2), "% (", missing, "/", n * m, ")") else: # Choose method if opt == 'mean': out = data.fillna(data.mean()) elif opt == 'median': out = data.fillna(data.median()) else: # Iterative imputers from sklearn.experimental import enable_iterative_imputer from sklearn.impute import IterativeImputer if opt == 'bayesian': from sklearn.linear_model import BayesianRidge estim = BayesianRidge(n_iter=100, **kwargs) elif opt == 'extra': from sklearn.ensemble import ExtraTreesRegressor estim = ExtraTreesRegressor(n_estimators=50, max_features=0.5, min_impurity_decrease=1e-3, min_samples_split=5, min_samples_leaf=2, n_jobs=-1, **kwargs) elif opt == 'knn': from sklearn.neighbors import KNeighborsRegressor estim = KNeighborsRegressor(n_jobs=-1, **kwargs) imp = IterativeImputer(estimator=estim, max_iter=5, n_nearest_features=100, verbose=2, random_state=0) out = pd.DataFrame(imp.fit_transform(data), columns=data.columns, index=data.index) print("Missing values imputed using", opt, "method!") return out
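# A minimal, self-contained sketch of the estimator choices the helper above
# switches between ('bayesian', 'extra', 'knn'): each option just swaps the
# regressor handed to IterativeImputer. The toy DataFrame and the small
# hyperparameters here are illustrative assumptions.
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

toy = pd.DataFrame(np.random.RandomState(0).rand(30, 4),
                   columns=list('abcd'))
toy.iloc[::7, 2] = np.nan  # punch a few holes in one column

for name, estim in [('bayesian', None),  # None -> default BayesianRidge
                    ('extra', ExtraTreesRegressor(n_estimators=10,
                                                  random_state=0)),
                    ('knn', KNeighborsRegressor(n_neighbors=3))]:
    imp = IterativeImputer(estimator=estim, max_iter=5, random_state=0)
    filled = pd.DataFrame(imp.fit_transform(toy), columns=toy.columns)
    print(name, filled['c'].isna().sum())  # 0 missing values remain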
def test_iterative_imputer_clip_truncnorm(): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() X[:, 0] = 1 imputer = IterativeImputer(missing_values=0, max_iter=2, n_nearest_features=5, sample_posterior=True, min_value=0.1, max_value=0.2, verbose=1, imputation_order='random', random_state=rng) Xt = imputer.fit_transform(X) assert_allclose(np.min(Xt[X == 0]), 0.1) assert_allclose(np.max(Xt[X == 0]), 0.2) assert_allclose(Xt[X != 0], X[X != 0])
def iterative_imputer(pd_data, random_state=None): """ Impute missing values using the multivariate imputer that estimates each feature from all the others. Inputs: pd_data: (DataFrame) Data containing missing values. random_state: (int, optional) Seed of the pseudo random number generator to use. Returns: pd_imputed: (DataFrame) Data with missing values imputed. """ imputer = IterativeImputer(random_state=random_state) pd_imputed = pd.DataFrame(imputer.fit_transform(pd_data), index=pd_data.index, columns=pd_data.columns) return pd_imputed
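# A short usage sketch for a wrapper like `iterative_imputer` above: fitting
# on a DataFrame and rebuilding it so the index and column labels survive the
# numpy round-trip. The example frame is an illustrative assumption.
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

pd_data = pd.DataFrame({'height': [1.70, np.nan, 1.82, 1.65],
                        'weight': [65.0, 80.0, np.nan, 58.0]},
                       index=['a', 'b', 'c', 'd'])
imputer = IterativeImputer(random_state=0)
pd_imputed = pd.DataFrame(imputer.fit_transform(pd_data),
                          index=pd_data.index, columns=pd_data.columns)
print(pd_imputed)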
def impute_data(data, weekly=False): dat = data.copy() if weekly: dat = dat.groupby(['fips', pd.Grouper(key='date', freq='W')]).aggregate('mean') dat = dat.loc[:, ~( dat.columns.str.startswith('smoothed_mean') | dat.columns.str.startswith('mean_') | dat.columns. isin(['Unnamed: 0', 'n', 'pct_avoid_contact_all_or_most_time']))] keep_columns = dat[pandas_select.StartsWith('pct_') | pandas_select.StartsWith('Total households!!') | pandas_select.StartsWith('RELATIONSHIP!!') | pandas_select.StartsWith('SCHOOL ENROLLMENT')].columns.to_list() + \ dat.columns[dat.columns.get_loc("Civilian_labor_force_2018"):dat.columns.get_loc("Median_Household_Income_2018")+1].to_list() +\ dat.columns[dat.columns.get_loc("Total_Male"):dat.columns.get_loc("Total households")+1].to_list() +\ ["prop_cum_deaths","prop_cum_cases"] dat_selected = dat[keep_columns] min_values = np.concatenate([ np.repeat(0, 88), np.repeat(-np.inf, dat_selected.shape[1] - 90), np.repeat(0, 2) ]) max_values = np.concatenate([ np.repeat(100, 88), np.repeat(np.inf, dat_selected.shape[1] - 90), np.repeat(100, 2) ]) imp = IterativeImputer(max_iter=10, random_state=100620, sample_posterior=True, min_value=min_values, max_value=max_values) imputed_features = imp.fit_transform(dat_selected) imputed_df = dat_selected.copy() imputed_df.loc[:, keep_columns] = imputed_features return imputed_df
def imputing_nan(df):
    """Perform multiple imputations

    Args:
        df (DataFrame): Source DataFrame

    Returns:
        dict_impute: Dictionary with multiple imputation techniques performed
            over DataFrame
    """
    # Store DataFrame structure
    df_columns = df.columns
    df_index = df.index
    # Identify string data columns (the `np.object` alias is removed in
    # recent NumPy releases, so use the dtype string instead)
    str_cols = df.select_dtypes(include=['object']).columns
    rest_cols = [i for i in df_columns if i not in str_cols]
    # Soft-Impute
    df_soft = SoftImpute().fit_transform(df[rest_cols])
    # Restore DataFrame structure
    df_soft = pd.DataFrame(df_soft, columns=rest_cols, index=df_index)
    df_soft = pd.concat([df[str_cols], df_soft], axis=1)
    # Building Regressor for IterativeImputer
    rgr = KNeighborsRegressor(n_neighbors=5)
    # Create Imputer
    imp = IterativeImputer(estimator=rgr, random_state=1234)
    # Apply imputation through KNN to DataFrame
    df_knn = imp.fit_transform(df[rest_cols])
    # Restore DataFrame structure
    df_knn = pd.DataFrame(df_knn, columns=rest_cols, index=df_index)
    df_knn = pd.concat([df[str_cols], df_knn], axis=1)
    dict_impute = {'soft': df_soft, 'knn': df_knn}
    return dict_impute
def exp_mi(xmiss, w, y, regularize, m=10): res_tau_dr = [] res_tau_ols = [] res_tau_ols_ps = [] res_tau_resid = [] for i in range(m): imp = IterativeImputer(sample_posterior = True, random_state = i) x_imp_mice = imp.fit_transform(xmiss) tau_tmp = compute_estimates(x_imp_mice, w, y, regularize) res_tau_dr.append(tau_tmp['tau_dr']) res_tau_ols.append(tau_tmp['tau_ols']) res_tau_ols_ps.append(tau_tmp['tau_ols_ps']) res_tau_resid.append(tau_tmp['tau_resid']) return { 'tau_dr': np.mean(res_tau_dr), 'tau_ols': np.mean(res_tau_ols), 'tau_ols_ps': np.mean(res_tau_ols_ps), 'tau_resid': np.mean(res_tau_resid), }
def impute_BayesRegression(dataframe, df_missing, rnd_numbers_row,
                           rnd_numbers_column, error_i, m):
    imputed_value_temp = pd.DataFrame()
    imputed_value_list = []
    for i in range(m):
        imp_BR = IterativeImputer(tol=0.01, max_iter=10, sample_posterior=True,
                                  estimator=BayesianRidge(normalize=True,
                                                          alpha_1=0,
                                                          lambda_1=0.005))
        df_imputed = pd.DataFrame(imp_BR.fit_transform(df_missing))
        for k, row in enumerate(rnd_numbers_row):
            if k in error_i:
                pass
            else:
                imputed_value_list.append(
                    df_imputed.iloc[row, rnd_numbers_column[k]])
        imputed_value_temp[i] = imputed_value_list
        imputed_value_list = []
    # assign the original column labels directly (wrapping the list in another
    # list would create a one-level MultiIndex)
    df_imputed.columns = df_missing.columns.tolist()
    return df_imputed, imputed_value_temp
def test_iterative_imputer_zero_iters(): rng = np.random.RandomState(0) n = 100 d = 10 X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() missing_flag = X == 0 X[missing_flag] = np.nan imputer = IterativeImputer(max_iter=0) X_imputed = imputer.fit_transform(X) # with max_iter=0, only initial imputation is performed assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) # repeat but force n_iter_ to 0 imputer = IterativeImputer(max_iter=5).fit(X) # transformed should not be equal to initial imputation assert not np.all(imputer.transform(X) == imputer.initial_imputer_.transform(X)) imputer.n_iter_ = 0 # now they should be equal as only initial imputation is done assert_allclose(imputer.transform(X), imputer.initial_imputer_.transform(X))
def __completef(fitinfo, compmethod=None): """Completes missing values in f using IterativeImpute from scikit-learn.""" f = fitinfo['f'] if compmethod == 'KNN': estimator = KNeighborsRegressor() elif compmethod == 'BayesianRidge': estimator = BayesianRidge() elif compmethod == 'RandomForest': estimator = RandomForestRegressor() else: raise ValueError('Specify completion method.') transformer = IterativeImputer(estimator=estimator) fhat = transformer.fit_transform(f) if not np.isfinite(fhat).all(): raise ValueError( 'Completion method (sklearn.impute.IterativeImputer) failed.') fitinfo['f'] = fhat fitinfo['completionmethod'] = 'IterativeImputer:{:s}'.format(compmethod) return
def process_repeated_measures(df):
    """
    Function to process repeated measures data for each participant
    ---------------------------------------------------------------
    input: pd.DataFrame containing repeated measures for a single participant
    returns: Reshaped, imputed and scaled data for participant

    Notes:
    -----
    Some participants have missing data in their repeated measures. We need to
    decide what to do with this. Options are (1) fill with zeros; (2) mean
    impute; (3) multivariate impute. I'm using (3) for now, using the
    experimental sklearn.impute.IterativeImputer. See [1].

    [1]: https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
    """
    # Select columns
    df = df.drop('subjectid', axis=1)
    df = df.transpose()
    df.columns = ['value']
    df['variable'] = df.index
    df['week'] = df['variable'].str.extract(r'(\d+)$')
    # use a raw string and explicit regex=True for the pattern-based replace
    df['measure'] = df['variable'].str.replace(r'\d+$', '', regex=True)
    df.drop(['variable'], axis=1, inplace=True)
    # Reshape from LONG to WIDE
    df = df.pivot(index='week', values='value', columns='measure')
    # If missing all repeated measures, replace with zeros to allow loop to
    # continue
    if df.isnull().all().all():
        df = df.fillna(0)
    # Impute missing values
    mvi = IterativeImputer(sample_posterior=True)
    dfi = mvi.fit_transform(df)
    # Scale
    sc = StandardScaler()
    scaled = pd.DataFrame(sc.fit_transform(dfi))
    return scaled
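# The docstring above weighs three ways of handling missing repeated
# measures. A compact, self-contained comparison of options (2) and (3) on a
# toy wide-format frame; the data and the measure names ('phq', 'gad',
# 'sleep') are illustrative assumptions, not taken from the participant data.
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, IterativeImputer

rng = np.random.RandomState(1)
wide = pd.DataFrame(rng.normal(size=(12, 3)), columns=['phq', 'gad', 'sleep'])
wide.iloc[[0, 3, 7], 0] = np.nan  # a few missing weeks per measure
wide.iloc[[2, 5], 1] = np.nan

mean_filled = SimpleImputer(strategy='mean').fit_transform(wide)
mvi_filled = IterativeImputer(sample_posterior=True,
                              random_state=0).fit_transform(wide)
# the multivariate imputer predicts each gap from the other measures,
# whereas mean imputation ignores the between-measure correlations
print(np.abs(mean_filled - mvi_filled).max())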
def test_iterative_imputer_catch_warning(): # check that we catch a RuntimeWarning due to a division by zero when a # feature is constant in the dataset X, y = load_boston(return_X_y=True) n_samples, n_features = X.shape # simulate that a feature only contain one category during fit X[:, 3] = 1 # add some missing values rng = np.random.RandomState(0) missing_rate = 0.15 for feat in range(n_features): sample_idx = rng.choice(np.arange(n_samples), size=int(n_samples * missing_rate), replace=False) X[sample_idx, feat] = np.nan imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True) with pytest.warns(None) as record: X_fill = imputer.fit_transform(X, y) assert not record.list assert not np.any(np.isnan(X_fill))
def impute_xform(df): imp = IterativeImputer(missing_values=np.nan, random_state=5, max_iter=20, add_indicator=True) imputed_arr = imp.fit_transform(df) nans = df.isna().sum() nan_labels = nans[nans > 0].index nan_labels = [col + '_nan' for col in nan_labels] encoded = list(df.columns) encoded.extend(nan_labels) features_imputed = pd.DataFrame(imputed_arr, columns=encoded) skewed = [ 'ScreenPorch', 'PoolArea', 'LotFrontage', '3SsnPorch', 'LowQualFinSF' ] features_log_xformed = pd.DataFrame(data=features_imputed) features_log_xformed[skewed] = features_imputed[skewed].apply( lambda x: np.log(x + 1)) return features_log_xformed
def clean_df(): print('Loading tract table') df = pd.read_csv(os.path.join(PRODUCT_GEO_PATH, 'tract_table.csv')) df['GEOID'] = df['GEOID'].astype(str).apply(lambda x: x.zfill(11)) df = df.set_index('GEOID') geoid_to_city = df.city.to_dict() df = df[[c for c in df.columns if '-M-' not in c]] print('Reformatting tract table') year_dfs = [] for year in ACS_TIME_COVERAGE: year_df = df[[c for c in df.columns if c.split('-')[0] == str(year)]] year_df = year_df.rename( columns={c: '-'.join(c.split('-')[1:]) for c in year_df.columns}) year_df['year'] = year year_df = year_df.set_index('year', append=True) year_dfs.append(year_df) gydf = pd.concat(year_dfs) gydf['city'] = gydf.apply(lambda x: geoid_to_city[x.name[0]], axis=1) gydf = gydf.set_index('city', append=True) gydf = gydf.drop(columns=drop_columns) print('Imputing missing values') imp_mean = IterativeImputer(random_state=0, min_value=0, skip_complete=True, imputation_order='random') X = imp_mean.fit_transform(gydf.values) cdf = copy.deepcopy(gydf) cdf.iloc[:, :] = X return cdf, geoid_to_city
def fit(self, n=0, seed=None):
    """
    Parameters
    ----------
    n : int
        Number of block bootstrap replicates
    seed : int, optional
        Base random seed
    """
    if seed is not None:
        # add a large constant, so incrementing seed + 1 will work.
        seed = seed + 1234567
    imp = IterativeImputer(**self._imputer_kwargs, random_state=seed)
    col = 1
    rows = self.data.shape[0]
    temp = imp.fit_transform(self.data)
    self.result = temp[:, col]
    if n != 0:
        self.boot_result = np.zeros([rows, n])
        for i in range(n):
            # vary the seed per replicate (fall back to unseeded randomness
            # when no base seed was given)
            random_state = seed + i if seed is not None else None
            imp = IterativeImputer(**self._imputer_kwargs,
                                   random_state=random_state)
            # get block bootstrap sample
            boot_sample = bbs_replicate(seed=random_state)
            # impute the bootstrap sample
            imputed_sample = imp.fit_transform(boot_sample)
            # store the imputed target column for this replicate
            self.boot_result[:, i] = imputed_sample[:, self.target_col]
if __name__ == "__main__": train_dir = '/opt/ml/processing/input/train' test_dir = '/opt/ml/processing/input/test' seed = 0 train_df = pd.read_csv(os.path.join(train_dir, 'train.csv'), index_col='ID') test_df = pd.read_csv(os.path.join(test_dir, 'test.csv'), index_col='ID') print('Scaling Data') std_scale = preprocessing.StandardScaler().fit(train_df.iloc[:, 1:]) train_df_scaled = std_scale.transform(train_df.iloc[:, 1:]) test_df_scaled = std_scale.transform(test_df) print('Training Data Imputation Model') imputer = IterativeImputer(random_state=seed, missing_values=0) train_imputed = imputer.fit_transform(train_df_scaled) # Transforming test data test_imputed = imputer.transform(test_df_scaled) train_imputed_output_path = os.path.join('/opt/ml/processing/train', 'train_imputed.csv') test_imputed_output_path = os.path.join('/opt/ml/processing/test', 'test_imputed.csv') pd.concat([ train_df['target'], pd.DataFrame( train_imputed, columns=train_df.columns, index=train_df.index) ], axis=1).to_csv(train_imputed_output_path,
def training_data_pipeline(): """ This function fetches and transforms the data needed for training the models. """ # Fetch data print(f'[{datetime.now(TZ_MKD).strftime("%Y-%m-%d %H:%M:%S")}]\tFetching historical data for training') fetch_historical_data(date_start=TZ_MKD.localize(datetime(2011, 1, 1, 0, 0, 0)), date_end=datetime.now(TZ_MKD), pipeline_type=PIPELINE_TRAINING) # Transform data for station, pollutants in SENSORS.items(): print(f'[{datetime.now(TZ_MKD).strftime("%Y-%m-%d %H:%M:%S")}]\tProcessing training data for {station}') df = pd.read_csv(f'./data/training/first-order/{station}', index_col=0) # add feature indicating missingness for each pollutant for p in pollutants: df[f'{p}_missing'] = df[f'{p}'].isna().astype('int32') # log-transform pollutants for p in pollutants: df[p] = np.log(df[p] + 1) # train-val split train_size = int(df.shape[0] * 0.85) df_train = df.iloc[:train_size] df_valid = df.iloc[train_size:] # fit and save scalers features_to_normalize = ['cloud_cover', 'precip', 'uv_index', 'visibility'] features_to_standardize = pollutants + ['temperature', 'humidity', 'dew_point', 'pressure', 'wind_speed'] scalers = {} for f in features_to_normalize: scaler = MinMaxScaler() scaler.fit(df_train[f].values.reshape(-1,1)) scalers[f] = scaler for f in features_to_standardize: scaler = StandardScaler() scaler.fit(df_train[f].values.reshape(-1,1)) scalers[f] = scaler if not os.path.exists(f'./pickles/scalers/{station}'): os.makedirs(f'./pickles/scalers/{station}') for feature, scaler in scalers.items(): with open(f'./pickles/scalers/{station}/{feature}', 'wb') as f: pickle.dump(scaler, f) df_train_scaled = scale_data(df_train, station) df_valid_scaled = scale_data(df_valid, station) train_values = df_train_scaled.values.copy() valid_values = df_valid_scaled.copy() # impute missing values imputer = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=12, random_state=0), random_state=0, skip_complete=True, max_iter=5) imputed_train_values = imputer.fit_transform(train_values) imputed_valid_values = imputer.transform(valid_values) if not os.path.exists(f'./pickles/imputers'): os.makedirs(f'./pickles/imputers') with open(f'./pickles/imputers/{station}', 'wb') as f: pickle.dump(imputer, f) df_train_imputed = pd.DataFrame(data=imputed_train_values, index=df_train_scaled.index, columns=df_train_scaled.columns) df_valid_imputed = pd.DataFrame(data=imputed_valid_values, index=df_valid_scaled.index, columns=df_valid_scaled.columns) if not os.path.exists(f'./data/training/second-order/{station}'): os.makedirs(f'./data/training/second-order/{station}') df_train_imputed.to_csv(f'./data/training/second-order/{station}/train', index=True) df_valid_imputed.to_csv(f'./data/training/second-order/{station}/valid', index=True) # build seq2seq (third-order) datasets train_encoder_input_data, train_decoder_input_data, train_decoder_target_data = \ build_seq2seq_datasets(df_train_imputed, pollutants) valid_encoder_input_data, valid_decoder_input_data, valid_decoder_target_data = \ build_seq2seq_datasets(df_valid_imputed, pollutants) if not os.path.exists(f'./data/training/third-order/{station}'): os.makedirs(f'./data/training/third-order/{station}') np.save(f'./data/training/third-order/{station}/train_encoder_input_data.npy', train_encoder_input_data) np.save(f'./data/training/third-order/{station}/train_decoder_input_data.npy', train_decoder_input_data) np.save(f'./data/training/third-order/{station}/train_decoder_target_data.npy', train_decoder_target_data) 
np.save(f'./data/training/third-order/{station}/valid_encoder_input_data.npy', valid_encoder_input_data) np.save(f'./data/training/third-order/{station}/valid_decoder_input_data.npy', valid_decoder_input_data) np.save(f'./data/training/third-order/{station}/valid_decoder_target_data.npy', valid_decoder_target_data)
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning): X = np.zeros((100, 2)) imputer = IterativeImputer(max_iter=max_iter, tol=tol) with pytest.raises(error_type, match=warning): imputer.fit_transform(X)
def data_preprocessing(dat: pd.DataFrame, art='C', y=None, logger=None, remove=True): """ Encoding + remove columns with more than 1/2 na if remove==True + remove columns with all na + imputation if art == 'C', will do LabelEncoding first for the target column ================ Parameter: ================ dat - type of DataFrame art - type of string either C for classifcation of R for regression. indicates the type of problem y - type of string the name of the target column; if None, set the last column of the data set as target considering only one column for label logger - type of Logger remove - type of boolean whether remove the columns with na value more than half length or not ================= Output ================= dat - type of Dataframe the dataframe after preprocessing cols - type of list of string the name of the numerical columns """ if logger == None: logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) logger.info('Start data preprocessing') # replace original indeices with default ones dat = dat.reset_index(drop=True) if art == 'C': logger.info('Start to label target feature y for classification task') dat.iloc[:, -1] = LabelEncoder().fit_transform(dat.iloc[:, -1]) logger.info('End with label encoding the target feature') if remove: # remove columns with more than 1/2 na dat = dat.loc[:, dat.isna().sum() / len(dat) < .5] logger.info( 'Following features are removed from the dataframe because half of their value are NA: %s' % (dat.columns[dat.isna().sum() / len(dat) > .5].to_list())) # Encoding oe = OneHotEncoder(drop='first') # get categorical columns if y: dat_y = dat[[y]] cols = dat.columns.to_list() cols.remove(y) dat_x = dat[cols] else: dat_y = dat[[dat.columns[-1]]] dat_x = dat[dat.columns[:-1]] dat_categ = dat_x.select_dtypes(include=['object']) # get kterm of categ features for i in dat_categ.columns: # save output to dat tmp = dat_x[i].value_counts() dat_x[i + '_kterm'] = dat_x[i].map(lambda x: tmp[x] if x in tmp.index else 0) # float columns including the k term cols dat_numeric = dat_x.select_dtypes( include=['float32', 'float64', 'int32', 'int64']) # onehot encoding and label encoding dat_categ_onehot = dat_categ.iloc[:, dat_categ.apply(lambda x: len(x.unique()) ).values < 8] dat_categ_label = dat_categ.iloc[:, dat_categ.apply(lambda x: len(x.unique()) ).values >= 8] flag_onehot = False flag_label = False # oe if dat_categ_onehot.shape[1] > 0: logger.info( 'Start to do onehot to the following categoric features: %s' % (str(dat_categ_onehot.columns.to_list()))) dat_onehot = pd.DataFrame( oe.fit_transform(dat_categ_onehot.astype(str)).toarray(), columns=oe.get_feature_names(dat_categ_onehot.columns)) logger.info('End with onehot') flag_onehot = True else: dat_onehot = None # le if dat_categ_label.shape[1] > 0: logger.info( 'Start to do label encoding to the following categoric features: %s' % (str(dat_categ_label.columns.to_list()))) dat_categ_label = dat_categ_label.fillna('NULL') dat_label = pd.DataFrame(columns=dat_categ_label.columns) for i in dat_categ_label.columns: dat_label[i] = LabelEncoder().fit_transform( dat_categ_label[i].astype(str)) flag_label = True logger.info('End with label encoding') else: dat_label = None # scaling # combine dat_new = pd.DataFrame() if flag_onehot and flag_label: dat_new = pd.concat([dat_numeric, dat_onehot, dat_label], axis=1) elif flag_onehot: dat_new = pd.concat([dat_numeric, dat_onehot], axis=1) elif flag_label: dat_new = 
pd.concat([dat_numeric, dat_label], axis=1) else: dat_new = dat_numeric dat_new = pd.concat([dat_new, dat_y], axis=1) # imputation dat_new = dat_new.dropna(axis=1, how='all') if dat_new.isna().sum().sum() > 0: logger.info( 'Nan value exist, start to fill na with iterative imputer: ' + str(dat_new.isna().sum().sum())) # include na value, impute with iterative Imputer or simple imputer columns = dat_new.columns imp = IterativeImputer(max_iter=10, random_state=0) # imp = SimpleImputer(missing_values=np.nan, strategy='mean') dat_new = imp.fit_transform(dat_new) dat_new = pd.DataFrame(dat_new, columns=columns) dat_numeric = dat_new.iloc[:, :-1].select_dtypes( include=['float32', 'float64', 'int32', 'int64']) logger.info('End with filling nan') return dat_new, dat_numeric.columns
print('Iteration', i + 1) # ### Split Data X_train, X_test, y_train, y_test = train_test_split( df.values, labels.values.ravel(), train_size=train_size, shuffle=True, stratify=labels.values.ravel()) # ### Impute Data if data_impute: imp = IterativeImputer(max_iter=25, random_state=1337) X_train = imp.fit_transform(X_train) X_test = imp.transform(X_test) # ### Augment Data if smote_ratio > 0: smote = SMOTE(sampling_strategy='all', random_state=1337, k_neighbors=5, n_jobs=1) X_train, y_train = smote.fit_resample(X_train, y_train) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test)
smooth, hpass)]['topology']) vect_all.append(np.concatenate(vects, axis=1)) del vects X_top = np.swapaxes(np.hstack(vect_all), 0, 1) Y = np.array(id_list) try: df_summary.at[i, 'grid'] = (atlas, est, clust, _k, smooth, hpass) bad_ixs = [i[1] for i in np.argwhere(np.isnan(X_top))] for m in set(bad_ixs): if (X_top.shape[0] - bad_ixs.count(m)) / X_top.shape[0] < 0.50: X_top = np.delete(X_top, m, axis=1) imp = IterativeImputer(max_iter=50, random_state=42) X_top = imp.fit_transform(X_top) scaler = StandardScaler() X_top = scaler.fit_transform(X_top) discr_stat_val, rdf = discr_stat(X_top, Y) df_summary.at[i, 'discriminability'] = discr_stat_val print(discr_stat_val) #print(rdf) del discr_stat_val i += 1 except: i += 1 continue elif modality == 'dwi': gen_hyperparams = ['est', 'clust', '_k'] for col in cols: build_hp_dict(col,
def por_facies_imputer(dataframe):
    """
    Imputes missing porosity and facies labels using iterative imputation with
    random-forest estimators

    Args:
        dataframe ([DataFrame]): The dataframe should include the following
            columns: ['X', 'Y', 'depth', 'por', 'rho', 'facies']
    Returns:
        df_original ([DataFrame])
    """
    df_original = dataframe.copy(deep=False)
    df = df_original.loc[:, ['X', 'Y', 'depth', 'por', 'rho', 'facies']]
    categorical = ['facies']
    numerical = ['X', 'Y', 'depth', 'por', 'rho']
    df['Imputed'] = (df.isnull().sum(axis=1)) > 0
    df[categorical] = df[categorical].apply(lambda series: pd.Series(
        LabelEncoder().fit_transform(series[series.notnull()]),
        index=series[series.notnull()].index))
    # Instantiate imputers
    imp_num = IterativeImputer(estimator=RandomForestRegressor(),
                               initial_strategy='mean',
                               max_iter=20,
                               random_state=0)
    imp_cat = IterativeImputer(estimator=RandomForestClassifier(),
                               initial_strategy='most_frequent',
                               max_iter=20,
                               random_state=0)
    # Fit
    df[numerical] = imp_num.fit_transform(df[numerical])
    df[categorical] = imp_cat.fit_transform(df[categorical])
    # Perform corrections to facies information with density and porosity values
    df['facies'] = np.where((df.por < 0.1) & (df.rho > 2.40), 1, df.facies)
    df['facies'] = np.where((df.por < 0.08) & (df.rho < 2.25), 2, df.facies)
    df['facies'] = np.where(
        (df.por < 0.13) & (df.por > 0.08) & (df.rho < 2.40), 3, df.facies)
    # Update por, rho and facies with the predicted values for missing data
    df_original["por"] = df["por"]
    df_original["rho"] = df["rho"]
    df_original["facies"] = df["facies"]
    facies_map = {0: 'SS', 1: 'SS-Sh', 2: 'Sh', 3: 'Sh-SS'}
    df_original["facies"] = df_original["facies"].map(facies_map)
    print('---------------------------------')
    print('Porosity initial missing values = ' +
          str(dataframe['por'].isna().sum()))
    print('Porosity final missing values = ' +
          str(df_original['por'].isna().sum()))
    print('Facies initial missing values = ' +
          str(dataframe['facies'].isna().sum()))
    print('Facies final missing values = ' +
          str(df_original['facies'].isna().sum()))
    print('---------------------------------')
    return df_original
################################################################################
# The remaining missing values will be imputed via IterativeImputer:
# it models each feature with missing values as a function of the other
# features, and uses that estimate for imputation.
# If we don't put the categorical values back later, maybe we can use some
# encoding instead (a sketch follows below)?
X_train = train.drop(columns=Categorical)
X_train.drop(columns='TARGET', inplace=True)
X_test = test.drop(columns=Categorical)

# Impute
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

filler = IterativeImputer()
X_train_filled = filler.fit_transform(X_train)
X_test_filled = filler.transform(X_test)
X_train_filled = pd.DataFrame(X_train_filled, columns=list(X_train))
X_test_filled = pd.DataFrame(X_test_filled, columns=list(X_test))

train = pd.concat([train[Categorical], X_train_filled, train['TARGET']], axis=1)
test = pd.concat([test[Categorical], X_test_filled], axis=1)

# Final check:
miss(train, 1)
miss(test, 1)

# # If we need to standardize data:
# from sklearn import preprocessing
# X_scaled = preprocessing.StandardScaler().fit_transform(X)
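# One way to answer the question above: instead of setting the categorical
# columns aside, ordinal-encode them so IterativeImputer can use them as
# predictors. A minimal sketch; `train`, `test` and `Categorical` refer to the
# objects in the snippet above, and the encoder settings are assumptions.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_cat = pd.DataFrame(enc.fit_transform(train[Categorical].astype(str)),
                         columns=Categorical, index=train.index)
test_cat = pd.DataFrame(enc.transform(test[Categorical].astype(str)),
                        columns=Categorical, index=test.index)

X_train_enc = pd.concat(
    [train_cat, train.drop(columns=list(Categorical) + ['TARGET'])], axis=1)
X_test_enc = pd.concat([test_cat, test.drop(columns=Categorical)], axis=1)

filler_enc = IterativeImputer()
X_train_enc_filled = filler_enc.fit_transform(X_train_enc)
X_test_enc_filled = filler_enc.transform(X_test_enc)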
loans_imp_meanDF = pd.DataFrame(loans_imp_mean, columns=numeric_cols.columns) print("\n\nDataframe info after imputation of numeric columns with mean") # Check the DataFrame's info print(loans_imp_meanDF.info()) ##Impute with IterativeImputer #https://scikit-learn.org/stable/modules/impute.html#iterative-imputer #at each step, a feature column is designated as output y and the other feature #columns are treated as inputs X. A regressor is fit on (X, y) for known y. #Then, the regressor is used to predict the missing values of y. This is done #for each feature in an iterative fashion, and then is repeated for max_iter imputation rounds. # Iteratively impute imp_iter = IterativeImputer(max_iter=5, sample_posterior=True, random_state=123) loans_imp_iter = imp_iter.fit_transform(numeric_cols) # Convert returned array to DataFrame loans_imp_iterDF = pd.DataFrame(loans_imp_iter, columns=numeric_cols.columns) # Check the DataFrame's info print("\n\nDataframe info after iterative imputation of numeric columns") print(loans_imp_iterDF.info()) ########## Replace outliers - Winsorization # Print: before dropping print("\n\nDetect and replace outliers") df = df_filled print(df) numeric_cols = df.select_dtypes(include=[np.number]) print(numeric_cols.mean()) print(numeric_cols.median()) print(numeric_cols.max())
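# A hand-rolled illustration of the single step the IterativeImputer comments
# earlier in this snippet describe: treat one column as y, fit a regressor on
# the rows where y is observed, and predict y where it is missing.
# IterativeImputer repeats this for every incomplete column over `max_iter`
# rounds. The toy array and the choice of BayesianRidge (IterativeImputer's
# default estimator) are illustrative assumptions.
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(0)
data = rng.normal(size=(50, 4))
data[rng.choice(50, 5, replace=False), 0] = np.nan  # column 0 has gaps

y_col = 0
observed = ~np.isnan(data[:, y_col])
X_other = np.delete(data, y_col, axis=1)

reg = BayesianRidge()
reg.fit(X_other[observed], data[observed, y_col])
data[~observed, y_col] = reg.predict(X_other[~observed])
print(np.isnan(data).sum())  # 0 -- the gaps in column 0 are now filled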
# %%
medicamentos.columns
dum_reg = pd.get_dummies(medicamentos.REGIONAL_EPS_DESC)
dum_medicamento = dum_sign(medicamentos.NOMBRE_MEDICAMENTO, 0.01)
dum_diag = dum_sign(medicamentos.DIAGNOSTICO_EPS_DESC, 0.01)
aver = pd.concat([
    dum_reg, dum_medicamento, dum_diag,
    medicamentos.FECHA_EMISION.apply(lambda x: x.timestamp()),
    medicamentos.NUMERO_CANTIDAD_PRESTACIONES
], axis=1)
medicamentos_imputed = IterativeImputer(random_state=141854)
sal = medicamentos_imputed.fit_transform(aver)
# %%
medicamentos["NumeroCantidadPrestacionesImputado"] = sal[:, -1]
# %%
medicamentos["NumeroCantidadPrestacionesImputadoInd"] = pd.isna(
    medicamentos.NUMERO_CANTIDAD_PRESTACIONES).astype(int)
# %%
medicamentos[medicamentos.NumeroCantidadPrestacionesImputadoInd == 1]
# %%
## The goal of these scripts is to build the adherence observation cases.
## The methodology is to check whether another medication delivery exists close
## to the end date of each delivery (a delta of 5 days is allowed).
def load_both_data(project, metric): understand_path = 'data/understand_files_all/' + project + '_understand.csv' understand_df = pd.read_csv(understand_path) understand_df = understand_df.dropna(axis=1, how='all') cols_list = understand_df.columns.values.tolist() for item in ['Kind', 'Name', 'commit_hash', 'Bugs']: if item in cols_list: cols_list.remove(item) cols_list.insert(0, item) understand_df = understand_df[cols_list] cols = understand_df.columns.tolist() understand_df = understand_df.drop_duplicates(cols[4:len(cols)]) understand_df['Name'] = understand_df.Name.str.rsplit('.', 1).str[1] commit_guru_file_level_path = 'data/commit_guru_file/' + project + '.csv' commit_guru_file_level_df = pd.read_csv(commit_guru_file_level_path) commit_guru_file_level_df[ 'commit_hash'] = commit_guru_file_level_df.commit_hash.str.strip('"') commit_guru_file_level_df = commit_guru_file_level_df[ commit_guru_file_level_df['file_name'].str.contains('.java')] commit_guru_file_level_df[ 'Name'] = commit_guru_file_level_df.file_name.str.rsplit( '/', 1).str[1].str.split('.').str[0].str.replace('/', '.') commit_guru_file_level_df = commit_guru_file_level_df.drop('file_name', axis=1) df = understand_df.merge(commit_guru_file_level_df, how='left', on=['commit_hash', 'Name']) cols = df.columns.tolist() cols.remove('Bugs') cols.append('Bugs') df = df[cols] for item in ['Kind', 'Name', 'commit_hash']: if item in cols: df = df.drop(labels=[item], axis=1) # df.dropna(inplace=True) df = df.drop_duplicates() df.reset_index(drop=True, inplace=True) y = df.Bugs X = df.drop('Bugs', axis=1) cols = X.columns scaler = MinMaxScaler() X = scaler.fit_transform(X) X = pd.DataFrame(X, columns=cols) imp_mean = IterativeImputer(random_state=0) X = imp_mean.fit_transform(X) X = pd.DataFrame(X, columns=cols) if metric == 'process': X = X[[ 'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev', 'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev', 'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm', 'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd', 'file_sctr' ]] elif metric == 'product': X = X.drop([ 'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev', 'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev', 'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm', 'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd', 'file_sctr' ], axis=1) else: X = X return X, y