def test_iterative_imputer_verbose():
    rng = np.random.RandomState(0)

    n = 100
    d = 3
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1)
    imputer.fit(X)
    imputer.transform(X)
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2)
    imputer.fit(X)
    imputer.transform(X)
def test_iterative_imputer_truncated_normal_posterior():
    # test that the values that are imputed using `sample_posterior=True`
    # with boundaries (`min_value` and `max_value` are not None) are drawn
    # from a distribution that looks gaussian via the Kolmogorov Smirnov test.
    # note that starting from the wrong random seed will make this test fail
    # because random sampling doesn't occur at all when the imputation
    # is outside of the (min_value, max_value) range
    pytest.importorskip("scipy", minversion="0.17.0")

    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)
    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    # guard against a zero standard deviation before normalizing
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject null hypothesis
    # null hypothesis: distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does not appear to be normal"
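# A minimal companion sketch for the test above: when sample_posterior=True and
# min_value/max_value are set, IterativeImputer draws from a truncated normal
# (scipy.stats.truncnorm), which is why every imputation lands in [0, 0.5].
# The mean and scale below are illustrative, not taken from a fitted imputer.
from scipy.stats import truncnorm

mu_, sigma_ = 0.2, 0.3
low, high = 0.0, 0.5
a, b = (low - mu_) / sigma_, (high - mu_) / sigma_  # standardized bounds
draws = truncnorm.rvs(a, b, loc=mu_, scale=sigma_, size=100, random_state=42)
assert draws.min() >= low and draws.max() <= high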
def test_iterative_imputer_additive_matrix():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            X_filled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01)
def MultiIterTrees(dataset):
    from sklearn.impute import IterativeImputer
    Dim = dataset['d']
    trainX = dataset['train_x']
    testX = dataset['test_x']
    trainM = dataset['train_m']
    testM = dataset['test_m']
    # Train_No = dataset['train_no']
    # Test_No = dataset['test_no']

    test_X = testX.copy()
    train_X = trainX.copy()
    train_X[trainM == 0] = np.nan
    test_X[testM == 0] = np.nan

    # ExtraTrees imputation
    etr_estimator = ExtraTreesRegressor(n_estimators=10, random_state=0)
    etr_imp = IterativeImputer(random_state=0, estimator=etr_estimator)
    etr_imp.fit(train_X)
    imputed_X = etr_imp.transform(test_X)
    print('>>>ExtraTreesRegressor IterativeImputer result: \n')
    print(imputed_X)
    _all_rmse = compute_rmse(testX, imputed_X, testM)
    print('>>>all_rmse', _all_rmse)
    return _all_rmse
def MultiIterBayesian(dataset):
    Dim = dataset['d']
    trainX = dataset['train_x']
    testX = dataset['test_x']
    trainM = dataset['train_m']
    testM = dataset['test_m']
    # Train_No = dataset['train_no']
    # Test_No = dataset['test_no']

    test_X = testX.copy()
    train_X = trainX.copy()
    train_X[trainM == 0] = np.nan
    test_X[testM == 0] = np.nan

    # Bayesian imputation
    br_estimator = BayesianRidge()
    by_imp = IterativeImputer(random_state=0, estimator=br_estimator)
    by_imp.fit(train_X)
    imputed_X = by_imp.transform(test_X)
    print('>>>BayesianRidge IterativeImputer result: \n')
    print(imputed_X)
    _all_rmse = compute_rmse(testX, imputed_X, testM)
    print('>>>all_rmse', _all_rmse)
    return _all_rmse
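# A hypothetical sketch of the compute_rmse helper assumed by MultiIterTrees and
# MultiIterBayesian above (its real definition is not shown here). It follows the
# mask convention used in both functions: m == 0 marks the entries that were
# hidden before imputation, so the error is scored only on those entries.
import numpy as np

def compute_rmse(true_x, imputed_x, m):
    missing = (np.asarray(m) == 0)
    diff = np.asarray(true_x)[missing] - np.asarray(imputed_x)[missing]
    return float(np.sqrt(np.mean(diff ** 2)))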
def test_iterative_imputer_transform_stochasticity():
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10,
                             random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               sample_posterior=True,
                               random_state=rng1)
    imputer.fit(X)

    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    imputer1 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng1)
    imputer2 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng2)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)
def main():
    configs = json.load(
        open('MachineLearning/Models/LSTM/Configuration.json', 'r'))
    if not os.path.exists(configs['model']['save_dir']):
        os.makedirs(configs['model']['save_dir'])

    time_series = pd.read_csv(clustered_timeseries_path +
                              "TimeSeriesAggregatedClusteredDeltaTwoDays.csv")
    print(time_series.shape)
    # configs['data']['train_test_split'],  # the split
    # configs['data']['columns_dynamic']  # the columns

    # Impute and scale data
    dynamic_features = configs['data']['dynamic_columns']
    grouping = configs['data']['grouping']
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(time_series[dynamic_features])
    time_series[dynamic_features] = imp.transform(
        time_series[dynamic_features])
    time_series = scale(time_series, dynamic_features)

    X = time_series[dynamic_features]
    groups = np.array(time_series[grouping])

    for outcome in configs['data']['classification_outcome']:
        y = time_series[outcome]
        y = y.astype(int)

        model = Model(configs['model']['name'] + outcome)
        print(grouping)
        print(len(set(time_series[grouping])))
        model.build_model(configs)

        i = 0
        # CROSS-VALIDATION
        for ffold_ind, (training_ind, testing_ind) in enumerate(
                stratified_group_k_fold(X, y, groups, k=10)):
            training_groups, testing_groups = groups[training_ind], groups[
                testing_ind]
            this_y_train, this_y_val = y[training_ind], y[testing_ind]
            this_X_train, this_X_val = X.iloc[training_ind], X.iloc[
                testing_ind]
            assert len(set(training_groups) & set(testing_groups)) == 0

            print(" X SHAPE: ", this_X_train.shape)
            print(" Y shape: ", this_y_train.shape)

            input_timesteps = 24
            input_dim = 2
            if i == 0:
                # (NumberOfExamples, TimeSteps, FeaturesPerStep)
                model.train((this_X_train.values).reshape(-1, 24, 35),
                            (this_y_train.values).reshape(-1, 24, 1))
            i = i + 1
def problem2_3_3(data):
    data[3].loc[data[3] == 0] = np.nan
    imp = IterativeImputer(missing_values=np.nan)
    imp.fit(data)
    newdata = np.round(imp.transform(data))
    area = newdata[:, 2].tolist()
    print("Use Multivariate:", problem2_3_1(area))
    return "as shown in the plots"
def iterative_inputer_integer(self, df):
    df_copy = df.copy()
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(df_copy)
    df_new = pd.DataFrame(np.round(imp.transform(df_copy)),
                          columns=df_copy.columns)
    df_new = df_new.astype('int32')
    return df_new
def impute_integrated_dataset(integrated):
    imputer = IterativeImputer(random_state=0)
    data = integrated.select_dtypes(exclude="object")
    imputer = imputer.fit(data)
    t_data = imputer.transform(data)
    integrated[data.columns] = t_data
    integrated = integrated.drop(columns=["Name", "Location"])
    integrated["Year"] = integrated["Year"].astype(int)
    return integrated
def main():
    df = get_raw_data()
    data_dict = pd.read_csv("data/WiDS Datathon 2020 Dictionary.csv")

    identifier_features = data_dict[data_dict["Category"] == "identifier"][
        "Variable Name"].tolist() + ["icu_id"]
    type__features = [
        "hospital_admit_source",
        "icu_admit_source",
        "icu_stay_type",
        "icu_type",
    ]
    redundant_features = ['readmission_status', 'apache_2_bodysystem']
    features_to_drop = identifier_features + type__features + redundant_features

    # keep features that have less than 70% of nulls
    cut_off_percentage = 0.3
    n_of_nulls = int(cut_off_percentage * df.shape[0])
    df = df.dropna(axis=1, thresh=n_of_nulls)

    numeric_features = data_dict[
        data_dict["Data Type"] == "numeric"]["Variable Name"].tolist() + [
            "bmi", "apache_2_diagnosis", "apache_3j_diagnosis"
        ]
    skewed_numeric_features = df.columns[df.columns.isin(numeric_features)]
    numeric_df = df[skewed_numeric_features]

    imp = IterativeImputer(max_iter=3, verbose=0)
    imp.fit(numeric_df)
    imputed_df = imp.transform(numeric_df)
    imputed_df = pd.DataFrame(imputed_df, columns=numeric_df.columns)

    categorical_features = data_dict[
        data_dict["Data Type"] != "numeric"]["Variable Name"].tolist()
    # remove ['bmi', 'apache_2_diagnosis', 'apache_3j_diagnosis'] non-categorical features
    categorical_features = [
        feature for feature in categorical_features
        if feature not in ["bmi", "apache_2_diagnosis", "apache_3j_diagnosis"]
    ]
    skewed_categorical_features = df.columns[df.columns.isin(
        categorical_features)]
    categorical_df = df[skewed_categorical_features]

    # fill the nulls with the most frequent values
    # df.series.mode() returns a series, so [0] extracts the value itself
    for feature in skewed_categorical_features:
        categorical_df[feature].fillna(categorical_df[feature].mode()[0],
                                       inplace=True)

    complete_df = pd.concat([imputed_df, categorical_df], axis=1)
    return complete_df
def split_data(X, y):
    in_test = X["year"] >= START_TEST_YEAR
    y_train = y[~in_test]
    y_test = y[in_test]
    # There is a whole bunch of missing label data for some countries, such that we
    # can't even forward- or backfill it. Making up your own labels is bad, but
    # we need some form of data and don't have time to find sources for what's missing.
    imputer = IterativeImputer(max_iter=100)
    imputer.fit(y_train)
    return (
        X[~in_test],
        X[in_test],
        imputer.transform(y_train),
        imputer.transform(y_test),
    )
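# Hedged usage sketch for split_data: START_TEST_YEAR is a module-level constant
# the function relies on, and all values below are illustrative only. Note the
# imputer is fit on y_train alone and merely applied to y_test, so test-period
# labels never influence the fitted imputation model.
START_TEST_YEAR = 2015  # hypothetical cutoff
X = pd.DataFrame({"year": [2013, 2014, 2015, 2016], "gdp": [1.0, 1.1, 1.2, 1.3]})
y = pd.DataFrame({"label": [0.5, np.nan, 0.7, np.nan]})
X_train, X_test, y_train_imp, y_test_imp = split_data(X, y)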
def imput_data_with_sklearn_imputer(df_daily):
    df_daily_interp = df_daily.copy()
    df_daily_interp["MES"] = df_daily_interp.index.month
    imputer = IterativeImputer(estimator=BayesianRidge(), random_state=1)
    imputer.fit(df_daily_interp.values)
    imputted_vals = imputer.transform(df_daily_interp.values)
    df_daily_interp.loc[:, :] = imputted_vals
    return df_daily_interp
def experiment_LinearRegression(df, df_full, score):
    start_time = time.time()
    imp = IterativeImputer(estimator=LinearRegression(),
                           random_state=0,
                           max_iter=10)
    imp.fit(df)
    df_filled = pd.DataFrame(imp.transform(df))
    score.loc['Linear Regression', 'r2_score'] = r2_score(df_full, df_filled)
    score.loc['Linear Regression', 'time'] = time.time() - start_time
def fill_chunk(fit_df, transform_df):
    estimator = RandomForestRegressor(n_estimators=10, n_jobs=8)
    imp = IterativeImputer(estimator=estimator, max_iter=5, random_state=0)
    imp.fit(fit_df)
    transformed = imp.transform(transform_df)
    imputed_df = pd.DataFrame(data=transformed,
                              index=transform_df.index,
                              columns=transform_df.columns)
    return imputed_df
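# Hedged usage sketch for fill_chunk: the frame below is synthetic and the chunk
# size is illustrative. Every chunk is imputed against the same reference fit, so
# a large frame can be processed piecewise under one training distribution.
rng = np.random.RandomState(0)
big_df = pd.DataFrame(rng.rand(30000, 4), columns=list('abcd'))
big_df[big_df < 0.05] = np.nan  # sprinkle in missing values
chunk_size = 10000
reference_df = big_df.iloc[:chunk_size]
chunks = [big_df.iloc[i:i + chunk_size] for i in range(0, len(big_df), chunk_size)]
imputed = pd.concat([fill_chunk(reference_df, chunk) for chunk in chunks])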
def use_imputation(df_list, train_x_columns):
    imputer = IterativeImputer(random_state=0, max_iter=30, verbose=2)
    imputer.fit(df_list[0][train_x_columns])
    for i in range(len(df_list)):
        df_list[i][train_x_columns] = imputer.transform(
            df_list[i][train_x_columns])
    return df_list
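# Hedged usage sketch for use_imputation (the frame names are hypothetical):
# the imputer is fit on the first frame in the list only, typically the training
# split, so the validation and test fills never use their own statistics.
train_df, valid_df, test_df = use_imputation(
    [train_df, valid_df, test_df], train_x_columns)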
def smooth_pert_corr():
    res = pd.read_csv('/work/GLEAM/perturbation_correction_v2/result.csv',
                      index_col=0)

    gpis_valid = get_valid_gpis(latmin=24., latmax=51.,
                                lonmin=-128., lonmax=-64.)
    ind_valid = np.unravel_index(gpis_valid, (720, 1440))

    imp = IterativeImputer(max_iter=10, random_state=0)
    ind = np.unravel_index(res.index.values, (720, 1440))
    for tag in ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']:
        img = np.full((720, 1440), np.nan)
        img[ind] = res[tag]

        # find all non-NaN values and clip to the 2.5-97.5 percentile range
        idx = np.where(~np.isnan(img))
        vmin, vmax = np.percentile(img[idx], [2.5, 97.5])
        img[img < vmin] = vmin
        img[img > vmax] = vmax

        # calculate fitting parameters
        imp.set_params(min_value=vmin, max_value=vmax)
        imp.fit(img)

        # Define an anchor pixel to infer fitted image dimensions
        # (columns with no observed values at fit time are dropped by the
        # initial imputer, so the transformed image can be narrower; the
        # anchor locates where the fitted block sits in the original grid)
        tmp_img = img.copy()
        tmp_img[idx[0][100], idx[1][100]] = 1000000

        # transform image with and without anchor pixel
        tmp_img_fitted = imp.transform(tmp_img)
        img_fitted = imp.transform(img)

        # Get indices of fitted image
        idx_anchor = np.where(tmp_img_fitted == 1000000)[1][0]
        start = idx[1][100] - idx_anchor
        end = start + img_fitted.shape[1]

        # write output
        img[:, start:end] = img_fitted
        img = gaussian_filter(img, sigma=0.6, truncate=1)

        res.loc[:, tag + '_s'] = img[ind_valid]

    res.to_csv('/work/GLEAM/perturbation_correction_v2/result_smoothed.csv',
               float_format='%.8f')
def fit(self, X, y=None):
    """Perform co-clustering.

    Parameters
    ----------
    X : numpy array or scipy sparse matrix, shape=(n_samples, n_features)
        Matrix to be analyzed
    """
    random_state = check_random_state(self.random_state)

    # NaNs are allowed here; missing entries are imputed below
    check_array(X, accept_sparse=True, dtype="numeric", order=None,
                copy=False, force_all_finite='allow-nan', ensure_2d=True,
                allow_nd=False, ensure_min_samples=self.n_row_clusters,
                ensure_min_features=self.n_col_clusters,
                warn_on_dtype=False, estimator=None)

    global indices
    indices = np.argwhere(np.isnan(X))
    if len(indices):
        imp = IterativeImputer(missing_values=np.nan,
                               sample_posterior=False,
                               max_iter=10,
                               tol=0.001,
                               n_nearest_features=4,
                               initial_strategy='most_frequent')
        imp.fit(X)
        X = imp.transform(X)

    check_positive(X)

    X = X.astype(float)

    criterion = self.criterion
    criterions = self.criterions
    row_labels_ = self.row_labels_
    column_labels_ = self.column_labels_
    delta_kl_ = self.delta_kl_

    seeds = random_state.randint(np.iinfo(np.int32).max, size=self.n_init)
    for seed in seeds:
        self._fit_single(X, seed, y)
        if np.isnan(self.criterion):
            raise ValueError("matrix may contain negative or "
                             "unexpected NaN values")
        # remember attributes corresponding to the best criterion
        if self.criterion > criterion:
            criterion = self.criterion
            criterions = self.criterions
            row_labels_ = self.row_labels_
            column_labels_ = self.column_labels_
            delta_kl_ = self.delta_kl_

    # update attributes
    self.criterion = criterion
    self.criterions = criterions
    self.row_labels_ = row_labels_
    self.column_labels_ = column_labels_
    self.delta_kl_ = delta_kl_

    return self
def train_model_iterative_fill(filename):
    pd.options.mode.chained_assignment = None
    df = pd.read_csv(filename, encoding='utf-16', sep='\t')
    groups = list(set(df[PAGAL_KA_SUGRUPUOTI_SPEJIMUS].astype(int)))

    estimators = [ExtraTreesRegressor(), BayesianRidge(),
                  KNeighborsRegressor(), DecisionTreeRegressor(),
                  RandomForestRegressor()]

    # the decision tree regressor works best
    # choose the algorithm
    estimator = estimators[0]

    # whether small values should be discarded
    new_filename = filename
    atmesti_mazas_reiksmes = True
    if atmesti_mazas_reiksmes:
        df = atmesti_mazas_tui(df)
        new_filename = filename.split('.')
        new_filename = new_filename[0] + '_be_mazu_tui.' + new_filename[1]
        df.to_csv(new_filename, sep='\t', encoding='utf-16', index=False)

    for group in groups:
        print('Filling indicator %s' % group)
        maindf = pd.read_csv(new_filename, encoding='utf-16', sep='\t')
        # select only the rows with a given PAGAL_KA_SUGRUPUOTI_SPEJIMUS value
        df = maindf.loc[maindf[PAGAL_KA_SUGRUPUOTI_SPEJIMUS] == group]
        X = shuffle(df)

        # drop the values that cannot be converted to numbers
        for name in ATMESTI:
            X = X.drop(name, axis=1)

        # drop empty columns (nothing can be predicted without a single example)
        for col in X:
            if X[col].isnull().all():
                X = X.drop(col, axis=1)

        # if there is at least one row that could be filled
        if len(X) > 0:
            index = list(X.index)
            columns = list(X.columns.values)

            # create and train the imputer
            imp = IterativeImputer(estimator=estimator, missing_values=np.nan)
            imp.fit(X)

            # fill the missing values of X
            X = imp.transform(X)

            # X comes back as an np.array, so convert it back to a pandas.DataFrame
            X = pd.DataFrame(data=X, index=index, columns=columns)
            maindf.update(X)

        new_filename = new_filename.split('.')[0] + '_updated.' + new_filename.split('.')[1]

        # save the predictions
        maindf.to_csv(new_filename, sep='\t', encoding='utf-16', index=False)

        # tidy up the file
        tidy_up_file(new_filename)

    return 0
def imputeAll(df, write=''):
    ''' impute all of the columns in the DF apart from URN '''
    import numpy as np
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    URNcol = df['URN'][:]
    originalCols = list(df.columns)
    originalCols.remove('URN')
    print('len(originalCols) after removing URN', len(originalCols))

    dfToFit = df.drop(['URN'], axis=1)
    print('dfToFit.shape', dfToFit.shape)

    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(dfToFit)
    print('imp.transform(dfToFit).shape', imp.transform(dfToFit).shape)

    fixed_df = pd.DataFrame(imp.transform(dfToFit), columns=originalCols)
    fixed_df['URN'] = URNcol

    if len(write) > 0:
        fixed_df.to_csv(write)

    return fixed_df
def internal(self, col_list):
    col_list1 = col_list.get('internal')
    data = self.data[self.data.columns.intersection(col_list1)]
    imp_mean = IterativeImputer(random_state=0)
    imp_mean.fit(data)
    data_iterative = pd.DataFrame(imp_mean.transform(data),
                                  columns=data.columns,
                                  index=data.index)
    data_iterative.to_csv('internal.csv', index=False)
    return data_iterative
def imputation(df):
    ## Imputation replaces the NaNs with an estimated value
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    imputer = IterativeImputer(max_iter=10, random_state=0)
    imputer.fit(df)
    df_num_imp = imputer.transform(df)
    df = pd.DataFrame(df_num_imp, columns=df.columns)
    return df
def impute_columns(life_expectancy_df):
    imputed_values = iterative_impute(
        life_expectancy_df,
        ["Population", "Measles", "Thinness 1-19 Years", "Under-five Deaths"],
        "Population")
    life_expectancy_df['Population'] = imputed_values[:, 0]

    imputed_values = iterative_impute(
        life_expectancy_df,
        ["Hepatitis B", "Diphtheria", "Polio", "Life Expectancy"],
        "Hepatitis B")
    life_expectancy_df['Hepatitis B'] = imputed_values[:, 0]

    imputed_values = iterative_impute(life_expectancy_df, [
        "GDP", "Percentage Expenditure", "Life Expectancy",
        "Income Composition Of Resources", "Schooling", "Alcohol", "BMI"
    ], "GDP")
    life_expectancy_df['GDP'] = imputed_values[:, 0]

    imputed_values = iterative_impute(
        life_expectancy_df,
        ["Total Expenditure", "Alcohol", "Schooling", "BMI"],
        "Total Expenditure")
    life_expectancy_df['Total Expenditure'] = imputed_values[:, 0]

    imputed_values = iterative_impute(life_expectancy_df, [
        "Alcohol", "Schooling", "Income Composition Of Resources",
        "Life Expectancy", "GDP", "Percentage Expenditure", "BMI",
        "Total Expenditure"
    ], "Alcohol")
    life_expectancy_df['Alcohol'] = imputed_values[:, 0]

    imputed_values = iterative_impute(life_expectancy_df, [
        "Schooling", "Alcohol", "Income Composition Of Resources",
        "Life Expectancy", "GDP", "Percentage Expenditure", "BMI",
        "Total Expenditure"
    ], "Schooling")
    life_expectancy_df['Schooling'] = imputed_values[:, 0]

    imputed_values = iterative_impute(life_expectancy_df, [
        "Income Composition Of Resources", "Life Expectancy", "BMI", "GDP",
        "Alcohol", "Diphtheria", "Percentage Expenditure", "Polio"
    ], "Income Composition Of Resources")
    life_expectancy_df['Income Composition Of Resources'] = imputed_values[:, 0]

    imputer = IterativeImputer(random_state=0)
    columns = [
        'Thinness 1-19 Years', 'BMI', 'Polio', 'Diphtheria',
        'Life Expectancy', 'Adult Mortality'
    ]
    data = life_expectancy_df[columns]
    imputer = imputer.fit(data)
    imputed_values = imputer.transform(data)
    life_expectancy_df[columns] = imputed_values

    return life_expectancy_df
def iterativemethod(self):
    import numpy as np
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    imp_mean = IterativeImputer(random_state=0)
    for featurem in self.missing_columns:
        # use all the other columns as predictors for the column being imputed
        param = [col for col in self.data.columns if col != featurem]
        cols = param + [featurem]
        imp_mean.fit(self.data[cols])
        self.data[featurem] = imp_mean.transform(self.data[cols])[:, -1]
    return self.data
def MICE(df):
    columns = df.columns
    imp = IterativeImputer(max_iter=100,
                           missing_values=0,
                           random_state=random.randint(0, 1000),
                           sample_posterior=True,
                           verbose=True)
    imp.fit(df)
    res = imp.transform(df)
    df = pd.DataFrame(res, columns=columns)
    return df
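# A note on the MICE wrapper above: missing_values=0 treats zeros as the missing
# marker, and sample_posterior=True draws one posterior sample per fit/transform,
# so pooling several calls approximates multiple imputation. The frame below is
# synthetic, and `import random` is assumed to be in scope for the wrapper.
import random
import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.rand(20, 3) + 0.1, columns=list('abc'))
demo.iloc[::5, 0] = 0  # mark every fifth row of column 'a' as missing
completions = [MICE(demo) for _ in range(5)]  # five posterior draws to pool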
def main():
    time_series_clustered_demographics = pd.read_csv(
        clustered_timeseries_path + "TimeSeriesAggregatedClustered.csv")
    time_series_clustered_demographics_not_old = pd.read_csv(
        clustered_timeseries_path + "TimeSeriesAggregatedClusteredNotOld.csv")
    time_series_clustered_baseline = pd.read_csv(
        clustered_timeseries_path + "TimeSeriesAggregatedClusteredBaseline.csv")
    time_series_clustered_twodays = pd.read_csv(
        clustered_timeseries_path + "TimeSeriesAggregatedClusteredDeltaTwoDays.csv")
    time_series = pd.read_csv(data_path + "TimeSeriesAggregated.csv")

    dynamic_features = [
        'Hour', 'ALT', 'Albumin', 'Anticoagulant clinic INR', 'Bicarbonate',
        'Biochemistry (Glucose)', 'Blood Lactate', 'C-Reactive-Protein',
        'CSF Glucose', 'Creatinine', 'Creatinine Clearance.', 'D-Dimer',
        'DiasBP', 'Estimated-GFR', 'Fasting Glucose.', 'Ferritin', 'FiO2',
        'Fluid Albumin.', 'Fluid Glucose.', 'GCSEye', 'GCSMotor', 'GCSVerbal',
        'HBA1c-DCCT', 'HBA1c-IFCC', 'Hb', 'HbA1c', 'HeartRate', 'INR',
        'Lactate', 'Lactate (CSF)', 'Lactate (plasma)',
        'Lactate-Dehydrogenase', 'Lymphocytes', 'Lymphocytes (LYMP)',
        'NEWS2', 'NT-pro-BNP', 'Neutrophils', 'OxygenDelivery',
        'OxygenLitres', 'OxygenSaturation', 'PCO2', 'PCV', 'PH', 'PLT',
        'PO2', 'PO2/FIO2', 'PainScore', 'Protein/Creatinine Ratio',
        'Random Glucose:', 'Random Urine pH', 'Random-Urine-Creatinine',
        'RespirationRate', 'Reticulocyte HB Content', 'SupplementalOxygen',
        'SysBP', 'Temperature', 'Troponin-I', 'Troponin-T',
        'U-albumin/creat. ratio', 'Urea', 'Urine Albumin conc.',
        'Urine Glucose', 'Urine Urea', 'Venous Bicarbonate', 'Venous PCO2',
        'Venous PO2', 'Venous pH', 'WBC', 'WBC count (CSF)',
        'WBC count (Fluid)', 'cHCO3'
    ]

    rfm = RandomForestClassifier(n_estimators=100, max_depth=4)
    lrm = LogisticRegression(solver='lbfgs')

    # ExperimentI(time_series_clustered_demographics)
    # ExperimentII(time_series_clustered_demographics_not_old)
    # ExperimentIII(time_series_clustered_baseline)
    # ExperimentIV(time_series_clustered_twodays)

    dynamic_features = [
        'Hour', 'ALT', 'Albumin', 'Blood Lactate', 'C-Reactive-Protein',
        'Creatinine', 'D-Dimer', 'DiasBP', 'Estimated-GFR', 'Ferritin',
        'FiO2', 'GCSMotor', 'GCSVerbal', 'Hb', 'HeartRate', 'INR',
        'Lymphocytes', 'NEWS2', 'Neutrophils', 'OxygenLitres',
        'OxygenSaturation', 'PCO2', 'PCV', 'PH', 'PLT', 'PO2', 'PO2/FIO2',
        'PainScore', 'SupplementalOxygen', 'SysBP', 'Temperature',
        'Troponin-T', 'Urea', 'WBC', 'cHCO3'
    ]

    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(time_series[dynamic_features])
    time_series[dynamic_features] = imp.transform(time_series[dynamic_features])
    time_series = scale(time_series, dynamic_features)

    ExperimentV(time_series)
def impute_model_approach(self, dataset, col):
    imp_mean = IterativeImputer(random_state=0, verbose=1, max_iter=100)
    for i in dataset.columns:
        if i == col:
            continue
        dataset = self.impute_interpolate(dataset, i)
    print(dataset.shape)
    imp_mean.fit(dataset)
    print(dataset.columns)
    X = imp_mean.transform(dataset)
    X = pd.DataFrame(X, index=dataset.index, columns=dataset.columns)
    return X
def TrainBDT(featureNames, trainingData, classificationArray):
    clf = ensemble.HistGradientBoostingClassifier()
    trainingData = trainingData[featureNames]  # Remove all irrelevant columns

    if cfg.balanceClasses:
        imp = IterativeImputer()
        imp.fit(trainingData)
        trainingData = imp.transform(trainingData)
        sm = smt(sampling_strategy=1)
        # fit_sample was renamed fit_resample in newer imbalanced-learn releases
        trainingData, classificationArray = sm.fit_sample(
            trainingData, classificationArray)

    clfView = clf.fit(trainingData, classificationArray)
    return clfView
def impute_data(self, selected_attributes):
    """
    X: which features to use to interpolate missing values
    y: which features to replace missing values
    """
    imp = IterativeImputer(max_iter=100, random_state=0)
    X = self.altered_dataframe[self.feature_list]
    y = self.altered_dataframe[selected_attributes]
    imp.fit(X, y)
    self.altered_dataframe = pd.DataFrame(data=imp.transform(X),
                                          columns=self.feature_list)
def imputeCVData(class_label, instance_label, categorical_variables,
                 data_train, data_test, random_state, header):
    # Begin by imputing categorical variables with simple 'mode' imputation
    mode_dict = {}
    for c in data_train.columns:
        if c in categorical_variables:
            train_mode = data_train[c].mode().iloc[0]
            data_train[c].fillna(train_mode, inplace=True)
            mode_dict[c] = train_mode
    for c in data_test.columns:
        if c in categorical_variables:
            data_test[c].fillna(mode_dict[c], inplace=True)

    # Now impute remaining ordinal variables
    if instance_label is None or instance_label == 'None':
        x_train = data_train.drop([class_label], axis=1).values
        x_test = data_test.drop([class_label], axis=1).values
    else:
        x_train = data_train.drop([class_label, instance_label], axis=1).values
        x_test = data_test.drop([class_label, instance_label], axis=1).values
        # pull out instance labels in case they include text
        inst_train = data_train[instance_label].values
        inst_test = data_test[instance_label].values
    y_train = data_train[class_label].values
    y_test = data_test[class_label].values

    # Impute features (x)
    imputer = IterativeImputer(random_state=random_state,
                               max_iter=30).fit(x_train)
    x_new_train = imputer.transform(x_train)
    x_new_test = imputer.transform(x_test)

    # Recombine x and y
    if instance_label is None or instance_label == 'None':
        data_train = pd.concat(
            [pd.DataFrame(y_train, columns=[class_label]),
             pd.DataFrame(x_new_train, columns=header)],
            axis=1, sort=False)
        data_test = pd.concat(
            [pd.DataFrame(y_test, columns=[class_label]),
             pd.DataFrame(x_new_test, columns=header)],
            axis=1, sort=False)
    else:
        data_train = pd.concat(
            [pd.DataFrame(y_train, columns=[class_label]),
             pd.DataFrame(inst_train, columns=[instance_label]),
             pd.DataFrame(x_new_train, columns=header)],
            axis=1, sort=False)
        data_test = pd.concat(
            [pd.DataFrame(y_test, columns=[class_label]),
             pd.DataFrame(inst_test, columns=[instance_label]),
             pd.DataFrame(x_new_test, columns=header)],
            axis=1, sort=False)

    return data_train, data_test, imputer, mode_dict
def iter_inputer(est, X_train, X_test, est_label,
                 y_train=None, y_test=None, max_iter=10):
    """
    iterative imputer with est
    e.g. est=RandomForestClassifier(n_estimators=n_estimators, n_jobs=1, max_depth=4)
    """
    imp = IterativeImputer(estimator=est, max_iter=max_iter)
    if y_train is not None:
        X_train['Y'] = y_train
        X_test['Y'] = y_test
    imp.fit(X_train)

    IDENT = '%s_Miter%d' % (est_label, max_iter)
    joblib.dump(imp, 'iter_imp_%s.joblib' % IDENT)

    X_train_it_imp = pd.DataFrame(imp.transform(X_train),
                                  columns=X_train.columns,
                                  index=X_train.index)
    X_test_it_imp = pd.DataFrame(imp.transform(X_test),
                                 columns=X_test.columns,
                                 index=X_test.index)
    if y_train is not None:
        X_train_it_imp = X_train_it_imp.drop(columns=['Y'])
        X_test_it_imp = X_test_it_imp.drop(columns=['Y'])

    X_train_it_imp.to_pickle('X_train_ii_%s.pkl' % IDENT)
    X_test_it_imp.to_pickle('X_test_ii_%s.pkl' % IDENT)
    return (imp, X_train_it_imp, X_test_it_imp)
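# Hedged usage sketch for iter_inputer: the estimator, label, and frames are
# illustrative. A regressor suits continuous features, and .copy() guards the
# callers' frames because the function adds a temporary 'Y' column in place;
# the call also persists the fitted imputer and both imputed frames to disk.
from sklearn.ensemble import RandomForestRegressor

est = RandomForestRegressor(n_estimators=50, n_jobs=1, max_depth=4)
imp, X_train_imp, X_test_imp = iter_inputer(
    est, X_train.copy(), X_test.copy(), est_label='rf50',
    y_train=y_train, y_test=y_test, max_iter=5)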
def test_iterative_imputer_zero_iters():
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    missing_flag = X == 0
    X[missing_flag] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    assert not np.all(imputer.transform(X) ==
                      imputer.initial_imputer_.transform(X))

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
def test_iterative_imputer_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(imputer.transform(X_test)[:, 0],
                    initial_imputer.transform(X_test)[:, 0])
def test_iterative_imputer_transform_recovery(rank):
    rng = np.random.RandomState(0)
    n = 100
    d = 100
    A = rng.rand(n, rank)
    B = rng.rand(rank, d)
    X_filled = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data in half
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.1)