lastcolumn_np = df.iloc[:, -1:].to_numpy()
lastcolumn_op = np.ravel(lastcolumn_np)

# In[105]:
df_copy.drop(['Target Variable (Discrete)', 'Feature 16', 'Feature 17'],
             axis=1, inplace=True)

# In[106]:
imp_mean = IterativeImputer(random_state=0)
imp_mean.fit(df_copy)
data = imp_mean.transform(df_copy)

# In[108]:
corr_features = set()
correlation_matrix = pd.DataFrame(data).corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            corr_features.add(colname)
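# The snippet above assumes `df`, `df_copy`, numpy, and pandas already exist.
# A minimal sketch of the imports it relies on; note that IterativeImputer is
# still experimental in scikit-learn, so the enabling import must come first.
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer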
# Split data in training data and validation data
x_trainCV = x_train.values[train_index]
x_testCV = x_train.values[test_index]
y_trainCV = y_train.values[train_index]
y_testCV = y_train.values[test_index]

# Replacement of NaN's
# Default: SimpleImputer
imp = SimpleImputer(missing_values=np.nan,
                    strategy=replace_nan_strategy)
if replace_nan_method == 'Iterative':
    imp = IterativeImputer(n_nearest_features=replace_nan_strategy)
x_trainCV = imp.fit_transform(x_trainCV, y_trainCV)
x_testCV = imp.transform(x_testCV)

# Outlier detection
print('before: ', x_trainCV.shape)
iso = IsolationForest(contamination=contamination).fit(x_trainCV, y_trainCV)
clfTrain = iso.predict(x_trainCV)
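# The snippet is cut off right after the outlier predictions are computed.
# A minimal, hypothetical continuation (not the original code): keep only the
# rows IsolationForest marks as inliers (predict() returns +1 for inliers
# and -1 for outliers).
inlier_mask = clfTrain == 1
x_trainCV = x_trainCV[inlier_mask]
y_trainCV = y_trainCV[inlier_mask]
print('after: ', x_trainCV.shape)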
# =============================================================================
# #Normalisation
# =============================================================================
from sklearn import preprocessing

temp_features = temp_features.iloc[:, :].values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
temp_features = min_max_scaler.fit_transform(temp_features)
temp_features = pd.DataFrame(temp_features, columns=feature_list)

# =============================================================================
# #Imputation
# =============================================================================
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(random_state=0, max_iter=50, imputation_order='random')
imp.fit(temp_features)
features_imp = imp.transform(temp_features)
imp = None

import gc
gc.collect()

features_imp = pd.DataFrame(features_imp, columns=feature_list)
features = features_imp.copy()
features = features.join(pd.DataFrame(temp_features_label,
                                      columns=['Longterm_TransplantOutcome']))
features = features.join(pd.DataFrame(temp_features_tenure,
                                      columns=['tenure']))
features = features.join(pd.DataFrame(temp_features_transplantationIDs,
                                      columns=['TransplantationID']))
features = features.join(pd.DataFrame(temp_features_patientIDs,
                                      columns=['PatientID']))
features.to_csv(r'T:\\tbase\\tbase_data_imputed.csv')
###################################
def load_data_release_level(project, metric): understand_path = 'data/understand_files_all/' + project + '_understand.csv' understand_df = pd.read_csv(understand_path) understand_df = understand_df.dropna(axis=1, how='all') cols_list = understand_df.columns.values.tolist() for item in ['Kind', 'Name', 'commit_hash', 'Bugs']: if item in cols_list: cols_list.remove(item) cols_list.insert(0, item) understand_df = understand_df[cols_list] cols = understand_df.columns.tolist() understand_df = understand_df.drop_duplicates(cols[4:len(cols)]) understand_df['Name'] = understand_df.Name.str.rsplit('.', 1).str[1] commit_guru_file_level_path = 'data/commit_guru_file/' + project + '.csv' commit_guru_file_level_df = pd.read_csv(commit_guru_file_level_path) commit_guru_file_level_df[ 'commit_hash'] = commit_guru_file_level_df.commit_hash.str.strip('"') commit_guru_file_level_df = commit_guru_file_level_df[ commit_guru_file_level_df['file_name'].str.contains('.java')] commit_guru_file_level_df[ 'Name'] = commit_guru_file_level_df.file_name.str.rsplit( '/', 1).str[1].str.split('.').str[0].str.replace('/', '.') commit_guru_file_level_df = commit_guru_file_level_df.drop('file_name', axis=1) release_df = pd.read_pickle('data/release/' + project + '_release.pkl') release_df = release_df.sort_values('created_at', ascending=False) release_df = release_df.reset_index(drop=True) release_df['created_at'] = pd.to_datetime(release_df.created_at) release_df['created_at'] = release_df.created_at.dt.date commit_guru_path = 'data/commit_guru/' + project + '.csv' commit_guru_df = pd.read_csv(commit_guru_path) cols = understand_df.columns.tolist() commit_guru_df['created_at'] = pd.to_datetime( commit_guru_df.author_date_unix_timestamp, unit='s') commit_guru_df['created_at'] = commit_guru_df.created_at.dt.date commit_guru_df = commit_guru_df[['commit_hash', 'created_at']] df = understand_df.merge(commit_guru_file_level_df, how='left', on=['commit_hash', 'Name']) df = df.merge(commit_guru_df, how='left', on=['commit_hash']) cols = df.columns.tolist() cols.remove('Bugs') cols.append('Bugs') df = df[cols] file_names = df.Name commit_hash = df.commit_hash for item in ['Kind', 'Name', 'commit_hash']: if item in cols: df = df.drop(labels=[item], axis=1) df = df.drop_duplicates() df.reset_index(drop=True, inplace=True) created_at = df.created_at df = df.drop('created_at', axis=1) y = df.Bugs X = df.drop('Bugs', axis=1) cols = X.columns scaler = MinMaxScaler() X = scaler.fit_transform(X) X = pd.DataFrame(X, columns=cols) imp_mean = IterativeImputer(random_state=0) X = imp_mean.fit_transform(X) X = pd.DataFrame(X, columns=cols) X['created_at'] = created_at if metric == 'process': X = X[[ 'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev', 'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev', 'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm', 'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd', 'file_sctr', 'created_at' ]] elif metric == 'product': X = X.drop([ 'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev', 'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev', 'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm', 'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd', 'file_sctr' ], axis=1) else: X = X df = X df['Name'] = file_names df['Bugs'] = y accepted_commit_dates = [] all_data = pd.DataFrame() for i in range(release_df.shape[0] - 1): sub_df = df[df['created_at'] <= release_df.loc[i, 'created_at']] sub_df = sub_df[sub_df['created_at'] > 
release_df.loc[i + 1, 'created_at']] sub_df.sort_values(by=['created_at'], inplace=True, ascending=False) sub_df.drop_duplicates(['Name'], inplace=True) all_data = pd.concat([all_data, sub_df], axis=0) all_data = all_data.drop('created_at', axis=1) return all_data
for n_estimators, max_iter in [(e, i) for e in [10, 100] for i in [10, 100]]:
    x_train = x_train0
    y_train = y_train0

    # 1. Missing Values
    est = ExtraTreesRegressor(n_estimators=n_estimators,
                              random_state=42,
                              max_features='sqrt',
                              n_jobs=10,
                              verbose=0)
    imputer = IterativeImputer(estimator=est,
                               max_iter=max_iter,
                               tol=0.001,
                               n_nearest_features=100,
                               initial_strategy='median',
                               imputation_order='ascending',
                               verbose=2,
                               random_state=0)
    x_train_filled = imputer.fit_transform(x_train)
    x_train = pd.DataFrame(x_train_filled)

    # 2. Outliers detection
    clf = IsolationForest(n_estimators=150,
                          max_samples=1000,
                          contamination=0.02,
                          max_features=1.0,
                          bootstrap=False,
                          n_jobs=10,
                          # note: 'behaviour' was deprecated in scikit-learn 0.22
                          # and removed in 0.24; drop it on newer versions
                          behaviour='old',
for i in range(num_iter):
    print('Iteration', i + 1)

    # ### Split Data
    X_train, X_test, y_train, y_test = train_test_split(
        df.values,
        labels.values.ravel(),
        train_size=train_size,
        shuffle=True,
        stratify=labels.values.ravel())

    # ### Impute Data
    if data_impute:
        imp = IterativeImputer(max_iter=25, random_state=1337)
        X_train = imp.fit_transform(X_train)
        X_test = imp.transform(X_test)

    # ### Augment Data
    if smote_ratio > 0:
        smote = SMOTE(sampling_strategy='all',
                      random_state=1337,
                      k_neighbors=5,
                      n_jobs=1)
        X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
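# The loop above re-runs impute -> oversample -> scale by hand on every
# iteration. A minimal sketch (an alternative, not the original code) of the
# same sequence as an imbalanced-learn Pipeline, so each step is re-fit on the
# training portion of every fold automatically:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ('impute', IterativeImputer(max_iter=25, random_state=1337)),
    ('smote', SMOTE(sampling_strategy='all', random_state=1337)),
    ('scale', StandardScaler()),
])
# pipe can then be extended with a classifier and passed to cross_val_score.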
headers = list(train_features)
rare = []
common = []
tot = train_features.shape[0] + test_features.shape[0]  # total entries per column
for head in headers:
    # get number of missing values in this column
    missing = train_features[head].isna().sum() + test_features[head].isna().sum()
    if missing / tot >= 0.9:
        rare.append(head)
    else:
        common.append(head)

# impute and compute features for each patient
imp = IterativeImputer()
features = ["pid", "Age"] + rare
common.remove("pid")
common.remove("Age")
for c in common:
    features.append(c + "_mean")
    features.append(c + "_min")
    features.append(c + "_max")
    features.append(c + "_median")
X_feat = pd.DataFrame(index=all_pids, data={"pid": all_pids}, columns=features)
skip = False
for pid in all_pids:
def run(argv=None): """Emulate a HP search and monitor fit time.""" args = parser.parse_args(argv) imputers = { 'Mean': SimpleImputer(strategy='mean'), 'Mean+mask': SimpleImputer(strategy='mean', add_indicator=True), 'Med': SimpleImputer(strategy='median'), 'Med+mask': SimpleImputer(strategy='median', add_indicator=True), 'Iterative': IterativeImputer(max_iter=args.max_iter), 'Iterative+mask': IterativeImputer(add_indicator=True, max_iter=args.max_iter), 'IterativeR': IterativeImputer(estimator=RidgeCV(), max_iter=args.max_iter), 'IterativeR+mask': IterativeImputer(estimator=RidgeCV(), add_indicator=True, max_iter=args.max_iter), 'KNN': KNNImputer(), 'KNN+mask': KNNImputer(add_indicator=True), } task_name = args.task_name est = args.est imp = imputers.get(args.imp, None) if task_name is None or est is None: logger.info('No argv given.') task_name = 'TB/shock_hemo' est = 'HGBC' task = tasks[task_name] logger.info(f'Argv given. Task {task.meta.tag}. est {est}.') t0 = time() logger.info('Getting X.') X = task.X logger.info('Getting y.') y = task.y logger.info(f'X shape before splits: {X.shape}') # Simulate the outer CV (the one of KFold) X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2) # Simulate the inner CV (the one of RandomSearchCV) X_train2, X_test2, y_train2, _ = train_test_split(X_train, y_train, test_size=0.2) # Now X has the same shape as in real experiment logger.info(f'X shape: {X_train2.shape}') t_X_ready = time() if imp is not None: logger.info(f'Fitting imputer {args.imp}') imp.fit(X_train2, y_train2) t_fit_imp = time() logger.info('Imputer fitted.') logger.info('Transforming X_train') imp.transform(X_train2) t_tra1_imp = time() logger.info('X_train transformed') logger.info('Transforming X_test') imp.transform(X_test2) t_tra2_imp = time() logger.info('X_test transformed') t_fits = [time()] for learning_rate in param_space['learning_rate']: for max_depth in param_space['max_depth']: if est == 'HGBC': estimator = HistGradientBoostingClassifier( learning_rate=learning_rate, max_depth=max_depth ) elif est == 'HGBR': estimator = HistGradientBoostingRegressor( loss='least_absolute_deviation', learning_rate=learning_rate, max_depth=max_depth ) else: raise ValueError(f'Unknown estimator {est}') logger.info(f'Params: LR {learning_rate} MD {max_depth}') logger.info('Fitting estimator.') estimator.fit(X_train2, y_train2) t_fits.append(time()) logger.info('Estimator fitted.') t_fits = np.diff(t_fits) data = { 'task_tag': [task.meta.tag], 'imp': [args.imp], 'imp_params': [repr({'max_iter': args.max_iter})], 'X_shape': [repr(X.shape)], 'X_train_shape': [repr(X_train2.shape)], 'X_test_shape': [repr(X_test2.shape)], 'time_X_ready': [t_X_ready-t0], 'time_fit_imp': np.around([0 if imp is None else t_fit_imp-t_X_ready], 2), 'time_tra1_imp': np.around([0 if imp is None else t_tra1_imp-t_X_ready], 2), 'time_tra2_imp': np.around([0 if imp is None else t_tra2_imp-t_tra1_imp], 2), 'time_fits': [repr(np.around(t_fits.tolist(), 2))], 'time_fits_mean': [np.around(t_fits.mean(), 2)] } new_df = pd.DataFrame(data) df = None filepath = 'results/fit_time.csv' if os.path.exists(filepath): df = pd.read_csv(filepath, index_col=0) if df is not None: new_df = pd.concat([df, new_df]) new_df.to_csv(filepath)
def basic_preprocess(train_complete, test_complete, out_column, drop_columns=None, forced_categorical = None, forced_numeric = None, columns_to_normalize = None, use_labeler = None, manual_processing = None, seed=42, perc=10): complete_features = pd.concat([train_complete, test_complete], sort=False).reset_index(drop=True) train = train_complete.copy() test = test_complete.copy() normalize_output = columns_to_normalize and out_column in columns_to_normalize if normalize_output: columns_to_normalize.remove(out_column) if use_labeler: if not columns_to_normalize: columns_to_normalize = [] for column in use_labeler: if column in columns_to_normalize: columns_to_normalize.remove(column) convert_dict = {} if forced_categorical: for column in forced_categorical: convert_dict[column] = 'str' if forced_numeric: for column in forced_numeric: convert_dict[column] = 'float64' train = train.astype(convert_dict) test = test.astype(convert_dict) if drop_columns: train.drop(drop_columns, axis=1, inplace=True) test.drop(drop_columns, axis=1, inplace=True) train_data = np.array(train[out_column]) if normalize_output: normalize, denormalize = transform_distribution(train_data) else: normalize = lambda x: x denormalize = lambda x: x y = np.array(normalize(train_data)) train_features = train.drop([out_column], axis=1) features = pd.concat([train_features, test], sort=False).reset_index(drop=True) impute_with_mode(features) numerics = list(features.select_dtypes(include=[np.number]).columns.values) if len(numerics) >= 2: imp = IterativeImputer(max_iter=10, sample_posterior=False, random_state=seed) imp.fit(features[numerics]) features[numerics] = imp.transform(features[numerics]) elif numerics: impute_with_median(features) if use_labeler: labeler = LabelEncoder() for column in use_labeler: features[column] = labeler.fit_transform(features[column]) final_features = pd.get_dummies(features).reset_index(drop=True) if columns_to_normalize: normalize_columns(final_features, columns_to_normalize) if manual_processing: final_features = manual_processing(final_features, complete_features) X = final_features.iloc[:len(y), :] X_sub = final_features.iloc[len(X):, :] #print('selecting relevant features') #X, X_sub = select_features(X, y, X_sub, final_features.columns, perc=perc) return X, y, X_sub, denormalize
def impute(self):
    self.data = IterativeImputer().fit_transform(self.data)
    return self.data
def main(): index = load_dataset('all_merged', return_index=True) for _sym, data in index.items(): features, target = get_symbol_features(index, _sym) features_p = features[data['features']['ohlcv']].pct_change().replace( [np.inf, -np.inf], np.nan) features_p.columns = [c + '_p1' for c in features_p.columns] features_1 = features_p.shift(1) features_1.columns = [c + '_lag1' for c in features_1.columns] features_2 = features_p.shift(2) features_2.columns = [c + '_lag2' for c in features_2.columns] features_mean = features_p.rolling(3).mean() features_mean.columns = [c + '_mean_3' for c in features_mean.columns] ta = features[data['features']['ta'] + data['features']['ta_7d'] + data['features']['ta_30d']] features = pd.concat([ features['close'], ta, features_p, features_1, features_2, features_mean ], axis=1)[30:] target = target[30:] # Split data in train and blind test set with 70:30 ratio, # most ML models don't take sequentiality into account, but our pipeline # uses a SimpleImputer with mean strategy, so it's best not to shuffle the data. X_train, X_test, y_train, y_test = train_test_split(features.values, target.values, shuffle=False, test_size=0.3) logger.info("Start Feature Selection") imp = SimpleImputer() values = imp.fit_transform(X_train) #sel = SelectKBest(score_func=f_classif, k=min(10, X_train.shape[1])) feature_count = int(0.3 * X_train.shape[1]) sel = RFECV(estimator=RandomForestClassifier(), cv=5, verbose=0, n_jobs=4, min_features_to_select=feature_count, scoring='neg_mean_squared_error') sel.fit(values, y_train) logger.info("End Feature Selection") bestfeatures = [ c for c, f in zip(features.columns, sel.get_support()) if f ] if not 'close' in bestfeatures: bestfeatures += ['close'] print("Using features:\n{}".format(bestfeatures, len(bestfeatures))) train_features = pd.DataFrame(X_train, columns=features.columns) test_features = pd.DataFrame(X_test, columns=features.columns) X_train = train_features[bestfeatures].values X_test = test_features[bestfeatures].values # Summarize distribution print("Training set: # Features {}, # Samples {}".format( X_train.shape[1], X_train.shape[0])) plot_class_distribution("Training set", _sym, y_train) print("Test set: # Features {}, # Samples {}".format( X_test.shape[1], X_test.shape[0])) plot_class_distribution("Test set", _sym, y_test) if not np.isfinite(X_train).all(): logger.warning("Training x is not finite!") if not np.isfinite(y_train).all(): logger.warning("Training y is not finite!") if not np.isfinite(X_test).all(): logger.warning("Test x is not finite!") if not np.isfinite(y_test).all(): logger.warning("Test y is not finite!") # Build pipeline to be used as estimator in grid search # so that each subset of the data is transformed independently # to avoid contamination between folds. 
pipeline = Pipeline([ ( 'i', IterativeImputer() ), # Replace nan's with the median value between previous and next observation ('s', MinMaxScaler(feature_range=(-1, 1))), ('c', MLPClassifier()), ]) # Perform hyperparameter tuning of the ensemble with 5-fold cross validation logger.info("Start Grid search") CV_rfc = GridSearchCV(estimator=pipeline, param_grid=PARAM_GRID, cv=5, n_jobs=4, scoring='neg_mean_squared_error', verbose=1) CV_rfc.fit(X_train, y_train) logger.info("End Grid search") # Take the fitted ensemble with tuned hyperparameters clf = CV_rfc.best_estimator_ # Test ensemble's performance on training and test sets logger.info("Classification report on train set") predictions1 = clf.predict(X_train) train_report = classification_report(y_train, predictions1, output_dict=True) print(classification_report(y_train, predictions1)) logger.info("Classification report on test set") predictions2 = clf.predict(X_test) test_report = classification_report(y_test, predictions2, output_dict=True) print(classification_report(y_test, predictions2)) stats = { 'score': accuracy_score(y_train, predictions1), 'mse': mean_squared_error(y_train, predictions1), 'test_score': accuracy_score(y_test, predictions2), 'test_mse': mean_squared_error(y_test, predictions2), 'train_report': train_report, 'test_report': test_report, } print(CV_rfc.best_params_) num_samples = min(y_train.shape[0], y_test.shape[0], 30) print("Gains calculated on {} samples only!".format(num_samples)) print( "Train Accuracy: {}\nTrain MSE: {}\nGains on train preds: 100 -> {}" .format( accuracy_score(y_train, predictions1), mean_squared_error(y_train, predictions1), test_gains(train_features['close'][0:num_samples], predictions1[0:num_samples], initial_balance=100, position_size=0.1))) print( "Test Accuracy: {}\nTest MSE: {}\nGains on test preds: 100 -> {}". format( accuracy_score(y_test, predictions2), mean_squared_error(y_test, predictions2), test_gains(test_features['close'][0:num_samples], predictions2[0:num_samples], initial_balance=100, position_size=0.1))) print("--- end ---")
def fill_nan(self):
    self.data_expand = IterativeImputer().fit_transform(self.data_expand)
    return self
def impute_tui(): path = 'tui_data_d1/' filenames = os.listdir(path) filenames = [path + name for name in filenames if name.startswith('X')] main_df = pd.DataFrame() for n, filename in enumerate(filenames): df = pd.read_csv(filename, sep='\t', low_memory=False, encoding='utf-16') main_df = main_df.append(df, ignore_index=True) if n % 100 == 0: print(n) main_df = main_df.drop(['VLST_KODAS2', 'kodas'], axis=1) index = main_df.columns.values print(index) print(main_df) estimators = [ ExtraTreesRegressor(), BayesianRidge(), KNeighborsRegressor(), DecisionTreeRegressor(), RandomForestRegressor() ] f = open('performance.txt', 'w') for estimator in estimators: imp = IterativeImputer(estimator=estimator, missing_values=np.nan) imp.fit(main_df) df = pd.read_csv(os.path.join(path, 'test.csv'), sep='\t', low_memory=False, encoding='utf-16') df = df.drop(['VLST_KODAS2', 'kodas'], axis=1) df = imp.transform(df) df = pd.DataFrame(df) df.columns = index df.to_csv(os.path.join(path, 'test_imp.csv'), sep='\t', encoding='utf-16') df1 = pd.read_csv(os.path.join(path, 'X00411.csv'), sep='\t', low_memory=False, encoding='utf-16') df1 = df1.drop(['VLST_KODAS2', 'kodas'], axis=1) df2 = pd.read_csv(os.path.join(path, 'test_imp.csv'), sep='\t', low_memory=False, encoding='utf-16') score, total = 0, 0 for i in range(1, 400): val1 = df1.iloc[i] val2 = df2.iloc[i] val1 = int(val1['D1']) val2 = int(val2['D1']) if relatively_equal(val1, val2): print(val1, val2) score += 1 total += 1 print(str(estimator).split('(')[0]) print('score: ', score / total) f.write(str(estimator).split('(')[0]) f.write('\ntotal: %i / %i\n' % (score, total)) f.write('score: %.1f%%\n\n' % (100 * score / total))
def run(argv=None): if argv is None or len(argv) < 2: logger.info('No argv given.') task = tasks['TB/shock_hemo'] imp = 'iterative' else: task = tasks[argv[1]] imp = argv[2] logger.info(f'Argv given. Task {task.meta.tag}. Imp {imp}.') logger.info('Getting X.') X = task.X logger.info('Getting y.') y = task.y logger.info(f'X shape before splits: {X.shape}') # Simulate the outer CV (the one of KFold) X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2) # Simulate the inner CV (the one of RandomSearchCV) X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train, y_train, test_size=0.2) # Now X has the same shape as in real experiment logger.info(f'X shape: {X_train2.shape}') if imp == 'iterative': imp = IterativeImputer() elif imp == 'knn': imp = KNNImputer() t0 = time() logger.info('Fitting imputer.') imp.fit(X_train2) t1 = time() logger.info('Imputer fitted.') logger.info('Transforming X_train.') imp.transform(X_train2) t2 = time() logger.info('X_train transformed.') logger.info('Transforming X_test.') imp.transform(X_test2) t3 = time() logger.info('X_test transformed.') data = { 'task_tag': [task.meta.tag], 'imp': [imp.__class__.__name__], 'X_shape': [repr(X.shape)], 'X_train_shape': [repr(X_train2.shape)], 'X_test_shape': [repr(X_test2.shape)], 'fit_time': [t1 - t0], 'transform_time_train': [t2 - t1], 'transform_time_test': [t3 - t2] } new_df = pd.DataFrame(data) df = None filepath = 'results/impute_time.csv' if os.path.exists(filepath): df = pd.read_csv(filepath, index_col=0) if df is not None: new_df = pd.concat([df, new_df]) new_df.to_csv(filepath) print(new_df)
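# Hypothetical invocation of run() above (the script name is only a
# placeholder): argv[0] is ignored, argv[1] selects the task tag from the
# `tasks` dict, and argv[2] selects the imputer ('iterative' or 'knn').
run(['impute_time.py', 'TB/shock_hemo', 'knn'])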
def evaluate_exp4(main_folder, mv_config, r_seed=0, num_file=500): train_with_nan, train_full, w0_with_nan_list, w0_list, w1_with_nan_list, w1_list = dh.load_realdata( r_seed, mv_config, num_file, main_folder) imp = IterativeImputer(max_iter=10, random_state=0) train_impu = imp.fit_transform(train_with_nan) alpha = 0.05 result_MWW = np.zeros([2, 2]) result_QTree = np.zeros([2, 2]) result_kchi2 = np.zeros([2, 2]) result_Gau = np.zeros([2, 2]) result_Tri = np.zeros([2, 2]) result_ME = np.zeros([2, 2]) result_MMD = np.zeros([2, 2]) Qtree_Htest_impu = None Qtree_Htest_full = None kchi2_impu = None kchi2_full = None mfkchi2_miss_gau = None mfkchi2_full_gau = None mfkchi2_miss_tri = None mfkchi2_full_tri = None me_ml = None me_mg = None me_fl = None me_fg = None mmd_impu = None mmd_full = None for i in range(num_file): imp = IterativeImputer(max_iter=10, random_state=0) w0_miss = w0_with_nan_list[i][0] w0_impu = imp.fit_transform(w0_miss) w0_full = w0_list[i] imp = IterativeImputer(max_iter=10, random_state=0) w1_miss = w1_with_nan_list[i][0] w1_impu = imp.fit_transform(w1_miss) w1_full = w1_list[i] alg_r_seed = 1 print('MWW') np.random.seed(alg_r_seed) # ============================================================================================================== # w0_result = perform_mww_test(train_impu, w0_impu, train_full, w0_full, alpha) result_MWW[0] = result_MWW[0] + w0_result w1_result = perform_mww_test(train_impu, w1_impu, train_full, w1_full, alpha) result_MWW[1] = result_MWW[1] + w1_result print('Qtree') np.random.seed(alg_r_seed) # ============================================================================================================== # w0_result, Qtree_Htest_impu, Qtree_Htest_full = perform_QTree_test( train_impu, w0_impu, train_full, w0_full, alpha, Qtree_Htest_impu, Qtree_Htest_full) result_QTree[0] = result_QTree[0] + w0_result w1_result, Qtree_Htest_impu, Qtree_Htest_full = perform_QTree_test( train_impu, w1_impu, train_full, w1_full, alpha, Qtree_Htest_impu, Qtree_Htest_full) result_QTree[1] = result_QTree[1] + w1_result # print('kchi2') np.random.seed(alg_r_seed) # ============================================================================================================== # w0_result, kchi2_impu, kchi2_full = perform_kmean_chi2_test( train_impu, w0_impu, train_full, w0_full, alpha, kchi2_impu, kchi2_full) result_kchi2[0] = result_kchi2[0] + w0_result w1_result, kchi2_impu, kchi2_full = perform_kmean_chi2_test( train_impu, w1_impu, train_full, w1_full, alpha, kchi2_impu, kchi2_full) result_kchi2[1] = result_kchi2[1] + w1_result print('Gau') np.random.seed(alg_r_seed) # ============================================================================================================== # w0_result, mfkchi2_miss_gau, mfkchi2_full_gau = perform_mfkmean_chi2_test( train_with_nan, w0_miss, train_full, w0_full, alpha, mfkchi2_miss_gau, mfkchi2_full_gau, apply_fuzzy='Gaussion', top_k=2) result_Gau[0] = result_Gau[0] + w0_result w1_result, mfkchi2_miss_gau, mfkchi2_full_gau = perform_mfkmean_chi2_test( train_with_nan, w1_miss, train_full, w1_full, alpha, mfkchi2_miss_gau, mfkchi2_full_gau, apply_fuzzy='Gaussion', top_k=2) result_Gau[1] = result_Gau[1] + w1_result print('Tri') np.random.seed(alg_r_seed) # ============================================================================================================== # w0_result, mfkchi2_miss_tri, mfkchi2_full_tri = perform_mfkmean_chi2_test( train_with_nan, w0_miss, train_full, w0_full, alpha, mfkchi2_miss_tri, mfkchi2_full_tri, 
apply_fuzzy='Triangle', top_k=2) result_Tri[0] = result_Tri[0] + w0_result w1_result, mfkchi2_miss_tri, mfkchi2_full_tri = perform_mfkmean_chi2_test( train_with_nan, w1_miss, train_full, w1_full, alpha, mfkchi2_miss_tri, mfkchi2_full_tri, apply_fuzzy='Triangle', top_k=2) result_Tri[1] = result_Tri[1] + w1_result print('ME') np.random.seed(alg_r_seed) # ============================================================================================================== # w0_result, me_ml, me_mg, me_fl, me_fg = perform_me_test( train_impu, w0_impu, train_full, w0_full, alpha, me_ml, me_mg, me_fl, me_fg) result_ME[0] = result_ME[0] + w0_result w1_result, me_ml, me_mg, me_fl, me_fg = perform_me_test( train_impu, w1_impu, train_full, w1_full, alpha, me_ml, me_mg, me_fl, me_fg) result_ME[1] = result_ME[1] + w1_result # print('MMD') np.random.seed(alg_r_seed) # ============================================================================================================== # w0_result, mmd_impu, mmd_full = perform_mmd_test( train_impu, w0_impu, train_full, w0_full, alpha, mmd_impu, mmd_full) result_MMD[0] = result_MMD[0] + w0_result w1_result, mmd_impu, mmd_full = perform_mmd_test( train_impu, w1_impu, train_full, w1_full, alpha, mmd_impu, mmd_full) result_MMD[1] = result_MMD[1] + w1_result # return result_MWW, result_QTree, result_kchi2, result_Gau, result_Tri, result_ME, result_MMD
def extract_feats_transform(X, Y=None): if Y is None: #==================================# # use impute distances as features # #==================================# imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0) zero_imput = imp.fit_transform(X) zero_imput = euclidean_distances(zero_imput, zero_imput) zero_imput = zero_imput.flatten().reshape(-1, 1) imp = SimpleImputer(missing_values=np.nan, strategy='mean') mean_imput = imp.fit_transform(X) mean_imput = euclidean_distances(mean_imput, mean_imput) mean_imput = mean_imput.flatten().reshape(-1, 1) imp = SimpleImputer(missing_values=np.nan, strategy='median') medi_imput = imp.fit_transform(X) medi_imput = euclidean_distances(medi_imput, medi_imput) medi_imput = medi_imput.flatten().reshape(-1, 1) imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent') mfre_imput = imp.fit_transform(X) mfre_imput = euclidean_distances(mfre_imput, mfre_imput) mfre_imput = mfre_imput.flatten().reshape(-1, 1) imp = IterativeImputer(max_iter=10, random_state=0) iter_imput = imp.fit_transform(X) iter_imput = euclidean_distances(iter_imput, iter_imput) iter_imput = iter_imput.flatten().reshape(-1, 1) #=============================# # missing value masked vector # #=============================# X_masked_hasNan_masked_vector = np.isnan(X) * 1 pd_X_Nan = pd.DataFrame(X_masked_hasNan_masked_vector) pd_X_Nan['key'] = 0 all_merge = pd.merge(pd_X_Nan, pd_X_Nan, on='key', how='outer') all_merge = all_merge.drop(columns=['key']) all_merge = all_merge.values train_X = np.hstack([ all_merge, zero_imput, mean_imput, medi_imput, mfre_imput, iter_imput ]) else: #==================================# # use impute distances as features # #==================================# imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0) zero_imput_X = imp.fit_transform(X) zero_imput_Y = imp.fit_transform(Y) zero_imput = euclidean_distances(zero_imput_X, zero_imput_Y) zero_imput = zero_imput.flatten().reshape(-1, 1) imp = SimpleImputer(missing_values=np.nan, strategy='mean') mean_imput_X = imp.fit_transform(X) mean_imput_Y = imp.fit_transform(Y) mean_imput = euclidean_distances(mean_imput_X, mean_imput_Y) mean_imput = mean_imput.flatten().reshape(-1, 1) imp = SimpleImputer(missing_values=np.nan, strategy='median') medi_imput_X = imp.fit_transform(X) medi_imput_Y = imp.fit_transform(Y) medi_imput = euclidean_distances(medi_imput_X, medi_imput_Y) medi_imput = medi_imput.flatten().reshape(-1, 1) imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent') mfre_imput_X = imp.fit_transform(X) mfre_imput_Y = imp.fit_transform(Y) mfre_imput = euclidean_distances(mfre_imput_X, mfre_imput_Y) mfre_imput = mfre_imput.flatten().reshape(-1, 1) imp = IterativeImputer(max_iter=10, random_state=0) iter_imput_X = imp.fit_transform(X) iter_imput_Y = imp.fit_transform(Y) iter_imput = euclidean_distances(iter_imput_X, iter_imput_Y) iter_imput = iter_imput.flatten().reshape(-1, 1) #=============================# # missing value masked vector # #=============================# X_masked_hasNan_masked_vector = np.isnan(X) * 1 Y_masked_hasNan_masked_vector = np.isnan(Y) * 1 pd_X_Nan = pd.DataFrame(X_masked_hasNan_masked_vector) pd_Y_Nan = pd.DataFrame(Y_masked_hasNan_masked_vector) pd_X_Nan['key'] = 0 pd_Y_Nan['key'] = 0 all_merge = pd.merge(pd_X_Nan, pd_Y_Nan, on='key', how='outer') all_merge = all_merge.drop(columns=['key']) all_merge = all_merge.values train_X = np.hstack([ all_merge, zero_imput, mean_imput, medi_imput, mfre_imput, 
iter_imput ]) return train_X
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    X = np.zeros((100, 2))
    imputer = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        imputer.fit_transform(X)
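# The test above receives its arguments from a pytest parametrization that is
# not shown. An illustrative decorator (the exact invalid values and match
# patterns are assumptions, not copied from the original test suite) would sit
# directly above the function definition:
@pytest.mark.parametrize(
    "max_iter, tol, error_type, warning",
    [
        (-1, 1e-3, ValueError, "max_iter"),   # negative iteration count
        (1, -1e-3, ValueError, "tol"),        # negative tolerance
    ],
)
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    ...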
date = pd.Timestamp.now().strftime(format='%Y-%m-%d_%H-%M_')
predictions.to_csv(
    f'C:/Users/fredh/Documents/Data Driven platform competition - Pump it up/predictions/{date}submission.csv',
    index=True,
    header=True)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

categorical_feat = X_train.select_dtypes(include='object').columns.to_list()
num_feat = X_train.select_dtypes(include='number').columns.to_list()

num_pipe_7 = Pipeline([('imputer', IterativeImputer(max_iter=10, random_state=0)),
                       ('scaler', StandardScaler())])
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore'))])
ct_7 = ColumnTransformer(remainder='drop',
                         transformers=[('numerical', num_pipe_7, num_feat),
                                       ('categorical', cat_pipe, categorical_feat)])

from xgboost import XGBClassifier

"""
space = [
    Real(0.6, 0.7, name="colsample_bylevel"),
    Real(0.6, 0.7, name="colsample_bytree"),
    Real(0.01, 1, name="gamma"),
    Real(0.0001, 1, name="learning_rate"),
    Real(0.1, 10, name="max_delta_step"),
def imputer(df, dfv, dfk, target_col, imputer_dict):
    result = {}
    for i in imputer_dict:
        if imputer_dict[i]['Indicator'] == 'deleterows':
            if df[i].isna().sum() > 0:
                # keep only rows where the column actually has a value
                # (Series has no .isfinite(); use .notna() instead)
                df = df[df[i].notna()]
                dfv = dfv[dfv[i].notna()]
                dfk = dfk[dfk[i].notna()]
        if imputer_dict[i]['Indicator'] == True:
            if df[i].isna().sum() > 0:
                df[i + '_null_ind'] = np.where(df[i].isna(), 1, 0)
                dfv[i + '_null_ind'] = np.where(dfv[i].isna(), 1, 0)
                dfk[i + '_null_ind'] = np.where(dfk[i].isna(), 1, 0)
        if imputer_dict[i]['mvi'] in ['mean', 'median', 'most_frequent']:
            imp = SimpleImputer(missing_values=np.nan,
                                strategy=imputer_dict[i]['mvi'],
                                verbose=True,
                                add_indicator=False,
                                fill_value=None)
            imp.fit(df[[i]])
            result[i] = imp
            df.loc[:, i] = result[i].transform(df[[i]])
            dfv.loc[:, i] = result[i].transform(dfv[[i]])
            dfk.loc[:, i] = result[i].transform(dfk[[i]])
        if imputer_dict[i]['mvi'] == 'far_val':
            result[i] = df[i].max() * 100
            df[i] = np.where(df[i].isna(), result[i], df[i])
            dfv[i] = np.where(dfv[i].isna(), result[i], dfv[i])
            dfk[i] = np.where(dfk[i].isna(), result[i], dfk[i])

    # IterativeImputer handles whatever none of the above covered
    imp = IterativeImputer(
        max_iter=3,
        estimator=ExtraTreesRegressor(),  # hyperparameter; alternatively Bayesian ridge, KNN, etc.
        n_nearest_features=5  # maximum number of columns considered to predict a missing value
    )
    dfvc = dfv.copy()
    dfv[target_col] = np.nan
    dfkc = dfk.copy()
    dfk[target_col] = np.nan
    dfcolumns = df.columns
    imp.fit(df)
    df = pd.DataFrame(imp.transform(df))
    df.columns = dfcolumns
    dfv = pd.DataFrame(imp.transform(dfv))
    dfv.columns = dfcolumns
    dfk = pd.DataFrame(imp.transform(dfk))
    dfk.columns = dfcolumns
    dfv[target_col] = np.array(dfvc[target_col])
    dfk[target_col] = np.nan
    for i in imputer_dict:
        if imputer_dict[i]['mvi'] == 'iterativeimputer':
            result[i] = imp
    print("Completed imputer - ", datetime.datetime.now())
    return df, dfv, dfk, result
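# The layout of imputer_dict is implicit in the function above. A hypothetical
# example of how it might be populated and the function called (the column
# names 'age', 'income', 'score' and the target name 'target' are illustrative):
imputer_dict = {
    'age':    {'Indicator': False, 'mvi': 'median'},
    'income': {'Indicator': True,  'mvi': 'far_val'},
    'score':  {'Indicator': False, 'mvi': 'iterativeimputer'},
}
df, dfv, dfk, fitted_imputers = imputer(df, dfv, dfk, 'target', imputer_dict)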
# (tail of the feature_eng(data) function, continued from an earlier cell)
    data.Name = data.Name.map(lambda n: n.split(',')[1].split('.')[0].strip())
    data.Cabin = data.Cabin.map(lambda a: a[0])

    # one_hot
    for i in ['Cabin', 'Embarked', 'Name']:
        data = make_one_hot(data, i)
    data.Sex.replace({'female': 0, 'male': 1}, inplace=True)

    # numeric feature eng
    data['Family_size'] = data.Parch + data.SibSp + 1

    # drop
    data.drop(['Cabin_*', 'SibSp', 'Parch', 'Ticket'], axis=1, inplace=True)
    return data


data = feature_eng(data)
data = IterativeImputer().fit_transform(data)
x_train, x_test = data[:len(train)], data[len(train):]
x_train = scale(x_train)

m1 = MLPClassifier(max_iter=1000, hidden_layer_sizes=len(x_train[0]) * 2)
cv = cross_val_score(m1, x_train, y_train, cv=5)
print(cv.mean(), ' +/-', cv.std() * 2)

x_tr, x_ts, y_tr, y_ts = split(x_train, y_train, shuffle=True, test_size=0.2)
m1.fit(x_tr, y_tr)
print(m1.score(x_ts, y_ts))

# ===================================
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD, Adam
from DataPrepocessing import converting_discrete_attributes_to_continuous as prepo

raw_data = pd.read_csv('flag_data/flag.data')

## 1
areadata = raw_data['4']
areadata = np.array(areadata).reshape(-1, 1)
mean = SimpleImputer(missing_values=0, strategy='mean').fit_transform(areadata)
median = SimpleImputer(missing_values=0, strategy='median').fit_transform(areadata)
most_freq = SimpleImputer(missing_values=0, strategy='most_frequent').fit_transform(areadata)
constant = SimpleImputer(missing_values=0, strategy='constant').fit_transform(areadata)

plt.hist(areadata, 100)
plt.show()

plt.subplot(221)
plt.hist(mean, 100)
plt.subplot(222)
plt.hist(median, 100)
plt.subplot(223)
plt.hist(most_freq, 100)
plt.subplot(224)
plt.hist(constant, 100)
plt.show()

## 2
imp = IterativeImputer(missing_values=np.nan)
plt.hist(imp.fit_transform(prepo.dataout))
plt.show()
subject_dict[ID][ses][(atlas, est, clust, _k, smooth, hpass)]['topology']) vect_all.append(np.concatenate(vects, axis=1)) del vects X_top = np.swapaxes(np.hstack(vect_all), 0, 1) Y = np.array(id_list) try: df_summary.at[i, 'grid'] = (atlas, est, clust, _k, smooth, hpass) bad_ixs = [i[1] for i in np.argwhere(np.isnan(X_top))] for m in set(bad_ixs): if (X_top.shape[0] - bad_ixs.count(m)) / X_top.shape[0] < 0.50: X_top = np.delete(X_top, m, axis=1) imp = IterativeImputer(max_iter=50, random_state=42) X_top = imp.fit_transform(X_top) scaler = StandardScaler() X_top = scaler.fit_transform(X_top) discr_stat_val, rdf = discr_stat(X_top, Y) df_summary.at[i, 'discriminability'] = discr_stat_val print(discr_stat_val) #print(rdf) del discr_stat_val i += 1 except: i += 1 continue elif modality == 'dwi': gen_hyperparams = ['est', 'clust', '_k'] for col in cols:
def load_both_data(project, metric): understand_path = 'data/understand_files_all/' + project + '_understand.csv' understand_df = pd.read_csv(understand_path) understand_df = understand_df.dropna(axis=1, how='all') cols_list = understand_df.columns.values.tolist() for item in ['Kind', 'Name', 'commit_hash', 'Bugs']: if item in cols_list: cols_list.remove(item) cols_list.insert(0, item) understand_df = understand_df[cols_list] cols = understand_df.columns.tolist() understand_df = understand_df.drop_duplicates(cols[4:len(cols)]) understand_df['Name'] = understand_df.Name.str.rsplit('.', 1).str[1] commit_guru_file_level_path = 'data/commit_guru_file/' + project + '.csv' commit_guru_file_level_df = pd.read_csv(commit_guru_file_level_path) commit_guru_file_level_df[ 'commit_hash'] = commit_guru_file_level_df.commit_hash.str.strip('"') commit_guru_file_level_df = commit_guru_file_level_df[ commit_guru_file_level_df['file_name'].str.contains('.java')] commit_guru_file_level_df[ 'Name'] = commit_guru_file_level_df.file_name.str.rsplit( '/', 1).str[1].str.split('.').str[0].str.replace('/', '.') commit_guru_file_level_df = commit_guru_file_level_df.drop('file_name', axis=1) df = understand_df.merge(commit_guru_file_level_df, how='left', on=['commit_hash', 'Name']) cols = df.columns.tolist() cols.remove('Bugs') cols.append('Bugs') df = df[cols] file_names = df.Name for item in ['Kind', 'Name', 'commit_hash']: if item in cols: df = df.drop(labels=[item], axis=1) # df.dropna(inplace=True) df = df.drop_duplicates() df.reset_index(drop=True, inplace=True) y = df.Bugs X = df.drop('Bugs', axis=1) cols = X.columns scaler = MinMaxScaler() X = scaler.fit_transform(X) X = pd.DataFrame(X, columns=cols) imp_mean = IterativeImputer(random_state=0) X = imp_mean.fit_transform(X) X = pd.DataFrame(X, columns=cols) if metric == 'process': X = X[[ 'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev', 'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev', 'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm', 'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd', 'file_sctr' ]] elif metric == 'product': X = X.drop([ 'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev', 'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev', 'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm', 'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd', 'file_sctr' ], axis=1) else: X = X X['Name'] = file_names X['Bugs'] = y return X
), make_pipeline(Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3)), KNeighborsRegressor(n_neighbors=15), ] score_iterative_imputer = pd.DataFrame() # iterative imputer is sensible to the tolerance and # dependent on the estimator used internally. # we tuned the tolerance to keep this example run with limited computational # resources while not changing the results too much compared to keeping the # stricter default value for the tolerance parameter. tolerances = (1e-3, 1e-1, 1e-1, 1e-2) for impute_estimator, tol in zip(estimators, tolerances): estimator = make_pipeline( IterativeImputer(random_state=0, estimator=impute_estimator, max_iter=25, tol=tol), br_estimator, ) score_iterative_imputer[ impute_estimator.__class__.__name__] = cross_val_score( estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS) scores = pd.concat( [score_full_data, score_simple_imputer, score_iterative_imputer], keys=["Original", "SimpleImputer", "IterativeImputer"], axis=1,
# minimum instances proportion to allow feature
minimum_valued_instances_proportion = 0.80

X = pd.read_excel(r'C:\Temp\learning\clean.xlsx')
y = pd.read_excel(r'C:\Temp\learning\happiness.xlsx')

no_target_value_instances = y.notna().iloc[:, 1]
X = X[no_target_value_instances]
y = y[no_target_value_instances]

X.dropna(thresh=len(X) * minimum_valued_instances_proportion, axis=1, inplace=True)

imp = IterativeImputer(max_iter=2, random_state=123)
start = time.time()
imp.fit(X.drop(X.columns[[0]], axis=1))
end = time.time()
print(end - start)

X_values = imp.transform(X.drop(X.columns[[0]], axis=1))
X.iloc[:, 1:] = X_values

X.to_excel(r'C:\Temp\learning\cleaner.xlsx', index=False)
y.to_excel(r'C:\Temp\learning\targeter.xlsx', index=False)
def impute_all(self, df, regressor=None, **regr_kwargs):
    # note: regr_kwargs is accepted but never forwarded in this snippet;
    # the estimator is used exactly as passed in
    im = IterativeImputer(estimator=regressor)
    dffin = im.fit_transform(df)
    return dffin
import pytest

import numpy as np
from scipy import sparse

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

IMPUTERS = [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()]
SPARSE_IMPUTERS = [SimpleImputer()]


# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", IMPUTERS)
def test_imputation_missing_value_in_test_array(imputer):
    # [Non Regression Test for issue #13968] Missing value in test set should
    # not throw an error and return a finite dataset
    train = [[1], [2]]
    test = [[3], [np.nan]]
    imputer.set_params(add_indicator=True)
    imputer.fit(train).transform(test)
random.seed(r)

print('\n############### Evaluate Best Model ###############')

# ## Read in Data
train_test = pd.read_json('data/train-test.json')
train_test_labels = train_test[['label']]
train_test = train_test.drop('label', axis='columns')

hold = pd.read_json('data/holdout.json')
hold_labels = hold[['label']]
hold = hold.drop('label', axis='columns')

# ### Impute Data
imp = IterativeImputer(max_iter=100, random_state=r)
X_train_test = imp.fit_transform(train_test.values)
y_train_test = train_test_labels.values.ravel()
X_hold = imp.transform(hold.values)
y_hold = hold_labels.values.ravel()

# ### Augment Data
#if smote_ratio > 0:
#    smote = SMOTE(
#        sampling_strategy='all',
#        random_state=1337,
#        k_neighbors=5,
#        n_jobs=1
#    )
# creating surrogates for missing data
for col in df:
    if df[col].isna().sum() != 0:
        df[col + '_surrogate'] = df[col].isna().astype(int)

# check new columns created
df.head()

# Impute missing values
num_nulls = pd.DataFrame({"Number of Nulls": df.isnull().sum()})
impute_cols = list(num_nulls[num_nulls["Number of Nulls"] != 0].index)

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(missing_values=np.nan, max_iter=10, verbose=0, random_state=100)
df[impute_cols] = imp.fit_transform(df[impute_cols])
df
df.describe()

# Check for missing values
df.isnull().sum()
df.info()
df.describe()

###
### HANDLE SKEWED DATA
###

# Get float cols
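# The hand-built '_surrogate' columns above duplicate what the imputer's own
# missing-indicator option provides. A minimal sketch of the built-in
# alternative (parameter values copied from the snippet; how the extra columns
# are merged back into df is left out):
imp = IterativeImputer(missing_values=np.nan, max_iter=10, random_state=100,
                       add_indicator=True)
# add_indicator=True appends one 0/1 indicator column per feature that had
# missing values, so the output has more columns than df[impute_cols].
imputed = imp.fit_transform(df[impute_cols])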
def data_preprocessing(dat: pd.DataFrame, art='C', y=None, logger=None, remove=True): """ Encoding + remove columns with more than 1/2 na if remove==True + remove columns with all na + imputation if art == 'C', will do LabelEncoding first for the target column ================ Parameter: ================ dat - type of DataFrame art - type of string either C for classifcation of R for regression. indicates the type of problem y - type of string the name of the target column; if None, set the last column of the data set as target considering only one column for label logger - type of Logger remove - type of boolean whether remove the columns with na value more than half length or not ================= Output ================= dat - type of Dataframe the dataframe after preprocessing cols - type of list of string the name of the numerical columns """ if logger == None: logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) logger.info('Start data preprocessing') # replace original indeices with default ones dat = dat.reset_index(drop=True) if art == 'C': logger.info('Start to label target feature y for classification task') dat.iloc[:, -1] = LabelEncoder().fit_transform(dat.iloc[:, -1]) logger.info('End with label encoding the target feature') if remove: # remove columns with more than 1/2 na dat = dat.loc[:, dat.isna().sum() / len(dat) < .5] logger.info( 'Following features are removed from the dataframe because half of their value are NA: %s' % (dat.columns[dat.isna().sum() / len(dat) > .5].to_list())) # Encoding oe = OneHotEncoder(drop='first') # get categorical columns if y: dat_y = dat[[y]] cols = dat.columns.to_list() cols.remove(y) dat_x = dat[cols] else: dat_y = dat[[dat.columns[-1]]] dat_x = dat[dat.columns[:-1]] dat_categ = dat_x.select_dtypes(include=['object']) # get kterm of categ features for i in dat_categ.columns: # save output to dat tmp = dat_x[i].value_counts() dat_x[i + '_kterm'] = dat_x[i].map(lambda x: tmp[x] if x in tmp.index else 0) # float columns including the k term cols dat_numeric = dat_x.select_dtypes( include=['float32', 'float64', 'int32', 'int64']) # onehot encoding and label encoding dat_categ_onehot = dat_categ.iloc[:, dat_categ.apply(lambda x: len(x.unique()) ).values < 8] dat_categ_label = dat_categ.iloc[:, dat_categ.apply(lambda x: len(x.unique()) ).values >= 8] flag_onehot = False flag_label = False # oe if dat_categ_onehot.shape[1] > 0: logger.info( 'Start to do onehot to the following categoric features: %s' % (str(dat_categ_onehot.columns.to_list()))) dat_onehot = pd.DataFrame( oe.fit_transform(dat_categ_onehot.astype(str)).toarray(), columns=oe.get_feature_names(dat_categ_onehot.columns)) logger.info('End with onehot') flag_onehot = True else: dat_onehot = None # le if dat_categ_label.shape[1] > 0: logger.info( 'Start to do label encoding to the following categoric features: %s' % (str(dat_categ_label.columns.to_list()))) dat_categ_label = dat_categ_label.fillna('NULL') dat_label = pd.DataFrame(columns=dat_categ_label.columns) for i in dat_categ_label.columns: dat_label[i] = LabelEncoder().fit_transform( dat_categ_label[i].astype(str)) flag_label = True logger.info('End with label encoding') else: dat_label = None # scaling # combine dat_new = pd.DataFrame() if flag_onehot and flag_label: dat_new = pd.concat([dat_numeric, dat_onehot, dat_label], axis=1) elif flag_onehot: dat_new = pd.concat([dat_numeric, dat_onehot], axis=1) elif flag_label: dat_new = 
pd.concat([dat_numeric, dat_label], axis=1) else: dat_new = dat_numeric dat_new = pd.concat([dat_new, dat_y], axis=1) # imputation dat_new = dat_new.dropna(axis=1, how='all') if dat_new.isna().sum().sum() > 0: logger.info( 'Nan value exist, start to fill na with iterative imputer: ' + str(dat_new.isna().sum().sum())) # include na value, impute with iterative Imputer or simple imputer columns = dat_new.columns imp = IterativeImputer(max_iter=10, random_state=0) # imp = SimpleImputer(missing_values=np.nan, strategy='mean') dat_new = imp.fit_transform(dat_new) dat_new = pd.DataFrame(dat_new, columns=columns) dat_numeric = dat_new.iloc[:, :-1].select_dtypes( include=['float32', 'float64', 'int32', 'int64']) logger.info('End with filling nan') return dat_new, dat_numeric.columns
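# Hypothetical usage of data_preprocessing (the file name 'my_data.csv' and
# the column name 'label' are placeholders, not from the original project).
# The target is expected to be the last column; with art='C' it is
# label-encoded, and y names the target column explicitly.
import pandas as pd

raw = pd.read_csv('my_data.csv')
clean_df, numeric_cols = data_preprocessing(raw, art='C', y='label')
print(clean_df.shape, list(numeric_cols))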