def fit(self, tr_features=None, tr_labels=None):
    """Fit and train datasets with regression hyperparameters GridSearch and CV across multiple estimators.

    Parameters
    ----------
    tr_features : df, default = None
        Train features columns.
        (NOTE: In the Pipeline Cluster Traversal Experiments, the features columns should be from the same pipeline dataset.)
    tr_labels : df, default = None
        Train label column.
        (NOTE: In the Pipeline Cluster Traversal Experiments, the label column should be from the same pipeline dataset.)

    Returns
    -------
    cv_num : int
        Number of folds used for cross-validation.
    DICT_EST : dictionary
        Each key is an estimator name; each value is the related trained model.

    NOTE - The trained-model auto-save feature is only available when in_pipeline = False.
    NOTE - Log records will be generated and saved to the ./logs folder automatically.
    """
    if (self.input_from_file):
        tr_labels = tr_labels.values.ravel()
    reg = reg_cv(cv_val=self.cv_num, random_state=self.random_state)
    # estimators = ['lr','knn','tree','svm','mlp','rf','gb','ada','xgb','hgboost','huber','rgcv','cvlasso','sgd']
    estimators = self.set_estimators
    if (not self.in_pipeline):
        pkl_folder = os.path.join(os.getcwd(), 'pkl')
        if not os.path.exists(pkl_folder):
            os.makedirs(pkl_folder)
    loop_num = 1
    total_loop = len(estimators)
    for est in estimators:
        start_time = time()
        logger.info(Test_comment)
        logger.info(f"Current Running: {est} estimator")
        try:
            cv_est = getattr(reg, est)()
            cv_est.fit(tr_features, tr_labels)
            if (not self.in_pipeline):
                model_name = os.path.join(pkl_folder, f'{est}_reg_model.pkl')
                joblib.dump(cv_est.best_estimator_, model_name)
                time_est = round(((time() - start_time) / 60) * (total_loop - loop_num), 4)
                update_progress(loop_num / total_loop,
                                clear_flag=False,
                                process_name="Model Selection w/ Cross-validation",
                                time_est=time_est)
                print(f"\n *optimalflow* autoCV Module ===> {est} model CrossValidation with {self.cv_num} folds:")
            print_results(cv_est, self.in_pipeline)
            self.DICT_EST[est] = cv_est
            logger.info(f"This estimator executed {round((time() - start_time) / 60, 4)} minutes")
            loop_num += 1
        except Exception:
            print(f"{est} estimator is not available.")
            if (not self.in_pipeline):
                time_est = round(((time() - start_time) / 60) * (total_loop - loop_num), 4)
                update_progress(loop_num / total_loop,
                                clear_flag=True,
                                process_name="Model Selection w/ Cross-validation",
                                time_est=time_est)
            logger.info(f"This estimator executed {round((time() - start_time) / 60, 4)} minutes")
            loop_num += 1
    return (self.cv_num, self.DICT_EST)
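# --- Usage sketch (illustrative, not part of the module) ---
# A minimal example of driving this fit() method, assuming the enclosing class is
# optimalflow.autoCV's dynaRegressor with its documented defaults (cv_num, random_state,
# set_estimators, input_from_file, in_pipeline). The CSV paths are hypothetical placeholders.
#
#   import pandas as pd
#   from optimalflow.autoCV import dynaRegressor
#
#   tr_features = pd.read_csv('./data/train_features.csv')  # hypothetical path
#   tr_labels = pd.read_csv('./data/train_labels.csv')      # hypothetical path
#
#   model_selection = dynaRegressor(random_state=13, cv_num=5)
#   cv_num, DICT_EST = model_selection.fit(tr_features, tr_labels)
#   best_lr = DICT_EST['lr'].best_estimator_  # retrieve a tuned estimator by name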
def fit(self, tr_features, tr_labels):
    """Fits and transforms a dataframe with built-in algorithms to select the top features.

    Parameters
    ----------
    tr_features : df, default = None
        Train features columns.
        (NOTE: In the Pipeline Cluster Traversal Experiments, the features columns should be from the same pipeline dataset.)
    tr_labels : array/df, default = None
        Train label column; when input_from_file = True, it must be a pandas dataframe.
        (NOTE: In the Pipeline Cluster Traversal Experiments, the label column should be from the same pipeline dataset.)

    Returns
    -------
    fs_num : int
        Number of top features selected.
    fs_results : array
        Selected & ranked top feature names.

    NOTE - Log records will be generated and saved to the ./logs folder automatically.
    """
    if (self.input_from_file):
        tr_labels = tr_labels.values.ravel()
    reg = reg_fs(fs_num=self.fs_num, random_state=self.random_state, cv=self.cv)
    # selectors = ['kbest_f','rfe_svm','rfe_tree','rfe_rf','rfecv_svm','rfecv_tree','rfecv_rf']
    selectors = self.set_selectors
    loop_num = 1
    total_loop = len(selectors)
    selected_features = []
    for selector in selectors:
        start_time = time()
        if (not self.in_pipeline):
            logger.info(Test_comment)
            logger.info(f"Current Running: {selector} selector")
        try:
            reg_selector = getattr(reg, selector)()
            reg_sel_result = reg_selector.fit(tr_features, tr_labels)
            fs_feature = rank_fs_result(reg_sel_result, tr_features.head(1))
            selected_features.extend(fs_feature)
            if (not self.in_pipeline):
                update_progress(loop_num / total_loop, process_name="Feature Selection Iteration")
                print(f'\n *optimalflow* autoFS Module ===> Selector {selector} gets outputs: {fs_feature}')
            logger.info(f"This selector executed {round((time() - start_time) / 60, 4)} minutes")
            loop_num += 1
        except Exception:
            if (not self.in_pipeline):
                print(f"{selector} selector is not available.")
                update_progress(loop_num / total_loop, process_name="Feature Selection Iteration")
            logger.info(f"This selector executed {round((time() - start_time) / 60, 4)} minutes")
            loop_num += 1
    # Rank features by how many selectors picked them (most votes first), then keep the top fs_num unique names.
    counts = Counter(selected_features)
    fs_results = sorted(selected_features, key=lambda x: -counts[x])
    fs_results = unique(fs_results)[:self.fs_num]
    if (not self.in_pipeline):
        print(f"The optimalflow autoFS identified the top {self.fs_num} important features for regression: {fs_results}.")
    logger.info(f"The optimalflow autoFS identified the top {self.fs_num} important features for regression: {fs_results}.")
    return (self.fs_num, fs_results)
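# --- Usage sketch (illustrative, not part of the module) ---
# A minimal example of running this fit() method, assuming the enclosing class is
# optimalflow.autoFS's dynaFS_reg; the fs_num/cv values and CSV paths below are hypothetical.
#
#   import pandas as pd
#   from optimalflow.autoFS import dynaFS_reg
#
#   tr_features = pd.read_csv('./data/train_features.csv')  # hypothetical path
#   tr_labels = pd.read_csv('./data/train_labels.csv')      # hypothetical path
#
#   reg_fs = dynaFS_reg(fs_num=5, random_state=13, cv=5, input_from_file=True)
#   fs_num, fs_results = reg_fs.fit(tr_features, tr_labels)
#   print(fs_results)  # ranked names of the 5 most frequently selected features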
def fit(self, data):
    """Fits and transforms a chain of OptimalFlow modules.

    Parameters
    ----------
    data : pandas dataframe, shape = [n_samples, n_features]
        NOTE: The input data should be a dataset after basic data cleaning & sound feature deduction; the more features are involved, the more column-permutation outputs will result.

    Returns
    -------
    DICT_PREP_INFO : dictionary
        Each key is the number of a preprocessed dataset ("Dataset_xxx" format, i.e. "Dataset_10"); each value stores an info string about which transforms were applied.
        i.e. DICT_PREP_INFO['Dataset_0'] stores the value "winsor_0-Scaler_None-- Encoded Features:['diagnosis', 'Size_3', 'area_mean']", which means the 1st winsorization mode and no scaler were applied, and lists the encoded column names (the encoding approaches are shown in the names).
    DICT_FEATURE_SELECTION_INFO : dictionary
        Each key is the number of a preprocessed dataset; each value stores the names of the features selected by the autoFS module.
    DICT_MODELS_EVALUATION : dictionary
        Each key is the number of a preprocessed dataset; each value stores the model evaluation results on its validation dataset.
    DICT_DATA : dictionary
        Each key is the number of a preprocessed dataset; the first-level sub-key is the type of split set (one of 'DICT_Train', 'DICT_TEST', and 'DICT_Validate').
        The second-level sub-key is "X" for features and "y" for the label; each value stores the related dataset (pandas dataframe format).
        i.e. DICT_DATA['Dataset_0']['DICT_TEST']["X"] holds the features of Dataset_0's test dataset.
    models_summary : pandas dataframe
        Model selection ranking table across all composites of preprocessed datasets, selected features, and all possible models with optimal parameters.

    NOTE - Log records will be generated and saved to the ./logs folder automatically.
    """
    dyna = self.step1
    DICT_PREP_DF, DICT_PREP_INFO = dyna.fit(input_data=data)
    print(f"Total combinations: {len(DICT_PREP_DF.keys())}")
    logger.info(f"Total combinations: {len(DICT_PREP_DF.keys())}")
    # Tracking the metrics values
    DICT_MODELS_EVALUATION = {}
    # Feature selection tracking
    DICT_FEATURE_SELECTION_INFO = {}
    DICT_DATA = {}
    loop_num = 1
    total_loop = len(DICT_PREP_DF.keys())
    for number, key in enumerate(DICT_PREP_DF.keys()):
        combination_df = DICT_PREP_DF[key]
        start_time = time()
        logger.info(Test_comment)
        dataset_num = key.split("Dataset_", 1)[1]
        logger.info(f"Current Running Preprocessed Dataset No. {dataset_num}:")
        features = combination_df.drop(dyna.label_col, axis=1)
        labels = combination_df[dyna.label_col]
        logger.info("[Features Before autoFS]: ")
        logger.info(list(features.columns))
        custom_val_size, custom_size, custom_random_state = self.step2
        X_train, y_train, X_val, y_val, X_test, y_test = data_splitting_tool(
            feature_cols=features,
            label_col=labels,
            val_size=custom_val_size,
            test_size=custom_size,
            random_state=custom_random_state)
        tr_features = X_train
        tr_labels = y_train
        autoFS_module = self.step3
        fs_num, fs_results = autoFS_module.fit(tr_features, tr_labels)
        DICT_FEATURE_SELECTION_INFO["Dataset_" + str(dataset_num)] = fs_results
        logger.info("[Results Report]:")
        logger.info(f">>> autoFS summary - This dataset has the top {fs_num} important features: {fs_results}.")
        # Keep only the selected features in every split.
        tr_features = tr_features[list(fs_results)]
        val_features = X_val[list(fs_results)]
        val_labels = y_val
        ts_features = X_test[list(fs_results)]
        ts_labels = y_test
        DICT_PER_DATA = {"DICT_Train": {}, "DICT_Validate": {}, "DICT_TEST": {}}
        DICT_PER_DATA["DICT_Train"]["X"] = tr_features
        DICT_PER_DATA["DICT_Train"]["y"] = tr_labels
        DICT_PER_DATA["DICT_Validate"]["X"] = val_features
        DICT_PER_DATA["DICT_Validate"]["y"] = val_labels
        DICT_PER_DATA["DICT_TEST"]["X"] = ts_features
        DICT_PER_DATA["DICT_TEST"]["y"] = ts_labels
        DICT_DATA["Dataset_" + str(dataset_num)] = DICT_PER_DATA
        autoCV_module = self.step4
        cv_num, DICT_EST = autoCV_module.fit(tr_features, tr_labels)
        for est in DICT_EST.keys():
            results = DICT_EST[est]
            logger.info(f">>> autoCV summary - {est} model CrossValidation with {cv_num} folds:")
            logger.info(' - Best Parameters: {}\n'.format(results.best_params_))
            logger.info(' - Best CV Score: {}\n'.format(results.best_score_))
        evaluate_module = self.step5
        if (evaluate_module.model_type == "cls"):
            metrics_cols = ['Model_Name', 'Accuracy', 'Precision', 'Recall', 'Latency', 'Best_Parameters']
        if (evaluate_module.model_type == "reg"):
            metrics_cols = ['Model_Name', 'R2', 'MAE', 'MSE', 'RMSE', 'Latency', 'Best_Parameters']
        metrics_df = pd.DataFrame(columns=metrics_cols)
        for est in DICT_EST.keys():
            optimal_scores = evaluate_module.fit(name=est,
                                                 model=DICT_EST[est].best_estimator_,
                                                 features=val_features,
                                                 labels=val_labels)
            optimal_scores.append(str([i for i in DICT_EST[est].best_params_.items()]))
            # DataFrame.append() was removed in pandas 2.0; pd.concat() is the equivalent row append.
            metrics_df = pd.concat([metrics_df, pd.DataFrame([optimal_scores], columns=metrics_cols)],
                                   ignore_index=True)
            if (evaluate_module.model_type == "cls"):
                logger.info(
                    '>>> {} Model Validation Results -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}s'.format(
                        optimal_scores[0], optimal_scores[1], optimal_scores[2], optimal_scores[3], optimal_scores[4]))
            if (evaluate_module.model_type == "reg"):
                logger.info(
                    '>>> {} Model Validation Results -- R^2 Score: {} / Mean Absolute Error: {} / Mean Squared Error: {} / Root Mean Squared Error: {} / Latency: {}s'.format(
                        optimal_scores[0], optimal_scores[1], optimal_scores[2], optimal_scores[3], optimal_scores[4], optimal_scores[5]))
        DICT_MODELS_EVALUATION["Dataset_" + str(dataset_num)] = metrics_df
        logger.info(f"Total executed {round((time() - start_time) / 60, 4)} minutes")
        time_est = round(((time() - start_time) / 60) * (total_loop - loop_num), 4)
        update_progress(loop_num / total_loop,
                        clear_flag=True,
                        process_name="autoFS & autoCV Iteration",
                        time_est=time_est)
        loop_num += 1
    dict_flow = DICT_MODELS_EVALUATION
    for key in dict_flow.keys():
        dict_flow[key]['Dataset'] = key
    if (evaluate_module.model_type == "cls"):
        models_summary = pd.concat([dict_flow[i] for i in dict_flow.keys()],
                                   ignore_index=True).sort_values(
                                       by=['Accuracy', 'Precision', 'Recall', 'Latency'],
                                       ascending=[False, False, False, True])
        models_summary = models_summary[["Dataset", "Model_Name", "Best_Parameters",
                                         'Accuracy', 'Precision', 'Recall', 'Latency']]
    if (evaluate_module.model_type == "reg"):
        models_summary = pd.concat([dict_flow[i] for i in dict_flow.keys()],
                                   ignore_index=True).sort_values(
                                       by=['R2', 'MAE', 'MSE', 'RMSE', 'Latency'],
                                       ascending=[False, True, True, True, True])
        models_summary = models_summary[["Dataset", "Model_Name", "Best_Parameters",
                                         'R2', 'MAE', 'MSE', 'RMSE', 'Latency']]
    logger.info(Start_log)
    print("The top 5 Models with Best Performance Metrics:")
    print(models_summary.head(5))
    logger.info("The top 5 Models with Best Performance Metrics:")
    logger.info(models_summary.head(5))
    return (DICT_PREP_INFO, DICT_FEATURE_SELECTION_INFO, DICT_MODELS_EVALUATION, DICT_DATA, models_summary)
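# --- Usage sketch (illustrative, not part of the module) ---
# A minimal end-to-end example of wiring the five steps this fit() consumes
# (step1..step5), assuming the enclosing class is optimalflow.autoPipe's autoPipe and
# that pipeline_splitting_rule lives in optimalflow.utilis_func as documented; the
# CSV path is a hypothetical placeholder and custom_parameters=None is assumed to
# fall back to the module's default preprocessing parameters.
#
#   import pandas as pd
#   from optimalflow.autoPipe import autoPipe
#   from optimalflow.autoPP import dynaPreprocessing
#   from optimalflow.autoFS import dynaFS_clf
#   from optimalflow.autoCV import dynaClassifier, evaluate_model
#   from optimalflow.utilis_func import pipeline_splitting_rule
#
#   df = pd.read_csv('./data/breast_cancer.csv')  # hypothetical path
#   pipe = autoPipe([
#       ("autoPP", dynaPreprocessing(custom_parameters=None, label_col='diagnosis', model_type="cls")),
#       ("datasets_splitting", pipeline_splitting_rule(val_size=0.2, test_size=0.2, random_state=13)),
#       ("autoFS", dynaFS_clf(fs_num=5, random_state=13, cv=5, in_pipeline=True, input_from_file=False)),
#       ("autoCV", dynaClassifier(random_state=13, cv_num=5, in_pipeline=True, input_from_file=False)),
#       ("model_evaluate", evaluate_model(model_type="cls"))])
#   DICT_PREP_INFO, DICT_FEATURE_SELECTION_INFO, DICT_MODELS_EVALUATION, DICT_DATA, models_summary = pipe.fit(df)
#   print(models_summary.head(5))  # top-ranked dataset/model combinations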
def fit(self, input_data=None):
    """Fits and transforms a pandas dataframe into datasets with non-missing values, outliers excluded, categories encoded, and scaling applied, across all algorithm permutations.

    Parameters
    ----------
    input_data : pandas dataframe, shape = [n_samples, n_features]
        NOTE: The input data should be a dataset after basic data cleaning & sound feature deduction; the more features are involved, the more column-permutation outputs will result.

    Returns
    -------
    DICT_PREP_DF : dictionary
        Each key is the number of an output preprocessed dataset; each value stores the dataset.
    DICT_PREP_INFO : dictionary
        Dictionary for reference. Each key is the number of an output preprocessed dataset; each value stores the column names of the dataset.

    NOTE - Log records will be generated and saved to the ./logs folder automatically.
    """
    if (self.export_output_files):
        df_folder = os.path.join(os.getcwd(), 'dfs')
        if not os.path.exists(df_folder):
            os.makedirs(df_folder)
        for leftover in os.listdir(df_folder):
            os.remove(os.path.join(df_folder, leftover))
    DICT_DFS = {}
    for i in range(len(self.parameters.get("winsorizer"))):
        pp = PPtools(label_col=self.label_col, data=input_data, model_type=self.model_type)
        pp.split_category_cols()
        initial_num_cols = pp.num_df.columns
        pp.impute_tool()
        pp.winsorize_tool(lower_ban=self.parameters.get("winsorizer")[i][0],
                          upper_ban=self.parameters.get("winsorizer")[i][1])
        winsorized_df_cols_list = list(pp.num_df.columns)
        encoded_cols_list = {}
        for col in pp.cat_df.columns:
            encoded_cols_list[col] = []
            if (pp.cat_df[col].nunique() < self.parameters.get("encode_band")[0]):
                for en_type in self.parameters.get("low_encode"):
                    encoded_col = pp.encode_tool(en_type=en_type, category_col=col)
                    encoded_cols_list[col].append(list(encoded_col.columns))
                    pp.num_df = pd.concat([pp.num_df, encoded_col], axis=1)
            if (pp.cat_df[col].nunique() >= self.parameters.get("encode_band")[0]):
                for en_type in self.parameters.get("high_encode"):
                    encoded_col = pp.encode_tool(en_type=en_type, category_col=col)
                    encoded_cols_list[col].append(list(encoded_col.columns))
                    pp.num_df = pd.concat([pp.num_df, encoded_col], axis=1)
        args_list = []
        for key in encoded_cols_list.keys():
            args_list.append(encoded_cols_list[key])
        # Materializing the product consumes the iterator, so it is only used to count the total;
        # the loop below re-creates the product.
        iters_combined = itertools.product(*args_list)
        loop_num = 1
        total_loop = len(list(iters_combined))
        for number, combination in enumerate(itertools.product(*args_list)):
            start_time = time()
            combined_cols_list = []
            combined_cols_list.append(winsorized_df_cols_list)
            for ele in list(combination):
                combined_cols_list.append(ele)
            combined_cols_list = [item for sublist in combined_cols_list for item in sublist]
            encoded_df = pp.num_df[pp.num_df.columns.intersection(combined_cols_list)]
            encoded_df = pp.remove_zero_col_tool(encoded_df)
            category_sparsity_score = pp.sparsity_tool(encoded_df[encoded_df.columns.difference(list(initial_num_cols))])
            if (category_sparsity_score > self.parameters["sparsity"][0]) and ((len(encoded_df.columns) + 1) <= self.parameters["cols"][0]):
                logger.info(Test_comment)
                logger.info(f"Current Running Dataset No. {number}:")
                if (self.export_output_files):
                    temp_dfs = os.path.join(df_folder, f"winsor_{i}_{number}.csv")
                    encoded_df.to_csv(temp_dfs, index=False)
                for sca in self.parameters["scaler"]:
                    DICT_DFS[f"winsor_{i}-Scaler_{sca}-Dataset_{number}"] = pd.concat(
                        [pp.data[self.label_col], pp.scale_tool(df=encoded_df, sc_type=sca)], axis=1)
                    logger.info(f">>> Winsorization strategy is {i}")
                    logger.info(f">>> Scaler strategy is {sca}")
                    logger.info(f">>> Encoding strategy: {list(combination)}")
                    logger.info(f">>> Total columns including the label column: {len(list(encoded_df.columns)) + 1}")
                    logger.info(f">>> Encoded Category Columns' Sparsity Score: {str(category_sparsity_score)}")
            time_est = round(((time() - start_time) / 60) * (total_loop - loop_num), 4)
            update_progress(loop_num / total_loop,
                            clear_flag=True,
                            process_name="Data Preprocessing Ensemble Iteration",
                            time_est=time_est)
            loop_num += 1
    DICT_PREP_INFO = {}
    DICT_PREP_DF = {}
    for number, key in enumerate(DICT_DFS.keys()):
        DICT_PREP_INFO["Dataset_" + str(number)] = key.split("Dataset_", 1)[0] + "- Encoded Features:" + str(list(DICT_DFS[key].columns))
    for number, key in enumerate(DICT_DFS.keys()):
        DICT_PREP_DF["Dataset_" + str(number)] = DICT_DFS[key]
    return (DICT_PREP_DF, DICT_PREP_INFO)