Example #1
    def fit(self, tr_features=None, tr_labels=None):
        """Fit and train datasets with regression hyperparameters GridSearch and CV across multiple estimators.
        
        Parameters
        ----------

        tr_features : df, default = None
            Train features columns.
            (NOTE: In the Pipeline Cluster Traversal Experiments, the feature columns should come from the same pipeline dataset.)
        tr_labels : df, default = None
            Train label column.
            (NOTE: In the Pipeline Cluster Traversal Experiments, the label column should come from the same pipeline dataset.)

        Returns
        -------
        cv_num : int
            Number of folds used for cross-validation.
        DICT_EST : dictionary
            Each key is an estimator name; each value is the related trained model.

        NOTE - The trained model auto-save feature is only available when in_pipeline = False.
        NOTE - Log records are generated and saved to the ./logs folder automatically.
        """
        if (self.input_from_file):
            tr_labels = tr_labels.values.ravel()
        reg = reg_cv(cv_val=self.cv_num, random_state=self.random_state)
        # estimators = ['lr','knn','tree','svm','mlp','rf','gb','ada','xgb','hgboost','huber','rgcv','cvlasso','sgd']
        estimators = self.set_estimators
        if (not self.in_pipeline):
            pkl_folder = os.path.join(os.getcwd(), 'pkl')
            if not os.path.exists(pkl_folder):
                os.makedirs(pkl_folder)
        loop_num = 1
        total_loop = len(estimators)

        for est in estimators:
            start_time = time()
            logger.info(Test_comment)
            logger.info(f"Current Running:" + est + " estimator")
            try:
                cv_est = getattr(reg, est)()
                cv_est.fit(tr_features, tr_labels)
                if (not self.in_pipeline):
                    model_name = os.path.join(pkl_folder,
                                              f'{est}_reg_model.pkl')
                    joblib.dump(cv_est.best_estimator_, model_name)
                    time_est = round(
                        ((time() - start_time) / 60) * (total_loop - loop_num),
                        4)
                    update_progress(
                        loop_num / total_loop,
                        clear_flag=False,
                        process_name="Model Selection w/ Cross-validation",
                        time_est=time_est)

                print(
                    f"\n    *optimalflow* autoCV Module ===> {est} model CrossValidation with {self.cv_num} folds:"
                )
                print_results(cv_est, self.in_pipeline)
                self.DICT_EST[est] = cv_est

                logger.info(
                    f"This estimator executed {round((time()-start_time)/60,4)} minutes"
                )
                loop_num += 1
            except Exception:
                print(est + " estimator is not available.")
                if (not self.in_pipeline):
                    time_est = round(
                        ((time() - start_time) / 60) * (total_loop - loop_num),
                        4)
                    update_progress(
                        loop_num / total_loop,
                        clear_flag=True,
                        process_name="Model Selection w/ Cross-validation",
                        time_est=time_est)
                logger.info(
                    f"This estimator executed {round((time()-start_time)/60,4)} minutes"
                )
                loop_num += 1
        return (self.cv_num, self.DICT_EST)
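Usage sketch for Example #1: a minimal call pattern, assuming this fit belongs to optimalflow's dynaRegressor class (autoCV module) and that the constructor accepts the attributes the method reads (cv_num, random_state); the CSV path and 'label' column name are hypothetical.

import pandas as pd
from optimalflow.autoCV import dynaRegressor  # assumed import path

# Hypothetical preprocessed training data with a 'label' column.
data = pd.read_csv('./data/preprocessed_train.csv')
tr_features = data.drop('label', axis=1)
tr_labels = data[['label']]  # kept as a dataframe so input_from_file can ravel it

model_selector = dynaRegressor(cv_num=5, random_state=13)  # parameter names assumed
cv_num, DICT_EST = model_selector.fit(tr_features, tr_labels)

# Each value in DICT_EST is a fitted, grid-searched estimator wrapper,
# so e.g. DICT_EST['lr'].best_estimator_ is the tuned linear regression model.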
Example #2
    def fit(self, tr_features, tr_labels):
        """Fits and transforms a dataframe with built-in algorithms, to select top features.
        
        Parameters
        ----------
        tr_features : df, default = None
            Train features columns. 
            (NOTE: In the Pipeline Cluster Traversal Experiments, the features columns should be from the same pipeline dataset).
        tr_labels : array/df, default = None
            Train label column; when input_from_file = True, this must be a pandas dataframe.
            (NOTE: In the Pipeline Cluster Traversal Experiments, the label column should be from the same pipeline dataset).
             
        Returns
        -------
        fs_num : int
            Number of top features selected.
        fs_results : array
            Selected & ranked top feature names.
        
        NOTE - Log records are generated and saved to the ./logs folder automatically.
        """
        if (self.input_from_file):
            tr_labels = tr_labels.values.ravel()

        reg = reg_fs(fs_num=self.fs_num,
                     random_state=self.random_state,
                     cv=self.cv)
        #selectors = ['kbest_f','rfe_svm','rfe_tree','rfe_rf','rfecv_svm','rfecv_tree','rfecv_rf']
        selectors = self.set_selectors
        loop_num = 1
        total_loop = len(selectors)
        selected_features = []
        for selector in selectors:
            start_time = time()
            if (not self.in_pipeline):
                logger.info(Test_comment)
                logger.info(f"Current Running:" + selector + " selector")
            try:
                reg_selector = getattr(reg, selector)()
                reg_sel_result = reg_selector.fit(tr_features, tr_labels)
                fs_feature = rank_fs_result(reg_sel_result,
                                            tr_features.head(1))

                selected_features.extend(fs_feature)
                if (not self.in_pipeline):
                    update_progress(loop_num / total_loop,
                                    process_name="Feature Selection Iteration")
                    print(
                        f'\n      *optimalflow* autoFS Module ===> Selector {selector} gets outputs: {fs_feature}'
                    )
                    logger.info(
                        f"This selector executed {round((time()-start_time)/60,4)} minutes"
                    )
                loop_num += 1

            except Exception:
                if (not self.in_pipeline):
                    print(selector + " selector is not available.")
                    update_progress(loop_num / total_loop,
                                    process_name="Feature Selection Iteration")
                    logger.info(
                        f"This selector executed {round((time()-start_time)/60,4)} minutes"
                    )
                loop_num += 1

        # Rank features by how many selectors picked them, then deduplicate
        # (order-preserving) and keep the top fs_num.
        counts = Counter(selected_features)
        fs_results = sorted(selected_features, key=lambda x: -counts[x])
        fs_results = unique(fs_results)[:self.fs_num]
        if (not self.in_pipeline):
            print(
                f"The optimalflow autoFS module identifies the top {self.fs_num} important features for regression: {fs_results}."
            )
            logger.info(
                f"The optimalflow autoFS module identifies the top {self.fs_num} important features for regression: {fs_results}."
            )
        return (self.fs_num, fs_results)
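Usage sketch for Example #2, under the same assumptions: the class name dynaFS_reg (autoFS module) and its constructor parameters (fs_num, random_state, cv) are inferred from the attributes fit() reads; the data file is hypothetical.

import pandas as pd
from optimalflow.autoFS import dynaFS_reg  # assumed import path

data = pd.read_csv('./data/preprocessed_train.csv')  # hypothetical file
tr_features = data.drop('label', axis=1)
tr_labels = data[['label']]

selector = dynaFS_reg(fs_num=5, random_state=13, cv=5)  # parameter names assumed
fs_num, fs_results = selector.fit(tr_features, tr_labels)
print(f"Top {fs_num} features: {fs_results}")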
Example #3
    def fit(self, data):
        """Fits and transforms a chain of Optimal Flow modules.
        
        Parameters
        ----------
        data : pandas dataframe, shape = [n_samples, n_features]

            NOTE:
            The input data should be a dataset that has already had basic cleaning and sound feature reduction applied; the more features involved, the more column-permutation outputs will be produced.
        
        Returns
        -------
        DICT_PREP_INFO : dictionary
            Each key is the ID of a preprocessed dataset ("Dataset_xxx" format, e.g. "Dataset_10"); each value stores an info string describing which transforms were applied.
            e.g. DICT_PREP_INFO['Dataset_0'] stores the value "winsor_0-Scaler_None-- Encoded Features:['diagnosis', 'Size_3', 'area_mean']", meaning the 1st winsorization mode was applied, no scaler was applied, and the listed columns were encoded (the encoding approaches are shown in the names).
        DICT_FEATURE_SELECTION_INFO : dictionary
            Each key is the ID of a preprocessed dataset; each value stores the names of the features selected by the autoFS module.
        DICT_MODELS_EVALUATION : dictionary
            Each key is the ID of a preprocessed dataset; each value stores the model evaluation results on its validation dataset.
        DICT_DATA : dictionary
            Each key is the ID of a preprocessed dataset; the first-level sub-key is the type of split set (one of 'DICT_Train', 'DICT_Validate', and 'DICT_TEST').
            The second-level sub-key is "X" for features and "y" for the label; each value stores the related dataset (pandas dataframe format).
            e.g. DICT_DATA['Dataset_0']['DICT_TEST']["X"] holds the test features of Dataset_0.
        models_summary : pandas dataframe
            Model selection ranking table across all combinations of preprocessed datasets, selected features, and all possible models with optimal parameters.

        NOTE - Log records are generated and saved to the ./logs folder automatically.
        """
        dyna = self.step1
        DICT_PREP_DF, DICT_PREP_INFO = dyna.fit(input_data=data)
        print(f"Total combinations: {len(DICT_PREP_DF.keys())}")
        logger.info(f"Total combinations: {len(DICT_PREP_DF.keys())}")
        # Tracking the metrics values
        DICT_MODELS_EVALUATION = {}
        # Feature Selction tracking
        DICT_FEATURE_SELECTION_INFO = {}

        DICT_DATA = {}

        loop_num = 1
        total_loop = len(DICT_PREP_DF.keys())
        for number, key in enumerate(DICT_PREP_DF.keys()):
            combination_df = DICT_PREP_DF[key]
            start_time = time()
            logger.info(Test_comment)
            dataset_num = key.split("Dataset_", 1)[1]
            logger.info(
                f"Current Running Preprocessed Dataset No. {dataset_num}:")
            features = combination_df.drop(dyna.label_col, axis=1)
            labels = combination_df[dyna.label_col]

            logger.info("[Features Before autoFS]: ")
            logger.info(list(features.columns))

            custom_val_size, custom_size, custom_random_state = self.step2
            X_train, y_train, X_val, y_val, X_test, y_test = data_splitting_tool(
                feature_cols=features,
                label_col=labels,
                val_size=custom_val_size,
                test_size=custom_size,
                random_state=custom_random_state)
            tr_features = X_train
            tr_labels = y_train
            autoFS_module = self.step3
            fs_num, fs_results = autoFS_module.fit(tr_features, tr_labels)
            DICT_FEATURE_SELECTION_INFO["Dataset_" +
                                        str(dataset_num)] = fs_results
            logger.info(f"[Results Report]:")
            logger.info(
                f">>> autoFS summary - This dataset has the top {fs_num} important features: {fs_results}."
            )

            tr_features = tr_features[list(fs_results)]
            val_features = X_val[list(fs_results)]
            val_labels = y_val
            ts_features = X_test[list(fs_results)]
            ts_labels = y_test

            DICT_PER_DATA = {
                "DICT_Train": {},
                "DICT_Validate": {},
                "DICT_TEST": {}
            }
            DICT_PER_DATA["DICT_Train"]["X"] = tr_features
            DICT_PER_DATA["DICT_Train"]["y"] = tr_labels
            DICT_PER_DATA["DICT_Validate"]["X"] = val_features
            DICT_PER_DATA["DICT_Validate"]["y"] = val_labels
            DICT_PER_DATA["DICT_TEST"]["X"] = ts_features
            DICT_PER_DATA["DICT_TEST"]["y"] = ts_labels

            DICT_DATA["Dataset_" + str(dataset_num)] = DICT_PER_DATA

            autoCV_module = self.step4
            cv_num, DICT_EST = autoCV_module.fit(tr_features, tr_labels)
            for est in DICT_EST.keys():
                results = DICT_EST[est]
                logger.info(
                    f">>> autoCV summary - {est} model CrossValidation with {cv_num} folds:"
                )
                logger.info('     - Best Parameters: {}\n'.format(
                    results.best_params_))
                logger.info('     - Best CV Score: {}\n'.format(
                    results.best_score_))

            evaluate_module = self.step5
            if (evaluate_module.model_type == "cls"):
                metrics_df = pd.DataFrame(columns=[
                    'Model_Name', 'Accuracy', 'Precision', 'Recall', 'Latency',
                    'Best_Parameters'
                ])
            if (evaluate_module.model_type == "reg"):
                metrics_df = pd.DataFrame(columns=[
                    'Model_Name', 'R2', 'MAE', 'MSE', 'RMSE', 'Latency',
                    'Best_Parameters'
                ])

            for est in DICT_EST.keys():
                optimal_scores = evaluate_module.fit(
                    name=est,
                    model=DICT_EST[est].best_estimator_,
                    features=val_features,
                    labels=val_labels)
                optimal_scores.append(
                    str([i for i in DICT_EST[est].best_params_.items()]))

                if (evaluate_module.model_type == "cls"):
                    metrics_df = metrics_df.append(
                        pd.DataFrame([optimal_scores],
                                     columns=[
                                         'Model_Name', 'Accuracy', 'Precision',
                                         'Recall', 'Latency', 'Best_Parameters'
                                     ]))
                    logger.info(
                        '>>> {} Modle Validation Results -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}s'
                        .format(optimal_scores[0], optimal_scores[1],
                                optimal_scores[2], optimal_scores[3],
                                optimal_scores[4]))

                if (evaluate_module.model_type == "reg"):
                    metrics_df = metrics_df.append(
                        pd.DataFrame([optimal_scores],
                                     columns=[
                                         'Model_Name', 'R2', 'MAE', 'MSE',
                                         'RMSE', 'Latency', 'Best_Parameters'
                                     ]))
                    logger.info(
                        '>>> {} Model Validation Results -- R^2 Score: {} / Mean Absolute Error: {} / Mean Squared Error: {} / Root Mean Squared Error: {} / Latency: {}s'
                        .format(optimal_scores[0], optimal_scores[1],
                                optimal_scores[2], optimal_scores[3],
                                optimal_scores[4], optimal_scores[5]))

            DICT_MODELS_EVALUATION["Dataset_" + str(dataset_num)] = metrics_df

            logger.info(
                f"Total executed {round((time()-start_time)/60,4)} minutes")
            time_est = round(
                ((time() - start_time) / 60) * (total_loop - loop_num), 4)
            update_progress(loop_num / total_loop,
                            clear_flag=True,
                            process_name="autoFS & autoCV Iteration",
                            time_est=time_est)
            loop_num += 1

        dict_flow = DICT_MODELS_EVALUATION
        for key in dict_flow.keys():
            dict_flow[key]['Dataset'] = key
        if (evaluate_module.model_type == "cls"):
            models_summary = pd.concat(
                [dict_flow[i] for i in dict_flow.keys()],
                ignore_index=True).sort_values(
                    by=['Accuracy', 'Precision', 'Recall', 'Latency'],
                    ascending=[False, False, False, True])
            models_summary = models_summary[[
                "Dataset", "Model_Name", "Best_Parameters", 'Accuracy',
                'Precision', 'Recall', 'Latency'
            ]]
        if (evaluate_module.model_type == "reg"):
            models_summary = pd.concat(
                [dict_flow[i] for i in dict_flow.keys()],
                ignore_index=True).sort_values(
                    by=['R2', 'MAE', 'MSE', 'RMSE', 'Latency'],
                    ascending=[False, True, True, True, True])
            models_summary = models_summary[[
                "Dataset", "Model_Name", "Best_Parameters", 'R2', 'MAE', 'MSE',
                'RMSE', 'Latency'
            ]]
        logger.info(Start_log)
        print(f"The top 5 Models with Best Performance Metrics:")
        print(models_summary.head(5))
        logger.info(f"The top 5 Models with Best Performance Metrics:")
        logger.info(models_summary.head(5))

        return (DICT_PREP_INFO, DICT_FEATURE_SELECTION_INFO,
                DICT_MODELS_EVALUATION, DICT_DATA, models_summary)
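A sketch of how the five steps this pipeline reads (self.step1 through self.step5) could be wired together. The wiring is inferred from the attribute accesses in fit() above; the autoPipe constructor signature, the evaluate_model class name, and all parameter values are assumptions, not the library's confirmed API.

import pandas as pd
from optimalflow.autoPP import dynaPreprocessing   # step1: preprocessing permutations
from optimalflow.autoFS import dynaFS_reg          # step3: feature selection
from optimalflow.autoCV import dynaRegressor       # step4: model selection w/ CV
from optimalflow.autoCV import evaluate_model      # step5: validation metrics (assumed name)
from optimalflow.autoPipe import autoPipe          # chaining class (assumed)

# Assumed: autoPipe stores these five arguments as self.step1 .. self.step5.
pipe = autoPipe([
    dynaPreprocessing(custom_parameters=None, label_col='label', model_type='reg'),
    (0.2, 0.2, 13),                        # step2 unpacks as (val_size, test_size, random_state)
    dynaFS_reg(fs_num=5, random_state=13, cv=5),
    dynaRegressor(cv_num=5, random_state=13),
    evaluate_model(model_type='reg'),
])

data = pd.read_csv('./data/cleaned.csv')   # hypothetical file after basic cleaning
DICT_PREP_INFO, DICT_FEATURE_SELECTION_INFO, DICT_MODELS_EVALUATION, DICT_DATA, models_summary = pipe.fit(data)
print(models_summary.head(5))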
Example #4
    def fit(self, input_data = None):
        """Fits and transforms a pandas dataframe to non-missing values, outlier excluded, categories encoded and scaled datasets by all algorithms permutation.
        
        Parameters
        ----------
        input_data : pandas dataframe, shape = [n_samples, n_features]

            NOTE:
            The input data should be a dataset that has already had basic cleaning and sound feature reduction applied; the more features involved, the more column-permutation outputs will be produced.
        
        Returns
        -------
        DICT_PREP_DF : dictionary
            Each key is the ID of an output preprocessed dataset; each value stores the dataset itself.
        DICT_PREP_INFO : dictionary
            Dictionary for reference. Each key is the ID of an output preprocessed dataset; each value stores the column names of that dataset.

        NOTE - Log records are generated and saved to the ./logs folder automatically.
        """
                        
        if (self.export_output_files):
            df_folder = os.path.join(os.getcwd(),'dfs')
            if not os.path.exists(df_folder):
                os.makedirs(df_folder)
            for l in os.listdir(df_folder):
                os.remove(os.path.join(df_folder,l))
        DICT_DFS={}
        for i in range(len(self.parameters.get("winsorizer"))):
            pp = PPtools(label_col = self.label_col, data = input_data, model_type = self.model_type)
            pp.split_category_cols()
            initial_num_cols = pp.num_df.columns
            pp.impute_tool()
            pp.winsorize_tool(lower_ban = self.parameters.get("winsorizer")[i][0],upper_ban = self.parameters.get("winsorizer")[i][1])
            winsorized_df_cols_list = list(pp.num_df.columns)
            encoded_cols_list = {}
            for col in pp.cat_df.columns:
                encoded_cols_list[col] = []
                if(pp.cat_df[col].nunique() < self.parameters.get("encode_band")[0]):
                    for en_type in self.parameters.get("low_encode"):
                        encoded_col = pp.encode_tool(en_type = en_type ,category_col = col)
                        encoded_cols_list[col].append(list(encoded_col.columns))
                        pp.num_df = pd.concat([pp.num_df,encoded_col],axis = 1)                       

                if(pp.cat_df[col].nunique() >= self.parameters.get("encode_band")[0]):
                    for en_type in self.parameters.get("high_encode"):
                        encoded_col = pp.encode_tool(en_type = en_type ,category_col = col)
                        encoded_cols_list[col].append(list(encoded_col.columns))                       
                        pp.num_df = pd.concat([pp.num_df,encoded_col],axis = 1)

            args_list = []
            for key in encoded_cols_list.keys():
                args_list.append(encoded_cols_list[key])
            # Materialize the product once just to count combinations for progress
            # tracking; the loop below re-creates the same iterator.
            loop_num = 1
            total_loop = len(list(itertools.product(*args_list)))
            for number, combination in enumerate(itertools.product(*args_list)):
                start_time = time()
                combined_cols_list = []
                combined_cols_list.append(winsorized_df_cols_list)
                
                for ele in list(combination):
                    combined_cols_list.append(ele)
                
                combined_cols_list = [item for sublist in combined_cols_list for item in sublist]
                encoded_df = pp.num_df[pp.num_df.columns.intersection(combined_cols_list)]
                encoded_df = pp.remove_zero_col_tool(encoded_df)
                category_sparsity_score = pp.sparsity_tool(encoded_df[encoded_df.columns.difference(list(initial_num_cols))])
                if (category_sparsity_score > self.parameters["sparsity"][0]) and ((len(encoded_df.columns)+1)<=self.parameters["cols"][0]):
                    logger.info(Test_comment)
                    logger.info(f"Current Running Dataset No. {number} :")
                    if (self.export_output_files):
                        temp_dfs = os.path.join(df_folder, f"winsor_{i}_{number}.csv")
                        encoded_df.to_csv(temp_dfs, index = False)
                    

                    for sca in self.parameters["scaler"]:
                        DICT_DFS[f"winsor_{i}-Scaler_{sca}-Dataset_{number}"] = pd.concat([pp.data[self.label_col], pp.scale_tool(df = encoded_df,sc_type = sca)],axis = 1)
                        logger.info(f">>> winsorized_Strategy is {i}")
                        logger.info(f">>> Scaler stragety is {sca}")
                        logger.info(f">>> Encoding strategy: {list(combination)}")
                        logger.info(f">>> Total columns with label column is: {len(list(encoded_df.columns))+1}")
                        logger.info(f">>> Encoded Category Columns' Sparsity Score: {str(category_sparsity_score)}")
                time_est = round(((time()-start_time)/60)*(total_loop - loop_num),4)
                update_progress(loop_num/total_loop, clear_flag = True, process_name = "Data Preprocessing Ensemble Iteration", time_est = time_est)
                loop_num += 1
        DICT_PREP_INFO = {}
        DICT_PREP_DF = {}
        for number, key in enumerate(DICT_DFS.keys()):
            DICT_PREP_INFO["Dataset_" + str(number)] = key.split("Dataset_", 1)[0] + "- Encoded Features:" + str(list(DICT_DFS[key].columns))
            DICT_PREP_DF["Dataset_" + str(number)] = DICT_DFS[key]
        
        return (DICT_PREP_DF, DICT_PREP_INFO)
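Usage sketch for Example #4. The keys of the parameters dictionary are taken directly from what fit() reads (winsorizer, encode_band, low_encode, high_encode, sparsity, cols, scaler); the dynaPreprocessing class name, its constructor argument names, and the example encoder/scaler values are illustrative assumptions.

import pandas as pd
from optimalflow.autoPP import dynaPreprocessing  # assumed import path

custom_parameters = {
    "winsorizer": [(0.01, 0.01), (0.05, 0.05)],  # (lower_ban, upper_ban) pairs per strategy
    "encode_band": [4],              # nunique() threshold splitting low/high-cardinality encoders
    "low_encode": ["onehot"],        # encoders for low-cardinality columns (illustrative values)
    "high_encode": ["frequency"],    # encoders for high-cardinality columns (illustrative values)
    "sparsity": [0.5],               # minimum sparsity score a combination must exceed
    "cols": [30],                    # maximum column count, label column included
    "scaler": ["None", "standard"],  # "None" appears verbatim in the output keys
}

dyna = dynaPreprocessing(custom_parameters=custom_parameters,
                         label_col='label', model_type='reg')  # argument names assumed
DICT_PREP_DF, DICT_PREP_INFO = dyna.fit(input_data=pd.read_csv('./data/cleaned.csv'))
print(DICT_PREP_INFO['Dataset_0'])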