Пример #1
0
    def set_run_identificaton(self, script_name: str) -> bool:
        """Derive the output folders and file prefix used by the current run.

        Sets ``self.app_folder``, ``self.run_folder`` and ``self.prefix_name``
        from ``self.param.output_path``, ``self.param.app_name``, the script
        name and a timestamp formatted from ``self.datetime_init``.

        Args:
            script_name: Script file name; it is passed to
                ``Util.get_name_and_extension_from_file`` and only the first
                element of the result (the name) is used.

        Returns:
            Always ``True``.
        """
        # Keep only the name part; the extension is discarded.
        base_name, _ = Util.get_name_and_extension_from_file(script_name)

        run_stamp = self.datetime_init.strftime("%y-%m-%d-%H-%M-%S")
        app_name = self.param.app_name

        # <output_path>/<app_name>/ is shared by every run of the application.
        app_dir = "".join((self.param.output_path, "/", app_name, "/"))

        self.app_folder = app_dir
        # Each run gets its own sub-folder: <script>_<timestamp>/
        self.run_folder = "".join((app_dir, base_name, "_", run_stamp, "/"))
        self.prefix_name = "".join((app_name, "_", run_stamp, "_"))
        return True
Пример #2
0
    def init_script(self,
                    script_name: str = "",
                    warnings_level: str = "") -> bool:
        """Prepare logging, output folders and optional tracking for a run.

        Only the ``'console_localhost'`` environment is handled. For it, the
        method records the start time, configures the root logger, applies the
        requested warnings filter, computes the run folders through
        ``set_run_identificaton``, creates them on disk and, when
        ``self.param.tracking`` is truthy, sets up MLflow tracking.

        Args:
            script_name: Name of the running script; stored in
                ``self.script_name`` and used to name the run folder.
            warnings_level: Action passed to ``warnings.filterwarnings``
                (for example ``"ignore"``). An empty value or ``None`` leaves
                the warning filters untouched.

        Returns:
            ``True`` when the environment was initialised, ``False`` when
            ``self.param.environment_id`` is not a supported environment.
        """
        if self.param.environment_id != 'console_localhost':
            logging.error('Environment id not valid')
            # Report the failure instead of claiming success.
            return False

        # Measure time consumption
        self.datetime_init = datetime.datetime.now()
        self.time_init = time.time()
        self.script_name = script_name

        log = Util.get_logging_level(level=self.param.logging_level)
        logging.basicConfig(
            level=log,
            format='%(asctime)s: %(levelname)s - %(message)s',
            datefmt='%y-%m-%d %H:%M')

        # Suppress warnings. filterwarnings rejects an empty/None action, which
        # is what callers get when the PYTHON_WARNINGS variable is not set, so
        # the filter is only installed when an action was actually provided.
        if warnings_level:
            warnings.filterwarnings(warnings_level)

        logging.info(
            "======================================================================"
        )
        logging.info('Starting %s at %s',
                     self.script_name,
                     self.datetime_init.strftime("%Y-%m-%d %H:%M"))

        # Set up working folders and file prefix
        self.set_run_identificaton(script_name=self.script_name)

        # exist_ok avoids the race between checking for and creating a folder.
        os.makedirs(self.app_folder, exist_ok=True)
        os.makedirs(self.run_folder, exist_ok=True)

        # Set up the tracking server
        if self.param.tracking:
            self.tracking = MLFlowManagement()
            # Same timestamp format that names the run folder.
            tag = self.datetime_init.strftime("%y-%m-%d-%H-%M-%S")
            self.tracking.setup_mlflow_tracking(
                URI=self.MLFLOW_URI,
                experiment_name=self.param.app_name,
                run_name=tag)

        return True
Пример #3
0
    def run(self):
        """Load the configured data and refresh its visualization.

        Reads the parameters file in ``self.parameters_file``, builds the
        environment and data-processing objects from its
        ``"environment_parameters"`` and ``"view_data_parameters"`` sections,
        loads the data and hands it to ``DataVisualization.update_page``.
        """
        separator = "======================================================================"

        # Environment variables
        load_dotenv(find_dotenv())
        warnings_action = os.getenv("PYTHON_WARNINGS")

        # Script setup: load the parameters file
        config = Util.load_parameters_from_file(path_file=self.parameters_file)

        # Validate parameters and build the environment object
        environment_settings = EnvironmentParameters(
            **config.get("environment_parameters"))
        environment = Environment(param=environment_settings)

        # Validate parameters and build the data-processing object
        view_settings = Data2ViewParameters(**config.get("view_data_parameters"))
        processor = DataProcessing(param=view_settings)

        # Set up the environment (logging, folders, tracking)
        environment.init_script(
            script_name=os.path.basename(__file__),
            warnings_level=warnings_action,
        )

        # Load data
        logging.info(separator)
        logging.info('Loading Raw Data:')
        raw_data = processor.load_data()

        # Update the visualization
        logging.info(separator)
        logging.info('Update visualization of data:')
        page = DataVisualization(
            title=environment.param.app_name,
            data_param=view_settings,
        )
        page.update_page(data=raw_data)

        # Script performance
        environment.close_script()
Пример #4
0
    def run(self):
        """Train a regression model and evaluate it on the train and test subsets.

        Steps, in order:

        1. Load ``.env`` variables and the parameters file in
           ``self.parameters_file``.
        2. Build ``Environment`` and ``DataProcessing`` from the
           ``"environment_parameters"`` and ``"static2value_parameters"``
           sections and initialise the script environment.
        3. Load the ``'train'`` subset, run ``fit_transform_train_data`` on it,
           fit the model returned by ``self.model_selection`` and evaluate its
           predictions on the training data.
        4. Load the ``'test'`` subset, transform it with the objects returned
           by ``fit_transform_train_data`` and evaluate the model on it.
        5. Save both prediction reports, plot when ``view_plots`` or
           ``save_plots`` is set, publish histories when tracking is enabled
           and close the script.
        """
        separator = "======================================================================"

        # =========================================================================================
        # Environment variables
        load_dotenv(find_dotenv())
        PYTHON_WARNINGS = os.getenv("PYTHON_WARNINGS")

        # ===========================================================================================
        # Script Setup

        # Load the parameters file
        data_config = Util.load_parameters_from_file(path_file=self.parameters_file)

        # Validate parameters and load environment class
        env_param = EnvironmentParameters(**data_config.get("environment_parameters"))
        env = Environment(param=env_param)

        # Validate parameters and load data processing class
        data_param = Static2ValueParameters(**data_config.get("static2value_parameters"))
        ds = DataProcessing(param=data_param)

        # ===========================================================================================
        # Setup environment
        env.init_script(script_name=os.path.basename(__file__), warnings_level=PYTHON_WARNINGS)

        # ===========================================================================================
        # Loading data
        logging.info(separator)
        logging.info("Loading Training and Test Data:")
        data_train_input, data_train_target = ds.load_dataset(subset='train')

        logging.info(separator)
        logging.info("Fit and Transform Training Data:")
        # Everything returned here besides the transformed frames is kept so
        # the test subset can be transformed with the same objects later on.
        (
            data_train_input,
            data_train_target,
            input_var_dict,
            target_var_dict,
            numerical_input_encoder_list,
            categorical_input_encoder_int_list,
            categorical_input_encoder_hot_list,
            categorical_input_encoder_bin_list,
            categorical_input_int_to_cat_dict_list,
            categorical_input_cat_to_int_dict_list,
            txt_int_to_word_dict_list_input,
            txt_word_to_int_dict_list_input,
            numerical_output_encoder_list,
            categorical_output_encoder_int_list,
            categorical_output_encoder_hot_list,
            categorical_output_encoder_bin_list,
            int_to_cat_dict_list_output_list,
            cat_to_int_dict_list_output_list,
        ) = ds.fit_transform_train_data(
            data_train_input=data_train_input,
            data_train_target=data_train_target
        )

        logging.info(separator)
        logging.info("Building Model:")

        model = self.model_selection(data_config=data_config, data_param=data_param, environment=env)

        model.fit(
            data_input=data_train_input,
            data_target=data_train_target,
            input_var_dict=input_var_dict,
            target_var_dict=target_var_dict,
            target_cat_dict=cat_to_int_dict_list_output_list
        )

        logging.info(separator)
        logging.info("Building predictions:")

        data_train_predict = model.eval_predict(
            data_input=data_train_input,
            input_var_dict=input_var_dict,
            int_to_cat_dict_target=None)

        logging.info(separator)
        logging.info("Training Results")

        model_eval_train = RegressionModelEvaluation(
            Y_target=data_train_target[data_param.output_target],
            Y_predict=data_train_predict[['predict']],
            Y_reliability=data_train_predict[['reliability']],
            subset_label="eval_train_",
            regression_type=data_param.regression_type,
            train_history=model.history
        )

        # Compute the evaluation metrics
        model_eval_train.execute()

        # ===========================================================================================
        # Saving files
        logging.info(separator)
        logging.info("Saving Training Results:")

        # Prediction report
        prediction_report = model_eval_train.get_prediction_report()
        Util.save_dataframe(data=prediction_report, folder_path=env.run_folder,
                            prefix=env.prefix_name + "pred_train_report")

        # ===========================================================================================
        # Plotting results
        if env_param.view_plots or env_param.save_plots:
            logging.info(separator)
            logging.info("Plotting training result graphs")

            model_eval_train.plot_training_results(
                view=env_param.view_plots,
                save=env_param.save_plots,
                path=env.run_folder,
                prefix=env.prefix_name + "train_",
            )

        # ===========================================================================================
        # Evaluating test dataset
        # ===========================================================================================
        # Loading data
        logging.info(separator)
        logging.info("Loading Test Data:")

        # Drop the references to the training frames before loading the test
        # subset, to keep memory use down.
        del data_train_input, data_train_target, data_train_predict

        # Load test data
        data_test_input, data_test_target = ds.load_dataset(subset='test')

        logging.info(separator)
        logging.info("Transform Test Data:")
        (
            data_test_input,
            data_test_target,
        ) = ds.transform_test_data(
            data_test_input=data_test_input,
            data_test_target=data_test_target,
            input_var_dict=input_var_dict,
            # Fix: the target description must be target_var_dict; the input
            # dictionary was previously passed here by mistake.
            target_var_dict=target_var_dict,
            numerical_input_encoder_list=numerical_input_encoder_list,
            categorical_input_encoder_int_list=categorical_input_encoder_int_list,
            categorical_input_encoder_hot_list=categorical_input_encoder_hot_list,
            categorical_input_encoder_bin_list=categorical_input_encoder_bin_list,
            categorical_int_to_cat_dict_list_input=categorical_input_int_to_cat_dict_list,
            categorical_cat_to_int_dict_list_input=categorical_input_cat_to_int_dict_list,
            txt_int_to_word_dict_list_input=txt_int_to_word_dict_list_input,
            txt_word_to_int_dict_list_input=txt_word_to_int_dict_list_input,
            numerical_output_encoder_list=numerical_output_encoder_list,
            categorical_output_encoder_int_list=categorical_output_encoder_int_list,
            categorical_output_encoder_hot_list=categorical_output_encoder_hot_list,
            categorical_output_encoder_bin_list=categorical_output_encoder_bin_list,
            int_to_cat_dict_list_output_list=int_to_cat_dict_list_output_list,
            cat_to_int_dict_list_output_list=cat_to_int_dict_list_output_list,
        )

        logging.info(separator)
        logging.info("Test Results")
        data_test_predict = model.eval_predict(data_input=data_test_input,
                                               input_var_dict=input_var_dict,
                                               int_to_cat_dict_target=None)

        model_eval_test = RegressionModelEvaluation(
            Y_target=data_test_target[data_param.output_target],
            Y_predict=data_test_predict[['predict']],
            Y_reliability=data_test_predict[['reliability']],
            subset_label="eval_test_",
            regression_type=data_param.regression_type,
            train_history=model.history
        )

        # Compute the evaluation metrics
        model_eval_test.execute()

        # ===========================================================================================
        # Saving files
        logging.info(separator)
        logging.info("Saving Testing Results:")

        # Prediction report
        prediction_report = model_eval_test.get_prediction_report()
        Util.save_dataframe(data=prediction_report, folder_path=env.run_folder,
                            prefix=env.prefix_name + "pred_test_report")

        # ===========================================================================================
        # Plotting results
        if env_param.view_plots or env_param.save_plots:
            logging.info(separator)
            logging.info("Plotting test result graphs")

            model_eval_test.plot_test_results(
                view=env_param.view_plots,
                save=env_param.save_plots,
                path=env.run_folder,
                prefix=env.prefix_name + "test_",
            )

        # ===========================================================================================
        # Register tracking info
        if env.param.tracking:
            env.publish_results(history=ds.history)
            env.publish_results(history=model.history)
            env.publish_results(history=model_eval_train.history)
            env.publish_results(history=model_eval_test.history)
            env.tracking.log_artifacts_folder(local_dir=env.run_folder)

        # ===========================================================================================
        # Script Performance
        env.close_script()
Пример #5
0
    def run(self):
        """Prepare train/test data and fit a clustering model on the training inputs.

        Reads the parameters file in ``self.parameters_file``, builds the
        environment and data-processing objects from its
        ``"environment_parameters"`` and ``"static2cluster_parameters"``
        sections, prepares the data and fits the model returned by
        ``self.model_selection``. Prediction and evaluation steps are still
        placeholders (only their log headers are emitted).
        """
        separator = "======================================================================"

        # Environment variables
        load_dotenv(find_dotenv())
        warnings_action = os.getenv("PYTHON_WARNINGS")

        # Script setup: load the parameters file
        config = Util.load_parameters_from_file(path_file=self.parameters_file)

        # Validate parameters and build the environment object
        environment_settings = EnvironmentParameters(
            **config.get("environment_parameters"))
        environment = Environment(param=environment_settings)

        # Validate parameters and build the data-processing object
        cluster_settings = Static2ClusterParameters(
            **config.get("static2cluster_parameters"))
        processor = DataProcessing(param=cluster_settings)

        # Set up the environment (logging, folders, tracking)
        environment.init_script(
            script_name=os.path.basename(__file__),
            warnings_level=warnings_action,
        )

        # Load data; only the first element of each returned pair is used.
        logging.info(separator)
        logging.info("Loading Training and Test Data:")
        train_inputs, _ = processor.load_dataset()
        test_inputs, _ = processor.load_test_data()

        logging.info(separator)
        logging.info("Preprocessing Training Data:")
        # prepare_train_test_data returns eight values; only the prepared
        # inputs and the list of input variables are needed here.
        (
            train_inputs,
            _,
            test_inputs,
            _,
            input_columns,
            _,
            _,
            _,
        ) = processor.prepare_train_test_data(
            data_train_input=train_inputs,
            data_test_input=test_inputs,
        )

        logging.info(separator)
        logging.info("Building Model:")
        cluster_model = self.model_selection(
            data_config=config,
            data_param=cluster_settings,
        )
        cluster_model.fit(data_input=train_inputs[input_columns])

        logging.info(separator)
        logging.info("Building predictions:")
        # TODO: build predictions

        logging.info(separator)
        logging.info("Training Results")
        # TODO: evaluate on training data

        logging.info(separator)
        logging.info("Test Results")
        # TODO: evaluate on test data

        # Saving model
        logging.info(separator)
        logging.info("Saving Results:")

        # Register tracking info
        if environment.param.tracking:
            environment.publish_results(history=processor.history)
            environment.tracking.log_artifacts_folder(
                local_dir=environment.run_folder)

        # Script performance
        environment.close_script()
Пример #6
0
    def run(self):
        """Fit a text-embedding model on the prepared corpus and save it.

        Reads the parameters file in ``self.parameters_file``, builds the
        environment and data-processing objects from its
        ``"environment_parameters"`` and ``"txt2vec_parameters"`` sections,
        prepares the corpus, fits the model returned by
        ``self.model_selection`` on the first corpus column and calls
        ``save_model``. Evaluation of the embedding is not implemented yet.
        """
        separator = "======================================================================"

        # Environment variables
        load_dotenv(find_dotenv())
        warnings_action = os.getenv("PYTHON_WARNINGS")

        # Script setup: load the parameters file
        config = Util.load_parameters_from_file(path_file=self.parameters_file)

        # Validate parameters and build the environment object
        environment_settings = EnvironmentParameters(
            **config.get("environment_parameters"))
        environment = Environment(param=environment_settings)

        # Validate parameters and build the data-processing object
        txt2vec_settings = Txt2VecParameters(**config.get("txt2vec_parameters"))
        processor = DataProcessing(param=txt2vec_settings)

        # Set up the environment (logging, folders, tracking)
        environment.init_script(
            script_name=os.path.basename(__file__),
            warnings_level=warnings_action,
        )

        # Load data; the second element of the returned pair is not used here.
        logging.info(separator)
        logging.info("Loading Training and Test Data:")
        corpus_frame, _ = processor.load_dataset()

        logging.info(separator)
        logging.info("Preprocessing Training Data:")
        corpus_frame, corpus_columns = processor.prepare_corpus_data(
            data=corpus_frame)

        logging.info(separator)
        logging.info("Building Model:")

        # Select the model technology, then build the model
        embedding_model = self.model_selection(
            data_config=config,
            data_param=txt2vec_settings,
            environment=environment,
        )
        embedding_model.fit(
            dataframe=corpus_frame,
            corpus_col=corpus_columns[0],
        )

        logging.info(separator)
        logging.info("Training Results")

        # TODO: add an embedding evaluation (scores, tracking publication and
        # evaluation plots). Until then only the plot destinations are logged.
        if environment_settings.view_plots or environment_settings.save_plots:
            logging.info(separator)
            logging.info("Plotting training result graphs")

            if environment_settings.save_plots:
                logging.info("Plots will save in " + environment.run_folder)

            if environment_settings.view_plots:
                logging.info("Plots will view in window popup")

        # Saving model
        logging.info(separator)
        logging.info("Saving Results:")
        embedding_model.save_model()

        # Register tracking info
        if environment.param.tracking:
            environment.publish_results(history=processor.history)
            environment.tracking.log_artifacts_folder(
                local_dir=environment.run_folder)

        # Script performance
        environment.close_script()
Пример #7
0
    def run(self):
        """Split the loaded data into train and test subsets, analyse and save them.

        Reads the parameters file in ``self.parameters_file``, builds the
        environment and data-processing objects from its
        ``"environment_parameters"`` and ``"dataset_parameters"`` sections,
        builds the two subsets with ``build_dataset``, runs the descriptive
        analysis on each and saves both into the run folder.
        """
        separator = "======================================================================"

        # Environment variables
        load_dotenv(find_dotenv())
        warnings_action = os.getenv("PYTHON_WARNINGS")

        # Script setup: load the parameters file
        config = Util.load_parameters_from_file(path_file=self.parameters_file)

        # Validate parameters and build the environment object
        environment_settings = EnvironmentParameters(
            **config.get("environment_parameters"))
        environment = Environment(param=environment_settings)

        # Validate parameters and build the data-processing object
        dataset_settings = Dataprep2DatasetParameters(
            **config.get("dataset_parameters"))
        processor = DataProcessing(param=dataset_settings)

        # Set up the environment (logging, folders, tracking)
        environment.init_script(
            script_name=os.path.basename(__file__),
            warnings_level=warnings_action,
        )

        # Load data
        logging.info(separator)
        logging.info("Loading Raw Data:")
        full_data = processor.load_data()

        logging.info(separator)
        logging.info("Split train and test data subsets:")
        train_subset, test_subset = processor.build_dataset(data=full_data)

        # Analysis of data subsets: same call for both, training subset first.
        analysis_targets = (
            ("Descritive Analysis - Training Data:", train_subset),
            ("Descritive Analysis - Test Data:", test_subset),
        )
        for heading, subset in analysis_targets:
            logging.info(separator)
            logging.info(heading)
            processor.descriptive_analysis(
                data=subset,
                view_plots=environment.param.view_plots,
                save_plots=environment.param.save_plots,
                save_analysis=False,
                folder_path=environment.run_folder,
                prefix=environment.prefix_name,
            )

        # Saving dataset
        logging.info(separator)
        logging.info("Saving Datasets:")
        processor.save_datasets(
            data_train=train_subset,
            data_test=test_subset,
            folder_path=environment.run_folder,
            prefix=environment.prefix_name,
        )

        # Register tracking info
        if environment.param.tracking:
            environment.publish_results(history=processor.history)

        # Script performance
        environment.close_script()
Пример #8
0
    def run(self):
        """Preprocess the raw data, analyse it and save the result.

        Reads the parameters file in ``self.parameters_file``, builds the
        environment and data-processing objects from its
        ``"environment_parameters"`` and ``"prep_data_parameters"`` sections,
        runs ``prep_rawdata`` on the loaded data, runs the descriptive
        analysis (with ``save_analysis=True``) and saves the preprocessed
        data into the run folder.
        """
        separator = "======================================================================"

        # Environment variables
        load_dotenv(find_dotenv())
        warnings_action = os.getenv("PYTHON_WARNINGS")

        # Script setup: load the parameters file
        config = Util.load_parameters_from_file(path_file=self.parameters_file)

        # Validate parameters and build the environment object
        environment_settings = EnvironmentParameters(
            **config.get("environment_parameters"))
        environment = Environment(param=environment_settings)

        # Validate parameters and build the data-processing object
        prep_settings = Data2DataprepParameters(
            **config.get("prep_data_parameters"))
        processor = DataProcessing(param=prep_settings)

        # Set up the environment (logging, folders, tracking)
        environment.init_script(
            script_name=os.path.basename(__file__),
            warnings_level=warnings_action,
        )

        # Load data
        logging.info(separator)
        logging.info('Loading Raw Data:')
        raw_data = processor.load_data()

        logging.info(separator)
        logging.info('Preprocessing Raw Data:')
        prepared_data = processor.prep_rawdata(data=raw_data)

        # Analysis of the preprocessed data
        logging.info(separator)
        logging.info('Descritive Analysis:')
        processor.descriptive_analysis(
            data=prepared_data,
            view_plots=environment.param.view_plots,
            save_plots=environment.param.save_plots,
            save_analysis=True,
            folder_path=environment.run_folder,
            prefix=environment.prefix_name,
        )

        # Saving data
        logging.info(separator)
        logging.info('Saving preprocessed data:')
        processor.save_dataframe(
            data=prepared_data,
            folder_path=environment.run_folder,
            prefix=environment.prefix_name,
        )

        # Register tracking info
        if environment.param.tracking:
            environment.publish_results(history=processor.history)

        # Script performance
        environment.close_script()