示例#1
0
def print_classification_report(y_true, y_predict, features, label):
    """
    Log every entry of the report returned by ``classification_report``.

    Args:
        y_true (np.array): actual labels
        y_predict (np.array): predicted labels
        features (list): class names, forwarded as ``target_names``
        label (string): prefix put in front of each log line, e.g. to mark
            training or evaluation

    Returns:
        None
    """
    report = classification_report(y_true,
                                   y_predict,
                                   target_names=features,
                                   output_dict=True)
    for entry_name, entry in report.items():
        if not isinstance(entry, dict):
            # Scalar entry: log the single value with two decimals.
            _logger.info(f'{label} - {entry_name}: {entry:.2f}')
            continue
        # Dict entry: flatten into "<metric> <value> " pairs on one line.
        rendered = ''.join(f'{metric} {score:.2f} '
                           for metric, score in entry.items())
        _logger.info(f'{label} - {entry_name}: {rendered}')
示例#2
0
文件: db.py 项目: infinex/projects
 def __enter__(self):
     """
     Open the database connection when entering the ``with`` block.

     Reads ``host``, ``port``, ``dbname`` and ``user`` from
     ``self.db_config``; the port is converted with ``int``.

     Returns:
         the object returned by ``pg.connect``, which is also stored on
         ``self.connection``
     """
     _logger.info("Creating DB connection...")
     settings = self.db_config
     connect_kwargs = {
         "host": settings.get("host"),
         "port": int(settings.get("port")),
         "dbname": settings.get("dbname"),
         "user": settings.get("user"),
     }
     self.connection = pg.connect(**connect_kwargs)
     _logger.info("Connection created!")
     return self.connection
示例#3
0
文件: main.py 项目: infinex/projects
    def load_or_create_train_holdout_set(self, df):
        """
        Split ``df`` into a training set and a fixed holdout set for model
        evaluation.

        Rows whose ``COMPLAINT_ID`` is among the ids returned by
        ``self.load_or_get_holdout_id(df)`` form the holdout set; all other
        rows form the training set.

        Args:
            df: pd.DataFrame

        Returns:
            pd.DataFrame: tr_df, the rows that are not in the holdout set
            pd.DataFrame: te_df, the rows that are in the holdout set
        """
        _logger.info('Loading of holdout set')
        holdout_complaint_id = self.load_or_get_holdout_id(df)
        # Build the membership mask once and reuse it for both halves, so the
        # id list and the lookup are not computed twice.
        is_holdout = df[COMPLAINT_ID].isin(holdout_complaint_id.tolist())
        tr_df = df[~is_holdout]
        te_df = df[is_holdout]

        return tr_df, te_df
示例#4
0
文件: main.py 项目: infinex/projects
    def evaluate_xgb(self, bst, x, y, label_class, description):
        """
        Model Evaluation Step

        Predicts on ``x`` with the booster and logs a classification report
        of the predictions against ``y``.

        Args:
            bst (xgb.Booster): xgb booster
            x (np.array): features to predict on
            y (np.array): true labels for ``x``
            label_class: label for the class
            description (string): Training or Evaluation, used as the prefix
                of the logged report

        Returns:
            xgb.Booster: the same ``bst`` that was passed in
        """
        _logger.info('Start Evaluation')
        predicted = bst.predict(xgb.DMatrix(x, label=y))
        print_classification_report(y, predicted, label_class, description)
        return bst
示例#5
0
    def prepare_features_for_predicting(self, df):
        """
        Prepare numeric features for prediction.

        Args:
            df: input pd.DataFrame

        Returns:
            pd.DataFrame: the result of ``self.generate_numeric_features(df)``

        Raises:
            Exception: any error raised while generating the features is
                logged and re-raised unchanged.
        """
        _logger.info('Preparing Model Features for predicting')
        try:
            return self.generate_numeric_features(df)
        except Exception:
            # The exception must not be passed as a bare positional argument:
            # logging would try to %-format it into a message that has no
            # placeholder and fail. ``exception`` logs at ERROR level and
            # attaches the traceback.
            _logger.exception(
                "Unable to prepare model features for predicting")
            raise
示例#6
0
    def prepare_features_for_training(self, df, drop_invalid_rows=True):
        """
        Prepare numeric nlp features, add label features and add drop flags

        Runs ``generate_numeric_features``, ``generate_label_features`` and
        ``generate_drop_valid_features`` in that order, each on the output of
        the previous step.

        Args:
            df (pd.DataFrame):  input DataFrame
            drop_invalid_rows (bool): flag forwarded to
                ``generate_drop_valid_features`` to indicate whether to
                remove invalid rows

        Returns:
            df (pd.DataFrame):  output DataFrame

        Raises:
            Exception: any error raised by one of the steps is logged and
                re-raised unchanged.
        """

        _logger.info('Preparing Model Features for training')
        try:
            df = self.generate_numeric_features(df)
            df = self.generate_label_features(df)
            df = self.generate_drop_valid_features(
                df, drop_invalid_rows=drop_invalid_rows)
        except Exception:
            # The exception must not be passed as a bare positional argument:
            # logging would try to %-format it into a message that has no
            # placeholder and fail. ``exception`` logs at ERROR level and
            # attaches the traceback.
            _logger.exception("Unable to prepare model features for training")
            raise
        return df
示例#7
0
文件: main.py 项目: infinex/projects
    def train(self):
        """
        The start of the training function.

        Loads the data through ``ETL``, prepares the features, splits off the
        holdout set, fits and saves the label and input transformers, trains
        the xgb model, logs evaluation reports for the training and holdout
        sets, and saves the model.

        Raises:
            Exception: any error raised by one of the steps is logged and
                re-raised unchanged.
        """
        try:
            # Load Data
            etl = ETL(env.DB_FILE, env.SCHEMA_FILE)
            complaints_users = etl.load_query(SQL_QUERY_STRING)

            # Preprocess
            df = self.df_preprocessor.prepare_features_for_training(complaints_users, drop_invalid_rows=True)
            train_df, holdout_df = self.load_or_create_train_holdout_set(df)

            # Fit and transform Label (fitted on the training split only)
            _logger.info('Preparing Label Fitting and Transformation')
            train_y = self.label_pipeline.fit_transform(train_df[LABEL])
            holdout_y = self.label_pipeline.transform(holdout_df[LABEL])
            label_class = self.label_pipeline.classes_
            num_class = len(label_class)

            # Fit and Transform Text Features (fitted on the training split only)
            _logger.info('Preparing Text Features Fitting and Transformation')
            feature_columns = BASE_TRAINING_FEATURES + [COMPLAINT_TEXT]
            train_x = self.input_pipeline.fit_transform(train_df[feature_columns])
            holdout_x = self.input_pipeline.transform(holdout_df[feature_columns])

            # Saving transformer
            transformer_utils.save_transformer(self.label_pipeline, self.label_transformer_save_path)
            transformer_utils.save_transformer(self.input_pipeline, self.feature_transformer_model_save_path)

            # Start Training
            bst = self.train_xgb(train_x, train_y, holdout_x, holdout_y, num_class)
            self.evaluate_xgb(bst, train_x, train_y, label_class, 'Training')
            self.evaluate_xgb(bst, holdout_x, holdout_y, label_class, 'Evaluation')
            save_xgb(bst, self.xgb_model_save_path)

            _logger.info('Training of xgb model completed')
        except Exception:
            # The exception must not be passed as a bare positional argument:
            # logging would try to %-format it into a message that has no
            # placeholder and fail. ``exception`` logs at ERROR level and
            # attaches the traceback.
            _logger.exception('Unknown exception occurs during training')
            raise
示例#8
0
文件: main.py 项目: infinex/projects
    def train_xgb(self, train_x, train_y, holdout_x, holdout_y, num_class):
        """
        Training of xgboost model

        Trains for ``self.num_rounds`` boosting rounds with the parameters
        from ``self.get_xgb_parameters(num_class)``, reporting metrics on both
        the training set ('train') and the holdout set ('eval').

        Args:
            train_x: input data x
            train_y: input data y
            holdout_x: holdout data x
            holdout_y: holdout data y
            num_class: number of input class

        Returns:
            xgb.Booster: bst
        """
        _logger.info('Start Training')
        _logger.info(f'Training Size {train_x.shape[0]}')
        _logger.info(f'Holdout Size {holdout_x.shape[0]}')
        dtrain = xgb.DMatrix(train_x, label=train_y)
        dholdout = xgb.DMatrix(holdout_x, label=holdout_y)
        xgb_params = self.get_xgb_parameters(num_class)
        # Pass the optional arguments by keyword: newer xgboost releases
        # deprecate positional use of ``evals``.
        bst = xgb.train(xgb_params,
                        dtrain,
                        num_boost_round=self.num_rounds,
                        evals=[(dtrain, 'train'), (dholdout, 'eval')])

        return bst
示例#9
0
文件: db.py 项目: infinex/projects
 def __exit__(self, type, value, traceback):
     """
     Close the database connection when leaving the ``with`` block.

     The exception details are not inspected and nothing is returned, so an
     exception raised inside the block propagates to the caller.
     """
     _logger.info("Closing the DB connection!")
     connection = self.connection
     connection.close()