def print_classification_report(y_true, y_predict, features, label):
    """
    Log a classification report, one log line per report entry

    Args:
        y_true: actual labels
        y_predict: predicted labels
        features (list): class names, handed to the report as target_names
        label (string): prefix of every log line, e.g. training or evaluation

    Returns:
        None
    """
    report = classification_report(y_true, y_predict, output_dict=True, target_names=features)
    for entry_name, entry in report.items():
        if not isinstance(entry, dict):
            # Scalar entry: logged directly with two decimals
            _logger.info(f'{label} - {entry_name}: {entry:.2f}')
            continue
        # Dict entry: flatten into "name value " pairs (trailing space kept)
        metric_string = ''.join(f'{metric} {score:.2f} ' for metric, score in entry.items())
        _logger.info(f'{label} - {entry_name}: {metric_string}')
def __enter__(self):
    """
    Open the DB connection using host, port, dbname and user from self.db_config

    The port value is converted with int() before being passed on.

    Returns:
        the object returned by pg.connect, which is also stored on self.connection
    """
    _logger.info("Creating DB connection...")
    config = self.db_config
    connect_kwargs = {
        "host": config.get("host"),
        "port": int(config.get("port")),
        "dbname": config.get("dbname"),
        "user": config.get("user"),
    }
    self.connection = pg.connect(**connect_kwargs)
    _logger.info("Connection created!")
    return self.connection
def load_or_create_train_holdout_set(self, df):
    """
    Split df into a training set and a fixed holdout set for model evaluation

    Rows whose COMPLAINT_ID is among the ids returned by
    self.load_or_get_holdout_id(df) form the holdout set; every other row
    goes to the training set.

    Args:
        df: pd.DataFrame

    Returns:
        pd.DataFrame: tr_df, the rows that are not in the holdout set
        pd.DataFrame: te_df, the rows that are in the holdout set
    """
    _logger.info('Loading of holdout set')
    holdout_complaint_id = self.load_or_get_holdout_id(df)
    # Build the membership mask once and reuse it for both halves of the split
    in_holdout = df[COMPLAINT_ID].isin(holdout_complaint_id.tolist())
    tr_df = df[~in_holdout]
    te_df = df[in_holdout]
    return tr_df, te_df
def evaluate_xgb(self, bst, x, y, label_class, description):
    """
    Predict on x with the booster and log a classification report against y

    Args:
        bst (xgb.Booster): booster used for the prediction
        x: feature data to predict on
        y: true labels for x
        label_class: class names forwarded to print_classification_report
        description (string): tag forwarded to print_classification_report

    Returns:
        the bst argument that was passed in
    """
    _logger.info('Start Evaluation')
    predicted = bst.predict(xgb.DMatrix(x, label=y))
    print_classification_report(y, predicted, label_class, description)
    return bst
def prepare_features_for_predicting(self, df):
    """
    Prepare numeric features for prediction

    Args:
        df: input pd.DataFrame

    Returns:
        pd.DataFrame: the result of self.generate_numeric_features(df)

    Raises:
        Exception: any error raised while generating the features is logged
            and re-raised unchanged
    """
    _logger.info('Preparing Model Features for predicting')
    try:
        df = self.generate_numeric_features(df)
    except Exception as e:
        # The exception needs a %s placeholder: an extra positional argument
        # without one makes the logging call fail to format the record.
        _logger.error("Unable to prepare model features for predicting: %s", e)
        raise
    return df
def prepare_features_for_training(self, df, drop_invalid_rows=True):
    """
    Prepare numeric features, add label features and apply the drop/valid step

    Runs, in order, generate_numeric_features, generate_label_features and
    generate_drop_valid_features on the DataFrame.

    Args:
        df (pd.DataFrame): input DataFrame
        drop_invalid_rows (bool): forwarded to generate_drop_valid_features

    Returns:
        df (pd.DataFrame): output DataFrame

    Raises:
        Exception: any error raised by one of the steps is logged and
            re-raised unchanged
    """
    _logger.info('Preparing Model Features for training')
    try:
        df = self.generate_numeric_features(df)
        df = self.generate_label_features(df)
        df = self.generate_drop_valid_features(df, drop_invalid_rows=drop_invalid_rows)
    except Exception as e:
        # The exception needs a %s placeholder: an extra positional argument
        # without one makes the logging call fail to format the record.
        _logger.error("Unable to prepare model features for training: %s", e)
        raise
    return df
def train(self):
    """
    Run the full training flow: load, preprocess, fit transformers, train, evaluate, save

    Steps:
        1. load the data via ETL.load_query(SQL_QUERY_STRING)
        2. preprocess and split into training and holdout sets
        3. fit the label and input pipelines on the training set and
           transform both sets
        4. save both fitted pipelines
        5. train the xgb model, log reports for both sets, save the model

    Raises:
        Exception: any error in the flow is logged with its traceback and
            re-raised unchanged
    """
    try:
        # Load Data
        etl = ETL(env.DB_FILE, env.SCHEMA_FILE)
        complaints_users = etl.load_query(SQL_QUERY_STRING)

        # Preprocess
        df = self.df_preprocessor.prepare_features_for_training(complaints_users, drop_invalid_rows=True)
        train_df, holdout_df = self.load_or_create_train_holdout_set(df)

        # Fit and transform Label (fit on the training set only)
        _logger.info('Preparing Label Fitting and Transformation')
        train_y = self.label_pipeline.fit_transform(train_df[LABEL])
        holdout_y = self.label_pipeline.transform(holdout_df[LABEL])
        label_class = self.label_pipeline.classes_
        num_class = len(label_class)

        # Fit and Transform Text Features (fit on the training set only)
        _logger.info('Preparing Text Features Fitting and Transformation')
        feature_columns = BASE_TRAINING_FEATURES + [COMPLAINT_TEXT]
        train_x = self.input_pipeline.fit_transform(train_df[feature_columns])
        holdout_x = self.input_pipeline.transform(holdout_df[feature_columns])

        # Saving transformer
        transformer_utils.save_transformer(self.label_pipeline, self.label_transformer_save_path)
        transformer_utils.save_transformer(self.input_pipeline, self.feature_transformer_model_save_path)

        # Start Training
        bst = self.train_xgb(train_x, train_y, holdout_x, holdout_y, num_class)
        self.evaluate_xgb(bst, train_x, train_y, label_class, 'Training')
        self.evaluate_xgb(bst, holdout_x, holdout_y, label_class, 'Evaluation')
        save_xgb(bst, self.xgb_model_save_path)
        _logger.info('Training of xgb model completed')
    except Exception:
        # Passing the exception as a bare extra argument breaks the log
        # formatting; exception() records the message plus the traceback.
        _logger.exception('Unknown exception occurs during training')
        raise
def train_xgb(self, train_x, train_y, holdout_x, holdout_y, num_class):
    """
    Training of xgboost model

    Trains for self.num_rounds rounds with the parameters returned by
    self.get_xgb_parameters(num_class), using both the training and the
    holdout data as evaluation sets named 'train' and 'eval'.

    Args:
        train_x: input data x
        train_y: input data y
        holdout_x: holdout data x
        holdout_y: holdout data y
        num_class: number of input class

    Returns:
        xgb.Booster: bst
    """
    _logger.info('Start Training')
    _logger.info(f'Training Size {train_x.shape[0]}')
    _logger.info(f'Holdout Size {holdout_x.shape[0]}')
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dholdout = xgb.DMatrix(holdout_x, label=holdout_y)
    xgb_params = self.get_xgb_parameters(num_class)
    # Keyword arguments: newer XGBoost releases deprecate passing evals
    # positionally, and keywords work on older releases too.
    bst = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=self.num_rounds,
        evals=[(dtrain, 'train'), (dholdout, 'eval')],
    )
    return bst
def __exit__(self, type, value, traceback):
    """
    Close the DB connection stored on self.connection

    Args:
        type: exception class passed by the with statement, unused
        value: exception instance passed by the with statement, unused
        traceback: traceback passed by the with statement, unused

    Returns:
        None, so an exception raised inside the with-block is not suppressed
    """
    _logger.info("Closing the DB connection!")
    connection = self.connection
    connection.close()