import gc
from functools import partial
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

# LOAN_LOGGER, log_and_stop and AVAILABLE_MODELS are assumed to be defined
# elsewhere in this project: the logger, a helper that logs the message and
# aborts, and a registry mapping the names 'LGBM', 'XGB' and 'DecisionTree'
# to their classifier classes.


def run_cv(model, X, y, folds=3, cv_type=StratifiedKFold,
           success_metric=roc_auc_score) -> Tuple:
    """
    Run the specified cross-validation on the given model using the given
    X, y.
    Returns a tuple where:
    - the first item is the mean CV score
    - the second item is the std of the CV scores
    """
    try:
        cv = cv_type(n_splits=folds, shuffle=True)
        scores = []
        for train_idx, test_idx in cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_test)
            score = success_metric(y_test, y_pred[:, 1])
            scores.append(score)
        score_mean, score_std = np.mean(scores), np.std(scores)
        LOAN_LOGGER.info('CV on model completed')
        return score_mean, score_std
    except Exception:
        message = 'CV on model NOT completed'
        log_and_stop(LOAN_LOGGER, message)
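# A minimal usage sketch for run_cv, assuming a scikit-learn style
# classifier with predict_proba; DecisionTreeClassifier here is purely
# illustrative, not the only model the helper supports.
def _example_run_cv(X, y):
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier(max_depth=5, random_state=2020)
    # 5-fold stratified CV scored with the default ROC AUC metric
    cv_mean, cv_std = run_cv(model, X, y, folds=5)
    return cv_mean, cv_std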
def _run_boosting_hyperopt(selected_model, X_train, y_train, X_test, y_test,
                           max_evals):
    """
    Run Hyperopt for the LGBM or XGB models. The models are trained and
    tested on the given X and y. The score metric is ROC_AUC.
    tpe.suggest has been modified so that only the first 3 trials are random,
    instead of the default 20.
    The function returns a tuple where:
    - the first item is a dictionary returned by the fmin function
    - the second item is the trials variable used in hyperopt
    """
    def objective(space):
        model_params = {
            'colsample_bytree': space['colsample_bytree'],
            'learning_rate': space['learning_rate'],
            'max_depth': int(space['max_depth']),
            'min_child_weight': int(space['min_child_weight']),
            'n_estimators': int(space['n_estimators']),
            'reg_alpha': space['reg_alpha'],
            'reg_lambda': space['reg_lambda'],
            'subsample': space['subsample'],
            'num_leaves': 20,
            'random_state': 2020,
            'importance_type': 'gain',
            'n_jobs': -1
        }
        model = selected_model(**model_params)
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)
        # ROC AUC is negated because fmin minimizes the loss
        score = -roc_auc_score(y_test, y_pred[:, 1])
        return {'loss': score, 'status': STATUS_OK}

    try:
        space = {
            'max_depth': hp.quniform('ho_max_depth', 5, 20, 1),
            'colsample_bytree': hp.uniform('ho_colsample_bytree', 0.8, 1.),
            'learning_rate': hp.uniform('ho_learning_rate', 0.05, 0.2),
            'subsample': hp.uniform('ho_subsample', 0.7, 1.),
            'min_child_weight': hp.quniform('ho_min_child_weight', 1, 10, 1),
            'reg_alpha': hp.loguniform('ho_reg_alpha', 0., 1.),
            'reg_lambda': hp.uniform('ho_reg_lambda', 0.7, 1.),
            'n_estimators': hp.quniform('ho_n_estimators', 50, 500, 5)
        }
        trials = Trials()
        best_params = fmin(fn=objective,
                           space=space,
                           algo=partial(tpe.suggest, n_startup_jobs=3),
                           max_evals=max_evals,
                           trials=trials)
        LOAN_LOGGER.info('Boosting Hyperopt finished successfully')
        return best_params, trials
    except Exception:
        message = 'Boosting Hyperopt NOT finished successfully'
        log_and_stop(LOAN_LOGGER, message)
def drop_rows_with_nans(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Drop rows which contain NaN in one of the 3 columns: `annual_inc`,
    `earliest_cr_line`, `pub_rec_bankruptcies`.
    Function returns a new dataset with the NaN rows removed.
    """
    try:
        df = dataset.copy()
        # the docstring names three columns, so all three go into the subset
        df = df.dropna(
            subset=['annual_inc', 'earliest_cr_line', 'pub_rec_bankruptcies'])
        LOAN_LOGGER.info('Rows containing NaNs removed')
        return df
    except Exception:
        message = 'Rows containing NaNs NOT removed'
        log_and_stop(LOAN_LOGGER, message)
def create_log_features(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the logarithm of the skewed column annual_inc.
    Function returns a dataset with the new feature.
    """
    try:
        df = dataset.copy()
        df['annual_inc_log'] = np.log1p(df['annual_inc'].values)
        LOAN_LOGGER.info('Logarithm features created')
        return df
    except Exception:
        message = 'Logarithm features NOT created'
        log_and_stop(LOAN_LOGGER, message)
def extract_number_from_text(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Extract the number of months from the term column and the digit from
    the sub_grade column.
    Function returns a new dataset with new features.
    """
    try:
        df = dataset.copy()
        df['term_month'] = df['term'].map(lambda x: int(x.strip()[:2]))
        df['sub_grade_digit'] = df['sub_grade'].map(lambda x: int(x[1]))
        LOAN_LOGGER.info('Number values extracted from text columns')
        return df
    except Exception:
        message = 'Number values NOT extracted from text columns'
        log_and_stop(LOAN_LOGGER, message)
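# A minimal sketch of the text formats extract_number_from_text assumes:
# `term` values like ' 36 months' and `sub_grade` values like 'B3'.
# The toy DataFrame below is illustrative only, not project data.
def _example_extract_number_from_text():
    toy = pd.DataFrame({'term': [' 36 months', ' 60 months'],
                        'sub_grade': ['B3', 'C1']})
    out = extract_number_from_text(toy)
    # out['term_month'] -> [36, 60]; out['sub_grade_digit'] -> [3, 1]
    return out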
def factorize_categorical_features(dataset: pd.DataFrame,
                                   factorized_dict: Dict) -> pd.DataFrame:
    """
    Change all categorical variables into numerical variables based on the
    given factorized dictionary.
    Function returns a new dataset where factorized columns end with `_cat`.
    """
    try:
        df = dataset.copy()
        for cat_feat in factorized_dict.keys():
            df['{}_cat'.format(cat_feat)] = df[cat_feat].map(
                lambda x: factorized_dict[cat_feat].get_loc(x))
        LOAN_LOGGER.info('Categorical features factorized')
        return df
    except Exception:
        message = 'Categorical features NOT factorized'
        log_and_stop(LOAN_LOGGER, message)
def create_factorizing_dict(dataset: pd.DataFrame,
                            cat_feats: List[str]) -> Dict:
    """
    Create a dictionary which contains all label-number relations for the
    given categorical variables.
    Function returns the created dictionary.
    """
    try:
        df = dataset.copy()
        factorized_dict = {}
        for cat_feat in cat_feats:
            # pd.factorize returns (codes, uniques); keep the uniques Index
            # so labels can later be mapped to positions with get_loc
            factorized_dict[cat_feat] = pd.factorize(df[cat_feat])[1]
        LOAN_LOGGER.info('Factorized dictionary created')
        return factorized_dict
    except Exception:
        message = 'Factorized dictionary NOT created'
        log_and_stop(LOAN_LOGGER, message)
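# A hedged sketch of how the two factorizing helpers compose; the column
# names ('grade', 'home_ownership') are illustrative assumptions, not a
# fixed list from this project.
def _example_factorize(df: pd.DataFrame) -> pd.DataFrame:
    cat_feats = ['grade', 'home_ownership']
    factorized_dict = create_factorizing_dict(df, cat_feats)
    # adds 'grade_cat' and 'home_ownership_cat' integer columns
    return factorize_categorical_features(df, factorized_dict)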
def optimize_dtypes(dataset: pd.DataFrame, dtype_cols) -> pd.DataFrame:
    """
    Optimize the dtype for the given set of columns in order to use less
    memory. Additionally, garbage collection is run.
    Function returns a new dataset with new dtypes.
    """
    try:
        df = dataset.copy()
        for key in dtype_cols.keys():
            # plain column assignment keeps the downcast dtype, which a
            # .loc-based assignment may silently discard
            df[key] = df[key].astype(dtype_cols[key])
        gc.collect()
        LOAN_LOGGER.info('dtypes optimized for given columns')
        return df
    except Exception:
        message = 'dtypes NOT optimized for given columns'
        log_and_stop(LOAN_LOGGER, message)
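# optimize_dtypes leaves dtype_cols untyped; it is expected to be a
# column-name -> dtype mapping. The mapping below is an illustrative
# assumption, not the project's canonical configuration.
def _example_optimize_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    dtype_cols = {'term_month': np.int8,
                  'sub_grade_digit': np.int8,
                  'issue_d_year': np.int16}
    return optimize_dtypes(df, dtype_cols)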
def _run_tree_hyperopt(selected_model, X_train, y_train, X_test, y_test,
                       max_evals):
    """
    Run Hyperopt for the DecisionTree model. The model is trained and tested
    on the given X and y. The score metric is ROC_AUC.
    tpe.suggest has been modified so that only the first 3 trials are random,
    instead of the default 20.
    The function returns a tuple where:
    - the first item is a dictionary returned by the fmin function
    - the second item is the trials variable used in hyperopt
    """
    def objective(space):
        model_params = {
            'max_depth': int(space['max_depth']),
            'min_samples_split': int(space['min_samples_split']),
            'min_samples_leaf': int(space['min_samples_leaf']),
            'random_state': 2020
        }
        model = selected_model(**model_params)
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)
        # ROC AUC is negated because fmin minimizes the loss
        score = -roc_auc_score(y_test, y_pred[:, 1])
        return {'loss': score, 'status': STATUS_OK}

    try:
        space = {
            'max_depth': hp.quniform('ho_max_depth', 5, 20, 1),
            'min_samples_split': hp.quniform('ho_min_samples_split', 2, 10, 1),
            'min_samples_leaf': hp.quniform('ho_min_samples_leaf', 1, 10, 1),
        }
        trials = Trials()
        best_params = fmin(fn=objective,
                           space=space,
                           algo=partial(tpe.suggest, n_startup_jobs=3),
                           max_evals=max_evals,
                           trials=trials)
        LOAN_LOGGER.info('Tree Hyperopt finished successfully')
        return best_params, trials
    except Exception:
        message = 'Tree Hyperopt NOT finished successfully'
        log_and_stop(LOAN_LOGGER, message)
def create_date_features(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Create features based on the datetime columns: issue_d, earliest_cr_line.
    Function returns a new dataset with new features.
    """
    try:
        df = dataset.copy()
        df['issue_d_month'] = df['issue_d'].dt.month
        df['issue_d_year'] = df['issue_d'].dt.year
        df['earliest_cr_line_month'] = df['earliest_cr_line'].dt.month
        df['earliest_cr_line_year'] = df['earliest_cr_line'].dt.year
        df['days_between_earliest_cr_and_issue'] = (
            df['issue_d'] - df['earliest_cr_line']).dt.days
        LOAN_LOGGER.info('Date features created')
        return df
    except Exception:
        message = 'Date features NOT created'
        log_and_stop(LOAN_LOGGER, message)
def create_target_variable(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Create the column bad_loan which is True if loan_status has any of the
    following values: `Charged Off`, `Late (31-120 days)`,
    `Late (16-30 days)`, `Does not meet the credit policy. Status:Charged
    Off`, `Default`.
    Function returns a new dataset with the target variable.
    """
    try:
        df = dataset.copy()
        bad_status = [
            'Charged Off', 'Late (31-120 days)', 'Late (16-30 days)',
            'Does not meet the credit policy. Status:Charged Off', 'Default'
        ]
        df['bad_loan'] = df['loan_status'].isin(bad_status)
        LOAN_LOGGER.info('Target Variable created')
        return df
    except Exception:
        message = 'Target Variable NOT created'
        log_and_stop(LOAN_LOGGER, message)
def prepare_train_test_sets(features, target) -> Tuple:
    """
    Change the DataFrame and Series into numpy arrays and divide the data
    into a training and test set. The test set will consist of 20% of the
    observations.
    Function returns X and y divided into training and test sets.
    """
    try:
        # np.float has been removed from NumPy; use the explicit np.float64
        X = np.array(features, dtype=np.float64)
        y = np.array(target, dtype=np.float64)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.2,
                                                            random_state=2020,
                                                            stratify=y)
        LOAN_LOGGER.info('Train and test sets prepared')
        return X_train, X_test, y_train, y_test
    except Exception:
        message = 'Train and test sets NOT prepared'
        log_and_stop(LOAN_LOGGER, message)
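# A hedged sketch of splitting the engineered dataset; the feature list is
# an illustrative assumption, not the project's final feature set.
def _example_prepare_sets(df: pd.DataFrame):
    feats = ['annual_inc_log', 'term_month', 'sub_grade_digit']
    return prepare_train_test_sets(df[feats], df['bad_loan'])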
def train_and_test_model(model, X_train, y_train, X_test, y_test,
                         success_metric=roc_auc_score):
    """
    Train the given model on the given training set and then test it on the
    given test set.
    Returns a tuple where:
    - the first item is the score achieved on the test set
    - the second item is the predicted probabilities
    """
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)
        score = success_metric(y_test, y_pred[:, 1])
        LOAN_LOGGER.info('Model trained on train set and tested on test set')
        return score, y_pred
    except Exception:
        message = 'Model NOT trained on train set and tested on test set'
        log_and_stop(LOAN_LOGGER, message)
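# Sketch of a single train/evaluate round with the default ROC AUC metric;
# the classifier choice is an assumption made only for illustration.
def _example_train_and_test(X_train, y_train, X_test, y_test):
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=2020)
    score, y_pred = train_and_test_model(model, X_train, y_train,
                                         X_test, y_test)
    return score, y_pred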
def read_csv_file(file: str, usecols: List[str]) -> pd.DataFrame:
    """
    Read the specified file containing loan data with the specified usecols.
    Columns issue_d and earliest_cr_line are parsed as datetime. Numerical
    columns have explicitly specified dtypes for memory optimization.
    Function returns the read DataFrame.
    """
    try:
        file_path = Path(__file__).parents[0].absolute() / 'data' / file
        read_file = pd.read_csv(file_path,
                                usecols=usecols,
                                parse_dates=['issue_d', 'earliest_cr_line'],
                                dtype={
                                    'int_rate': np.float16,
                                    'installment': np.float32,
                                    'annual_inc': np.float32
                                })
        LOAN_LOGGER.info('File read')
        return read_file
    except Exception:
        message = 'File NOT read'
        log_and_stop(LOAN_LOGGER, message)
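# Usage sketch for read_csv_file; the file name 'loan_data.csv' and the
# column list are illustrative assumptions. Note that usecols must include
# the parsed date columns issue_d and earliest_cr_line.
def _example_read_csv_file() -> pd.DataFrame:
    usecols = ['loan_status', 'annual_inc', 'term', 'sub_grade',
               'issue_d', 'earliest_cr_line', 'int_rate', 'installment']
    return read_csv_file('loan_data.csv', usecols=usecols)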
def get_model_params(selected_model, X_train, y_train, X_test, y_test,
                     hyperopt=False, max_evals=10):
    """
    Retrieve parameters for the selected model, either by using the hyperopt
    algorithm or by loading a given set of parameters. Hyperopt accepts only
    the XGB, LGBM or DecisionTree model. The given set of parameters was
    found using hyperopt.
    The function returns a tuple where:
    - the first item is a dictionary of model parameters ready to be used
      in a model
    - the second item is either the trials variable if hyperopt was done,
      or the string `No hyperopt done` otherwise
    """
    try:
        assert selected_model in AVAILABLE_MODELS, \
            'Allowed models are {}'.format(list(AVAILABLE_MODELS.keys()))
        if selected_model in ('LGBM', 'XGB'):
            model_type = 'Boosting'
        else:
            model_type = 'DecisionTree'
        LOAN_LOGGER.info('Correct model chosen for parameter retrieval')
    except Exception:
        message = 'Correct model NOT chosen for parameter retrieval'
        log_and_stop(LOAN_LOGGER, message)
    else:
        if hyperopt:
            if model_type == 'Boosting':
                best_params, trials = _run_boosting_hyperopt(
                    AVAILABLE_MODELS[selected_model], X_train, y_train,
                    X_test, y_test, max_evals)
                params_dict = {
                    'colsample_bytree': best_params['ho_colsample_bytree'],
                    'learning_rate': best_params['ho_learning_rate'],
                    'max_depth': int(best_params['ho_max_depth']),
                    'min_child_weight': int(best_params['ho_min_child_weight']),
                    'n_estimators': int(best_params['ho_n_estimators']),
                    'reg_alpha': best_params['ho_reg_alpha'],
                    'reg_lambda': best_params['ho_reg_lambda'],
                    'subsample': best_params['ho_subsample'],
                    'num_leaves': 20,
                    'random_state': 2020,
                    'importance_type': 'gain',
                    'n_jobs': -1
                }
            else:
                best_params, trials = _run_tree_hyperopt(
                    AVAILABLE_MODELS[selected_model], X_train, y_train,
                    X_test, y_test, max_evals)
                params_dict = {
                    'max_depth': int(best_params['ho_max_depth']),
                    'min_samples_leaf': int(best_params['ho_min_samples_leaf']),
                    'min_samples_split': int(best_params['ho_min_samples_split']),
                    'random_state': 2020
                }
            return params_dict, trials
        else:
            params_dict = {
                'Boosting': {
                    'colsample_bytree': 0.8899759555042142,
                    'learning_rate': 0.09532621848124778,
                    'max_depth': 11,
                    'min_child_weight': 4,
                    'n_estimators': 215,
                    'reg_alpha': 2.016992556501955,
                    'reg_lambda': 0.7643883757438669,
                    'subsample': 0.7651869713043127,
                    'num_leaves': 20,
                    'random_state': 2020,
                    'importance_type': 'gain',
                    'n_jobs': -1
                },
                'DecisionTree': {
                    'max_depth': 11,
                    'min_samples_leaf': 8,
                    'min_samples_split': 10,
                    'random_state': 2020
                },
            }
            return params_dict[model_type], 'No hyperopt done'
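# End-to-end sketch tying the helpers together: tune (or load) parameters
# with get_model_params, then fit and score. Using 'LGBM' as the selected
# model is an assumption about the keys in AVAILABLE_MODELS.
def _example_get_model_params(X_train, y_train, X_test, y_test):
    params, trials = get_model_params('LGBM', X_train, y_train,
                                      X_test, y_test,
                                      hyperopt=True, max_evals=10)
    model = AVAILABLE_MODELS['LGBM'](**params)
    return train_and_test_model(model, X_train, y_train, X_test, y_test)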