def main():
    """Train or score the 'kicked car' (IsBadBuy) XGBoost model.

    Command line flags:
        --train       when present, run the full training pipeline
                      (clean -> split -> preprocess -> tune -> checkpoint);
                      otherwise load the saved checkpoints and score.
        --inputfile   input data file name, looked up under ``data/``.
        --outputfile  prediction file name, written under ``output/``.
    """
    # -----------------------------------------------------------------------------------
    # Adjustable Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', action='store_true',
                        help='training or scoring')
    # required=True: without it a missing file name only surfaces later as an
    # opaque TypeError inside os.path.join(None)
    parser.add_argument('--inputfile', type=str, required=True,
                        help='input data file name')
    parser.add_argument('--outputfile', type=str, required=True,
                        help='output prediction file name')
    args = parser.parse_args()

    # directory for the input data and output prediction:
    DATA_DIR = 'data'
    OUTPUT_DIR = 'output'

    # columns used:
    CAT_COLS = [
        'Auction', 'Transmission', 'WheelType', 'Nationality',
        'Size', 'TopThreeAmericanName', 'IsOnlineSale']
    # NOTE: 'MMRAcquisitonRetailCleanPrice' is misspelled in the raw data's
    # header — keep the misspelling so the column matches.
    NUM_COLS = [
        'VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost',
        'MMRCurrentAuctionAveragePrice', 'MMRAcquisitionAuctionAveragePrice',
        'MMRCurrentAuctionCleanPrice', 'MMRAcquisitionAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice', 'MMRAcquisitionRetailAveragePrice',
        'MMRCurrentRetailCleanPrice', 'MMRAcquisitonRetailCleanPrice']
    DATE_COLS = ['PurchDate']
    LABEL_COL = 'IsBadBuy'
    IDS_COL = 'RefId'

    # current time for computing recency feature
    NOW = '2010-12-31'

    # modeling step:
    # model checkpoint for future scoring
    MODEL_DIR = 'model'
    CHECKPOINT_PREPROCESS = os.path.join(MODEL_DIR, 'preprocess.pkl')
    CHECKPOINT_XGB = os.path.join(MODEL_DIR, 'xgb.pkl')

    # parameters only relevant for the training stage, not scoring
    if args.train:
        # number of cross validation folds and hyperparameter settings to try
        CV = 10
        N_ITER = 5
        MODEL_RANDOM_STATE = 4321

        # train/validation stratified split
        VAL_SIZE = 0.1
        TEST_SIZE = 0.1
        SPLIT_RANDOM_STATE = 1234
    # -----------------------------------------------------------------------------------

    logger.info('preprocessing')
    input_path = os.path.join(DATA_DIR, args.inputfile)
    if args.train:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS,
                     DATE_COLS, IDS_COL, LABEL_COL)
        ids = data[IDS_COL].values
        label = data[LABEL_COL].values
        data = data.drop([IDS_COL, LABEL_COL], axis=1)

        # train/test split twice to achieve a
        # train/validation/test three-way split
        df_train, df_test, y_train, y_test, ids_train, ids_test = train_test_split(
            data, label, ids,
            test_size=TEST_SIZE,
            random_state=SPLIT_RANDOM_STATE,
            stratify=label)
        df_train, df_val, y_train, y_val, ids_train, ids_val = train_test_split(
            df_train, y_train, ids_train,
            test_size=VAL_SIZE,
            random_state=SPLIT_RANDOM_STATE,
            stratify=y_train)

        # obtain finalized numeric columns (clean() may have altered the set;
        # everything that is not categorical is treated as numeric)
        num_cols_cleaned = list(
            SortedSet(df_train.columns) - SortedSet(CAT_COLS))
        preprocess = Preprocesser(num_cols=num_cols_cleaned, cat_cols=CAT_COLS)
        X_train = preprocess.fit_transform(df_train)
        X_val = preprocess.transform(df_val)
        X_test = preprocess.transform(df_test)

        logger.info('modeling')
        eval_set = [(X_train, y_train), (X_val, y_val)]
        xgb_tuned = build_xgb(N_ITER, CV, MODEL_RANDOM_STATE, eval_set)
        xgb_tuned.fit(X_train, y_train)

        # makedirs(exist_ok=True) avoids the isdir/mkdir check-then-act race
        # and also creates intermediate directories if MODEL_DIR is nested
        os.makedirs(MODEL_DIR, exist_ok=True)
        dump(preprocess, CHECKPOINT_PREPROCESS)
        dump(xgb_tuned, CHECKPOINT_XGB)

        # model evaluation metric reporting
        y_pred = []
        xgb_best = xgb_tuned.best_estimator_
        zipped = zip(
            ('train', 'validation', 'test'),
            (X_train, X_val, X_test),
            (y_train, y_val, y_test))
        for name, X, y in zipped:
            xgb_pred = xgb_best.predict_proba(
                X, ntree_limit=xgb_best.best_ntree_limit)[:, 1]
            score = round(roc_auc_score(y, xgb_pred), 3)
            logger.info('{} AUC: {}'.format(name, score))
            y_pred.append(xgb_pred)

        # concatenate the three partitions back so the output file covers
        # every input row (order: train, validation, test)
        ids = np.hstack((ids_train, ids_val, ids_test))
        y_pred = np.hstack(y_pred)
    else:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL)
        ids = data[IDS_COL].values
        data = data.drop(IDS_COL, axis=1)

        logger.info('scoring')
        preprocess = load(CHECKPOINT_PREPROCESS)
        xgb_tuned = load(CHECKPOINT_XGB)
        X = preprocess.transform(data)
        xgb_best = xgb_tuned.best_estimator_
        y_pred = xgb_best.predict_proba(
            X, ntree_limit=xgb_best.best_ntree_limit)[:, 1]

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR, args.outputfile)
    write_output(ids, IDS_COL, y_pred, LABEL_COL, output_path)
def main():
    """Train or score the 'kicked car' (IsBadBuy) XGBoost model.

    Command line flags:
        --train       when present, run the full training pipeline
                      (clean -> split -> preprocess -> tune -> checkpoint);
                      otherwise load the saved checkpoints and score.
        --inputfile   input data file name, looked up under ``data/``.
        --outputfile  prediction file name, written under ``output/``.
    """
    # -----------------------------------------------------------------------------------
    # Adjustable Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--train', action = 'store_true', help = 'training or scoring')
    # required=True: without it a missing file name only surfaces later as an
    # opaque TypeError inside os.path.join(None)
    parser.add_argument(
        '--inputfile', type = str, required = True,
        help = 'input data file name')
    parser.add_argument(
        '--outputfile', type = str, required = True,
        help = 'output prediction file name')
    args = parser.parse_args()

    # directory for the input data and output prediction:
    DATA_DIR = 'data'
    OUTPUT_DIR = 'output'

    # columns used:
    CAT_COLS = ['Auction', 'Transmission', 'WheelType', 'Nationality',
                'Size', 'TopThreeAmericanName', 'IsOnlineSale']
    # NOTE: 'MMRAcquisitonRetailCleanPrice' is misspelled in the raw data's
    # header — keep the misspelling so the column matches.
    NUM_COLS = ['VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost',
                'MMRCurrentAuctionAveragePrice', 'MMRAcquisitionAuctionAveragePrice',
                'MMRCurrentAuctionCleanPrice', 'MMRAcquisitionAuctionCleanPrice',
                'MMRCurrentRetailAveragePrice', 'MMRAcquisitionRetailAveragePrice',
                'MMRCurrentRetailCleanPrice', 'MMRAcquisitonRetailCleanPrice']
    DATE_COLS = ['PurchDate']
    LABEL_COL = 'IsBadBuy'
    IDS_COL = 'RefId'

    # current time for computing recency feature
    NOW = '2010-12-31'

    # modeling step:
    # model checkpoint for future scoring
    MODEL_DIR = 'model'
    CHECKPOINT_XGB = 'xgb.pkl'
    CHECKPOINT_PREPROCESS = 'preprocess.pkl'

    # parameters only relevant for the training stage, not scoring
    if args.train:
        # number of cross validation folds and hyperparameter settings to try
        CV = 10
        N_ITER = 5
        MODEL_RANDOM_STATE = 4321

        # train/validation stratified split
        VAL_SIZE = 0.1
        TEST_SIZE = 0.1
        SPLIT_RANDOM_STATE = 1234
    # -----------------------------------------------------------------------------------

    logger.info('preprocessing')
    checkpoint_preprocess = os.path.join(MODEL_DIR, CHECKPOINT_PREPROCESS)
    checkpoint_xgb = os.path.join(MODEL_DIR, CHECKPOINT_XGB)
    input_path = os.path.join(DATA_DIR, args.inputfile)
    if args.train:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS,
                     DATE_COLS, IDS_COL, LABEL_COL)
        ids = data[IDS_COL].values
        label = data[LABEL_COL].values
        data = data.drop([IDS_COL, LABEL_COL], axis = 1)

        # train/test split twice to achieve train/validation/test three way split
        df_train, df_test, y_train, y_test, ids_train, ids_test = train_test_split(
            data, label, ids,
            test_size = TEST_SIZE,
            random_state = SPLIT_RANDOM_STATE,
            stratify = label)
        df_train, df_val, y_train, y_val, ids_train, ids_val = train_test_split(
            df_train, y_train, ids_train,
            test_size = VAL_SIZE,
            random_state = SPLIT_RANDOM_STATE,
            stratify = y_train)

        # obtain finalized numeric columns (clean() may have altered the set;
        # everything that is not categorical is treated as numeric)
        num_cols_cleaned = list(SortedSet(df_train.columns) - SortedSet(CAT_COLS))
        preprocess = Preprocesser(num_cols = num_cols_cleaned, cat_cols = CAT_COLS)
        X_train = preprocess.fit_transform(df_train)
        X_val = preprocess.transform(df_val)
        X_test = preprocess.transform(df_test)

        logger.info('modeling')
        eval_set = [(X_train, y_train), (X_val, y_val)]
        xgb_tuned = build_xgb(N_ITER, CV, MODEL_RANDOM_STATE, eval_set)
        xgb_tuned.fit(X_train, y_train)

        # makedirs(exist_ok=True) avoids the isdir/mkdir check-then-act race
        # and also creates intermediate directories if MODEL_DIR is nested
        os.makedirs(MODEL_DIR, exist_ok = True)
        dump(preprocess, checkpoint_preprocess)
        dump(xgb_tuned, checkpoint_xgb)

        # model evaluation metric reporting
        y_pred = []
        xgb_best = xgb_tuned.best_estimator_
        zipped = zip(
            ('train', 'validation', 'test'),
            (X_train, X_val, X_test),
            (y_train, y_val, y_test))
        for name, X, y in zipped:
            xgb_pred = xgb_best.predict_proba(
                X, ntree_limit = xgb_best.best_ntree_limit)[:, 1]
            score = round(roc_auc_score(y, xgb_pred), 3)
            logger.info('{} AUC: {}'.format(name, score))
            y_pred.append(xgb_pred)

        # concatenate the three partitions back so the output file covers
        # every input row (order: train, validation, test)
        ids = np.hstack((ids_train, ids_val, ids_test))
        y_pred = np.hstack(y_pred)
    else:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL)
        ids = data[IDS_COL].values
        data = data.drop(IDS_COL, axis = 1)

        logger.info('scoring')
        preprocess = load(checkpoint_preprocess)
        xgb_tuned = load(checkpoint_xgb)
        X = preprocess.transform(data)
        xgb_best = xgb_tuned.best_estimator_
        y_pred = xgb_best.predict_proba(
            X, ntree_limit = xgb_best.best_ntree_limit)[:, 1]

    os.makedirs(OUTPUT_DIR, exist_ok = True)
    output_path = os.path.join(OUTPUT_DIR, args.outputfile)
    write_output(ids, IDS_COL, y_pred, LABEL_COL, output_path)