def process_results(results_df, features_importance_df, params):
    print('Reading WA data ...')
    # Join with baseline model
    file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/wa.h5'.format(DATA_DIR))
    wa_df = pd.read_hdf(file_location, 'wa_forecast_df')
    wa_df = downcast_datatypes(wa_df)
    results_df = downcast_datatypes(results_df)
    results_df = pd.merge(results_df, wa_df, how='left', on=['date', 'product_id'])
    subprocess.call(['rm', '-f', file_location])
    del wa_df

    print('Reading product data ...')
    file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/product.h5'.format(DATA_DIR))
    product_df = pd.read_hdf(file_location, 'product_df')
    results_df = pd.merge(results_df, product_df[['product_id', 'product_type_id']],
                          how='left', on='product_id')
    subprocess.call(['rm', '-f', file_location])
    del product_df

    # Create a DF that only covers the test periods.
    # We may not have a WA forecast for every date in the test set, so rows
    # that lack one are dropped below. The daily updated forecast is
    # transformed into a 7-day-ahead forecast starting from the first day of
    # each testing fold.
    test_df = results_df[results_df['is_test'] == True]

    # Using test_df.iloc[::7, ] to select every 7th row (including 0) inside a
    # group may offer a faster solution.
    def transform_to_weekly_wa(test_df):
        """
        Keep the wa value only on rows whose position within the group is
        divisible by 7 (the first day of each week); all other days are set
        to NaN and then forward-filled. This propagates the first forecasted
        value of the week to all consecutive days in that week. The end
        result is a fair 7-day-ahead forecast that is updated every 7 days.
        """
        test_df.loc[test_df.reset_index(drop=True).index % 7 != 0, 'wa'] = np.NaN
        test_df.fillna(method='ffill', inplace=True)
        return test_df

    print('Transforming WA forecast to weekly predictions ...')
    # Also group by fold, so values are never propagated across folds for a product!
    test_df = test_df.groupby(['product_id', 'fold']).apply(transform_to_weekly_wa)
    test_df = test_df.dropna()
    test_df = downcast_datatypes(test_df)

    # Get feature names for the log (unique because the df holds n folds)
    features_names = features_importance_df['feature'].unique()
    write_metrics(test_df, 'lgbm_log.txt', features_names, params)

    # Plotting
    print('Saving feature importance plots to GCS ...')
    overall_huber_lgbm = mean_huber(test_df['actual'], test_df['lgbm'])
    plot_importances(features_importance_df, overall_huber_lgbm, type='split')
    plot_importances(features_importance_df, overall_huber_lgbm, type='gain')
    del features_importance_df
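
# `mean_huber` (used above) is a project helper that is not defined in this
# section. The sketch below shows what it is assumed to compute (the mean of
# the Huber loss); the `_sketch` name and the delta default are illustrative,
# not the project's actual implementation.
def mean_huber_sketch(actual, predicted, delta=1.0):
    """Mean Huber loss: quadratic for small residuals, linear for large ones."""
    residual = np.abs(np.asarray(actual) - np.asarray(predicted))
    quadratic = np.minimum(residual, delta)  # capped at delta
    linear = residual - quadratic            # excess beyond delta
    return np.mean(0.5 * quadratic ** 2 + delta * linear)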
def train_model_per_fold():
    for fold in range(NUM_FOLDS - 1):
        print('Training model for fold {}'.format(fold))
        print('Reading features slices')
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/train_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_x = pd.read_hdf(file_location, 'train_x')
        subprocess.call(['rm', '-f', file_location])
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/test_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_x = pd.read_hdf(file_location, 'test_x')
        subprocess.call(['rm', '-f', file_location])
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/train_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_y = pd.read_hdf(file_location, 'train_y')
        subprocess.call(['rm', '-f', file_location])
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/test_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_y = pd.read_hdf(file_location, 'test_y')
        subprocess.call(['rm', '-f', file_location])

        # Specify numeric and categorical features
        features_names = [f for f in train_x.columns if f not in ['date', 'actual', 'on_stock']]

        # Create lgb datasets and train model
        print('Building lgb datasets')
        lgb_train = lgb.Dataset(train_x[features_names], categorical_feature=CAT_FEATURES,
                                label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(test_x[features_names], categorical_feature=CAT_FEATURES,
                               label=test_y, free_raw_data=False)
        del train_x, test_x, train_y, test_y

        print('Training model')
        booster = lgb.train(
            PARAMS,
            lgb_train,
            num_boost_round=3000,
            valid_sets=[lgb_train, lgb_test],
            categorical_feature=CAT_FEATURES,
            early_stopping_rounds=100,
            verbose_eval=100
        )

        # Save booster object to disk.
        # NamedTemporaryFile is used only to obtain a unique path stem; the
        # actual artefact is written to '<stem>.txt' next to it.
        print('Writing model to GCS')
        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            booster.save_model('{}.txt'.format(tf.name))
            upload_file_to_gcs(PROJECT, BUCKET, '{}.txt'.format(tf.name),
                               '{}/booster_{}_{}.txt'.format(MODEL_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.txt'.format(tf.name)])

        del booster, lgb_train, lgb_test
        gc.collect()
def create_fold_aware_features():
    for fold in range(NUM_FOLDS - 1):
        print('Generating fold aware features for fold {}'.format(fold))
        print('Reading feature matrix')
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/train_x_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_x = pd.read_hdf(file_location, 'train_x')
        subprocess.call(['rm', '-f', file_location])
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/test_x_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_x = pd.read_hdf(file_location, 'test_x')
        subprocess.call(['rm', '-f', file_location])

        print('Creating fold aware features')
        train_x, test_x = add_fold_aware_features(train_x, test_x)

        print('Writing slice to GCS')
        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            train_x.to_hdf('{}.h5'.format(tf.name), 'train_x', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/train_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
            # train_x.to_csv('{}.csv'.format(tf.name), index=False)
            # upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name),
            #                    '{}/train_x_complete_{}_{}.csv'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        # subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])
        del train_x

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            test_x.to_hdf('{}.h5'.format(tf.name), 'test_x', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/test_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
            # test_x.to_csv('{}.csv'.format(tf.name), index=False)
            # upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name),
            #                    '{}/test_x_complete_{}_{}.csv'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        # subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])
        del test_x
        gc.collect()
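
# The write-to-temp-file / upload / rm dance above recurs throughout this
# module. A hypothetical helper that bundles the same steps more safely (the
# name `save_and_upload_hdf` is ours; `upload_file_to_gcs`, PROJECT and BUCKET
# are the module's own; tempfile.mkstemp avoids reusing the name of an
# already-deleted NamedTemporaryFile):
import os

def save_and_upload_hdf(df, key, gcs_path):
    """Write `df` to a temporary HDF5 file, upload it to GCS, then clean up."""
    fd, local_path = tempfile.mkstemp(suffix='.h5')
    os.close(fd)  # only the unique path is needed; pandas reopens the file
    try:
        df.to_hdf(local_path, key, mode='w', index=False)
        upload_file_to_gcs(PROJECT, BUCKET, local_path, gcs_path)
    finally:
        os.remove(local_path)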
def add_product_features(data_df, LOGGER):
    LOGGER.info('Generating product based features')
    file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/product.h5'.format(DATA_DIR))
    product_df = pd.read_hdf(file_location, 'product_df')
    data_df = data_df.merge(product_df, on='product_id', how='inner')

    int16_features = ['actual', 'team_id', 'subproduct_type_id']
    int32_features = [
        'product_id', 'product_type_id', 'brand_id', 'manufacturer_id',
        'product_group_id'
    ]
    data_df[int16_features] = data_df[int16_features].apply(
        lambda col: col.astype('int16'))
    data_df[int32_features] = data_df[int32_features].apply(
        lambda col: col.astype('int32'))

    del product_df
    gc.collect()
    return data_df
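
# `downcast_datatypes` (used throughout this module) is a project helper that
# is not shown in this section. It is assumed to do for every numeric column
# what add_product_features does by hand above; a minimal sketch under that
# assumption (the `_sketch` name is ours):
def downcast_datatypes_sketch(df):
    """Downcast numeric columns to the smallest dtype that fits their values."""
    for col in df.select_dtypes(include=['integer']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['floating']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df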
def add_wa_feature(data_df, LOGGER):
    LOGGER.info('Loading WA forecast from GCS')
    file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/wa.h5'.format(DATA_DIR))
    wa_df = pd.read_hdf(file_location, 'wa_forecast_df')
    wa_df = downcast_datatypes(wa_df)
    wa_df['date'] = pd.to_datetime(wa_df['date'])

    LOGGER.info('Adding WA forecast as feature')
    data_df = data_df.merge(wa_df, how='left', on=['product_id', 'date'])
    # Create dataframe to lag the wa feature
    lag_wa_df = data_df[data_df.on_stock][['product_id', 'date', 'wa']]
    # Drop the wa feature again, as it contains 1-day-ahead information
    data_df = data_df.drop('wa', axis=1)

    LOGGER.info('Sorting data dataframe before doing lags')
    lag_wa_df = lag_wa_df.sort_values(by='date', ascending=True).reset_index(drop=True)
    for lag in [7]:
        LOGGER.info('Generating for lag {} ...'.format(lag))
        column_name = 'wa_lag_{}'.format(lag)
        LOGGER.info('Creating lag feature')
        lag_wa_df[column_name] = lag_wa_df.groupby('product_id')['wa'].shift(
            lag).fillna(0).astype('int16')
        lag_wa_df = downcast_datatypes(lag_wa_df)

    # Drop the non-lagged wa feature before merging
    lag_wa_df = lag_wa_df.drop('wa', axis=1)
    LOGGER.info('Merging data dataframe and wa lag')
    data_df = pd.merge(data_df, lag_wa_df, how='left', on=['product_id', 'date'])

    del wa_df, lag_wa_df
    gc.collect()
    return data_df
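
# To make the leakage guard above concrete: `groupby('product_id')['wa'].shift(7)`
# moves each product's forecast 7 rows forward within that product only, so no
# value ever crosses product boundaries. A toy illustration with hypothetical
# data and a lag of 1 (not executed at import time):
def _wa_lag_toy_example():
    toy = pd.DataFrame({
        'product_id': [1, 1, 1, 2, 2, 2],
        'wa': [10, 11, 12, 20, 21, 22],
    })
    # Product 2's first row becomes NaN instead of inheriting product 1's last value
    toy['wa_lag_1'] = toy.groupby('product_id')['wa'].shift(1)
    return toy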
def predict_and_save_results_per_fold():
    for fold in range(NUM_FOLDS - 1):
        print('Predicting results for fold {}'.format(fold))
        print('Reading features slices')
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/train_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_x = pd.read_hdf(file_location, 'train_x')
        subprocess.call(['rm', '-f', file_location])
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/test_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_x = pd.read_hdf(file_location, 'test_x')
        subprocess.call(['rm', '-f', file_location])
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/train_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_y = pd.read_hdf(file_location, 'train_y')
        subprocess.call(['rm', '-f', file_location])
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/test_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_y = pd.read_hdf(file_location, 'test_y')
        subprocess.call(['rm', '-f', file_location])

        # Specify numeric and categorical features
        features_names = [f for f in train_x.columns if f not in ['date', 'actual', 'on_stock']]

        # Create lgb datasets
        print('Building lgb datasets')
        lgb_train = lgb.Dataset(train_x[features_names], categorical_feature=CAT_FEATURES,
                                label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(test_x[features_names], categorical_feature=CAT_FEATURES,
                               label=test_y, free_raw_data=False)

        print('Loading lgb trained model')
        file_location = '{}/booster_{}_{}.txt'.format(MODEL_DIR, fold, RUNTAG)
        file_location = download_file_from_gcs(PROJECT, BUCKET, file_location)
        booster = lgb.Booster(model_file=file_location)
        subprocess.call(['rm', '-f', file_location])

        print('Predicting train data')
        train_preds = booster.predict(lgb_train.data, num_iteration=booster.best_iteration)
        print('Predicting test data')
        test_preds = booster.predict(lgb_test.data, num_iteration=booster.best_iteration)

        print('Writing results for fold {} at {}'.format(
            fold, datetime.now().strftime("%Y-%m-%d_%H:%M:%S")))
        fold_result_df = pd.DataFrame()
        fold_result_df['product_id'] = train_x['product_id'].append(test_x['product_id'])
        fold_result_df['date'] = train_x['date'].append(test_x['date'])
        fold_result_df['on_stock'] = train_x['on_stock'].append(test_x['on_stock'])
        fold_result_df['fold'] = np.repeat(fold, len(train_x.index) + len(test_x.index))
        fold_result_df['actual'] = train_x['actual'].append(test_x['actual'])
        fold_result_df['lgbm'] = np.concatenate([train_preds, test_preds])
        fold_result_df['is_test'] = np.concatenate([np.repeat(False, len(train_x.index)),
                                                    np.repeat(True, len(test_x.index))])
        fold_result_df = fold_result_df.sort_values(by=['product_id', 'date'])

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            fold_result_df.to_hdf('{}.h5'.format(tf.name), 'fold_result_df', index=False)
            upload_file_to_gcs(PROJECT, BUCKET, '{}.h5'.format(tf.name),
                               '{}/results_{}_{}.h5'.format(RESULTS_DIR, fold, RUNTAG))
            fold_result_df.to_csv('{}.csv'.format(tf.name), index=False)
            upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name),
                               '{}/results_{}_{}.csv'.format(RESULTS_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])

        # Compute feature importances
        gain = booster.feature_importance(importance_type='gain')
        split = booster.feature_importance(importance_type='split')
        fold_importance_df = pd.DataFrame()
        fold_importance_df['fold'] = np.repeat(fold, len(features_names))
        fold_importance_df['feature'] = features_names
        fold_importance_df['gain'] = gain
        fold_importance_df['split'] = split

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            fold_importance_df.to_hdf('{}.h5'.format(tf.name), 'fold_importance_df', index=False)
            upload_file_to_gcs(PROJECT, BUCKET, '{}.h5'.format(tf.name),
                               '{}/importance_{}_{}.h5'.format(RESULTS_DIR, fold, RUNTAG))
            fold_importance_df.to_csv('{}.csv'.format(tf.name), index=False)
            upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name),
                               '{}/importance_{}_{}.csv'.format(RESULTS_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])

        # Only compute Shapley values if specified
        if COMPUTE_SHAP:
            print('Started computing fold {} SHAP values at {}'.format(
                fold, datetime.now().strftime("%Y-%m-%d_%H:%M:%S")))
            explainer_fold = shap.TreeExplainer(booster)
            shap_values = explainer_fold.shap_values(train_x[features_names])
            shap_df = pd.DataFrame()
            shap_df['fold'] = np.repeat(fold, len(shap_values))
            # SHAP values are computed on the train slice only, so take the
            # train dates; appending the test dates here (as the code
            # originally did) would not match the number of SHAP rows
            shap_df['date'] = train_x['date'].values
            for col_num, features_name in enumerate(features_names):
                shap_df[features_name] = shap_values[:, col_num]

            with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
                shap_df.to_hdf('{}.h5'.format(tf.name), 'shap_df', index=False)
                upload_file_to_gcs(PROJECT, BUCKET, '{}.h5'.format(tf.name),
                                   '{}/shap_{}_{}.h5'.format(RESULTS_DIR, fold, RUNTAG))
                shap_df.to_csv('{}.csv'.format(tf.name), index=False)
                upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name),
                                   '{}/shap_{}_{}.csv'.format(RESULTS_DIR, fold, RUNTAG))
            subprocess.call(['rm', '-f', tf.name])
            subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
            subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])
            del shap_df

        del train_x, test_x, train_y, test_y
        del booster, lgb_train, lgb_test, train_preds, test_preds
        del fold_result_df, fold_importance_df
        gc.collect()
def create_folds_and_slice_features():
    print('Loading full feature matrix')
    file_location = download_file_from_gcs(
        PROJECT, BUCKET, '{}/features_{}.h5'.format(DATA_DIR, RUNTAG))
    features_df = pd.read_hdf(file_location, 'features_df')
    target = features_df['actual'].copy()
    subprocess.call(['rm', '-f', file_location])

    min_train_size = int(0.75 * features_df.shape[0])  # 2+ years of training
    min_test_size = int(0.03 * features_df.shape[0])   # 2+ months of testing
    step_size = int(0.03 * features_df.shape[0])       # 2+ months step size between folds
    timefolds = timefold.timefold(method='step', min_train_size=min_train_size,
                                  min_test_size=min_test_size, step_size=step_size)

    folds = {}
    for fold, (train_idx, test_idx) in enumerate(timefolds.split(features_df)):
        print('Generating fold {}'.format(fold))
        folds[str(fold)] = (list(map(int, train_idx)), list(map(int, test_idx)))

    for fold, (train_idx, test_idx) in folds.items():
        print('Generating feature slice for fold {}'.format(fold))
        print('Train idx: {} to {}, test idx: {} to {}'.format(
            train_idx[0], train_idx[-1], test_idx[0], test_idx[-1]))
        train_x, train_y = features_df.iloc[train_idx], target.iloc[train_idx]
        test_x, test_y = features_df.iloc[test_idx], target.iloc[test_idx]

        print('Writing slices to GCS')
        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            train_x.to_hdf('{}.h5'.format(tf.name), 'train_x', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/train_x_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        del train_x

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            train_y.to_hdf('{}.h5'.format(tf.name), 'train_y', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/train_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        del train_y

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            test_x.to_hdf('{}.h5'.format(tf.name), 'test_x', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/test_x_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        del test_x

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            test_y.to_hdf('{}.h5'.format(tf.name), 'test_y', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/test_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        del test_y
        gc.collect()
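
# `download_file_from_gcs` / `upload_file_to_gcs` are project helpers not shown
# in this section. Judging by how they are called, they could be implemented
# with the official google-cloud-storage client along these lines (the bodies
# and the local-path convention below are assumptions, not the project's actual
# code; only the call signatures are taken from this module):
from google.cloud import storage

def download_file_from_gcs_sketch(project, bucket_name, blob_path):
    """Download `blob_path` from the bucket and return the local file path."""
    client = storage.Client(project=project)
    local_path = blob_path.replace('/', '_')  # assumed flat local naming
    client.bucket(bucket_name).blob(blob_path).download_to_filename(local_path)
    return local_path

def upload_file_to_gcs_sketch(project, bucket_name, local_path, blob_path):
    """Upload the file at `local_path` to `blob_path` in the bucket."""
    client = storage.Client(project=project)
    client.bucket(bucket_name).blob(blob_path).upload_from_filename(local_path)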
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info('LightGBM calculation started')

# kwargs are passed by the runner as a Python-literal string; eval trusts the
# caller to supply a well-formed dict
_, kwargs_string = sys.argv
runner_kwargs = eval(kwargs_string)
project = runner_kwargs['project']

LOGGER.info('Starting process to generate features')
try:
    LOGGER.info(
        'Attempting to download existing features file in GCS for current runtag: {}'
        .format(RUNTAG))
    file_location = download_file_from_gcs(
        PROJECT, BUCKET, '{}/features.h5'.format(DATA_DIR))
    features_df = pd.read_hdf(file_location, 'features_df')
    subprocess.call(['rm', '-f', file_location])
    LOGGER.info('Downcasting datatypes for entire feature matrix')
    features_df = downcast_datatypes(features_df)
except IOError:
    LOGGER.info(
        'Failed to find existing features file for current runtag {}, '
        'so features will be generated from scratch'.format(RUNTAG))
    LOGGER.info('Downloading data (actuals) hdf file')
    file_location = download_file_from_gcs(PROJECT, BUCKET,
                                           '{}/actual.h5'.format(DATA_DIR))
    LOGGER.info('Reading data (actuals) in pandas dataframe')
    data_df = pd.read_hdf(file_location, 'data_df')
    LOGGER.info('Starting feature generation')
    features_df = process_features(data_df, LOGGER)