def process_results(results_df, features_importance_df, params):
    print('Reading WA data ...')
    # Join with baseline model
    file_location = download_file_from_gcs(PROJECT, BUCKET,
                                           '{}/wa.h5'.format(DATA_DIR))
    wa_df = pd.read_hdf(file_location, 'wa_forecast_df')
    wa_df = downcast_datatypes(wa_df)
    results_df = downcast_datatypes(results_df)
    results_df = pd.merge(results_df,
                          wa_df,
                          how='left',
                          on=['date', 'product_id'])
    subprocess.call(['rm', '-f', file_location])
    del wa_df

    print('Reading product data ...')
    file_location = download_file_from_gcs(PROJECT, BUCKET,
                                           '{}/product.h5'.format(DATA_DIR))
    product_df = pd.read_hdf(file_location, 'product_df')
    results_df = pd.merge(results_df,
                          product_df[['product_id', 'product_type_id']],
                          how='left',
                          on='product_id')
    subprocess.call(['rm', '-f', file_location])
    del product_df

    # Create a DF that only covers the test periods.
    # We may not have WA for all dates within the test set, so drop the dates that lack it.
    # The daily updated forecast is transformed into a 7-day-ahead forecast,
    # restarting from the first day of each testing fold.
    test_df = results_df[results_df['is_test']].copy()  # explicit copy: the transform below mutates

    # Using test_df.iloc[::7, ] to select every 7th row (including 0) inside a
    # group may offer a faster solution; see the sketch after process_results.
    def transform_to_weekly_wa(test_df):
        """
        Keep the forecast only on rows whose position within the group is
        divisible by 7 (these are the first days of the weeks).
        Set all other days of the week to NaN, then fill all NaN's with a forward fill.
        This propagates the first forecasted value of the week to all consecutive days in that week.
        The end result is a fair 7-day-ahead forecast for each week, updated every 7 days.
        """
        mask = test_df.reset_index(drop=True).index % 7 != 0
        test_df.loc[mask, 'wa'] = np.NaN  # .loc avoids chained-assignment pitfalls
        test_df.fillna(method='ffill', inplace=True)
        return test_df

    print('Transforming WA forecast to weekly predictions ...')
    # Make sure not to propagate values across different folds for a product by also grouping by fold!
    test_df = test_df.groupby(['product_id',
                               'fold']).apply(transform_to_weekly_wa)
    test_df = test_df.dropna()
    test_df = downcast_datatypes(test_df)

    # Get the feature names for the log (unique because the df contains n folds)
    features_names = features_importance_df['feature'].unique()
    write_metrics(test_df, 'lgbm_log.txt', features_names, params)

    # Plotting
    print('Saving feature importance plots to GCS ...')
    overall_huber_lgbm = mean_huber(test_df['actual'], test_df['lgbm'])
    plot_importances(features_importance_df, overall_huber_lgbm, type='split')
    plot_importances(features_importance_df, overall_huber_lgbm, type='gain')
    del features_importance_df
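
# A minimal sketch of the faster positional alternative hinted at inside
# process_results: instead of chained indexing, build a positional mask per
# group and assign through iloc. Illustrative only; it assumes each group
# arrives sorted by date, as in the groupby above.
def transform_to_weekly_wa_positional(group_df):
    group_df = group_df.copy()
    mask = np.arange(len(group_df)) % 7 != 0  # every row except each week's first day
    group_df.iloc[mask, group_df.columns.get_loc('wa')] = np.NaN
    group_df['wa'] = group_df['wa'].fillna(method='ffill')  # propagate the week's first forecast
    return group_df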
def train_model_per_fold():
    for fold in range(NUM_FOLDS - 1):
        print('Training model for fold {}'.format(fold))

        print('Reading features slices')
        file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/train_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_x = pd.read_hdf(file_location, 'train_x')
        subprocess.call(['rm', '-f', file_location])

        file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/test_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_x = pd.read_hdf(file_location, 'test_x')
        subprocess.call(['rm', '-f', file_location])

        file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/train_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_y = pd.read_hdf(file_location, 'train_y')
        subprocess.call(['rm', '-f', file_location])

        file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/test_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_y = pd.read_hdf(file_location, 'test_y')
        subprocess.call(['rm', '-f', file_location])

        # Specify numeric and categorical features
        features_names = [f for f in train_x.columns if f not in ['date', 'actual', 'on_stock']]

        # Create lgb dataframes and train model
        print('Building lgb datasets')
        lgb_train = lgb.Dataset(train_x[features_names], categorical_feature=CAT_FEATURES, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(test_x[features_names], categorical_feature=CAT_FEATURES, label=test_y, free_raw_data=False)
        del train_x, test_x, train_y, test_y

        print('Training model')
        booster = lgb.train(
            PARAMS,
            lgb_train,
            num_boost_round=3000,
            valid_sets=[lgb_train, lgb_test],
            categorical_feature=CAT_FEATURES,
            early_stopping_rounds=100,
            verbose_eval=100
        )

        # Save booster object to disk
        print('Writing model to GCS')
        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            booster.save_model('{}.txt'.format(tf.name))
            upload_file_to_gcs(PROJECT, BUCKET, '{}.txt'.format(tf.name), '{}/booster_{}_{}.txt'.format(MODEL_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.txt'.format(tf.name)])
        del booster, lgb_train, lgb_test
        gc.collect()
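
# The loop above depends on module-level constants defined elsewhere in the
# repo (NUM_FOLDS, CAT_FEATURES, PARAMS, FEATURES_DIR, RUNTAG, ...). The
# values below are a purely hypothetical illustration of what the LightGBM
# configuration could look like, given the Huber metric used in evaluation;
# they are NOT the project's actual settings.
# NUM_FOLDS = 5
# CAT_FEATURES = ['product_type_id', 'team_id', 'brand_id']
# PARAMS = {'objective': 'huber', 'metric': 'huber',
#           'learning_rate': 0.05, 'num_leaves': 127, 'verbosity': -1}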
def create_fold_aware_features():
    for fold in range(NUM_FOLDS - 1):
        print('Generating fold aware features for fold {}'.format(fold))

        print('Reading feature matrix')
        file_location = download_file_from_gcs(
            PROJECT, BUCKET,
            '{}/train_x_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_x = pd.read_hdf(file_location, 'train_x')
        subprocess.call(['rm', '-f', file_location])

        file_location = download_file_from_gcs(
            PROJECT, BUCKET,
            '{}/test_x_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_x = pd.read_hdf(file_location, 'test_x')
        subprocess.call(['rm', '-f', file_location])

        print('Creating fold aware features')
        train_x, test_x = add_fold_aware_features(train_x, test_x)

        print('Writing slice to GCS')
        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            train_x.to_hdf('{}.h5'.format(tf.name), 'train_x', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/train_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold,
                                                      RUNTAG))
            # train_x.to_csv('{}.csv'.format(tf.name), index=False)
            # upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name), '{}/train_x_complete_{}_{}.csv'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        # subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])
        del train_x

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            test_x.to_hdf('{}.h5'.format(tf.name), 'test_x', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/test_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold,
                                                     RUNTAG))
            # test_x.to_csv('{}.csv'.format(tf.name), index=False)
            # upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name), '{}/test_x_complete_{}_{}.csv'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        # subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])
        del test_x
        gc.collect()
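
# download_file_from_gcs / upload_file_to_gcs are repo helpers not shown in
# these snippets. A minimal sketch of what they plausibly wrap, assuming the
# google-cloud-storage client library (an assumption, not the verified
# implementation):
def _download_file_from_gcs_sketch(project, bucket_name, blob_path, local_path):
    from google.cloud import storage  # local import keeps the sketch self-contained
    blob = storage.Client(project=project).bucket(bucket_name).blob(blob_path)
    blob.download_to_filename(local_path)  # GCS object -> local file
    return local_path  # callers read the file, then rm it

def _upload_file_to_gcs_sketch(project, bucket_name, local_path, blob_path):
    from google.cloud import storage
    storage.Client(project=project).bucket(bucket_name).blob(blob_path).upload_from_filename(local_path)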
def add_product_features(data_df, LOGGER):
    LOGGER.info('Generating product based features')
    file_location = download_file_from_gcs(PROJECT, BUCKET,
                                           '{}/product.h5'.format(DATA_DIR))
    product_df = pd.read_hdf(file_location, 'product_df')
    data_df = data_df.merge(product_df, on='product_id', how='inner')
    int16_features = ['actual', 'team_id', 'subproduct_type_id']
    int32_features = [
        'product_id', 'product_type_id', 'brand_id', 'manufacturer_id',
        'product_group_id'
    ]
    data_df[int16_features] = data_df[int16_features].apply(
        lambda col: col.astype('int16'))
    data_df[int32_features] = data_df[int32_features].apply(
        lambda col: col.astype('int32'))

    del product_df
    gc.collect()

    return data_df
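
# downcast_datatypes is another repo helper that is not shown here. A
# plausible implementation, inferred from how it is used above (shrinking
# numeric columns to the smallest dtype that can hold their values):
def _downcast_datatypes_sketch(df):
    for col in df.select_dtypes(include=['integer']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')  # e.g. int64 -> int16
    for col in df.select_dtypes(include=['floating']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')  # e.g. float64 -> float32
    return df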
def add_wa_feature(data_df, LOGGER):
    LOGGER.info('Loading WA forecast from GCS')
    file_location = download_file_from_gcs(PROJECT, BUCKET,
                                           '{}/wa.h5'.format(DATA_DIR))
    wa_df = pd.read_hdf(file_location, 'wa_forecast_df')
    wa_df = downcast_datatypes(wa_df)
    wa_df['date'] = pd.to_datetime(wa_df['date'])
    LOGGER.info('Adding WA forecast as feature')
    data_df = data_df.merge(wa_df, how='left', on=['product_id', 'date'])

    # Create a dataframe used to lag the wa feature
    lag_wa_df = data_df[data_df.on_stock][['product_id', 'date', 'wa']]
    # Drop the wa feature again, as it contains 1-day-ahead information
    data_df = data_df.drop('wa', axis=1)
    LOGGER.info('Sorting data dataframe before doing lags')
    lag_wa_df = lag_wa_df.sort_values(by='date',
                                      ascending=True).reset_index(drop=True)

    for lag in [7]:
        LOGGER.info('Generating for lag {} ...'.format(lag))
        column_name = 'wa_lag_{}'.format(lag)
        LOGGER.info('Creating lag feature')
        lag_wa_df[column_name] = lag_wa_df.groupby('product_id')['wa'].shift(
            lag).fillna(0).astype('int16')
        lag_wa_df = downcast_datatypes(lag_wa_df)

    # Drop the non-lagged wa feature before merging
    lag_wa_df = lag_wa_df.drop('wa', axis=1)
    LOGGER.info('Merging data dataframe and wa lag')
    data_df = pd.merge(data_df,
                       lag_wa_df,
                       how='left',
                       on=['product_id', 'date'])

    del wa_df, lag_wa_df
    gc.collect()

    return data_df
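
# Tiny worked example of the per-product shift used above: with a 7-day lag,
# the forecast observed on day d only becomes available as a feature on day
# d + 7, so no future information leaks into training. Toy data, illustrative:
def _wa_lag_example():
    toy = pd.DataFrame({'product_id': [1] * 10, 'wa': list(range(10))})
    toy['wa_lag_7'] = toy.groupby('product_id')['wa'].shift(7).fillna(0)
    # Rows 0-6 have no 7-day-old history and fall back to 0; row 7 sees the
    # wa value from row 0, row 8 from row 1, and so on.
    return toy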
def predict_and_save_results_per_fold():
    for fold in range(NUM_FOLDS - 1):
        print('Predicting results for fold {}'.format(fold))

        print('Reading features slices')
        file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/train_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_x = pd.read_hdf(file_location, 'train_x')
        subprocess.call(['rm', '-f', file_location])

        file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/test_x_complete_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_x = pd.read_hdf(file_location, 'test_x')
        subprocess.call(['rm', '-f', file_location])

        file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/train_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        train_y = pd.read_hdf(file_location, 'train_y')
        subprocess.call(['rm', '-f', file_location])

        file_location = download_file_from_gcs(PROJECT, BUCKET, '{}/test_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        test_y = pd.read_hdf(file_location, 'test_y')
        subprocess.call(['rm', '-f', file_location])

        # Specify numeric and categorical features
        features_names = [f for f in train_x.columns if f not in ['date', 'actual', 'on_stock']]

        # Create lgb dataframes and train model
        print('Building lgb datasets')
        lgb_train = lgb.Dataset(train_x[features_names], categorical_feature=CAT_FEATURES, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(test_x[features_names], categorical_feature=CAT_FEATURES, label=test_y, free_raw_data=False)

        print('Loading lgb trained model')
        file_location = '{}/booster_{}_{}.txt'.format(MODEL_DIR, fold, RUNTAG)
        file_location = download_file_from_gcs(PROJECT, BUCKET, file_location)
        booster = lgb.Booster(model_file=file_location)
        subprocess.call(['rm', '-f', file_location])

        print('Predicting train data')
        train_preds = booster.predict(lgb_train.data, num_iteration=booster.best_iteration)

        print('Predicting test data')
        test_preds = booster.predict(lgb_test.data, num_iteration=booster.best_iteration)

        print('Writing results for fold {} at {}'.format(fold, datetime.now().strftime("%Y-%m-%d_%H:%M:%S")))
        fold_result_df = pd.DataFrame()
        fold_result_df['product_id'] = train_x['product_id'].append(test_x['product_id'])
        fold_result_df['date'] = train_x['date'].append(test_x['date'])
        fold_result_df['on_stock'] = train_x['on_stock'].append(test_x['on_stock'])
        fold_result_df['fold'] = np.repeat(fold, len(train_x.index) + len(test_x.index))
        fold_result_df['actual'] = train_x['actual'].append(test_x['actual'])
        fold_result_df['lgbm'] = np.concatenate([train_preds, test_preds])
        fold_result_df['is_test'] = np.concatenate([np.repeat(False, len(train_x.index)), np.repeat(True, len(test_x.index))])
        fold_result_df = fold_result_df.sort_values(by=['product_id', 'date'])

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            fold_result_df.to_hdf('{}.h5'.format(tf.name), 'fold_result_df', index=False)
            upload_file_to_gcs(PROJECT, BUCKET, '{}.h5'.format(tf.name), '{}/results_{}_{}.h5'.format(RESULTS_DIR, fold, RUNTAG))
            fold_result_df.to_csv('{}.csv'.format(tf.name), index=False)
            upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name), '{}/results_{}_{}.csv'.format(RESULTS_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])

        # Compute feature importances
        gain = booster.feature_importance(importance_type='gain')
        split = booster.feature_importance(importance_type='split')
        fold_importance_df = pd.DataFrame()
        fold_importance_df['fold'] = np.repeat(fold, len(features_names))
        fold_importance_df['feature'] = features_names
        fold_importance_df['gain'] = gain
        fold_importance_df['split'] = split

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            fold_importance_df.to_hdf('{}.h5'.format(tf.name), 'fold_importance_df', index=False)
            upload_file_to_gcs(PROJECT, BUCKET, '{}.h5'.format(tf.name), '{}/importance_{}_{}.h5'.format(RESULTS_DIR, fold, RUNTAG))
            fold_importance_df.to_csv('{}.csv'.format(tf.name), index=False)
            upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name), '{}/importance_{}_{}.csv'.format(RESULTS_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])

        # Only compute Shapley values if specified
        if COMPUTE_SHAP:
            print('Started computing fold {} SHAP values at {}'.format(fold, datetime.now().strftime("%Y-%m-%d_%H:%M:%S")))
            explainer_fold = shap.TreeExplainer(booster)
            shap_values = explainer_fold.shap_values(train_x[features_names])

            shap_df = pd.DataFrame()
            shap_df['fold'] = np.repeat(fold, len(shap_values))
            shap_df['date'] = train_x['date'].values  # SHAP values were computed on train_x only
            for col_num, features_name in enumerate(features_names):
                shap_df[features_name] = shap_values[:, col_num]

            with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
                shap_df.to_hdf('{}.h5'.format(tf.name), 'shap_df', index=False)
                upload_file_to_gcs(PROJECT, BUCKET, '{}.h5'.format(tf.name), '{}/shap_{}_{}.h5'.format(RESULTS_DIR, fold, RUNTAG))
                shap_df.to_csv('{}.csv'.format(tf.name), index=False)
                upload_file_to_gcs(PROJECT, BUCKET, '{}.csv'.format(tf.name), '{}/shap_{}_{}.csv'.format(RESULTS_DIR, fold, RUNTAG))
            subprocess.call(['rm', '-f', tf.name])
            subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
            subprocess.call(['rm', '-f', '{}.csv'.format(tf.name)])
            del shap_df

        del train_x, test_x, train_y, test_y
        del booster, lgb_train, lgb_test, train_preds, test_preds
        del fold_result_df, fold_importance_df
        gc.collect()
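
# mean_huber, used in process_results, is not defined in these snippets. One
# plausible implementation of a mean Huber loss, assuming delta = 1.0 (the
# project's actual delta is unknown):
def _mean_huber_sketch(actual, predicted, delta=1.0):
    error = np.asarray(actual, dtype='float64') - np.asarray(predicted, dtype='float64')
    quadratic = 0.5 * error ** 2                    # squared loss for small errors
    linear = delta * (np.abs(error) - 0.5 * delta)  # linear loss for large errors
    return float(np.mean(np.where(np.abs(error) <= delta, quadratic, linear)))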
def create_folds_and_slice_features():
    print('Loading full feature matrix')
    file_location = download_file_from_gcs(
        PROJECT, BUCKET, '{}/features_{}.h5'.format(DATA_DIR, RUNTAG))
    features_df = pd.read_hdf(file_location, 'features_df')
    target = features_df['actual'].copy()
    subprocess.call(['rm', '-f', file_location])

    min_train_size = int(0.75 * features_df.shape[0])  # 2+ years of training
    min_test_size = int(0.03 * features_df.shape[0])  # 2+ months of testing
    step_size = int(0.03 * features_df.shape[0])  # 2+ months step size between folds
    timefolds = timefold.timefold(method='step',
                                  min_train_size=min_train_size,
                                  min_test_size=min_test_size,
                                  step_size=step_size)

    folds = {}
    for fold, (train_idx, test_idx) in enumerate(timefolds.split(features_df)):
        print('Generating fold {}'.format(fold))
        folds[str(fold)] = (list(map(int, train_idx)), list(map(int, test_idx)))

    for fold, (train_idx, test_idx) in folds.items():
        print('Generating feature slice for fold {}'.format(fold))
        print('Train idx: {} to {}, test idx: {} to {}'.format(
            train_idx[0], train_idx[-1], test_idx[0], test_idx[-1]))
        train_x, train_y = features_df.iloc[train_idx], target.iloc[train_idx]
        test_x, test_y = features_df.iloc[test_idx], target.iloc[test_idx]

        print('Writing slices to GCS')
        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            train_x.to_hdf('{}.h5'.format(tf.name), 'train_x', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/train_x_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        del train_x

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            train_y.to_hdf('{}.h5'.format(tf.name), 'train_y', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/train_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        del train_y

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            test_x.to_hdf('{}.h5'.format(tf.name), 'test_x', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/test_x_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        del test_x

        with open(tempfile.NamedTemporaryFile().name, 'w') as tf:
            test_y.to_hdf('{}.h5'.format(tf.name), 'test_y', index=False)
            upload_file_to_gcs(
                PROJECT, BUCKET, '{}.h5'.format(tf.name),
                '{}/test_y_{}_{}.h5'.format(FEATURES_DIR, fold, RUNTAG))
        subprocess.call(['rm', '-f', tf.name])
        subprocess.call(['rm', '-f', '{}.h5'.format(tf.name)])
        del test_y
        gc.collect()
    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)

    LOGGER.info('LightGBM calculation started')

    _, kwargs_string = sys.argv
    runner_kwargs = eval(kwargs_string)  # expects a dict literal from the CLI; ast.literal_eval would be a safer parser

    project = runner_kwargs['project']

    LOGGER.info('Starting process to generate features')
    try:
        LOGGER.info(
            'Attempting to download existing features file in GCS for current runtag: {}'
            .format(RUNTAG))
        file_location = download_file_from_gcs(
            PROJECT, BUCKET, '{}/features.h5'.format(DATA_DIR))
        features_df = pd.read_hdf(file_location, 'features_df')
        subprocess.call(['rm', '-f', file_location])
        LOGGER.info('Downcasting datatypes for entire feature matrix')
        features_df = downcast_datatypes(features_df)
    except IOError:
        LOGGER.info(
            'Failed to find existing features file for current runtag {}, '
            'so features will be generated from scratch'.format(RUNTAG))
        LOGGER.info('Downloading data (actuals) hdf file')
        file_location = download_file_from_gcs(PROJECT, BUCKET,
                                               '{}/actual.h5'.format(DATA_DIR))
        LOGGER.info('Reading data (actuals) in pandas dataframe')
        data_df = pd.read_hdf(file_location, 'data_df')
        LOGGER.info('Starting feature generation')
        features_df = process_features(data_df, LOGGER)