Exemplo n.º 1
0
def remove_before_changepoint(data: pd.HDFStore, select_idx: pd.Index = None):
    """Return 'train' row coordinates left after the per-store date cuts.

    For every ``(store, date)`` pair in the hard-coded table below, the
    'train' table is queried for rows that either belong to a different
    store or are dated strictly after that store's cut-off date.  The
    coordinates returned by each query are intersected with the running
    selection.

    :param data: store queried through ``select_as_coordinates`` on the
        'train' key.
    :param select_idx: optional starting selection; when ``None`` the
        result of the first query becomes the starting selection.
    :return: the intersection of ``select_idx`` (if given) with the
        coordinates returned by every query.
    """
    # The query text names ``store`` and ``date``.  pandas is expected to
    # look these up among the local variables of the calling function, so
    # the loop variables below must keep exactly these names and the call
    # must stay in this function's own frame.
    where = 'Store != store or Date > pd.Timestamp(date)'

    # Store id -> cut-off date; rows of that store on or before the date
    # fail the query above.
    cutoff_by_store = {
        837: '2014-03-16', 700: '2014-01-03', 681: '2013-06-14',
        986: '2013-05-22', 885: '2014-05-18', 589: '2013-05-27',
        105: '2013-05-20', 663: '2013-10-06', 764: '2013-04-24',
        364: '2013-05-31', 969: '2013-03-10', 803: '2014-01-07',
        91: '2014-01-14',
    }

    # noinspection PyUnusedLocal
    for store, date in cutoff_by_store.items():
        kept_rows = data.select_as_coordinates('train', where)
        select_idx = (kept_rows if select_idx is None
                      else select_idx.intersection(kept_rows))
    return select_idx
Exemplo n.º 2
0
def glm_predictions(data: pd.HDFStore,
                    output: pd.HDFStore,
                    model_save_dir=None,
                    predict_train=True,
                    from_saved_model=False):
    """Obtain a GLM model (loaded or freshly fitted) and store predictions.

    When ``from_saved_model`` is falsy the model is trained on the 'train'
    rows that survive four filters applied in order: the changepoint cut,
    membership in the test-set stores, ``Sales > 0`` and the linear-model
    outlier removal.  The log of the 'Sales' column is written to ``data``
    under 'train_logsales' and used as the training target.

    :param data: store holding the 'train' and 'test' tables; also receives
        'train_logsales' when training happens.
    :param output: store handed to ``DataFromHDF`` for the 'test/glm' and
        'train/glm' prediction keys.
    :param model_save_dir: when truthy, a freshly fitted model is saved
        there; also the load location when ``from_saved_model is True``.
    :param predict_train: when truthy, predictions are also produced for
        the 'train' table.
    :param from_saved_model: falsy to train; ``True`` to load from
        ``model_save_dir``; any other truthy value is passed to
        ``get_saved_glm_model`` as is.
    """
    # This local is named inside the 'Store in test_set_stores' query text
    # below, so it has to keep exactly this name.
    test_set_stores = data.select_column('test', 'Store').unique()

    if from_saved_model:
        load_from = (model_save_dir if from_saved_model is True
                     else from_saved_model)
        model = get_saved_glm_model(load_from)
    else:
        reduced_to = "Reduced to {0}"

        # Filter 1: changepoint cut.
        logger.info("Dropping store data before changepoint.")
        train_rows = remove_before_changepoint(data, None)
        logger.info(reduced_to.format(len(train_rows)))

        # Filter 2: only stores that appear in the test table.
        logger.info("Dropping stores not in test set. Initial shape")
        in_test_stores = data.select_as_coordinates(
            'train', 'Store in test_set_stores')
        train_rows = train_rows.intersection(in_test_stores)
        logger.info(reduced_to.format(len(train_rows)))

        # Filter 3: positive sales only.  The log is still taken over the
        # whole 'Sales' column, hence the 'divide by zero' warning capture.
        logger.debug("Log transform on sales data")
        with_sales = data.select_as_coordinates('train', 'Sales > 0')
        train_rows = train_rows.intersection(with_sales)
        with warnings_to_log('divide by zero'):
            log_sales = np.log(data.select('train', 'columns = Sales'))
            data.put('train_logsales', log_sales, data_columns=True)
        logger.info(reduced_to.format(len(train_rows)))

        # Filter 4: outlier removal.
        train_rows = remove_outliers_lm(
            data, train_rows, log_lm_features, test_set_stores)
        logger.info(
            "Removed outliers, reduced shape {0}".format(len(train_rows)))

        logger.info("Running glm training")
        train_features = DataFromHDF(
            data_store=data,
            key='train',
            select_idx=train_rows,
            columns=linear_features,
        )
        train_target = DataFromHDF(
            data_store=data,
            key='train_logsales',
            select_idx=train_rows,
            column='Sales',
        )
        model = GLMPredictions(stores=test_set_stores, steps=15, step_by=3)
        model.fit(train_features, train_target)

        if model_save_dir:
            model.save_model(model_save_dir)

    # (input key, output key, log message) for each prediction pass; the
    # test pass always runs, the train pass only on request.
    passes = [('test', 'test/glm', "glm predictions on test set")]
    if predict_train:
        passes.append(
            ('train', 'train/glm', "glm predictions on training set"))

    for input_key, output_key, message in passes:
        logger.info(message)
        features = DataFromHDF(
            data_store=data, key=input_key, columns=linear_features)
        destination = DataFromHDF(
            data_store=output, key=output_key, data_columns=True)
        predictions = model.predict(features)
        destination.put(predictions)
Exemplo n.º 3
0
def xgb_predictions(data: pd.HDFStore,
                    output: pd.HDFStore,
                    model_save_dir=None,
                    predict_train=True,
                    from_saved_model=False):
    """Obtain an xgboost model (loaded or freshly fitted) and store predictions.

    When ``from_saved_model`` is falsy the model is trained on the 'train'
    rows that survive three filters applied in order: the changepoint cut,
    membership in the test-set stores and ``Sales > 0``.  The log of the
    'Sales' column is written to ``data`` under 'train_logsales' and used
    as the training target.

    Test predictions are produced with a single ``predict`` call; training
    predictions are limited to rows of test-set stores and delegated to
    ``predict_in_chunks``.

    :param data: store holding the 'train' and 'test' tables; also receives
        'train_logsales' when training happens.
    :param output: store handed to ``DataFromHDF`` for the 'test/xgb' and
        'train/xgb' prediction keys.
    :param model_save_dir: when truthy, a freshly fitted model is saved
        there; also the load location when ``from_saved_model is True``.
    :param predict_train: when truthy, predictions are also produced for
        the 'train' table.
    :param from_saved_model: falsy to train; ``True`` to load from
        ``model_save_dir``; any other truthy value is passed to
        ``get_saved_xgb_model`` as is.
    """
    # This local is named inside the 'Store in test_set_stores' query text
    # used further down, so it has to keep exactly this name.
    # noinspection PyUnusedLocal
    test_set_stores = data.select_column('test', 'Store').unique()

    if from_saved_model:
        load_from = (model_save_dir if from_saved_model is True
                     else from_saved_model)
        model = get_saved_xgb_model(load_from)
    else:
        reduced_to = "Reduced to {0}"

        # Filter 1: changepoint cut.
        logger.info("Dropping store data before changepoint.")
        train_rows = remove_before_changepoint(data, None)
        logger.info(reduced_to.format(len(train_rows)))

        # Filter 2: only stores that appear in the test table.
        logger.info("Dropping stores not in test set. Initial shape")
        in_test_stores = data.select_as_coordinates(
            'train', 'Store in test_set_stores')
        train_rows = train_rows.intersection(in_test_stores)
        logger.info(reduced_to.format(len(train_rows)))

        # Filter 3: positive sales only.  The log is still taken over the
        # whole 'Sales' column, hence the 'divide by zero' warning capture.
        logger.debug("Log transform on sales data")
        with_sales = data.select_as_coordinates('train', 'Sales > 0')
        train_rows = train_rows.intersection(with_sales)
        with warnings_to_log('divide by zero'):
            log_sales = np.log(data.select('train', 'columns = Sales'))
            data.put('train_logsales', log_sales, data_columns=True)
        logger.info(reduced_to.format(len(train_rows)))

        logger.info("Running xgboost training")
        train_features = DataFromHDF(
            data_store=data,
            key='train',
            select_idx=train_rows,
            columns=xgb_features,
        )
        train_target = DataFromHDF(
            data_store=data,
            key='train_logsales',
            select_idx=train_rows,
            column='Sales',
        )
        model = XGBPredictions(
            eval_function=xgb_expm1_rmspe,
            params=xparams,
            nrounds=3000,
        )
        model.fit(train_features, train_target)

        if model_save_dir:
            model.save_model(model_save_dir)

    # Test-set predictions: one predict call over the whole 'test' table.
    logger.info("xgboost predictions on test set")
    test_features = DataFromHDF(
        data_store=data, key='test', columns=xgb_features)
    test_destination = DataFromHDF(
        data_store=output, key='test/xgb', data_columns=True)
    test_predictions = model.predict(test_features)
    test_destination.put(test_predictions)

    if not predict_train:
        return

    # Training-set predictions: only rows of test-set stores, in chunks.
    logger.info("xgboost predictions on training set")
    train_destination = DataFromHDF(
        data_store=output, key='train/xgb', data_columns=True)
    rows_to_predict = data.select_as_coordinates(
        'train', 'Store in test_set_stores')
    rows_features = DataFromHDF(
        data_store=data,
        key='train',
        select_idx=rows_to_predict,
        columns=xgb_features,
    )
    predict_in_chunks(model, rows_features, train_destination)