Example #1
def main(cutoff_date):
    s_ut.my_print('loading to sup.dim_cx_ticket_forecast the forecast with cutoff date ' + str(cutoff_date))
    t_file = os.path.expanduser('~/Forecasts/par/' + 'table_output_' + str(cutoff_date) + '.par')
    s_ut.my_print('table file: ' + str(t_file))

    if os.path.isfile(t_file):
        df = p_ut.read_df(t_file)
        p_ut.set_week_start(df, tcol='ds')  # week_starting patch

        # week_starting patch
        df_cols_ = df.columns
        if 'ds_week_ending' in df_cols_ and 'ds_week_starting' not in df_cols_:
            df['ds_week_ending'] = pd.to_datetime(df['ds_week_ending'])
            df['ds_week_starting'] = df['ds_week_ending'] - pd.to_timedelta(6, unit='D')

        s_ut.my_print('data file: ' + str(t_file) + ' rows: ' + str(len(df)) + ' to table')
        partition = {'ds': str(cutoff_date)}
        table = 'sup.dim_cx_ticket_forecast'
        ap.hive.push(df, table=table, if_exists='replace', partition=partition,
                     table_props={'abb_retention_days': '-1', 'abb_retention_days_reason': 'fact table. No pii'}
                     )
        return 0
    else:
        s_ut.my_print('ERROR: failed to load: file ' + t_file + ' is missing')
        return -1
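# Note: the "week_starting patch" above recurs in most of the examples below.
# A minimal sketch of the same logic as a shared helper (the name
# ensure_week_starting is hypothetical, not part of the original p_ut module):
import pandas as pd

def ensure_week_starting(df):
    # derive ds_week_starting from ds_week_ending: a week ending on a given
    # day starts six days earlier
    if 'ds_week_ending' in df.columns and 'ds_week_starting' not in df.columns:
        df['ds_week_ending'] = pd.to_datetime(df['ds_week_ending'])
        df['ds_week_starting'] = df['ds_week_ending'] - pd.to_timedelta(6, unit='D')
    return df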
Example #2
def table_load(dr_, cutoff_date_, m_adj=1.0):
    # load the adjusted data to the table
    # read table files '~/Forecasts/par/' + 'table_output_'
    gcols = [
        'dim_business_unit', 'dim_language', 'dim_tier', 'dim_channel',
        'time_interval'
    ]
    t_list, max_ds_ = list(), None
    for d_ in dr_:
        fname_ = os.path.expanduser('~/Forecasts/par/table_output_' +
                                    str(d_.date()) + '.par')
        s_ut.my_print('rolling date: ' + str(d_.date()) + ' fname: ' +
                      str(fname_))
        if os.path.exists(fname_):
            fx = p_ut.read_df(fname_)
            p_ut.set_week_start(
                fx, tcol='fcst_date_inv_ending')  # week_starting patch

            # week_starting patch
            df_cols__ = fx.columns
            if 'ds_week_ending' in df_cols__ and 'ds_week_starting' not in df_cols__:
                fx['ds_week_ending'] = pd.to_datetime(fx['ds_week_ending'])
                fx['ds_week_starting'] = fx[
                    'ds_week_ending'] - pd.to_timedelta(6, unit='D')

            fv = process_w_df(fx, cutoff_date_, 'fcst_date_inv_ending',
                              gcols + ['run_date_inv_ending'])
            fv_max = fv['fcst_date_inv_ending'].max()
            max_ds_ = fv_max if max_ds_ is None else min(max_ds_, fv_max)
            t_list.append(fv)
    tdf = pd.concat(t_list, axis=0)
    t_fdf = tdf.groupby(gcols + ['fcst_date_inv_ending']).apply(
        lambda x: x['ticket_count'].mean()).reset_index()
    t_fdf.rename(columns={0: 'ticket_count'}, inplace=True)
    avg_tdf = t_fdf[t_fdf['fcst_date_inv_ending'] <= max_ds_].copy()
    avg_tdf['run_date_inv_ending'] = str(cutoff_date_.date())
    avg_tdf.reset_index(inplace=True)
    avg_tdf.rename(columns={'index': 'fcst_horizon'}, inplace=True)
    avg_tdf['fcst_date_inv_ending'] = avg_tdf[
        'fcst_date_inv_ending'].dt.date.astype(str)
    avg_tdf['ticket_count'] *= m_adj
    print('******* saving data to load to sup.dim_cx_ticket_forecast *******')
    p_ut.save_df(avg_tdf, '~/my_tmp/tab_data_' + str(cutoff_date_.date()))
    print('---------------- SKIPPING TABLE ---------------------')
    ret = -1
    # ret = t2t.to_table(avg_tdf, str(cutoff_date_.date()), 'sup.dim_cx_ticket_forecast')         # 'josep.dim_ticket_facst_test    # 'sup.dim_cx_ticket_forecast'
    if ret == -1:
        s_ut.my_print('ERROR: table push failed')
    return ret
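# Note: the averaging step in table_load takes the mean ticket_count across the
# rolling forecast runs and truncates at the earliest horizon covered by every
# run (max_ds_). A toy sketch of that idea on hypothetical data:
import pandas as pd

runs = pd.DataFrame({
    'run': ['a', 'a', 'a', 'b', 'b'],
    'fcst_date_inv_ending': pd.to_datetime(['2020-01-04', '2020-01-11',
                                            '2020-01-18', '2020-01-04',
                                            '2020-01-11']),
    'ticket_count': [100.0, 110.0, 120.0, 90.0, 130.0],
})
max_ds = runs.groupby('run')['fcst_date_inv_ending'].max().min()  # earliest common horizon
avg = runs.groupby('fcst_date_inv_ending')['ticket_count'].mean().reset_index()
avg = avg[avg['fcst_date_inv_ending'] <= max_ds]  # drop dates not covered by all runs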
Example #3
def get_fcast_(ts_name, cutoff_date, e_date):
    # get forecast
    froot = '~/my_tmp/fbp/'
    fname = froot + 'lang_fcast_' + ts_name + '_' + str(cutoff_date.date())
    fcast_df = p_ut.read_df(fname)
    if fcast_df is None:
        s_ut.my_print('ERROR: no forecasts for ' + str(ts_name) +
                      ' and cutoff date ' + str(cutoff_date.date()))
        sys.exit()
    elif fcast_df['ds'].max() <= e_date:
        s_ut.my_print('ERROR: no forecasts for ' + str(ts_name) +
                      ' and cutoff date ' + str(cutoff_date.date()) +
                      ' and horizon ' + str(e_date.date()))
        sys.exit()
    else:
        p_ut.set_week_start(fcast_df, tcol='ds')  # week_starting patch
        return fcast_df
Example #4
def get_cfg_data(ts_name, cfg_cols, p_col):
    # read all the cfgs and set the cfg_idx
    t_name = 'sup.fct_cx_forecast_config'
    qry = 'select * from ' + t_name + ';'
    q_file = '/tmp/read_cfg_' + ts_name + '.hql'
    with open(q_file, 'w') as f:
        f.write(qry)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' in query file: ' + q_file)
    fout = None
    ret = hql.run_hql((q_file, q_file), fout)
    if ret == -1:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: Query failed. No configs found')
        sys.exit()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' fcast cfg file: ' + ret)
    cfg_df = p_ut.read_df(ret, sep='\t')
    if cfg_df is None or len(cfg_df) == 0:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: no data for query: ' + str(qry))
        sys.exit()
    p_ut.set_week_start(cfg_df, tcol='ds')  # week_starting patch
    dcol = {
        x: x.replace(t_name.split('.')[-1] + '.', '')
        for x in cfg_df.columns
    }
    cfg_df.rename(columns=dcol, inplace=True)
    cfg_df['cutoff'] = pd.to_datetime(cfg_df['cutoff'])
    cfg_df = cfg_df[(cfg_df['ts_name'] == ts_name)]
    cfg_df = cfg_df[cfg_df[p_col] > 0.0].copy()
    cfg_df.fillna('None', inplace=True)
    cfg_df['cfg_str'] = cfg_df.apply(
        lambda x: json.dumps(x[cfg_cols].to_dict()), axis=1)
    z = cfg_df['cfg_str'].drop_duplicates()
    zf = pd.DataFrame(z)
    zf.reset_index(inplace=True, drop=True)
    zf.reset_index(inplace=True)
    zf.columns = ['cfg_idx', 'cfg_str']
    df = cfg_df.merge(zf, on=['cfg_str'], how='left')
    df['language'].replace(['Mandarin_Offshore', 'Mandarin_Onshore'],
                           'Mandarin',
                           inplace=True)  # Mandarin needs to be fixed later
    df.drop_duplicates(inplace=True)
    p_ut.save_df(df, '~/my_tmp/rk_df_' + ts_name)
    # df = p_ut.read_df('~/my_tmp/rk_df_' + ts_name)
    return df
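# Note: get_cfg_data assigns a dense cfg_idx by serializing each config row to
# a JSON string, de-duplicating, and merging the index back. A self-contained
# sketch of that pattern on hypothetical configs:
import json
import pandas as pd

cfg_df = pd.DataFrame({'growth': ['linear', 'logistic', 'linear'],
                       'y_mode': ['add', 'add', 'add']})
cfg_cols = ['growth', 'y_mode']
cfg_df['cfg_str'] = cfg_df.apply(lambda x: json.dumps(x[cfg_cols].to_dict()), axis=1)
zf = (cfg_df['cfg_str'].drop_duplicates().reset_index(drop=True)
      .rename_axis('cfg_idx').reset_index())  # columns: cfg_idx, cfg_str
out = cfg_df.merge(zf, on='cfg_str', how='left')  # rows 0 and 2 share cfg_idx 0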
Example #5
def get_actuals_(ts_name, ts_cfg, e_date):
    # get actuals
    froot = ts_cfg['data_path'].split('/')
    a_dir = os.path.expanduser('/'.join(froot[:-1])) + '/'
    fname = froot[-1]
    actuals_df = None
    for f in os.listdir(a_dir):
        if fname in f:
            s_ut.my_print('actuals file: ' + a_dir + str(f.split('.')[0]))
            actuals_df = p_ut.read_df(a_dir + f.split('.')[0])
            if actuals_df is not None and actuals_df['ds'].max() >= e_date:
                break
    if actuals_df is None:
        s_ut.my_print('ERROR: no actuals for ' + ts_name + ' and horizon ' +
                      str(e_date.date()))
        sys.exit()
    else:
        p_ut.set_week_start(actuals_df, tcol='ds')  # week_starting patch
        return actuals_df
Example #6
def get_fcast_cfg(ts_name, cutoff_date):
    fdir = '~/my_tmp/cfg_sel/'
    fperf = os.path.expanduser(fdir + 'cfg_sel_' + ts_name + '_' + cutoff_date)
    fidx = os.path.expanduser(fdir + 'cfg_idx_' + ts_name + '_' + cutoff_date)
    df_cfg = p_ut.read_df(fidx)
    p_ut.set_week_start(df_cfg, tcol='ds')  # week_starting patch

    dfp = p_ut.read_df(fperf)
    p_ut.set_week_start(dfp, tcol='ds')  # week_starting patch
    f_list = list()
    for l, f in dfp.groupby('language'):
        tf = f.nsmallest(n=1, columns=['f_err'])
        cfg_list = list(tf.loc[tf.index[0], 'cfg_idx'][0])
        print(l)
        print(cfg_list)
        fi = df_cfg[(df_cfg['language'] == l)
                    & (df_cfg['cfg_idx'].isin(cfg_list))].copy()
        fi.drop('f_err', axis=1, inplace=True)
        fi.drop_duplicates(inplace=True)
        print(fi)
        f_list.append(fi)
    return pd.concat(f_list) if len(f_list) > 0 else None
Example #7
def get_fcast(cutoff_date_, froot, months=3):
    # get the fcast issued <months> ago
    f_month = 1 + (cutoff_date_.month - months) % 12  # fcast issue month
    yr = cutoff_date_.year if f_month < cutoff_date_.month else cutoff_date_.year - 1
    dm = pd.to_datetime(str(yr) + '-' + str(f_month) +
                        '-01')  # 1st day of issue month
    wd = dm.weekday()
    fcast_sat = dm - pd.to_timedelta(
        wd + 2, unit='D') if wd < 5 else dm - pd.to_timedelta(wd - 5, unit='D')
    fcast_f = froot + str(fcast_sat.date())
    try:
        fdf = p_ut.read_df(os.path.expanduser(fcast_f))
    except OSError:
        s_ut.my_print('file not found: ' + fcast_f)
        return None
    if fdf is None:
        return None
    else:
        p_ut.set_week_start(fdf, tcol='ds')  # week_starting patch

        fdf.rename(columns={'ticket_count': 'forecasted_count'}, inplace=True)
        s_ut.my_print('getting forecast from ' + fcast_f)
        return fdf
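# Note: the fcast_sat expression in get_fcast finds the Saturday on or before
# the first of the issue month, using Python's Monday=0 weekday convention.
# A standalone sketch of that date arithmetic, with a quick sanity check:
import pandas as pd

def saturday_on_or_before(d):
    # Monday=0 ... Friday=4, Saturday=5, Sunday=6
    wd = d.weekday()
    return d - pd.to_timedelta(wd + 2 if wd < 5 else wd - 5, unit='D')

# 2020-01-01 was a Wednesday (wd=2), so the result is Saturday 2019-12-28
assert saturday_on_or_before(pd.Timestamp('2020-01-01')) == pd.Timestamp('2019-12-28')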
Example #8
def prepare_regs(r_name, rcfg, cutoff_date, fcast_days, init_date):
    s_ut.my_print('pid: ' + str(os.getpid()) + ' preparing regressor ' +
                  str(r_name))
    r_col_dict, key_cols = rcfg['r_col'], rcfg.get('key_cols', None)

    # regressors: set deterministic indicators
    if r_name == 'peak':  # peak season indicator. No clean up, imputation or forecast
        r_col = list(r_col_dict.keys())[0]  # peaks
        df = pd.DataFrame({
            'ds':
            pd.date_range(start=pd.to_datetime(init_date),
                          end=pd.to_datetime(cutoff_date) +
                          pd.to_timedelta(fcast_days, unit='D'),
                          freq='D')
        })
        df[r_col] = df['ds'].apply(
            lambda x: 1 if x.month_name() in ['July', 'August'] else 0)
        regressors.IndicatorRegressor('peak',
                                      'peak',
                                      'ds',
                                      init_date,
                                      cutoff_date, ['July', 'August'],
                                      fcast_days,
                                      dim_cols=None)
        return df

    # other regressors: clean up, imputation and forecast (later)
    r_file = d_proc.get_data_file(rcfg['data_path'], cutoff_date)
    if r_file is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: date ' +
                      str(cutoff_date.date()) + ' has no data for regressor ' +
                      r_name)
        return None
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' found data file for regressor ' + r_name +
                      ' and date ' + str(cutoff_date.date()) + ': ' + r_file)

    rdf = p_ut.read_df(r_file)
    p_ut.set_week_start(rdf, tcol='ds')  # week_starting patch
    rdf = rdf[(rdf['ds'] >= pd.to_datetime(init_date))
              & (rdf['ds'] <= pd.to_datetime(cutoff_date))].copy()

    if key_cols is not None:  # get only relevant data
        for c, v in key_cols.items():
            rdf = rdf[rdf[c] == v]

    rdf['ceiling'] = rcfg.get('ceiling', 1)
    rdf['floor'] = rcfg.get('floor', 0)

    if r_name == 'contact-rate':
        if len(rdf) > 0:
            dim_cols = 'language' if 'language' in rdf.columns else None
            regressors.Regressor('contact-rate',
                                 'contact_rate',
                                 'ds',
                                 rdf,
                                 rcfg,
                                 init_date,
                                 cutoff_date,
                                 fcast_days,
                                 dim_cols=dim_cols)
        if 'language' in rdf.columns:
            return rdf[['ds', 'language', 'contact_rate', 'ceiling', 'floor']]
        else:
            return rdf[['ds', 'contact_rate', 'ceiling', 'floor']]
    elif r_name == 'tenure':
        if len(rdf) > 0:
            regressors.Regressor('tenure',
                                 'tenure_days',
                                 'ds',
                                 rdf,
                                 rcfg,
                                 init_date,
                                 cutoff_date,
                                 fcast_days,
                                 dim_cols=['language'])
        return rdf[['ds', 'language', 'tenure_days']]
    elif r_name == 'bookings' or r_name == 'checkins':
        if len(rdf) > 0:
            regressors.Regressor(r_name,
                                 r_name[:-1] + '_count',
                                 'ds',
                                 rdf,
                                 rcfg,
                                 init_date,
                                 cutoff_date,
                                 fcast_days,
                                 dim_cols=['language'])
        return rdf
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' WARNING: unknown regressor: ' + str(r_name))
        return None
Example #9
def main(ts_name_, cutoff_date_):
    cfg_cols = ['growth', 'y_mode', 'w_mode', 'r_mode', 'xform', 'h_mode', 'training', 'do_res', 'changepoint_range']
    upr_horizon, lwr_horizon = 112, 84
    lbl = ts_name_ + '_' + cutoff_date_

    if ts_name_ == 'phone-inbound-vol':
        fname = dtp.get_data_file('~/my_tmp/cleaned/phone-vol_cleaned_', cutoff_date_)
        interaction_type = 'inbound'
    else:
        fname = dtp.get_data_file('~/my_tmp/cleaned/phone-aht_cleaned_', cutoff_date_)
        interaction_type = 'inbound' if 'inbound' in ts_name_ else 'outbound'

    # ######################################################################
    # ######################################################################
    # actuals
    s_ut.my_print('pid: ' + str(os.getpid()) + ' actuals file: ' + str(fname))
    q_df = pd.read_parquet(fname)

    # week_starting patch
    df_cols_ = q_df.columns
    if 'ds_week_ending' in df_cols_ and 'ds_week_starting' not in df_cols_:
        q_df['ds_week_ending'] = pd.to_datetime(q_df['ds_week_ending'])
        q_df['ds_week_starting'] = q_df['ds_week_ending'] - pd.to_timedelta(6, unit='D')

    q_df['ds'] = pd.to_datetime(q_df['ds'].values)
    w_col = 'y' if 'vol' in ts_name_ else 'calls'

    # daily actuals (language level)
    if ts_name_ == 'phone-inbound-vol':
        q_df = q_df.groupby(['ds', 'language']).agg({'offered': np.sum, 'accepted': np.sum, 'abandons': np.sum}).reset_index()
    else:
        q_df = q_df[q_df['interaction_type'] == interaction_type].copy()
        q_df = q_df.groupby(['ds', 'language']).agg({'calls': np.sum, 'agent_mins': np.sum}).reset_index()
    a_ddf, ctype = set_demand(q_df.copy(), 10, ts_name_)
    w_df = a_ddf[a_ddf['ds'] <= cutoff_date_].groupby('language').agg({w_col: np.sum}).reset_index()
    w_df.columns = ['language', 'weight']
    p_ut.save_df(a_ddf, '~/my_tmp/a_daily_df_' + lbl)

    # weekly level: use week starting
    if ts_name_ == 'phone-inbound-vol':
        m_df = q_df.groupby(pd.Grouper(key='ds', freq='W-SUN')).agg({'offered': np.sum, 'accepted': np.sum, 'abandons': np.sum}).reset_index()
    else:
        m_df = q_df.groupby(pd.Grouper(key='ds', freq='W-SUN')).agg({'calls': np.sum, 'agent_mins': np.sum}).reset_index()
    a_wdf_, ctype = set_demand(m_df, 10, ts_name_)

    a_wdf = a_wdf_.copy()
    horizon_date = min(pd.to_datetime(cutoff_date_) + pd.to_timedelta(upr_horizon, unit='D'), a_wdf['ds'].max())
    a_wdf['ds_week_ending'] = a_wdf['ds'] + pd.to_timedelta(6, unit='D')  # switch to week ending so that we do not have incomplete weeks at end
    a_wdf = a_wdf[(a_wdf['ds_week_ending'] <= horizon_date) & (a_wdf['ds_week_ending'] > cutoff_date_)].copy()
    a_wdf.drop('ds', axis=1, inplace=True)
    a_wdf['ts_name'] = ts_name_
    p_ut.save_df(a_wdf, '~/my_tmp/a_weekly_df_' + lbl)
    # ######################################################################
    # ######################################################################

    # ######################################################################
    # ######################################################################
    # DS forecasts: select the top fcast cfgs for each language, score them based on past performance and forecast them
    sdir = '~/my_tmp/cfg_sel/'
    df_best = p_ut.read_df(sdir + 'cfg_best_' + ts_name_ + '_' + cutoff_date_)  # best ensembles by idx
    p_ut.set_week_start(df_best, tcol='ds')  # week_starting patch

    z = df_best[['language', 'cfg_idx']].copy()
    z.set_index('language', inplace=True)
    dx = z.to_dict()['cfg_idx']
    dx = {k: list(v) for k, v in dx.items()}

    df_idx = p_ut.read_df(sdir + 'cfg_idx_' + ts_name_ + '_' + cutoff_date_)   # map cfg_idx to fcast cfg
    p_ut.set_week_start(df_idx, tcol='ds')  # week_starting patch
    df_idx = df_idx[['language', 'cfg_idx'] + cfg_cols].copy()
    df_idx.drop_duplicates(inplace=True)

    # fix None for fcasts
    for c in cfg_cols:
        df_idx[c] = df_idx[c].apply(lambda x: None if x == 'None' else x)
    df_idx['h_mode'] = df_idx['h_mode'] == 1
    df_idx['do_res'] = df_idx['do_res'] == 1
    cfg_df = pd.concat([lf[lf['cfg_idx'].isin(dx[l])] for l, lf in df_idx.groupby('language')], axis=0)

    # run the fcasts for the selected cfg's
    file_out = lfc.main(ts_name_, cutoff_date_, cfg_cols, to_db=False, df_cfg=cfg_df.copy(), is_mp=True)   # , is_fcast=False)
    if file_out is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no fcast file returned')
        sys.exit()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' +++++++++++++ completed forecasts:: file: ' + str(file_out) + ' +++++++++++++++ ')
    fdf = pd.read_parquet(file_out)

    # week_starting patch
    df_cols_ = fdf.columns
    if 'ds_week_ending' in df_cols_ and 'ds_week_starting' not in df_cols_:
        fdf['ds_week_ending'] = pd.to_datetime(fdf['ds_week_ending'])
        fdf['ds_week_starting'] = fdf['ds_week_ending'] - pd.to_timedelta(6, unit='D')

    print(fdf.head())

    # make sure fdf and perf_df have the same cfg's
    # fdf_list = list(fdf['cfg_idx'].unique())
    # perf_df = perf_df[perf_df['cfg_idx'].isin(fdf_list)].copy()
    # cfg_df = cfg_df[cfg_df['cfg_idx'].isin(fdf_list)].copy()
    # arr = [int(x * num_cfg_) for x in [1.0, 0.75, 0.5, 0.25, 0.125]]
    # arg_list = [[fdf, a_ddf, w_df, cfg_df, ts_name_, cutoff_date_, horizon_date] for k in arr if k > 1]
    # m_list = s_ut.do_mp(get_models, arg_list, is_mp=True, cpus=len(arr), do_sigkill=True)  # list of dicts
    # m_list = get_models(fdf, a_ddf, w_df, cfg_df, ts_name_, cutoff_date_, horizon_date)
    m_list = get_models(fdf, a_ddf, w_df, ts_name_, cutoff_date_, horizon_date)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' ============== main: get_models complete. appending results ==========')
    if len(m_list) > 0:
        d_out = dict()
        for dv in m_list:
            for k, fname in dv.items():
                if k not in d_out.keys():
                    d_out[k] = list()
                fz = p_ut.read_df(fname)
                p_ut.set_week_start(fz, tcol='ds')  # week_starting patch
                d_out[k].append(fz)
        return {k: pd.concat(arr, axis=0) for k, arr in d_out.items()}
    else:
        return None
Example #10
def main(argv):
    print(argv)
    time_scale = 'W'  # set to 'D' for daily ticket data
    if len(sys.argv) == 2:
        run_date = sys.argv[1]  # at least 3 days after last Saturday with actual data
        with_bu = True
        s_ut.my_print('WARNING: bu flag not set in command line. Assuming with_bu=True')
    elif len(sys.argv) == 3:
        _, run_date, bu = sys.argv  # at least 3 days after last Saturday with actual data
        with_bu = bool(int(bu))
    elif len(sys.argv) == 1:
        with_bu = True
        run_date = str(pd.to_datetime('today').date())
    else:
        print('invalid args: ' + str(sys.argv))
        sys.exit()

    cutoff_date = tm_ut.get_last_sat(run_date)  # set to last saturday

    if time_scale == 'W':
        upr_horizon, lwr_horizon = 75, None
        fcast_days = 7 * upr_horizon  # regardless of time_scale
        inc_start, inc_end = 4, 0
    else:
        upr_horizon, lwr_horizon = 75 * 7, None
        fcast_days = upr_horizon
        inc_start, inc_end = 28, 0

    fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    # get actuals
    act_df = p_ut.read_df('~/my_tmp/tix_act_df_' + str(cutoff_date.date()))
    if act_df is None:
        s_ut.my_print('ERROR: No actuals found')
        sys.exit()
    p_ut.set_week_start(act_df, tcol='ds')  # week_starting patch

    # get lang fcast
    froot = '~/my_tmp/fbp_tix_'
    fname = froot + 'lwbu_fcast_' if with_bu is True else froot + 'lnbu_fcast_'
    fcast_df = p_ut.read_df(fname + str(cutoff_date.date()))
    if fcast_df is None:
        s_ut.my_print('ERROR: No fcast found')
        sys.exit()
    p_ut.set_week_start(fcast_df, tcol='ds')  # week_starting patch

    b_cols = ['agent_sector', 'channel']
    g_cols = ['language']
    if with_bu is False:
        b_cols.append('business_unit')
        fcast_df.drop('business_unit', inplace=True, axis=1)  # all None
    else:
        g_cols.append('business_unit')
    s_ut.my_print(
        '------------------------- start biz level forecast from cutoff date '
        + str(cutoff_date.date()) + ' to forecast date ' +
        str(fcast_date.date()) + ' with business columns: ' + str(b_cols) +
        '  ------------')

    b_fcast = biz_fcast(fcast_df, act_df, g_cols, b_cols, cutoff_date,
                        time_scale, inc_start, inc_end)
    if b_fcast is not None:  # save all the fcasts for each fcast cfg
        froot = '~/my_tmp/fbp_tix_'
        fname = froot + 'wbu_b_fcast_' if with_bu is True else froot + 'nbu_b_fcast_'
        p_ut.save_df(b_fcast, fname + str(cutoff_date.date()))
    else:
        print('ERROR: no business fcast')
        sys.exit()

    # final fcast (ens_avg)
    ens_df = ens_fcast(b_fcast, act_df, cutoff_date, g_cols, b_cols)
    froot = '~/my_tmp/fbp_tix_'
    fname = froot + 'wbu_e_fcast_' if with_bu is True else froot + 'nbu_e_fcast_'
    p_ut.save_df(ens_df, fname + str(cutoff_date.date()))

    print('++++++++++++++ Error Summary ++++++++++++')
    # check for language error
    fdf = ens_df.groupby(['ds', 'language']).agg({
        'y_pred': np.sum
    }).reset_index()
    months = 3
    m_start = pd.to_datetime(
        str(cutoff_date.year) + '-' + str(cutoff_date.month) +
        '-01') + pd.DateOffset(months=months + 1)
    end_date = tm_ut.last_saturday_month(m_start)  # max date for err check
    collect_date = cutoff_date - pd.DateOffset(months=months)
    start_date = end_date - pd.to_timedelta(
        2, unit='W')  # start date for error check
    a_df, _ = t_ut.get_actuals(
        end_date, collect_date)  # actuals from collect date to end_date
    fa = t_ut.set_act(a_df, ['language'])  # clean TS for each language
    fa = fa[(fa['ds'] > start_date) & (fa['ds'] <= end_date)].copy()
    z = fa.merge(fdf, on=['ds', 'language'], how='left')
    z = z[(z['y_pred'] > 0) & (z['ticket_count'] > 0)].copy()
    z_lang = z.groupby('language').agg({
        'ticket_count': np.sum,
        'y_pred': np.sum
    }).reset_index()
    z_all = pd.DataFrame({
        'language': ['All'],
        'ticket_count': [z_lang['ticket_count'].sum()],
        'y_pred': [z_lang['y_pred'].sum()]
    })
    z_lang = pd.concat([z_all, z_lang], axis=0)
    z_lang['err'] = np.abs((z_lang['y_pred'] / z_lang['ticket_count']) - 1)
    print(z_lang)

    # t_ut.err_chk(ens_df, cutoff_date, [['language']], ycol='y_pred', months=3)
    print('DONE')
Example #11
def prepare_regs(r_name, rcfg, cutoff_date, fcast_days, int_type, init_date):
    s_ut.my_print('pid: ' + str(os.getpid()) + ' preparing regressor ' +
                  str(r_name))
    r_col_dict, key_cols = rcfg['r_col'], rcfg.get('key_cols', None)

    # regressors: set deterministic indicators
    if r_name == 'peak':  # peak season indicator. No clean up, imputation or forecast
        r_col = list(r_col_dict.keys())[0]  # peaks
        df = pd.DataFrame({
            'ds':
            pd.date_range(start=init_date,
                          end=cutoff_date +
                          pd.to_timedelta(fcast_days, unit='D'),
                          freq='D')
        })
        df[r_col] = df['ds'].apply(
            lambda x: 1 if x.month_name() in ['July', 'August'] else 0)
        return regressors.IndicatorRegressor('peak', 'peak', 'ds', init_date,
                                             cutoff_date, ['July', 'August'],
                                             fcast_days)
    else:  # other regressors
        r_file = d_proc.get_data_file(rcfg['data_path'], cutoff_date)
        if r_file is None:
            s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: date ' +
                          str(cutoff_date.date()) +
                          ' has no data for regressor ' + r_name)
            return None
        else:
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' found data file for regressor ' + r_name +
                          ' and date ' + str(cutoff_date.date()) + ': ' +
                          r_file)
            rdf = p_ut.read_df(r_file)
            p_ut.set_week_start(rdf, tcol='ds')  # week_starting patch
            rdf = rdf[(rdf['ds'] >= pd.to_datetime(init_date))
                      & (rdf['ds'] <= cutoff_date)].copy()
            if rdf['ds'].max() < pd.to_datetime(cutoff_date):
                s_ut.my_print('WARNING: ' + r_name + ' max date (' +
                              str(rdf['ds'].max().date()) +
                              ') is smaller than cutoff date (' +
                              str(cutoff_date.date()) + ')')
            elif len(rdf) > 0:
                if 'interaction_type' in rdf.columns:
                    rdf = rdf[rdf['interaction_type'] == int_type].copy()
                if r_name == 'contact-rate':
                    if rdf['contact_rate'].min() == 0.0:  # zero would mean no inbound tickets
                        zmin = rdf[rdf['contact_rate'] > 0.0]['contact_rate'].min() / 10.0
                        rdf['contact_rate'].replace(0.0, zmin, inplace=True)
                    return regressors.Regressor(
                        'contact-rate', 'contact_rate', 'ds',
                        rdf[['ds', 'language', 'contact_rate']], rcfg,
                        init_date, cutoff_date, fcast_days)
                elif r_name == 'tenure':
                    rdf = rdf.groupby(['ds', 'language']).agg({
                        'tenure_days':
                        np.sum
                    }).reset_index()
                    return regressors.Regressor('tenure', 'tenure_days', 'ds',
                                                rdf, rcfg, init_date,
                                                cutoff_date, fcast_days)
                elif r_name == 'bookings' or r_name == 'checkins':
                    return regressors.Regressor(r_name, r_name[:-1] + '_count',
                                                'ds', rdf, rcfg, init_date,
                                                cutoff_date, fcast_days)
                else:
                    s_ut.my_print('pid: ' + str(os.getpid()) +
                                  ' WARNING: unknown regressor: ' +
                                  str(r_name))
                    return None