Example #1
def fcast_idx(f_df):
    fg = pd.DataFrame(f_df['dim_cfg'].copy())
    fg.drop_duplicates(inplace=True)
    fg.reset_index(drop=True, inplace=True)
    fg.reset_index(inplace=True)
    p_ut.save_df(fg, '~/my_tmp/f_idx')
    return fg
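
A minimal, self-contained sketch of how the index frame returned by fcast_idx can be joined back onto the forecasts (toy data; the p_ut.save_df call is omitted):

import pandas as pd

f_df = pd.DataFrame({'dim_cfg': ['a', 'b', 'a', 'c'], 'yhat': [1.0, 2.0, 3.0, 4.0]})
fg = pd.DataFrame(f_df['dim_cfg'].copy()).drop_duplicates().reset_index(drop=True)
fg.reset_index(inplace=True)                       # adds an 'index' column: 0..n-1
print(f_df.merge(fg, on='dim_cfg', how='left'))    # each row now carries its integer cfg index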
Example #2
def perf_smry(perf_df, cutoff_date, time_scale, ts_name, upr, lwr):
    # print smry and save
    upr_horizon = cutoff_date + pd.to_timedelta(upr, unit=time_scale)
    lwr_horizon = cutoff_date + pd.to_timedelta(lwr, unit=time_scale)
    if perf_df is not None:
        perf_df.sort_values(by='language', inplace=True)
        perf_df.reset_index(inplace=True, drop=True)
        perf_df['ts_name'] = ts_name
        perf_df['cutoff'] = cutoff_date
        s_ut.my_print(
            '###########################  cutoff: ' + str(cutoff_date.date()) +
            ' ts_name: ' + str(ts_name) + ' performance between ' +
            str(lwr_horizon.date()) + ' (included) and ' +
            str(upr_horizon.date()) +
            ' (included)  ##########################################')
        perf_df.sort_values(by=['language', 'err'], inplace=True)
        print(perf_df.head(10))
        p_ut.save_df(
            perf_df, '~/my_tmp/fbp/lang_perf_' + ts_name + '_' +
            str(cutoff_date.date()))
    else:
        s_ut.my_print(
            'WARNING: no actuals to compute fcast errors for the horizon between ' +
            str(lwr_horizon.date()) + ' (included) and ' +
            str(upr_horizon.date()) + ' (included) for cutoff: ' +
            str(cutoff_date.date()) + ' and ts_name: ' + str(ts_name))
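
A minimal sketch of the horizon arithmetic used above, assuming a weekly time scale ('W'); both endpoints are included in the error window:

import pandas as pd

cutoff_date = pd.to_datetime('2020-02-01')
lwr, upr, time_scale = 8, 12, 'W'
lwr_horizon = cutoff_date + pd.to_timedelta(lwr, unit=time_scale)
upr_horizon = cutoff_date + pd.to_timedelta(upr, unit=time_scale)
print(lwr_horizon.date(), upr_horizon.date())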
Example #3
    def back_fcast(self, f_df):         # undo data transforms
        # >>>>>>>>>>> only yhat's will be correct in the natural scale for non-trivial transforms <<<<<<<<<<<
        if self.verbose:
            s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: only yhat (all levels) back-transforms will be correct')
        levels = ['', '_upper', '_lower']
        for lvl in levels:
            # ###################### should only inverse-xform the terms that exist (eg invert multiplicative_terms only if something is multiplicative)  ##############
            for c in ['yhat', 'trend', 'additive_terms', 'multiplicative_terms']:
                y_var = self.xform_obj.fcast_var(f_df[[c + '_upper', c + '_lower']].copy(), self.prophet_obj.interval_width)   # use the same var for upper and lower
                v = f_df[c + lvl].copy()
                f_df[c + lvl] = self.xform_obj.inverse_transform(v, y_var, lbl=c + lvl)  # only yhat's will be correct in the natural scale for non-trivial transforms

        if f_df['yhat'].isnull().sum() > 0.05 * len(f_df):   # more than 5% nulls
            s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: fcast failed: too many NaNs for cfg ' + str(self.f_cfg))
            s_ut.my_print(str(f_df['yhat'].isnull().sum()) + ' nulls on ' + str(len(f_df)) + ' rows')
            print(f_df.head(10))
            print(f_df.isnull().sum())
            p_ut.save_df(f_df, '~/my_tmp/f_df')
            return None
        else:
            f_df['yhat'].interpolate(inplace=True, limit_direction='both')
            f_df['yhat_upper'].interpolate(inplace=True, limit_direction='both')
            f_df['yhat_lower'].interpolate(inplace=True, limit_direction='both')
            if self.verbose:
                s_ut.my_print('pid: ' + str(os.getpid()) + ' back_fcast: forecast OK with cfg ' + str(self.f_cfg))
        return f_df.copy()
Example #4
    def interpolate_(self, y, yt, nan_pct=0.2):
        # y: inverse-transformed values (values in natural scale)
        # yt: pre-inverse transform (values in transformed scale)
        if y is None:
            return None
        else:
            yx = np.reshape(y, (1, -1))[0] if self.method is not None else y
        nulls = pd.Series(yx).isnull().sum()
        pct = 100.0 * np.round(nulls / len(yx), 2)
        if nulls > nan_pct * np.ceil(len(yx)):
            su.my_print('WARNING: Too many NaN to interpolate for label ' +
                        str(self.lbl) + ': ' + str(nulls) + ' out of ' +
                        str(len(yx)) + ' (' + str(pct) +
                        '%) data points and lambda ' + str(self.lmbda))
            f = pd.DataFrame({'yt': list(yt), 'yx': list(yx)})
            f['lmbda'] = self.lmbda
            p_ut.save_df(f, '~/my_tmp/interpolDF')
            return None
        elif 0 < nulls <= nan_pct * np.ceil(len(yx)):  # interpolate yhat if some NaNs
            su.my_print('WARNING: interpolating for label ' + str(self.lbl) +
                        ': ' + str(nulls) + ' NaNs out of ' + str(len(yx)) +
                        ' data points (' + str(pct) + '%)')
            st = pd.Series(yx)
            sint = st.interpolate(limit_direction='both')
            yhat = sint.values
            ys = np.reshape(yhat, (1, -1))
            return ys[0]
        else:  # all OK
            return y
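
A toy illustration of the NaN-budget guard and two-sided interpolation above (values hypothetical):

import numpy as np
import pandas as pd

y = np.array([1.0, 2.0, np.nan, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0])
nan_pct = 0.2
nulls = pd.Series(y).isnull().sum()
if 0 < nulls <= nan_pct * np.ceil(len(y)):                  # within the NaN budget: fill
    y = pd.Series(y).interpolate(limit_direction='both').values
print(y)                                                    # the NaN is filled linearly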
Example #5
def data_check(df, name):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    u_vals = [c for c in df.columns if df[c].nunique() <= 1]
    if df.isnull().sum().sum() > 0 or len(u_vals) > 0:
        p_ut.save_df(df, '~/my_tmp/f_data')
        s_ut.my_print('ERROR: invalid data for ' + str(name))
        sys.exit()
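
A hypothetical frame that would trip data_check: an inf (mapped to NaN) plus a constant column, either of which aborts the run:

import numpy as np
import pandas as pd

df = pd.DataFrame({'y': [1.0, np.inf, 3.0], 'const': [1, 1, 1]})
df.replace([np.inf, -np.inf], np.nan, inplace=True)
u_vals = [c for c in df.columns if df[c].nunique() <= 1]
print(df.isnull().sum().sum() > 0 or len(u_vals) > 0)   # True: this input is invalid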
Example #6
def get_shift(fl, cu_sun, upr, lwr):
    # extra shift to max correlation
    lg = fl.loc[fl.index[0], 'language']
    fl.drop('language', inplace=True, axis=1)
    zn = fl.copy()
    zn.set_index('ds', inplace=True)
    zs = zn[['y']].shift(364, freq='D')                                 # shift prior year data to cutoff date
    f = zn.merge(zs, left_index=True, right_index=True, how='left')
    f.rename(columns={'y_y': 'y_shifted_', 'y_x': 'y'}, inplace=True)
    ws_adj = 4                                                          # max adj shift
    lx = [_corr(f.copy(), cu_sun, v, upr) for v in range(-ws_adj, ws_adj + 1)]
    lx.sort(key=lambda x: x[0])
    rho_max, ws_opt, le = lx[-1]
    print('language: ' + lg + ' week shift: ' + str(ws_opt) + ' best corr: ' + str(rho_max) + ' len: ' + str(le))

    f['y_shifted'] = f['y_shifted_'].shift(ws_opt)
    f.drop('y_shifted_', axis=1, inplace=True)
    f.dropna(inplace=True)
    fx = lm_mdl(f, cu_sun, upr, lwr)
    if fx is not None:
        fx['opt_shift'] = ws_opt
        fx['corr'] = rho_max
        p_ut.save_df(fx, '~/my_tmp/fshift_' + lg)
        return fx
    else:
        return None
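
A minimal sketch of the lag scan in get_shift: correlate the series with a shifted copy over a small window of candidate shifts and keep the best one (toy sine series; the prior-year shift is stood in by a 7-step lag):

import numpy as np
import pandas as pd

s = pd.Series(np.sin(np.linspace(0, 12 * np.pi, 400)))
base = s.shift(7)                                  # stand-in for the prior-year series
ws_adj = 4                                         # max adjustment shift
lx = [(s.corr(base.shift(v)), v) for v in range(-ws_adj, ws_adj + 1)]
lx.sort(key=lambda x: x[0])
rho_max, ws_opt = lx[-1]
print('best shift:', ws_opt, 'corr:', round(rho_max, 3))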
Example #7
    def perf_err(self):
        if self.mdl is None:
            return {
                'ens': self.name,
                'err': np.nan,
                'n_features': len(self.features_),
                'df': self.df,
                'alpha': self.alpha,
                'l1_ratio': self.lbda
            }
        else:
            d_mdl = {
                'ens': self.name,
                'n_features': len(self.features_),
                'df': self.df,
                'alpha': self.alpha,
                'l1_ratio': self.lbda
            }

            y_test = self.y_scaler.inverse_transform(
                self.y_test)  # the quantity to be forecasted
            x_test = self.f_test[self.features_].copy()
            p_ut.save_df(x_test, '~/my_tmp/x_test')

            # actuals regression based error (bad performance)
            # ys_pred = self.mdl.predict(self.X_test)
            # y_pred = self.y_scaler.inverse_transform(ys_pred)
            # d_mdl['mdl_err'] = self._perf_err(y_test, y_pred)

            # plain avg based error
            y_avg = x_test.mean(axis=1)
            d_mdl['avg_err'] = self._perf_err(y_test, y_avg.values)

            # IC based
            for ic in ['aic', 'aicc', 'bic', 'hqic']:
                _arr = np.exp((-0.5) * np.array([
                    self.IC(y_test, self.f_test[c].values, self.df, method=ic)
                    for c in self.features_
                ]))
                weights = _arr / np.sum(_arr)
                _avg = np.average(x_test, axis=1, weights=weights)
                d_mdl[ic + '_err'] = self._perf_err(y_test, _avg)

            # Regression on adj_y_shifted
            # variables need to be centered and scaled?
            lm = l_mdl.LinearRegression(normalize=False).fit(
                x_test, self.f_test['adj_y_shifted'].values)
            y_pred = lm.predict(x_test)
            d_mdl['mse_err'] = self._perf_err(y_test, y_pred)

            # adj err: use adj_y_shifted as forecast
            d_mdl['adj_err'] = self._perf_err(
                y_test, self.f_test['adj_y_shifted'].values)
            return d_mdl
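
A sketch of the IC-based weighting above: exp(-IC/2) normalized into Akaike-style weights, then a weighted average of the candidate forecasts (toy numbers; subtracting the minimum IC is added here for numerical stability):

import numpy as np

ics = np.array([100.0, 102.0, 110.0])        # hypothetical per-model IC values
w = np.exp(-0.5 * (ics - ics.min()))
w /= w.sum()
fcasts = np.array([10.0, 12.0, 20.0])
print(np.average(fcasts, weights=w))         # IC-weighted ensemble forecast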
Example #8
def table_load(dr_, cutoff_date_, m_adj=1.0):
    # load the adjusted data to the table
    # read table files '~/Forecasts/par/' + 'table_output_'
    gcols = [
        'dim_business_unit', 'dim_language', 'dim_tier', 'dim_channel',
        'time_interval'
    ]
    t_list, max_ds_ = list(), None
    for d_ in dr_:
        fname_ = os.path.expanduser('~/Forecasts/par/table_output_' +
                                    str(d_.date())) + '.par'
        s_ut.my_print('rolling date: ' + str(d_.date()) + ' fname: ' +
                      str(fname_))
        if os.path.exists(fname_):
            fx = p_ut.read_df(fname_)
            p_ut.set_week_start(
                fx, tcol='fcst_date_inv_ending')  # week_starting patch

            # week_starting patch
            df_cols__ = fx.columns
            if 'ds_week_ending' in df_cols__ and 'ds_week_starting' not in df_cols__:
                fx['ds_week_ending'] = pd.to_datetime(fx['ds_week_ending'])
                fx['ds_week_starting'] = fx[
                    'ds_week_ending'] - pd.to_timedelta(6, unit='D')

            fv = process_w_df(fx, cutoff_date_, 'fcst_date_inv_ending',
                              gcols + ['run_date_inv_ending'])
            max_ds_ = fv['fcst_date_inv_ending'].max(
            ) if max_ds_ is None else min(max_ds_,
                                          fv['fcst_date_inv_ending'].max())
            t_list.append(fv)
    tdf = pd.concat(t_list, axis=0)
    t_fdf = tdf.groupby(gcols + ['fcst_date_inv_ending']).apply(
        lambda x: x['ticket_count'].mean()).reset_index()
    t_fdf.rename(columns={0: 'ticket_count'}, inplace=True)
    avg_tdf = t_fdf[t_fdf['fcst_date_inv_ending'] <= max_ds_].copy()
    avg_tdf['run_date_inv_ending'] = str(cutoff_date_.date())
    avg_tdf.reset_index(inplace=True)
    avg_tdf.rename(columns={'index': 'fcst_horizon'}, inplace=True)
    avg_tdf['fcst_date_inv_ending'] = avg_tdf[
        'fcst_date_inv_ending'].dt.date.astype(str)
    avg_tdf['ticket_count'] *= m_adj
    print(
        '******* saving data to load to sup.dim_cx_ticket_forecast >>>>>>>>>>>>>>'
    )
    p_ut.save_df(avg_tdf, '~/my_tmp/tab_data_' + str(cutoff_date_.date()))
    print('---------------- SKIPPING TABLE ---------------------')
    ret = -1
    # ret = t2t.to_table(avg_tdf, str(cutoff_date_.date()), 'sup.dim_cx_ticket_forecast')         # 'josep.dim_ticket_facst_test    # 'sup.dim_cx_ticket_forecast'
    if ret == -1:
        s_ut.my_print('ERROR: table push failed')
    return ret
Example #9
def merge_adj(df, adj_df, k_col, ts_cols,
              master_ts):  # keeps coherence at master_ts
    mf = pd.concat([
        set_factor(ds, kval, k_col, adj_df, ts_cols)
        for (ds, kval), f in df.groupby(['ds', k_col])
    ],
                   axis=0)
    p_ut.save_df(mf, '~/my_tmp/mf')
    fx = df.merge(mf, on=['ds', k_col], how='left')
    gx = fx.apply(row_adjust, ts_list=ts_cols, axis=1)
    gx[master_ts + '_tilde'] = gx[[c + '_tilde' for c in ts_cols]].sum(axis=1)
    return gx.copy()
Example #10
def cfg_results(ada_cfg, X_train, y_train, X_pred, X_idx, y_res, xf_obj, y_col,
                p_col):
    # best AdaBoost regressor results
    topN = ada_cfg['topN']
    obj = ada_cfg['obj']
    ab_reg = AdaBoostRegressor(n_estimators=int(ada_cfg['adb_estimators']),
                               base_estimator=DecisionTreeRegressor(
                                   max_depth=int(ada_cfg['max_depth']),
                                   min_samples_split=int(
                                       ada_cfg['min_samples_split'])),
                               loss=ada_cfg['loss'],
                               learning_rate=ada_cfg['learning_rate'])

    try:
        ab_reg.fit(X_train, y_train)
    except ValueError as e:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: could not fit: ' +
                      str(e))
        p_ut.save_df(X_train, '~/my_tmp/X_train')
        p_ut.save_df(y_train, '~/my_tmp/y_train')
        return dict()

    y_pred = ab_reg.predict(X_pred)  # predict (transformed) y_col
    lf_out = pd.concat([
        pd.DataFrame(X_idx, columns=['cfg_idx']),
        pd.DataFrame(y_pred, columns=[y_col + '_pred'])
    ],
                       axis=1)  # X_train y_train and X_test have same index
    lf_out.drop_duplicates(inplace=True)

    if y_res is not None:
        lf_out = y_res.merge(lf_out, on=['cfg_idx'], how='left')

    l_cfg = lf_out.nsmallest(
        n=topN, columns=[y_col + '_pred']
    )  # can use transformed values because transforms are monotonic increasing
    d_best = {
        'xform': xf_obj.name,
        'topN': topN,
        'obj': obj,
        'yobj': y_col,
        'yhat': l_cfg[y_col + '_pred'].mean(),
        'cfg_idx': [list(l_cfg['cfg_idx'].values)]
    }

    if y_res is not None:
        d_best['y'] = l_cfg[y_col].mean()

    if p_col in l_cfg.columns:
        d_best[p_col] = l_cfg[p_col].mean()
        d_best[p_col + '_max'] = l_cfg[p_col].max()
        d_best.update(ada_cfg)
    return d_best
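
A self-contained sketch of the AdaBoost setup above on synthetic data; the base estimator is passed positionally because scikit-learn renamed the keyword from base_estimator to estimator in 1.2:

import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 3)
y = X[:, 0] + 0.1 * rng.randn(200)
ab_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4, min_samples_split=2),
                           n_estimators=50, loss='linear', learning_rate=0.1)
ab_reg.fit(X, y)
print(ab_reg.predict(X[:3]))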
Example #11
    def prepare_data(self, data_df):  # clean input data and build holiday DF
        # gcols_ = ['ds', 'language']
        if 'language' in data_df.columns:
            data_df['language'].replace(
                ['Mandarin_Onshore', 'Mandarin_Offshore'],
                ['Mandarin', 'Mandarin'],
                inplace=True)
        else:
            data_df['language'] = 'NULL'

        dl_groups = data_df.groupby(['ds'] + self.gcols).agg(
            self.agg_dict).reset_index()
        df_dict, hf_dict = dict(), dict()
        for tpl, gf in dl_groups.groupby(self.gcols):
            if isinstance(tpl, str):
                tpl = (tpl, )
            s_ut.my_print('************* data checks and hols for group: ' +
                          str(tpl))
            mf = None if self.ycol is None else d_proc.data_check(
                gf[['ds', self.ycol]].copy(),
                self.ycol,
                'ds',
                self.cutoff_date,
                self.init_date,
                max_int=self.max_int,
                name=str(tpl),
                unit=self.time_scale)

            if mf is None:
                s_ut.my_print('WARNING: data_check failed for label ' +
                              str(tpl))
                p_ut.save_df(gf, '~/my_tmp/gf_' + str(tpl))
                continue
            else:
                lang = tpl[
                    self.lang_idx] if self.lang_idx is not None else None
                d_df = self.set_demand(mf)
                h_df = self.get_holidays(lang) if lang is not None else None
                hf_dict[lang] = h_df  # dict of hols DF by language (key)
                t_df = self.trim_outliers(d_df,
                                          h_df,
                                          self.outlier_coef, ['y'],
                                          lbl_dict=tpl)  # trim outliers
                if t_df is not None:
                    for ix in range(len(self.gcols)):  # add the grouping cols
                        t_df[self.gcols[ix]] = tpl[ix]
                    if lang not in df_dict.keys():
                        df_dict[lang] = t_df  # dict of trimmed DF by language
                    else:
                        zx = pd.concat([df_dict[lang].copy(), t_df], axis=0)
                        df_dict[lang] = zx
        return df_dict, hf_dict
Example #12
def rs_to_excel(df, pcol, ycol):          # all, all - (safety + claims), all - directly, all - (safety + claims + directly)
    def to_spreadsheet(adf, tcol, ycol_):
        # top DF
        adf_tier = pd.pivot_table(adf, values=ycol_, index=['business_unit', 'channel', 'language'], columns=[tcol], aggfunc=sum).reset_index()
        adf_tier_all = pd.pivot_table(adf, values=ycol_, index=['business_unit', 'language'], columns=[tcol], aggfunc=sum).reset_index()
        adf_tier_all['channel'] = 'all'
        output_ = pd.concat([adf_tier_all, adf_tier], sort=True)
        output_['key'] = output_['business_unit'] + '-' + output_['channel'] + '-' + output_['language']

        # spacer DF
        sf = pd.DataFrame(columns=output_.columns, index=range(3))
        sf.loc[0] = [''] * len(output_.columns)
        sf.loc[2] = [''] * len(output_.columns)
        sf.loc[1] = [c for c in output_.columns]
        sf.loc[1, 'key'] = 'FORMULAS'

        # formulas DF
        pdf_ = pd.pivot_table(adf, values=ycol_, index=['channel', 'language'], columns=[tcol], aggfunc=sum).reset_index()
        pdf_['key'] = pdf_['channel'] + '-' + pdf_['language']
        qdf = pd.DataFrame(pdf_.sum(axis=0)).transpose()
        qdf['key'] = 'All'
        fall = pd.concat([output_, sf, qdf, pdf_], axis=0, sort=True)
        fall.reset_index(inplace=True, drop=True)
        cols = fall.columns.values.tolist()
        _ = [cols.remove(c) for c in ['business_unit', 'channel', 'language', 'key']]
        cols.insert(0, 'key')
        return fall[cols]

    df[pcol] = df[pcol].dt.date.astype(str)
    output = dict()
    output['all'] = to_spreadsheet(df, pcol, ycol)
    df_nodirectly = df[df['service_tier'] != 'directly']
    output['all - directly'] = to_spreadsheet(df_nodirectly, pcol, ycol)
    df_ = df[~((df['service_tier'] == 'safety') | (df['service_tier'] == 'claims'))]
    output['all - (safety&claims)'] = to_spreadsheet(df_, pcol, ycol)
    df_ = df[~((df['service_tier'] == 'safety') | (df['service_tier'] == 'claims') | (df['service_tier'] == 'directly'))]
    output['all - (safety&claims&directly)'] = to_spreadsheet(df_, pcol, ycol)
    for tr in df['service_tier'].unique():

        if tr in ['Community Education', 'Resolutions 1']:
            print('debug: tier ' + tr + ' ycol: ' + str(ycol) + ' pcol: ' + str(pcol))
            z = df.groupby(['ds', 'service_tier']).sum().reset_index()
            zz = z[(z['ds'] >= '2020-02-01') & (z['ds'] <= '2020-04-10')]
            print(zz[zz['service_tier'] == tr])

        p_ut.save_df(df[df['service_tier'] == tr], '~/my_tmp/sh_df_' + tr)
        output[tr] = to_spreadsheet(df[df['service_tier'] == tr], pcol, ycol)
    return output
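
A minimal pivot_table sketch mirroring to_spreadsheet: values summed per business_unit/channel/language with the date column spread across the spreadsheet columns (toy data):

import pandas as pd

adf = pd.DataFrame({'business_unit': ['Homes', 'Homes'], 'channel': ['phone', 'chat'],
                    'language': ['English', 'English'],
                    'ds': ['2020-01-05', '2020-01-05'], 'y': [10, 5]})
out = pd.pivot_table(adf, values='y', index=['business_unit', 'channel', 'language'],
                     columns=['ds'], aggfunc='sum').reset_index()
out['key'] = out['business_unit'] + '-' + out['channel'] + '-' + out['language']
print(out)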
Example #13
def fcast_errors(argv):
    if len(argv) == 3:
        ts_name, run_date = argv[1:]  # at least 3 days after last Saturday with actual data
    else:
        print('invalid args: ' + str(argv))
        sys.exit()

    # data cfg
    data_cfg = FILE_PATH + '/config/data_cfg.json'
    with open(os.path.expanduser(data_cfg), 'r') as fp:
        d_cfg = json.load(fp)

    ts_cfg = d_cfg[ts_name]
    ts_cfg['name'] = ts_name
    ts_cfg['time_scale'] = time_scale
    cutoff_date = tm_ut.get_last_sat(
        run_date
    )  # set to the Saturday prior to run_date, or run_date itself if a Saturday
    e_date = cutoff_date + pd.to_timedelta(
        upr_horizon, unit='D')  # error check end date (included)
    s_date = cutoff_date + pd.to_timedelta(
        lwr_horizon, unit='D')  # error check start date (included)

    # get actuals
    actuals_df = get_actuals_(ts_name, ts_cfg, e_date)
    a_df_dict = dict()
    for w in ts_cfg['outlier_coef']:
        ts_cfg_ = copy.deepcopy(ts_cfg)
        ts_cfg_['outlier_coef'] = w
        a_df_dict[w] = prep_actuals(actuals_df, ts_cfg_, e_date)

    # get fcasts
    f_df = get_fcast_(ts_name, cutoff_date, e_date)

    # get language level errors and save err data
    e_list = list()
    for w, a_df in a_df_dict.items():
        ts_cfg_ = copy.deepcopy(ts_cfg)
        ts_cfg_['outlier_coef'] = w
        e_lang_ = fcast_errors_(a_df, f_df, e_date, s_date, ts_cfg_['ycol'])
        e_list.append(e_lang_)
    e_lang = pd.concat(e_list)
    e_lang['ts_name'] = ts_name
    e_lang['cutoff'] = cutoff_date
    froot = '~/my_tmp/fbp/'
    fname = froot + 'lang_cfgs_'
    p_ut.save_df(e_lang, fname + ts_name + '_' + str(cutoff_date.date()))
    return
Example #14
def fcast_regressors(lang, rf_, tcol, ycol, i_dt, f_days, do_fcast):
    # all regressors are forecasted using the default cfg
    s_ut.my_print('pid: ' + str(os.getpid()) +
                  ' starting regressor fcast for language ' + str(lang) +
                  ' cols: ' + str(list(rf_.columns)))
    if do_fcast.get(ycol, True) is True:
        rg = rf_[rf_['ds'] <= i_dt].copy()
        r = None
        if len(rg) > 0:
            if ycol == 'contact_rate':
                if rg[ycol].min() == 0.0:  # would mean no inbound tickets
                    zmin = rg[rg[ycol] > 0.0][ycol].min() / 10.0
                    rg[ycol].replace(0.0, zmin, inplace=True)
                fcfg = {'xform': 'logistic'}
                pdict = dict()
                for c in ['ceiling', 'floor']:
                    if c in rg.columns:
                        pdict[c] = rg.loc[rg.index[0], c]
            else:
                fcfg = None
                pdict = dict()
            rg.rename(columns={tcol: 'ds', ycol: 'y'}, inplace=True)
            f_obj = one_fcast.OneForecast(rg[['ds', 'y']],
                                          None,
                                          pdict,
                                          fcfg,
                                          f_days,
                                          time_scale='D',
                                          verbose=False)
            r = f_obj.forecast()
        if r is not None:
            r.rename(columns={'yhat': ycol}, inplace=True)
            r['language'] = lang
            if any(np.isinf(r[ycol])):
                s_ut.my_print('ERROR in reg forecast for ' + ycol +
                              ' and language ' + str(lang))
                p_ut.save_df(rg, '~/my_tmp/rg_' + lang + '_' + ycol)
                print(r.describe())
                sys.exit()
    else:  # nothing to fcast
        if rf_[tcol].max() < i_dt + pd.to_timedelta(f_days, unit='D'):
            print('WARNING: no forecast set but missing forecast values for ' +
                  str(ycol) + ' Ignoring')
            r = None
        else:
            r = rf_[rf_[tcol] <= i_dt +
                    pd.to_timedelta(f_days, unit='D')].copy()
            r['language'] = lang
    return r
Example #15
def to_table(to_db, table, cutoff_date, ts_name, if_exists, df_out):
    cu_dt = str(cutoff_date.date())
    df_out['cutoff'] = cutoff_date
    df_out['ts_name'] = ts_name
    file_out = p_ut.save_df(df_out,
                            '~/my_tmp/fcast_df_' + cu_dt + '_' + ts_name)
    if to_db is True:
        partition = {'cutoff': cu_dt, 'ts_name': ts_name}
        df_out['ds'] = df_out['ds'].dt.date.astype(str)
        df_out.drop(['cutoff', 'ts_name'], axis=1, inplace=True)
        s_ut.my_print('Loading data to ' + table + ' for partition: ' +
                      str(partition))
        try:  # presto does not work with a partition argument
            ap.hive.push(df_out,
                         table=table,
                         if_exists=if_exists,
                         partition=partition,
                         table_props={
                             'abb_retention_days': '-1',
                             'abb_retention_days_reason': 'fact table. No pii'
                         })
        except Exception:
            s_ut.my_print('ERROR: push to ' + table +
                          ' failed for partition: ' + str(partition))
            sys.exit()
    return file_out
Example #16
def main(in_df, k_col, ts_list, top_ts, adjust_date):
    if k_col in ts_list:
        s_ut.my_print('ERROR: invalid k_col: ' + str(k_col) +
                      '; it cannot be one of ' + str(ts_list) +
                      ' and must be a DF column')
        sys.exit()
    adj_df_list = fcast_adj(k_col, adjust_date)
    if len(adj_df_list) > 0:
        fout = in_df.copy()
        for adj_df in adj_df_list:
            fout = merge_adj(fout, adj_df, k_col, ts_list, top_ts)
        p_ut.save_df(fout, '~/my_tmp/fout_' + k_col)
        return fout
    else:
        s_ut.my_print('WARNING: nothing to adjust for ' + k_col +
                      ' <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        return in_df
Example #17
def get_cfg_data(ts_name, cfg_cols, p_col):
    # read all the cfgs and set the cfg_idx
    t_name = 'sup.fct_cx_forecast_config'
    qry = 'select * from ' + t_name + ';'
    q_file = '/tmp/read_cfg_' + ts_name + '.hql'
    with open(q_file, 'w') as f:
        f.write(qry)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' in query file: ' + q_file)
    fout = None
    ret = hql.run_hql((q_file, q_file), fout)
    if ret == -1:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: Query failed. No configs found')
        sys.exit()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' fcast cfg file: ' + ret)
    cfg_df = p_ut.read_df(ret, sep='\t')
    p_ut.set_week_start(cfg_df, tcol='ds')  # week_starting patch

    if cfg_df is None or len(cfg_df) == 0:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: no data for query: ' + str(qry))
        sys.exit()
    dcol = {
        x: x.replace(t_name.split('.')[-1] + '.', '')
        for x in cfg_df.columns
    }
    cfg_df.rename(columns=dcol, inplace=True)
    cfg_df['cutoff'] = pd.to_datetime(cfg_df['cutoff'])
    cfg_df = cfg_df[(cfg_df['ts_name'] == ts_name)]
    cfg_df = cfg_df[cfg_df[p_col] > 0.0].copy()
    cfg_df.fillna('None', inplace=True)
    cfg_df['cfg_str'] = cfg_df.apply(
        lambda x: json.dumps(x[cfg_cols].to_dict()), axis=1)
    z = cfg_df['cfg_str'].drop_duplicates()
    zf = pd.DataFrame(z)
    zf.reset_index(inplace=True, drop=True)
    zf.reset_index(inplace=True)
    zf.columns = ['cfg_idx', 'cfg_str']
    df = cfg_df.merge(zf, on=['cfg_str'], how='left')
    df['language'].replace(['Mandarin_Offshore', 'Mandarin_Onshore'],
                           'Mandarin',
                           inplace=True)  # Mandarin needs to be fixed later
    df.drop_duplicates(inplace=True)
    p_ut.save_df(df, '~/my_tmp/rk_df_' + ts_name)
    # df = p_ut.read_df('~/my_tmp/rk_df_' + ts_name)
    return df
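
A sketch of the cfg_str/cfg_idx construction above: serialize the config columns of each row into a JSON key, index the distinct keys, and merge the index back (toy frame, hypothetical columns):

import json
import pandas as pd

cfg_cols = ['xform', 'r_mode']
cfg_df = pd.DataFrame({'xform': ['log', 'log', 'box-cox'],
                       'r_mode': ['None', 'None', 'full'], 'err': [0.1, 0.2, 0.3]})
cfg_df['cfg_str'] = cfg_df.apply(lambda x: json.dumps(x[cfg_cols].to_dict()), axis=1)
zf = pd.DataFrame(cfg_df['cfg_str'].drop_duplicates()).reset_index(drop=True).reset_index()
zf.columns = ['cfg_idx', 'cfg_str']
print(cfg_df.merge(zf, on='cfg_str', how='left'))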
Example #18
def prepare_regressors(data_cfg,
                       _cfg,
                       d_cfg,
                       cutoff_date,
                       fcast_days,
                       init_date='2016-01-01'):
    s_ut.my_print('************* reading regressors ********************')
    reg_cfg = data_cfg.get('regressors', None)
    if reg_cfg is None:
        return None

    arg_list = [[rname, rcfg, cutoff_date, fcast_days, init_date]
                for rname, rcfg in reg_cfg.items()]
    rf_list = s_ut.do_mp(prepare_regs,
                         arg_list,
                         is_mp=True,
                         cpus=None,
                         do_sigkill=True)
    arg_list, rcol_list = fcast_prep(rf_list, reg_cfg, cutoff_date, fcast_days,
                                     pd.to_datetime(init_date))
    r_list = s_ut.do_mp(fcast_regressors,
                        arg_list,
                        is_mp=True,
                        cpus=None,
                        do_sigkill=True)
    r_list = list(filter(lambda x: x is not None,
                         r_list))  # drop all Nones if any
    reg_fdf = merge_regressors(
        r_list, rcol_list)  # merge all regressors in a single DF
    fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')
    if reg_fdf is not None:
        p_ut.save_df(reg_fdf, '~/my_tmp/reg_df')
        s_ut.my_print('final predicted regressors: fcast date: ' +
                      str(fcast_date.date()) + ' cutoff date: ' +
                      str(cutoff_date.date()) + ' fcast_days: ' +
                      str(fcast_days) + ' gap: ' + str(
                          max([
                              reg_fdf[reg_fdf['language'] == l]
                              ['ds'].diff().dt.days.max()
                              for l in reg_fdf['language'].unique()
                          ])) + ' nulls: ' +
                      str(sum([reg_fdf[c].isnull().sum() for c in rcol_list])))
    else:
        s_ut.my_print('WARNING: no regressors available')
    return reg_fdf
Example #19
    def resi_fcast_(self, f_df, y_vals):
        # forecast the residuals from self by calling OneForecast again
        # prepare: first, inverse transform actuals and forecasted actuals back to the natural scale
        y = self.xform_obj.inverse_transform(y_vals['y'], 0.0, lbl='y')                                           # inverse transformed actuals
        y_hat_x = f_df[(f_df['ds'] >= y_vals['ds'].min()) & (f_df['ds'] <= y_vals['ds'].max())].copy()             # transformed forecasted actuals
        y_var = self.xform_obj.fcast_var(y_hat_x[['yhat_upper', 'yhat_lower']], self.prophet_obj.interval_width)   # use the same var for upper and lower
        yhat = self.xform_obj.inverse_transform(y_hat_x['yhat'], y_var, lbl='yhat')                          # inverse transformed forecasted actuals
        try:
            res_df = pd.DataFrame({'ds': y_vals['ds'].values, 'y': y - yhat})                                          # residuals DF to forecast in natural scale
        except TypeError as msg:
            s_ut.my_print('WARNING: resi_fcast fails with msg: ' + str(msg))
            print(y_hat_x.head())
            print(y_vals.head())
            return None

        if res_df['y'].isnull().sum() > 0.25 * len(res_df):   # more than pct nulls
            s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: resi_fcast failed: too many NaNs in res_df: '
                          + str(res_df['y'].isnull().sum()) + ' nulls on ' + str(len(res_df)) + ' rows')
            print(res_df.head(10))
            print(res_df.isnull().sum())
            p_ut.save_df(res_df, '~/my_tmp/res_df')
            return None
        else:
            res_df['y'].interpolate(inplace=True, limit_direction='both')

            # ##################################################################################################
            # ################################### white noise test ##############################################
            # lags = min(10.0, len(y_vals) / 5.0) if self.f_cfg['y_mode'] is None else int(min(2.0 * 365 + 1.0, len(y_vals) / 5.0))
            # is_wn = st_ut.white_noise_test(res_df['y'].values, p_thres=0.05, lags=int(lags), verbose=False)                  # white noise test
            # if is_wn is False:                                                                                               # residuals are not white noise
            # ##################################################################################################
            # ##################################################################################################

            f_cfg = copy.deepcopy(self.f_cfg)                                                                                  # use current cfg but adjust xform,do_res and regs
            f_cfg['xform'] = 'yeo-johnson'                                                                                     # residuals will be negative!
            f_cfg['do_res'] = False                                                                                            # no infinite recursion!
            f_cfg['r_mode'] = None                                                                                             # regs will probably not help here
            s_ut.my_print('pid: ' + str(os.getpid()) + ' resi_fcast: start recursive call: residual forecast')
            res_obj = OneForecast(res_df.copy(), None, copy.deepcopy(self.prophet_dict), f_cfg, self.horizon, time_scale=self.time_scale, verbose=self.verbose)
            ret = res_obj.forecast()                                                                                       # residuals forecast in natural scale
            if ret is None:
                ret = [None]
            s_ut.my_print('pid: ' + str(os.getpid()) + ' resi_fcast: end recursive call: residual forecast')
            return ret[0]  # n_df
Example #20
def lang_perf(lg, f_data, a_perf, this_cutoff, upr, lwr):
    p_ut.save_df(a_perf, '~/my_tmp/a_perf_' + lg)
    p_ut.save_df(f_data, '~/my_tmp/f_data_' + lg)
    d_list = list()
    a_perf.sort_values(by='a_err', inplace=True)
    a_perf.reset_index(inplace=True, drop=True)
    cfg, b_err = a_perf.loc[a_perf.index[0], ]

    # OMP
    # reg_omp = OMP('OMP', f_data, this_cutoff, upr, lwr)
    # _ = gen_output(reg_omp, d_list)

    # lasso
    reg_lasso = LassoRegressor('lasso', f_data, this_cutoff, upr, lwr)
    lasso_mask = gen_output(reg_lasso, d_list, b_err, append=False)

    # ENET
    verbose = False
    reg_enet = EnetOpt('enet',
                       f_data,
                       this_cutoff,
                       upr,
                       lwr,
                       max_evals=100,
                       verbose=verbose)
    enet_mask = gen_output(reg_enet, d_list, b_err, append=False)

    f_mask = enet_mask if np.sum(enet_mask) < len(f_data) else lasso_mask

    # Ridge
    reg_ridge = RidgeOpt('ridge', f_data, this_cutoff, upr, lwr, f_mask,
                         reg_enet.ridge_par)
    _ = gen_output(reg_ridge, d_list, b_err, append=True)

    # 'XGBRegressor', 'AdaBoostRegressor', 'BaggingRegressor', 'GradientBoostingRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor'
    # ensembles with ENET selected features
    # for rname in ['XGBRegressor', 'AdaBoostRegressor']:
    #     for loss_type in ['rel', 'abs']:
    #         reg = EnsOpt(rname, fl, cutoff_date, upr, lwr, mask, loss_type=loss_type, max_evals=200)
    #         d_err = reg.perf_err()
    #         d_err['ens'] += '-' + loss_type
    #         d_list.append(d_err)

    return d_list
Example #21
def check_ratios(ts, a_df, f_df, is_dl, col):
    s_ut.my_print('ratio check for ' + str(ts) + ' and DL: ' + str(is_dl))
    b_df = a_df[a_df['channel'] == 'directly'].copy(
    ) if is_dl is True else a_df[a_df['channel'] != 'directly'].copy()
    a_r = get_ratios(b_df, 'ticket_count', [col]) if len(b_df) > 0 else None
    g_df = f_df[f_df['channel'] == 'directly'].copy(
    ) if is_dl is True else f_df[f_df['channel'] != 'directly'].copy()
    f_r = get_ratios(g_df, ts + '_tilde', [col]) if len(g_df) > 0 else None
    p_ut.save_df(a_df, '~/my_tmp/a_df_' + ts + '_' + str(is_dl) + '_' + col)
    p_ut.save_df(f_df, '~/my_tmp/f_df_' + ts + '_' + str(is_dl) + '_' + col)
    if a_r is not None and f_r is not None:
        raf = a_r.merge(f_r, on=[col, 'language'], how='left')
        raf.columns = [col, 'language', 'a_ratio', 'f_ratio']
        raf['diff'] = np.abs(raf['f_ratio'] - raf['a_ratio'])
        p_ut.save_df(raf, '~/my_tmp/raf_' + ts + '_' + str(is_dl) + '_' + col)
        if raf['diff'].min() > 1.0e-3:
            raf['ts_name'] = ts
            raf['is_dl'] = is_dl
            s_ut.my_print('WARNING: ' + col +
                          ' ratios off <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            print(raf[raf['diff'] > 0])
    elif (a_r is None and f_r is not None) or (a_r is not None
                                               and f_r is None):
        s_ut.my_print('ERROR: error in ratios for ' + ts)
        sys.exit()
    else:  # nothing to check
        pass
Example #22
def cfg_perf(fl, upr, lwr, cu_sun):
    fl.reset_index(inplace=True)
    lg, dim_cfg, ws = fl.loc[fl.index[0], ['language', 'dim_cfg', 'opt_shift']]
    fx = f_ok(fl, cu_sun - pd.to_timedelta(upr, unit='W'), cu_sun + pd.to_timedelta(upr, unit='W'), 'yhat')    # check fcast OK at the real cutoff date
    if fx is None:
        s_ut.my_print('WARNING: invalid cfg ' + str(dim_cfg) + ' config for cutoff ' + str(cu_sun.date()) + ' and language ' + str(lg))
        p_ut.save_df(fl, '~/my_tmp/fl_' + lg + '_' + str(dim_cfg))
        return None
    else:
        # actual performance for this cfg
        fa = fx[(fx['ds'] >= cu_sun + pd.to_timedelta(lwr, unit='W')) & (fx['ds'] <= cu_sun + pd.to_timedelta(upr, unit='W'))].copy()
        a_err = st_ut.wmape(fa[['y', 'yhat']])

        # training performance: use last year's data adjusted
        g_train = fx[(fx['ds'] >= cu_sun + pd.to_timedelta(lwr, unit='W')) & (fx['ds'] <= cu_sun + pd.to_timedelta(upr, unit='W'))].copy()
        t_err = st_ut.wmape(g_train[['adj_y_shifted', 'yhat']], y_col='adj_y_shifted') if len(g_train) > 0 else np.nan    # "training" performance for this fcast cfg
        if np.isnan(t_err):
            s_ut.my_print('WARNING: cannot get training performance of config ' + str(dim_cfg) + ' for cutoff ' + str(cu_sun.date()) + ' and language ' + str(lg))
        else:
            s_ut.my_print('OK: got training performance of config ' + str(dim_cfg) + ' for cutoff ' + str(cu_sun.date()) + ' and language ' + str(lg))
        fx['train_err'] = t_err
        fx['a_err'] = a_err
        return fx
Example #23
def lognorm_qfit(a):
    # _ts_name, _ts_key = a[3]
    _ts_name, _ts_key = a[2]
    qd, avg, std = set_fit(a)  # get quantile dict and data avg
    dict_out = _lognorm_qfit(qd, avg, std) if (
        len(qd) > 0 and min(qd.values()) > 0.0 and avg > 0.0) else None

    if dict_out is None:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' WARNING: lognorm fit failed for ts_name: ' +
                      str(_ts_name) + ' and ts_key: ' + str(_ts_key))
        _ = p_ut.save_df(
            a[0], TempUtils.tmpfile('fit_df_' + _ts_name + '_' + _ts_key))
        return None
    else:  # fill output DF
        dict_out['ts_name'] = _ts_name
        dict_out['ts_key'] = _ts_key
        dict_out['dist_name'] = 'lognorm'
        dict_out['probs'] = [[1.0]]
        dict_out['pars'] = [[dict_out['sh'][0], dict_out['sc'][0]]]
    _ = [dict_out.pop(x, None) for x in ['sc', 'sh', 'prob']]
    return pd.DataFrame(dict_out)
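
The helper _lognorm_qfit is not shown; a hedged stand-in using scipy illustrates how lognormal shape and scale can be recovered from the data (synthetic sample):

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
data = rng.lognormal(mean=1.0, sigma=0.5, size=1000)
sh, loc, sc = stats.lognorm.fit(data, floc=0.0)   # shape ~ sigma, log(scale) ~ mean
print(round(sh, 2), round(np.log(sc), 2))         # roughly 0.5 and 1.0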
Example #24
def basic_perf(ts_name, cutoff_date, upr, lwr, init_date='2016-01-01', time_scale='W'):
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)

    # actuals
    actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols)
    actuals_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)
    actuals_df.drop_duplicates(inplace=True)                          # not sure why there are dups. Table problem?

    # forecasts
    f_df = get_lang_fcast(ts_cfg, cutoff_date, eq=True)
    f_df.drop_duplicates(inplace=True)                                # not sure why there are dups. Table problem?
    f_idx = fcast_idx(f_df)
    f_df = f_df.merge(f_idx, on='dim_cfg', how='left')
    f_df.drop('dim_cfg', axis=1, inplace=True)
    f_df.rename(columns={'index': 'dim_cfg'}, inplace=True)
    df = f_df.merge(actuals_df, on=['ds', 'language'], how='left')
    df.drop('cutoff', axis=1, inplace=True)
    df.drop_duplicates(inplace=True)                                  # not sure why there are dups
    p_ut.save_df(df, '~/my_tmp/df_all')

    # perf
    cu_sun = cutoff_date - pd.to_timedelta(6, unit='D')
    sf = df[['ds', 'language', 'y']].drop_duplicates()
    sf.dropna(inplace=True)
    f_shift = sf.groupby('language').apply(get_shift, cu_sun=cu_sun, upr=upr, lwr=lwr).reset_index()  # find best shift
    nf = df.merge(f_shift, on=['ds', 'language', 'y'], how='left')
    nf.set_index(['language', 'dim_cfg'], inplace=True)     # avoids drop of nuisance cols
    nf.dropna(inplace=True)
    nf.drop_duplicates(inplace=True)
    zperf = nf.groupby(['language', 'dim_cfg']).apply(cfg_perf, upr=upr, lwr=lwr, cu_sun=cu_sun).reset_index(drop=True)  # do not groupby df.index
    zperf.dropna(inplace=True)
    p_ut.save_df(zperf, '~/my_tmp/zperf_' + ts_name + '_' + str(cutoff_date.date()))
    nf.reset_index(inplace=True)
    p_ut.save_df(nf, '~/my_tmp/nf')
    fout = nf.merge(zperf, on=list(nf.columns), how='left')
    return actuals_df, fout
Example #25
            np.sum
        }).reset_index()
        g_df['cx_yhat'] = g_df['w_ib_aht'] / g_df[
            'IB Offered'] if 'inbound' in ts_name else g_df['w_ob_aht'] / g_df[
                'OB Offered']
    else:
        print('invalid ts_name: ' + str(ts_name))

    # language level errors weekly
    wf = g_df.merge(a_df, on=['ds_week_ending', 'language'], how='inner')
    wf['cx_err'] = 2.0 * np.abs(wf['cx_yhat'] - wf['y']) / (wf['cx_yhat'] +
                                                            wf['y'])
    wf['ts_name'] = ts_name
    wf['cutoff_date'] = cutoff_date
    wf['horizon_date'] = str(horizon_date.date())
    p_ut.save_df(wf,
                 '~/my_tmp/cx_fcast_weekly_err_' + ts_name + '_' + cutoff_date)

    # summary by language
    wf['x'] = wf['cx_err'] * wf[w_col]
    lf = wf.groupby('language').agg({'x': np.sum, w_col: np.sum}).reset_index()
    lf['cx_err'] = lf['x'] / lf[w_col]
    all_err = (lf[w_col] * lf['cx_err']).sum() / lf[w_col].sum()
    adf = pd.DataFrame({'language': ['All'], 'cx_err': [all_err]})
    lf = lf[['language', 'cx_err']].copy()
    fdf = pd.concat([lf, adf], axis=0)
    fdf['ts_name'] = ts_name
    fdf['cutoff_date'] = cutoff_date
    fdf['horizon_date'] = str(horizon_date.date())
    p_ut.save_df(fdf, '~/my_tmp/cx_fcast_err_' + ts_name + '_' + cutoff_date)
    print('horizon date: ' + str(horizon_date))
    print(fdf)
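
The cx_err above is a symmetric MAPE, 2|yhat - y| / (yhat + y); a quick numeric check with toy values:

import numpy as np

y = np.array([100.0, 200.0])
yhat = np.array([110.0, 180.0])
print(2.0 * np.abs(yhat - y) / (yhat + y))   # [0.0952..., 0.1052...]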
Example #26
def main(argv):
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    upr = 12
    lwr = 8
    evals = 50
    by_lang = False
    # ###########################
    # ###########################
    # ###########################
    print(argv)
    if len(argv[1:]) == 1:
        ts_name = argv[-1]
        cutoff_date = pd.to_datetime('today')
        to_table = False
    elif len(argv[1:]) == 2:
        ts_name, cutoff_date = argv[1:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = False
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
                str(argv))
            sys.exit()
    elif len(argv[1:]) == 3:
        ts_name, cutoff_date, to_table = argv[1:]
        try:
            cutoff_date = pd.to_datetime(cutoff_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
                str(argv))
            sys.exit()
    else:
        s_ut.my_print(
            'ERROR: invalid arguments (ts_name, cutoff_date, to_table): ' +
            str(argv))
        sys.exit()
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)

    # actuals
    actuals_df = dp.ts_actuals(ts_name, ts_cfg, cols)
    actuals_df.rename(columns={ts_cfg['ycol']: 'y'}, inplace=True)

    # forecasts
    f_df = fp.get_lang_fcast(ts_cfg, cutoff_date)
    fcast_date = cutoff_date + pd.to_timedelta(upr, unit=time_scale)

    perf_list = list()
    for xens in [
            'XGBRegressor', 'AdaBoostRegressor', 'BaggingRegressor',
            'GradientBoostingRegressor', 'RandomForestRegressor',
            'ExtraTreesRegressor', 'lasso'
    ]:
        fcast_df = ep.make_fcast(ts_name,
                                 f_df,
                                 actuals_df,
                                 cutoff_date,
                                 fcast_date,
                                 xens,
                                 evals,
                                 by_lang, (lwr, upr),
                                 lwr=lwr,
                                 upr=upr)
        perf_df = perf.fcast_perf(fcast_df, actuals_df, cutoff_date, lwr, upr,
                                  time_scale, xens)
        if perf_df is None:
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' WARNING: forecast performance detail failed for ' +
                          ts_name + ' ,cutoff date ' +
                          str(cutoff_date.date()) + ' and ensemble: ' +
                          str(xens))
        else:
            perf_df['ts_name'] = ts_name
            perf_list.append(perf_df)
    if len(perf_list) > 0:
        pf = pd.concat(perf_list, axis=0)
        p_ut.save_df(
            pf, '~/my_tmp/perf/fcast_perf_detail_' + ts_name + '_' +
            str(cutoff_date.date()))
        if to_table is True:
            tab_cols = ['language', 'y', 'yhat', 'err', 'lwr', 'upr', 'ens']
            partition = {'cutoff': str(cutoff_date.date()), 'ts_name': ts_name}
            ret = hql.to_tble(pf, tab_cols,
                              'sup.cx_language_forecast_performance_detail',
                              partition)
            if ret != 0:
                s_ut.my_print(
                    'pid: ' + str(os.getpid()) +
                    ' ERROR: forecast performance detail failed for ' +
                    ts_name + ' and cutoff date ' + str(cutoff_date.date()))
                sys.exit()
        print('DONE')
    else:
        s_ut.my_print(
            'pid: ' + str(os.getpid()) +
            ' ERROR: no data for forecast performance detail for ' +
            ts_name + ' and cutoff date ' + str(cutoff_date.date()))
        sys.exit()
Example #27
def main(argv):
    print(argv)
    if len(argv) == 2:
        p_name = argv[1]
        if 'not-' in p_name:
            print('invalid series name: ' + str(argv))
            sys.exit()
    else:
        print('invalid args: ' + str(argv))
        sys.exit()

    # ###########################
    # parameters
    # ###########################
    lwr = 9
    upr = 12
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    froot = os.path.expanduser('~/my_tmp/fbp/')
    # max_int = 2 if time_scale == 'W' else (5 if time_scale == 'D' else None)
    # if max_int is None:
    #     s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: unsupported time scale: ' + str(time_scale))
    #     sys.exit()
    # ###########################
    # ###########################

    # set complementary TS
    vname = p_name.split('_')  # eg ts, bu
    if len(vname) == 2:
        b_name = vname[0]
        bu = vname[1]
        n_name = b_name + '_not-' + bu
        ts_list = [p_name, n_name, b_name]
    elif len(vname) == 1:
        ts_list = [p_name]
        b_name, n_name = None, None
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: invalid ts name: ' + p_name)
        sys.exit()

    # forecasts
    fcast_df = get_fcasts(ts_list, froot)
    p_ut.save_df(fcast_df, '~/my_tmp/fcast_df')
    if fcast_df is None:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: no fcast data for ' + p_name)
        sys.exit()
    else:
        # actuals
        a_df = get_actuals(ts_list, init_date, time_scale)
        if a_df is not None:
            ts_adf = a_df[a_df['ts_name'] == p_name].copy()
            p_ut.save_df(ts_adf, '~/my_tmp/ts_adf')
            p_list = list()
            for cu, f_cu in fcast_df.groupby('cutoff'):
                fcast_date = f_cu['ds'].max()
                cu -= pd.to_timedelta(
                    (1 + cu.weekday()) % 7,
                    unit='D')  # move cutoff to week starting Sunday

                # performance bounds
                upr_horizon = min(fcast_date,
                                  cu + pd.to_timedelta(upr, unit=time_scale))
                lwr_horizon = cu + pd.to_timedelta(lwr, unit=time_scale)
                f_cu.drop(['cutoff', 'ts_name'], axis=1, inplace=True)
                m_fcast = merge_neg(f_cu, b_name, p_name, n_name, 'yhat')
                m_fcast.drop_duplicates(inplace=True)
                perf_df = fcast_perf(m_fcast, ts_adf, lwr_horizon, upr_horizon)
                perf_df['ts_name'] = p_name
                perf_df['cutoff'] = cu
                p_list.append(perf_df)
            if len(p_list) > 0:
                perf_out = pd.concat(p_list, axis=0)
                fname = froot + 'ens_perf_'
                p_ut.save_df(perf_out, fname + p_name)
            else:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' WARNING: perf_df not available')
        else:
            s_ut.my_print('pid: ' + str(os.getpid()) +
                          ' ERROR: no actuals data for ' + p_name)
            sys.exit()
        print('DONE')
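
A small sketch of the cutoff snapping used above: move any date back to the week-starting Sunday:

import pandas as pd

cu = pd.to_datetime('2020-03-11')                      # a Wednesday
cu -= pd.to_timedelta((1 + cu.weekday()) % 7, unit='D')
print(cu.date(), cu.weekday())                         # 2020-03-08, weekday 6 (Sunday)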
Example #28
def main(argv):
    # ###########################
    # parameters
    # ###########################
    time_scale = 'W'  # forecasting time scale reset for daily ticket data
    init_date = pd.to_datetime('2016-01-01')
    froot = '~/my_tmp/fbp/'
    # ###########################
    # ###########################

    print(argv)
    if len(argv) == 2:
        ts_name = argv[-1]
        to_table = False
        run_date = pd.to_datetime('today')
    elif len(argv) == 3:
        ts_name, run_date = argv[-2:]
        try:
            run_date = pd.to_datetime(run_date)
            to_table = False
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, run_date, to_table): ' +
                str(argv))
            sys.exit()
    elif len(argv) == 4:
        ts_name, run_date, to_table = argv[1:]
        try:
            run_date = pd.to_datetime(run_date)
            to_table = bool(int(to_table))
        except ValueError:
            s_ut.my_print(
                'ERROR: invalid arguments (ts_name, run_date, to_table): ' +
                str(argv))
            sys.exit()
    else:
        s_ut.my_print(
            'ERROR: invalid arguments (ts_name, run_date, to_table): ' +
            str(argv))
        sys.exit()

    # data cfg
    cutoff_date = tm_ut.get_last_sat(
        run_date
    )  # set to the last Saturday before run_date, or run_date itself if a Saturday
    ts_cfg, cols = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    FCAST_DICT['outlier_coef'] = ts_cfg.get('outlier_coef', [3.0])

    fcast_days = ts_cfg.get('fcast_days', None)
    if fcast_days is None:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: fcast_days must be specified in data_cfg')
        sys.exit()
    else:
        fcast_date = cutoff_date + pd.to_timedelta(fcast_days, unit='D')

    if time_scale == 'W':
        fcast_date = fcast_date - pd.to_timedelta(
            1 + fcast_date.weekday(), unit='D')  # set to week starting Sunday
        cu = cutoff_date - pd.to_timedelta(
            1 + cutoff_date.weekday(), unit='D')  # set to week starting Sunday
        fcast_days = (fcast_date - cu).days  # multiple of 7
        upr_horizon = int(fcast_days / 7)  # in time scale units
    elif time_scale == 'D':
        upr_horizon = int(fcast_days)  # in time scale units
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' invalid time scale: ' +
                      str(time_scale))
        sys.exit()

    s_ut.my_print('pid: ' + str(os.getpid()) +
                  ' ------------------------ start language forecast for ' +
                  str(ts_name) + ' from cutoff date ' +
                  str(cutoff_date.date()) + ' (excluded) to forecast date ' +
                  str(fcast_date.date()) +
                  '  (included) -----------------------')

    # get actuals
    actuals_df = dp.ts_actuals(
        ts_name, ts_cfg,
        cols)  # may have data past cutoff for accuracy checking
    if actuals_df['ds'].max() < cutoff_date:
        s_ut.my_print(
            'ERROR: no actuals available for forecast from cutoff date: ' +
            str(cutoff_date.date()))
        sys.exit()
    f_actuals_df = actuals_df[actuals_df['ds'] <= cutoff_date].copy(
    )  # actuals for forecast: only use up to cutoff date

    # adjust FCAST_DICT
    if len(FCAST_DICT['do_res']) == 2:  # True, False
        FCAST_DICT['do_res'] = [
            True
        ]  # MUST overwrite: the False case is always included and otherwise we double count.
    if len(ts_cfg.get('regressors', list())) == 0:
        FCAST_DICT['r_mode'] = [None]
        reg_dict = dict()
    else:
        reg_dict = regs.ens_fcast(
            ts_name, ts_cfg['regressors'], cutoff_date, time_scale, fcast_days,
            init_date,
            f_actuals_df)  # stored by cutoff date on last Sat of the month

    # update init_date
    init_date = max([f_actuals_df['ds'].min()] +
                    [f['ds'].min() for f in reg_dict.values()])
    f_actuals_df = f_actuals_df[f_actuals_df['ds'] >= init_date].copy()
    reg_dict = {
        lx: f[f['ds'] >= init_date].copy()
        for lx, f in reg_dict.items()
    }
    ts_cfg['init_date'] = init_date

    # set the list of fcast cfgs
    tlist = get_f_cfg(FCAST_DICT, cutoff_date, init_date,
                      time_scale)  # list of fcast cfg's
    fix_pars = [
        f_actuals_df, ts_name, reg_dict, fcast_date, cutoff_date, ts_cfg,
        time_scale, upr_horizon
    ]
    arg_list = [
        fix_pars + [tlist[ix]] for ix in range(len(tlist))
    ]  # 2 fcasts are done per input cfg (do_res = true and do_res = false)
    n_fcfg = 2 * len(arg_list)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' ++++++++ there are ' +
                  str(n_fcfg) + ' fcast configs per language **********')

    # ###############################################################################
    # ###############################################################################
    # ###############################################################################
    if is_test:
        df_list_ = s_ut.do_mp(fcast_lang,
                              arg_list,
                              is_mp=False,
                              cpus=None,
                              do_sigkill=True)
    else:
        df_list_ = s_ut.do_mp(fcast_lang,
                              arg_list,
                              is_mp=True,
                              cpus=None,
                              do_sigkill=True)
    # ###############################################################################
    # ###############################################################################
    # ###############################################################################

    # join all the fcasted data into a flat list
    df_list = [f for f in df_list_ if f is not None]
    if len(df_list) > 0:
        ylist, alist = list(), list()
        for fl in df_list:
            if fl is not None:
                fl = set_cfg(fl.copy(), CFG_COLS)
                ylist.append(fl[[
                    'ds', 'language', 'yhat', 'ts_name', 'cutoff', 'dim_cfg',
                    'fcast_date'
                ]].copy())
                alist.append(fl)

        # save basic fcast data
        fcast_df = pd.concat(
            ylist, axis=0)  # now all the list elements have the same columns
        fcast_df.reset_index(inplace=True, drop=True)

        ok_cfg = fcast_df['dim_cfg'].unique()
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ' + str(len(ok_cfg)) +
                      ' forecasts cfgs available for ' + str(ts_name) +
                      ' from cutoff date ' + str(cutoff_date.date()) +
                      ' (excluded) to forecast date ' +
                      str(fcast_date.date()) +
                      '  (included) -----------------------')
        # fcast_df = fcast_df[fcast_df['dim_cfg'].isin(ok_cfg)].copy()
        fname = froot + 'lang_fcast_'
        p_ut.save_df(fcast_df, fname + ts_name + '_' + str(cutoff_date.date()))
        if to_table is True:
            tab_cols = ['ds', 'language', 'dim_cfg', 'yhat']
            partition = {
                'cutoff': str(cutoff_date.date()),
                'ts_name': ts_cfg['ts_key']
            }
            ret = hql.to_tble(fcast_df, tab_cols, 'sup.cx_language_forecast',
                              partition)
            if ret != 0:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' ERROR: no forecasts loaded to table for ' +
                              str(ts_cfg['ts_key']) + ' and cutoff date ' +
                              str(cutoff_date.date()))
                sys.exit()

        # save all fcast data  (y_upr, y_lwr, ...)
        all_df = pd.concat(
            alist, axis=0)  # now all the list elements have the same columns
        all_cols = [c for c in all_df.columns if c not in CFG_COLS]  # columns are unique; a plain comprehension keeps their order
        all_df.reset_index(inplace=True, drop=True)
        all_df = all_df[all_cols].copy()
        all_df = all_df[all_df['dim_cfg'].isin(ok_cfg)].copy()
        fname = froot + 'fcast_all_'
        p_ut.save_df(all_df, fname + ts_name + '_' + str(cutoff_date.date()))
        if to_table is True:
            all_df.drop(['cutoff', 'ts_name'], axis=1, inplace=True)
            mf = pd.melt(all_df,
                         id_vars=['ds', 'language', 'dim_cfg'],
                         var_name='key',
                         value_name='value')
            mf.dropna(subset=['value'], inplace=True)
            mf = mf[mf['value'] != 0.0].copy()
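            # mf is now in long format: one row per (ds, language, dim_cfg, key), where
            # key names the melted column (e.g. yhat, yhat_upper, yhat_lower) and zero/NaN values are dropped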
            partition = {
                'cutoff': str(cutoff_date.date()),
                'ts_name': ts_cfg['ts_key']
            }
            ret = hql.to_tble(mf, list(mf.columns),
                              'sup.cx_language_forecast_detail', partition)
            if ret != 0:
                s_ut.my_print('pid: ' + str(os.getpid()) +
                              ' ERROR: no forecasts loaded to table for ' +
                              str(ts_cfg['ts_key']) + ' and cutoff date ' +
                              str(cutoff_date.date()))
                sys.exit()
        print('DONE')
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: no forecasts available for ' +
                      str(ts_cfg['ts_key']) + ' from cutoff date ' +
                      str(cutoff_date.date()) +
                      ' (excluded) to forecast date ' +
                      str(fcast_date.date()) +
                      '  (included) -----------------------')
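
Note: s_ut.do_mp above is project-internal and its source is not shown here. A minimal sketch of such a dispatcher, assuming it maps a worker over an argument list either serially (is_mp=False) or through a multiprocessing.Pool, with failed workers returning None, could look like this (the do_sigkill cleanup is omitted):

import multiprocessing as mp


def do_mp(worker, arg_list, is_mp=True, cpus=None, do_sigkill=False):
    # sketch only: serial path for tests, Pool-based path otherwise
    if not is_mp:
        return [worker(*args) for args in arg_list]
    with mp.Pool(processes=cpus) as pool:  # cpus=None defaults to os.cpu_count()
        return pool.starmap(worker, arg_list)  # each element of arg_list is unpacked as worker(*args)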
Exemplo n.º 29
0
def to_excel_(xl_file, cutoff_date_, curr_adj_obj, prev_adj_obj, target_year, lang_errs, tier_errs, lact_df):
    """
    build excel output and save it to an xls file
    :param xl_file: output file
    :param cutoff_date_: current cutoff date
    :param curr_adj_obj: current adjusted fcast obj
    :param prev_adj_obj: prior adjusted fcast obj
    :param target_year: year covered by the totals tabs
    :param lang_errs: language errs of the fcast 3 months back wrt actuals
    :param tier_errs: tier errs of the fcast 3 months back wrt actuals
    :param lact_df: language level actuals 16 weeks back
    :return: _
    """
    # prepare excel output: the xls output is adjusted; the par output is not
    fx = os.path.expanduser(xl_file)
    s_ut.my_print('xl file: ' + fx)
    xlwriter = pd.ExcelWriter(fx, engine='xlsxwriter')

    # pull out actuals date indicator
    act_df = curr_adj_obj.data[['ds', 'is_actual']].copy()
    act_df['ds'] = act_df['ds'].dt.date.astype(str)
    act_df.set_index('ds', inplace=True)
    p_ut.save_df(act_df, '~/my_tmp/act_df')
    act_dates = list(act_df[act_df['is_actual'] == 1].index)
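    # act_dates: the ds values flagged as actuals; passed to set_colors below to shade actual vs forecasted columns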

    curr_fcast = curr_adj_obj.data   # current forecast
    prev_fcast = prev_adj_obj.data   # previous forecast

    # fcast delta
    fcast_delta(xlwriter, curr_fcast, prev_fcast, 'Forecast Delta')
    for bu in ['Homes', 'Experiences', 'China']:
        c_fcast = curr_fcast[curr_fcast['business_unit'] == bu].copy()
        p_fcast = prev_fcast[prev_fcast['business_unit'] == bu].copy()
        fcast_delta(xlwriter, c_fcast, p_fcast, bu + ' Forecast Delta')

    # target year totals by language-tier-channel
    fcast_totals(xlwriter, curr_fcast, target_year, ' Totals')
    for bu in ['Homes', 'Experiences', 'China']:
        c_fcast = curr_fcast[curr_fcast['business_unit'] == bu].copy()
        # hardcoded week window bounding target_year 2020 (Sat 2019-12-28 through Sun 2021-01-03)
        c_fcast = c_fcast[(c_fcast['ds'] >= '2019-12-28') & (c_fcast['ds'] <= '2021-01-03')].copy()
        fcast_totals(xlwriter, c_fcast, target_year,  ' ' + bu + ' Totals')

    # language errors wrt actuals
    if lang_errs is not None:
        lang_errs.fillna(0, inplace=True)
        lang_errs['actual_count'] = np.round(lang_errs['actual_count'], 0)
        lang_errs['forecasted_count'] = np.round(lang_errs['forecasted_count'], 0)
        lang_errs.to_excel(xlwriter, 'Language Accuracy', index=False)
        workbook = xlwriter.book
        bold = workbook.add_format({'bold': True})
        worksheet = xlwriter.sheets['Language Accuracy']
        worksheet.set_column('A:ZZ', 20)
        worksheet.set_row(0, None, bold)
        format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'})
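        # the conditional format below uses criteria '>=' with value '""', which matches any non-blank cell in the header row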
        worksheet.conditional_format(0, 0, 0, len(lang_errs.columns) - 1, {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'})
    else:
        s_ut.my_print('No data for language accuracy')

    # tier errors wrt actuals
    if tier_errs is not None:
        tier_errs.fillna(0, inplace=True)
        tier_errs['actual_count'] = np.round(tier_errs['actual_count'], 0)
        tier_errs['forecasted_count'] = np.round(tier_errs['forecasted_count'], 0)
        tier_errs.to_excel(xlwriter, 'Tier Accuracy', index=False)
        workbook = xlwriter.book
        bold = workbook.add_format({'bold': True})
        worksheet = xlwriter.sheets['Tier Accuracy']
        worksheet.set_column('A:ZZ', 20)
        worksheet.set_row(0, None, bold)
        format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'})
        worksheet.conditional_format(0, 0, 0, len(tier_errs.columns) - 1, {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'})
    else:
        s_ut.my_print('No data for tier accuracy')

    # channel mix (not very practical to transpose dates as there are 4 values per date)
    cdf = curr_adj_obj.data[curr_adj_obj.data['ds'] > cutoff_date_].copy()
    if not cdf.empty:  # a boolean-indexed frame is never None; check for emptiness instead
        cdf['ds'] = cdf['ds'].dt.date.astype(str)
        cdf['ticket_count'] = np.round(cdf['ticket_count'].astype(float).values, 0)
        p_ut.save_df(cdf, '~/my_tmp/cdf')
        channels = cdf['channel'].unique()
        pmix_df = pd.pivot_table(cdf, index=['business_unit', 'language', 'service_tier', 'ds'],
                                 columns='channel', values='ticket_count')
        qmix = pmix_df.div(pmix_df.sum(axis=1), axis=0).reset_index()
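        # dividing each row by its row sum converts ticket counts into channel-mix fractions for that row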
        qmix.fillna(0, inplace=True)
        for c in channels:
            qmix[c] = qmix[c].apply(lambda x: np.round(100.0 * x, 1) if isinstance(x, (float, np.float64, int, np.int64)) else x)
        dcol = {c: c + '(%)' for c in channels}
        qmix.rename(columns=dcol, inplace=True)
        qmix.to_excel(xlwriter, 'Channel Mix', index=False)
        workbook = xlwriter.book
        bold = workbook.add_format({'bold': True})
        worksheet = xlwriter.sheets['Channel Mix']
        worksheet.set_column('A:ZZ', 20)
        worksheet.set_row(0, None, bold)
        format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'})
        worksheet.conditional_format(0, 0, 0, len(qmix.columns) - 1, {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'})
    else:
        s_ut.my_print('No data for channel mix')

    # language actuals
    if lact_df is not None:
        lact_df['actual_count'] = np.round(lact_df['actual_count'].astype(float).values, 0)  # language level actuals
        lact_df['ds'] = lact_df['ds'].dt.date
        lact_df.to_excel(xlwriter, 'Language Level Actuals', index=False)
        workbook = xlwriter.book
        bold = workbook.add_format({'bold': True})
        worksheet = xlwriter.sheets['Language Level Actuals']
        worksheet.set_column('A:Z', 20)
        worksheet.set_row(0, None, bold)
        format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'})
        worksheet.conditional_format(0, 0, 0, len(lact_df.columns) - 1, {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'})
    else:
        s_ut.my_print('No data for language level actuals')

    # add the rest of the tabs
    if curr_adj_obj.data is not None:
        counts_dict = rs_to_excel(curr_adj_obj.data, 'ds')
        for ky, sht in counts_dict.items():
            sht.fillna(0, inplace=True)
            sht = sht.applymap(lambda x: np.round(x, 0) if isinstance(x, (float, np.float64, int, np.int64)) else x)
            tab_name = ky.replace('/', '-')
            sht.to_excel(xlwriter, tab_name, index=False)
            set_colors(xlwriter, tab_name, sht, act_dates)
            workbook = xlwriter.book
            bold = workbook.add_format({'bold': True})
            worksheet = xlwriter.sheets[tab_name]
            worksheet.set_column('B:ZZ', 20, bold)
            worksheet.set_column('A:A', 20, bold)
    else:
        s_ut.my_print('No data for ' + 'forecasts')

    s_ut.my_print('saving xls forecast to ' + fx)
    xlwriter.save()
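
Note: the per-sheet formatting in to_excel_ repeats the same xlsxwriter calls for each tab; a small helper (hypothetical, not in the source) could consolidate them:

def format_sheet(xlwriter, sheet_name, n_cols):
    # consolidate the repeated per-sheet formatting used above (sketch)
    workbook = xlwriter.book
    bold = workbook.add_format({'bold': True})
    worksheet = xlwriter.sheets[sheet_name]
    worksheet.set_column('A:ZZ', 20)
    worksheet.set_row(0, None, bold)
    format_key = workbook.add_format({'bg_color': '#E06666', 'font_color': '#660000'})
    worksheet.conditional_format(0, 0, 0, n_cols - 1,
                                 {'type': 'cell', 'format': format_key, 'criteria': '>=', 'value': '""'})

For example, format_sheet(xlwriter, 'Tier Accuracy', len(tier_errs.columns)) would replace the seven formatting lines that follow each to_excel call.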
Exemplo n.º 30
0
    # #########################
    actuals_weeks = 12
    target_year = 2020
    init_date = pd.to_datetime('2016-01-01')
    time_scale = 'W'
    ts_name = 'ticket_count'
    # #########################

    # get actuals
    ts_cfg, _ = dp.ts_setup(ts_name, cutoff_date, init_date, time_scale)
    adf = dp.ts_actuals(ts_name, ts_cfg, ['language', 'business_unit', 'channel', 'service_tier'], drop_cols=False, use_cache=False)
    start, end = tm_ut.iso_dates(target_year - 1)
    adf = adf[adf['ds'] >= start].copy()   # use for YoY comparison; copy() avoids SettingWithCopy issues in the rename below
    adf.rename(columns={'ticket_count': 'y'}, inplace=True)
    p_ut.save_df(adf, '~/my_tmp/a_df')

    # current forecast
    s_ut.my_print('current forecast')
    cf_df = pd.concat([get_fcast(ts_name, ts_cfg, cutoff_date, bu, use_cache=False) for bu in ts_cfg['business_units']], axis=0)
    cf_df = cf_df[cf_df['ds'] > cutoff_date].copy()
    p_ut.save_df(cf_df, '~/my_tmp/cf_df')

    # target year summary
    xl_ut.year_summary(adf, cf_df, target_year, cutoff_date)

    # get 90 day old forecast (last Sat of a month)
    s_ut.my_print('3 months old forecast')
    date = pd.to_datetime(str(cutoff_date.year) + '-' + str(cutoff_date.month) + '-01')
    cu90 = tm_ut.last_saturday_month(date - pd.DateOffset(months=3))  # unit='M' timedeltas are deprecated; DateOffset is calendar-aware
    f90_df = pd.concat([get_fcast(ts_name, ts_cfg, cu90, bu, use_cache=False) for bu in ts_cfg['business_units']], axis=0)
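
Note: tm_ut.last_saturday_month is a project helper whose source is not shown in this excerpt. A plausible implementation, assuming it returns the last Saturday of the month containing the given date:

import pandas as pd


def last_saturday_month(date):
    # sketch: roll forward to the month end, then step back to the nearest Saturday
    month_end = date + pd.offsets.MonthEnd(0)   # stays put if date is already a month end
    days_back = (month_end.weekday() - 5) % 7   # weekday(): Mon=0 ... Sat=5, Sun=6
    return month_end - pd.to_timedelta(days_back, unit='D')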