def get_actuals(ts_dict, gcols, use_cache=None):
    """Fetch actuals for a time series and aggregate over ds + gcols.

    Only returns data when actuals extend up to the cutoff date.

    :param ts_dict: config dict; reads keys 'cutoff_date', 'init_date', 'name', 'ycol'
    :param gcols: extra grouping columns added to the base ds/language/y query columns
    :param use_cache: overrides module-level USE_CACHE when not None
    :return: DataFrame grouped by ['ds'] + gcols with summed values, or None when
             there are no rows after init_date or no actuals up to cutoff_date
    """
    # actuals with a max ds >= cutoff_date
    cutoff_date = ts_dict['cutoff_date']
    init_date = ts_dict['init_date']
    ts_name = ts_dict['name']
    ycol = ts_dict['ycol']
    s_ut.my_print('getting ' + ts_name + ' actuals from table')
    # resolve the cache override once so ALL hql calls honor it
    # (bug fix: get_rmax previously ignored the use_cache argument)
    uc = USE_CACHE if use_cache is None else use_cache
    r_date = hql.get_rmax(ycol, use_cache=uc)
    qcols = list(set(['ds', 'language', 'y'] + gcols))
    col_str = ','.join(qcols)
    # consistency fix: use s_ut.my_print like the rest of the file, not bare print
    s_ut.my_print('rmax: ' + str(r_date))
    qry = 'select ' + col_str + ' from sup.cx_weekly_actuals where ts_name=\'' + ycol + '\' and run_date=\'' + r_date + '\';'
    try:
        df = hql.from_tble(qry, ['ds'], use_cache=uc, renew=RENEW)
        s_ut.my_print(qry + ' completed. Got ' + str(len(df)) + ' rows')
    except Exception:  # narrowed from bare except: do not swallow SystemExit/KeyboardInterrupt
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: query: ' + qry + ' failed. No data for ts ' + ts_name)
        sys.exit()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' got actuals for ' + ts_name + ' from table ' + 'sup.cx_weekly_actuals')
    df.rename(columns={'y': ycol}, inplace=True)  # unique name needed (may mix with regressors later)
    df = df[(df['ds'] >= init_date)].copy()
    # empty-frame guard first: avoids relying on NaT comparison semantics of df['ds'].max()
    if len(df) == 0:
        return None
    if df['ds'].max() < cutoff_date:
        s_ut.my_print('ERROR: no actuals up to cutoff date for ' + ts_name)
        return None
    return df.groupby(['ds'] + gcols).sum().reset_index()
def ens_fcast(ts_name, regs, cutoff_date, time_scale, fcast_days, init_date, a_df):
    """Build per-language regressor frames for an ensemble forecast.

    For each regressor name in regs: if it is a forecastable series, pull its
    ensemble forecast from sup.cx_ens_forecast and splice it with its actuals up
    to cutoff_date; otherwise treat it as a static regressor resolved by name to
    a module-level function.  Regressor frames are merged and filtered per
    language via merge_regressors_ and selector.

    :param ts_name: name of the target time series
    :param regs: iterable of regressor names
    :param cutoff_date: forecast cutoff (date-like with .date())
    :param time_scale: time scale passed to dp.ts_setup
    :param fcast_days: forecast horizon passed to static regressor functions
    :param init_date: earliest date used in setup and merging
    :param a_df: actuals DataFrame with a 'language' column, used by selector
    :return: dict {language: selected regressor frame}, empty dict if no regressors resolved
    """
    r_list = list()
    for rname in regs:
        r_cfg, _ = dp.ts_setup(rname, cutoff_date, init_date, time_scale)
        if r_cfg is None:
            # unknown regressor name is fatal
            s_ut.my_print('ERROR: invalid regressor name: ' + rname)
            sys.exit()
        if r_cfg['do_fcast'] is True:
            # dynamic regressor: its own forecast must already exist in the ens table
            qry = 'select * from sup.cx_ens_forecast where cutoff = \'' + str(
                cutoff_date.date()) + '\' and ts_name = \'' + rname + '\';'
            rdf = hql.from_tble(qry, ['ds'], use_cache=USE_CACHE, renew=RENEW)
            if rdf is None:  # no ens fcast file found
                s_ut.my_print('ERROR: no forecast for regressor: ' + rname)
                sys.exit()
            else:
                # language column is optional in the forecast table
                cols = ['ds', 'language', 'yhat'] if 'language' in rdf.columns else ['ds', 'yhat']
                rdf = rdf[rdf['ds'] > cutoff_date][cols].copy()
                # splice actuals (<= cutoff) with forecast (> cutoff) into one series
                # NOTE(review): get_actuals can return None (e.g. actuals behind cutoff);
                # adf[...] would then raise TypeError — confirm upstream guarantees data
                adf = get_actuals(r_cfg, init_date='2016-01-01')
                adf = adf[adf['ds'] <= cutoff_date].copy()
                adf.rename(columns={r_cfg['ycol']: 'yhat'}, inplace=True)
                rdf = pd.concat([adf[cols].copy(), rdf], axis=0)
        else:  # static regressors
            s_ut.my_print(rname + ' is a static regressor')
            try:
                # the regressor name doubles as a module-level function name
                reg_func = getattr(
                    sys.modules[__name__], rname)  # function to set up the static regressor
                args = [cutoff_date, init_date, fcast_days, time_scale]
                rdf = reg_func(*args)
            except AttributeError as e:
                s_ut.my_print('pid: ' + str(os.getpid()) + ' No static regressor with name ' + rname + ': ' + str(e))
                rdf = None
            if rdf is not None:
                s_ut.my_print('pid: ' + str(os.getpid()) + ' found static regressor: ' + str(rname))
            else:
                s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: regressor ' + rname + ' not found <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        if rdf is not None and len(rdf) > 0:
            # give the series a unique column name before merging regressors
            rdf.rename(columns={'yhat': rname}, inplace=True)
            r_list.append(rdf)
    if len(r_list) > 0:
        r_dict = merge_regressors_(r_list, init_date)
        # per-language regressor selection against the target's actuals
        r_dict = {
            lx: selector(ts_name, lx, a_df[a_df['language'] == lx].copy(), rl, cutoff_date)
            for lx, rl in r_dict.items()
        }
        # drop languages where selection failed
        s_dict = {lx: fl for lx, fl in r_dict.items() if fl is not None}
        return s_dict
    else:
        return dict()
def get_ens_fcast(ts_name, ts_cfg, cutoff_date):
    """Fetch the ensemble forecast for a time series at a given cutoff date.

    :param ts_name: time series name (used for logging only)
    :param ts_cfg: config dict; reads key 'ts_key'
    :param cutoff_date: date-like with .date(); selects the forecast run
    :return: DataFrame of forecast rows, or None when the table has no data
    """
    tble = 'sup.cx_ens_forecast'
    s_ut.my_print('getting ' + ts_cfg['ts_key'] + ' ens forecast from table ' + tble + ' and cutoff date: ' + str(cutoff_date.date()))
    qry = 'select * from ' + tble + ' where cutoff = \'' + str(cutoff_date.date()) + '\' and ts_name = \'' + ts_cfg['ts_key'] + '\';'
    try:
        fcast_df = hql.from_tble(qry, ['ds', 'cutoff'], use_cache=USE_CACHE, renew=RENEW)
    except Exception:  # narrowed from bare except: do not swallow SystemExit/KeyboardInterrupt
        s_ut.my_print('ERROR: ' + qry + ' failed')
        sys.exit()
    if fcast_df is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no data for ' + ts_name + ' and cutoff date ' + str(cutoff_date.date()))
    return fcast_df
def get_ratio_fcast(ts_name, ts_cfg, cutoff_date, use_cache=None):
    """Fetch the ratio forecast for a time series at a given cutoff date.

    >>>>>>>>>>>>> used in to_excel <<<<<<<<<<<<<<<<<

    :param ts_name: time series name (used for logging only)
    :param ts_cfg: config dict; reads key 'ts_key'
    :param cutoff_date: date-like with .date(); selects the forecast run
    :param use_cache: overrides module-level USE_CACHE when not None
    :return: DataFrame of forecast rows, or None when the table has no data
    """
    tble = 'sup.cx_weekly_forecasts'
    s_ut.my_print('get_ratio_forecast: getting ' + ts_cfg['ts_key'] + ' ratio forecast from ' + tble + ' and cutoff date ' + str(cutoff_date.date()))
    qry = 'select * from ' + tble + ' where cutoff = \'' + str(cutoff_date.date()) + '\' and ts_name = \'' + ts_cfg['ts_key'] + '\';'
    try:
        uc = USE_CACHE if use_cache is None else use_cache
        fcast_df = hql.from_tble(qry, ['ds', 'cutoff'], use_cache=uc, renew=RENEW)
    except Exception:  # narrowed from bare except: do not swallow SystemExit/KeyboardInterrupt
        s_ut.my_print('ERROR: ' + qry + ' failed')
        sys.exit()
    if fcast_df is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no data for ' + ts_name + ' and cutoff date ' + str(cutoff_date.date()))
    return fcast_df
def get_ens(ts_name, cutoff_date, exclude=('2019-11-30', '2019-12-28', '2020-01-25', '2020-02-29', '2020-03-28', '2020-04-25', '2020-05-30', '2020-06-27')):
    """Pick the ensemble method with the lowest mean historical error.

    Looks at past forecast performance (language == 'All') for this time series
    up to the cutoff date, optionally dropping a set of excluded cutoffs, and
    returns the ensemble name with the smallest average error.  Falls back to
    'lasso' when no performance history is available.

    :param ts_name: time series name to look up
    :param cutoff_date: date-like with .date(); only cutoffs <= this are used
    :param exclude: iterable of cutoff date strings to drop from the history
    :return: name of the best-performing ensemble, or 'lasso' if no data
    """
    qry = 'select * from sup.cx_language_forecast_performance_detail where ts_name = \'' + ts_name + '\' and cutoff <= \'' + str(cutoff_date.date()) + '\';'
    perf = hql.from_tble(qry, ['cutoff', 'upr', 'lwr'])
    perf = perf[perf['language'] == 'All'].copy()
    if len(exclude) != 0:
        perf = perf[~perf['cutoff'].isin(list(exclude))].copy()
    if len(perf) == 0:
        # no usable history: default ensemble
        return 'lasso'
    mean_err = perf.groupby('ens').apply(lambda x: x['err'].mean()).reset_index()
    s_ut.my_print('+++++++++++++++++ Past performance ++++++++++++++')
    print(mean_err)
    best = mean_err.nsmallest(1, columns=[0])
    return best.loc[best.index[0], 'ens']
def get_lang_fcast(ts_cfg, cutoff_date, eq=True):
    """Fetch per-language forecasts for a time series.

    :param ts_cfg: config dict; reads keys 'name' and 'ts_key'
    :param cutoff_date: date-like with .date(); selects the forecast run(s)
    :param eq: if True, match cutoff exactly; if False, include all cutoffs <= cutoff_date
    :return: DataFrame with ds, language, yhat, dim_cfg, cutoff columns, or None when no data
    """
    ts_name = ts_cfg['name']
    ts = ts_cfg['ts_key']
    tble = 'sup.cx_language_forecast'
    s_ut.my_print('getting ' + ts + ' language forecast from table ' + tble + ' and cutoff date: ' + str(cutoff_date.date()))
    # the two query variants differ only in the cutoff comparison operator
    op = '=' if eq is True else '<='
    qry = 'select ds, language, yhat, dim_cfg, cutoff from ' + tble + ' where cutoff ' + op + ' \'' + str(cutoff_date.date()) + '\' and ts_name = \'' + ts + '\';'
    try:
        fcast_df = hql.from_tble(qry, ['ds', 'cutoff'], use_cache=USE_CACHE, renew=RENEW)
        s_ut.my_print(qry + ' completed. Got ' + str(len(fcast_df)) + ' rows')
    except Exception:  # narrowed from bare except: do not swallow SystemExit/KeyboardInterrupt
        s_ut.my_print('ERROR: ' + qry + ' failed')
        sys.exit()
    if fcast_df is None:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: no data for eq: ' + str(eq) + ', ts: ' + ts_name + ' and cutoff date ' + str(cutoff_date.date()))
    return fcast_df
def get_actuals(rcfg, init_date='2016-01-01'):
    """Fetch regressor actuals, filtered by init_date and optional interaction type.

    :param rcfg: regressor config dict; reads 'cutoff_date', 'name', 'ycol' and
                 optionally 'init_date', 'interaction_type'
    :param init_date: fallback start date when rcfg has no 'init_date'
    :return: DataFrame with 'y' renamed to rcfg['ycol'], or None when the query
             returns nothing, there is no data, or actuals end before cutoff_date
    """
    cutoff_date = rcfg['cutoff_date']
    init_date = rcfg.get('init_date', init_date)
    r_name = rcfg['name']
    int_type = rcfg.get('interaction_type', None)
    s_ut.my_print('getting ' + r_name + ' regressor actuals from table')
    ts_name = rcfg['ycol']
    r_date = hql.get_rmax(ts_name, use_cache=USE_CACHE)
    qry = 'select ds, language, y from sup.cx_weekly_actuals where ts_name=\'' + ts_name + '\' and run_date=\'' + r_date + '\';'
    try:
        rdf = hql.from_tble(qry, ['ds'], use_cache=USE_CACHE, renew=RENEW)
        s_ut.my_print(qry + ' completed. Got ' + str(len(rdf)) + ' rows')
    except Exception:  # narrowed from bare except: do not swallow SystemExit/KeyboardInterrupt
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: query: ' + qry + ' failed. No data for regressor ' + r_name)
        sys.exit()
    # bug fix: the None check originally ran AFTER len(rdf) and rename had already
    # dereferenced rdf — it must come before any use
    if rdf is None:
        return None
    rdf.rename(columns={'y': rcfg['ycol']}, inplace=True)  # unique name needed downstream
    s_ut.my_print('pid: ' + str(os.getpid()) + ' got ts_name ' + ts_name + ' from table ' + 'sup.cx_weekly_actuals')
    rdf = rdf[(rdf['ds'] >= pd.to_datetime(init_date))].copy()
    # robustness fix: normalize cutoff_date before .date() — the comparison below
    # already treats it as possibly non-Timestamp via pd.to_datetime
    cutoff_ts = pd.to_datetime(cutoff_date)
    if rdf['ds'].max() < cutoff_ts:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: ' + r_name + ' max date (' + str(rdf['ds'].max().date()) +
                      ') is smaller than cutoff date (' + str(cutoff_ts.date()) + ')')
        return None
    elif len(rdf) > 0:
        if 'interaction_type' in rdf.columns and int_type is not None:
            rdf = rdf[rdf['interaction_type'] == int_type].copy()
        rdf.reset_index(inplace=True, drop=True)
        # NOTE: the original re-ran rename(columns={'y': ...}) here; 'y' was already
        # renamed above so that call was a no-op and has been removed
        return rdf
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: no data for regressor ' + r_name)
        return None