Example #1
 def set_bounds(self):
     if self.mdl == 'ErlangA':
         m_bounds = (max(1, self.a / self.window), self.window * self.a)  # overwrite bounds
     elif self.mdl == 'ErlangC':
         try:
             self.min_mval = int(np.ceil(self.a))
             if self.min_mval == int(self.a):
                 self.min_mval += 1
             m_bounds = (self.min_mval, self.window * self.a)  # bounds for servers in the minimization
         except ValueError:
             s_ut.my_print('ERROR_:erlang_tools:ErlangC model invalid::' +
                           self.__str__())
             m_bounds = None
     elif self.mdl == 'ErlangB':
         m_bounds = (max(1, self.a / self.window), self.window * self.a)  # bounds for servers in the minimization
     else:
         s_ut.my_print('@@@@@@@ invalid model @@@@@@@: ' + str(self.mdl))
         m_bounds = None
     try:
         self.m_bounds = (int(np.floor(m_bounds[0])),
                          int(np.ceil(m_bounds[1]))) if m_bounds is not None else None
     except ValueError:
         self.m_bounds = None
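A minimal sketch of the final rounding step, with hypothetical values for the offered load a and the window:

import numpy as np

a, window = 12.4, 4  # hypothetical offered load and search window
m_bounds = (max(1, a / window), window * a)  # raw float bounds, as in the ErlangA/B branches
m_bounds = (int(np.floor(m_bounds[0])), int(np.ceil(m_bounds[1])))
print(m_bounds)  # (3, 50)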
Example #2
 def sla_func(self, m=None, err=False):
     # actual_sla - target_sla: if positive, we are meeting sla
     # find smallest m such that actual_sla >= target_sla
      # err: if True, return the relative error w.r.t. the target prob
     if m is None:
         m = self.get_servers() if self.mval is None else self.mval
         if self.verbose is True:
             s_ut.my_print(
                 'WARNING: no value set for servers. Setting to default:: '
                 + str(m))
     q, t = self.func_args[self.sla_func.__name__]
     if m < self.min_mval:
         s_ut.my_print(
             'WARNING: ' + self.mdl +
             ' sla_func could be unstable because m is too small: ' +
             str(m) + ' and min m: ' + str(self.min_mval))
         y = -q  # prob(Wait < t) = 0
     else:
         y = self.queueing_mdl(m).sla_prob(t) - q
         # s_ut.my_print('m: ' + str(m) + ' q: ' + str(q) + ' t: ' + str(t) +  ' prob: ' + str(self.queueing_mdl(m).sla_prob(t)) + ' ret: ' + str(y))
     # print('sla: ' + str(m) + ' ' + str(q) + ' ' + str(self.queueing_mdl(m).sla_prob(t, use_log=False)))
     if err is False:
         return y
     else:
         return y / q
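Per the comments, the caller is expected to find the smallest m with sla_func(m) >= 0. A minimal sketch of that search, assuming sla_func is non-decreasing in m (a toy stand-in is used below):

def find_min_servers(sla_func, m_lo, m_hi):
    # linear scan; bisection would also work since sla_func is monotone in m
    for m in range(m_lo, m_hi + 1):
        if sla_func(m) >= 0:
            return m
    return None

# toy stand-in for the method above: a target prob of 0.8, met once m >= 14
print(find_min_servers(lambda m: min(1.0, m / 17.0) - 0.8, 1, 50))  # 14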
Example #3
def get_data_file(f_all, cutoff_date):
    if f_all is None:
        return None
    cutoff_date = pd.to_datetime(cutoff_date)
    d_dir = os.path.expanduser(os.path.dirname(f_all))
    fname = os.path.basename(f_all)
    dt_max, fn = pd.to_datetime('2016-01-01'), None
    for f in os.listdir(d_dir):
        if fname in f:
            try:
                f_date = pd.to_datetime(f.split('.')[0][-10:])
            except ValueError:
                s_ut.my_print('WARNING invalid date in ' + str(f) +
                              ' cutoff date: ' + str(cutoff_date.date()) +
                              ' d_dir: ' + str(d_dir))
                continue
            if f_date >= cutoff_date:  # file is acceptable
                if fn is None or dt_max < f_date:  # keep the most recent acceptable file
                    dt_max, fn = f_date, f
    if fn is None:
        return None
    else:
        return os.path.join(d_dir, fn)  # return the most recent acceptable file
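Hypothetical usage, assuming files are named like <pattern>_YYYY-MM-DD.<ext> so that the 10 characters before the extension parse as a date:

# picks the newest file dated on or after the cutoff, e.g. actuals_2020-03-15.par
latest = get_data_file('~/my_tmp/actuals', '2020-03-01')
print(latest)  # full path of the newest acceptable file, or None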
Example #4
def get_all_files(f_root, cutoff_date, post_cutoff):
    # returns all files dated before (post_cutoff=False) or after (post_cutoff=True) the cutoff date, cutoff date included
    # f_root is the file path + file pattern, e.g. ~/my_tmp/fbp/lang_fcast_bookings
    if f_root is None:
        return None
    cutoff_date = pd.to_datetime(cutoff_date)
    d_dir = os.path.expanduser(os.path.dirname(f_root))
    f_name = f_root.split('/')[-1]
    lf_out = list()
    for f in os.listdir(d_dir):
        if f_name in f:
            f_base, f_ext = os.path.splitext(f)
            try:
                f_date = pd.to_datetime(f_base.split('_')[-1])
            except ValueError:
                s_ut.my_print('WARNING invalid date in ' + str(f) +
                              ' cutoff date: ' + str(cutoff_date.date()) +
                              ' d_dir: ' + str(d_dir))
                continue
            if post_cutoff is True and f_date >= cutoff_date:  # file is acceptable
                lf_out.append(os.path.join(d_dir, f))
            if post_cutoff is False and f_date <= cutoff_date:  # file is acceptable
                lf_out.append(os.path.join(d_dir, f))
    return lf_out
Example #5
def ab_func(adb_estimators, max_depth, r, s, lf, X_train, y_train, X_test,
            y_test, y_perf, topN_list):
    ab_reg = AdaBoostRegressor(n_estimators=adb_estimators,
                               base_estimator=DecisionTreeRegressor(
                                   max_depth=max_depth, min_samples_split=s),
                               loss=lf,
                               learning_rate=r)
    try:
        ab_reg.fit(X_train, y_train)
    except ValueError as e:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: ' + str(e))
        s_ut.save_df(
            pd.concat([pd.DataFrame(X_train),
                       pd.DataFrame(y_train)], axis=1),  # axis=1 keeps each row's features aligned with its target
            '~/my_tmp/ab_func_err')
        return [dict()]
    yhat_test = ab_reg.predict(X_test)
    d_list = list()
    d_cfg = {
        'adb_estimators': adb_estimators,
        'max_depth': max_depth,
        'learning_rate': r,
        'loss': lf,
        'min_samples_split': s
    }

    # for each AdaBoost cfg and topN get the values of all the loss functions
    for nval in topN_list:
        d_ = copy.deepcopy(d_cfg)
        d_loss = loss_func(y_test, yhat_test, nval, w=y_perf)
        d_.update(d_loss)
        d_list.append(d_)
    return d_list
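A hypothetical driver for ab_func: sweep a small hyperparameter grid and collect the per-topN loss rows (the data splits are assumed to exist already):

import itertools

results = []  # one dict per (cfg, topN) pair
for n_est, depth, rate in itertools.product([50, 100], [3, 5], [0.1, 1.0]):
    results.extend(ab_func(n_est, depth, rate, 2, 'linear',
                           X_train, y_train, X_test, y_test, y_perf, [5, 10]))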
Example #6
    def get_holidays(self, lang):  # holidays per language
        prefix = 'not-' if 'not-' in lang else ''
        language = prefix + 'Mandarin' if 'Mandarin' in lang else lang
        end_year = self.fcast_date.year
        holidays_df = hdays.get_hols(language, end_year)  # None if language is not valid (e.g. 'foo'); all languages if language is None
        if holidays_df is None:
            s_ut.my_print('WARNING: no holidays DF for ' + language)
            return None

        # set the right time scale for holidays
        if self.time_scale == 'D':
            pass
        elif self.time_scale == 'W':
            gcols = (['language', pd.Grouper(key='ds', freq=self.time_scale)]
                     if 'language' in holidays_df.columns
                     else pd.Grouper(key='ds', freq=self.time_scale))
            holidays_df = holidays_df.groupby(gcols).apply(self.w_hols).reset_index()
            holidays_df.drop('level_2', axis=1, inplace=True)
        else:
            s_ut.my_print('ERROR: invalid time scale: ' + str(self.time_scale))
            sys.exit()

        holidays_df.drop('language', axis=1, inplace=True)
        return holidays_df[(holidays_df['ds'] <= self.fcast_date)
                           & (holidays_df['ds'] >= self.init_date)]
Example #7
 def __init__(self,
              name,
              data,
              cutoff_date,
              upr,
              lwr,
              mask,
              loss_type='rel',
              max_evals=200,
              verbose=False):
     super().__init__(name, data, cutoff_date, upr, lwr)
     self.max_evals = max_evals
     self.loss_type = loss_type if loss_type in ['abs', 'rel'] else 'rel'
     self.iter, self.string = 0, ''
     if mask is not None and len(mask) > 0:
         self.X_train = self.X_train[:, mask]
         self.X_test = self.X_test[:, mask]
         self.mask_ = mask
     self.space = SPACE_DICT[self.name]
     self.params, self.loss, self.mdl = None, None, None
     try:
         self.rfunc = getattr(sk_ens, name)
     except AttributeError:
         try:
             self.rfunc = getattr(xgb, name)
          except AttributeError:
              s_ut.my_print('ERROR: ' + name + ' not found')
              raise  # self.rfunc would be unset; fail loudly here
     self.regr_opt()
     self.regr_set()
     if verbose:
         print(self.string)
     s_ut.my_print(self.name + ': ' + str(self.params) + ' loss: ' +
                   str(self.loss) + ' n_features: ' +
                   str(len(self.features_)))
Example #8
 def __init__(self,
              name,
              f_data,
              this_cu,
              upr,
              lwr,
              max_evals=1000,
              verbose=False):
     super().__init__(name, f_data, this_cu, upr, lwr)
     self.max_evals = max_evals
     self.verbose = verbose
     self.iter, self.valid_iter, self.string, self.min_loss = 0, 0, '', np.inf
     self.params, self.loss, self.mdl, self.df, self.l1_ratio = None, None, None, None, None
     self.do_test = True
     self.space_list = self.get_paths()
     self.regr_opt()  # find opt pars at init time
     if self.params is not None:
         self.alpha = self.params['alpha']
         self.lbda = self.params['l1_ratio']
     self.regr_set()
     if self.params is not None:
         self.ridge_par = 2.0 * len(self.y_train) * self.params['alpha'] * (
             1 - self.params['l1_ratio'])
     else:
         s_ut.my_print(
             'WARNING: default ridge parameter <<<<<<<<<<<<<<<<<<<<<<<< ')
         self.ridge_par = 1.0
     s_ut.my_print('enet: ' + str(self.params) + ' loss: ' +
                   str(self.loss) + ' df: ' + str(self.df) +
                   ' n_features: ' + str(len(self.features_)))
Example #9
def data_check(df, name):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    u_vals = [c for c in df.columns if df[c].nunique() <= 1]
    if df.isnull().sum().sum() > 0 or len(u_vals) > 0:
        p_ut.save_df(df, '~/my_tmp/f_data')
        s_ut.my_print('ERROR: invalid data for ' + str(name))
        sys.exit()
Example #10
 def has_m(self):
     m = self.get_servers() if self.mval is None else self.mval
      if self.mval is None and self.verbose is True:
         s_ut.my_print(
             'WARNING: no value set for servers. Setting to default: ' +
             str(m))
     return m
Example #11
def bxform_df(xform_obj, f_list):
    out_list = list()
    for f in f_list:
        y_var = xform_obj.fcast_var(
            f[['yhat_lower', 'yhat_upper']].copy(),
            PROPHET_DICT['prophet_dict']['interval_width'])
        for c in [
                'yhat', 'yhat_upper', 'yhat_lower', 'trend', 'trend_upper',
                'trend_lower', 'additive_terms', 'additive_terms_lower',
                'additive_terms_upper', 'multiplicative_terms',
                'multiplicative_terms_lower', 'multiplicative_terms_upper'
        ]:
            f[c] = xform_obj.inverse_transform(f[c].values, y_var, lbl=c)
        for c in ['yhat']:  # , 'yhat_upper', 'yhat_lower']:
            if f[c].isnull().sum() > 0:
                s_ut.my_print(
                    'pid: ' + str(os.getpid()) +
                    ' WARNING: nulls in back-transformed values for ' +
                    str(c) + ' Ignoring this forecast cfg')
                break
        else:
            out_list.append(f)
    s_ut.my_print('pid: ' + str(os.getpid()) + ' there are ' +
                  str(len(out_list)) + ' valid fcast cfgs')
    return None if len(out_list) == 0 else pd.concat(out_list, axis=0)
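The for/else above is load-bearing: out_list.append runs only when the inner loop finishes without break, i.e. no checked column had nulls. A minimal sketch of the idiom:

kept = []
for f in [{'yhat': [1, 2]}, {'yhat': [1, None]}]:
    for c in ['yhat']:
        if None in f[c]:  # stand-in for the isnull() check
            break
    else:
        kept.append(f)  # runs only if no break fired
print(len(kept))  # 1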
Example #12
def get_actuals(ts_dict, gcols, use_cache=None):          # actuals with a max ds >= cutoff_date
    cutoff_date = ts_dict['cutoff_date']
    init_date = ts_dict['init_date']
    ts_name = ts_dict['name']
    ycol = ts_dict['ycol']
    s_ut.my_print('getting ' + ts_name + ' actuals from table')
    r_date = hql.get_rmax(ycol, use_cache=USE_CACHE)
    qcols = list(set(['ds', 'language', 'y'] + gcols))
    col_str = ','.join(qcols)
    print('rmax: ' + str(r_date))
    qry = 'select ' + col_str + ' from sup.cx_weekly_actuals where ts_name=\'' + ycol + '\' and run_date=\'' + r_date + '\';'
    try:
        uc = USE_CACHE if use_cache is None else use_cache
        df = hql.from_tble(qry, ['ds'], use_cache=uc, renew=RENEW)
        s_ut.my_print(qry + ' completed. Got ' + str(len(df)) + ' rows')
    except Exception:  # a bare except would also swallow SystemExit/KeyboardInterrupt
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: query: ' + qry + ' failed. No data for ts ' + ts_name)
        sys.exit()
    s_ut.my_print('pid: ' + str(os.getpid()) + ' got actuals for ' + ts_name + ' from table ' + 'sup.cx_weekly_actuals')
    df.rename(columns={'y': ycol}, inplace=True)    # unique name needed (may mix with regressors later)
    df = df[(df['ds'] >= init_date)].copy()
    if df['ds'].max() < cutoff_date:
        s_ut.my_print('ERROR: no actuals up to cutoff date for ' + ts_name)
        return None
    else:
        if len(df) > 0:
            return df.groupby(['ds'] + gcols).sum().reset_index()
        else:
            return None
Example #13
def log_trick(arr):
    # if Y = a1 + a2 + ..., the input is an array of pairs: arr = [[log(|a1|), sign(a1)], [log(|a2|), sign(a2)], ...]
    # returns [log(|Y|), sign(Y)]
    if isinstance(arr, (list, np.ndarray, tuple)):
        if len(arr) > 0:
            try:
                vals = [x[0] for x in arr]
                b = [x[1] for x in arr]
                return list(lsexp(vals, b=b, return_sign=True))
            except TypeError:
                s_ut.my_print('log_trick: invalid input format: ' + str(arr))
                return None
        else:  # empty array
            return [-np.inf, 0]
    else:  # if not list, assume float
        try:
            varr = float(arr)
            sgn = np.sign(varr)
            if sgn == 0:
                return [-np.inf, 0]
            else:
                return [np.log(sgn * varr), sgn]
        except ValueError:
            s_ut.my_print('log_trick::invalid input::' + str(arr))
            return None
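A worked example, assuming lsexp is scipy.special.logsumexp: Y = 5 - 3, encoded as [log(|a|), sign(a)] pairs, comes back as [log 2, +1]:

import numpy as np
from scipy.special import logsumexp as lsexp

arr = [[np.log(5.0), 1.0], [np.log(3.0), -1.0]]  # encodes Y = 5 - 3
log_abs_y, sign_y = lsexp([x[0] for x in arr], b=[x[1] for x in arr],
                          return_sign=True)
print(sign_y * np.exp(log_abs_y))  # ~2.0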
Example #14
def get_f_cfg(cfg_dict, cutoff_date, init_date, time_scale):
    # set the training windows in multiples of year
    if time_scale == 'W':
        periods = (cutoff_date - init_date).days / 7 - 1
        periods = int(np.ceil(periods))
        nperiods = np.floor(periods / 52.25)
        cfg_dict['training'] = [52 * p + 1 for p in range(1, int(nperiods) + 1)]
    elif time_scale == 'D':
        periods = (cutoff_date - init_date).days - 1
        nperiods = np.floor(periods / 365.25)
        cfg_dict['training'] = [365 * p + 1 for p in range(1, int(nperiods) + 1)]
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: unsupported time scale: ' + str(time_scale))
        sys.exit()

    if time_scale == 'W':
        cfg_dict['w_mode'] = [None]
    v_list = list(cfg_dict.values())
    k_list = list(cfg_dict.keys())
    f_list = list(itertools.product(*v_list))
    d_list = [{k_list[i]: x[i] for i in range(len(k_list))} for x in f_list]
    return d_list
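A minimal sketch of the cartesian expansion at the end: every combination of config values becomes one dict (the keys here are hypothetical):

import itertools

cfg = {'training': [53, 105], 'growth': ['linear', 'logistic']}
d_list = [dict(zip(cfg.keys(), x)) for x in itertools.product(*cfg.values())]
print(len(d_list))  # 4 configs: 2 training windows x 2 growth modes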
Example #15
def fcast_adj(k_col, adjust_date):
    # k_col: key columns to be adjusted
    # forecast adjuster (engineering, ECs, ...)
    # ds is the start date of an adj factor for a given k_col and ts_name
    # at the last adj time all adj factors must be 1, otherwise we overwrite
    # All missing adj factors fill to 1
    # All missing k_col values at a given ds get an adj factor of 1
    if adjust_date is None:
        return list()
    else:
        data_cfg = os.path.expanduser(
            '~/my_repos/capacity_planning/forecast/config/fcast_adjust_' +
            str(adjust_date.date()) + '.json')
        if os.path.isfile(data_cfg):
            with open(data_cfg, 'r') as fptr:
                adj_dict = json.load(fptr)
        else:
            s_ut.my_print(
                '>>>>>>>>>>>>>>>> WARNING: could not find adjustments file ' +
                data_cfg + '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            return list()

    # find dicts in adj_dict that contain k_col
    adj_names = list()
    for k, vlist in adj_dict.items():
        for v in vlist:
            if k_col in v.keys():
                adj_names.append(k)
                break

    # check adj cfg
    adj_df_list = list()
    for adj_name in adj_names:  # list of dicts in adj_dict that contain k_col
        adj_list = adj_dict[adj_name]
        adj_df_ = pd.DataFrame(adj_list)
        adj_df_['ds'] = pd.to_datetime(adj_df_['ds'].values)

        # add missing entries at each ds
        ts_cols = [c for c in adj_df_.columns if c not in ['ds', k_col]]
        f_list = [adj_df_]
        v_list = adj_df_[k_col].unique()
        for ds, f in adj_df_.groupby('ds'):
            m_vals = list(set(v_list) - set(f[k_col].unique()))
            if len(m_vals) > 0:
                lf = pd.DataFrame({'ds': [ds] * len(m_vals), k_col: m_vals})
                f_list.append(lf)
        adj_df = pd.concat(f_list, axis=0, sort=True)
        adj_df[ts_cols] = adj_df[ts_cols].fillna(1)  # default is no adjustment; fillna(inplace=True) on a slice would be lost

        # check latest date has all 1's
        b = adj_df['ds'] == adj_df['ds'].max()
        d_max = adj_df[b].copy()
        if d_max[ts_cols].min().min() != 1 or d_max[ts_cols].max().max() != 1:
            s_ut.my_print('WARNING: last adjust values not 1. Resetting')
            d_max[ts_cols] = 1
            adj_df = pd.concat([adj_df[~b].copy(), d_max], axis=0)
        fa = adj_df.reset_index(drop=True)
        fa.fillna(1.0, inplace=True)
        adj_df_list.append(fa)
    return adj_df_list
Example #16
def fcast_prep(rf_list, reg_cfg, cutoff_date, fcast_days,
               init_date):  # set the arg_list for regressor forecast
    reg_df_list_, reg_df_list, r_cols = list(), list(), list()
    for f in rf_list:
        if f is not None:
            if 'language' not in f.columns:
                f['language'] = 'NULL'
            if len(f) > 0:
                reg_df_list_.append(f.reset_index(
                    drop=True))  # all reg_df have a language column
            else:
                s_ut.my_print('WARNING: regressor ' + str(f.columns) +
                              ' has no data')

    reg_df_list = reg_gap_check(reg_df_list_, cutoff_date, init_date,
                                fcast_days)  # check for gaps
    do_fcast = {
        list(v['r_col'].keys())[0]: v['do_fcast']
        for v in reg_cfg.values()
    }
    arg_list, rcol_list = list(), list()
    for f in reg_df_list:  # list of regs by reg col and language
        f_cols = [c for c in f.columns if c != 'language']
        rcol = [c for c in f_cols if c not in ['ds', 'ceiling', 'floor']]
        rcol_list += rcol
        lang = f.loc[f.index[0], 'language']
        if len(f) > 0 and len(rcol) == 1:
            arg_list.append(
                [lang, f, 'ds', rcol[0], cutoff_date, fcast_days, do_fcast])
        else:
            s_ut.my_print(
                'WARNING::empty regressor or too many regression columns: ' +
                str(rcol) + ' language: ' + str(lang) + ' len: ' + str(len(f)))
    rcol_list = list(set(rcol_list))
    return arg_list, rcol_list
Example #17
def perf_smry(perf_df, cutoff_date, time_scale, ts_name, upr, lwr):
    # print smry and save
    upr_horizon = cutoff_date + pd.to_timedelta(upr, unit=time_scale)
    lwr_horizon = cutoff_date + pd.to_timedelta(lwr, unit=time_scale)
    if perf_df is not None:
        perf_df.sort_values(by='language', inplace=True)
        perf_df.reset_index(inplace=True, drop=True)
        perf_df['ts_name'] = ts_name
        perf_df['cutoff'] = cutoff_date
        s_ut.my_print(
            '###########################  cutoff: ' + str(cutoff_date.date()) +
            ' ts_name: ' + str(ts_name) + ' performance between ' +
            str(lwr_horizon.date()) + ' (included) and ' +
            str(upr_horizon.date()) +
            ' (included)  ##########################################')
        perf_df.sort_values(by=['language', 'err'], inplace=True)
        print(perf_df.head(10))
        p_ut.save_df(
            perf_df, '~/my_tmp/fbp/lang_perf_' + ts_name + '_' +
            str(cutoff_date.date()))
    else:
        s_ut.my_print(
            'WARNING: no actuals to compute fcast errors for the period between ' +
            str(lwr_horizon.date()) + ' (included) and ' +
            str(upr_horizon.date()) + ' (included) for cutoff: ' +
            str(cutoff_date.date()) + ' and ts_name: ' + str(ts_name))
Example #18
 def __init__(self, method, nqs, ceiling=None, floor=None, unbias=False):
     self.method = method
     self.ceiling = ceiling
     self.floor = floor
     self.lmbda = None
     self.name = method
     self.xf_done = False
     self.unbias = unbias  # not implemented
     self.lbl = None
      if method in ('yeo-johnson', 'box-cox'):
         self.xobj = PowerTransformer(
             method=method, standardize=False,
             copy=False)  # MUST have standardize = False
     elif method == 'quantile':
         self.xobj = QuantileTransformer(n_quantiles=int(nqs),
                                         output_distribution='normal',
                                         copy=False)
     elif method == 'logistic':
         self.xobj = Linearizer(ceiling, floor, self.unbias)
     elif method == 'log':
         self.xobj = LogTransform(self.unbias)
     elif method == 'anscombe':
         self.xobj = Anscombe()
     elif method is None:
         self.method = None
         self.xobj = NoTransform()
     else:
         su.my_print('pid: ' + str(os.getpid()) +
                     ' WARNING: set_xform: invalid method: ' + str(method))
         self.method = None
         self.xobj = NoTransform()
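A standalone sketch of the 'box-cox' branch: standardize=False keeps the fitted transform invertible on the original scale (as the comment above insists), and Box-Cox requires strictly positive input:

import numpy as np
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='box-cox', standardize=False)
y = np.array([[1.0], [2.0], [4.0]])  # must be > 0 for box-cox
yt = pt.fit_transform(y)
print(np.allclose(pt.inverse_transform(yt), y))  # True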
Example #19
def get_actuals(cutoff_date_):
    fdir = os.path.expanduser('~/my_tmp/cleaned/')  # '~/my_tmp/in_df_data_'
    adf = None
    for f in os.listdir(fdir):
        if str(cutoff_date_.date()) in f and 'tickets_' in f and 'old' not in f:  # 'in_df_data_' in f: we do not know the rolling window
            s_ut.my_print('getting actuals from ' + fdir + f)
            adf = p_ut.read_df(fdir + f)
            break
    if adf is None:
        s_ut.my_print('no available actuals data for ' +
                      str(cutoff_date_.date()))
        return None
    adf.reset_index(inplace=True, drop=True)
    p_ut.clean_cols(adf,
                    ["language", "service_tier", "channel", "business_unit"],
                    '~/my_repos/capacity_planning/data/config/col_values.json',
                    check_new=False,
                    do_nan=False,
                    rename=True)
    adf.rename(columns={'ticket_count': 'y', 'ds_week_starting': 'ds'},
               inplace=True)
    i_vals = ['nan', 'NULL', None, 'other', np.nan, 'null', 'N/A']
    imp_data = imputer.impute(adf, i_vals=i_vals, ex_cols=['ds'])
    imp_data['y'] = np.round(imp_data['y'].values, 0)
    return imp_data
Example #20
 def interpolate_(self, y, yt, nan_pct=0.2):
     # y: inverse-transformed values (values in natural scale)
     # yt: pre-inverse transform (values in transformed scale)
     if y is None:
         return None
     else:
         yx = np.reshape(y, (1, -1))[0] if self.method is not None else y
     nulls = pd.Series(yx).isnull().sum()
      pct = np.round(100.0 * nulls / len(yx), 2)  # percentage with 2 decimals
      if nulls > nan_pct * len(yx):
         su.my_print('WARNING: Too many NaN to interpolate for label ' +
                     str(self.lbl) + ': ' + str(nulls) + ' out of ' +
                     str(len(yx)) + ' (' + str(pct) +
                     '%) data points and lambda ' + str(self.lmbda))
         f = pd.DataFrame({'yt': list(yt), 'yx': list(yx)})
         f['lmbda'] = self.lmbda
         p_ut.save_df(f, '~/my_tmp/interpolDF')
         return None
      elif nulls > 0:  # interpolate yhat if some NaNs (count is within tolerance at this point)
         su.my_print('WARNING: interpolating for label ' + str(self.lbl) +
                     ': ' + str(nulls) + ' NaNs out of ' + str(len(yx)) +
                     ' data points (' + str(pct) + '%)')
         st = pd.Series(yx)
         sint = st.interpolate(limit_direction='both')
         yhat = sint.values
         ys = np.reshape(yhat, (1, -1))
         return ys[0]
     else:  # all OK
         return y
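A minimal sketch of the interpolation step: limit_direction='both' also fills leading and trailing NaNs, which a plain interpolate() would leave in place:

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 1.0, np.nan, 3.0, np.nan])
print(s.interpolate(limit_direction='both').tolist())  # [1.0, 1.0, 2.0, 3.0, 3.0]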
Example #21
def to_table(to_db, table, cutoff_date, ts_name, if_exists, df_out):
    cu_dt = str(cutoff_date.date())
    df_out['cutoff'] = cutoff_date
    df_out['ts_name'] = ts_name
    file_out = p_ut.save_df(df_out,
                            '~/my_tmp/fcast_df_' + cu_dt + '_' + ts_name)
    if to_db is True:
        partition = {'cutoff': cu_dt, 'ts_name': ts_name}
        df_out['ds'] = df_out['ds'].dt.date.astype(str)
        df_out.drop(['cutoff', 'ts_name'], axis=1, inplace=True)
        s_ut.my_print('Loading data to ' + table + ' for partition: ' +
                      str(partition))
        try:  # presto does not work with a partition argument
            ap.hive.push(df_out,
                         table=table,
                         if_exists=if_exists,
                         partition=partition,
                         table_props={
                             'abb_retention_days': '-1',
                             'abb_retention_days_reason': 'fact table. No pii'
                         })
    except Exception:
            s_ut.my_print('ERROR: push to ' + table +
                          ' failed for partition: ' + str(partition))
            sys.exit()
    return file_out
Example #22
 def __init__(self, lbda, mu, m, verbose=False):
     super().__init__(lbda, mu, m, 0.0, verbose)
     if self.m < 1 or self.lbda <= 0 or self.mu <= 0 or self.m <= self.a:
         s_ut.my_print('ErlangC: WARNING: invalid parameters: ' + self.__str__())
         self.pars_ok = False
     self.m_arr = np.array(self.m) if isinstance(self.m, (list, np.ndarray)) else self.m
     self.a_arr = np.array(self.a) if isinstance(self.a, (list, np.ndarray)) else self.a
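The m <= self.a guard is the M/M/m stability condition: with offered load a = lbda / mu, the queue is stable only when m > a. A small numeric check with hypothetical rates:

lbda, mu, m = 10.0, 2.0, 6  # hypothetical arrival rate, service rate, servers
a = lbda / mu               # offered load in Erlangs: 5.0
print(m > a)                # True: utilization a / m ~ 0.83 < 1, so stable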
Example #23
def reg_gap_check(reg_df_list_, cutoff_date, init_date,
                  fcast_days):  # check for gaps
    reg_df_list = list()
    for r in reg_df_list_:
        for l in r.language.unique():
            rl = r[r['language'] == l].copy()
            pdict = dict()
            pdict['language'] = l
            for c in ['ceiling', 'floor']:
                if c in rl.columns:
                    pdict[c] = rl.loc[rl.index[0], c]
                    rl.drop(c, axis=1, inplace=True)
            c = [c_ for c_ in rl.columns if c_ != 'ds' and c_ != 'language'][0]
            rl = d_proc.data_check(rl[['ds', c]].copy(),
                                   c,
                                   'ds',
                                   cutoff_date,
                                   init_date,
                                   max_int=5,
                                   name=l)
            if rl is not None:  # add back language, ceiling and floor
                for k, v in pdict.items():
                    rl[k] = v
                reg_df_list.append(rl)
            else:
                s_ut.my_print('WARNING: regressor ' + str(c) + ' language: ' +
                              l + ' failed data check')
    return reg_df_list
Example #24
 def _get_dcmdl(
     in_mdl,
     size,
     em_phases=1,
     max_splits=2
 ):  # fit arr_v to em_phases of exponentials (partial models)
     arr_v = in_mdl.rvs(size=size)
     dc_em = sut.HyperExp(arr_v,
                          em_phases=em_phases,
                          max_splits=max_splits,
                          floc=0.0)
     dc_em.fit()
     if dc_em.fit_res is None:
         return None
     else:
         v_out = np.array([(d['prob'], 1.0 / d['params'][-1])
                           for d in dc_em.fit_res])  # (prob, rate)
         if dc_em.m_err > 0.25 and dc_em.s_err > 0.5:
              v_out = np.array([(1.0, 1.0 / in_mdl.avg)])  # replace by plain exponential
             s_ut.my_print('pid: ' + str(os.getpid()) +
                           ' Poor fit for input model: ' +
                           in_mdl.__str__() +
                           ' replacing by exponential: ' + str(v_out))
         return v_out, dc_em.em_mean, dc_em.em_std
Example #25
 def __init__(self, run_id, d_cfg):
      self.run_id = run_id
     self.d_cfg = d_cfg
     self.worker_hosts = {}
     self.repo_path = d_cfg.get('repo_path', None)
      self.is_ap = platform.system() == 'Darwin'  # run with airpy from laptop and on cli from redspot
     if self.repo_path is None:
         s_ut.my_print('ERROR_: repo path is missing')
Example #26
 def check_output(self, ya, y):
     if len(np.unique(ya)) == 1:  # xform failed: over/underflow?
         su.my_print(' WARNING: transform ' + self.method +
                     ' failed with lambda ' + str(self.lmbda) +
                     ' and label: ' + str(self.lbl) + ' Trying Quantile')
         return self.reset_xform(y)
     else:
         return ya
Example #27
 def fit_transform(self, y):
     if self.xf_done is True:
         su.my_print('pid: ' + str(os.getpid()) + ' ' + self.method +
                     ' : already fit. Create new Transform instance')
         return None
     else:
         r = self.fit(y)
         return None if r is None else self.transform(y)
Example #28
def solver(d_cfg):
    solver_obj = SimSolver(d_cfg)
    if solver_obj.is_valid:
        return solver_obj.get_dcmdl()
    else:
        s_ut.my_print('pid: ' + str(os.getpid()) +
                      ' ERROR: invalid models. Cannot solve')
        return None
Example #29
    def inverse_transform(self, y, y_var, lbl=None):
        self.lbl = lbl
        if y is not None:
            if isinstance(y, (pd.core.series.Series, pd.core.frame.DataFrame)):
                y = y.values
        else:
            return None

        if y_var is not None:
            if isinstance(y_var, (pd.core.series.Series, pd.core.frame.DataFrame)):
                y_var = y_var.values

        if isinstance(y, np.ndarray) is False:
            su.my_print('pid: ' + str(os.getpid()) +
                        ' WARNING: invalid type: ' + str(type(y)))
            return None

        if self.xf_done is False:
            su.my_print(
                'pid: ' + str(os.getpid()) +
                ' WARNING: cannot inverse_transform before fit is done')
            return None

        yc = copy.deepcopy(y)
        if self.method in ('logistic', 'log'):
            yt = self.xobj.inverse_transform(y, y_var, lbl=lbl)
            yt = self.interpolate_(yt, yc, nan_pct=0.2)
            if yt is None:
                su.my_print('pid: ' + str(os.getpid()) +
                            ' WARNING: inverse transform failed for label: ' +
                            str(self.lbl) + ' (method: ' + str(self.method) + ')')
                return None
            else:
                return yt
        elif self.method is None:
            return y
        else:  # box-cox, yj
            yt = self._inverse_transform(y, yc, y_var)
            if yt is None:
                su.my_print('pid: ' + str(os.getpid()) +
                            ' WARNING: inverse transform failed for label: ' +
                            str(self.lbl) + ' (method: ' + str(self.method) +
                            ' and lambda: ' + str(self.lmbda) + ')')
                return None
            else:
                yout = np.reshape(yt, (1, -1))[0]  # method is not None in this branch
                return yout
Example #30
 def get_emdl(self):  # set basic analytical server counts and avg in system
     self.ss_obj = self._set_emdl(self.lbda, self.mu, self.theta,
                                  self.sla_dict, self.verbose)
     self.min_servers = self.ss_obj.min_mval
     self.m_mdl = self.ss_obj.get_servers()
     if self.m_mdl is None:
         s_ut.my_print('pid: ' + str(os.getpid()) +
                       ' ERROR: analytical model failed')
     return