def forecazt(datos, predice, zeazon):
    """Forecast a series with a Bayesian DLM after cleaning it in R.

    The series is pushed into R (via rpy2's ``ro``), turned into a ``ts``
    starting 2012-01 with frequency ``zeazon`` and outlier-cleaned with
    ``tsclean``; the cleaned series is then modelled with a pydlm
    trend + seasonality model and extrapolated ``predice`` steps ahead.

    :param datos: sequence of numeric observations.
    :param predice: int, number of future points to forecast.
    :param zeazon: int, seasonal period (frequency of the R ``ts``).
    :return: pd.DataFrame with columns 'optimista', 'conservador' and
        'pesimista': history followed by upper-95%, mean and lower-95%
        forecasts respectively.
    """
    # Two-sided 95% interval half-width multiplier (z ~ 1.96).
    qq = scipy.stats.norm.ppf(0.5 * (1 + 0.95))
    # Build the R vector literal "c(x1, x2, ...)" in one linear pass
    # (the original built it with quadratic string concatenation).
    lysta = "c(" + ", ".join(str(v) for v in datos) + ")"
    ro.r('datin <- ' + lysta)
    ro.r("tdatin <- ts(datin, start = c(2012,1), frequency = " + str(zeazon) + ")")
    datos = ro.r("tdatin <- tsclean(tdatin)")
    # This is the trend/seasonality model on the cleaned series.
    n1 = datos
    m1 = dlm(n1) + trend(1, discount = 1, name = 'a') + seasonality(zeazon, discount = 1, name = 'b')
    m1.fit()
    cons = list(n1)
    opti = list(n1)
    pesi = list(n1)
    for i in range(predice):
        if i == 0:
            (p1Mean, p1Var) = m1.predict(date = m1.n-1)
        else:
            (p1Mean, p1Var) = m1.continuePredict()
        # pydlm returns 1x1 matrices; the original string-slicing parse is
        # kept, but np.float (removed in NumPy 1.20+) is replaced by the
        # builtin float, which behaves identically here.
        mean1 = str(p1Mean[[0]])[3:]
        mean2 = float(mean1[:-2])
        cons.append(mean2)
        vari1 = str(np.sqrt(p1Var[[0]]))[3:]
        vari2 = float(vari1[:-2])
        opti.append(mean2 + qq * vari2)
        pesi.append(mean2 - qq * vari2)
    df = pd.DataFrame()
    df['optimista'] = opti
    df['conservador'] = cons
    df['pesimista'] = pesi
    return df
def fit(self, y, period, x=None, metric="smape", val_size=None, verbose=False):
    """
    Tune hyperparameters, then assemble and fit the Bayesian DLM.

    :param y: pd.Series or 1-D np.array, time series to predict.
    :param period: Int or Str, observations per cycle: 1/"annual",
        4/"quarterly", 7/"daily", 12/"monthly", 24/"hourly", 52/"weekly";
        first-letter abbreviations ("a", "q", "d", "m", "h", "w") also work.
        Reference: https://robjhyndman.com/hyndsight/seasonal-periods/.
    :param x: pd.DataFrame or 2-D np.array, exogeneous predictors, optional
    :param metric: Str, metric used for model selection during tuning.
    :param val_size: Int, number of most recent observations held out as the
        validation set while tuning.
    :param verbose: Boolean, True for printing additional info while tuning.
    :return: None
    """
    self.y = y
    self.name = "Bayesian Dynamic Linear Model"
    self.key = "bdlm"
    self._tune(y=y, period=period, x=x, metric=metric, val_size=val_size,
               verbose=verbose)

    # Assemble the model from the tuned hyperparameters.
    model = pydlm.dlm(y)
    model = model + pydlm.trend(degree=self.params["trend"], discount=0.5)
    model = model + pydlm.seasonality(period=self.period, discount=0.99)
    if self.params["ar"] is not None:
        model = model + pydlm.autoReg(degree=self.params["ar"], discount=0.99)
    if x is not None:
        # One dynamic component per exogenous column, named by column index.
        for variable_id, column in enumerate(x.T):
            wrapped = [[v] for v in column]
            model = model + pydlm.dynamic(features=wrapped, discount=0.99,
                                          name=str(variable_id))
    self.model = model

    # pydlm prints a lot during tuning/fitting; silence it.
    with SuppressStdoutStderr():
        self.model.tune()
        self.model.fit()
def dlm_exogenous_r3(y, s, k, a, t, e, r):
    """ One way to use dlm

    Streaming wrapper around a pydlm model with exogenous regressors: the
    first call builds the model (trend + seasonality + fixed auto-regression
    + one 'exog' dynamic component); each subsequent call with data appends
    the new observation, periodically re-tunes/fits via a recursive call
    with y=None, and returns a k-step-ahead prediction.

    :returns: x, s', w
    """
    if not s:
        # First call: initialise state and assemble the model.
        s = dict()
        s['dim'] = dimension(y)
        s = dlm_set_exog_hyperparams(s=s, r=r)
        y0, exog = split_exogenous(y=y)  # NOTE(review): y0 is unused here
        s['n_obs'] = 0
        s['model'] = quietDlm([], printInfo=False) + trend(
            s['trend_degree'], s['discount']) + seasonality(
                s['period'], s['discount'])
        s['model'] = s['model'] + fixedAutoReg(
            degree=s['auto_degree'], name='ar', w=1.0)
        if exog:
            # pydlm expects None (not NaN) for missing feature values.
            exog_wrapped = [[None if np.isnan(ex0) else ex0 for ex0 in exog]]
            s['model'] = s['model'] + dynamic(features=exog_wrapped,
                                              discount=0.99,
                                              name='exog')  # Set's first exog

    if y is not None:
        # Data step: ingest one observation, then predict k steps ahead.
        y = wrap(y)
        assert dimension(y) == s['dim'], 'Cannot change dimension of data sent'
        s['n_obs'] += 1
        y0, exog = split_exogenous(y=y)
        y0_passed_in = None if np.isnan(
            y0) else y0  # pydlm uses None for missing values
        s['model'].append([y0_passed_in])
        if exog:
            exog_wrapped = [[None if np.isnan(ex0) else ex0 for ex0 in exog]]
            if s['n_obs'] > 1:
                s['model'].append(
                    data=exog_wrapped,
                    component='exog')  # Don't get first exog twice
        num_obs = len(s['model'].data) if s.get('model') else 0
        if num_obs % s['n_fit'] == s['n_fit'] - 1:
            # Periodic maintenance: re-enter with y=None to re-tune/re-fit.
            _, _, s = dlm_exogenous_r3(y=None, s=s, k=k, a=a, t=t, e=10, r=r)
        s['model'].fitForwardFilter()
        return _dlm_exog_prediction_helper(s=s, k=k, y=y)

    if y is None:
        # Maintenance step (reached via the recursive call above).
        # NOTE(review): dimension(y) is evaluated while y is None — the
        # intent looks like s['dim'] == 1; confirm against dimension().
        if dimension(y) == 1:
            s['model'].tune(maxit=20)
            # Don't tune if exogenous ... haven't got this to work
        s['model'].fit()
        return None, None, s
def dlm_univariate_r3(y, s: dict, k: int, a=None, t=None, e=None, r=None):
    """ Univariate filter

           - Uses the discounting method of H/W so, doesn't need to be fit as often
           - Discount factors are periodically tuned
           - The hyper-parameter controls 'auto_degree', 'trend_degree', 'period'

        :returns: x, x_std, s
    """
    assert r is not None, 'Requires hyper-parameter (interpreted in dimension 3) '
    if not s:
        # First call: initialise state and assemble trend + seasonality + AR.
        s = dict()
        s = dlm_set_univariate_params(s=s, r=r)
        s['dim'] = dimension(y)
        s['n_obs'] = 0
        s['model'] = dlm([], printInfo=False) + trend(
            s['trend_degree'], s['discount']) + seasonality(
                s['period'], s['discount'])
        s['model'] = s['model'] + fixedAutoReg(
            degree=s['auto_degree'], name='ar', w=1.0)

    if y is not None:
        # Data step: append one observation, maybe retune, then predict.
        s['n_obs'] += 1
        assert isinstance(y, float) or len(
            y) == s['dim'], ' Cannot change dimension of input in flight '
        y0, exog = split_exogenous(y=y)  # NOTE(review): exog is ignored here
        y0_passed_in = None if np.isnan(
            y0) else y0  # pydlm uses None for missing values
        s['model'].append([y0_passed_in])
        num_obs = len(s['model'].data) if s.get('model') else 0
        if num_obs % s['n_fit'] == s['n_fit'] - 1:
            # Perform periodic tuning of discount factors
            _, _, s = dlm_univariate_r3(y=None, s=s, k=k, a=a, t=t, e=1000, r=r)
        s['model'].fitForwardFilter()
        return _dlm_prediction_helper(s=s, k=k, y=y)

    if y is None and e > 60:
        # Maintenance step: only runs when the time allowance e is generous
        # (the recursive call above passes e=1000).
        s['model'].tune()  # Tunes discount factors
        s['model'].fit()
        return None, None, s
observation_trajectories.append( expected_value_observation_function(tmp)) state_trajectories.append(tmp) state_trajectories = state_trajectories[1:] ## MEAN print(np.mean(observation_trajectories, axis=1)) ## QUANTILES state_trajectories = np.array(state_trajectories).reshape( (len(test_n_t_inf), -1)) else: myDLM = dlm(train_n_t_inf) myDLM = myDLM + trend(1, name='lineTrend', w=1.0) # add a 7 day seasonality with prior covariance 1.0 myDLM = myDLM + seasonality(52, name='7day', w=1.0) # add a 3 step auto regression myDLM = myDLM + autoReg(degree=2, data=train_n_t_inf, name='ar3', w=1.0) myDLM.fit() (predictMean, predictVar) = myDLM.predictN(N=D - 1, date=myDLM.n - 1) for i in range(len(predictMean)): samples = np.random.normal(predictMean[i], np.sqrt(predictVar[i]), 4) state_trajectories.append(samples) state_trajectories = np.array(state_trajectories) phat = trace['a'].mean(axis=0) from scipy.stats import binom myDLM.plot()
# Load the per-region arrivals workbook and split it into one frame
# per region.
data = dataset.load_excel(excel_file, dir="../../datasets")
data = dataset.load_all_regions(data)

df_italy = data["italy"]                      # Arrivals to Italy
df_greek_island = data["greek_island"]        # Arrivals to the Greek islands
df_mainland_greece = data["mainland_greece"]  # Arrivals to mainland Greece
df_fyrom = data["fyrom"]                      # Arrivals to fYRoM
df_serbia = data["serbia"]                    # Arrivals to Serbia
df_croatia = data["croatia"]                  # Arrivals to Croatia
df_hungry = data["hungry"]                    # Arrivals to Hungary
df_slovenia = data["slovenia"]                # Arrivals to Slovenia
df_austria = data["austria"]                  # Arrivals to Austria

# Series under test.
df = df_austria
column_name = df.columns[0]
fill_method = "ffill"

# Zero entries are placeholders: first make NaNs explicit zeros, then
# forward-fill the zeros from the previous observation.
df.fillna(0, inplace=True)
df[df.columns[0]] = df[column_name].replace(to_replace=0, method=fill_method)

# Trend + short seasonality DLM on the selected series.
model = (dlm(df[column_name])
         + trend(degree=1, discount=0.72, name='trend component')
         + seasonality(period=2, discount=0.99, name='seasonality component'))
model.fit()
model.plot()

# Score the one-step-ahead (filtered) predictions against the data.
fitted = list(np.array(model.result.predictedObs).flatten())
r2 = r2_score(df, fitted)
rmse = np.sqrt(model.getMSE())
print('RMSE:', rmse)
print('R2:', r2)
# Regression-style model: constant level plus one dynamic regressor.
# NOTE(review): a, b and x must be defined earlier in the file.
# NOTE(review): the variable name 'dlm' shadows pydlm's dlm class for the
# rest of this script; calls stay qualified as pydlm.dlm, so it still works.
y = a + b * x
dlm = pydlm.dlm(y)
dlm = dlm + pydlm.trend(degree=0, discount=0.98, name='a', w=10.0)
dlm = dlm + pydlm.dynamic(
    features=[[v] for v in x], discount=1, name='b', w=10.0)

# randomly generate data
data = [0] * 100 + [3] * 100
# create model
dlm = pydlm.dlm(data)
# add components
dlm = dlm + pydlm.trend(1, name='lineTrend', w=1.0)  # covariance=1
dlm = dlm + pydlm.seasonality(7, name='7day', w=1.0)
dlm = dlm + pydlm.autoReg(degree=3, data=data, name='ar3', w=1.0)
dlm.ls()
# delete unwanted component
dlm.delete('7day')
dlm.ls()
# Analyze results
dlm.fitForwardFilter()
dlm.fitBackwardSmoother()
# Plot
dlm.plot()
dlm.turnOff('smoothed plot')
dlm.plot()
from pydlm import dlm, trend, seasonality, dynamic, autoReg, longSeason
import warnings; warnings.simplefilter("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import environ, path
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import unittest

# A linear trend
linear_trend = trend(degree=1, discount=0.95, name='linear_trend', w=10)
# A seasonality with a 96-step period.
# BUG FIX: the component used to be named 'seasonal52', so the later call
# simple_dlm.plot('seasonal96') could not find it by name.
seasonal96 = seasonality(period=96, discount=0.99, name='seasonal96', w=10)
# Build a simple dlm
# NOTE(review): Disk_Avg must be defined elsewhere in the file.
simple_dlm = dlm(Disk_Avg) + linear_trend + seasonal96
# Fit the model
simple_dlm.fit()
# Plot the fitted results
simple_dlm.turnOff('data points')
simple_dlm.plot()
# Plot each component (attribute the time series to each component)
simple_dlm.turnOff('predict plot')
simple_dlm.turnOff('filtered plot')
simple_dlm.plot('linear_trend')
simple_dlm.plot('seasonal96')
# Plot the prediction given the first 351 weeks and forecast the next 200 weeks.
def ts_fit(self, suppress=False):
    """Fit DLM to the time series data.

    Parameters:
    ----------
    suppress: bool
        Suppress or not some of the output messages

    Returns self on success, -1 when fitting raised an exception.
    """
    self._prepare_fit()
    self._model = None
    self.ts_split()

    ts_df = self._train_dt.copy()

    # Fit
    self._dlm_logger.info("Trying to fit the DLM model....")
    try:
        if not suppress:
            self._dlm_logger.info("...via using parameters\n")
            print_attributes(self)

        ts_df = ts_df.reset_index()
        ts_df.columns = self._ts_df_cols
        # Base model on the target column; each component below is added
        # only when its configuration dict is present.
        self._model = dlm(ts_df['y'])
        # trend
        if self._dlm_trend is not None:
            self._model = self._model + trend(
                degree=self._dlm_trend['degree'],
                discount=self._dlm_trend['discount'],
                name=self._dlm_trend['name'],
                w=self._dlm_trend['w'])
        # seasonality
        if self._dlm_seasonality is not None:
            self._model = self._model + seasonality(
                period=self._dlm_seasonality['period'],
                discount=self._dlm_seasonality['discount'],
                name=self._dlm_seasonality['name'],
                w=self._dlm_seasonality['w'])
        # dynamic: one component per configured feature set
        if self._train_dlm_dynamic is not None:
            for i in range(len(self._train_dlm_dynamic['features'])):
                self._model = self._model + dynamic(
                    features=self._train_dlm_dynamic['features'][i]
                    ['features'],
                    discount=self._train_dlm_dynamic['features'][i]
                    ['discount'],
                    name=self._train_dlm_dynamic['features'][i]['name'],
                    w=self._train_dlm_dynamic['features'][i]['w'])
        # auto_reg
        if self._dlm_auto_reg is not None:
            self._model = self._model + autoReg(
                degree=self._dlm_auto_reg['degree'],
                discount=self._dlm_auto_reg['discount'],
                name=self._dlm_auto_reg['name'],
                w=self._dlm_auto_reg['w'])
        # long_season
        if self._dlm_long_season is not None:
            ls = longSeason(period=self._dlm_long_season['period'],
                            stay=self._dlm_long_season['stay'],
                            data=ts_df,
                            name=self._dlm_long_season['name'],
                            w=self._dlm_long_season['w'])
            self._model = self._model + ls

        if not suppress:
            self._dlm_logger.info("The constructed DLM model components:")
            print(self._model.ls())

        # tic
        start = time()
        if self._use_rolling_window:
            self._model.fitForwardFilter(useRollingWindow=True,
                                         windowLength=self._window_size)
            self._model.fitBackwardSmoother()
        else:
            self._model.fit()
        self.model_fit = self._model
        # toc
        if not suppress:
            self._dlm_logger.info("Time elapsed: {} sec.".format(time() -
                                                                 start))
    except (Exception, ValueError) as e:
        # NOTE(review): ValueError is already covered by Exception here.
        self._dlm_logger.exception("DLM error...{}".format(e))
        return -1
    else:
        self._dlm_logger.info("Model successfully fitted to the data!")
        self._dlm_logger.info("Computing fitted values and residuals...")

        # Residuals
        self.residuals = pd.Series(self.model_fit.getResidual(),
                                   index=self._train_dt.index)
        try:
            # NOTE(review): index order implies getInterval() returns
            # (upper, lower) — confirm against the pydlm API.
            self.lower_conf_int = pd.Series(self.model_fit.getInterval()[1],
                                            index=self._train_dt.index)
            self.upper_conf_int = pd.Series(self.model_fit.getInterval()[0],
                                            index=self._train_dt.index)
        except ValueError as e:
            self._dlm_logger.exception(
                "Something went wrong in getInterval...{}".format(e))
        self.mse = self.model_fit.getMSE()

        # Fitted values
        # this is not elegant, but found no other way
        self.fittedvalues = self._train_dt['y'] + self.residuals

    return self
def run_pydlm_monthly(cus_no, mat_no, prod, param, **kwargs):
    """Rolling-origin monthly forecast for one customer/material with pydlm.

    Aggregates the weekly series to monthly, removes outliers, then walks
    forward over the test range re-fitting a trend+seasonality+AR DLM and
    collecting predictions; finally refits on the full history and prints a
    pred-horizon forecast.

    :param cus_no: customer number (identification only).
    :param mat_no: material/product number (identification only).
    :param prod: pd.DataFrame with columns 'dt_week' and 'quantity'.
    :param param: dict of model hyperparameters (trend_degree, trend_w,
        seasonality_w, ar_degree, ar_w).
    :param kwargs: optional min_train_days / test_points / pred_points
        overriding the p_model defaults.
    """
    import pandas as pd
    import numpy as np
    from dateutil import parser
    from fbprophet import Prophet
    from pydlm import dlm, trend, seasonality, dynamic, autoReg, longSeason, modelTuner

    if ('min_train_days' in kwargs.keys()):
        min_train_days = kwargs.get('min_train_days')
    else:
        min_train_days = p_model.min_train_days
    if ('test_points' in kwargs.keys()):
        test_points = kwargs.get('test_points')
    else:
        test_points = p_model.test_points_monthly
    if ('pred_points' in kwargs.keys()):
        pred_points = kwargs.get('pred_points')
    else:
        # NOTE(review): pred_points is never used in the visible body.
        pred_points = p_model.pred_points_monthly

    # model parameters
    trend_degree = param.get('trend_degree')
    trend_w = param.get('trend_w')
    seasonality_w = param.get('seasonality_w')
    ar_degree = param.get('ar_degree')
    ar_w = param.get('ar_w')

    # data transform
    prod = prod.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
    prod = prod[['ds', 'y']]
    prod.ds = prod.ds.apply(str).apply(parser.parse)
    prod.y = prod.y.apply(float)
    prod = prod.sort_values('ds')
    prod = prod.reset_index(drop=True)
    # prod = prod.drop(prod.index[[0, len(prod.y) - 1]]).reset_index(drop=True)

    # Aggregated monthly data
    prod = get_monthly_aggregate_per_product(prod)

    # Remove outlier
    prod = ma_replace_outlier(data=prod, n_pass=3, aggressive=True,
                              window_size=6, sigma=2.5)
    # prod = prod.reset_index(drop= True)

    # test and train data creation
    train = prod[
        prod.ds <= (
            np.amax(prod.ds) -
            pd.DateOffset(days=(np.amax(prod.ds) -
                                np.amin(prod.ds)).days - min_train_days))]
    test = prod[(np.amax(np.array(train.index)) +
                 1):(np.amax(np.array(train.index)) + 1 + test_points)]
    print(len(test))
    # rem_data = prod[(np.amax(np.array(train.index)) + test_points):]
    output_result = pd.DataFrame()
    # Walk forward: fit on the growing train window, predict the next
    # test_points months, then extend train and move the window on.
    while (len(test) > 0):
        train_pydlm = train.set_index('ds', drop=True)
        test_pydlm = test.set_index('ds', drop=True)
        # Modeling
        myDLM = dlm(train_pydlm.y)
        # add a first-order trend (linear trending) with prior covariance 1.0
        myDLM = myDLM + trend(degree=trend_degree, name='trend', w=trend_w)
        # # add a 12 month seasonality with prior covariance 1.0
        myDLM = myDLM + seasonality(12, name='12month', w=seasonality_w)
        # # add a 3 step auto regression
        myDLM = myDLM + autoReg(degree=ar_degree, data=train_pydlm.y,
                                name='ar', w=ar_w)
        myDLM.fit()
        (predictMean, predictVar) = myDLM.predict(date=myDLM.n - 1)
        pred_test = np.array([round(predictMean.item((0, 0)), 2)])
        for i in range(len(test_pydlm) - 1):
            (predictMean_cont, predictVar_cont) = myDLM.continuePredict()
            pred_test = np.append(pred_test,
                                  round(predictMean_cont.item((0, 0)), 2))
        print(pred_test)
        # NOTE(review): result_test aliases test (a slice of prod) — the
        # assignments below mutate it in place (SettingWithCopy risk).
        result_test = test
        print((result_test))
        result_test['y_pydlm'] = pred_test
        # Negative forecasts are clipped to zero.
        result_test.loc[(result_test['y_pydlm'] < 0), 'y_pydlm'] = 0
        train = prod[:(np.amax(np.array(train.index)) + 1 + test_points)]
        test = prod[(np.amax(np.array(train.index)) +
                     1):(np.amax(np.array(train.index)) + 1 + test_points)]
        # rem_data = prod[(np.amax(np.array(train.index)) + test_points):]
        output_result = pd.concat([output_result, result_test], axis=0)
    print(output_result.head())

    # Final fit on the entire history, then forecast test_points ahead.
    train_pydlm = prod.set_index('ds', drop=True)
    # test_pydlm = test.set_index('ds', drop=True)
    # Modeling
    myDLM = dlm(train_pydlm.y)
    # add a first-order trend (linear trending) with prior covariance 1.0
    myDLM = myDLM + trend(degree=trend_degree, name='trend', w=trend_w)
    # # add a 12 month seasonality with prior covariance 1.0
    myDLM = myDLM + seasonality(12, name='12month', w=seasonality_w)
    # # add a 3 step auto regression
    myDLM = myDLM + autoReg(degree=ar_degree, data=train_pydlm.y,
                            name='ar', w=ar_w)
    myDLM.fit()
    print(trend_degree, trend_w, seasonality_w, ar_degree, ar_w)
    (predictMean, predictVar) = myDLM.predict(date=myDLM.n - 1)
    pred_test = np.array([round(predictMean.item((0, 0)), 2)])
    for i in range(test_points - 1):
        (predictMean, predictVar) = myDLM.continuePredict()
        pred_test = np.append(pred_test, round(predictMean.item((0, 0)), 2))
    print(pred_test)
#!/usr/bin/env python import pandas as pd from pydlm import dlm, odlm, trend, seasonality ts = [ 0.5429682543922109, 0.5296058346035057, 0.5403294585554494, 0.542441925561093, 0.5435209708555084, 0.5430676782288945, 0.5429877208796179, 0.5429721282202071, 0.5429690254184671, 0.5449758960859548, 0.5457612294317765, 0.5434065016617284, 0.5430519745276086, 0.5436459000038072, 0.5437794184525637 ] ## Version 1 model = odlm([]) + trend(degree=2, discount=0.95, name='trend1') + seasonality(7) model.stableMode(False) d = {} for idx, el in enumerate(ts): print(el) model.append([el], component='main') model.fitForwardFilter() print() mean, var = model.predictN(N=1, date=model.n - 1) d[idx] = mean df1 = pd.DataFrame.from_dict(d, orient="index") ## Version 2 model = dlm([]) + trend(degree=2, discount=0.95,
import matplotlib.pyplot as plt
import pydlm.plot.dlmPlot as dlmPlot

# Plot the raw series before any modelling.
# NOTE(review): time_series must be defined earlier in the file.
dlmPlot.plotData(range(len(time_series)),
                 time_series,
                 showDataPoint=False,
                 label='raw_data')
plt.legend(loc='best', shadow=True)
plt.show()

# Build a simple model
from pydlm import dlm, trend, seasonality

# A linear trend
linear_trend = trend(degree=1, discount=0.95, name='linear_trend', w=10)
# A seasonality (52-step, i.e. annual on weekly data)
seasonal52 = seasonality(period=52, discount=0.99, name='seasonal52', w=10)

simple_dlm = dlm(time_series) + linear_trend + seasonal52
simple_dlm.fit()

# Plot the fitted results
simple_dlm.turnOff('data points')
simple_dlm.plot()
# Plot each component (attribution)
simple_dlm.turnOff('predict plot')
simple_dlm.turnOff('filtered plot')
simple_dlm.plot('linear_trend')
simple_dlm.plot('seasonal52')
# Plot the prediction given the first 350 weeks and forecast the next 200 weeks.
simple_dlm.plotPredictN(N=200, date=350)
# Plot the prediction given the first 250 weeks and forecast the next 200 weeks.
def estimate_and_predict_dlm_PR(calendar,
                                df_propor_PR_ts,
                                punched_df,
                                end_train_date,
                                start_test_date,
                                start_of_this_year,
                                enable_sales,
                                pred_weeks=8,
                                locality=10,
                                r=0.05,
                                missing_val=201735):
    '''
    accept the forecasting sales_proportion data as one regressor
    df_propor_PR_test: []
    return type: DataFrame with prediction result
    return: columns = [wm_yr_wk_nbr,club,yhat]

    For each club, fits a global DLM (trend + 26-week seasonality + AR +
    macro regressor) on the whole history and a "local" DLM on the last
    `locality` weeks, then blends their predictions with an exponential
    weight p1 = exp(-r * |end_train_date - start_of_this_year - 52|).
    `missing_val` is a hurricane-affected week interpolated from its
    neighbours wherever absent.
    '''
    res = pd.DataFrame()
    punched = punched_df.groupby(['club_nbr', 'posting_date'])['cost'].sum()
    # NOTE(review): this sets an attribute named `column`, not `columns` —
    # it does not rename anything; confirm intent.
    punched.column = ['total_punched_wg']
    punched = punched.reset_index()
    punched = pd.merge(left=punched,
                       right=calendar,
                       how='left',
                       left_on='posting_date',
                       right_on='calendar_date').drop('calendar_date', axis=1)
    # mean wage among all clubs
    punched = removehurricane('cost', punched, 201733, 201739, sales=False)
    punched_mean = punched.groupby(['wm_yr_wk_nbr',
                                    'posting_date'])['cost'].mean()
    punched_mean = punched_mean.reset_index()
    punched_mean.columns = ['wm_yr_wk_nbr', 'posting_date', 'cost']
    punched_mean['club_nbr'] = pd.Series(np.ones([punched_mean.shape[0]]))
    ##########################
    # Interpolate the missing hurricane week from the weeks 2 before/after.
    if missing_val not in punched_mean.wm_yr_wk_nbr.tolist():
        punched_mean.loc[-1] = [
            missing_val,
            punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 1] + timedelta(days=14),
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 2] +
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, 2)].iloc[0, 2],
            1
        ]  # adding a row
        punched_mean.index = punched_mean.index + 1
    #########################
    # Smooth the mean cost with a 3-tap moving average, then forecast it.
    punched_mean1 = punched_mean.copy(deep=True)
    punched_mean1['cost'] = 0.5 * punched_mean1['cost'] + 0.25 * punched_mean1[
        'cost'].shift(1) + 0.25 * punched_mean1['cost'].shift(2)
    ty = punched_mean1['cost'].mean()
    punched_mean1[['cost']] = punched_mean1[['cost']].fillna(value=ty)
    punched_mean1 = estimate_and_predict_prophet_PR(
        calendar,
        punched_mean1,
        end_train_date,
        start_test_date,
        daily_view=False,
        pred_days=120)  # predict the mean wages.
    punched_mean1 = punched_mean1.drop('club', axis=1)
    punched_mean1.columns = ['posting_date', 'PR_cost']
    punched_mean1 = pd.merge(left=punched_mean1,
                             right=calendar,
                             how='left',
                             left_on='posting_date',
                             right_on='calendar_date').drop('calendar_date',
                                                            axis=1)
    # Historical (train-period) smoothed mean cost ...
    tmp = punched.groupby(['wm_yr_wk_nbr', 'posting_date'])['cost'].mean()
    tmp = tmp.reset_index()
    tmp.columns = ['wm_yr_wk_nbr', 'posting_date', 'PR_cost']
    tmp = tmp.loc[tmp.wm_yr_wk_nbr <= end_train_date]
    tmp['PR_cost'] = 0.5 * tmp['PR_cost'] + 0.25 * tmp['PR_cost'].shift(
        1) + 0.25 * tmp['PR_cost'].shift(2)
    ty = tmp['PR_cost'].mean()
    tmp[['PR_cost']] = tmp[['PR_cost']].fillna(value=ty)
    # ... concatenated with the forecast period.
    punched_mean = pd.concat([tmp, punched_mean1], axis=0)
    if missing_val not in punched_mean.wm_yr_wk_nbr.tolist():
        tu = [
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 0] +
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, 2)].iloc[0, 0]
        ]
        tu.append(punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
            missing_val, -2)].iloc[0, 1] + timedelta(days=14))
        tu.append(missing_val)
        punched_mean.loc[-1] = tu  # adding a row
        punched_mean.index = punched_mean.index + 1  # shifting index
    punched_mean = punched_mean.sort_values(
        by='wm_yr_wk_nbr').reset_index().drop('index', axis=1)
    punched = punched.drop('posting_date', axis=1)
    punched_pro = punched_df.groupby(['club_nbr', 'posting_date'])['cost'].sum()
    # NOTE(review): same `column` (not `columns`) attribute as above.
    punched_pro.column = ['total_punched_wg']
    punched_pro = punched_pro.reset_index()
    punched_pro = pd.merge(left=punched_pro,
                           right=calendar,
                           how='left',
                           left_on='posting_date',
                           right_on='calendar_date').drop('calendar_date',
                                                          axis=1)
    punched_pro = removehurricane('cost', punched_pro, 201733, 201739,
                                  sales=False)
    # 201735 is Maria Hurrican Missing
    # 201737 is the Irma Hurricane
    club_ls = punched.club_nbr.unique()
    for club in club_ls:
        pro_club = punched_pro[punched_pro.club_nbr.isin([club])]
        #########################################
        # adding missing value
        if missing_val not in pro_club.wm_yr_wk_nbr.tolist():
            pro_club.loc[-1] = [
                club,
                pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, -2)].iloc[0, 1] + timedelta(days=14),
                0.5 * pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, -2)].iloc[0, 2] +
                0.5 * pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, 2)].iloc[0, 2],
                missing_val
            ]  # adding a row
            pro_club.index = pro_club.index + 1  # shifting index
        ####################################################
        pro_club = pro_club.sort_values(by='posting_date').reset_index().drop(
            'index', axis=1)
        pro_sales = df_propor_PR_ts.loc[df_propor_PR_ts.club == club].drop(
            ['club'], axis=1)
        pro_club = pro_club.drop(['club_nbr', 'posting_date'], axis=1)
        pro_club.columns = ['cost', 'wm_yr_wk_nbr']
        # Club-level sales regressor plus its lags 1 and 2.
        pro_sales['total_sales'] = pro_sales['total_sales_across'] * pro_sales[
            'per_nbr_fc']
        pro_sales = pd.concat(
            [pro_sales] +
            [pro_sales.total_sales.shift(x) for x in range(1, 3)],
            axis=1)
        pro_sales.columns = [
            'wm_yr_wk_nbr', 'per_nbr_fc', 'total_sales_across',
            'total_sales_0', 'sr_1', 'sr_2'
        ]
        #########################################
        # adding missing value
        if missing_val not in pro_sales.wm_yr_wk_nbr.unique().tolist():
            tu = []
            for k in range(len(pro_sales.columns)):
                tu.append(
                    0.5 * pro_sales.loc[pro_sales.wm_yr_wk_nbr == wm_nbr_add(
                        missing_val, -2)].iloc[0, k] +
                    0.5 * pro_sales.loc[pro_sales.wm_yr_wk_nbr == wm_nbr_add(
                        missing_val, 2)].iloc[0, k])
            tu[0] = int(tu[0])
            pro_sales.loc[-1] = tu  # adding a row
            pro_sales.index = pro_sales.index + 1  # shifting index
        pro_sales = pro_sales.sort_values(
            by='wm_yr_wk_nbr').reset_index().drop('index', axis=1)
        pro_sales = pd.merge(left=pro_sales,
                             right=punched_mean,
                             how='right',
                             left_on='wm_yr_wk_nbr',
                             right_on='wm_yr_wk_nbr',
                             validate='1:1')
        pro_sales = pro_sales.drop(['posting_date'], axis=1)
        pro_sales = pro_sales.apply(lambda x: x.fillna(x.mean()), axis=0)
        pro_sales_train = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr <= end_train_date]
        pro_sales_test = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr >= start_test_date]
        # trend
        linear_trend = trend(degree=2, discount=0.98, name='linear_trend', w=8)
        # seasonality
        seasonal26 = seasonality(period=26, discount=1, name='seasonal26',
                                 w=12)
        # control variable
        sales0 = pro_sales_train['total_sales_0'].values.tolist()
        s0 = [[x] for x in sales0]
        sales1 = pro_sales_train['sr_1'].values.tolist()
        s1 = [[x] for x in sales1]
        sales2 = pro_sales_train['sr_2'].values.tolist()
        s2 = [[x] for x in sales2]
        macro = pro_sales_train['PR_cost'].values.tolist()
        m1 = [[x] for x in macro]
        #####################################
        s0 = dynamic(features=s0, discount=0.99, name='sales0', w=8)
        s1 = dynamic(features=s1, discount=0.99, name='sales1',
                     w=6)  # use the actual sales and forecasting sales amount
        s2 = dynamic(features=s2, discount=0.95, name='sales2', w=6)
        m1 = dynamic(features=m1, discount=0.99, name='macro', w=12)
        #e1 = dynamic(features=e1,discount=0.95,name='eff',w=6)
        # Global model: only the macro regressor is actually included.
        drm = dlm(pro_club['cost']) + linear_trend + seasonal26 + autoReg(
            degree=locality, name='ar2', w=6) + m1  #+s0+s1+s2+m1
        drm.fit()
        #testset
        pro_sales_test = pro_sales_test.head(pred_weeks)
        sales0test = pro_sales_test['total_sales_0'].head(
            pred_weeks).values.tolist()
        s0test = [[x] for x in sales0test]
        sales1test = pro_sales_test['sr_1'].head(pred_weeks).values.tolist()
        s1test = [[x] for x in sales1test]
        sales2test = pro_sales_test['sr_2'].head(pred_weeks).values.tolist()
        s2test = [[x] for x in sales2test]
        macrotest = pro_sales_test['PR_cost'].head(pred_weeks).values.tolist()
        m1test = [[x] for x in macrotest]
        #efftest = testset['eff'].head(pred_weeks).values.tolist()
        #e1test = [[x] for x in efftest]
        features = {
            'sales0': s0test,
            'sales1': s1test,
            'sales2': s2test,
            'macro': m1test
        }  #,'eff':e1test}
        (predictMean, predictVar) = drm.predictN(N=pred_weeks,
                                                 date=drm.n - 1,
                                                 featureDict=features)
        #locality
        # Local ("year") model built on the last `locality` weeks only.
        pro_sales = pro_sales.drop(['sr_1', 'sr_2'], axis=1)
        pro_sales['ratio'] = pro_sales['total_sales_0'] / pro_sales[
            'total_sales_across']
        pro_sales['ratio_1'] = pro_sales['ratio'].shift(1)
        pro_sales['ratio_2'] = pro_sales['ratio'].shift(2)
        trainset1_year = pro_club.loc[
            pro_club.wm_yr_wk_nbr <= end_train_date].loc[
                pro_club.wm_yr_wk_nbr >= end_train_date - locality]
        trainset_year = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr <= end_train_date].loc[
                pro_sales.wm_yr_wk_nbr >= end_train_date - locality]
        # NOTE(review): the result of this apply is discarded — likely a
        # missing assignment back to trainset_year.
        trainset_year.apply(lambda x: x.fillna(x.mean()), axis=0)
        linear_trend_year = trend(degree=1,
                                  discount=0.99,
                                  name='linear_trend_year',
                                  w=10)
        sales0_year = trainset_year['ratio'].values.tolist()
        s0_year = [[x] for x in sales0_year]
        # use the forecast of the ratio of each club among total in PR area
        # since this is a local model, the total amount in area can be assumed to be constant.
        sales1_year = trainset_year['ratio_1'].values.tolist()
        s1_year = [[x] for x in sales1_year]
        sales2_year = trainset_year['ratio_2'].values.tolist()
        s2_year = [[x] for x in sales2_year]
        macro_year = trainset_year['PR_cost'].values.tolist()
        m1_year = [[x] for x in macro_year]
        #####################################
        s0_year = dynamic(features=s0_year, discount=0.99, name='sales0_year',
                          w=10)
        s1_year = dynamic(features=s1_year, discount=0.99, name='sales1_year',
                          w=8)
        s2_year = dynamic(features=s2_year, discount=0.95, name='sales2_year',
                          w=6)
        m1_year = dynamic(features=m1_year, discount=0.99, name='macro_year',
                          w=10)
        #e1_year = dynamic(features=e1_year,discount=0.95,name='eff_year',w=6)
        if enable_sales:
            drm_year = dlm(trainset1_year['cost']) + autoReg(
                degree=locality, name='ar2', w=5
            ) + linear_trend_year + m1_year + s0_year + s1_year + s2_year
        else:
            drm_year = dlm(trainset1_year['cost']) + autoReg(
                degree=locality, name='ar2',
                w=5) + linear_trend_year + m1_year  #+s0_year+s1_year+s2_year
        drm_year.fit()
        testset_year = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr >= start_test_date].head(pred_weeks)
        sales0test = testset_year['ratio'].head(pred_weeks).values.tolist()
        s0test = [[x] for x in sales0test]
        sales1test = testset_year['ratio_1'].head(pred_weeks).values.tolist()
        s1test = [[x] for x in sales1test]
        sales2test = testset_year['ratio_2'].head(pred_weeks).values.tolist()
        s2test = [[x] for x in sales2test]
        # NOTE(review): 'macro_year' reuses m1test built for the global
        # model above — confirm this is intended.
        features_year = {
            'sales0_year': s0test,
            'sales1_year': s1test,
            'sales2_year': s2test,
            'macro_year': m1test
        }
        (predictMean_year,
         predictVar_year) = drm_year.predictN(N=pred_weeks,
                                              date=drm_year.n - 1,
                                              featureDict=features_year)
        weeklist = []
        # Blend weight decays with distance from the start of the year.
        p1 = np.exp(-r * (abs(end_train_date - start_of_this_year - 52)))
        p2 = 1 - p1
        for k in range(pred_weeks):
            weeklist.append(wm_nbr_add(start_test_date, 2 * k))
        if res.shape[0] == 0:
            res['wm_yr_wk_nbr'] = weeklist
            res['club'] = pd.Series(club * np.ones(pred_weeks),
                                    index=res.index)
            res['yhat'] = pd.Series(p1 * np.asarray(predictMean) +
                                    p2 * np.asarray(predictMean_year),
                                    index=res.index)
        else:
            tmp = pd.DataFrame()
            tmp['wm_yr_wk_nbr'] = weeklist
            tmp['club'] = pd.Series(club * np.ones(pred_weeks),
                                    index=tmp.index)
            tmp['yhat'] = pd.Series(p1 * np.asarray(predictMean) +
                                    p2 * np.asarray(predictMean_year),
                                    index=tmp.index)
            res = pd.concat([res, tmp], axis=0)
    return res
import math

import pandas as pd
import scipy.stats

# Load the daily-users series as a Series. .squeeze("columns") replaces
# the read_csv(squeeze=True) option removed in pandas 2.0.
series = pd.read_csv('daily-users.csv',
                     header=0,
                     parse_dates=[0],
                     index_col=0).squeeze("columns")
# Use just last 90 days (.iloc replaces the removed .ix indexer)
series = series.iloc[-90:]

from pydlm import dlm, trend, seasonality

# Constant level plus weekly seasonality.
constant = trend(degree=0, name="constant")
seasonal_week = seasonality(period=7, name='seasonal_week')
model = dlm(series) + constant + seasonal_week
model.tune()
model.fit()

# Forecast one day
predictions, conf = model.predictN(N=1)
print("Prediction for next day: %.2f, confidence: %s" % (predictions[0], conf[0]))

# Interactively score actual observations against the forecast.
# BUG FIX: 'math' was used below but never imported.
while True:
    actual = float(input("Actual value? "))
    zscore = (actual - predictions[0]) / math.sqrt(conf[0])
    print("Z-score: %.2f" % zscore)
    # Two-sided p-value from the standard normal survival function.
    pvalue = scipy.stats.norm.sf(abs(zscore)) * 2
    print("p-value: %.2f" % pvalue)
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # for saving figures
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Load the daily-users series as a Series. .squeeze("columns") replaces
# the read_csv(squeeze=True) option removed in pandas 2.0.
series = pd.read_csv('daily-users.csv',
                     header=0,
                     parse_dates=[0],
                     index_col=0).squeeze("columns")
# group daily counts into monthly
series = series.groupby(pd.Grouper(freq='M')).sum()

from pydlm import dlm, trend, seasonality, longSeason

# Constant level plus annual (12-month) seasonality.
constant = trend(degree=0, name="constant")
seasonal_month = seasonality(period=12, name='seasonal_month')
# BUG FIX: .loc replaces the .ix indexer removed from pandas (>= 1.0)
# for this label-based date slice.
model = dlm(series.loc['2015-01-01':'2016-12-31']) + constant + seasonal_month
model.tune()
model.fit()

model.turnOff('data points')
model.turnOff('confidence interval')
model.plot()
plt.savefig('bayesian-monthly.png', dpi=300, bbox_inches='tight',
            pad_inches=0)
plt.close()
print(model.getMSE())

model.turnOff('predict plot')
model.turnOff('filtered plot')
model.plot('constant')
def monthly_pydlm_model(prod, cus_no, mat_no, min_train_days=731, test_points=1, **kwargs):
    """
    Rolling-origin cross validation of a pydlm (Bayesian DLM) model on
    monthly-aggregated quantities for one customer/material pair.

    NOTE(review): reconstructed from a whitespace-flattened source; the exact
    position of the while-loop boundary could not be recovered and should be
    confirmed against the original file.

    :param prod: data (DataFrame with 'dt_week' and 'quantity' columns)
    :param cus_no: customer number
    :param mat_no: product number
    :param min_train_days: Min training data from where cross validation starts
    :param test_points: number of points ahead prediction (for the time max is 1)
    :param kwargs: provide dir_name to save images and error excel
    :return: returns a data frame containing cross validation result
    """
    import pandas as pd
    import numpy as np
    import itertools
    import warnings
    import statsmodels.api as sm
    from fbprophet import Prophet
    from pydlm import dlm, trend, seasonality, dynamic, autoReg, longSeason, modelTuner
    from dateutil import parser
    import datetime as dt

    # data transform: normalise column names, parse dates, coerce quantities
    prod = prod.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
    prod = prod[['ds', 'y']]
    prod.ds = prod.ds.apply(str).apply(parser.parse)
    prod.y = prod.y.apply(float)
    prod = prod.sort_values('ds')
    prod = prod.reset_index(drop=True)
    # drop the first and last rows (presumably partial periods — TODO confirm)
    prod = prod.drop(prod.index[[0, len(prod.y) - 1]]).reset_index(drop=True)
    prod = get_monthly_aggregate_per_product(prod)

    # save plot of the raw aggregated series (only when a dir_name is supplied)
    if ('dir_name' in kwargs.keys()):
        dir_name = kwargs.get('dir_name')
        one_dim_save_plot(x=prod.ds, y=prod.y, xlable="Date", ylable="quantity",
                          title="raw_weekly_aggregated_quantity",
                          dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)

    # Remove outliers via moving-average replacement (3 passes, 2.5 sigma)
    if ('dir_name' in kwargs.keys()):
        dir_name = kwargs.get('dir_name')
        prod = ma_replace_outlier(data=prod, n_pass=3, aggressive=True,
                                  window_size=6, sigma=2.5,
                                  dir_name=dir_name, mat_no=mat_no, cus_no=cus_no)
    else:
        prod = ma_replace_outlier(data=prod, n_pass=3, aggressive=True,
                                  window_size=6, sigma=2.5)

    # save plot of the outlier-cleaned series
    if ('dir_name' in kwargs.keys()):
        dir_name = kwargs.get('dir_name')
        one_dim_save_plot(x=prod.ds, y=prod.y, xlable="Date", ylable="quantity",
                          title="weekly_aggregated_quantity_outlier_replaced",
                          dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)

    # test and train data creation: train covers everything up to
    # min_train_days before the last observation; test is the next
    # test_points rows; rem_data is what is still left to roll over.
    train = prod[prod.ds <= (np.amax(prod.ds) - pd.DateOffset(
        days=(np.amax(prod.ds) - np.amin(prod.ds)).days - min_train_days))]
    test = prod[(np.amax(np.array(train.index)) + 1):
                (np.amax(np.array(train.index)) + 1 + test_points)]
    rem_data = prod[(np.amax(np.array(train.index)) + test_points):]

    output_result = pd.DataFrame()
    output_error = pd.DataFrame(columns=['cus_no', 'mat_no', 'rmse', 'mape',
                                         '3mre_med', '3mre_max',
                                         '4mre_med', '4mre_max',
                                         'cum_error', 'cum_quantity',
                                         'period_days'])

    try:
        # rolling-origin loop: fit on train, predict one step, roll forward
        while (len(rem_data.ds) >= test_points):
            train_pydlm = train.set_index('ds', drop=True)
            # test_pydlm = test.set_index('ds', drop=True)

            # Modeling
            myDLM = dlm(train_pydlm.y)
            # add a degree-3 polynomial trend with prior covariance 1.0
            # (NOTE(review): name 'quadratic' is a misnomer for degree=3)
            myDLM = myDLM + trend(degree=3, name='quadratic', w=1.0)
            # add a 12 month seasonality with prior covariance 0.0
            myDLM = myDLM + seasonality(12, name='12month', w=0.0)
            # add a 3 step auto regression
            myDLM = myDLM + autoReg(degree=3, data=train_pydlm.y, name='ar2', w=1.0)
            # # show the added components
            # myDLM.ls()
            # # fit forward filter
            # myDLM.fitForwardFilter()
            # # fit backward smoother
            # myDLM.fitBackwardSmoother()
            # myTuner = modelTuner(method='gradient_descent', loss='mse')
            # tunedDLM = myTuner.tune(myDLM, maxit=100)
            # tuned_discounts = myTuner.getDiscounts()
            # print(tuned_discounts)
            # tunedDLM.fit()
            myDLM.fit()
            # myDLM.tune()
            # myDLM.plot()
            # plot the results
            # if ('dir_name' in kwargs.keys()):
            #     dir_name = kwargs.get('dir_name')
            #     fig = plt.figure()
            #     myDLM.plot()
            #     # fig = plot.figure()
            #     plt.savefig(dir_name +str(cus_no)+"_"+str(mat_no)+ '_model_fit.png')
            #     plt.close(fig)
            # # plot only the filtered results
            # myDLM.turnOff('smoothed plot')
            # myDLM.plot()
            # # plot in one figure
            # myDLM.turnOff('multiple plots')
            # myDLM.plot()

            # one-step-ahead prediction from the last fitted date
            (predictMean, predictVar) = myDLM.predict(date=myDLM.n - 1)
            # (predictMean1, predictVar1) = myDLM.continuePredict()
            # print(predictMean.item((0,0)))
            # print(predictMean1.item((0,0)))
            # print(type(predictVar))

            # NOTE(review): plain alias, not a copy — this also mutates `test`
            result_test = test
            result_test['y_pydlm'] = np.array([predictMean.item((0, 0))])
            # clamp negative forecasts to zero
            result_test.loc[(result_test['y_pydlm'] < 0), 'y_pydlm'] = 0

            print('Next Test Starts...')
            # roll the origin forward by test_points rows
            train = prod[:(np.amax(np.array(train.index)) + 1 + test_points)]
            test = prod[(np.amax(np.array(train.index)) + 1):
                        (np.amax(np.array(train.index)) + 1 + test_points)]
            rem_data = prod[(np.amax(np.array(train.index)) + test_points):]

            output_result = pd.concat([output_result, result_test], axis=0)

        # aggregate per-fold results into error columns, then summarise
        output_result = monthly_pydlm_model_error_calculator(output_result)
        output_error = pd.DataFrame(
            data=[[cus_no, mat_no,
                   rmse_calculator(output_result.y_pydlm, output_result.y),
                   mape_calculator(output_result.y_pydlm, output_result.y),
                   np.nanmedian(output_result.rolling_3month_percent_error),
                   np.nanmax(np.absolute(
                       np.array(output_result.rolling_3month_percent_error))),
                   np.nanmedian(output_result.rolling_4month_percent_error),
                   np.nanmax(np.absolute(
                       np.array(output_result.rolling_4month_percent_error))),
                   output_result['Error_Cumsum'].iloc[-1],
                   output_result['cumsum_quantity'].iloc[-1],
                   ((np.amax(output_result.ds) - np.amin(output_result.ds)).days + 30)]],
            columns=['cus_no', 'mat_no', 'rmse', 'mape',
                     '3mre_med', '3mre_max', '4mre_med', '4mre_max',
                     'cum_error', 'cum_quantity', 'period_days'])

        if ('dir_name' in kwargs.keys()):
            dir_name = kwargs.get('dir_name')
            try:
                # plot prediction vs observed
                two_dim_save_plot(x1=output_result.ds, y1=output_result.y_pydlm,
                                  y1_label='pydlm_pred',
                                  x2=output_result.ds, y2=output_result.y,
                                  y2_label='observed',
                                  xlable="Date", ylable="quantity",
                                  title="pydlm_prediction",
                                  dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)
                # plot cumulative error
                one_dim_save_plot(x=output_result.ds, y=output_result.Error_Cumsum,
                                  xlable="Date", ylable="% Cumulative Error",
                                  title="cumulative_error",
                                  dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)
                # plot 3-month rolling error
                one_dim_save_plot(x=output_result.ds,
                                  y=output_result.rolling_3month_percent_error,
                                  xlable="Date", ylable="% 3 Month Rolling Error",
                                  title="3month_rolling_error",
                                  dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)
            except ValueError:
                print("No points to plot")
    except np.linalg.linalg.LinAlgError:
        # pydlm's Kalman updates can fail on degenerate covariance matrices
        print("could not fit")

    return (output_error)
def _tune(self, y, period, x=None, metric="smape", val_size=None, verbose=False):
    """
    Tune hyperparameters of the model by grid search on a validation split.

    :param y: pd.Series or 1-D np.array, time series to predict.
    :param period: Int or Str, the number of observations per cycle: 1 or "annual" for yearly data,
        4 or "quarterly" for quarterly data, 7 or "daily" for daily data,
        12 or "monthly" for monthly data, 24 or "hourly" for hourly data,
        52 or "weekly" for weekly data. First-letter abbreviations of strings work as well
        ("a", "q", "d", "m", "h" and "w", respectively).
        Additional reference: https://robjhyndman.com/hyndsight/seasonal-periods/.
    :param x: pd.DataFrame or 2-D np.array, exogeneous predictors, optional
    :param metric: Str, the metric used for model selection, resolved via get_metric
        (e.g. "smape", "mse", "mae").
    :param val_size: Int, the number of most recent observations to use as validation
        set for tuning. Defaults to 10% of the series length.
    :param verbose: Boolean, True for printing additional info while tuning.
    :return: None (stores the winning permutation in self.params).
    """
    # Normalise string periods ("monthly", "m", ...) to an integer cycle length.
    # isinstance() instead of type(...) == str also accepts str subclasses.
    self.period = data_utils.period_to_int(period) if isinstance(period, str) else period

    # Default validation window: most recent 10% of the series.
    val_size = int(len(y) * .1) if val_size is None else val_size
    y_train, y_val = model_utils.train_val_split(y, val_size=val_size)
    if x is not None:
        x_train, x_val = model_utils.train_val_split(x, val_size=val_size)
    metric_fun = get_metric(metric)

    params_grid = {
        "trend": [0, 1, 2, 3],
        "ar": [None],
        # "ar": [None, 1, 2, 3],
    }
    params_keys, params_values = zip(*params_grid.items())
    params_permutations = [
        dict(zip(params_keys, v)) for v in itertools.product(*params_values)
    ]

    scores = []
    for permutation in params_permutations:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # Rebuild the DLM from scratch for each candidate permutation.
                model = pydlm.dlm(y_train)
                model = model + pydlm.trend(degree=permutation["trend"], discount=0.5)
                model = model + pydlm.seasonality(period=self.period, discount=0.99)
                if permutation["ar"] is not None:
                    model = model + pydlm.autoReg(degree=permutation["ar"], discount=0.99)
                if x is not None:
                    # One pydlm.dynamic component per exogenous column,
                    # named by its column index.
                    for variable_id, x_variable in enumerate(x_train.T):
                        model = model + pydlm.dynamic(
                            features=[[v] for v in x_variable],
                            discount=0.99,
                            name=str(variable_id))
                with SuppressStdoutStderr():
                    model.tune()
                    model.fit()

            # Exogenous features for the forecast horizon, keyed like above.
            if x is not None:
                x_val_dict = {}
                for variable_id, x_variable in enumerate(x_val.T):
                    x_val_dict.update(
                        {str(variable_id): [[v] for v in x_variable]})
            else:
                x_val_dict = None

            y_pred = model.predictN(date=model.n - 1, N=len(y_val),
                                    featureDict=x_val_dict)[0]
            score = metric_fun(y_val, y_pred)
            scores.append(score)
        except Exception:
            # A failed fit/forecast for one permutation must not abort the
            # whole grid search; score it as infinitely bad instead.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.)
            scores.append(np.inf)

    best_params = params_permutations[np.nanargmin(scores)]
    self.params.update(best_params)
    self.params["tuned"] = True