def fit(self, nsamples=5000):
    """Fit a first-order-trend DLM to the log-series and build percentile
    projections via Monte-Carlo sampling of the predictive distribution.

    :param nsamples: number of Monte-Carlo draws per time step.
    :return: None. Sets self.myDLM (fitted model), self.y_proj (25/50/75th
        percentiles of exp(y), shape (n_all, 3)), self.cov (smoothed latent
        covariances) and self.coef (latent states).
    """
    myDLM = dlm(self.yln)
    myDLM = myDLM + trend(degree=1, discount=0.9, name='trend1')
    myDLM.fit()
    # One-step-ahead predictions and variances over the training range.
    results = np.array(myDLM.result.predictedObs)[:, 0, 0]
    results_var = np.array(myDLM.result.predictedObsVar)[:, 0, 0]
    # Forecast beyond the last observation.
    predicted, predicted_var = myDLM.predictN(self.forecast_length - 1,
                                              myDLM.n - 1)
    # TODO: how to deal with uncertainties (was marked INCOMPLETE)
    coef = np.array(myDLM.getLatentState())
    cov = myDLM.result.smoothedCov
    self.myDLM = myDLM

    yln_all = np.append(results, predicted)
    yln_all_var = np.append(results_var, predicted_var)
    nall = len(yln_all)
    # Draw nsamples values per time step: mean + std * N(0, 1).
    # BUG FIX: the original used np.tile(...).reshape(nall, nsamples), which
    # repeats the whole vector and then reshapes row-major, so each row mixed
    # means/variances from different time steps. Broadcasting column vectors
    # keeps every row tied to its own time step.
    yln_models = (np.random.randn(nall, nsamples) * np.sqrt(yln_all_var)[:, None]
                  + yln_all[:, None])
    y_models = np.exp(yln_models)  # back-transform from the log scale
    self.y_proj = np.percentile(y_models, [25, 50, 75], axis=1).T
    self.cov = cov
    self.coef = coef
def forecazt(datos, predice, zeazon):
    """Forecast `predice` steps ahead with a pydlm trend+seasonality model,
    after cleaning the series with R's tsclean (via rpy2).

    :param datos: sequence of observations.
    :param predice: number of steps to forecast.
    :param zeazon: seasonal period (observations per cycle).
    :return: DataFrame with columns 'optimista' (95% upper bound),
        'conservador' (point forecast) and 'pesimista' (95% lower bound),
        each holding the cleaned history followed by the forecasts.
    """
    qq = scipy.stats.norm.ppf(0.5 * (1 + 0.95))  # two-sided 95% quantile
    # Build the R vector literal "c(x1, x2, ...)" from the data.
    lysta = "c(" + ", ".join(str(d) for d in datos) + ")"
    ro.r('datin <- ' + lysta)
    ro.r("tdatin <- ts(datin, start = c(2012,1), frequency = " + str(zeazon) + ")")
    datos = ro.r("tdatin <- tsclean(tdatin)")  # outlier-cleaned series
    # Esta es la tendencia -> trend + seasonality model on the cleaned series.
    n1 = datos
    m1 = dlm(n1) + trend(1, discount=1, name='a') + seasonality(zeazon, discount=1, name='b')
    m1.fit()
    cons = list(n1)
    opti = list(n1)
    pesi = list(n1)
    for i in range(predice):
        if i == 0:
            (p1Mean, p1Var) = m1.predict(date=m1.n - 1)
        else:
            (p1Mean, p1Var) = m1.continuePredict()
        # pydlm returns 1x1 matrices; the scalar is parsed out of the repr.
        # FIX: np.float was removed in NumPy 1.24 -> use the builtin float.
        mean1 = str(p1Mean[[0]])[3:]
        mean2 = float(mean1[:-2])
        cons.append(mean2)
        vari1 = str(np.sqrt(p1Var[[0]]))[3:]
        vari2 = float(vari1[:-2])
        opti.append(mean2 + qq * vari2)
        pesi.append(mean2 - qq * vari2)
    df = pd.DataFrame()
    df['optimista'] = opti
    df['conservador'] = cons
    df['pesimista'] = pesi
    return df
def fit(self, y, period, x=None, metric="smape", val_size=None, verbose=False):
    """
    Build the model using best-tuned hyperparameter values.

    :param y: pd.Series or 1-D np.array, time series to predict.
    :param period: Int or Str, the number of observations per cycle: 1 or "annual" for yearly data,
    4 or "quarterly" for quarterly data, 7 or "daily" for daily data, 12 or "monthly" for monthly data,
    24 or "hourly" for hourly data, 52 or "weekly" for weekly data. First-letter abbreviations of strings
    work as well ("a", "q", "d", "m", "h" and "w", respectively).
    Additional reference: https://robjhyndman.com/hyndsight/seasonal-periods/.
    :param x: pd.DataFrame or 2-D np.array, exogeneous predictors, optional
    :param metric: Str, the metric used for model selection, e.g. "smape" (the default),
    "mse" (mean squared error), "mae" (mean absolute error) — must be a metric supported
    by self._tune (the original docstring listed only "mse"/"mae", which contradicted the default).
    :param val_size: Int, the number of most recent observations to use as validation set for tuning.
    :param verbose: Boolean, True for printing additional info while tuning.
    :return: None
    """
    self.y = y
    self.name = "Bayesian Dynamic Linear Model"
    self.key = "bdlm"
    # Tune hyperparameters first; presumably fills self.params and
    # self.period used below — confirm against _tune's implementation.
    self._tune(y=y, period=period, x=x, metric=metric, val_size=val_size, verbose=verbose)
    # Assemble the pydlm model: trend + seasonality (+ optional AR and
    # one dynamic component per exogenous column).
    self.model = pydlm.dlm(y)
    self.model = self.model + pydlm.trend(degree=self.params["trend"], discount=0.5)
    self.model = self.model + pydlm.seasonality(period=self.period, discount=0.99)
    if self.params["ar"] is not None:
        self.model = self.model + pydlm.autoReg(degree=self.params["ar"], discount=0.99)
    if x is not None:
        # pydlm's dynamic component expects each feature as a list of
        # single-value lists.
        for variable_id, x_variable in enumerate(x.T):
            self.model = self.model + pydlm.dynamic(
                features=[[v] for v in x_variable], discount=0.99, name=str(variable_id))
    # pydlm's tuner prints heavily; suppress its output.
    with SuppressStdoutStderr():
        self.model.tune()
    self.model.fit()
def dlm_exogenous_r3(y, s, k, a, t, e, r):
    """ One way to use dlm (pydlm skater with exogenous regressors).

        Maintains a pydlm model (trend + seasonality + fixed AR + dynamic
        exogenous component) inside the state dict `s`, appending each new
        observation and periodically re-tuning via a recursive maintenance
        call with y=None.

        :param y: observation (possibly with exogenous values appended), or
            None for a maintenance (tune/fit) call.
        :param s: state dict carried between calls (empty/falsy on first call).
        :param k: prediction horizon, forwarded to the prediction helper.
        :param a, t, e, r: skater convention args; r selects hyper-parameters,
            e is the computation budget used on maintenance calls.
        :returns: x, s', w
    """
    if not s:
        # First call: initialize state and build the model.
        s = dict()
        s['dim'] = dimension(y)
        s = dlm_set_exog_hyperparams(s=s, r=r)
        y0, exog = split_exogenous(y=y)
        s['n_obs'] = 0
        s['model'] = quietDlm([], printInfo=False) + trend(
            s['trend_degree'], s['discount']) + seasonality(
                s['period'], s['discount'])
        s['model'] = s['model'] + fixedAutoReg(
            degree=s['auto_degree'], name='ar', w=1.0)
        if exog:
            # pydlm represents missing feature values as None.
            exog_wrapped = [[None if np.isnan(ex0) else ex0 for ex0 in exog]]
            s['model'] = s['model'] + dynamic(features=exog_wrapped,
                                              discount=0.99,
                                              name='exog')  # Set's first exog

    if y is not None:
        # Data step: append the observation (and exog features) and predict.
        y = wrap(y)
        assert dimension(y) == s['dim'], 'Cannot change dimension of data sent'
        s['n_obs'] += 1
        y0, exog = split_exogenous(y=y)
        y0_passed_in = None if np.isnan(
            y0) else y0  # pydlm uses None for missing values
        s['model'].append([y0_passed_in])
        if exog:
            exog_wrapped = [[None if np.isnan(ex0) else ex0 for ex0 in exog]]
            if s['n_obs'] > 1:
                s['model'].append(
                    data=exog_wrapped,
                    component='exog')  # Don't get first exog twice
        num_obs = len(s['model'].data) if s.get('model') else 0
        if num_obs % s['n_fit'] == s['n_fit'] - 1:
            # Periodic maintenance: recurse with y=None to tune/fit.
            _, _, s = dlm_exogenous_r3(y=None, s=s, k=k, a=a, t=t, e=10, r=r)
        s['model'].fitForwardFilter()
        return _dlm_exog_prediction_helper(s=s, k=k, y=y)

    if y is None:
        # Maintenance step: tune discount factors (univariate only), refit.
        if dimension(y) == 1:
            s['model'].tune(maxit=20)
        # Don't tune if exogenous ... haven't got this to work
        s['model'].fit()
        return None, None, s
def dlm_univariate_r3(y, s: dict, k: int, a=None, t=None, e=None, r=None):
    """ Univariate filter

         - Uses the discounting method of H/W so, doesn't need to be fit as often
         - Discount factors are periodically tuned
         - The hyper-parameter controls 'auto_degree', 'trend_degree', 'period'

        :param y: observation, or None for a maintenance (tune/fit) call.
        :param s: state dict carried between calls (empty/falsy on first call).
        :param k: prediction horizon, forwarded to the prediction helper.
        :param e: computation budget; tuning only happens when e > 60.
        :returns: x, x_std, s
    """
    assert r is not None, 'Requires hyper-parameter (interpreted in dimension 3) '
    if not s:
        # First call: build the pydlm model (trend + seasonality + fixed AR).
        s = dict()
        s = dlm_set_univariate_params(s=s, r=r)
        s['dim'] = dimension(y)
        s['n_obs'] = 0
        s['model'] = dlm([], printInfo=False) + trend(
            s['trend_degree'], s['discount']) + seasonality(
                s['period'], s['discount'])
        s['model'] = s['model'] + fixedAutoReg(
            degree=s['auto_degree'], name='ar', w=1.0)

    if y is not None:
        # Data step: append the new observation, then predict k ahead.
        s['n_obs'] += 1
        assert isinstance(y, float) or len(
            y) == s['dim'], ' Cannot change dimension of input in flight '
        y0, exog = split_exogenous(y=y)
        y0_passed_in = None if np.isnan(
            y0) else y0  # pydlm uses None for missing values
        s['model'].append([y0_passed_in])
        num_obs = len(s['model'].data) if s.get('model') else 0
        if num_obs % s['n_fit'] == s['n_fit'] - 1:
            # Perform periodic tuning of discount factors
            _, _, s = dlm_univariate_r3(y=None, s=s, k=k, a=a, t=t, e=1000, r=r)
        s['model'].fitForwardFilter()
        return _dlm_prediction_helper(s=s, k=k, y=y)

    if y is None and e > 60:
        # Maintenance step: only when enough computation budget is allotted.
        s['model'].tune()  # Tunes discount factors
        s['model'].fit()
        return None, None, s
def SerBayes(sDay, nAhead, x0, hWeek):
    """Fit a Bayesian DLM (trend + time regressor + AR(3)) to sDay['y'] and
    plot the fit and each component.

    :param sDay: DataFrame-like with a 'y' series indexed by date strings.
    :param nAhead: unused in the visible body (kept for interface compat).
    :param x0: unused in the visible body (kept for interface compat).
    :param hWeek: unused in the visible body (kept for interface compat).
    :return: None (produces matplotlib plots as a side effect).
    """
    # FIX: pd.datetime was deprecated in pandas 0.25 and removed in 2.0;
    # use the stdlib datetime class directly.
    from datetime import datetime

    dta = sDay['y']
    dta.index = [datetime.strptime(str(x)[0:10], '%Y-%m-%d') for x in dta.index]
    # Scaled epoch seconds, used both as the index and as the regressor.
    t_line = [float(calendar.timegm(x.utctimetuple())) / 1000000 for x in dta.index]
    dta.index = t_line
    model = pydlm.dlm(dta)
    model = model + pydlm.trend(degree=1, discount=0.98, name='a', w=10.0)
    model = model + pydlm.dynamic(features=[[v] for v in t_line], discount=1, name='b', w=10.0)
    model = model + pydlm.autoReg(degree=3, data=dta.values, name='ar3', w=1.0)
    allStates = model.getLatentState(filterType='forwardFilter')
    model.evolveMode('independent')
    model.noisePrior(2.0)
    model.fit()
    model.plot()
    model.turnOff('predict')
    model.plotCoef(name='a')
    model.plotCoef(name='b')
    model.plotCoef(name='ar3')
# Dynamic Linear Models (DLM) with pydlm
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
import numpy as np
import matplotlib.pyplot as plt
import pydlm

# --- Example 1: noisy intercept plus a known control variable -------------
n = 100
a = 1.0 + np.random.normal(0, 5, n)  # the intercept
x = np.random.normal(0, 2, n)        # the control variable
b = 3.0                              # the coefficient
y = a + b * x

# Degree-0 trend tracks the intercept; a dynamic component carries x.
dlm = (pydlm.dlm(y)
       + pydlm.trend(degree=0, discount=0.98, name='a', w=10.0)
       + pydlm.dynamic(features=[[v] for v in x], discount=1, name='b', w=10.0))

# --- Example 2: step data with trend, weekly seasonality and AR(3) --------
data = [0] * 100 + [3] * 100  # randomly generate data
# create the model and add components
dlm = (pydlm.dlm(data)
       + pydlm.trend(1, name='lineTrend', w=1.0)          # covariance=1
       + pydlm.seasonality(7, name='7day', w=1.0)
       + pydlm.autoReg(degree=3, data=data, name='ar3', w=1.0))
dlm.ls()
Disk_Avg.dropna(inplace=True)

from pydlm import dlm, trend, seasonality, dynamic, autoReg, longSeason
import warnings; warnings.simplefilter("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import environ, path
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import unittest

# A linear trend
linear_trend = trend(degree=1, discount=0.95, name='linear_trend', w=10)
# A seasonality with 96 observations per cycle.
# BUG FIX: the component was named 'seasonal52' (a stale copy-paste) while
# being plotted below as 'seasonal96'; the mismatch made plot('seasonal96')
# fail to find the component.
seasonal96 = seasonality(period=96, discount=0.99, name='seasonal96', w=10)

# Build a simple dlm
simple_dlm = dlm(Disk_Avg) + linear_trend + seasonal96
# Fit the model
simple_dlm.fit()

# Plot the fitted results
simple_dlm.turnOff('data points')
simple_dlm.plot()
# Plot each component (attribute the time series to each component)
simple_dlm.turnOff('predict plot')
simple_dlm.turnOff('filtered plot')
simple_dlm.plot('linear_trend')
simple_dlm.plot('seasonal96')
def ts_fit(self, suppress=False):
    """Fit DLM to the time series data.

    Builds a pydlm model from the configured components (trend, seasonality,
    dynamic regressors, autoregression, long season), fits it on the training
    split, and computes residuals, confidence intervals, MSE and fitted values.

    Parameters:
    ----------
    suppress: bool
        Suppress or not some of the output messages

    Returns:
    -------
    self on success, -1 if fitting raised an exception.
    """
    self._prepare_fit()
    self._model = None
    self.ts_split()

    ts_df = self._train_dt.copy()

    # Fit
    self._dlm_logger.info("Trying to fit the DLM model....")
    try:
        if not suppress:
            self._dlm_logger.info("...via using parameters\n")
            print_attributes(self)

        ts_df = ts_df.reset_index()
        ts_df.columns = self._ts_df_cols
        self._model = dlm(ts_df['y'])
        # trend
        if self._dlm_trend is not None:
            self._model = self._model + trend(
                degree=self._dlm_trend['degree'],
                discount=self._dlm_trend['discount'],
                name=self._dlm_trend['name'],
                w=self._dlm_trend['w'])
        # seasonality
        if self._dlm_seasonality is not None:
            self._model = self._model + seasonality(
                period=self._dlm_seasonality['period'],
                discount=self._dlm_seasonality['discount'],
                name=self._dlm_seasonality['name'],
                w=self._dlm_seasonality['w'])
        # dynamic: one component per configured feature dict
        if self._train_dlm_dynamic is not None:
            for i in range(len(self._train_dlm_dynamic['features'])):
                self._model = self._model + dynamic(
                    features=self._train_dlm_dynamic['features'][i]
                    ['features'],
                    discount=self._train_dlm_dynamic['features'][i]
                    ['discount'],
                    name=self._train_dlm_dynamic['features'][i]['name'],
                    w=self._train_dlm_dynamic['features'][i]['w'])
        # auto_reg
        if self._dlm_auto_reg is not None:
            self._model = self._model + autoReg(
                degree=self._dlm_auto_reg['degree'],
                discount=self._dlm_auto_reg['discount'],
                name=self._dlm_auto_reg['name'],
                w=self._dlm_auto_reg['w'])
        # long_season
        if self._dlm_long_season is not None:
            ls = longSeason(period=self._dlm_long_season['period'],
                            stay=self._dlm_long_season['stay'],
                            data=ts_df,
                            name=self._dlm_long_season['name'],
                            w=self._dlm_long_season['w'])
            self._model = self._model + ls

        if not suppress:
            self._dlm_logger.info("The constructed DLM model components:")
            print(self._model.ls())

        # tic
        start = time()
        if self._use_rolling_window:
            self._model.fitForwardFilter(useRollingWindow=True,
                                         windowLength=self._window_size)
            self._model.fitBackwardSmoother()
        else:
            self._model.fit()
        self.model_fit = self._model
        # toc
        if not suppress:
            self._dlm_logger.info("Time elapsed: {} sec.".format(time() - start))
    except (Exception, ValueError) as e:
        self._dlm_logger.exception("DLM error...{}".format(e))
        return -1
    else:
        self._dlm_logger.info("Model successfully fitted to the data!")
        self._dlm_logger.info("Computing fitted values and residuals...")
        # Residuals
        self.residuals = pd.Series(self.model_fit.getResidual(),
                                   index=self._train_dt.index)
        try:
            # NOTE(review): pydlm's getInterval appears to return
            # (upper, lower) — hence index [1] for the lower bound and [0]
            # for the upper; confirm against the installed pydlm version.
            self.lower_conf_int = pd.Series(
                self.model_fit.getInterval()[1],
                index=self._train_dt.index)
            self.upper_conf_int = pd.Series(
                self.model_fit.getInterval()[0],
                index=self._train_dt.index)
        except ValueError as e:
            self._dlm_logger.exception(
                "Something went wrong in getInterval...{}".format(e))
        self.mse = self.model_fit.getMSE()
        # Fitted values
        # this is not elegant, but found no other way
        self.fittedvalues = self._train_dt['y'] + self.residuals

        return self
def monthly_pydlm_model(prod, cus_no, mat_no, min_train_days=731, test_points=1, **kwargs):
    """Rolling-origin cross validation of a pydlm model on monthly aggregates.

    :param prod: data (DataFrame with 'dt_week' and 'quantity' columns)
    :param cus_no: customer number
    :param mat_no: product number
    :param min_train_days: Min training data from where cross validation starts
    :param test_points: number of points ahead prediction(for the time max is 1): need to include
    :param kwargs: provide dir_name to save images and error excel
    :return: returns a data frame containing cross validation result
    """
    import pandas as pd
    import numpy as np
    import itertools
    import warnings
    import statsmodels.api as sm
    from fbprophet import Prophet
    from pydlm import dlm, trend, seasonality, dynamic, autoReg, longSeason, modelTuner
    from dateutil import parser
    import datetime as dt

    # data transform: normalize column names/types, drop the first and last
    # rows (presumably partial periods — confirm), aggregate to monthly.
    prod = prod.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
    prod = prod[['ds', 'y']]
    prod.ds = prod.ds.apply(str).apply(parser.parse)
    prod.y = prod.y.apply(float)
    prod = prod.sort_values('ds')
    prod = prod.reset_index(drop=True)
    prod = prod.drop(prod.index[[0, len(prod.y) - 1]]).reset_index(drop=True)
    prod = get_monthly_aggregate_per_product(prod)

    # save plot (comment)
    if ('dir_name' in kwargs.keys()):
        dir_name = kwargs.get('dir_name')
        one_dim_save_plot(x=prod.ds, y=prod.y,
                          xlable="Date", ylable="quantity",
                          title="raw_weekly_aggregated_quantity",
                          dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)

    # Remove outlier (moving-average replacement; plots only when dir_name given)
    if ('dir_name' in kwargs.keys()):
        dir_name = kwargs.get('dir_name')
        prod = ma_replace_outlier(data=prod, n_pass=3, aggressive=True,
                                  window_size=6, sigma=2.5,
                                  dir_name=dir_name, mat_no=mat_no, cus_no=cus_no)
    else:
        prod = ma_replace_outlier(data=prod, n_pass=3, aggressive=True,
                                  window_size=6, sigma=2.5)

    # save plot (comment)
    if ('dir_name' in kwargs.keys()):
        dir_name = kwargs.get('dir_name')
        one_dim_save_plot(x=prod.ds, y=prod.y,
                          xlable="Date", ylable="quantity",
                          title="weekly_aggregated_quantity_outlier_replaced",
                          dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)

    # test and train data creation: initial train window covers the first
    # min_train_days of history; test is the next `test_points` rows.
    train = prod[prod.ds <= (np.amax(prod.ds) - pd.DateOffset(
        days=(np.amax(prod.ds) - np.amin(prod.ds)).days - min_train_days))]
    test = prod[(np.amax(np.array(train.index)) + 1):(np.amax(np.array(train.index)) + 1 + test_points)]
    rem_data = prod[(np.amax(np.array(train.index)) + test_points):]

    output_result = pd.DataFrame()
    output_error = pd.DataFrame(columns=[
        'cus_no', 'mat_no', 'rmse', 'mape', '3mre_med', '3mre_max',
        '4mre_med', '4mre_max', 'cum_error', 'cum_quantity', 'period_days'
    ])

    try:
        # Walk the origin forward one test window at a time.
        while (len(rem_data.ds) >= test_points):
            train_pydlm = train.set_index('ds', drop=True)
            # test_pydlm = test.set_index('ds', drop=True)

            # Modeling
            myDLM = dlm(train_pydlm.y)
            # add a first-order trend (linear trending) with prior covariance 1.0
            # NOTE(review): degree=3 despite the name 'quadratic' and the
            # comment above saying first-order — confirm intended degree.
            myDLM = myDLM + trend(degree=3, name='quadratic', w=1.0)
            # # add a 12 month seasonality with prior covariance 1.0
            myDLM = myDLM + seasonality(12, name='12month', w=0.0)
            # # add a 3 step auto regression
            myDLM = myDLM + autoReg(
                degree=3, data=train_pydlm.y, name='ar2', w=1.0)
            # # show the added components
            # myDLM.ls()
            # # fit forward filter
            # myDLM.fitForwardFilter()
            # # fit backward smoother
            # myDLM.fitBackwardSmoother()
            # myTuner = modelTuner(method='gradient_descent', loss='mse')
            # tunedDLM = myTuner.tune(myDLM, maxit=100)
            # tuned_discounts = myTuner.getDiscounts()
            # print(tuned_discounts)
            # tunedDLM.fit()
            myDLM.fit()
            # myDLM.tune()
            # myDLM.plot()  # plot the results
            # if ('dir_name' in kwargs.keys()):
            #     dir_name = kwargs.get('dir_name')
            #     fig = plt.figure()
            #     myDLM.plot()
            #     # fig = plot.figure()
            #     plt.savefig(dir_name +str(cus_no)+"_"+str(mat_no)+ '_model_fit.png')
            #     plt.close(fig)
            # # plot only the filtered results
            # myDLM.turnOff('smoothed plot')
            # myDLM.plot()
            # # plot in one figure
            # myDLM.turnOff('multiple plots')
            # myDLM.plot()

            # One-step-ahead prediction from the last training date.
            (predictMean, predictVar) = myDLM.predict(date=myDLM.n - 1)
            # (predictMean1, predictVar1) = myDLM.continuePredict()
            # print(predictMean.item((0,0)))
            # print(predictMean1.item((0,0)))
            # print(type(predictVar))

            result_test = test
            result_test['y_pydlm'] = np.array([predictMean.item((0, 0))])
            # Clamp negative quantity forecasts to zero.
            result_test.loc[(result_test['y_pydlm'] < 0), 'y_pydlm'] = 0

            print('Next Test Starts...')
            # Slide the origin: extend train, advance test and rem_data.
            train = prod[:(np.amax(np.array(train.index)) + 1 + test_points)]
            test = prod[(np.amax(np.array(train.index)) + 1):(np.amax(np.array(train.index)) + 1 + test_points)]
            rem_data = prod[(np.amax(np.array(train.index)) + test_points):]

            output_result = pd.concat([output_result, result_test], axis=0)

        # Aggregate error metrics over all cross-validation folds.
        output_result = monthly_pydlm_model_error_calculator(output_result)
        output_error = pd.DataFrame(data=[[
            cus_no, mat_no,
            rmse_calculator(output_result.y_pydlm, output_result.y),
            mape_calculator(output_result.y_pydlm, output_result.y),
            np.nanmedian(output_result.rolling_3month_percent_error),
            np.nanmax(
                np.absolute(
                    np.array(output_result.rolling_3month_percent_error))),
            np.nanmedian(output_result.rolling_4month_percent_error),
            np.nanmax(
                np.absolute(
                    np.array(output_result.rolling_4month_percent_error))),
            output_result['Error_Cumsum'].iloc[-1],
            output_result['cumsum_quantity'].iloc[-1],
            ((np.amax(output_result.ds) - np.amin(output_result.ds)).days + 30)
        ]], columns=[
            'cus_no', 'mat_no', 'rmse', 'mape', '3mre_med', '3mre_max',
            '4mre_med', '4mre_max', 'cum_error', 'cum_quantity', 'period_days'
        ])

        if ('dir_name' in kwargs.keys()):
            dir_name = kwargs.get('dir_name')
            try:
                # plot prediction vs observed
                two_dim_save_plot(x1=output_result.ds, y1=output_result.y_pydlm, y1_label='pydlm_pred',
                                  x2=output_result.ds, y2=output_result.y, y2_label='observed',
                                  xlable="Date", ylable="quantity",
                                  title="pydlm_prediction",
                                  dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)
                # plot cumulative error
                one_dim_save_plot(x=output_result.ds, y=output_result.Error_Cumsum,
                                  xlable="Date", ylable="% Cumulative Error",
                                  title="cumulative_error",
                                  dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)
                # plot 3-month rolling error
                one_dim_save_plot(x=output_result.ds, y=output_result.rolling_3month_percent_error,
                                  xlable="Date", ylable="% 3 Month Rolling Error",
                                  title="3month_rolling_error",
                                  dir_name=dir_name, cus_no=cus_no, mat_no=mat_no)
            except ValueError:
                print("No points to plot")
    # NOTE(review): np.linalg.linalg is a private alias removed in NumPy 2.0;
    # np.linalg.LinAlgError is the public name — confirm NumPy version in use.
    except np.linalg.linalg.LinAlgError:
        print("could not fit")

    return (output_error)
# Visualize the raw series first.
import matplotlib.pyplot as plt
import pydlm.plot.dlmPlot as dlmPlot

dlmPlot.plotData(range(len(time_series)),
                 time_series,
                 showDataPoint=False,
                 label='raw_data')
plt.legend(loc='best', shadow=True)
plt.show()

# Assemble a minimal model: linear trend plus yearly (52-week) seasonality.
from pydlm import dlm, trend, seasonality

linear_trend = trend(degree=1, discount=0.95, name='linear_trend', w=10)
seasonal52 = seasonality(period=52, discount=0.99, name='seasonal52', w=10)
simple_dlm = dlm(time_series) + linear_trend + seasonal52
simple_dlm.fit()

# Fitted series without the raw data points overlaid.
simple_dlm.turnOff('data points')
simple_dlm.plot()

# Per-component attribution: hide the aggregate curves, then plot each part.
simple_dlm.turnOff('filtered plot')
simple_dlm.turnOff('predict plot')
for component_name in ('linear_trend', 'seasonal52'):
    simple_dlm.plot(component_name)
# Next: predict from the first 350 weeks and forecast the following 200.
def estimate_and_predict_dlm_PR(calendar, df_propor_PR_ts, punched_df, end_train_date,
                                start_test_date, start_of_this_year, enable_sales,
                                pred_weeks=8, locality=10, r=0.05, missing_val=201735):
    '''
    Forecast per-club weekly cost by blending two DLMs: a "global" model fit
    on the full history and a "local" model fit on the last `locality` weeks,
    weighted by exp(-r * |weeks since start of year - 52|).

    accept the forecasting sales_proportion data as one regressor
    df_propor_PR_test: []
    missing_val: a wm_yr_wk_nbr known to be absent (hurricane week); it is
        imputed as the average of the weeks two before and two after.
    return type: DataFrame with prediction result
    return: columns = [wm_yr_wk_nbr,club,yhat]
    '''
    res = pd.DataFrame()
    # Weekly cost per club, joined to the fiscal calendar.
    punched = punched_df.groupby(['club_nbr', 'posting_date'])['cost'].sum()
    punched.column = ['total_punched_wg']
    punched = punched.reset_index()
    punched = pd.merge(left=punched, right=calendar, how='left',
                       left_on='posting_date',
                       right_on='calendar_date').drop('calendar_date', axis=1)
    # mean wage among all clubs
    punched = removehurricane('cost', punched, 201733, 201739, sales=False)
    punched_mean = punched.groupby(['wm_yr_wk_nbr', 'posting_date'])['cost'].mean()
    punched_mean = punched_mean.reset_index()
    punched_mean.columns = ['wm_yr_wk_nbr', 'posting_date', 'cost']
    punched_mean['club_nbr'] = pd.Series(np.ones([punched_mean.shape[0]]))
    ##########################
    # Impute the missing hurricane week as the mean of weeks +/- 2.
    if missing_val not in punched_mean.wm_yr_wk_nbr.tolist():
        punched_mean.loc[-1] = [
            missing_val,
            punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 1] + timedelta(days=14),
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 2] +
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, 2)].iloc[0, 2],
            1
        ]  # adding a row
        punched_mean.index = punched_mean.index + 1
    #########################
    # Smooth the mean-cost series (0.5/0.25/0.25 weighted moving average),
    # then forecast it with Prophet to extend it over the test horizon.
    punched_mean1 = punched_mean.copy(deep=True)
    punched_mean1['cost'] = 0.5 * punched_mean1['cost'] + 0.25 * punched_mean1[
        'cost'].shift(1) + 0.25 * punched_mean1['cost'].shift(2)
    ty = punched_mean1['cost'].mean()
    punched_mean1[['cost']] = punched_mean1[['cost']].fillna(value=ty)
    punched_mean1 = estimate_and_predict_prophet_PR(
        calendar, punched_mean1, end_train_date, start_test_date,
        daily_view=False, pred_days=120)  # predict the mean wages.
    punched_mean1 = punched_mean1.drop('club', axis=1)
    punched_mean1.columns = ['posting_date', 'PR_cost']
    punched_mean1 = pd.merge(left=punched_mean1, right=calendar, how='left',
                             left_on='posting_date',
                             right_on='calendar_date').drop('calendar_date', axis=1)
    # Historical (training-period) smoothed mean cost.
    tmp = punched.groupby(['wm_yr_wk_nbr', 'posting_date'])['cost'].mean()
    tmp = tmp.reset_index()
    tmp.columns = ['wm_yr_wk_nbr', 'posting_date', 'PR_cost']
    tmp = tmp.loc[tmp.wm_yr_wk_nbr <= end_train_date]
    tmp['PR_cost'] = 0.5 * tmp['PR_cost'] + 0.25 * tmp['PR_cost'].shift(
        1) + 0.25 * tmp['PR_cost'].shift(2)
    ty = tmp['PR_cost'].mean()
    tmp[['PR_cost']] = tmp[['PR_cost']].fillna(value=ty)
    punched_mean = pd.concat([tmp, punched_mean1], axis=0)
    # Impute the missing week again on the concatenated series.
    if missing_val not in punched_mean.wm_yr_wk_nbr.tolist():
        tu = [
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 0] +
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, 2)].iloc[0, 0]
        ]
        tu.append(punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
            missing_val, -2)].iloc[0, 1] + timedelta(days=14))
        tu.append(missing_val)
        punched_mean.loc[-1] = tu  # adding a row
        punched_mean.index = punched_mean.index + 1  # shifting index
    punched_mean = punched_mean.sort_values(
        by='wm_yr_wk_nbr').reset_index().drop('index', axis=1)
    punched = punched.drop('posting_date', axis=1)
    # Per-club cost series used to fit the per-club models below.
    punched_pro = punched_df.groupby(['club_nbr', 'posting_date'])['cost'].sum()
    punched_pro.column = ['total_punched_wg']
    punched_pro = punched_pro.reset_index()
    punched_pro = pd.merge(left=punched_pro, right=calendar, how='left',
                           left_on='posting_date',
                           right_on='calendar_date').drop('calendar_date', axis=1)
    punched_pro = removehurricane('cost', punched_pro, 201733, 201739, sales=False)
    # 201735 is Maria Hurrican Missing
    # 201737 is the Irma Hurricane
    club_ls = punched.club_nbr.unique()
    for club in club_ls:
        pro_club = punched_pro[punched_pro.club_nbr.isin([club])]
        #########################################
        # adding missing value
        if missing_val not in pro_club.wm_yr_wk_nbr.tolist():
            pro_club.loc[-1] = [
                club,
                pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, -2)].iloc[0, 1] + timedelta(days=14),
                0.5 * pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, -2)].iloc[0, 2] +
                0.5 * pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, 2)].iloc[0, 2],
                missing_val
            ]  # adding a row
            pro_club.index = pro_club.index + 1  # shifting index
        ####################################################
        pro_club = pro_club.sort_values(by='posting_date').reset_index().drop(
            'index', axis=1)
        pro_sales = df_propor_PR_ts.loc[df_propor_PR_ts.club == club].drop(
            ['club'], axis=1)
        pro_club = pro_club.drop(['club_nbr', 'posting_date'], axis=1)
        pro_club.columns = ['cost', 'wm_yr_wk_nbr']
        # Sales regressors: forecast sales and its 1- and 2-week lags.
        pro_sales['total_sales'] = pro_sales['total_sales_across'] * pro_sales[
            'per_nbr_fc']
        pro_sales = pd.concat(
            [pro_sales] +
            [pro_sales.total_sales.shift(x) for x in range(1, 3)], axis=1)
        pro_sales.columns = [
            'wm_yr_wk_nbr', 'per_nbr_fc', 'total_sales_across',
            'total_sales_0', 'sr_1', 'sr_2'
        ]
        #########################################
        # adding missing value
        if missing_val not in pro_sales.wm_yr_wk_nbr.unique().tolist():
            tu = []
            for k in range(len(pro_sales.columns)):
                tu.append(
                    0.5 * pro_sales.loc[pro_sales.wm_yr_wk_nbr == wm_nbr_add(
                        missing_val, -2)].iloc[0, k] +
                    0.5 * pro_sales.loc[pro_sales.wm_yr_wk_nbr == wm_nbr_add(
                        missing_val, 2)].iloc[0, k])
            tu[0] = int(tu[0])
            pro_sales.loc[-1] = tu  # adding a row
            pro_sales.index = pro_sales.index + 1  # shifting index
        pro_sales = pro_sales.sort_values(
            by='wm_yr_wk_nbr').reset_index().drop('index', axis=1)
        pro_sales = pd.merge(left=pro_sales, right=punched_mean, how='right',
                             left_on='wm_yr_wk_nbr', right_on='wm_yr_wk_nbr',
                             validate='1:1')
        pro_sales = pro_sales.drop(['posting_date'], axis=1)
        pro_sales = pro_sales.apply(lambda x: x.fillna(x.mean()), axis=0)
        pro_sales_train = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr <= end_train_date]
        pro_sales_test = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr >= start_test_date]
        # ---- Global model: full history ----
        # trend
        linear_trend = trend(degree=2, discount=0.98, name='linear_trend', w=8)
        # seasonality
        seasonal26 = seasonality(period=26, discount=1, name='seasonal26', w=12)
        # control variable
        sales0 = pro_sales_train['total_sales_0'].values.tolist()
        s0 = [[x] for x in sales0]
        sales1 = pro_sales_train['sr_1'].values.tolist()
        s1 = [[x] for x in sales1]
        sales2 = pro_sales_train['sr_2'].values.tolist()
        s2 = [[x] for x in sales2]
        macro = pro_sales_train['PR_cost'].values.tolist()
        m1 = [[x] for x in macro]
        #####################################
        s0 = dynamic(features=s0, discount=0.99, name='sales0', w=8)
        s1 = dynamic(features=s1, discount=0.99, name='sales1', w=6)
        # use the actual sales and forecasting sales amount
        s2 = dynamic(features=s2, discount=0.95, name='sales2', w=6)
        m1 = dynamic(features=m1, discount=0.99, name='macro', w=12)
        #e1 = dynamic(features=e1,discount=0.95,name='eff',w=6)
        drm = dlm(pro_club['cost']) + linear_trend + seasonal26 + autoReg(
            degree=locality, name='ar2', w=6) + m1  #+s0+s1+s2+m1
        drm.fit()
        #testset
        pro_sales_test = pro_sales_test.head(pred_weeks)
        sales0test = pro_sales_test['total_sales_0'].head(
            pred_weeks).values.tolist()
        s0test = [[x] for x in sales0test]
        sales1test = pro_sales_test['sr_1'].head(pred_weeks).values.tolist()
        s1test = [[x] for x in sales1test]
        sales2test = pro_sales_test['sr_2'].head(pred_weeks).values.tolist()
        s2test = [[x] for x in sales2test]
        macrotest = pro_sales_test['PR_cost'].head(pred_weeks).values.tolist()
        m1test = [[x] for x in macrotest]
        #efftest = testset['eff'].head(pred_weeks).values.tolist()
        #e1test = [[x] for x in efftest]
        features = {
            'sales0': s0test,
            'sales1': s1test,
            'sales2': s2test,
            'macro': m1test
        }  #,'eff':e1test}
        (predictMean, predictVar) = drm.predictN(N=pred_weeks,
                                                 date=drm.n - 1,
                                                 featureDict=features)
        # ---- Local model: only the last `locality` weeks ----
        pro_sales = pro_sales.drop(['sr_1', 'sr_2'], axis=1)
        pro_sales['ratio'] = pro_sales['total_sales_0'] / pro_sales[
            'total_sales_across']
        pro_sales['ratio_1'] = pro_sales['ratio'].shift(1)
        pro_sales['ratio_2'] = pro_sales['ratio'].shift(2)
        trainset1_year = pro_club.loc[
            pro_club.wm_yr_wk_nbr <= end_train_date].loc[
                pro_club.wm_yr_wk_nbr >= end_train_date - locality]
        trainset_year = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr <= end_train_date].loc[
                pro_sales.wm_yr_wk_nbr >= end_train_date - locality]
        # NOTE(review): result of this apply is discarded (not assigned) —
        # it has no effect; confirm whether assignment was intended.
        trainset_year.apply(lambda x: x.fillna(x.mean()), axis=0)
        linear_trend_year = trend(degree=1, discount=0.99,
                                  name='linear_trend_year', w=10)
        sales0_year = trainset_year['ratio'].values.tolist()
        s0_year = [[x] for x in sales0_year]
        # use the forecast of the ratio of each club among total in PR area
        # since this is a local model, the total amount in area can be assumed to be constant.
        sales1_year = trainset_year['ratio_1'].values.tolist()
        s1_year = [[x] for x in sales1_year]
        sales2_year = trainset_year['ratio_2'].values.tolist()
        s2_year = [[x] for x in sales2_year]
        macro_year = trainset_year['PR_cost'].values.tolist()
        m1_year = [[x] for x in macro_year]
        #####################################
        s0_year = dynamic(features=s0_year, discount=0.99, name='sales0_year', w=10)
        s1_year = dynamic(features=s1_year, discount=0.99, name='sales1_year', w=8)
        s2_year = dynamic(features=s2_year, discount=0.95, name='sales2_year', w=6)
        m1_year = dynamic(features=m1_year, discount=0.99, name='macro_year', w=10)
        #e1_year = dynamic(features=e1_year,discount=0.95,name='eff_year',w=6)
        if enable_sales:
            drm_year = dlm(trainset1_year['cost']) + autoReg(
                degree=locality, name='ar2', w=5
            ) + linear_trend_year + m1_year + s0_year + s1_year + s2_year
        else:
            drm_year = dlm(trainset1_year['cost']) + autoReg(
                degree=locality, name='ar2',
                w=5) + linear_trend_year + m1_year  #+s0_year+s1_year+s2_year
        drm_year.fit()
        testset_year = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr >= start_test_date].head(pred_weeks)
        sales0test = testset_year['ratio'].head(pred_weeks).values.tolist()
        s0test = [[x] for x in sales0test]
        sales1test = testset_year['ratio_1'].head(pred_weeks).values.tolist()
        s1test = [[x] for x in sales1test]
        sales2test = testset_year['ratio_2'].head(pred_weeks).values.tolist()
        s2test = [[x] for x in sales2test]
        # NOTE(review): 'macro_year' reuses m1test from the global model's
        # test features — confirm this is intentional.
        features_year = {
            'sales0_year': s0test,
            'sales1_year': s1test,
            'sales2_year': s2test,
            'macro_year': m1test
        }
        (predictMean_year, predictVar_year) = drm_year.predictN(
            N=pred_weeks, date=drm_year.n - 1, featureDict=features_year)
        # Blend: weight p1 on the global model decays with distance from
        # a full year (52 weeks) since the start of this year.
        weeklist = []
        p1 = np.exp(-r * (abs(end_train_date - start_of_this_year - 52)))
        p2 = 1 - p1
        for k in range(pred_weeks):
            weeklist.append(wm_nbr_add(start_test_date, 2 * k))
        if res.shape[0] == 0:
            res['wm_yr_wk_nbr'] = weeklist
            res['club'] = pd.Series(club * np.ones(pred_weeks), index=res.index)
            res['yhat'] = pd.Series(p1 * np.asarray(predictMean) +
                                    p2 * np.asarray(predictMean_year),
                                    index=res.index)
        else:
            tmp = pd.DataFrame()
            tmp['wm_yr_wk_nbr'] = weeklist
            tmp['club'] = pd.Series(club * np.ones(pred_weeks), index=tmp.index)
            tmp['yhat'] = pd.Series(p1 * np.asarray(predictMean) +
                                    p2 * np.asarray(predictMean_year),
                                    index=tmp.index)
            res = pd.concat([res, tmp], axis=0)
    return res
import math
import pandas as pd
import scipy.stats

# Load the single-column daily-users series.
# FIX: read_csv(squeeze=True) was removed in pandas 2.0 — squeeze the
# one-column frame into a Series explicitly instead.
series = pd.read_csv('daily-users.csv', header=0, parse_dates=[0],
                     index_col=0).squeeze("columns")

# Use just last 90 days
# FIX: .ix was deprecated in pandas 0.20 and removed in 1.0; with a
# datetime index, -90: is positional, so .iloc is the correct replacement.
series = series.iloc[-90:]

from pydlm import dlm, trend, seasonality

# Constant level plus day-of-week seasonality.
constant = trend(degree=0, name="constant")
seasonal_week = seasonality(period=7, name='seasonal_week')
model = dlm(series) + constant + seasonal_week
model.tune()
model.fit()

# Forecast one day
predictions, conf = model.predictN(N=1)
print("Prediction for next day: %.2f, confidence: %s" % (predictions[0], conf[0]))

# Interactive anomaly check: z-score the observed value against the forecast.
while True:
    actual = float(input("Actual value? "))
    zscore = (actual - predictions[0]) / math.sqrt(conf[0])
    print("Z-score: %.2f" % zscore)
    pvalue = scipy.stats.norm.sf(abs(zscore)) * 2
#from pandas_datareader import DataReader
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
#import pyflux as pf
from pydlm import dlm, trend, seasonality, dynamic, autoReg, longSeason

# Step signal: 100 zeros followed by 100 threes.
data = np.array([0] * 100 + [3] * 100)

myDLM = dlm(data)
myDLM = myDLM + trend(degree=1, discount=0.95, name='trend1')
myDLM.fit()

# Latent states, one-step-ahead predictions and their variances.
coef = np.array(myDLM.getLatentState())
results = np.array(myDLM.result.predictedObs)[:, 0, 0]
results_var = np.array(myDLM.result.predictedObsVar)[:, 0, 0]

fig = plt.figure()
ax1 = fig.add_subplot(311)
ax1.plot(coef[:, 0])
ax2 = fig.add_subplot(312)
ax2.plot(coef[:, 1])
ax3 = fig.add_subplot(313)
ax3.plot(results)
ax3.plot(data, marker='o', ls='')
plt.savefig('scratch_result.pdf')
# FIX: the original ended with a dangling triple-quote that opened an
# unterminated string literal (a syntax error unless closed further down);
# removed here — confirm nothing after this point relied on it.
def print_size(obj, level=0):
    """Recursively print an asizeof result tree, indenting one space per depth level.

    :param obj: an ``asizeof.asized`` node exposing ``refs`` (child nodes).
    :param level: current recursion depth, controls indentation.
    """
    print(''.join([' '] * level) + str(obj))
    for el in obj.refs:
        print_size(el, level=level + 1)


def compare_size(obj1, obj2, level=0):
    """Print two asizeof result trees side by side, pairing children by position.

    :param obj1: first ``asized`` node.
    :param obj2: second ``asized`` node.
    :param level: current recursion depth, controls indentation.
    """
    print(''.join([' '] * level) + str(obj1))
    print(''.join([' '] * level) + str(obj2))
    for el1, el2 in zip(obj1.refs, obj2.refs):
        compare_size(el1, el2, level=level + 1)


# Build two equivalent degree-2 trend models — one odlm, one dlm — to
# compare their memory footprints as data is appended incrementally.
model1 = odlm([]) + trend(degree=2, discount=0.95, name='trend1')
model1.stableMode(False)
model2 = dlm([]) + trend(degree=2, discount=0.95, name='trend1')
model2.stableMode(False)

d1 = {}
d2 = {}
# Feed the series (`ts`, defined elsewhere) one point at a time, running the
# forward filter after each append.
for idx, el in enumerate(ts):
    model1.append([el], component='main')
    model1.fitForwardFilter()
    model2.append([el], component='main')
    model2.fitForwardFilter()

# NOTE(review): source was flattened — unclear whether this measurement sits
# inside the loop (per-step sizing) or after it; confirm original indentation.
a1 = asizeof.asized(model1, detail=4)
# Load the per-region arrivals series from the Excel workbook.
data = dataset.load_excel(excel_file, dir="../../datasets")
data = dataset.load_all_regions(data)

df_italy = data["italy"]  # Arrivals to Italy
df_greek_island = data["greek_island"]  # Arrivals to Greek Island
df_mainland_greece = data["mainland_greece"]  # Arrivals to Mainland Greece
df_fyrom = data["fyrom"]  # Arrivals to fYRoM
df_serbia = data["serbia"]  # Arrivals to Serbia
df_croatia = data["croatia"]  # Arrivals to Croatia
df_hungry = data["hungry"]  # Arrivals to Hungary
df_slovenia = data["slovenia"]  # Arrivals to Slovenia
df_austria = data["austria"]  # Arrivals to Austria

df = df_austria  # Series to test
column_name = df.columns[0]

# Fill missing values with 0, then replace every 0 with the previous
# observation (forward fill) so gaps don't read as zero arrivals.
fill_method = "ffill"
df.fillna(0, inplace=True)
df[df.columns[0]] = df[column_name].replace(to_replace=0, method=fill_method)  # Replace 0 in series

# DLM: linear trend plus a period-2 seasonal component.
model = dlm(df[column_name])
model = model + trend(degree=1, discount=0.72, name='trend component')
model = model + seasonality(period=2, discount=0.99, name='seasonality component')
model.fit()
model.plot()

# In-sample predicted observations, flattened to a plain list.
predictions = list(np.array(model.result.predictedObs).flatten())

# NOTE(review): r2_score receives the whole DataFrame rather than the
# modeled column — fine only if df has exactly one column; confirm.
r2 = r2_score(df, predictions)
rmse = np.sqrt(model.getMSE())
print('RMSE:', rmse)
print('R2:', r2)
for i in range(len(test_n_t_inf)): tmp = expected_value_transition_function(state_trajectories[i - 1]) observation_trajectories.append( expected_value_observation_function(tmp)) state_trajectories.append(tmp) state_trajectories = state_trajectories[1:] ## MEAN print(np.mean(observation_trajectories, axis=1)) ## QUANTILES state_trajectories = np.array(state_trajectories).reshape( (len(test_n_t_inf), -1)) else: myDLM = dlm(train_n_t_inf) myDLM = myDLM + trend(1, name='lineTrend', w=1.0) # add a 7 day seasonality with prior covariance 1.0 myDLM = myDLM + seasonality(52, name='7day', w=1.0) # add a 3 step auto regression myDLM = myDLM + autoReg(degree=2, data=train_n_t_inf, name='ar3', w=1.0) myDLM.fit() (predictMean, predictVar) = myDLM.predictN(N=D - 1, date=myDLM.n - 1) for i in range(len(predictMean)): samples = np.random.normal(predictMean[i], np.sqrt(predictVar[i]), 4) state_trajectories.append(samples) state_trajectories = np.array(state_trajectories) phat = trace['a'].mean(axis=0) from scipy.stats import binom
#!/usr/bin/env python import pandas as pd from pydlm import dlm, odlm, trend, seasonality ts = [ 0.5429682543922109, 0.5296058346035057, 0.5403294585554494, 0.542441925561093, 0.5435209708555084, 0.5430676782288945, 0.5429877208796179, 0.5429721282202071, 0.5429690254184671, 0.5449758960859548, 0.5457612294317765, 0.5434065016617284, 0.5430519745276086, 0.5436459000038072, 0.5437794184525637 ] ## Version 1 model = odlm([]) + trend(degree=2, discount=0.95, name='trend1') + seasonality(7) model.stableMode(False) d = {} for idx, el in enumerate(ts): print(el) model.append([el], component='main') model.fitForwardFilter() print() mean, var = model.predictN(N=1, date=model.n - 1) d[idx] = mean df1 = pd.DataFrame.from_dict(d, orient="index") ## Version 2 model = dlm([]) + trend(degree=2, discount=0.95,
def run_pydlm_monthly(cus_no, mat_no, prod, param, **kwargs):
    """Fit pydlm models on monthly-aggregated product demand with a rolling
    out-of-sample evaluation, then refit on all data for a final forecast.

    :param cus_no: customer number (not used in the visible portion).
    :param mat_no: material number (not used in the visible portion).
    :param prod: pd.DataFrame with columns 'dt_week' (date) and 'quantity'.
    :param param: dict of hyper-parameters: trend_degree, trend_w,
        seasonality_w, ar_degree, ar_w.
    :param kwargs: optional overrides min_train_days / test_points /
        pred_points; defaults come from the module-level ``p_model``.
    """
    import pandas as pd
    import numpy as np
    from dateutil import parser
    # NOTE(review): Prophet is imported but never used in this function.
    from fbprophet import Prophet
    from pydlm import dlm, trend, seasonality, dynamic, autoReg, longSeason, modelTuner

    # Resolve run parameters, falling back to module-level defaults.
    if ('min_train_days' in kwargs.keys()):
        min_train_days = kwargs.get('min_train_days')
    else:
        min_train_days = p_model.min_train_days
    if ('test_points' in kwargs.keys()):
        test_points = kwargs.get('test_points')
    else:
        test_points = p_model.test_points_monthly
    if ('pred_points' in kwargs.keys()):
        pred_points = kwargs.get('pred_points')
    else:
        pred_points = p_model.pred_points_monthly

    # model parameters
    trend_degree = param.get('trend_degree')
    trend_w = param.get('trend_w')
    seasonality_w = param.get('seasonality_w')
    ar_degree = param.get('ar_degree')
    ar_w = param.get('ar_w')

    # data transform: normalize column names/types and sort chronologically.
    prod = prod.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
    prod = prod[['ds', 'y']]
    prod.ds = prod.ds.apply(str).apply(parser.parse)
    prod.y = prod.y.apply(float)
    prod = prod.sort_values('ds')
    prod = prod.reset_index(drop=True)
    # prod = prod.drop(prod.index[[0, len(prod.y) - 1]]).reset_index(drop=True)

    # Aggregated monthly data
    prod = get_monthly_aggregate_per_product(prod)

    # Remove outlier via a moving-average replacement pass.
    prod = ma_replace_outlier(data=prod, n_pass=3, aggressive=True, window_size=6, sigma=2.5)
    # prod = prod.reset_index(drop= True)

    # test and train data creation: first `min_train_days` worth of data
    # trains; the next `test_points` rows are held out for evaluation.
    train = prod[
        prod.ds <= (
            np.amax(prod.ds) - pd.DateOffset(days=(np.amax(prod.ds) - np.amin(prod.ds)).days - min_train_days))]
    test = prod[(np.amax(np.array(train.index)) + 1):(np.amax(np.array(train.index)) + 1 + test_points)]
    print(len(test))
    # rem_data = prod[(np.amax(np.array(train.index)) + test_points):]

    output_result = pd.DataFrame()

    # Rolling evaluation: fit on current train window, forecast the test
    # window, then roll the window forward by `test_points`.
    while (len(test) > 0):
        train_pydlm = train.set_index('ds', drop=True)
        test_pydlm = test.set_index('ds', drop=True)

        # Modeling
        myDLM = dlm(train_pydlm.y)
        # add a first-order trend (linear trending) with prior covariance 1.0
        myDLM = myDLM + trend(degree=trend_degree, name='trend', w=trend_w)
        # # add a 12 month seasonality with prior covariance 1.0
        myDLM = myDLM + seasonality(12, name='12month', w= seasonality_w)
        # # add a 3 step auto regression
        myDLM = myDLM + autoReg(degree=ar_degree, data=train_pydlm.y, name='ar', w=ar_w)
        myDLM.fit()

        # One-step prediction at the end of training, then continuePredict()
        # extends the horizon one step at a time over the test window.
        (predictMean, predictVar) = myDLM.predict(date=myDLM.n - 1)
        pred_test = np.array([round(predictMean.item((0, 0)),2)])
        for i in range(len(test_pydlm)-1):
            (predictMean_cont, predictVar_cont) = myDLM.continuePredict()
            pred_test = np.append(pred_test,round(predictMean_cont.item((0, 0)),2))
        print(pred_test)

        # NOTE(review): `result_test = test` aliases (no copy) — the
        # assignments below mutate `test` and may warn via SettingWithCopy.
        result_test = test
        print((result_test))
        result_test['y_pydlm'] = pred_test
        # Demand cannot be negative; clip negative forecasts to zero.
        result_test.loc[(result_test['y_pydlm'] < 0), 'y_pydlm'] = 0

        # Roll the train window forward to absorb the just-scored test rows.
        train = prod[:(np.amax(np.array(train.index)) + 1 + test_points)]
        test = prod[(np.amax(np.array(train.index)) + 1):(np.amax(np.array(train.index)) + 1 + test_points)]
        # rem_data = prod[(np.amax(np.array(train.index)) + test_points):]
        output_result = pd.concat([output_result, result_test], axis=0)
        print(output_result.head())

    # Final fit on the full history for the forward forecast.
    train_pydlm = prod.set_index('ds', drop=True)
    # test_pydlm = test.set_index('ds', drop=True)

    # Modeling
    myDLM = dlm(train_pydlm.y)
    # add a first-order trend (linear trending) with prior covariance 1.0
    myDLM = myDLM + trend(degree=trend_degree, name='trend', w=trend_w)
    # # add a 12 month seasonality with prior covariance 1.0
    myDLM = myDLM + seasonality(12, name='12month', w=seasonality_w)
    # # add a 3 step auto regression
    myDLM = myDLM + autoReg(degree=ar_degree, data=train_pydlm.y, name='ar', w=ar_w)
    myDLM.fit()
    print(trend_degree, trend_w, seasonality_w, ar_degree, ar_w)

    # Forward forecast: `test_points` steps beyond the last observation.
    (predictMean, predictVar) = myDLM.predict(date=myDLM.n - 1)
    pred_test = np.array([round(predictMean.item((0, 0)), 2)])
    for i in range(test_points - 1):
        (predictMean, predictVar) = myDLM.continuePredict()
        pred_test = np.append(pred_test, round(predictMean.item((0, 0)), 2))
    print(pred_test)
def _tune(self, y, period, x=None, metric="smape", val_size=None, verbose=False):
    """
    Tune hyperparameters of the model.

    :param y: pd.Series or 1-D np.array, time series to predict.
    :param period: Int or Str, the number of observations per cycle: 1 or "annual" for yearly data, 4 or "quarterly"
    for quarterly data, 7 or "daily" for daily data, 12 or "monthly" for monthly data, 24 or "hourly" for hourly
    data, 52 or "weekly" for weekly data. First-letter abbreviations of strings work as well ("a", "q", "d", "m",
    "h" and "w", respectively). Additional reference: https://robjhyndman.com/hyndsight/seasonal-periods/.
    :param x: pd.DataFrame or 2-D np.array, exogeneous predictors, optional
    :param metric: Str, the metric used for model selection. One of "mse" (mean squared error), "mae" (mean absolute error).
    :param val_size: Int, the number of most recent observations to use as validation set for tuning.
    :param verbose: Boolean, True for printing additional info while tuning.
    :return: None
    """
    # Normalize a string period ("monthly", "w", ...) to its integer form.
    # isinstance is the idiomatic (and subclass-safe) type check.
    self.period = data_utils.period_to_int(period) if isinstance(period, str) else period

    # Default validation window: the most recent 10% of observations.
    val_size = int(len(y) * .1) if val_size is None else val_size
    y_train, y_val = model_utils.train_val_split(y, val_size=val_size)
    if x is not None:
        x_train, x_val = model_utils.train_val_split(x, val_size=val_size)
    metric_fun = get_metric(metric)

    # Hyperparameter grid: trend polynomial degree; AR search is disabled.
    params_grid = {
        "trend": [0, 1, 2, 3],
        "ar": [None],
        # "ar": [None, 1, 2, 3],
    }
    params_keys, params_values = zip(*params_grid.items())
    params_permutations = [
        dict(zip(params_keys, v)) for v in itertools.product(*params_values)
    ]

    scores = []
    for permutation in params_permutations:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # Build the candidate model: trend + seasonality (+ optional
                # AR and one dynamic component per exogenous column).
                model = pydlm.dlm(y_train)
                model = model + pydlm.trend(degree=permutation["trend"], discount=0.5)
                model = model + pydlm.seasonality(period=self.period, discount=0.99)
                if permutation["ar"] is not None:
                    model = model + pydlm.autoReg(degree=permutation["ar"], discount=0.99)
                if x is not None:
                    for variable_id, x_variable in enumerate(x_train.T):
                        model = model + pydlm.dynamic(
                            features=[[v] for v in x_variable],
                            discount=0.99,
                            name=str(variable_id))
                with SuppressStdoutStderr():
                    model.tune()
                    model.fit()

            # Exogenous features for the validation horizon, keyed by the
            # same string ids used when the dynamic components were added.
            if x is not None:
                x_val_dict = {}
                for variable_id, x_variable in enumerate(x_val.T):
                    x_val_dict.update(
                        {str(variable_id): [[v] for v in x_variable]})
            else:
                x_val_dict = None

            # Score the candidate on the held-out window.
            y_pred = model.predictN(date=model.n - 1, N=len(y_val), featureDict=x_val_dict)[0]
            score = metric_fun(y_val, y_pred)
            scores.append(score)
        except Exception:
            # A permutation that fails to fit or predict is scored +inf so it
            # can never be selected.  Catch Exception, not a bare `except:`,
            # which would also swallow KeyboardInterrupt/SystemExit.
            scores.append(np.inf)

    best_params = params_permutations[np.nanargmin(scores)]
    self.params.update(best_params)
    self.params["tuned"] = True