def updated_six_period_plot_forecast_vs_arima(idx):
    '''
    Builds a plot that shows the series history and the six-period forecast
    with its confidence interval.

    The last six observations of ``series_lst6[idx]`` are held out, a model is
    selected on the remainder with ``pm.auto_arima`` and used to forecast the
    held-out periods. The selected orders are then refitted with ``SARIMAX``
    to obtain the confidence interval that is shaded on the plot.

    Args:
        idx = position of the series in ``series_lst6`` (the same position is
              used to look up the store/department pair in ``lst_of_stores``)

    Returns:
        None. The selected model is printed and the figure is shown.
    '''
    series = series_lst6[idx]
    train, test = train_test_split(series, train_size=len(series) - 6)

    # Series 68 and 69, and any series with at most 24 observations, are
    # fitted without an explicit seasonal period; all others use m=12.
    if idx in (68, 69) or len(series) <= 24:
        model = pm.auto_arima(train, seasonal=True)
    else:
        model = pm.auto_arima(train, seasonal=True, m=12)
    forecasts = model.predict(test.shape[0]).tolist()

    # Prepend the last training observation so the forecast line starts where
    # the history ends (seven points: one historical + six forecasts).
    forecasts = np.insert(np.array(forecasts), 0, train.iloc[-1, 0]).tolist()

    # Refit the selected orders with statsmodels to get confidence intervals.
    params = model.get_params()
    SARIMAmodel = SARIMAX(train,
                          order=params['order'],
                          seasonal_order=params['seasonal_order']).fit()
    fcast = SARIMAmodel.get_forecast(6)
    conf_inf = fcast.conf_int()
    print(model)

    fig, ax = plt.subplots(figsize=(18, 12))
    ax.plot(series.index, series, label='Actual Sales')
    ax.plot(series[-7:].index, forecasts, label='Forecasted Sales')
    # Sales cannot be negative, so the lower bound is floored at zero.
    # Series.clip_lower() was removed in pandas 1.0; clip(lower=...) replaces it.
    ax.fill_between(conf_inf.index,
                    conf_inf['lower TOTAL'].clip(lower=0),
                    conf_inf['upper TOTAL'],
                    color='lightgrey',
                    label='95% Confidence Interval for Forecast')
    ax.axvline(x=series[-6:].index[0], color='k', linestyle='--',
               label='End of Historical Sales')
    ax.set_title(f'Comparison of Actual vs Forecasted Sales \n for the {lst_of_stores[idx][0]} Store and {lst_of_stores[idx][1]} Department', fontsize = 20)
    ax.set_xlabel('Time', fontsize=24)
    ax.set_ylabel('Sales', fontsize=24)
    ax.tick_params(axis='both', which='major', labelsize=16)
    ax.legend(fontsize=16)
    plt.grid(c='silver')
    plt.show()
def arima_forecast(store_ids, dept_ids, art_dict, holdout_periods, interval):
    '''
    Estimates ARIMA parameters for every store/department group and forecasts
    the held-out periods.

    Args:
        store_ids = store ids passed to ``series_setup``
        dept_ids = department ids passed to ``series_setup``
        art_dict = dictionary keyed by group position; a group found here uses
                   the stored series instead of a freshly built one
        holdout_periods = how many periods you want to forecast and compare against
        interval = resamples daily sales to a different interval, ie monthly ('M')

    Returns:
        -A list with one entry per holdout period; entry ``k`` holds the
         ``k``-th step forecast of every group, in group order
        -A list of series for all groups (a store/department combination)
    '''
    lst_of_forecasts = [[] for _ in range(holdout_periods)]

    series_lst = []
    for idx, val in enumerate(series_setup(store_ids, dept_ids)):
        if idx in art_dict:
            series_lst.append(art_dict[idx])
        else:
            series_lst.append(
                resample_series(make_series(val[0], val[1]), interval))

    for idx, series in enumerate(series_lst):
        print(idx)  # progress indicator
        train, test = train_test_split(
            series, train_size=len(series) - holdout_periods)

        # Groups 68 and 69, and any series with at most 24 observations, are
        # fitted without an explicit seasonal period; all others use m=12.
        if idx in (68, 69) or len(series) <= 24:
            model = pm.auto_arima(train, seasonal=True)
        else:
            model = pm.auto_arima(train, seasonal=True, m=12)
        forecasts = model.predict(test.shape[0]).tolist()

        # Use a separate name for the step index so the group index is not
        # overwritten inside the loop body.
        for step, val in enumerate(forecasts):
            lst_of_forecasts[step].append(val)

    return lst_of_forecasts, series_lst
def load_dataset(index:int)->tuple:
    """Read one dataset file and split it into a train and a test part.

    The file name is taken from ``FILES[index]`` and read from
    ``../datasets/`` as a header-less CSV whose values are flattened into a
    one-dimensional array before splitting with ``TEST_SIZE``.

    Returns a ``(y_train, y_test)`` tuple.
    """
    assert index<N_FILES, "Index out of range"
    csv_path = "../datasets/" + FILES[index]
    frame = pd.read_csv(csv_path, header=None)
    flat_values = frame.values.reshape(-1)
    y_train, y_test = train_test_split(flat_values, test_size=TEST_SIZE)
    return y_train, y_test
def forecast(us_counties: pd.DataFrame, log_metrics: bool, hp: dict, metric_threshold: int = 5):
    """Forecast cases per location and rank locations by projected growth.

    For every unique ``location`` an ``AutoARIMA(**hp)`` model is fitted on
    that location's ``cases`` column and ``hp['horizon']`` periods are
    predicted. Growth is the last forecast divided by the latest observed
    value, damped for locations with few cases.

    Args:
        us_counties: frame with at least ``location`` and ``cases`` columns.
        log_metrics: when true, the last ``horizon`` observations are held
            out and an error metric is stored per evaluated location; only
            every ``metric_threshold + 1``-th location is evaluated and the
            ``outbreak_risk`` column is not written.
        hp: keyword arguments for ``AutoARIMA``; must contain ``horizon``.
        metric_threshold: number of locations skipped between two evaluated
            locations when ``log_metrics`` is true.

    Returns:
        ``(us_counties, final_list, metrics)`` where ``final_list`` holds the
        locations ordered by decreasing growth and ``metrics`` maps location
        to ``mean(|actual - pred| / (|actual| + |pred|))``.
    """
    metrics = {}
    growth_rates = {}
    horizon = hp['horizon']
    metric_skip = 0

    for location in tqdm(us_counties['location'].unique(), unit=' counties'):
        if log_metrics:
            # Sub-sample the locations to keep metric runs short.
            if metric_skip != metric_threshold:
                metric_skip += 1
                continue
            metric_skip = 0

        y = us_counties[us_counties.location == location].reset_index()['cases']
        if len(y) < horizon:
            continue

        model = AutoARIMA(**hp)
        with warnings.catch_warnings():
            # When there is no cases, it will throw a warning
            warnings.filterwarnings("ignore")
            try:
                if log_metrics:
                    y, yv = train_test_split(y, test_size=horizon)
                model.fit(y)
                predictions = model.predict(n_periods=horizon)
            # Value error very rarely with weird/broken time series data
            except (ValueError, IndexError):
                continue

        # Work on plain arrays: forecasts may come back as a pandas Series
        # whose index does not start at zero, so label-based lookups such as
        # predictions[len(predictions) - 1] are not reliable.
        predictions = np.asarray(predictions)

        if log_metrics:
            actual = np.asarray(yv)
            metrics[location] = np.mean(
                np.abs(actual - predictions)
                / (np.abs(actual) + np.abs(predictions)))

        last_forecast = predictions[-1]
        todays_cases = np.asarray(y)[-1]
        if todays_cases == 0:
            # Growth is undefined without a baseline; dividing would yield
            # inf/NaN, which breaks the ranking and round() in rank_risk.
            continue

        # Places with very small amount of cases are hard to predict
        case_handicap = min(1.0, 0.5 + (todays_cases / 120))
        growth_rates[location] = (last_forecast / todays_cases) * case_handicap

    final_list = sorted(growth_rates, key=growth_rates.get, reverse=True)

    def rank_risk(row) -> int:
        """Map a row's location growth to a non-negative integer risk score."""
        case_growth = growth_rates.get(row.location)
        if not case_growth:
            # No (or zero) growth estimate for this location.
            return 1
        return round(max(0, (case_growth - 1) * 100))

    if not log_metrics:
        us_counties['outbreak_risk'] = us_counties.apply(rank_risk, axis=1)
    return us_counties, final_list, metrics
def test_order_does_not_matter_with_date_transformer():
    """Featurizer order may change column order but not features or forecasts."""
    train_y_dates, test_y_dates, train_X_dates, test_X_dates = \
        train_test_split(y_dates, X_dates, test_size=15)

    # Each factory returns a fresh, unfitted stage so the two pipelines never
    # share estimator instances.
    def fourier_stage():
        return ('fourier', FourierFeaturizer(m=3, prefix="FOURIER"))

    def dates_stage():
        return ('dates', DateFeaturizer(column_name="date", prefix="DATE"))

    def arima_stage():
        return ("arima", AutoARIMA(seasonal=False, stepwise=True,
                                   suppress_warnings=True, maxiter=3,
                                   error_action='ignore'))

    def fit_transform_predict(stages):
        fitted = Pipeline(stages).fit(train_y_dates, train_X_dates)
        transformed = fitted.transform(exogenous=test_X_dates)
        predicted = fitted.predict(exogenous=test_X_dates)
        return fitted, transformed, predicted

    pipeline_a, Xt_a, pred_a = fit_transform_predict(
        [fourier_stage(), dates_stage(), arima_stage()])
    pipeline_b, Xt_b, pred_b = fit_transform_predict(
        [dates_stage(), fourier_stage(), arima_stage()])

    # dates in A should differ from those in B
    feats_a = pipeline_a.x_feats_
    feats_b = pipeline_b.x_feats_
    assert feats_a[0].startswith("FOURIER")
    assert feats_a[-1].startswith("DATE")
    assert feats_b[0].startswith("DATE")
    assert feats_b[-1].startswith("FOURIER")

    # columns should be identical once ordered appropriately
    assert Xt_a.equals(Xt_b[feats_a])

    # forecasts should be identical
    assert_array_almost_equal(pred_a, pred_b, decimal=3)
def arima_pred(actual, pred_num):
    '''
    --------
    Description:
        Fit an auto-ARIMA model on everything but the last pred_num
        observations and forecast those pred_num periods.
        actual is the true value on both train and test data
        pred_num is the length of test data
        Returns a two-item list: [in-sample predictions, forecasts]
    --------
    Example:
        fit, pred = arima_pred(series, 28)
    '''
    # Only the training part is needed; the hold-out is discarded here.
    train_part, _ = model_selection.train_test_split(actual, test_size=pred_num)

    fit_options = dict(trace=False,
                       stepwise=True,
                       suppress_warnings=True,
                       error_action='ignore')
    fitted = pm.auto_arima(train_part, **fit_options)

    out_of_sample = fitted.predict(n_periods=pred_num)
    in_sample = fitted.predict_in_sample()
    return [in_sample, out_of_sample]
def main():
    """Forecast gas fees stored in SQLite with a seasonal auto-ARIMA model.

    Reads the database path from the ``[Server]`` section of ``config.ini``,
    loads the ``fee`` column of ``gas_fees``, keeps the last 100 rows, fits
    the model on the first part and plots the forecast for the rest
    (red = all samples, blue = train, green = forecasts).

    Exits with status -1 when the database cannot be opened or holds no rows.
    """
    cfparser = configparser.ConfigParser()
    cfparser.read('config.ini')
    database = cfparser['Server']['database']

    try:
        con = sqlite3.connect(database)
        print('Connected to SQLite')
    except Error as e:
        print('database connection error: ' + str(e))
        sys.exit(-1)

    # `with con` only wraps the transaction; the connection itself has to be
    # closed explicitly.
    try:
        with con:
            cur = con.cursor()
            cur.execute("SELECT fee FROM gas_fees")
            data = cur.fetchall()
    finally:
        con.close()

    print('Read data -> %s rows' % (len(data), ))
    if not data:
        print('No rows in gas_fees, nothing to forecast')
        sys.exit(-1)
    print('Sample: %s' % (data[0][0], ))

    y = np.asarray(data[-100:])  # total number of samples to take
    print(y)

    # 50 training rows as before; fall back to half of the rows when fewer
    # than 51 are available so the split cannot exceed the data.
    n_train = 50 if len(y) > 50 else len(y) // 2
    train, test = train_test_split(y, train_size=n_train)

    # Fit your model
    model = pm.auto_arima(train, seasonal=True, m=7)

    # make your forecasts
    forecasts = model.predict(test.shape[0])  # predict N steps into the future

    # Visualize the forecasts (blue=train, red=whole dataset, green=forecasts)
    x = np.arange(y.shape[0])
    plt.plot(x, y, c='red')
    plt.plot(x[:n_train], train, c='blue')
    plt.plot(x[n_train:], forecasts, c='green')
    plt.show()
# print day and pm 2.5 values print(data.head()) # group df by day # calculate mean value of pm2.5 for every given day print("MEAN pm25 values by day\n", data.pm25) data.plot() plt.title('Initial mean values for November') plt.show() # # # begin training X = data.values print("length of input values", len(X)) y_train, y_test = train_test_split(X, test_size=0.3) print("length of train values", len(y_train)) print("length of test values", len(y_test)) predictions = [] model_ar = AR(y_train) model_ar_fit = model_ar.fit() predictions = model_ar_fit.predict(start=len(y_train), end=len(data)) print("length of predictions", len(predictions)) # TODO try all possibilities of (p,d,q) model_arima = ARIMA(y_train, order=(1, 0, 4)) model_arima_fit = model_arima.fit()
def model_train(test=False):
    """Train the sales ARIMA model(s), persist them and log each run.

    Monthly mean revenue over all data is used to select the model with
    ``pm.auto_arima``.

    * ``test=True``: the model is refitted on the first 90% of the overall
      series, saved as ``sales-arima-<version>.joblib`` and logged with
      placeholder metrics.
    * ``test=False``: for every unique index value of the fetched frame (one
      per country) the model is refitted on that country's training split,
      saved as ``<country>-sales-arima-<version>.joblib``, evaluated on the
      hold-out (RMSE, MAPE) and logged.

    Returns:
        dict form of the frame with columns ``country``, ``rmse``, ``mape``.
    """
    # frame for temporary capture of training results
    df_res = pd.DataFrame(columns=["country", "rmse", "mape"])

    df = fetch_data()
    ts = df.groupby("invoice_date")["revenue"].sum().rename("sales")
    y = ts.resample('MS').mean()

    ## start timer for runtime
    time_start = time.time()

    model = pm.auto_arima(y,
                          start_p=1,
                          start_q=1,
                          test='adf',
                          max_p=3,
                          max_q=3,
                          m=12,
                          start_P=0,
                          seasonal=True,
                          d=None,
                          D=1,
                          trace=True,
                          error_action='ignore',
                          suppress_warnings=True,
                          stepwise=True)

    # Dots in the version are not wanted in file names.
    version_slug = re.sub(r"\.", "_", str(MODEL_VERSION))

    if test:
        saved_model = "sales-arima-{}.joblib".format(version_slug)
        # The hold-out is unused here; it is not bound to `test` so the flag
        # parameter is not overwritten.
        train, _ = model_selection.train_test_split(y, test_size=0.1)
        model.fit(train)
        joblib.dump(model, os.path.join(MODEL_DIR, saved_model))
        df_res.loc[0] = ["all", "0.2", "0.3"]

        m, s = divmod(time.time() - time_start, 60)
        h, m = divmod(m, 60)
        runtime = "%03d:%02d:%02d" % (h, m, s)
        # Log the label "all" (the builtin function `all` was logged before).
        update_train_log(y.shape[0], {
            'country': "all",
            'rmse': "0.2",
            'mape': "0.3"
        }, runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=True)
    else:
        # NOTE(review): the original read `data.index`, but no `data` is
        # defined in this function; the fetched frame `df` is used instead —
        # confirm there was no intended module-level `data`.
        countries = df.index.unique().tolist()
        for i, cntry in enumerate(countries):
            # format the country variable
            # NOTE(review): str.isspace() is only true for all-whitespace
            # names, so names containing spaces keep them; left unchanged
            # because saved file names may be relied on elsewhere.
            if cntry.isspace():
                str_country = re.sub(r"\s+", '-', cntry).lower()
            else:
                str_country = cntry.lower()

            # set country model
            saved_model = str_country + "-" + "sales-arima-{}.joblib".format(
                version_slug)

            # filter data to train based on country
            country_y = filter_cntry_data(df, cntry)

            # Split data into train / test sets
            train, holdout = model_selection.train_test_split(country_y,
                                                              test_size=0.1)
            model.fit(train)
            joblib.dump(model, os.path.join(MODEL_DIR, saved_model))

            # forecast
            forecast = model.predict(n_periods=len(holdout))
            forecast = pd.DataFrame(forecast,
                                    index=holdout.index,
                                    columns=['predictions'])

            rms = round(sqrt(mean_squared_error(holdout, forecast)), 2)
            mape_result = round(
                mean_absolute_percentage_error(holdout, forecast))
            df_res.loc[i] = [cntry, rms, mape_result]

            # The runtime has to be formatted before it is logged; it was
            # previously assigned only after update_train_log used it.
            m, s = divmod(time.time() - time_start, 60)
            h, m = divmod(m, 60)
            runtime = "%03d:%02d:%02d" % (h, m, s)
            update_train_log(country_y.shape[0], {
                'country': cntry,
                'rmse': rms,
                'mape': mape_result
            }, runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=False)

    return dict(df_res.to_dict())
def draw_(province, isDaily):
    """Fit a seasonal AutoARIMA for one province and save a forecast figure.

    With ``isDaily`` the first difference of ``df[province]`` is modelled,
    otherwise the column itself. The figure shows the data, the in-sample
    predictions over the last 20% of the data and a 14-day forecast, and is
    written as SVG into the ``figures`` directory.
    """
    # Model training
    search_space = dict(
        start_p=0, max_p=4, d=None,
        start_q=0, max_q=1,
        start_P=0, max_P=1, D=None,
        start_Q=0, max_Q=1,
        m=7, seasonal=True,
        test="kpss",
        trace=True,
        error_action="ignore",
        suppress_warnings=True,
        stepwise=True,
    )
    model = arima.AutoARIMA(**search_space)
    data = df[province].diff().dropna() if isDaily else df[province]
    model.fit(data)

    # Model validation: in-sample predictions over the trailing 20%
    train, test = train_test_split(data, train_size=0.8)
    pred_test = model.predict_in_sample(start=train.shape[0], dynamic=False)
    validating = pd.Series(pred_test, index=test.index)
    r2 = r2_score(test, pred_test)

    # Forecast the next 14 days
    pred, pred_ci = model.predict(n_periods=14, return_conf_int=True)
    first_day = data.index.max() + pd.Timedelta("1D")
    idx = pd.date_range(first_day, periods=14, freq="D")
    forecasting = pd.Series(pred, index=idx)

    # Plotting
    plt.figure(figsize=(24, 6))
    curves = (
        (data, "Actual Value", "blue"),
        (validating, "Check Value", "orange"),
        (forecasting, "Predict Value", "red"),
    )
    for curve, label, color in curves:
        plt.plot(curve.index, curve, label=label, color=color)
    plt.legend()
    plt.ticklabel_format(style="plain", axis="y")

    if isDaily:
        title = f"Daily Confirmed Cases Forecasting - {province}\nARIMA {model.model_.order}x{model.model_.seasonal_order} (R2 = {r2:.6f})"
    else:
        title = f"Accumulative Confirmed Cases Forecasting - {province}\nARIMA {model.model_.order}x{model.model_.seasonal_order} (R2 = {r2:.6f})"
    plt.title(title)

    if isDaily:
        file_name = f"covid-{adjust_name(province)}-daily.svg"
    else:
        file_name = f"covid-{adjust_name(province)}.svg"
    plt.savefig(os.path.join("figures", file_name), bbox_inches="tight")
    plt.close()
def LSTM_uni_train(
    raw_data,
    use_date_max,
    col,
    com_col,
    scale,
    n_steps,
    n_features,
    test_h,
    model,
    BATCH_SIZE=1,
    BUFFER_SIZE=100,
    EVALUATION_INTERVAL=100,
    EPOCHS=1000,
    optimizer='adam',
    loss='mse',
    metrics=['mse'],
    saveroot='C:/Users/KIMYEONKYOUNG/Desktop/2021 AI 빅데이터팀/메탈 수요예측/code/model_회사별/'
):
    """Train a univariate LSTM on one column, predict the hold-out and export results.

    Steps: build the time series for ``col``, optionally scale it, split it
    into train/test, train ``model`` with early stopping and checkpointing,
    save the model, predict the test windows, undo the scaling and write the
    result table to an Excel workbook.

    Args:
        raw_data: source data handed to ``data2tsdata``.
        use_date_max: last date to use; also part of the output file names.
        col: target column name.
        com_col: comparison column merged into the output table.
        scale: ``None``, ``'standard'``, ``'minmax'`` or ``'robust'``.
        n_steps: window length fed to the LSTM.
        n_features: features per time step (1 for univariate).
        test_h: number of hold-out periods.
        model: uncompiled Keras model.
        BATCH_SIZE, BUFFER_SIZE, EVALUATION_INTERVAL, EPOCHS: tf.data batch
            size, shuffle buffer, steps per epoch and number of epochs.
        optimizer, loss, metrics: forwarded to ``model.compile``.
        saveroot: directory prefix for models, graphs and the workbook.

    Returns:
        Whatever ``result_input`` returns for the filled workbook.

    Raises:
        ValueError: if ``scale`` is not one of the supported options.
    """
    # raw data -> time series data
    ts_data = data2tsdata(raw_data, col, use_date_max)

    # scaling; `scal` stays None when no scaler is applied
    scal = None
    if scale is None:
        data_t = ts_data
    elif scale == 'standard':
        (data_t, scal) = standardscale(ts_data)
    elif scale == 'minmax':
        (data_t, scal) = minmaxscale(ts_data)
    elif scale == 'robust':
        (data_t, scal) = robustscale(ts_data)
    else:
        raise ValueError(
            "scale must be None, 'standard', 'minmax' or 'robust', got %r"
            % (scale, ))

    # get train data (project-local splitter taking horizon and window size)
    (df_train, df_test) = train_test_split(data_t, test_h, n_steps)

    # [train] dataframe to flat arrays
    nd_train = np.asarray(df_train)
    nd_train = nd_train.reshape(len(df_train), )
    nd_test = np.asarray(df_test)
    nd_test = nd_test.reshape(len(df_test), )

    (train_x, train_y) = split_sequence(nd_train, n_steps)
    train_x = train_x.reshape(train_x.shape[0], n_steps, n_features)

    # train data -> train & val datasets
    # NOTE(review): the validation dataset is built from the training
    # windows, so the validation loss is not an out-of-sample measure.
    train_univariate = tf.data.Dataset.from_tensor_slices((train_x, train_y))
    train_univariate = train_univariate.cache().shuffle(BUFFER_SIZE).batch(
        BATCH_SIZE).repeat()

    val_univariate = tf.data.Dataset.from_tensor_slices((train_x, train_y))
    val_univariate = val_univariate.batch(BATCH_SIZE).repeat()

    # Build EarlyStopping: stop once the training loss has not improved for
    # 100 epochs.
    path_checkpoint = "lstm_model_checkpoint_try.h5"
    es_callback = tf.keras.callbacks.EarlyStopping(
        monitor="loss", min_delta=0, patience=100, mode='auto')

    modelckpt_callback = tf.keras.callbacks.ModelCheckpoint(
        monitor="loss",
        filepath=path_checkpoint,
        verbose=1,
        save_weights_only=True,
        save_best_only=True,
    )

    # Keyword arguments keep the call independent of the positional order of
    # compile(); the metrics list is copied so the shared default list is
    # never handed out.
    model.compile(optimizer=optimizer, loss=loss, metrics=list(metrics))

    # train model
    history = model.fit(train_univariate,
                        epochs=EPOCHS,
                        validation_data=val_univariate,
                        steps_per_epoch=EVALUATION_INTERVAL,
                        validation_steps=1,
                        verbose=1,
                        callbacks=[es_callback, modelckpt_callback])

    # save model
    file_root = col.replace('_실적', "")
    model.save(saveroot + '/' + str(file_root) + '/' + str(use_date_max) +
               '_' + 'lstm_model_checkpoint_' + str(scale) + '_' + str(col) +
               '.h5')

    # graph
    visualize_loss(history, "Training & vaildation Loss", saveroot=saveroot)

    # predict
    if test_h == 1:
        xtt = nd_test.reshape(1, n_steps, n_features)
        yhat = model.predict(xtt)
        prediction = pd.DataFrame(yhat)
        prediction.columns = ['yhat']
        # NOTE(review): the offset 4 is hard-coded; it presumably should
        # depend on n_steps — confirm against train_test_split's output.
        prediction.index = df_test[4:].index
    else:
        (xt, yt) = split_sequence(nd_test, n_steps)
        xtt = xt.reshape(xt.shape[0], n_steps, n_features)
        yhat = model.predict(xtt)
        prediction = pd.DataFrame(yhat)
        prediction.columns = ['yhat']
        # was `n_step`, an undefined name
        prediction.index = df_test[n_steps:].index

    # inverse scale (nothing to undo when no scaler was applied)
    if scal is not None:
        prediction['prediction'] = scal.inverse_transform(prediction)
    else:
        prediction['prediction'] = prediction['yhat']

    train_g, test_g = model_selection.train_test_split(
        ts_data, train_size=len(ts_data) - test_h)

    # output dataframe: hold-out actuals, prediction and comparison column
    outpu = pd.merge(test_g,
                     prediction['prediction'],
                     left_index=True,
                     right_index=True)
    com_ts_data = data2tsdata(raw_data, com_col, use_date_max)
    output = pd.merge(outpu, com_ts_data, left_index=True, right_index=True)

    # time-series graph (train, test, predict)
    predict_graph(train_g,
                  test_g,
                  prediction['prediction'],
                  output[com_col],
                  saveroot=saveroot)

    # save output to Excel (model path, test_h, loss graph, time-series
    # graph, output dataframe)
    wb = Workbook()
    result_df = result_input(wb,
                             model_root=saveroot + 'lstm_model_checkpoint_' +
                             str(scale) + '_' + str(col) + '.h5',
                             test_h=test_h,
                             col=col,
                             com_col=com_col,
                             output=output,
                             pre_graph=saveroot + '_timeseries.png',
                             loss_graph=saveroot + 'lossgraph.png')

    # str() mirrors the model path above and avoids a TypeError when
    # use_date_max is not a string.
    wb.save(saveroot + '/' + str(file_root) + '/' + str(use_date_max) +
            '.xlsx')

    return result_df
data = pd.DataFrame(grp_date.mean()) print("MEAN pm25 values by day\n", data.pm25) data.plot() plt.title('Initial mean values for November') plt.show() # begin training X = data.values print("length of input values", len(X)) # ~70% of data->training # train = X[0:21] # 21 data as train # y_train, y_test = train_test_split( X, test_size=0.3, # shuffle=False) ) print("y train", y_train) print("y test", y_test) # print("length of train values", len(y_train)) # print("length of train values", len(train)) # ~30% to test, 9 data as test # test = X[21:] # print("length of test values", len(test)) print("length of test values", len(y_test)) predictions = []
if use_diff: #将目标序列从原始序列变成 差分序列 price = price.diff(1)[train_3m.shape[0] - past - pred + 1:] else: price = price[train_3m.shape[0] - past - pred + 1:] prefix = valfiles_oi[ind].split( '_')[0] + '-validation-{}d-'.format(pred) #滑动窗口 for i in range(past + pred - 1, len(price)): print( '===========当前训练的是{}数据集,目标节点是{}=================='.format( valfiles_oi[ind].split('_')[0], val_3m.index[(i - (past + pred) + 1)])) sample = price[(i - (past + pred) + 1):(i + 1)] train, test = train_test_split(sample, train_size=past) pipeline = Pipeline([ # ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6)), # lmbda2 avoids negative values ('arima', pm.AutoARIMA(seasonal=True, m=1, suppress_warnings=True, trace=True, error_action="ignore")) ]) pipeline.fit(train) pred_result = pipeline.predict(pred) print('pred_result is : ', pred_result) print( '====================一次训练结束=============================\n\n\n'
inplace=True) series = model_data.iloc[:, 0] ############################################################################### ## model 1: ARIMA series_diff = series.diff().dropna() plt.plot(series_diff) ## ADF stationary test(p-value 6.575445276313532e-27, stationary) sm.tsa.stattools.adfuller(series_diff) ## Ljung-Box white noise test(p-value close to 0, not white noise) plt.plot(lb_test(series_diff, lags=None, boxpierce=False)[1]) plt.show() ## data split train, test = model_selection.train_test_split(series, test_size=28) ## train model arima_model = pm.auto_arima(train, trace=True, stepwise=True, suppress_warnings=True, error_action='ignore') arima_model.summary() ## model checking(all p-value > 0.05, residual is white noise, model is correct) plt.plot(lb_test(arima_model.resid(), lags=None, boxpierce=False)[1]) plt.axhline(y=0.05, c="r", ls="--", lw=2) plt.show() ## predict preds, conf_int = arima_model.predict(n_periods=test.shape[0],
def model_train ():
    """Store an uploaded data file, train the sales ARIMA model(s) and log the runs.

    On a POST request the uploaded ``file`` is saved into ``DATA_DIR`` and its
    name is passed to ``fetch_data``. Monthly mean revenue over all data is
    used to select the model with ``pm.auto_arima``.

    * File named ``test.txt``: one model is fitted on the whole series, saved
      as ``sales-arima-<version>.joblib`` and logged with placeholder metrics.
    * Any other file: for every unique index value of the data (one per
      country) the model is refitted on that country's training split, saved
      as ``<country>-sales-arima-<version>.joblib``, evaluated on the hold-out
      (RMSE, MAPE) and logged.

    Returns:
        dict form of the frame with columns ``country``, ``rmse``, ``mape``.

    Raises:
        JsonError: if the list of countries cannot be read from the data.
    """
    filenm = ""
    if request.method == 'POST':
        f = request.files['file']
        # The file name comes from the client: keep only its last path
        # component so it cannot point outside DATA_DIR.
        safe_name = os.path.basename(f.filename.replace("\\", "/"))
        if safe_name not in ('', '.', '..'):
            f.save(os.path.join(DATA_DIR, safe_name))
            filenm = safe_name
    print("\nFile name is :\n{}".format(filenm))

    data = fetch_data(filenm)

    # frame for temporary capture of training results
    df_res = pd.DataFrame(columns=["country", "rmse", "mape"])
    ts = data.groupby("invoice_date")["revenue"].sum().rename("sales")
    y_all = ts.resample('MS').mean()

    # Dots in the version are not wanted in file names.
    version_slug = re.sub(r"\.", "_", str(MODEL_VERSION))
    search_args = dict(start_p=1, start_q=1,
                       max_p=3, max_q=3, m=12,
                       start_P=0, seasonal=True,
                       d=None, D=1, trace=True,
                       error_action='ignore',
                       suppress_warnings=True,
                       stepwise=True)

    if filenm == "test.txt":
        time_start = time.time()
        model = pm.auto_arima(y_all, **search_args)
        saved_model = "sales-arima-{}.joblib".format(version_slug)
        model.fit(y_all)
        joblib.dump(model, os.path.join(MODEL_DIR, saved_model))
        df_res.loc[0] = ["all", "0.2", "0.3"]

        m, s = divmod(time.time() - time_start, 60)
        h, m = divmod(m, 60)
        runtime = "%03d:%02d:%02d" % (h, m, s)
        train_shape = str(y_all.shape[0]) + " x 1"
        # Log the label "all" (the builtin function `all` was logged before).
        update_train_log(train_shape,
                         {'country': "all", 'rmse': "0.2", 'mape': "0.3"},
                         runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=True)
    else:
        ## input checking: one model per unique index value (country)
        try:
            country = data.index.unique().tolist()
        except (KeyError, TypeError, ValueError) as err:
            raise JsonError(description='Invalid value') from err

        ## start timer for runtime
        time_start = time.time()

        # Seasonal - fit stepwise auto-ARIMA once on the overall series; the
        # selected model is then refitted per country.
        model = pm.auto_arima(y_all, **search_args)

        for i, cntry in enumerate(country):
            # format the country variable
            # NOTE(review): str.isspace() is only true for all-whitespace
            # names, so names containing spaces keep them; left unchanged
            # because saved file names may be relied on elsewhere.
            if cntry.isspace():
                str_country = re.sub(r"\s+", '-', cntry).lower()
            else:
                str_country = cntry.lower()

            # set country model
            saved_model = str_country + "-" + "sales-arima-{}.joblib".format(
                version_slug)

            # filter data to train based on country
            y = filter_cntry_data(data, cntry)

            # Split data into train / test sets
            train, test = model_selection.train_test_split(y, test_size=0.1)
            model.fit(train)
            joblib.dump(model, os.path.join(MODEL_DIR, saved_model))

            # forecast
            forecast = model.predict(n_periods=len(test))
            forecast = pd.DataFrame(forecast, index=test.index,
                                    columns=['predictions'])

            rms = round(sqrt(mean_squared_error(test, forecast)), 2)
            mape_result = round(mean_absolute_percentage_error(test, forecast))
            df_res.loc[i] = [cntry, rms, mape_result]

            m, s = divmod(time.time() - time_start, 60)
            h, m = divmod(m, 60)
            runtime = "%03d:%02d:%02d" % (h, m, s)
            train_shape = str(y_all.shape[0]) + " x 1"
            update_train_log(train_shape,
                             {'country': cntry, 'rmse': rms,
                              'mape': mape_result},
                             runtime, MODEL_VERSION, MODEL_VERSION_NOTE,
                             test=False)

    return dict(df_res.to_dict())
from pmdarima.compat.pytest import pytest_error_str from pmdarima.model_selection import train_test_split from pmdarima.pipeline import Pipeline, _warn_for_deprecated from pmdarima.preprocessing import BoxCoxEndogTransformer, FourierFeaturizer from pmdarima.arima import ARIMA, AutoARIMA from pmdarima.datasets import load_wineind import numpy as np import pytest rs = np.random.RandomState(42) wineind = load_wineind() xreg = rs.rand(wineind.shape[0], 2) train, test, x_train, x_test = train_test_split(wineind, xreg, train_size=125) class TestIllegal: def test_non_unique_names(self): # Will fail since the same name repeated twice with pytest.raises(ValueError) as ve: Pipeline([("stage", BoxCoxEndogTransformer()), ("stage", ARIMA(order=(0, 0, 0)))]) assert "not unique" in pytest_error_str(ve) def test_names_in_params(self): # Will fail because 'steps' is a param of Pipeline with pytest.raises(ValueError) as ve: Pipeline([("steps", BoxCoxEndogTransformer()),
import pmdarima as pm from pmdarima import arima from pmdarima import model_selection from pmdarima import pipeline from pmdarima import preprocessing from pmdarima.datasets._base import load_date_example import numpy as np from matplotlib import pyplot as plt print("pmdarima version: %s" % pm.__version__) # Load the data and split it into separate pieces y, X = load_date_example() y_train, y_test, X_train, X_test = \ model_selection.train_test_split(y, X, test_size=20) # We can examine traits about the time series: pm.tsdisplay(y_train, lag_max=10) # We can see the ACF increases and decreases rather rapidly, which means we may # need some differencing. There also does not appear to be an obvious seasonal # trend. n_diffs = arima.ndiffs(y_train, max_d=5) # Here's what the featurizer will create for us: date_feat = preprocessing.DateFeaturizer( column_name="date", # the name of the date feature in the exog matrix with_day_of_week=True, with_day_of_month=True)
import pmdarima as pm
from pmdarima.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Quick-start: forecast the tail of the wineind series with auto-ARIMA.

# Load the data and hold out everything after the first 150 samples
y = pm.datasets.load_wineind()
train, test = train_test_split(y, train_size=150)

# Select and fit a seasonal model (12-period season) on the training part
model = pm.auto_arima(train, seasonal=True, m=12)

# Forecast as many steps as there are held-out samples
forecasts = model.predict(test.shape[0])

# Plot the training data (blue) followed by the forecasts (green)
x = np.arange(y.shape[0])
for positions, values, colour in ((x[:150], train, 'blue'),
                                  (x[150:], forecasts, 'green')):
    plt.plot(positions, values, c=colour)
plt.show()
from pmdarima import auto_arima from pmdarima.arima import ndiffs from pmdarima.model_selection import train_test_split from sklearn.metrics import mean_squared_error from tqdm import tqdm df = pd.concat(map(pd.read_json, Path("data").glob("forecast_*.json")), ignore_index=True) df["time"] = df.apply(lambda r: datetime.fromtimestamp(r["time"]), axis=1) df = df.sort_values(by=["time"]) temperature = df["temperature"] temperature = temperature.fillna(temperature.mean()) train, test = train_test_split(temperature, train_size=temperature.shape[0] - 365) print(f"training size: {train.shape[0]}") print(f"testing size: {test.shape[0]}") # %% kpss_diffs = ndiffs(train, alpha=0.05, test="kpss", max_d=6) adf_diffs = ndiffs(train, alpha=0.05, test="adf", max_d=6) n_diffs = max(adf_diffs, kpss_diffs) print(f"d: {n_diffs}") # %% model = auto_arima( train, d=n_diffs,
def test_train_test_split():
    """The split keeps order: the test part has the requested size and the
    two parts concatenate back to the original array."""
    head, tail = train_test_split(y, test_size=10)
    assert tail.shape[0] == 10
    rebuilt = np.concatenate([head, tail])
    assert_array_equal(y, rebuilt)
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import BoxCoxEndogTransformer
import pickle

# Example: fit a Box-Cox + AutoARIMA pipeline on the sunspots data, persist
# it with pickle and predict from the re-loaded copy.

# Load the data and keep the first 2700 samples for training
y = pm.datasets.load_sunspots()
train, test = train_test_split(y, train_size=2700)

# Define the pipeline stages, then fit
boxcox_stage = ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6))  # lmbda2 avoids negative values
arima_stage = ('arima', pm.AutoARIMA(seasonal=True, m=12,
                                     suppress_warnings=True,
                                     trace=True))
pipeline = Pipeline([boxcox_stage, arima_stage])
pipeline.fit(train)

# Serialize the fitted pipeline the same way as a scikit-learn estimator
with open('model.pkl', 'wb') as pkl:
    pickle.dump(pipeline, pkl)

# Load it back and forecast 15 periods
with open('model.pkl', 'rb') as pkl:
    mod = pickle.load(pkl)
    print(mod.predict(15))
# [25.20580375 25.05573898 24.4263037 23.56766793 22.67463049 21.82231043
# 21.04061069 20.33693017 19.70906027 19.1509862 18.6555793 18.21577243
# 17.8250318 17.47750614 17.16803394]
def sarimax_pmdarima(timeseries, train_length, m):
    """
    Forecast with a SARIMAX model whose orders are selected automatically

    The series is split into train and test, ``pm.auto_arima`` selects and
    fits the model on the training part, and both the in-sample predictions
    and the out-of-sample forecasts (with confidence interval) are plotted.
    The test RMSE, MSE and MAE are printed.

    Parameters
    ----------
    timeseries : Series
        the time series.
    train_length : int
        the size of the training set, passed as ``train_size`` to
        ``train_test_split``.
    m : int
        the seasonal period.

    Returns
    -------
    tuple
        (order, seasonal_order)
    """
    # build the train and test sets
    train, test = model_selection.train_test_split(timeseries,
                                                   train_size=train_length)

    # select the model and fit it to the data
    model = pm.auto_arima(train,
                          seasonal=True,
                          m=m,
                          suppress_warnings=True,
                          trace=True,
                          start_p=1,
                          start_q=1,
                          max_p=2,
                          max_q=2,
                          start_P=1,
                          start_Q=1,
                          max_P=2,
                          max_Q=2)

    # print the model parameters
    print(model.summary())

    # in-sample predictions
    # http://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.ARIMA.html#pmdarima.arima.ARIMA.predict_in_sample
    preds = model.predict_in_sample(end=len(train) - 1)
    # Reuse the index of the split instead of rebuilding a daily date range:
    # this gives the same dates for gap-free daily data and also works for
    # any other frequency.
    sarimax_ts = pd.Series(np.asarray(preds), index=train.index)

    # out-of-sample predictions
    # http://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.ARIMA.html#pmdarima.arima.ARIMA.predict
    fcast, conf_int = model.predict(n_periods=test.shape[0],
                                    return_conf_int=True)
    fcast_dates = test.index

    ts_fcast = pd.Series(np.asarray(fcast), index=fcast_dates)
    ts_ci_min = pd.Series(conf_int[:, 0], index=fcast_dates)
    ts_ci_max = pd.Series(conf_int[:, 1], index=fcast_dates)

    print('Test RMSE: %.4f' % np.sqrt(mean_squared_error(test, fcast)))

    # plot of the fitted model
    plt.figure(figsize=(40, 20), dpi=80)
    plt.title('Modello SARIMAX{}x{} per {}'.format(model.order,
                                                   model.seasonal_order,
                                                   timeseries.name))
    ax = train.plot(label='Train set', color='black')
    sarimax_ts.plot(ax=ax, label='In-sample predictions', color='green')
    plt.legend()
    plt.show()

    # plot of the forecasts
    plt.figure(figsize=(40, 20), dpi=80)
    plt.title('Forecasting con SARIMAX{}x{} per {}'.format(
        model.order, model.seasonal_order, timeseries.name))
    ax = timeseries.plot(label='Observed', color='black')
    ts_fcast.plot(ax=ax,
                  label='Out-of-sample forecasts',
                  alpha=.7,
                  color='red')
    ax.fill_between(fcast_dates, ts_ci_min, ts_ci_max, color='k', alpha=.2)
    plt.legend()
    plt.show()

    # error metrics; the subtraction aligns on the index, so only the
    # forecast dates survive dropna()
    errore = ts_fcast - timeseries
    errore.dropna(inplace=True)
    print('MSE=%.4f' % (errore**2).mean())
    print('MAE=%.4f' % (abs(errore)).mean())

    return (model.order, model.seasonal_order)
plt.plot(np.linspace(0, 9, 10), cresc_p2['Preco']) # %% from pmdarima.datasets import load_lynx # %% dado_lynx = load_lynx() # %% dado_lynx.shape # %% from pmdarima import model_selection # %% treino, teste = model_selection.train_test_split(dado_lynx, train_size=100) # %% teste1 = teste[:10] teste2 = teste[10:] # %% modelo_arima = auto_arima(treino, start_p=1, start_q=1, d=0, max_p=5, max_q=5, supress_warnings=True, stepwise=True, error_action='ignore')
""" Created on Thu May 7 02:53:11 2020 @author: felip """ import pmdarima as pm from pmdarima import model_selection import numpy as np from matplotlib import pyplot as plt # ############################################################################# # Load the data and split it into separate pieces # Australian total wine sales by wine makers in bottles data = pm.datasets.load_wineind() train, test = model_selection.train_test_split(data, train_size=150) # Fit a simple auto_arima model arima = pm.auto_arima(train, error_action='ignore', trace=True, suppress_warnings=True, maxiter=10, seasonal=True, m=12) # ############################################################################# # Plot actual test vs. forecasts: x = np.arange(test.shape[0]) plt.scatter(x, test, marker='x') plt.plot(x, arima.predict(n_periods=test.shape[0]))
<br/> """ print(__doc__) # Author: Taylor Smith <*****@*****.**> import pmdarima as pm from pmdarima import model_selection import joblib # for persistence import os # ############################################################################# # Load the data and split it into separate pieces y = pm.datasets.load_wineind() train, test = model_selection.train_test_split(y, train_size=125) # Fit an ARIMA arima = pm.ARIMA(order=(1, 1, 2), seasonal_order=(0, 1, 1, 12)) arima.fit(y) # ############################################################################# # Persist a model and create predictions after re-loading it pickle_tgt = "arima.pkl" try: # Pickle it joblib.dump(arima, pickle_tgt, compress=3) # Load the model up, create predictions arima_loaded = joblib.load(pickle_tgt) preds = arima_loaded.predict(n_periods=test.shape[0])