def compute_and_plot_submission_with_arima(save_path, fig_folder):
    """
    Compare an ARIMA model against the configured neural-network models for
    every series of "data/train.csv", plot the forecasts and write the
    submission files.

    For each series two figures are produced:

    * "<fig_folder>series-<i>-comparison.pdf": every model is fitted on the
      training interval and scored (SMAPE) on the validation interval; the
      model with the lowest SMAPE is selected.
    * "<fig_folder>series-<i>-submission.pdf": every model is refitted on
      train + validation and forecasts the test interval; the forecast of the
      selected model goes into the submission.

    Files written: one raw and one "formatted" (keyvalue) CSV per series and
    model under ``fig_folder``, "data/all_best_nn_nosub.csv", ``save_path``,
    "data/arima_best.csv" and "data/arima_best_submission.csv".

    Relies on module-level names defined elsewhere in the file:
    ``models_to_try``, ``models_to_try_parameters``, ``ARIMA_parameters``,
    ``keyvalue``, ``remove_outliers``, ``arima_forecast`` and
    ``auto_arima_forecast``.

    :param save_path: path of the formatted submission CSV
    :param fig_folder: prefix for figures and per-model CSV files; it is
        concatenated directly with the file names, so it has to end with a
        path separator
    :return: None
    """
    # with this dataset, the index column is day
    data = pd.read_csv("data/train.csv", index_col="Day")
    # changing index to datetime object year - month - day
    data.index = pd.to_datetime(data.index, format="%Y-%m-%d")
    data = data.asfreq('d')

    nbr_series = len(data.columns)
    start_date = data.index[0]
    # NOTE(review): hard-coded position, presumably the last training day
    # (2017-08-20, the day before the test interval) - confirm
    end_date = data.index[781]
    print("Start date " + str(start_date))
    print("End date " + str(end_date))

    interval_train = pd.date_range(start=start_date, end='2017-07-30')
    # training set without validation for the submission
    interval_train_full = pd.date_range(start=start_date, end=end_date)
    # validation is 21 days
    interval_valid = pd.date_range(start='2017-07-31', end=end_date)
    # test is 21 days
    interval_test = pd.date_range(start='2017-08-21', end='2017-09-10')
    # number of samples we are predicting
    horizon = len(interval_test)

    # separating data into train and validation set
    data_train = data.loc[interval_train]
    data_train_full = data.loc[interval_train_full]
    data_valid = data.loc[interval_valid]
    # the test frames only carry the index: there is no ground truth for them
    data_test = pd.DataFrame(index=interval_test)
    data_submit = pd.DataFrame(index=interval_test)
    data_best_arima = pd.DataFrame(index=interval_test)

    # for plotting
    methods_colors = [
        "blue", "red", "cyan", "orange", "pink", "magenta", "grey", "yellow"
    ]
    legend_size = 6
    # number of trailing training days shown in the zoomed plots
    number_of_days = 62

    # to store the chosen model
    chosen_method = []
    chosen_method_smape = []
    chosen_method_param = []

    # to record the smape of each model (index 0 is always the ARIMA model)
    record_smapes = []

    ARIMA_ORDER_VALID = []
    ARIMA_ORDER_TEST = []

    for i in range(1, nbr_series + 1):
        series_name = 'series-' + str(i)

        # to perform comparisons on models
        best_smape = 1000000000
        best_model = None
        best_model_param = None

        nn_models_current_series = models_to_try[i - 1]
        nn_models_params_current_series = models_to_try_parameters[i - 1]
        nbr_models_current_series = len(nn_models_current_series)
        record_smapes.append([])

        # --------------- validation: score every model ---------------
        # axis 0 compares all forecasts on the last number_of_days days
        # axis 1 compares all forecasts on the whole series
        # axis 2 shows the ARIMA forecast, axis 3+ one NN model each
        fig, ax = plt.subplots(nrows=nbr_models_current_series + 3,
                               ncols=1,
                               figsize=(12, 10))
        ax[0].set_title(
            'Comparison of model forecast on validation series for series ' +
            str(i) + ' (last ' + str(number_of_days) + ' days)')
        ax[0].plot(data_train[series_name][-number_of_days:],
                   color="black",
                   linestyle="-")
        ax[0].plot(data_valid[series_name], color="green", linestyle="-")
        ax[1].set_title(
            'Comparison of model forecast on validation series for series ' +
            str(i) + ' (whole series without outliers)')
        ax[1].plot(remove_outliers(data_train[series_name]),
                   color="black",
                   linestyle="-")
        ax[1].plot(data_valid[series_name], color="green", linestyle="-")

        # labels for the combined axes, in plotting order
        axis_legend_all = [
            "Train series", "Validation series", auto_arima_forecast.__name__
        ]
        for model_name in nn_models_current_series:
            axis_legend_all.append(model_name.__name__)

        axis_legend = ["Train series", "Validation series"]
        ax[2].set_title(
            'Comparison of model forecast on validation series for series ' +
            str(i) + ' (last ' + str(number_of_days) + ' days)')
        ax[2].plot(data_train[series_name][-number_of_days:],
                   color="black",
                   linestyle="-")
        ax[2].plot(data_valid[series_name], color="green", linestyle="-")

        # series with index >= 100 have no hand-picked ARIMA orders and fall
        # back to the automatic search
        if i >= 100:
            smape, forecast, order, seasonal_order = auto_arima_forecast(
                data_train[series_name],
                data_valid[series_name],
                horizon,
                del_outliers=True,
                normalize=True,
                plot=False)
        else:
            order = ARIMA_parameters[i - 1][0]
            seasonal_order = ARIMA_parameters[i - 1][1]
            smape, forecast = arima_forecast(data_train[series_name],
                                             data_valid[series_name],
                                             horizon,
                                             order,
                                             seasonal_order,
                                             del_outliers=True,
                                             normalize=True,
                                             plot=False)
        ARIMA_ORDER_VALID.append((order, seasonal_order))
        ax[0].plot(forecast, color="steelblue", linestyle="--")
        ax[1].plot(forecast, color="steelblue", linestyle="--")
        ax[2].plot(forecast, color="steelblue", linestyle="--")
        axis_legend_copy = copy.copy(axis_legend)
        axis_legend_copy.append(auto_arima_forecast.__name__ + " " +
                                str(order) + " " + str(seasonal_order) + " " +
                                "{:.2f}".format(smape))
        ax[2].legend(axis_legend_copy, prop={'size': legend_size})
        record_smapes[i - 1].append(smape)
        if smape < best_smape:
            best_smape = smape
            # auto_arima_forecast is used as the marker for "ARIMA" even when
            # the fixed-order arima_forecast produced the forecast
            best_model = auto_arima_forecast
            best_model_param = (order, seasonal_order)

        for model_index in range(nbr_models_current_series):
            # cycle through the palette so more models than colors cannot
            # raise an IndexError
            model_color = methods_colors[model_index % len(methods_colors)]
            ax[model_index + 3].set_title(
                'Comparison of model forecast on validation series for series '
                + str(i) + ' (last ' + str(number_of_days) + ' days)')
            ax[model_index + 3].plot(
                data_train[series_name][-number_of_days:],
                color="black",
                linestyle="-")
            ax[model_index + 3].plot(data_valid[series_name],
                                     color="green",
                                     linestyle="-")
            smape, forecast = nn_models_current_series[model_index](
                data_train[series_name],
                data_valid[series_name],
                nn_models_params_current_series[model_index],
                horizon,
                del_outliers=True,
                normalize=True,
                plot=False)
            ax[0].plot(forecast, color=model_color, linestyle="--")
            ax[1].plot(forecast, color=model_color, linestyle="--")
            ax[model_index + 3].plot(forecast,
                                     color=model_color,
                                     linestyle="--")
            axis_legend_copy = copy.copy(axis_legend)
            axis_legend_copy.append(
                nn_models_current_series[model_index].__name__ + " " +
                "{:.2f}".format(smape))
            ax[model_index + 3].legend(axis_legend_copy,
                                       prop={'size': legend_size})
            record_smapes[i - 1].append(smape)
            if smape < best_smape:
                best_smape = smape
                best_model = nn_models_current_series[model_index]
                best_model_param = nn_models_params_current_series[
                    model_index]

        # legend(labels) pairs the labels with the lines that already exist on
        # the axes, so it has to be called once every forecast is plotted
        ax[0].legend(axis_legend_all, prop={'size': legend_size})
        ax[1].legend(axis_legend_all, prop={'size': legend_size})

        print("--- SERIES " + str(i) + " SMAPES ---")
        print(" MODEL " + auto_arima_forecast.__name__ + " PARAM " +
              str(ARIMA_ORDER_VALID[i - 1]) + " SMAPE " +
              str(record_smapes[i - 1][0]))
        for model_index in range(nbr_models_current_series):
            print(" MODEL " + nn_models_current_series[model_index].__name__ +
                  " PARAM " +
                  str(nn_models_params_current_series[model_index]) +
                  " SMAPE " + str(record_smapes[i - 1][model_index + 1]))
        plt.tight_layout()
        plt.savefig(fig_folder + series_name + "-comparison.pdf")
        plt.show()
        # release the figure, otherwise two figures per series pile up
        plt.close(fig)

        # --------------- test: refit on train + validation ---------------
        fig, ax = plt.subplots(nrows=nbr_models_current_series + 3,
                               ncols=1,
                               figsize=(12, 10))
        ax[0].set_title(
            'Comparison of model forecast on test series for series ' +
            str(i) + ' (last ' + str(number_of_days) + ' days)')
        ax[0].plot(data_train[series_name][-number_of_days:],
                   color="black",
                   linestyle="-")
        ax[0].plot(data_valid[series_name], color="green", linestyle="-")
        ax[1].set_title(
            'Comparison of model forecast on test series for series ' +
            str(i) + ' (whole series without outliers)')
        ax[1].plot(remove_outliers(data_train[series_name]),
                   color="black",
                   linestyle="-")
        ax[1].plot(data_valid[series_name], color="green", linestyle="-")
        axis_legend_all = ["Train series", "Validation series"]

        axis_legend = ["Train series", "Validation series"]
        ax[2].set_title(
            'Comparison of model forecast on test series for series ' +
            str(i) + ' (last ' + str(number_of_days) + ' days)')
        ax[2].plot(data_train[series_name][-number_of_days:],
                   color="black",
                   linestyle="-")
        ax[2].plot(data_valid[series_name], color="green", linestyle="-")

        # data_test has no values, so the score returned here is ignored
        if i >= 100:
            _, forecast, order, seasonal_order = auto_arima_forecast(
                data_train_full[series_name],
                data_test,
                horizon,
                del_outliers=True,
                normalize=True,
                plot=False)
        else:
            order = ARIMA_parameters[i - 1][0]
            seasonal_order = ARIMA_parameters[i - 1][1]
            _, forecast = arima_forecast(data_train_full[series_name],
                                         data_test,
                                         horizon,
                                         order,
                                         seasonal_order,
                                         del_outliers=True,
                                         normalize=True,
                                         plot=False)
        ARIMA_ORDER_TEST.append((order, seasonal_order))
        data_best_arima[series_name] = forecast
        ax[0].plot(forecast, color="steelblue", linestyle="--")
        ax[1].plot(forecast, color="steelblue", linestyle="--")
        ax[2].plot(forecast, color="steelblue", linestyle="--")

        # save the ARIMA forecast, raw and in submission (keyvalue) format
        data_save_method = pd.DataFrame(index=interval_test)
        data_save_method[series_name] = forecast
        data_save_method.to_csv(fig_folder + str(i) + "_" +
                                auto_arima_forecast.__name__ + "_" +
                                str(order) + "_" + str(seasonal_order) +
                                ".csv")
        data_save_method = keyvalue(data_save_method)
        data_save_method.to_csv(fig_folder + str(i) + "_formatted_" +
                                auto_arima_forecast.__name__ + "_" +
                                str(order) + "_" + str(seasonal_order) +
                                ".csv")

        if auto_arima_forecast == best_model:
            axis_legend_all.append(auto_arima_forecast.__name__ + str(order) +
                                   " " + str(seasonal_order) + " (chosen)")
            axis_legend_copy = copy.copy(axis_legend)
            axis_legend_copy.append(auto_arima_forecast.__name__ +
                                    str(order) + " " + str(seasonal_order) +
                                    " (chosen)")
            ax[2].legend(axis_legend_copy, prop={'size': legend_size})
            data_submit[series_name] = forecast
            chosen_method.append(best_model.__name__)
            chosen_method_smape.append("{:.2f}".format(best_smape))
            chosen_method_param.append(
                str(order) + " - " + str(seasonal_order))
        else:
            axis_legend_all.append(auto_arima_forecast.__name__)
            axis_legend_copy = copy.copy(axis_legend)
            axis_legend_copy.append(auto_arima_forecast.__name__)
            ax[2].legend(axis_legend_copy, prop={'size': legend_size})

        for model_index in range(nbr_models_current_series):
            model_color = methods_colors[model_index % len(methods_colors)]
            ax[model_index + 3].set_title(
                'Comparison of model forecast on test series for series ' +
                str(i) + ' (last ' + str(number_of_days) + ' days)')
            ax[model_index + 3].plot(
                data_train[series_name][-number_of_days:],
                color="black",
                linestyle="-")
            ax[model_index + 3].plot(data_valid[series_name],
                                     color="green",
                                     linestyle="-")
            _, forecast = nn_models_current_series[model_index](
                data_train_full[series_name],
                data_test,
                nn_models_params_current_series[model_index],
                horizon,
                del_outliers=True,
                normalize=True,
                plot=False)
            ax[0].plot(forecast, color=model_color, linestyle="--")
            ax[1].plot(forecast, color=model_color, linestyle="--")
            ax[model_index + 3].plot(forecast,
                                     color=model_color,
                                     linestyle="--")

            # save this model's forecast, raw and in submission format
            data_save_method = pd.DataFrame(index=interval_test)
            data_save_method[series_name] = forecast
            data_save_method.to_csv(
                fig_folder + str(i) + "_" +
                nn_models_current_series[model_index].__name__ + "_" +
                str(nn_models_params_current_series[model_index]) + ".csv")
            data_save_method = keyvalue(data_save_method)
            data_save_method.to_csv(
                fig_folder + str(i) + "_formatted_" +
                nn_models_current_series[model_index].__name__ + "_" +
                str(nn_models_params_current_series[model_index]) + ".csv")

            if nn_models_current_series[model_index] == best_model:
                axis_legend_all.append(
                    nn_models_current_series[model_index].__name__ +
                    " (chosen)")
                axis_legend_copy = copy.copy(axis_legend)
                axis_legend_copy.append(
                    nn_models_current_series[model_index].__name__ +
                    " (chosen)")
                ax[model_index + 3].legend(axis_legend_copy,
                                           prop={'size': legend_size})
                data_submit[series_name] = forecast
                chosen_method.append(best_model.__name__)
                chosen_method_smape.append("{:.2f}".format(best_smape))
                chosen_method_param.append(best_model_param)
            else:
                axis_legend_all.append(
                    nn_models_current_series[model_index].__name__)
                axis_legend_copy = copy.copy(axis_legend)
                axis_legend_copy.append(
                    nn_models_current_series[model_index].__name__)
                ax[model_index + 3].legend(axis_legend_copy,
                                           prop={'size': legend_size})

        ax[0].legend(axis_legend_all, prop={'size': legend_size})
        ax[1].legend(axis_legend_all, prop={'size': legend_size})
        plt.tight_layout()
        plt.savefig(fig_folder + series_name + "-submission.pdf")
        plt.show()
        plt.close(fig)

    print("------------ SUBMISSION ------------")
    print()
    print(data_submit.to_string())
    data_submit.to_csv("data/all_best_nn_nosub.csv")
    print()
    print("------------ FORMATED SUBMISSION ------------")
    submission = keyvalue(data_submit)
    print(submission.to_string())
    submission.to_csv(save_path)
    print("------------ CHOSEN METHODS INFO FOR SUBMISSION ------------")
    print("METHODS " + str(chosen_method))
    print("PARAMS " + str(chosen_method_param))
    print("SMAPES " + str(chosen_method_smape))
    print()
    print("------------ ARIMA ORDERS ------------")
    print("ARIMA ORDER VALID " + str(ARIMA_ORDER_VALID))
    print("ARIMA ORDER TEST " + str(ARIMA_ORDER_TEST))
    print()
    print("------------ ARIMA SUBMISSION AN FORMATED SUBMISSION ------------")
    print()
    print(data_best_arima.to_string())
    data_best_arima.to_csv("data/arima_best.csv")
    print()
    submission = keyvalue(data_best_arima)
    print(submission.to_string())
    submission.to_csv("data/arima_best_submission.csv")
    print()
    print(
        "------------ ALL PREDICTED FORECAST ON ACTUAL DATASETS SAVED TO 'seriesid-nameofmethod-params.csv' ------------"
    )
def preprocessing():
    """Run the pre-processing pipeline for every entry of load_config['SM'].

    For each entry the pickled DataFrame './data/<name>/df_<name>.pickle' is
    loaded, metrics are calculated, missing values are handled and imputed,
    and every imputed variant is cleaned and saved twice: before one-hot
    encoding ('..._noOHE.pickle') and after encoding plus
    standardization/normalization ('..._OHE.pickle').

    Args:
        None

    Returns:
        None
    """
    for sm_entry in load_config['SM']:
        # Per-entry state
        folder = sm_entry['name']
        drop_cols = defaultdict(list)
        pickle_prefix = './data/' + folder + '/df_'

        # NOTE(review): pickle.load can execute arbitrary code; only safe as
        # long as these files are produced by this project
        with open(pickle_prefix + folder + '.pickle', 'rb') as handle:
            df = pickle.load(handle)

        # Nothing to do for an empty DataFrame
        if df.shape[0] == 0:
            print('Empty DataFrame for: ' + folder)
            continue

        # ---- Metric calculation ----
        # Calculate metrics and merge into final analysis DataFrame
        df = SQL.metric_calculation(df)

        # ---- Missing values ----
        # Blanks become NaN so they are counted as missing
        df = utils.white_to_nan(df)

        nulls = pd.isnull(df).sum().sum()
        has_missing = nulls > 0
        if has_missing:
            # Visualize NaN values
            utils.missing_visuals(df)

            # Remove columns with high NaN percentage
            # NOTE(review): dropna's thresh is an absolute count of non-NaN
            # values, so 0.9 only drops columns that are entirely NaN -
            # confirm whether a fraction of the row count was intended
            columns_before = df.columns
            df = df.dropna(axis=1, thresh=0.9)
            removed = [i for i in columns_before if i not in df.columns]
            drop_cols['high_nan'].append(removed)

            # Remove rows that are entirely NaN
            df = df.dropna(axis=0, how='all')

        # Convert to datetime possible date columns
        df = utils.datetime_cols(df)

        # Create synthetic dates
        df = utils.synthetic_dates(df)

        # ---- Imputing ----
        # Without missing values there is a single, untouched variant
        if has_missing:
            dict_impute = utils.imputing_nan(df)
        else:
            dict_impute = {'no_impute': df}

        # ---- Cleaning and pre-processing of every imputed variant ----
        for impute, df in list(dict_impute.items()):
            # Remove duplicates
            df = df.drop_duplicates(keep='first').reset_index(drop=True)

            # Remove columns that are unique identifiers
            df, drop_cols = utils.remove_identifier_columns(df, drop_cols)

            # Remove one-value columns
            df, drop_cols = utils.remove_one_value_columns(df, drop_cols)

            # Convert to categorical possible categorical columns
            df, _ = utils.search_categorical(df, 0.001)

            # Remove outliers using zscore
            df = utils.remove_outliers(df, 3)

            # Assign target labels
            df, target_cols = utils.apply_business_rules(df, folder)

            # Save the cleaned variant before encoding
            variant_prefix = pickle_prefix + impute + '_' + folder
            with open(variant_prefix + '_noOHE.pickle', 'wb') as handle:
                pickle.dump((df, target_cols), handle)

            # Perform One-Hot Encoding for categorical columns
            df, list_new_columns, drop_cols = utils.one_hot_encoding(
                df, 5, 1, drop_cols)

            # Standardize and normalize the DataFrame
            df = utils.standarize_normalize(df, target_cols, list_new_columns)
            dict_impute[impute] = df

            # Save the encoded variant
            with open(variant_prefix + '_OHE.pickle', 'wb') as handle:
                pickle.dump((df, target_cols), handle)
st.subheader('Select two Sectors and compare a metric')

# Sorted so that the option order (and therefore the default selection) is the
# same on every run; a bare set has no stable order. key=str keeps the sort
# working when the column mixes strings with other values such as NaN.
sector1 = st.selectbox('Select a Sector', sorted(set(df['Sector']), key=str))
sector2 = st.selectbox('Select a Sector to Compare',
                       sorted(set(df['Sector']) - {sector1}, key=str))
metric = st.selectbox('Select a Metric', (selectable_values))

# Rows whose metric is the placeholder '-' are dropped before the numeric
# conversion. The explicit copy makes the column assignment below operate on
# an independent frame instead of a filtered slice (no SettingWithCopyWarning).
df = df[df[metric] != '-'].copy()
df[metric] = pd.to_numeric(df[metric], downcast="float")

# One frame per selected sector, outliers removed with threshold 3.5
sector1_df = df[df['Sector'] == sector1]
sector2_df = df[df['Sector'] == sector2]
sector1_data = ut.remove_outliers(sector1_df, metric, 3.5)
sector2_data = ut.remove_outliers(sector2_df, metric, 3.5)

# Figure and styling: white labels/ticks on a transparent background
fig = plt.figure(figsize=(25, 15))
matplotlib.rcParams['axes.grid'] = True
matplotlib.rcParams['savefig.transparent'] = True
custom_style = {
    'axes.labelcolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white'
}
sns.set_style({'axes.grid': False})
sns.set_style(rc=custom_style)
'Beta': mne.filter.filter_data(data=np.mean(experiment_filtered.get_data(), axis=0), l_freq=IAF_p + 2, h_freq=30, sfreq=sfreq, method="fir")} # Calculating calibration values. Consider mean value of all channels. Va;ue are given in microvolts calibration_values = {} for band in WAVES: calibration_values[band] = np.mean(eyes_sub_bands[band], axis=0) * np.power(10, 6) # Performing STFT transform on experiment data for each sub-band. Window size is given in samples window = sfreq * 2 fft = {} for band in WAVES: fft[band] = stft(x=experiment_sub_bands[band], fs=sfreq, window=('kaiser', window), nperseg=1000) erd = np.vectorize(ERD) # Calculating ERD for experiment erd_mean = {} for band in fft: curr_erd = erd(fft[band][2], calibration_values[band]) erd_mean[band] = remove_outliers(np.real(np.mean(curr_erd, axis=0))) # Adding clean Beta and UA energy ratio erd_mean["ABratio"] = remove_outliers(np.real(np.power(experiment_sub_bands["UA"] / experiment_sub_bands["Beta"], 2))) # Dumping erd_mean of experiment pickle.dump(erd_mean, open(obg_dir / subject / "".join([subject, exp_type, ".pkl"]), 'wb'))
def table_performance_comparison_mrr(table, input_data):
    """Generate the table(s) with algorithm: table_performance_comparison_mrr
    specified in the specification file.

    The throughput samples of the "reference" and the "compare" builds are
    collected per test, cleaned with remove_outliers and reduced to mean and
    stdev (in Mpps). Tests for which both sides have data get the relative
    change appended and are written, sorted by that change, to one csv and one
    pretty-printed txt file per core configuration (1t1c, 2t2c, 4t4c).

    :param table: Table to generate.
    :param input_data: Data to process.
    :type table: pandas.Series
    :type input_data: InputData
    """

    logging.info(" Generating the table {0} ...".format(table.get(
        "title", "")))

    # Transform the data
    logging.info(" Creating the data set for the {0} '{1}'.".format(
        table.get("type", ""), table.get("title", "")))
    data = input_data.filter_data(table, continue_on_error=True)

    # Prepare the header of the tables
    try:
        header = [
            "Test case",
            "{0} Throughput [Mpps]".format(table["reference"]["title"]),
            "{0} stdev [Mpps]".format(table["reference"]["title"]),
            "{0} Throughput [Mpps]".format(table["compare"]["title"]),
            "{0} stdev [Mpps]".format(table["compare"]["title"]),
            "Change [%]"
        ]
        header_str = ",".join(header) + "\n"
    except (AttributeError, KeyError) as err:
        logging.error(
            "The model is invalid, missing parameter: {0}".format(err))
        return

    # Prepare data to the table:
    # .items() works for dicts and pandas Series on Python 2 and 3, whereas
    # .iteritems() does not exist on Python 3 dicts or on pandas >= 2.0 Series
    tbl_dict = dict()
    for job, builds in table["reference"]["data"].items():
        for build in builds:
            for tst_name, tst_data in data[job][str(build)].items():
                if tbl_dict.get(tst_name, None) is None:
                    name = "{0}-{1}".format(
                        tst_data["parent"].split("-")[0],
                        "-".join(tst_data["name"].split("-")[1:]))
                    tbl_dict[tst_name] = {
                        "name": name,
                        "ref-data": list(),
                        "cmp-data": list()
                    }
                try:
                    tbl_dict[tst_name]["ref-data"].\
                        append(tst_data["result"]["throughput"])
                except TypeError:
                    pass  # No data in output.xml for this test

    for job, builds in table["compare"]["data"].items():
        for build in builds:
            for tst_name, tst_data in data[job][str(build)].items():
                try:
                    tbl_dict[tst_name]["cmp-data"].\
                        append(tst_data["result"]["throughput"])
                except KeyError:
                    pass  # Test is not part of the reference data
                except TypeError:
                    # No result for this test: drop it from the comparison
                    tbl_dict.pop(tst_name, None)

    tbl_lst = list()
    for tst_entry in tbl_dict.values():
        item = [tst_entry["name"], ]
        item.extend(_mrr_mean_and_stdev(tst_entry["ref-data"], table))
        item.extend(_mrr_mean_and_stdev(tst_entry["cmp-data"], table))
        if item[1] is not None and item[3] is not None and item[1] != 0:
            item.append(int(relative_change(float(item[1]), float(item[3]))))
        # Only complete rows (both sides present, change computed) are kept
        if len(item) == 6:
            tbl_lst.append(item)

    # Sort the table according to the relative change
    tbl_lst.sort(key=lambda rel: rel[-1], reverse=True)

    # Generate tables:
    # All tests in csv:
    tbl_names = [
        "{0}-1t1c-full{1}".format(table["output-file"],
                                  table["output-file-ext"]),
        "{0}-2t2c-full{1}".format(table["output-file"],
                                  table["output-file-ext"]),
        "{0}-4t4c-full{1}".format(table["output-file"],
                                  table["output-file-ext"])
    ]
    for file_name in tbl_names:
        logging.info(" Writing file: '{0}'".format(file_name))
        cores = file_name.split("-")[-2]
        with open(file_name, "w") as file_handler:
            file_handler.write(header_str)
            for test in tbl_lst:
                if cores in test[0]:
                    # The last name component is stripped for the output; a
                    # new row is built instead of editing tbl_lst in place so
                    # a row can never be stripped more than once
                    row = ["-".join(test[0].split("-")[:-1])] + test[1:]
                    file_handler.write(
                        ",".join([str(item) for item in row]) + "\n")

    # All tests in txt:
    tbl_names_txt = [
        "{0}-1t1c-full.txt".format(table["output-file"]),
        "{0}-2t2c-full.txt".format(table["output-file"]),
        "{0}-4t4c-full.txt".format(table["output-file"])
    ]

    for csv_name, txt_name in zip(tbl_names, tbl_names_txt):
        txt_table = None
        logging.info(" Writing file: '{0}'".format(txt_name))
        # Text mode: csv.reader needs str rows on Python 3
        with open(csv_name, "r") as csv_file:
            csv_content = csv.reader(csv_file, delimiter=',', quotechar='"')
            for row in csv_content:
                if txt_table is None:
                    # The first row is the header
                    txt_table = prettytable.PrettyTable(row)
                else:
                    txt_table.add_row(row)
        txt_table.align["Test case"] = "l"
        with open(txt_name, "w") as txt_file:
            txt_file.write(str(txt_table))


def _mrr_mean_and_stdev(samples, table):
    """Return [mean, stdev] in Mpps of the samples without outliers, or
    [None, None] when there is nothing to evaluate.

    :param samples: Throughput samples of one test.
    :param table: Table specification; provides "outlier-const".
    :type samples: list
    :type table: pandas.Series
    :returns: Mean and standard deviation rounded to two decimals.
    :rtype: list
    """
    if not samples:
        return [None, None]
    # TODO: Specify window size.
    data_t = remove_outliers(samples, outlier_const=table["outlier-const"])
    if not data_t:
        return [None, None]
    return [round(mean(data_t) / 1000000, 2),
            round(stdev(data_t) / 1000000, 2)]
def nn_with_past_outliers_multi_step_forecast(series,
                                              validation_series,
                                              input_length,
                                              horizon,
                                              del_outliers=False,
                                              normalize=False,
                                              plot=False):
    """
    Perform forecasting of a time series using a simple neural network with a single 128 neurons hidden layer.
    The network is trained using samples of shape input_length (corresponding to the last input_length days) to
    predict an array of horizon values (corresponding to horizon days). In this case, the network predicts horizon
    days at the time. Performance of the trained network is assessed on a validation series. The size of the
    validation series must be horizon.

    This function differs from nn_multi_step_forecast as in addition to the last input_length days, we also use
    horizon days at the same period the previous year as an input to the network. In addition, the horizon days from
    the previous year are normalized from the original series and contain the outliers. The hope is to gain
    information from the previous year.

    The previous-year window is taken as values[-365:-365 + horizon], so the series has to cover at least 365
    samples and horizon has to be smaller than 365.

    :param series: training series
    :param validation_series: series the forecast is scored against; its index is used for the forecast
    :param input_length: number of trailing samples used as network input
    :param horizon: number of samples to predict
    :param del_outliers: whether to remove outliers from the training series
    :param normalize: whether to normalize the training series
    :param plot: whether to plot the last 100 training samples, the validation series and the forecast
    :return: SMAPE for the validation series, the forecast validation series
    """
    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)
    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)
        # the series with outliers gets its own scaling; only the scaler of
        # the cleaned series is needed to de-normalize the forecast
        _, working_series_with_outliers = normalize_series(series)
    else:
        scaler = None
        working_series_with_outliers = series

    # input sequence is our data, np.log1p is applied to the data and mae error is used to approximate SMAPE error
    train_series = np.log1p(working_series)

    # we use the last n_steps_in days as input and predict n_steps_out
    n_steps_in, n_steps_out = input_length, horizon

    # split into samples
    train_samples, train_targets = split_sequence_nn_with_past_outliers_multi_step(
        train_series, working_series_with_outliers, n_steps_in, n_steps_out)

    # create the model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=n_steps_in + horizon))
    # we predict n_steps_out values
    model.add(Dense(n_steps_out))
    # we use 'mae' with data transformed with log1p and expm1 to approach SMAPE error
    model.compile(optimizer='adam', loss='mae')

    # fit model
    model.fit(train_samples, train_targets, epochs=200, verbose=0)

    # perform prediction
    # input is the last n_steps_in values of the train series (working_series is not log1p transformed)
    # in addition, we prepend the horizon values from the last year
    validation_in_sample = np.log1p(
        np.append(
            np.array(working_series_with_outliers.values[-365:-365 + horizon]),
            np.array(working_series.values[-n_steps_in:])))
    validation_in_sample = validation_in_sample.reshape(
        (1, n_steps_in + horizon))
    validation_forecast = model.predict(validation_in_sample, verbose=0)

    # reverse log1p using expm1 (needed with and without normalization)
    validation_forecast = np.expm1(validation_forecast)

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    if normalize:
        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(
            validation_forecast.reshape(-1, 1))
        forecast_dataframe['forecast'] = [
            val[0] for val in denormalized_forecast
        ]
    else:
        # the prediction has shape (1, horizon); flatten it so it matches the
        # length of the validation index
        forecast_dataframe['forecast'] = validation_forecast.ravel()

    if plot:
        plt.figure(figsize=(10, 6))

        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title(
            "Validation of simple multi step NN with past values and input size "
            + str(n_steps_in) + " output size " + str(n_steps_out))

        plt.show()

    return smape(
        validation_series,
        forecast_dataframe['forecast']), forecast_dataframe['forecast']
def arima_forecast(series,
                   validation_series,
                   horizon,
                   order,
                   seasonal_order,
                   del_outliers=False,
                   normalize=False,
                   plot=False):
    """
    Creates an arima model with the provided order and seasonal order and assess performance of the model is on a
    validation series.

    :param series: training series
    :param validation_series: series the forecast is scored against; its index is used for the forecast
    :param horizon: number of samples to predict
    :param order: order of the ARIMA model
    :param seasonal_order: seasonal order of the ARIMA model
    :param del_outliers: whether to remove outliers from the training series
    :param normalize: whether to normalize the training series
    :param plot: whether to plot the last 100 training samples, the validation series and the forecast
    :return: SMAPE for the validation series, the forecast validation series
    """
    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)
    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)
    else:
        scaler = None

    # input sequence is our data
    train_series = working_series

    # fit a model with the given (fixed) orders
    model = arima.ARIMA(order=order,
                        seasonal_order=seasonal_order,
                        suppress_warnings=True)
    model.fit(train_series)

    # perform predictions
    f_autoarima = model.predict(n_periods=horizon)
    # plain positional values, whether predict returned an array or a Series
    forecast_values = pd.Series(f_autoarima).values

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:
        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(
            forecast_values.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast
    else:
        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = forecast_values

    if plot:
        plt.figure(figsize=(10, 6))

        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title("Validation of arima model with order " + str(order) +
                  " seasonal order " + str(seasonal_order))

        plt.show()

    return smape(
        validation_series,
        forecast_dataframe['forecast']), forecast_dataframe['forecast']
def auto_arima_forecast(series,
                        validation_series,
                        horizon,
                        del_outliers=False,
                        normalize=False,
                        plot=False):
    """
    Fits an auto arima model from the series to find the best parameters. Performance of the trained model is
    assessed on a validation series.

    :param series: training series
    :param validation_series: series the forecast is scored against; its index is used for the forecast
    :param horizon: number of samples to predict
    :param del_outliers: whether to remove outliers from the training series
    :param normalize: whether to normalize the training series
    :param plot: whether to plot the last 100 training samples, the validation series and the forecast
    :return: SMAPE for the validation series, the forecast validation series, order, seasonal_order
    """
    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)
    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)
    else:
        scaler = None

    # input sequence is our data
    train_series = working_series

    # perform search for best parameters (weekly seasonality, m=7); the model
    # that is returned is already fitted on train_series, so no second fit is
    # needed
    model = auto_arima(train_series,
                       seasonal=True,
                       max_D=2,
                       m=7,
                       trace=True,
                       error_action='ignore',
                       suppress_warnings=True,
                       stepwise=True)

    model_params = model.get_params()
    order = model_params['order']
    seasonal_order = model_params['seasonal_order']

    # perform predictions
    f_autoarima = model.predict(n_periods=horizon)
    # plain positional values, whether predict returned an array or a Series
    forecast_values = pd.Series(f_autoarima).values

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:
        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(
            forecast_values.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast
    else:
        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = forecast_values

    if plot:
        plt.figure(figsize=(10, 6))

        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")

        plt.legend(["Train series", "Validation series", "Predicted series"])

        plt.title("Validation of auto arima model")

        plt.show()

    return smape(validation_series, forecast_dataframe['forecast']
                 ), forecast_dataframe['forecast'], order, seasonal_order