def average(input_df, kunag, matnr):
    """
    Walk-forward "historical average" baseline forecast.

    For each test week, predicts the mean of every quantity seen so far
    (the training history plus previously forecast test rows, which are
    appended back into the history as the walk-forward proceeds), then
    scores the test span with mean squared error.

    :param input_df: raw invoice dataframe
    :param kunag: customer id selecting the series
    :param matnr: material id selecting the series
    :return: (output_df, mse) — the full series with a "prediction" column
             and the mean squared error over the test rows
    """
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    # Compact yyyymmdd string key derived from the week timestamp.
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    # The first test row lands at this index once the walk-forward appends
    # begin, so record it up front instead of tracking it with a flag
    # inside the loop (the original k/test_index dance, behavior-identical).
    test_index = df_series_train.shape[0]
    for index, row in df_series_test.iterrows():
        # Forecast = mean of everything accumulated so far.
        row["prediction"] = df_series_train["quantity"].mean()
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T]).reset_index(drop=True)
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    return output_df, mean_squared_error(test_df["quantity"], test_df["prediction"])
def arima(input_df, kunag, matnr, p=2, d=1, q=4, trend="n"):
    """
    Walk-forward one-step ARIMA forecast for one (kunag, matnr) series.

    Refits a SARIMAX(p, d, q) model on the growing history for every test
    week, appends the forecast row back into the history, and scores the
    test span with mean squared error.

    :param input_df: raw invoice dataframe
    :param kunag: customer id selecting the series
    :param matnr: material id selecting the series
    :param p: AR order
    :param d: differencing order
    :param q: MA order
    :param trend: SARIMAX trend specification ("n" = no trend)
    :return: (output_df, mse) — full series with a "prediction" column and
             the test-span mean squared error
    """
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    # Compact yyyymmdd string key derived from the week timestamp.
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0  # flag: capture the index of the first appended test row exactly once
    for index, row in df_series_test.iterrows():
        # statsmodels needs a float series (appended rows arrive as objects).
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        fit1 = sm.tsa.statespace.SARIMAX(df_series_train["quantity"],
                                         order=(p, d, q),
                                         trend=trend).fit()
        predicted = fit1.forecast(1)
        row["prediction"] = predicted.values[0]
        # Append the scored test row so the next refit sees it as history.
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"], test_df["prediction"])
def holts_linear_trend(input_df, kunag, matnr, smoothing_level=0.3, smoothing_slope=0.1):
    """
    Walk-forward one-step Holt's linear trend forecast for one
    (kunag, matnr) series; returns the full series plus test-span MSE.

    :param input_df: raw invoice dataframe
    :param kunag: customer id selecting the series
    :param matnr: material id selecting the series
    :param smoothing_level: Holt level smoothing parameter (alpha)
    :param smoothing_slope: Holt trend smoothing parameter (beta)
    :return: (output_df, mse)
    """
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    # Compact yyyymmdd string key derived from the week timestamp.
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0  # flag: capture the index of the first appended test row exactly once
    for index, row in df_series_test.iterrows():
        # statsmodels needs a float series (appended rows arrive as objects).
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        # NOTE(review): `smoothing_slope` was renamed `smoothing_trend` in
        # newer statsmodels — confirm the pinned statsmodels version.
        fit1 = Holt(np.asarray(df_series_train["quantity"])).fit(
            smoothing_level=smoothing_level, smoothing_slope=smoothing_slope)
        predicted = fit1.forecast(1)
        row["prediction"] = predicted[0]
        # Append the scored test row so the next refit sees it as history.
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"], test_df["prediction"])
def simple_exponential_smoothing(input_df, kunag, matnr, alpha=0.6):
    """
    Walk-forward one-step simple exponential smoothing forecast for one
    (kunag, matnr) series; returns the full series plus test-span MSE.

    :param input_df: raw invoice dataframe
    :param kunag: customer id selecting the series
    :param matnr: material id selecting the series
    :param alpha: fixed smoothing level (optimization disabled)
    :return: (output_df, mse)
    """
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    # Compact yyyymmdd string key derived from the week timestamp.
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    # First test row lands at this index once the walk-forward appends begin.
    test_index = df_series_train.shape[0]
    for index, row in df_series_test.iterrows():
        # FIX (consistency): cast to float like every sibling model does —
        # appended rows otherwise turn the column into objects.
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        fit2 = SimpleExpSmoothing(np.asarray(df_series_train["quantity"])).fit(
            smoothing_level=alpha, optimized=False)
        # BUG FIX: forecast(1) returns a length-1 array; store the scalar
        # (as the other models do via [0]) so mean_squared_error receives
        # numeric values, not array objects.
        row["prediction"] = fit2.forecast(1)[0]
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T]).reset_index(drop=True)
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"], test_df["prediction"])
def prophet(input_df, kunag, matnr, growth="linear", changepoint_prior_scale=0.5,
            yearly_seasonality=False, daily_seasonality=False, weekly_seasonality=False,
            seasonality_prior_scale=100):
    """
    Walk-forward one-step Prophet forecast for one (kunag, matnr) series.

    Refits a Prophet model on the growing history for every test week,
    appends the forecast row back into the history, and scores the test
    span with mean squared error.

    :param input_df: raw invoice dataframe
    :param kunag: customer id selecting the series
    :param matnr: material id selecting the series
    :param growth: Prophet growth curve ("linear" or "logistic")
    :param changepoint_prior_scale: trend-flexibility prior
    :param yearly_seasonality: enable the yearly seasonal component
    :param daily_seasonality: enable the daily seasonal component
    :param weekly_seasonality: enable the weekly seasonal component
    :param seasonality_prior_scale: seasonality-strength prior
    :return: (output_df, mse)
    """
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr, )
    df_series = data_transformation.get_weekly_aggregate(df_series)
    # Compact yyyymmdd string key derived from the week timestamp.
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0  # flag: capture the index of the first appended test row exactly once
    for index, row in df_series_test.iterrows():
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        df_copy = df_series_train.copy()
        # Reshape the history into Prophet's expected ds/y frame.
        ts = prophet_preprocess(df_copy)
        m = Prophet(growth=growth,
                    changepoint_prior_scale=changepoint_prior_scale,
                    yearly_seasonality=yearly_seasonality,
                    daily_seasonality=daily_seasonality,
                    weekly_seasonality=weekly_seasonality,
                    seasonality_prior_scale=seasonality_prior_scale)
        m.fit(ts)
        # The future stamp is shifted by 4 days — presumably to match the
        # weekday anchor the weekly aggregation uses; confirm against
        # get_weekly_aggregate before changing.
        future = m.make_future_dataframe(
            periods=1, freq="W", include_history=False).apply(
            lambda x: (x + pd.Timedelta(4, unit="D")))  # + pd.Timedelta(4, unit="D")
        forecast = m.predict(future)
        row["prediction"] = forecast.iloc[0]["yhat"]
        # Append the scored test row so the next refit sees it as history.
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    return output_df, mean_squared_error(test_df["quantity"], test_df["prediction"])
def holts_winter_method(input_df, kunag, matnr, seasonal_period, alpha=None, beta=None, gamma=None):
    """
    Walk-forward one-step Holt-Winters (triple exponential smoothing)
    forecast for one (kunag, matnr) series, additive trend and seasonality.

    :param input_df: raw invoice dataframe
    :param kunag: customer id selecting the series
    :param matnr: material id selecting the series
    :param seasonal_period: season length in weeks
    :param alpha: level smoothing parameter (None lets statsmodels estimate it)
    :param beta: trend smoothing parameter
    :param gamma: seasonal smoothing parameter
    :return: (output_df, mse)
    """
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    # Compact yyyymmdd string key derived from the week timestamp.
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0  # flag: capture the index of the first appended test row exactly once
    for index, row in df_series_test.iterrows():
        # statsmodels needs a float series (appended rows arrive as objects).
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        fit1 = ExponentialSmoothing(
            np.asarray(df_series_train["quantity"]),
            seasonal_periods=seasonal_period,
            trend='add',
            seasonal='add',
        ).fit(smoothing_level=alpha, smoothing_slope=beta, smoothing_seasonal=gamma)
        predicted = fit1.forecast(1)
        row["prediction"] = predicted[0]
        # Append the scored test row so the next refit sees it as history.
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"], test_df["prediction"])
def individual_series(input_df, kunag=500057582, matnr=103029):
    """
    Select the weekly-aggregated, outlier-cleaned series for one
    (kunag, matnr) pair.

    Negative-quantity rows and everything before 2016-07-03 are dropped
    before the pair is selected and aggregated to weeks.

    param: a pandas dataframe
    return: a pandas dataframe
    """
    cleaned = remove_negative_rows(input_df.copy())
    cleaned = cleaned[cleaned["date"] >= 20160703]
    pair_mask = (cleaned["kunag"] == kunag) & (cleaned["matnr"] == matnr)
    weekly = get_weekly_aggregate(cleaned[pair_mask])
    weekly["dt_week"] = weekly["dt_week"].apply(
        lambda d: pd.to_datetime(d, format="%Y-%m-%d"))
    return outlier(weekly)
def samples_aggregate_seas():
    """
    Seasonal component of the bucket-1 sampled series aggregated together.

    For every (kunag, matnr) pair in the bucket-1 sample csv: clean the
    series (non-negative quantities, first weeks dropped), weekly-aggregate
    it, replace outliers with a moving-average pass, then sum all cleaned
    series per week and return the additive seasonal component of the total.

    :return: seasonal component of the aggregated data, indexed by week
    """
    df = load_data()
    bucket_1_sample = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/bucket_1_sample.csv"
    )
    k = 0  # 0 until the first cleaned series seeds `final`
    for index, row in bucket_1_sample.iterrows():
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)
        # ma_replace_outlier expects ds/y column names.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        _result = ma_replace_outlier(data=aggregated_data, n_pass=3, aggressive=True,
                                     window_size=12, sigma=3.0)
        result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        # Accumulate all cleaned series; first iteration seeds `final`.
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
        k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    #plt.figure(figsize=(16, 8))
    #plt.plot(final["quantity"], label='quantity', marker=".")
    #plt.title("200 sample aggregated plot")
    #plt.xlabel("dt_weeks")
    #plt.ylabel("aggregated quantities")
    #plt.show()
    result = seasonal_decompose(final["quantity"], model="additive")
    #result.plot()
    #plt.show()
    return result.seasonal
def aggregate_seasonal_comp():
    """
    Seasonal component of the fully aggregated demand (all kunag/matnr).

    Sums quantities per date over the whole dataset (restricted to
    0 <= quantity <= 10), weekly-aggregates the totals under dummy ids,
    and returns the additive seasonal component of the decomposition.

    :return: seasonal component of the aggregated data, indexed by week
    """
    df = load_data()[["date", "quantity"]]
    df = df[(df["quantity"] >= 0) & (df["quantity"] <= 10)]
    aggregate_data = df.groupby("date")["quantity"].sum()
    aggregate_data = aggregate_data.reset_index()
    # get_weekly_aggregate expects these columns; the values are dummies.
    aggregate_data["kunag"] = 1
    aggregate_data["matnr"] = 2
    aggregate_data["price"] = 3
    aggregate_data = get_weekly_aggregate(aggregate_data)
    aggregate_data["dt_week"] = aggregate_data["dt_week"].apply(
        lambda x: pd.to_datetime(x, format="%Y-%m-%d"))
    aggregate_data = aggregate_data.set_index("dt_week")
    # plt.figure(figsize=(16, 8))
    # plt.plot(aggregate_data["quantity"], label='quantity')
    # plt.title("aggregated plot")
    # plt.show()
    result = seasonal_decompose(aggregate_data["quantity"], model="additive")
    # result.plot()
    # BUG FIX: a bare plt.show() was left active here while result.plot()
    # is commented out — it popped an empty/stale figure and blocked the
    # run.  Keep it disabled like the rest of the plotting code.
    # plt.show()
    return result.seasonal
def dickey_fuller_test(input_df, matnr=112260):
    """
    Augmented Dickey-Fuller stationarity check for one product's
    aggregated series over the cleaveland data.

    Cleans each (kunag, matnr) series (non-negative quantities, first
    weeks dropped), applies a frequency/history-dependent moving-average
    outlier pass, sums the cleaned series per week, then — if the series
    is long enough — detrends, monthly-aggregates, and prints the ADF
    statistic, p-value, and a stationary/not-stationary verdict.

    :param input_df: raw invoice dataframe
    :param matnr: material id whose series are aggregated
    :return: None — results are printed
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    overall = overall[overall["matnr"] == matnr]
    product = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    product_name = product[product["matnr"] == str(
        matnr)]["description"].values[0]
    k = 0  # 0 until the first cleaned series seeds `final`
    for index, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        #print(df_series)
        df_series = get_weekly_aggregate(df_series)
        # ma_replace_outlier expects ds/y column names.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Outlier-pass aggressiveness depends on order frequency and series
        # age; sparse series skip the pass.
        # NOTE(review): this local flag shadows the module-level outlier()
        # helper used elsewhere in the file — rename if refactoring.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass,
                                         aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        else:
            result = aggregated_data.rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        # Accumulate all cleaned series; first iteration seeds `final`.
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
        k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    #temp = final
    # plt.plot(final, marker=".")
    # plt.title("original")
    # plt.show()
    final, Flag = cond_check(final)
    if Flag:
        final_detrended = detrend(final)
        # plt.plot(final_detrended, marker=".")
        # plt.title("detrended")
        # plt.show()
        final_aggregate = monthly_aggregate(final_detrended)
        # plt.plot(final_aggregate, marker=".")
        # plt.title("aggregated")
        # plt.show()
        result = adfuller(final_aggregate["quantity"], maxlag=10, autolag='t-stat')
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print("No of lags used: %f" % result[2])
        print("No of observations: %f" % result[3])
        print('Critical Values:')
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))
        #plt.figure(figsize=(16, 8))
        #plt.plot(temp, marker=".")
        # Null hypothesis of the ADF test is non-stationarity.
        if result[1] >= 0.05:
            #plt.title("Not Stationary")
            print("Not stationary")
        else:
            #plt.title("Stationary")
            print("Stationary")
        #plt.savefig("/home/aman/PycharmProjects/seasonality_hypothesis/seasonality_result_2/" + str(matnr) + "_" + product_name + ".png")
    else:
        print("length of series is less than 112")
def overall_aggregate_seas(input_df, matnr=103029):
    """
    Seasonal component of one product's aggregated series.

    Cleans each (kunag, matnr) series (non-negative quantities, first
    weeks dropped), applies a frequency/history-dependent moving-average
    outlier pass, sums the cleaned series per week, decomposes the total
    additively, saves the decomposition plot, and returns the seasonal
    component.

    :param input_df: raw invoice dataframe
    :param matnr: material id whose series are aggregated
    :return: seasonal component (pandas Series), or None when no series
             survived the cleaning loop
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C025.csv")
    overall = overall[overall["matnr"] == matnr]
    product = pd.read_csv("~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    product_name = product[product["matnr"] == str(matnr)]["description"].values[0]
    k = 0  # 0 until the first cleaned series seeds `final`
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if int(frequency) == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        # ma_replace_outlier expects ds/y column names.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Outlier-pass aggressiveness depends on order frequency and series
        # age; sparse series skip the pass.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass, aggressive=True,
                                         window_size=window_size, sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        # Accumulate all cleaned series; first iteration seeds `final`.
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
        k = 1
    # BUG FIX: this used to be a bare `except:` that swallowed every error.
    # The only expected failure is `final` being unbound because every row
    # was skipped above; catch exactly that and let real errors propagate.
    try:
        final = final.groupby("dt_week")["quantity"].sum().reset_index()
    except NameError:
        return None
    final = final.set_index("dt_week")
    print(final["quantity"])
    print(final["quantity"].shape)
    result = seasonal_decompose(final["quantity"], model="additive")
    result.plot()
    #plt.show()
    plt.savefig(
        "/home/aman/PycharmProjects/seasonality_hypothesis/plots_product_C0025/"+str(matnr)+"_"+product_name+".png")
    #result.seasonal.to_csv(
    #    "~/PycharmProjects/seasonality_hypothesis/data_generated/product_aggregate_seasonality_"+str(matnr)+".csv")
    return result.seasonal
    # Tail of a frequency-normalisation helper whose `def` line is outside
    # this chunk; `freq` and `multiplier` are defined in the unseen part.
    normalized_freq = float(freq) * multiplier
    return normalized_freq


if __name__ == "__main__":
    # Ad-hoc driver: load the raw invoice sample, pick one series, and
    # weekly-aggregate it (all downstream plotting/export left disabled).
    import matplotlib.pyplot as plt
    from statsmodels.tsa.stattools import acf
    file_address = "/home/aman/Desktop/CSO_drug/data/raw_invoices_cleaveland_sample_100_stores_2018-12-09.tsv"
    # NOTE(review): pd.datetime was removed in pandas >= 2.0 — confirm the
    # pinned pandas version before upgrading.
    dateparse = lambda dates: pd.datetime.strptime(dates, '%Y%m%d')
    data = pd.read_csv(file_address, sep="\t", parse_dates=['date'],
                       index_col='date', date_parser=dateparse)
    data = data.sort_index()
    #norm_freq_last_year_all_series(data)
    ts = select_series(data, kunag=500076413, matnr=144089)
    ts = ts.reset_index()
    #ts.to_csv("original.csv")
    ts = data_transformation.get_weekly_aggregate(ts)
    #ts.to_csv("aggregated.csv")
    #pd.DataFrame(acf(ts["quantity"])).plot(kind='bar')
    #plt.show()
    #ts.to_csv("/home/aman/Desktop/CSO_drug/file_generated/series.csv")
    print("done")
    # plt.plot(ts_1)
    # plt.show()
if __name__ == "__main__":
    # Count, per date, how many store rows ordered material 101728, then
    # weekly-aggregate that count, persist it, and compute the ratio of the
    # aggregated quantity series to the store-count series.
    df = load_data()
    df = df[df["quantity"] >= 0]
    df = df[df["matnr"] == 101728]
    # FIX: build the frame once from a list instead of DataFrame.append in
    # a loop (O(n^2), removed in pandas >= 2.0), and group on the bare
    # column name so the key is a scalar date, not a 1-tuple.
    rows = []
    for date_value, group in df.groupby("date"):
        rows.append([date_value, len(group["kunag"])])
    series_stores = pd.DataFrame(rows, columns=["date", "quantity"])
    series_stores = series_stores[series_stores["date"] >= 20160703]
    # get_weekly_aggregate expects these columns; the values are dummies.
    series_stores["matnr"] = "A"
    series_stores["kunag"] = "B"
    series_stores["price"] = 0
    series_stores["date"] = series_stores["date"].map(str)
    series_stores = series_stores.reset_index(drop=True)
    series_stores = get_weekly_aggregate(series_stores)
    series_stores = series_stores.set_index("dt_week")
    series_stores.to_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/stores_analysis.csv"
    )
    plt.plot(series_stores["quantity"], marker=".")
    plt.show()
    data_stores = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/stores_analysis.csv"
    )
    data_aggregated = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/aggregated.csv")
    data_aggregated[
        "division"] = data_aggregated["quantity"] / data_stores["quantity"]
    data_aggregated = data_aggregated.set_index("dt_week")
def ljung_box_test(input_df, matnr=112260):
    """
    Ljung-Box seasonality test on one product's aggregated series.

    Cleans each (kunag, matnr) series, applies a frequency/history-based
    moving-average outlier pass, sums the cleaned series per week, removes
    outliers on the aggregate, then (when the series is long enough and
    has no gap over 6 months) detrends, monthly-aggregates, and runs
    Ljung-Box at lag 13 plus an ADF test.

    :param input_df: raw invoice dataframe
    :param matnr: material id whose series are aggregated
    :return: (is_seasonal, ljung_box_p_value, adf_p_value, final, final_temp)
             NOTE(review): the missing-data branch returns placeholder
             values (1, 0) and the short-series branch returns a *list*
             with a string p-value — confirm callers only unpack
             positionally before tightening.
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
    overall = overall[overall["matnr"] == matnr]
    # product = pd.read_csv("~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    # product_name = product[product["matnr"] == str(int(matnr))]["description"].values[0]
    k = 0  # 0 until the first cleaned series seeds `final`
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        # print(df_series)
        df_series = get_weekly_aggregate(df_series)
        # plt.plot(df_series.set_index("dt_week")["quantity"], marker=".", label="individual series")
        # plt.title(str(row["matnr"]) + " before outlier")
        # plt.show()
        # ma_replace_outlier expects ds/y column names.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Outlier-pass aggressiveness depends on order frequency and series
        # age; sparse series skip the pass.  NOTE(review): this local flag
        # shadows the module-level outlier() helper.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass, aggressive=True,
                                         window_size=window_size, sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        # plt.plot(result.set_index("dt_week")["quantity"], marker=".", label="individual_series_after_outlier")
        # plt.title(str(row["matnr"]) + " after outlier")
        # plt.show()
        # Accumulate all cleaned series; first iteration seeds `final`.
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
        k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    # plt.figure(figsize=(16, 8))
    # plt.plot(final.set_index("dt_week"), marker=".", markerfacecolor="red", label="y")
    # plt.plot(final.set_index("dt_week").diff(), marker=".", label="differenced aggregated_data")
    # plt.xticks(fontsize=14)
    # plt.yticks(fontsize=14)
    # plt.xlabel("Date", fontsize=14)
    # plt.ylabel("Quantity", fontsize=14)
    # plt.title("Product Weekly Aggregated Data", fontsize=16)
    # plt.legend(fontsize=14)
    # plt.show()
    # Second outlier pass, this time on the aggregated total.
    final = outlier_on_aggregated(final)
    final_temp = final
    # NOTE(review): this plt.figure/plot/show trio is still active (unlike
    # the surrounding commented plotting) and blocks on a GUI backend —
    # confirm whether it should be disabled too.
    plt.figure(figsize=(16, 8))
    plt.plot(final.set_index("dt_week"), marker=".", markerfacecolor="red", label="y")
    # plt.plot(final.set_index("dt_week").diff(), marker=".", label="differenced aggregated_data")
    # plt.xticks(fontsize=14)
    # plt.yticks(fontsize=14)
    # plt.xlabel("Date", fontsize=14)
    # plt.ylabel("Quantity", fontsize=14)
    # plt.title("Product Weekly Aggregated Data Outlier Removed", fontsize=16)
    # plt.legend(fontsize=14)
    plt.show()
    # print("________", final.dtypes)
    final = final.set_index("dt_week")
    missing_more_24 = missing_data_detection(final)
    if missing_more_24:
        # print("data is missing for more than 6 months")
        return False, 1, 0, final, final_temp
    #temp = final
    # final = final.diff()
    # print("checking the length of aggregated series ...")
    final, Flag = cond_check(final)
    if Flag:
        # print("detrending the aggregated series ...")
        final_detrended = detrend(final)
        # plt.figure(figsize=(16, 8))
        # plt.plot(final_detrended, marker=".", markerfacecolor="red", label="y")
        # plt.xticks(fontsize=14)
        # plt.yticks(fontsize=14)
        # plt.xlabel("Date", fontsize=14)
        # plt.ylabel("Quantity", fontsize=14)
        # plt.title("Detrended", fontsize=16)
        # plt.legend(fontsize=14)
        # plt.show()
        # print("monthly aggregating the aggregated series ...")
        final_aggregate = monthly_aggregate(final_detrended)
        # plt.figure(figsize=(16, 8))
        # plt.plot(final_aggregate, marker=".", markerfacecolor="red", label="y")
        # plt.xticks(fontsize=14)
        # plt.yticks(fontsize=14)
        # plt.xlabel("Date", fontsize=14)
        # plt.ylabel("Quantity", fontsize=14)
        # plt.title("Monthly Aggregated", fontsize=16)
        # plt.legend(fontsize=14)
        # plt.show()
        # print("standard deviation is", final.std()/ final.mean())
        # print("performing ljung box test ...")
        result = acorr_ljungbox(final_aggregate["quantity"], lags=[13])
        # print(result)
        result_dickey = adfuller(final_aggregate["quantity"])
        # print("statistic: %f" %result[0])
        # print("p-value: %f" %result[1])
        # print("p_value is :", result[1][0])
        if result[1] < 0.02:
            # print(str(matnr)+" is seasonal")
            return True, result[1][0], result_dickey[1], final, final_temp
        else:
            # print(str(matnr) + " is not seasonal")
            return False, result[1][0], result_dickey[1], final, final_temp
    else:
        print("length of series is less than 112")
        return [False, "length is small", 0, final, final_temp]
def ljung_box_test_without_aggregation(input_df, matnr=112260):
    """
    Ljung-Box seasonality test on one product's weekly aggregated series,
    without the detrend/monthly-aggregation steps of ljung_box_test.

    Cleans each (kunag, matnr) series, applies a frequency/history-based
    moving-average outlier pass, sums the cleaned series per week, and
    runs Ljung-Box directly on the weekly total at lag 52.

    :param input_df: raw invoice dataframe
    :param matnr: material id whose series are aggregated
    :return: (is_seasonal, ljung_box_p_value, final)
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
    overall = overall[overall["matnr"] == matnr]
    #product = pd.read_csv("~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    #product_name = product[product["matnr"] == str(int(matnr))]["description"].values[0]
    k = 0  # 0 until the first cleaned series seeds `final`
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        #print(df_series)
        df_series = get_weekly_aggregate(df_series)
        # ma_replace_outlier expects ds/y column names.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Outlier-pass aggressiveness depends on order frequency and series
        # age; sparse series skip the pass.  NOTE(review): this local flag
        # shadows the module-level outlier() helper.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass, aggressive=True,
                                         window_size=window_size, sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        # Accumulate all cleaned series; first iteration seeds `final`.
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
        k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    result = acorr_ljungbox(final["quantity"], lags=[52])
    # print("statistic: %f" %result[0])
    # print("p-value: %f" %result[1])
    if result[1] < 0.01:
        #print(result[1])
        return True, result[1], final
    else:
        return False, result[1], final
def overall_aggregate_seas():
    """
    Seasonal component of the whole cleaveland dataset aggregated together.

    Cleans every (kunag, matnr) series listed in the frequency/days csv
    (non-negative quantities, first weeks dropped), applies a
    frequency/history-dependent moving-average outlier pass, sums the
    cleaned series per week, writes the aggregate and its seasonal
    component to csv, and returns the seasonal component.

    NOTE(review): this redefines overall_aggregate_seas from earlier in
    the file (different signature) — the later definition wins at import
    time; confirm which one callers expect.  Also, if every row is skipped
    (frequency == 0 throughout), `final` is never assigned and the groupby
    below raises NameError.

    :return: pandas_df : seasonal component of the aggregated data
    """
    df = load_data()
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    k = 0  # 0 until the first cleaned series seeds `final`
    for index, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        # ma_replace_outlier expects ds/y column names.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Outlier-pass aggressiveness depends on order frequency and series
        # age; sparse series skip the pass.  NOTE(review): this local flag
        # shadows the module-level outlier() helper.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass,
                                         aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        else:
            result = aggregated_data.rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        # Accumulate all cleaned series; first iteration seeds `final`.
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
        k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    final.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed.csv"
    )
    result = seasonal_decompose(final["quantity"], model="additive")
    result.seasonal.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed_seas.csv"
    )
    return result.seasonal