Example #1
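These snippets share a preamble that the listing elides. A plausible import block (an assumption based on the calls made below; project-local helpers such as remove_negative_rows, individual_series, splitter, data_transformation, get_weekly_aggregate, ma_replace_outlier, load_data, cond_check, detrend and monthly_aggregate are defined elsewhere in the project and not shown):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from dateutil import parser
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.api import Holt, SimpleExpSmoothing, ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from fbprophet import Prophet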
def average(input_df, kunag, matnr):
    """
    Applies a naive running-mean model and computes the MSE on the test split.
    :param input_df: pandas DataFrame of invoice rows (kunag, matnr, date, quantity)
    :param kunag: customer identifier
    :param matnr: material (product) identifier
    :return: (dataframe with predictions appended, test MSE)
    """
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0  # flags the first test row so the test split can be located later
    for index, row in df_series_test.iterrows():
        row["prediction"] = df_series_train["quantity"].mean()
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T
                                     ]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"],
                                         test_df["prediction"])
Example #2
def arima(input_df, kunag, matnr, p=2, d=1, q=4, trend="n"):
    """Walk-forward one-step SARIMAX(p, d, q) forecasts; returns (output_df, test MSE)."""
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0
    for index, row in df_series_test.iterrows():
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        fit1 = sm.tsa.statespace.SARIMAX(df_series_train["quantity"],
                                         order=(p, d, q),
                                         trend=trend).fit()
        predicted = fit1.forecast(1)
        row["prediction"] = predicted.values[0]
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T
                                     ]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"],
                                         test_df["prediction"])
Example #3
def holts_linear_trend(input_df,
                       kunag,
                       matnr,
                       smoothing_level=0.3,
                       smoothing_slope=0.1):
    """Walk-forward one-step Holt linear-trend forecasts; returns (output_df, test MSE)."""
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0
    for index, row in df_series_test.iterrows():
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        fit1 = Holt(np.asarray(df_series_train["quantity"])).fit(
            smoothing_level=smoothing_level, smoothing_slope=smoothing_slope)
        predicted = fit1.forecast(1)
        row["prediction"] = predicted[0]
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T
                                     ]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"],
                                         test_df["prediction"])
Example #4
def simple_exponential_smoothing(input_df, kunag, matnr, alpha=0.6):
    """Walk-forward one-step simple exponential smoothing; returns (output_df, test MSE)."""
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0
    for index, row in df_series_test.iterrows():
        fit2 = SimpleExpSmoothing(np.asarray(df_series_train["quantity"])).fit(
            smoothing_level=alpha, optimized=False)
        row["prediction"] = fit2.forecast(1)
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T
                                     ]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"],
                                         test_df["prediction"])
Example #5
def prophet(input_df,
            kunag,
            matnr,
            growth="linear",
            changepoint_prior_scale=0.5,
            yearly_seasonality=False,
            daily_seasonality=False,
            weekly_seasonality=False,
            seasonality_prior_scale=100):
    """Walk-forward one-step Prophet forecasts; returns (output_df, test MSE)."""
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0
    for index, row in df_series_test.iterrows():
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        df_copy = df_series_train.copy()
        ts = prophet_preprocess(df_copy)
        m = Prophet(growth=growth,
                    changepoint_prior_scale=changepoint_prior_scale,
                    yearly_seasonality=yearly_seasonality,
                    daily_seasonality=daily_seasonality,
                    weekly_seasonality=weekly_seasonality,
                    seasonality_prior_scale=seasonality_prior_scale)
        m.fit(ts)
        # Shift the stamp by 4 days, presumably to align Prophet's week-end
        # dates with the dt_week convention used by get_weekly_aggregate.
        future = m.make_future_dataframe(
            periods=1, freq="W", include_history=False).apply(
                lambda x: x + pd.Timedelta(4, unit="D"))

        forecast = m.predict(future)
        row["prediction"] = forecast.iloc[0]["yhat"]
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T
                                     ]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    return output_df, mean_squared_error(test_df["quantity"],
                                         test_df["prediction"])
Example #6
def holts_winter_method(input_df,
                        kunag,
                        matnr,
                        seasonal_period,
                        alpha=None,
                        beta=None,
                        gamma=None):
    """

    :param input_df:
    :param kunag:
    :param matnr:
    :param seasonal_period:
    :param alpha:
    :param beta:
    :param gamma:
    :return:
    """
    df = input_df.copy()
    df = remove_negative_rows(df)
    df_series = individual_series(df, kunag, matnr)
    df_series = data_transformation.get_weekly_aggregate(df_series)
    df_series["date"] = df_series["dt_week"].map(str)
    df_series["date"] = df_series["date"].apply(lambda x: x.replace("-", ""))
    df_series["prediction"] = df_series["quantity"]
    df_series_train, df_series_test = splitter(df_series)
    k = 0
    for index, row in df_series_test.iterrows():
        df_series_train["quantity"] = df_series_train["quantity"].map(float)
        fit1 = ExponentialSmoothing(
            np.asarray(df_series_train["quantity"]),
            seasonal_periods=seasonal_period,
            trend='add',
            seasonal='add',
        ).fit(smoothing_level=alpha,
              smoothing_slope=beta,
              smoothing_seasonal=gamma)
        predicted = fit1.forecast(1)
        row["prediction"] = predicted[0]
        df_series_train = pd.concat([df_series_train,
                                     pd.DataFrame(row).T
                                     ]).reset_index(drop=True)
        if k == 0:
            test_index = df_series_train.shape[0] - 1
            k = 1
    output_df = df_series_train
    test_df = df_series_train.iloc[test_index:]
    # print("mean squared error is :",mean_squared_error(output_df["quantity"], output_df["prediction"]))
    return output_df, mean_squared_error(test_df["quantity"],
                                         test_df["prediction"])
Example #7
def individual_series(input_df, kunag=500057582, matnr=103029):
    """
    selects a dataframe corresponding to a particular kunag and matnr
    param: a pandas dataframe
    return: a pandas dataframe
    """
    df_copy = input_df.copy()
    df_copy = remove_negative_rows(df_copy)
    df_copy = df_copy[df_copy["date"] >= 20160703]
    output_df = df_copy[(df_copy["kunag"] == kunag)
                        & (df_copy["matnr"] == matnr)]
    output_df = get_weekly_aggregate(output_df)
    output_df["dt_week"] = output_df["dt_week"].apply(
        lambda x: pd.to_datetime(x, format="%Y-%m-%d"))
    outlier_removed = outlier(output_df)
    return outlier_removed
Example #8
def samples_aggregate_seas():
    df = load_data()
    bucket_1_sample = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/bucket_1_sample.csv"
    )
    k = 0
    for index, row in bucket_1_sample.iterrows():
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })

        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)

        _result = ma_replace_outlier(data=aggregated_data,
                                     n_pass=3,
                                     aggressive=True,
                                     window_size=12,
                                     sigma=3.0)
        result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:  # the first iteration seeds the accumulator
            final = result
            k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    #plt.figure(figsize=(16, 8))
    #plt.plot(final["quantity"], label='quantity', marker=".")
    #plt.title("200 sample aggregated plot")
    #plt.xlabel("dt_weeks")
    #plt.ylabel("aggregated quantities")
    #plt.show()
    result = seasonal_decompose(final["quantity"], model="additive")
    #result.plot()
    #plt.show()
    return result.seasonal
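The k-flag seeding pattern above recurs throughout these examples; a more idiomatic equivalent (toy frames for illustration) collects the pieces in a list and concatenates once:

import pandas as pd

pieces = [pd.DataFrame({"dt_week": ["2017-01-0%d" % (i + 1)], "quantity": [i]})
          for i in range(3)]
final = pd.concat(pieces, ignore_index=True)
print(final)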
Example #9
def aggregate_seasonal_comp():
    df = load_data()[["date", "quantity"]]
    df = df[(df["quantity"] >= 0) & (df["quantity"] <= 10)]
    aggregate_data = df.groupby("date")["quantity"].sum()
    aggregate_data = aggregate_data.reset_index()
    aggregate_data["kunag"] = 1
    aggregate_data["matnr"] = 2
    aggregate_data["price"] = 3
    aggregate_data = get_weekly_aggregate(aggregate_data)
    aggregate_data["dt_week"] = aggregate_data["dt_week"].apply(
        lambda x: pd.to_datetime(x, format="%Y-%m-%d"))
    aggregate_data = aggregate_data.set_index("dt_week")
    # plt.figure(figsize=(16, 8))
    # plt.plot(aggregate_data["quantity"], label='quantity')
    # plt.title("aggregated plot")
    # plt.show()
    result = seasonal_decompose(aggregate_data["quantity"], model="additive")
    # result.plot()
    # plt.show()
    return result.seasonal
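seasonal_decompose in isolation on a synthetic weekly series (a sketch; with a DatetimeIndex of weekly frequency statsmodels infers a 52-period season):

import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

rng = np.random.default_rng(0)
idx = pd.date_range("2016-07-03", periods=104, freq="W")
y = pd.Series(10 + 3 * np.sin(2 * np.pi * np.arange(104) / 52)
              + rng.normal(0, 0.5, 104), index=idx)
res = seasonal_decompose(y, model="additive")
print(res.seasonal.head())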
Example #10
def dickey_fuller_test(input_df, matnr=112260):
    """
    This function aggregates whole cleaveland data with ma outliers removing different categories series outliers
    First week has been removed
    :return: pandas_df : seasonal component of the aggregated data
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    overall = overall[overall["matnr"] == matnr]
    product = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv",
        sep="\t")
    product_name = product[product["matnr"] == str(
        matnr)]["description"].values[0]
    k = 0
    for index, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        #print(df_series)
        df_series = get_weekly_aggregate(df_series)
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })

        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False

        if outlier:
            _result = ma_replace_outlier(data=aggregated_data,
                                         n_pass=n_pass,
                                         aggressive=True,
                                         window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        else:
            result = aggregated_data.rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    #temp = final
    # plt.plot(final, marker=".")
    # plt.title("original")
    # plt.show()
    final, Flag = cond_check(final)
    if Flag:
        final_detrended = detrend(final)
        # plt.plot(final_detrended, marker=".")
        # plt.title("detrended")
        # plt.show()
        final_aggregate = monthly_aggregate(final_detrended)
        # plt.plot(final_aggregate, marker=".")
        # plt.title("aggregated")
        # plt.show()
        result = adfuller(final_aggregate["quantity"],
                          maxlag=10,
                          autolag='t-stat')
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print("No of lags used: %f" % result[2])
        print("No of observations: %f" % result[3])
        print('Critical Values:')
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))
        #plt.figure(figsize=(16, 8))
        #plt.plot(temp, marker=".")
        if result[1] >= 0.05:
            #plt.title("Not Stationary")
            print("Not stationary")
        else:
            #plt.title("Stationary")
            print("Stationary")
        #plt.savefig("/home/aman/PycharmProjects/seasonality_hypothesis/seasonality_result_2/" + str(matnr) + "_" + product_name + ".png")
    else:
        print("length of series is less than 112")
Example #11
def overall_aggregate_seas(input_df, matnr=103029):
    """
    Aggregates all series for a given material (matnr), applying MA-based
    outlier replacement per frequency category, then saves a plot of the
    seasonal decomposition.
    :return: pandas_df : seasonal component of the aggregated data
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C025.csv")
    overall = overall[overall["matnr"] == matnr]
    product = pd.read_csv("~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    product_name = product[product["matnr"] == str(matnr)]["description"].values[0]
    k = 0
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if int(frequency) == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})

        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365+183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False

        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass, aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1

    try:
        final = final.groupby("dt_week")["quantity"].sum().reset_index()
    except NameError:  # no series survived the loop, so `final` was never bound
        return None
    final = final.set_index("dt_week")
    print(final["quantity"])
    print(final["quantity"].shape)
    result = seasonal_decompose(final["quantity"], model="additive")
    result.plot()
    #plt.show()
    plt.savefig(
        "/home/aman/PycharmProjects/seasonality_hypothesis/plots_product_C0025/"+str(matnr)+"_"+product_name+".png")
    #result.seasonal.to_csv(
    #    "~/PycharmProjects/seasonality_hypothesis/data_generated/product_aggregate_seasonality_"+str(matnr)+".csv")
    return result.seasonal
Example #12
        normalized_freq = float(freq) * multiplier
        return normalized_freq


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    from statsmodels.tsa.stattools import acf

    file_address = "/home/aman/Desktop/CSO_drug/data/raw_invoices_cleaveland_sample_100_stores_2018-12-09.tsv"

    from datetime import datetime
    dateparse = lambda dates: datetime.strptime(dates, '%Y%m%d')  # pd.datetime was removed from pandas
    data = pd.read_csv(file_address,
                       sep="\t",
                       parse_dates=['date'],
                       index_col='date',
                       date_parser=dateparse)
    data = data.sort_index()
    #norm_freq_last_year_all_series(data)
    ts = select_series(data, kunag=500076413, matnr=144089)
    ts = ts.reset_index()
    #ts.to_csv("original.csv")
    ts = data_transformation.get_weekly_aggregate(ts)
    #ts.to_csv("aggregated.csv")
    #pd.DataFrame(acf(ts["quantity"])).plot(kind='bar')
    #plt.show()
    #ts.to_csv("/home/aman/Desktop/CSO_drug/file_generated/series.csv")
    print("done")

    # plt.plot(ts_1)
    # plt.show()
Example #13
if __name__ == "__main__":
    df = load_data()
    df = df[df["quantity"] >= 0]
    df = df[df["matnr"] == 101728]
    rows = []
    for date, group in df.groupby("date"):
        rows.append([date, len(group["kunag"])])  # invoice-row count that day
    series_stores = pd.DataFrame(rows, columns=["date", "quantity"])
    series_stores = series_stores[series_stores["date"] >= 20160703]
    series_stores["matnr"] = "A"
    series_stores["kunag"] = "B"
    series_stores["price"] = 0
    series_stores["date"] = series_stores["date"].map(str)
    series_stores = series_stores.reset_index(drop=True)
    series_stores = get_weekly_aggregate(series_stores)
    series_stores = series_stores.set_index("dt_week")
    series_stores.to_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/stores_analysis.csv"
    )
    plt.plot(series_stores["quantity"], marker=".")
    plt.show()

    data_stores = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/stores_analysis.csv"
    )
    data_aggregated = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/aggregated.csv")
    data_aggregated["division"] = data_aggregated["quantity"] / data_stores["quantity"]
    data_aggregated = data_aggregated.set_index("dt_week")
Example #14
def ljung_box_test(input_df, matnr=112260):
    """
    This function aggregates whole cleaveland data with ma outliers removing different categories series outliers
    First week has been removed
    :return: pandas_df : seasonal component of the aggregated data
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
    overall = overall[overall["matnr"] == matnr]
    # product = pd.read_csv("~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    # product_name = product[product["matnr"] == str(int(matnr))]["description"].values[0]
    k = 0
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        # print(df_series)
        df_series = get_weekly_aggregate(df_series)
        # plt.plot(df_series.set_index("dt_week")["quantity"], marker=".", label="individual series")
        # plt.title(str(row["matnr"]) + " before outlier")
        # plt.show()
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})

        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365+183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False

        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass, aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        # plt.plot(result.set_index("dt_week")["quantity"], marker=".", label="individual_series_after_outlier")
        # plt.title(str(row["matnr"]) + " after outlier")
        # plt.show()
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    # plt.figure(figsize=(16, 8))
    # plt.plot(final.set_index("dt_week"), marker=".", markerfacecolor="red", label="y")
    # plt.plot(final.set_index("dt_week").diff(), marker=".", label="differenced aggregated_data")
    # plt.xticks(fontsize=14)
    # plt.yticks(fontsize=14)
    # plt.xlabel("Date", fontsize=14)
    # plt.ylabel("Quantity", fontsize=14)
    # plt.title("Product Weekly Aggregated Data", fontsize=16)
    # plt.legend(fontsize=14)
    # plt.show()
    final = outlier_on_aggregated(final)
    final_temp = final
    plt.figure(figsize=(16, 8))
    plt.plot(final.set_index("dt_week"), marker=".", markerfacecolor="red", label="y")
    # plt.plot(final.set_index("dt_week").diff(), marker=".", label="differenced aggregated_data")
    # plt.xticks(fontsize=14)
    # plt.yticks(fontsize=14)
    # plt.xlabel("Date", fontsize=14)
    # plt.ylabel("Quantity", fontsize=14)
    # plt.title("Product Weekly Aggregated Data Outlier Removed", fontsize=16)
    # plt.legend(fontsize=14)
    plt.show()
    # print("________", final.dtypes)
    final = final.set_index("dt_week")
    missing_more_24 = missing_data_detection(final)
    if missing_more_24:
        # print("data is missing for more than 6 months")
        return False, 1, 0, final, final_temp
    #temp = final
    # final = final.diff()
    # print("checking the length of aggregated series ...")
    final, Flag = cond_check(final)
    if Flag:
        # print("detrending the aggregated series ...")
        final_detrended = detrend(final)
        # plt.figure(figsize=(16, 8))
        # plt.plot(final_detrended, marker=".", markerfacecolor="red", label="y")
        # plt.xticks(fontsize=14)
        # plt.yticks(fontsize=14)
        # plt.xlabel("Date", fontsize=14)
        # plt.ylabel("Quantity", fontsize=14)
        # plt.title("Detrended", fontsize=16)
        # plt.legend(fontsize=14)
        # plt.show()
        # print("monthly aggregating the aggregated series ...")
        final_aggregate = monthly_aggregate(final_detrended)
        # plt.figure(figsize=(16, 8))
        # plt.plot(final_aggregate, marker=".", markerfacecolor="red", label="y")
        # plt.xticks(fontsize=14)
        # plt.yticks(fontsize=14)
        # plt.xlabel("Date", fontsize=14)
        # plt.ylabel("Quantity", fontsize=14)
        # plt.title("Monthly Aggregated", fontsize=16)
        # plt.legend(fontsize=14)
        # plt.show()
        # print("standard deviation is", final.std()/ final.mean())
        # print("performing ljung box test ...")
        result = acorr_ljungbox(final_aggregate["quantity"], lags=[13])
        # print(result)
        result_dickey = adfuller(final_aggregate["quantity"])
        # print("statistic: %f" %result[0])
        # print("p-value: %f" %result[1])
        # print("p_value is :", result[1][0])
        if result[1][0] < 0.02:  # Ljung-Box p-value at lag 13
            # print(str(matnr)+" is seasonal")
            return True, result[1][0], result_dickey[1], final, final_temp
        else:
            # print(str(matnr) + " is not seasonal")
            return False, result[1][0], result_dickey[1], final, final_temp
    else:
        print("length of series is less than 112")
        return False, "length is small", 0, final, final_temp
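The Ljung-Box call in isolation (a sketch on white noise, where a large p-value is expected; this assumes the older tuple-returning acorr_ljungbox that these snippets index into, while newer statsmodels returns a DataFrame unless return_df=False):

import numpy as np
from statsmodels.stats.diagnostic import acorr_ljungbox

rng = np.random.default_rng(0)
lb_stat, lb_pvalue = acorr_ljungbox(rng.normal(size=120), lags=[13])
print(lb_stat[0], lb_pvalue[0])  # large p-value: no autocorrelation at lag 13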
Example #15
def ljung_box_test_without_aggregation(input_df, matnr=112260):
    """
    This function aggregates whole cleaveland data with ma outliers removing different categories series outliers
    First week has been removed
    :return: pandas_df : seasonal component of the aggregated data
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
    overall = overall[overall["matnr"] == matnr]
    #product = pd.read_csv("~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    #product_name = product[product["matnr"] == str(int(matnr))]["description"].values[0]
    k = 0
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        #print(df_series)
        df_series = get_weekly_aggregate(df_series)
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})

        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365+183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False

        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass, aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    result = acorr_ljungbox(final["quantity"], lags=[52])
    # result[0] is the LB statistic array, result[1] the p-value array
    if result[1][0] < 0.01:
        return True, result[1], final
    else:
        return False, result[1], final
Example #16
def overall_aggregate_seas():
    """
    This function aggregates whole cleaveland data with ma outliers removing different categories series outliers
    First week has been removed
    :return: pandas_df : seasonal component of the aggregated data
    """
    df = load_data()
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    k = 0
    for index, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })

        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False

        if outlier:
            _result = ma_replace_outlier(data=aggregated_data,
                                         n_pass=n_pass,
                                         aggressive=True,
                                         window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        else:
            result = aggregated_data.rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    final.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed.csv"
    )
    result = seasonal_decompose(final["quantity"], model="additive")
    result.seasonal.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed_seas.csv"
    )
    return result.seasonal