示例#1
0
def outlier(df_series, category):
    """Prepare a weekly series and, for selected categories, clean its outliers.

    The ``quantity`` and ``dt_week`` columns are copied, ``dt_week`` is parsed
    with ``parser.parse``, ``quantity`` is cast to float, and the rows are
    sorted by week with a fresh index. Categories 1-3, 4/5/8 and 9 are then
    passed through ``ma_replace_outlier``; category 7 is too, but only when the
    input has at least 26 rows. Every other case returns the prepared frame
    without outlier replacement.

    :param df_series: DataFrame with ``quantity`` and ``dt_week`` columns
    :param category: category code that selects window size and sigma
    :return: DataFrame with ``quantity`` and ``dt_week`` columns
    """
    # (window_size, sigma) for the chosen category; None means "no cleaning".
    if category in (1, 2, 3):
        settings = (12, 3)
    elif category in (4, 5, 8):
        settings = (12, 5)
    elif category == 7 and df_series.shape[0] >= 26:
        settings = (12, 4)
    elif category == 9:
        settings = (24, 5)
    else:
        settings = None

    prepared = df_series[["quantity", "dt_week"]].copy().rename(
        columns={'dt_week': 'ds', 'quantity': 'y'})
    prepared.ds = prepared.ds.apply(str).apply(parser.parse)
    prepared.y = prepared.y.apply(float)
    prepared = prepared.sort_values('ds').reset_index(drop=True)

    original_names = {'ds': 'dt_week', 'y': 'quantity'}
    if settings is None:
        return prepared.rename(columns=original_names)

    window, threshold = settings
    cleaned = ma_replace_outlier(data=prepared,
                                 n_pass=3,
                                 aggressive=True,
                                 window_size=window,
                                 sigma=threshold)
    # Only the first element of the helper's result is used.
    return cleaned[0].rename(columns=original_names)
示例#2
0
def samples_aggregate_seas():
    """Return the seasonal component of the aggregated bucket-1 sample series.

    Each (kunag, matnr) pair listed in ``bucket_1_sample.csv`` is looked up in
    the frame returned by ``load_data``. Rows with negative quantity or a date
    before 20160703 are dropped, the series goes through
    ``get_weekly_aggregate`` and ``ma_replace_outlier`` (window 12, sigma 3.0),
    and all series are summed per ``dt_week`` before an additive
    ``seasonal_decompose``.

    :return: the ``seasonal`` attribute of the decomposition result
    :raises ValueError: if the sample file contains no rows to aggregate
    """
    df = load_data()
    bucket_1_sample = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/bucket_1_sample.csv"
    )
    column_names = {'ds': 'dt_week', 'y': 'quantity'}
    # Collect the per-series frames and concatenate once after the loop;
    # concatenating inside the loop copies the accumulated frame every time.
    cleaned_series = []
    for _, row in bucket_1_sample.iterrows():
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)

        aggregated_data = df_series[["quantity", "dt_week"]].copy().rename(
            columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)

        _result = ma_replace_outlier(data=aggregated_data,
                                     n_pass=3,
                                     aggressive=True,
                                     window_size=12,
                                     sigma=3.0)
        cleaned_series.append(_result[0].rename(columns=column_names))

    if not cleaned_series:
        # Previously this surfaced as an UnboundLocalError on ``final``.
        raise ValueError("bucket_1_sample.csv contains no series to aggregate")

    final = pd.concat(cleaned_series)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    result = seasonal_decompose(final["quantity"], model="additive")
    return result.seasonal
示例#3
0
def outlier_material(df_series):
    """Run ``ma_replace_outlier`` (window 12, sigma 3) on a weekly series.

    The ``quantity`` and ``dt_week`` columns are copied, ``dt_week`` is parsed
    with ``parser.parse``, ``quantity`` is cast to float, and the rows are
    sorted by week with a fresh index before the helper is called.

    :param df_series: DataFrame with ``quantity`` and ``dt_week`` columns
    :return: first element of the helper's result, with the columns renamed
             back to ``dt_week`` and ``quantity``
    """
    weekly = df_series[["quantity", "dt_week"]].copy()
    weekly = weekly.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
    weekly.ds = weekly.ds.apply(str).apply(parser.parse)
    weekly.y = weekly.y.apply(float)
    weekly = weekly.sort_values('ds').reset_index(drop=True)

    settings = dict(n_pass=3, aggressive=True, window_size=12, sigma=3)
    cleaned = ma_replace_outlier(data=weekly, **settings)
    return cleaned[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
示例#4
0
def outlier_on_aggregated(aggregated_df):
    """Run ``ma_replace_outlier`` (3 passes, window 12, sigma 3.0) on a frame.

    The ``quantity`` and ``dt_week`` columns are copied, ``dt_week`` is parsed
    with ``parser.parse``, ``quantity`` is cast to float, and the rows are
    sorted by week with a fresh index before the helper is called.

    :param aggregated_df: DataFrame with ``quantity`` and ``dt_week`` columns
    :return: first element of the helper's result, with the columns renamed
             back to ``dt_week`` and ``quantity``
    """
    to_prophet_names = {'dt_week': 'ds', 'quantity': 'y'}
    from_prophet_names = {'ds': 'dt_week', 'y': 'quantity'}

    weekly = aggregated_df[["quantity", "dt_week"]].copy().rename(columns=to_prophet_names)
    weekly.ds = weekly.ds.apply(str).apply(parser.parse)
    weekly.y = weekly.y.apply(float)
    weekly = weekly.sort_values('ds').reset_index(drop=True)

    cleaned = ma_replace_outlier(data=weekly,
                                 n_pass=3,
                                 aggressive=True,
                                 window_size=12,
                                 sigma=3.0)
    return cleaned[0].rename(columns=from_prophet_names)
示例#5
0
def dickey_fuller_test(input_df, matnr=112260):
    """
    Print an augmented Dickey-Fuller report for one material's aggregated series.

    For every row of the frequency/days table that belongs to ``matnr`` and has
    a non-zero frequency, the matching (kunag, matnr) series is filtered
    (quantity >= 0, date >= 20160703), passed through ``get_weekly_aggregate``
    and, depending on frequency, days and series length, through
    ``ma_replace_outlier``. All series are summed per ``dt_week``. If
    ``cond_check`` accepts the aggregate it is passed through ``detrend`` and
    ``monthly_aggregate`` and tested with ``adfuller``.

    :param input_df: DataFrame with ``kunag``, ``matnr``, ``quantity`` and
                     ``date`` columns
    :param matnr: material number to analyse
    :return: None; the statistics and the verdict are printed
    :raises ValueError: if no series with non-zero frequency exists for ``matnr``
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    overall = overall[overall["matnr"] == matnr]
    column_names = {'ds': 'dt_week', 'y': 'quantity'}
    # Collected here and concatenated once after the loop.
    cleaned_series = []
    for _, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        if frequency == 0:
            continue
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)

        aggregated_data = df_series[["quantity", "dt_week"]].copy().rename(
            columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)

        # (window_size, sigma) for ma_replace_outlier; None leaves the series
        # untouched. Resetting it on every iteration keeps a row that matches
        # no rule (e.g. NaN frequency) from reusing the previous row's values.
        outlier_params = None
        has_enough_weeks = len(aggregated_data) >= 26
        if frequency >= 26 and days > 365 + 183:
            outlier_params = (12, 4.0)
        elif 20 <= frequency < 26:
            outlier_params = (12, 5.0)
        elif frequency >= 26 and days <= 365 + 183:
            if has_enough_weeks:
                outlier_params = (12, 4.0)
        elif 12 <= frequency < 20:
            if has_enough_weeks:
                outlier_params = (24, 5.0)

        if outlier_params is not None:
            window_size, sigma = outlier_params
            _result = ma_replace_outlier(data=aggregated_data,
                                         n_pass=3,
                                         aggressive=True,
                                         window_size=window_size,
                                         sigma=sigma)
            cleaned_series.append(_result[0].rename(columns=column_names))
        else:
            cleaned_series.append(aggregated_data.rename(columns=column_names))

    if not cleaned_series:
        # Previously this surfaced as an UnboundLocalError on ``final``.
        raise ValueError(
            "no series with non-zero frequency found for matnr %s" % matnr)

    final = pd.concat(cleaned_series)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")

    final, is_usable = cond_check(final)
    if not is_usable:
        print("length of series is less than 112")
        return

    final_detrended = detrend(final)
    final_aggregate = monthly_aggregate(final_detrended)
    result = adfuller(final_aggregate["quantity"],
                      maxlag=10,
                      autolag='t-stat')
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print("No of lags used: %f" % result[2])
    print("No of observations: %f" % result[3])
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
    # A p-value at or above 0.05 is reported as non-stationary.
    if result[1] >= 0.05:
        print("Not stationary")
    else:
        print("Stationary")
def overall_aggregate_seas(input_df, matnr=103029):
    """
    Decompose one material's aggregated weekly series and save the plot.

    For every row of the frequency/days table that belongs to ``matnr`` and
    whose integer frequency is non-zero, the matching (kunag, matnr) series is
    filtered (quantity >= 0, date >= 20160703), passed through
    ``get_weekly_aggregate`` and, depending on frequency, days and series
    length, through ``ma_replace_outlier``. All series are summed per
    ``dt_week``, decomposed with an additive ``seasonal_decompose``, and the
    decomposition plot is written to a PNG named after the material.

    :param input_df: DataFrame with ``kunag``, ``matnr``, ``quantity`` and
                     ``date`` columns
    :param matnr: material number to analyse
    :return: the ``seasonal`` attribute of the decomposition result, or None
             when there is no series to aggregate
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C025.csv")
    overall = overall[overall["matnr"] == matnr]
    product = pd.read_csv("~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    product_name = product[product["matnr"] == str(matnr)]["description"].values[0]
    column_names = {'ds': 'dt_week', 'y': 'quantity'}
    # Collected here and concatenated once after the loop.
    cleaned_series = []
    for _, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        if int(frequency) == 0:
            continue
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)

        aggregated_data = df_series[["quantity", "dt_week"]].copy().rename(
            columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)

        # (window_size, sigma) for ma_replace_outlier; None leaves the series
        # untouched. Resetting it on every iteration keeps a row that matches
        # no rule from reusing the previous row's values.
        outlier_params = None
        has_enough_weeks = len(aggregated_data) >= 26
        if frequency >= 26 and days > 365 + 183:
            outlier_params = (12, 4.0)
        elif 20 <= frequency < 26:
            outlier_params = (12, 5.0)
        elif frequency >= 26 and days <= 365 + 183:
            if has_enough_weeks:
                outlier_params = (12, 4.0)
        elif 12 <= frequency < 20:
            if has_enough_weeks:
                outlier_params = (24, 5.0)

        if outlier_params is not None:
            window_size, sigma = outlier_params
            _result = ma_replace_outlier(data=aggregated_data, n_pass=3, aggressive=True,
                                         window_size=window_size, sigma=sigma)
            cleaned_series.append(_result[0].rename(columns=column_names))
        else:
            cleaned_series.append(aggregated_data.rename(columns=column_names))

    if not cleaned_series:
        # Explicit replacement for the former bare ``except`` around the
        # group-by: nothing was collected, so there is nothing to decompose.
        return None

    final = pd.concat(cleaned_series)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    print(final["quantity"])
    print(final["quantity"].shape)
    result = seasonal_decompose(final["quantity"], model="additive")
    result.plot()
    plt.savefig(
        "/home/aman/PycharmProjects/seasonality_hypothesis/plots_product_C0025/"+str(matnr)+"_"+product_name+".png")
    # Close the figure that was just saved so repeated calls (one per
    # material) do not accumulate open figures.
    plt.close()
    return result.seasonal
示例#7
0
def ljung_box_test(input_df, matnr=112260):
    """
    Run a Ljung-Box test (lag 13) on one material's monthly aggregated series.

    For every row of the frequency/days table that belongs to ``matnr`` and has
    a non-zero frequency, the matching (kunag, matnr) series is filtered
    (quantity >= 0, date >= 20160703), passed through ``get_weekly_aggregate``
    and, depending on frequency, days and series length, through
    ``ma_replace_outlier``. All series are summed per ``dt_week``, cleaned once
    more with ``outlier_on_aggregated`` and plotted (``plt.show()`` is called).
    The aggregate is then checked with ``missing_data_detection`` and
    ``cond_check``, passed through ``detrend`` and ``monthly_aggregate``, and
    tested with ``acorr_ljungbox`` and ``adfuller``.

    :param input_df: DataFrame with ``kunag``, ``matnr``, ``quantity`` and
                     ``date`` columns
    :param matnr: material number to analyse
    :return: five items ``(flag, ljung_box_p, adf_p, final, final_temp)``:
             * ``(False, 1, 0, ...)`` as a tuple when ``missing_data_detection``
               flags the aggregate;
             * ``[False, "length is small", 0, ...]`` as a list when
               ``cond_check`` rejects the aggregate;
             * otherwise a tuple whose flag is True when the Ljung-Box p-value
               is below 0.02.
             ``final`` is the aggregate indexed by ``dt_week`` (as returned by
             ``cond_check`` once that check has run); ``final_temp`` is the
             frame returned by ``outlier_on_aggregated``.
    :raises ValueError: if no series with non-zero frequency exists for ``matnr``
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
    overall = overall[overall["matnr"] == matnr]
    column_names = {'ds': 'dt_week', 'y': 'quantity'}
    # Collected here and concatenated once after the loop.
    cleaned_series = []
    for _, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        if frequency == 0:
            continue
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)

        aggregated_data = df_series[["quantity", "dt_week"]].copy().rename(
            columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)

        # (window_size, sigma) for ma_replace_outlier; None leaves the series
        # untouched. Resetting it on every iteration keeps a row that matches
        # no rule (e.g. NaN frequency) from reusing the previous row's values.
        outlier_params = None
        has_enough_weeks = len(aggregated_data) >= 26
        if frequency >= 26 and days > 365 + 183:
            outlier_params = (12, 4.0)
        elif 20 <= frequency < 26:
            outlier_params = (12, 5.0)
        elif frequency >= 26 and days <= 365 + 183:
            if has_enough_weeks:
                outlier_params = (12, 4.0)
        elif 12 <= frequency < 20:
            if has_enough_weeks:
                outlier_params = (24, 5.0)

        if outlier_params is not None:
            window_size, sigma = outlier_params
            _result = ma_replace_outlier(data=aggregated_data, n_pass=3, aggressive=True,
                                         window_size=window_size, sigma=sigma)
            cleaned_series.append(_result[0].rename(columns=column_names))
        else:
            cleaned_series.append(aggregated_data.rename(columns=column_names))

    if not cleaned_series:
        # Previously this surfaced as an UnboundLocalError on ``final``.
        raise ValueError(
            "no series with non-zero frequency found for matnr %s" % matnr)

    final = pd.concat(cleaned_series)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = outlier_on_aggregated(final)
    # Kept as returned by outlier_on_aggregated (dt_week still a column).
    final_temp = final
    plt.figure(figsize=(16, 8))
    plt.plot(final.set_index("dt_week"), marker=".", markerfacecolor="red", label="y")
    plt.show()

    final = final.set_index("dt_week")
    missing_more_24 = missing_data_detection(final)
    if missing_more_24:
        return False, 1, 0, final, final_temp

    final, is_usable = cond_check(final)
    if not is_usable:
        print("length of series is less than 112")
        return [False, "length is small", 0, final, final_temp]

    final_detrended = detrend(final)
    final_aggregate = monthly_aggregate(final_detrended)
    result = acorr_ljungbox(final_aggregate["quantity"], lags=[13])
    result_dickey = adfuller(final_aggregate["quantity"])
    # Only one lag is requested, so take the single p-value as a scalar and
    # use it for both the decision and the return value.
    p_value = result[1][0]
    is_seasonal = bool(p_value < 0.02)
    return is_seasonal, p_value, result_dickey[1], final, final_temp
示例#8
0
def ljung_box_test_without_aggregation(input_df, matnr=112260):
    """
    Run a Ljung-Box test (lag 52) on one material's weekly aggregated series.

    For every row of the frequency/days table that belongs to ``matnr`` and has
    a non-zero frequency, the matching (kunag, matnr) series is filtered
    (quantity >= 0, date >= 20160703), passed through ``get_weekly_aggregate``
    and, depending on frequency, days and series length, through
    ``ma_replace_outlier``. All series are summed per ``dt_week`` and the
    result is tested with ``acorr_ljungbox`` directly.

    :param input_df: DataFrame with ``kunag``, ``matnr``, ``quantity`` and
                     ``date`` columns
    :param matnr: material number to analyse
    :return: tuple ``(flag, p_value, final)``; the flag is True when the
             p-value is below 0.01, ``p_value`` is element 1 of the
             ``acorr_ljungbox`` result, and ``final`` is the aggregate indexed
             by ``dt_week``
    :raises ValueError: if no series with non-zero frequency exists for ``matnr``
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
    overall = overall[overall["matnr"] == matnr]
    column_names = {'ds': 'dt_week', 'y': 'quantity'}
    # Collected here and concatenated once after the loop.
    cleaned_series = []
    for _, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        if frequency == 0:
            continue
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)

        aggregated_data = df_series[["quantity", "dt_week"]].copy().rename(
            columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)

        # (window_size, sigma) for ma_replace_outlier; None leaves the series
        # untouched. Resetting it on every iteration keeps a row that matches
        # no rule (e.g. NaN frequency) from reusing the previous row's values.
        outlier_params = None
        has_enough_weeks = len(aggregated_data) >= 26
        if frequency >= 26 and days > 365 + 183:
            outlier_params = (12, 4.0)
        elif 20 <= frequency < 26:
            outlier_params = (12, 5.0)
        elif frequency >= 26 and days <= 365 + 183:
            if has_enough_weeks:
                outlier_params = (12, 4.0)
        elif 12 <= frequency < 20:
            if has_enough_weeks:
                outlier_params = (24, 5.0)

        if outlier_params is not None:
            window_size, sigma = outlier_params
            _result = ma_replace_outlier(data=aggregated_data, n_pass=3, aggressive=True,
                                         window_size=window_size, sigma=sigma)
            cleaned_series.append(_result[0].rename(columns=column_names))
        else:
            cleaned_series.append(aggregated_data.rename(columns=column_names))

    if not cleaned_series:
        # Previously this surfaced as an UnboundLocalError on ``final``.
        raise ValueError(
            "no series with non-zero frequency found for matnr %s" % matnr)

    final = pd.concat(cleaned_series)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    result = acorr_ljungbox(final["quantity"], lags=[52])

    p_value = result[1]
    if p_value < 0.01:
        return True, p_value, final
    return False, p_value, final
示例#9
0
def overall_aggregate_seas():
    """
    Aggregate every series in the frequency/days table and decompose the total.

    For every row of ``frequency_days_cleaveland.csv`` with a non-zero
    frequency, the matching (kunag, matnr) series from ``load_data`` is
    filtered (quantity >= 0, date >= 20160703), passed through
    ``get_weekly_aggregate`` and, depending on frequency, days and series
    length, through ``ma_replace_outlier``. All series are summed per
    ``dt_week``; the aggregate and the seasonal component of its additive
    ``seasonal_decompose`` are both written to CSV.

    :return: the ``seasonal`` attribute of the decomposition result
    :raises ValueError: if no series with non-zero frequency is found
    """
    df = load_data()
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    column_names = {'ds': 'dt_week', 'y': 'quantity'}
    # Collected here and concatenated once after the loop; concatenating inside
    # the loop copies the accumulated frame for every table row.
    cleaned_series = []
    for _, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        if frequency == 0:
            continue
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)

        aggregated_data = df_series[["quantity", "dt_week"]].copy().rename(
            columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)

        # (window_size, sigma) for ma_replace_outlier; None leaves the series
        # untouched. Resetting it on every iteration keeps a row that matches
        # no rule (e.g. NaN frequency) from reusing the previous row's values.
        outlier_params = None
        has_enough_weeks = len(aggregated_data) >= 26
        if frequency >= 26 and days > 365 + 183:
            outlier_params = (12, 4.0)
        elif 20 <= frequency < 26:
            outlier_params = (12, 5.0)
        elif frequency >= 26 and days <= 365 + 183:
            if has_enough_weeks:
                outlier_params = (12, 4.0)
        elif 12 <= frequency < 20:
            if has_enough_weeks:
                outlier_params = (24, 5.0)

        if outlier_params is not None:
            window_size, sigma = outlier_params
            _result = ma_replace_outlier(data=aggregated_data,
                                         n_pass=3,
                                         aggressive=True,
                                         window_size=window_size,
                                         sigma=sigma)
            cleaned_series.append(_result[0].rename(columns=column_names))
        else:
            cleaned_series.append(aggregated_data.rename(columns=column_names))

    if not cleaned_series:
        # Previously this surfaced as an UnboundLocalError on ``final``.
        raise ValueError("no series with non-zero frequency found to aggregate")

    final = pd.concat(cleaned_series)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    final.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed.csv"
    )
    result = seasonal_decompose(final["quantity"], model="additive")
    result.seasonal.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed_seas.csv"
    )
    return result.seasonal
示例#10
0
    "/home/aman/PycharmProjects/seasonality_hypothesis/115584.csv")

# Scratch script: run ma_replace_outlier on the frame loaded into ``data``
# (loaded above this fragment) and print a few rows of the result.
y = data.quantity
from outlier import ma_replace_outlier
from dateutil import parser
aggregated_df = data
print(aggregated_df)
# Work on a copy of the two needed columns, renamed to ds / y for the helper.
_testing = aggregated_df[["quantity", "dt_week"]].copy()
aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})

# Parse the week labels, cast quantities to float, then sort by week and
# renumber the index.
aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
aggregated_data.y = aggregated_data.y.apply(float)
aggregated_data = aggregated_data.sort_values('ds')
aggregated_data = aggregated_data.reset_index(drop=True)
n_pass = 3
window_size = 12
sigma = 3.0
_result = ma_replace_outlier(data=aggregated_data,
                             n_pass=n_pass,
                             aggressive=True,
                             window_size=window_size,
                             sigma=sigma)
# Only the first element of the helper's result is used; restore the names.
result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
# Inspect rows at positions 31-33 of the cleaned series.
print(result.iloc[31:34])

# import numpy as np
# a = np.ones(10)
#
# a[7] = 2
# print(a)
# print(np.where(a == 2))