def __init__(self, file_granularity: str, file_path_format: str, is_timezone_variable: bool, timezone: timezone = None, part_prefix: str = None, file_extension='.csv'):
    """Config for reading data from different mediums.

    Parameters
    ----------
    file_granularity : str
        One of ``'daily'``, ``'monthly'``, ``'weekly'``; selects the date
        offset used to step between consecutive files.
    file_path_format : str
        Format string used to build file paths.
    is_timezone_variable : bool
        Whether the timezone varies (semantics defined by the caller).
    timezone : timezone, optional
        Timezone associated with the data, if any.
    part_prefix : str, optional
        Prefix of part files; when given, reading is treated as part-wise.
    file_extension : str, default '.csv'
        File suffix appended to generated paths.

    Raises
    ------
    ValueError
        If ``file_granularity`` is not one of the supported values.
    """
    super(ReaderConfig, self).__init__('Config for reading data from different mediums')
    # Map each supported granularity to the offset between consecutive files.
    granularity_offsets = {
        'daily': timedelta(days=1),
        'monthly': offsets.MonthBegin(),
        'weekly': offsets.Week(weekday=0),  # weeks anchored on Monday
    }
    if file_granularity not in granularity_offsets:
        # Raise instead of assert: asserts are stripped under `python -O`,
        # which would silently leave `date_offset` unset for bad input.
        raise ValueError(
            "file_granularity must be one of %s, got %r"
            % (sorted(granularity_offsets), file_granularity))
    self.file_granularity = file_granularity
    self.date_offset = granularity_offsets[file_granularity]
    self.file_path_format = file_path_format
    self.file_extension = file_extension
    # Part files are only expected when a prefix was supplied.
    self.partwise = part_prefix is not None
    self.part_prefix = part_prefix
    self.is_timezone_variable = is_timezone_variable
    self.timezone = timezone
def test_valid(self):
    """Argument validation for time-based rolling windows."""
    frame = self.regular

    # A window string that is not a parsable frequency must be rejected.
    with pytest.raises(ValueError):
        frame.rolling(window="foobar")

    # Offset windows require a datetimelike index.
    with pytest.raises(ValueError):
        frame.reset_index().rolling(window="foobar")

    # Non-fixed frequencies cannot back a time-based window ...
    for bad_freq in ("2MS", offsets.MonthBegin(2)):
        with pytest.raises(ValueError):
            frame.rolling(window=bad_freq)

    # ... while fixed ones are accepted.
    for good_freq in ("1D", offsets.Day(2), "2ms"):
        frame.rolling(window=good_freq)

    # min_periods must be an integer.
    for bad_minp in (1.0, "foo", np.array([1, 2, 3])):
        with pytest.raises(ValueError):
            frame.rolling(window="1D", min_periods=bad_minp)

    # center is unsupported for time-based windows.
    with pytest.raises(NotImplementedError):
        frame.rolling(window="1D", center=True)
def test_valid(self):
    """Argument validation for time-based rolling, with pinned error messages."""
    frame = self.regular

    # Unparsable window frequency.
    msg = "passed window foobar is not compatible with a datetimelike index"
    with pytest.raises(ValueError, match=msg):
        frame.rolling(window="foobar")

    # Offset window on a non-datetimelike index.
    msg = "window must be an integer"
    with pytest.raises(ValueError, match=msg):
        frame.reset_index().rolling(window="foobar")

    # Non-fixed frequencies are rejected ...
    msg = "\\<2 \\* MonthBegins\\> is a non-fixed frequency"
    for bad_freq in ("2MS", offsets.MonthBegin(2)):
        with pytest.raises(ValueError, match=msg):
            frame.rolling(window=bad_freq)

    # ... fixed frequencies are accepted.
    for good_freq in ("1D", offsets.Day(2), "2ms"):
        frame.rolling(window=good_freq)

    # min_periods must be an integer.
    msg = (r"local variable 'minp' referenced before assignment|"
           "min_periods must be an integer")
    for bad_minp in (1.0, "foo", np.array([1, 2, 3])):
        with pytest.raises(ValueError, match=msg):
            frame.rolling(window="1D", min_periods=bad_minp)

    # center is unsupported for offset-based windows.
    msg = "center is not implemented for datetimelike and offset based windows"
    with pytest.raises(NotImplementedError, match=msg):
        frame.rolling(window="1D", center=True)
def get_marriage_mask(dates: pd.DataFrame, min_start='1970-01-01', max_end='2017-01-01') -> pd.DataFrame:
    """Build a month-by-marriage boolean mask of marriage spells.

    Parameters
    ----------
    dates : pd.DataFrame
        One row per marriage, with datetime columns ``'start'`` and ``'end'``.
        NaT entries (e.g. ongoing marriages) are filled with a sentinel one
        month past the grid so comparisons stay valid.
    min_start, max_end : str or datetime-like
        Bounds of the monthly grid of month starts.

    Returns
    -------
    pd.DataFrame
        Boolean frame indexed by month start with one column per row of
        ``dates``.  A cell is True when start <= month < end — the result is
        <inclusive, exclusive>.  (The original annotation said np.ndarray,
        but a DataFrame has always been returned.)
    """
    # Result is <inclusive, exclusive>
    dates = dates.copy()
    min_start = pd.to_datetime(min_start)
    max_end = pd.to_datetime(max_end)
    # A hack to validate NaT comparisons: push missing dates past the grid.
    dates = dates.fillna(max_end + offsets.MonthBegin())
    idx = pd.date_range(min_start, max_end, freq='MS')
    # Broadcast months (rows) against marriages (columns).  Go through
    # .to_numpy() here: multi-dimensional indexing such as idx[:, None] on a
    # pandas Index was deprecated in 0.25 and removed in pandas 1.0.
    months = idx.to_numpy()[:, None]
    marriage = dates['start'].values
    divorce = dates['end'].values
    mask = (months >= marriage) & (months < divorce)
    return pd.DataFrame(mask, index=idx, columns=dates.index)
def regress_by_store(df):
    """For each month-end, fit quarter- and year-window regressions and
    predict the following month; return one prediction frame per month."""
    predictions = []
    for period_end in pd.date_range(start='01/01/2016', end='05/01/2017', freq='M'):
        # Trailing quarter of training data ending at this month-end.
        trailing_quarter = take_df_by_valid_period(
            df, period_end - offsets.MonthBegin(3), period_end)
        if trailing_quarter.empty:
            continue
        # The month immediately after the training window is the target.
        target = take_df_by_period(
            df, period_end + offsets.MonthBegin(1), period_end + offsets.MonthEnd(1))
        if target.empty:
            continue
        quarter_pred = do_regression(trailing_quarter, target)
        # Same target, but trained on the trailing year.
        trailing_year = take_df_by_valid_period(
            df, period_end - offsets.MonthBegin(12), period_end)
        year_pred = do_regression(trailing_year, target)
        frame = pd.DataFrame(index=target.index)
        frame["quarter_regress_no_dow"] = quarter_pred
        frame["year_regress_no_dow"] = year_pred
        predictions.append(frame)
    return predictions
def test_ms_vs_capital_ms():
    """'ms' (milliseconds) and 'MS' (month start) resolve to distinct offsets."""
    lower = frequencies._get_offset("ms")
    upper = frequencies._get_offset("MS")
    assert lower == offsets.Milli()
    assert upper == offsets.MonthBegin()
def test_ms_vs_MS():
    """Frequency aliases are case-sensitive: 'ms' is Milli, 'MS' is MonthBegin."""
    lower = frequencies.get_offset('ms')
    upper = frequencies.get_offset('MS')
    assert lower == offsets.Milli()
    assert upper == offsets.MonthBegin()
def constrain_horizon(
    r,
    strict=False,
    cust=None,
    years=0,
    quarters=0,
    months=0,
    days=0,
    weeks=0,
    year=None,
    month=None,
    day=None,
):
    """Constrain a Series/DataFrame to a specified lookback period.

    See the documentation for dateutil.relativedelta:
    dateutil.readthedocs.io/en/stable/relativedelta.html

    Parameters
    ----------
    r : DataFrame or Series
        The target pandas object to constrain
    strict : bool, default False
        If True, raise Error if the implied start date on the horizon
        predates the actual start date of `r`.  If False, just return `r`
        in this situation
    cust : str, optional
        Shorthand horizon such as '5y', '18m', 'two years ago', or
        'twenty-four months'.  Mutually exclusive with the numeric params.
    years, months, weeks, days : int, default 0
        Relative information; specify as positive to subtract periods.
        Adding or subtracting a relativedelta with relative information
        performs the corresponding aritmetic operation on the original
        datetime value with the information in the relativedelta
    quarters : int, default 0
        Similar to the other plural relative info periods above, but note
        that this param is custom here.  (It is not a standard
        relativedelta param)
    year, month, day : int, default None
        Absolute information; specify as positive to subtract periods.
        Adding relativedelta with absolute information does not perform an
        aritmetic operation, but rather REPLACES the corresponding value in
        the original datetime with the value(s) in relativedelta

    Raises
    ------
    ValueError
        If both `cust` and a nonzero numeric param are given, if `cust`
        cannot be parsed, or (with ``strict=True``) if the implied start
        pre-dates the index.
    """
    # Spelled-out counts recognized inside `cust`.  Multi-word keys use a
    # single space ('twenty four'); hyphenated input is normalized below.
    textnum = {
        "zero": 0,
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8,
        "nine": 9,
        "ten": 10,
        "eleven": 11,
        "twelve": 12,
        "thirteen": 13,
        "fourteen": 14,
        "fifteen": 15,
        "sixteen": 16,
        "seventeen": 17,
        "eighteen": 18,
        "nineteen": 19,
        "twenty": 20,
        "twenty four": 24,
        "thirty six": 36,
    }

    relativedeltas = years, quarters, months, days, weeks, year, month, day
    if cust is not None and any(relativedeltas):
        raise ValueError("Cannot specify competing (nonzero) values for both"
                         " `cust` and other parameters.")
    if cust is not None:
        cust = cust.lower()
        if cust.endswith("y"):
            years = int(re.search(r"\d+", cust).group(0))
        elif cust.endswith("m"):
            months = int(re.search(r"\d+", cust).group(0))
        elif cust.endswith(("years ago", "year ago", "year", "years")):
            pos = cust.find(" year")
            # BUGFIX: normalize hyphens to spaces so 'twenty-four years'
            # matches the 'twenty four' key.  The old replace('-', '')
            # produced 'twentyfour', which is absent from `textnum` and
            # raised KeyError.
            years = textnum[cust[:pos].replace("-", " ")]
        elif cust.endswith(("months ago", "month ago", "month", "months")):
            pos = cust.find(" month")
            months = textnum[cust[:pos].replace("-", " ")]
        else:
            raise ValueError("`cust` not recognized.")

    # Convert quarters to months & combine for MonthOffset
    months += quarters * 3

    # Start date will be computed relative to `end`
    end = r.index[-1]

    # Establish some funky date conventions assumed in finance.  If the end
    # date is 6/30, the date *3 months prior* is 3/31, not 3/30 as would be
    # produced by dateutil.relativedelta.
    if end.is_month_end and days == 0 and weeks == 0:
        if years != 0:
            years *= 12
            months += years
        start = end - offsets.MonthBegin(months)
    else:
        start = end - offsets.DateOffset(
            years=years,
            months=months,
            days=days - 1,  # -1 keeps `end`'s own day inside the window
            weeks=weeks,
            year=year,
            month=month,
            day=day,
        )
    if strict and start < r.index[0]:
        raise ValueError("`start` pre-dates first element of the Index, %s"
                         % r.index[0])
    return r[start:end]
class TestRollingTS:
    # rolling time-series friendly
    # xref GH13327

    def test_doc_string(self):
        # Example from the rolling() docstring: offset-window sum over a
        # datetimelike index.
        df = DataFrame(
            {"B": [0, 1, 2, np.nan, 4]},
            index=[
                Timestamp("20130101 09:00:00"),
                Timestamp("20130101 09:00:02"),
                Timestamp("20130101 09:00:03"),
                Timestamp("20130101 09:00:05"),
                Timestamp("20130101 09:00:06"),
            ],
        )
        df  # NOTE(review): bare expression — no-op kept from the doc example
        df.rolling("2s").sum()

    def test_invalid_window_non_int(self, regular):
        # not a valid freq
        msg = "passed window foobar is not compatible with a datetimelike index"
        with pytest.raises(ValueError, match=msg):
            regular.rolling(window="foobar")
        # not a datetimelike index
        msg = "window must be an integer"
        with pytest.raises(ValueError, match=msg):
            regular.reset_index().rolling(window="foobar")

    @pytest.mark.parametrize("freq", ["2MS", offsets.MonthBegin(2)])
    def test_invalid_window_nonfixed(self, freq, regular):
        # non-fixed freqs
        msg = "\\<2 \\* MonthBegins\\> is a non-fixed frequency"
        with pytest.raises(ValueError, match=msg):
            regular.rolling(window=freq)

    @pytest.mark.parametrize("freq", ["1D", offsets.Day(2), "2ms"])
    def test_valid_window(self, freq, regular):
        # fixed frequencies are accepted as offset windows
        regular.rolling(window=freq)

    @pytest.mark.parametrize("minp", [1.0, "foo", np.array([1, 2, 3])])
    def test_invalid_minp(self, minp, regular):
        # non-integer min_periods
        msg = (r"local variable 'minp' referenced before assignment|"
               "min_periods must be an integer")
        with pytest.raises(ValueError, match=msg):
            regular.rolling(window="1D", min_periods=minp)

    def test_on(self, regular):
        df = regular
        # not a valid column
        msg = (r"invalid on specified as foobar, must be a column "
               "\\(of DataFrame\\), an Index or None")
        with pytest.raises(ValueError, match=msg):
            df.rolling(window="2s", on="foobar")
        # column is valid
        df = df.copy()
        df["C"] = date_range("20130101", periods=len(df))
        df.rolling(window="2d", on="C").sum()
        # invalid columns
        msg = "window must be an integer"
        with pytest.raises(ValueError, match=msg):
            df.rolling(window="2d", on="B")
        # ok even though on non-selected
        df.rolling(window="2d", on="C").B.sum()

    def test_monotonic_on(self):
        # on/index must be monotonic
        df = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": range(5)
        })
        assert df.A.is_monotonic_increasing
        df.rolling("2s", on="A").sum()
        df = df.set_index("A")
        assert df.index.is_monotonic_increasing
        df.rolling("2s").sum()

    def test_non_monotonic_on(self):
        # GH 19248
        df = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": range(5)
        })
        df = df.set_index("A")
        non_monotonic_index = df.index.to_list()
        # break monotonicity by duplicating a later timestamp at position 0
        non_monotonic_index[0] = non_monotonic_index[3]
        df.index = non_monotonic_index
        assert not df.index.is_monotonic_increasing
        msg = "index values must be monotonic"
        with pytest.raises(ValueError, match=msg):
            df.rolling("2s").sum()
        df = df.reset_index()
        msg = (r"invalid on specified as A, must be a column "
               "\\(of DataFrame\\), an Index or None")
        with pytest.raises(ValueError, match=msg):
            df.rolling("2s", on="A").sum()

    def test_frame_on(self):
        df = DataFrame({
            "B": range(5),
            "C": date_range("20130101 09:00:00", periods=5, freq="3s")
        })
        df["A"] = [
            Timestamp("20130101 09:00:00"),
            Timestamp("20130101 09:00:02"),
            Timestamp("20130101 09:00:03"),
            Timestamp("20130101 09:00:05"),
            Timestamp("20130101 09:00:06"),
        ]
        # we are doing simulating using 'on'
        expected = df.set_index("A").rolling("2s").B.sum().reset_index(
            drop=True)
        result = df.rolling("2s", on="A").B.sum()
        tm.assert_series_equal(result, expected)
        # test as a frame
        # we should be ignoring the 'on' as an aggregation column
        # note that the expected is setting, computing, and resetting
        # so the columns need to be switched compared
        # to the actual result where they are ordered as in the
        # original
        expected = (df.set_index("A").rolling("2s")[[
            "B"
        ]].sum().reset_index()[["B", "A"]])
        result = df.rolling("2s", on="A")[["B"]].sum()
        tm.assert_frame_equal(result, expected)

    def test_frame_on2(self):
        # using multiple aggregation columns
        df = DataFrame(
            {
                "A": [0, 1, 2, 3, 4],
                "B": [0, 1, 2, np.nan, 4],
                "C": Index([
                    Timestamp("20130101 09:00:00"),
                    Timestamp("20130101 09:00:02"),
                    Timestamp("20130101 09:00:03"),
                    Timestamp("20130101 09:00:05"),
                    Timestamp("20130101 09:00:06"),
                ]),
            },
            columns=["A", "C", "B"],
        )
        expected1 = DataFrame(
            {
                "A": [0.0, 1, 3, 3, 7],
                "B": [0, 1, 3, np.nan, 4],
                "C": df["C"]
            },
            columns=["A", "C", "B"],
        )
        result = df.rolling("2s", on="C").sum()
        expected = expected1
        tm.assert_frame_equal(result, expected)
        expected = Series([0, 1, 3, np.nan, 4], name="B")
        result = df.rolling("2s", on="C").B.sum()
        tm.assert_series_equal(result, expected)
        expected = expected1[["A", "B", "C"]]
        result = df.rolling("2s", on="C")[["A", "B", "C"]].sum()
        tm.assert_frame_equal(result, expected)

    def test_basic_regular(self, regular):
        df = regular.copy()
        df.index = date_range("20130101", periods=5, freq="D")
        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="1D").sum()
        tm.assert_frame_equal(result, expected)
        df.index = date_range("20130101", periods=5, freq="2D")
        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="2D", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)
        # NOTE(review): the next comparison repeats the previous one verbatim
        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="2D", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)
        expected = df.rolling(window=1).sum()
        result = df.rolling(window="2D").sum()
        tm.assert_frame_equal(result, expected)

    def test_min_periods(self, regular):
        # compare for min_periods
        df = regular
        # these slightly different
        expected = df.rolling(2, min_periods=1).sum()
        result = df.rolling("2s").sum()
        tm.assert_frame_equal(result, expected)
        expected = df.rolling(2, min_periods=1).sum()
        result = df.rolling("2s", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)

    def test_closed(self, regular):
        # xref GH13965
        df = DataFrame(
            {"A": [1] * 5},
            index=[
                Timestamp("20130101 09:00:01"),
                Timestamp("20130101 09:00:02"),
                Timestamp("20130101 09:00:03"),
                Timestamp("20130101 09:00:04"),
                Timestamp("20130101 09:00:06"),
            ],
        )
        # closed must be 'right', 'left', 'both', 'neither'
        msg = "closed must be 'right', 'left', 'both' or 'neither'"
        with pytest.raises(ValueError, match=msg):
            # NOTE(review): uses the `regular` fixture here, not the local df
            regular.rolling(window="2s", closed="blabla")
        expected = df.copy()
        expected["A"] = [1.0, 2, 2, 2, 1]
        result = df.rolling("2s", closed="right").sum()
        tm.assert_frame_equal(result, expected)
        # default should be 'right'
        result = df.rolling("2s").sum()
        tm.assert_frame_equal(result, expected)
        expected = df.copy()
        expected["A"] = [1.0, 2, 3, 3, 2]
        result = df.rolling("2s", closed="both").sum()
        tm.assert_frame_equal(result, expected)
        expected = df.copy()
        expected["A"] = [np.nan, 1.0, 2, 2, 1]
        result = df.rolling("2s", closed="left").sum()
        tm.assert_frame_equal(result, expected)
        expected = df.copy()
        expected["A"] = [np.nan, 1.0, 1, 1, np.nan]
        result = df.rolling("2s", closed="neither").sum()
        tm.assert_frame_equal(result, expected)

    def test_ragged_sum(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 3, 7]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=2).sum()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 3, np.nan, 7]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="3s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 5, 7]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="3s").sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 5, 7]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="4s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 6, 9]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="4s", min_periods=3).sum()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 3, 6, 9]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="5s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 6, 10]
        tm.assert_frame_equal(result, expected)

    def test_ragged_mean(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).mean()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=1).mean()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_median(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).median()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=1).median()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_quantile(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).quantile(0.5)
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=1).quantile(0.5)
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_std(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).std(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] * 5
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="1s", min_periods=1).std(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="3s", min_periods=1).std(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] + [0.5] * 4
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="5s", min_periods=1).std(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994]
        tm.assert_frame_equal(result, expected)

    def test_ragged_var(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).var(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] * 5
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="1s", min_periods=1).var(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="3s", min_periods=1).var(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] + [0.25] * 4
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="5s", min_periods=1).var(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0]
        tm.assert_frame_equal(result, expected)

    def test_ragged_skew(self, ragged):
        df = ragged
        result = df.rolling(window="3s", min_periods=1).skew()
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="5s", min_periods=1).skew()
        expected = df.copy()
        expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0]
        tm.assert_frame_equal(result, expected)

    def test_ragged_kurt(self, ragged):
        df = ragged
        result = df.rolling(window="3s", min_periods=1).kurt()
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="5s", min_periods=1).kurt()
        expected = df.copy()
        expected["B"] = [np.nan] * 4 + [-1.2]
        tm.assert_frame_equal(result, expected)

    def test_ragged_count(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).count()
        expected = df.copy()
        expected["B"] = [1.0, 1, 1, 1, 1]
        tm.assert_frame_equal(result, expected)
        df = ragged
        result = df.rolling(window="1s").count()
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=1).count()
        expected = df.copy()
        expected["B"] = [1.0, 1, 2, 1, 2]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=2).count()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 2, np.nan, 2]
        tm.assert_frame_equal(result, expected)

    def test_regular_min(self):
        df = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": [0.0, 1, 2, 3, 4]
        }).set_index("A")
        result = df.rolling("1s").min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)
        df = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": [5, 4, 3, 4, 5]
        }).set_index("A")
        # NOTE(review): this assert re-checks the previous result/expected
        # pair — likely a leftover from copy/paste; it does not use the new df
        tm.assert_frame_equal(result, expected)
        result = df.rolling("2s").min()
        expected = df.copy()
        expected["B"] = [5.0, 4, 3, 3, 4]
        tm.assert_frame_equal(result, expected)
        result = df.rolling("5s").min()
        expected = df.copy()
        expected["B"] = [5.0, 4, 3, 3, 3]
        tm.assert_frame_equal(result, expected)

    def test_ragged_min(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1, 3, 3]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="5s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 0, 0, 1, 1]
        tm.assert_frame_equal(result, expected)

    def test_perf_min(self):
        # sanity check that time-based min matches count-based min on a
        # regular 1-second grid (random data, tolerance-based comparison)
        N = 10000
        dfp = DataFrame({"B": np.random.randn(N)},
                        index=date_range("20130101", periods=N, freq="s"))
        expected = dfp.rolling(2, min_periods=1).min()
        result = dfp.rolling("2s").min()
        assert ((result - expected) < 0.01).all().bool()
        expected = dfp.rolling(200, min_periods=1).min()
        result = dfp.rolling("200s").min()
        assert ((result - expected) < 0.01).all().bool()

    def test_ragged_max(self, ragged):
        df = ragged
        result = df.rolling(window="1s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="2s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)
        result = df.rolling(window="5s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "freq, op, result_data",
        [
            ("ms", "min", [0.0] * 10),
            ("ms", "mean", [0.0] * 9 + [2.0 / 9]),
            ("ms", "max", [0.0] * 9 + [2.0]),
            ("s", "min", [0.0] * 10),
            ("s", "mean", [0.0] * 9 + [2.0 / 9]),
            ("s", "max", [0.0] * 9 + [2.0]),
            ("min", "min", [0.0] * 10),
            ("min", "mean", [0.0] * 9 + [2.0 / 9]),
            ("min", "max", [0.0] * 9 + [2.0]),
            ("h", "min", [0.0] * 10),
            ("h", "mean", [0.0] * 9 + [2.0 / 9]),
            ("h", "max", [0.0] * 9 + [2.0]),
            ("D", "min", [0.0] * 10),
            ("D", "mean", [0.0] * 9 + [2.0 / 9]),
            ("D", "max", [0.0] * 9 + [2.0]),
        ],
    )
    def test_freqs_ops(self, freq, op, result_data):
        # GH 21096
        index = date_range(start="2018-1-1 01:00:00",
                           freq=f"1{freq}",
                           periods=10)
        s = Series(data=0, index=index)
        s.iloc[1] = np.nan
        s.iloc[-1] = 2
        result = getattr(s.rolling(window=f"10{freq}"), op)()
        expected = Series(data=result_data, index=index)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "f",
        [
            "sum",
            "mean",
            pytest.param(
                "count",
                marks=pytest.mark.filterwarnings(
                    "ignore:min_periods:FutureWarning"),
            ),
            "median",
            "std",
            "var",
            "kurt",
            "skew",
            "min",
            "max",
        ],
    )
    def test_all(self, f, regular):
        # simple comparison of integer vs time-based windowing
        df = regular * 2
        er = df.rolling(window=1)
        r = df.rolling(window="1s")
        result = getattr(r, f)()
        expected = getattr(er, f)()
        tm.assert_frame_equal(result, expected)
        result = r.quantile(0.5)
        expected = er.quantile(0.5)
        tm.assert_frame_equal(result, expected)

    def test_all2(self, arithmetic_win_operators):
        f = arithmetic_win_operators
        # more sophisticated comparison of integer vs.
        # time-based windowing
        df = DataFrame({"B": np.arange(50)},
                       index=date_range("20130101", periods=50, freq="H"))
        # in-range data
        dft = df.between_time("09:00", "16:00")
        r = dft.rolling(window="5H")
        result = getattr(r, f)()

        # we need to roll the days separately
        # to compare with a time-based roll
        # finally groupby-apply will return a multi-index
        # so we need to drop the day
        def agg_by_day(x):
            x = x.between_time("09:00", "16:00")
            return getattr(x.rolling(5, min_periods=1), f)()

        expected = (df.groupby(df.index.day).apply(agg_by_day).reset_index(
            level=0, drop=True))
        tm.assert_frame_equal(result, expected)

    def test_groupby_monotonic(self):
        # GH 15130
        # we don't need to validate monotonicity when grouping
        # GH 43909 we should raise an error here to match
        # behaviour of non-groupby rolling.
        data = [
            ["David", "1/1/2015", 100],
            ["David", "1/5/2015", 500],
            ["David", "5/30/2015", 50],
            ["David", "7/25/2015", 50],
            ["Ryan", "1/4/2014", 100],
            ["Ryan", "1/19/2015", 500],
            ["Ryan", "3/31/2016", 50],
            ["Joe", "7/1/2015", 100],
            ["Joe", "9/9/2015", 500],
            ["Joe", "10/15/2015", 50],
        ]
        df = DataFrame(data=data, columns=["name", "date", "amount"])
        df["date"] = to_datetime(df["date"])
        df = df.sort_values("date")
        expected = (df.set_index("date").groupby("name").apply(
            lambda x: x.rolling("180D")["amount"].sum()))
        result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
        tm.assert_series_equal(result, expected)

    def test_non_monotonic_raises(self):
        # GH 13966 (similar to #15130, closed by #15175)
        # superseded by 43909
        dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
        df = DataFrame({
            "A": [1] * 20 + [2] * 12 + [3] * 8,
            "B": np.concatenate((dates, dates)),
            "C": np.arange(40),
        })
        expected = (df.set_index("B").groupby("A").apply(
            lambda x: x.rolling("4s")["C"].mean()))
        with pytest.raises(ValueError, match=r".* must be monotonic"):
            df.groupby("A").rolling(
                "4s", on="B").C.mean()  # should raise for non-monotonic t series

        df2 = df.sort_values("B")
        result = df2.groupby("A").rolling("4s", on="B").C.mean()
        tm.assert_series_equal(result, expected)

    def test_rolling_cov_offset(self):
        # GH16058
        idx = date_range("2017-01-01", periods=24, freq="1h")
        ss = Series(np.arange(len(idx)), index=idx)
        result = ss.rolling("2h").cov()
        expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx)
        tm.assert_series_equal(result, expected)
        expected2 = ss.rolling(2, min_periods=1).cov()
        tm.assert_series_equal(result, expected2)
        result = ss.rolling("3h").cov()
        expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx)
        tm.assert_series_equal(result, expected)
        expected2 = ss.rolling(3, min_periods=1).cov()
        tm.assert_series_equal(result, expected2)

    def test_rolling_on_decreasing_index(self):
        # GH-19248, GH-32385
        index = [
            Timestamp("20190101 09:00:30"),
            Timestamp("20190101 09:00:27"),
            Timestamp("20190101 09:00:20"),
            Timestamp("20190101 09:00:18"),
            Timestamp("20190101 09:00:10"),
        ]
        df = DataFrame({"column": [3, 4, 4, 5, 6]}, index=index)
        result = df.rolling("5s").min()
        expected = DataFrame({"column": [3.0, 3.0, 4.0, 4.0, 6.0]},
                             index=index)
        tm.assert_frame_equal(result, expected)

    def test_rolling_on_empty(self):
        # GH-32385
        df = DataFrame({"column": []}, index=[])
        result = df.rolling("5s").min()
        expected = DataFrame({"column": []}, index=[])
        tm.assert_frame_equal(result, expected)

    def test_rolling_on_multi_index_level(self):
        # GH-15584
        df = DataFrame(
            {"column": range(6)},
            index=MultiIndex.from_product(
                [date_range("20190101", periods=3),
                 range(2)],
                names=["date", "seq"]),
        )
        result = df.rolling("10d", on=df.index.get_level_values("date")).sum()
        expected = DataFrame({"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]},
                             index=df.index)
        tm.assert_frame_equal(result, expected)
def get_first_fortnight_last_day(ds):
    """Return the last day of the datestamp's fortnight for its month."""
    # First business day of ds's month: jump to the next month's begin, then
    # roll back one business-month begin.
    month_first_bday = ds + offsets.MonthBegin(1) - offsets.BMonthBegin(1)
    # Ten business days later lands at the start of the second fortnight;
    # one calendar day before that closes the first fortnight.
    second_fortnight_start = month_first_bday + offsets.BDay(10)
    return second_fortnight_start - offsets.Day(1)
# Scratch/debug script for exploring pandas date offsets.
import pandas as pd
import pandas.tseries.offsets as offsets

week_ends = pd.date_range(start='01/02/2017', end='05/01/2017', freq='W')
print(week_ends)
print(week_ends[16] + offsets.Week(1))
exit(0)
# NOTE(review): everything below this exit(0) is dead code and never runs.
month_ends = pd.date_range(start='01/01/2016', end='05/01/2017', freq='M')
print(month_ends[15].replace(day=22))
print(month_ends[15])
if month_ends[15].month == 4 and month_ends[15].year == 2017:
    # NOTE(review): result discarded — Timestamp.replace returns a new
    # object; this statement has no effect.
    month_ends[15].replace(day=22)
print(month_ends[15])
print(month_ends)
exit(0)
# NOTE(review): also unreachable (second exit above).
for month_end in month_ends:
    quarter_start = month_end - offsets.MonthBegin(3)
    next_month_start = month_end + offsets.MonthBegin(1)
    next_month_end = month_end + offsets.MonthEnd(1)
    year_start = month_end - offsets.MonthBegin(12)
    print("-"*30)
    print(quarter_start)
    print(next_month_start)
    print(next_month_end)
    print(year_start)
def Orders_prediction():
    """Build per-supplier daily order-quantity forecasts from order-history
    TSV files chosen via a Tk file dialog, and write ``est_<FC code>.tsv``.

    Inputs read from the local directory: the selected order-history files,
    ``FC_*.csv`` (forecast figures), ``nowork_day.csv`` (non-working-day
    calendars) and ``base_sh.csv`` (subsidiary x weekday x lead-time
    skeleton).  Per-day heavy lifting is fanned out with
    ``multiprocessing.Pool`` through the module-level ``wrapper`` and
    ``wrapper3`` helpers.
    """
    # from tqdm import tqdm
    csv.field_size_limit(1000000000)
    font = 'utf-8'
    # font='shift_jisx0213'
    # Absolute path of the directory containing this script.
    script_pass = os.path.dirname(os.path.abspath(__name__))
    if __name__ == '__main__':
        local_pass = script_pass + '/'
    else:
        local_pass = script_pass + '/Orders_prediction/'
    # Show the file-selection dialog.
    root = tk.Tk()
    root.withdraw()
    fTyp = [("", "*")]
    iDir = local_pass
    # Load the order-history data.
    # NOTE: changed from askopenfilename to askopenfilenames (multi-select).
    file = tkinter.filedialog.askopenfilenames(filetypes=fTyp,
                                               initialdir=iDir,
                                               title='受注実績データの取得')
    if len(file) != 0:
        # Build the list of selected files.
        list_f = list(file)
        # Load the FC (forecast) data: locate the file.
        fc_pass = glob.glob(local_pass + 'FC_*.csv')
        fc_name = os.path.basename(fc_pass[0])
        fc_cd = fc_name.replace('FC_', '')
        fc_cd = fc_cd.replace('.csv', '')
        # Emit a message when there is not exactly one FC file.
        if len(fc_pass) == 1:
            FC = pd.read_csv(local_pass + fc_name,
                             encoding=font,
                             dtype='object',
                             index_col=None)
            # Extract the supplier codes from FC.
            fc_sup = FC.drop_duplicates(subset=['SUPPLIER_CD'],
                                        keep='first',
                                        inplace=False)  # drop duplicates
            fc_sup = fc_sup.loc[::, ['SUPPLIER_CD']]
            fc_sup = fc_sup.T
            fc_sup_l = fc_sup.values.tolist()
            list_pg = fc_sup_l[0]
            # Run the list-box dialog (period / supplier selection).
            result = getFACI_CD(fc_name, list_pg)
            check_list = [
                result[5], result[6], result[7], result[8], result[9],
                result[10], result[11], result[12]
            ]
            # Zero-pad single-digit month/day parts.
            for c in range(len(check_list)):
                if len(check_list[c]) == 1:
                    check_list[c] = '0' + check_list[c]
            Tgt_S_M = result[1] + check_list[0] + check_list[4]
            Tgt_E_M = result[2] + check_list[1] + check_list[5]
            Pre_S_M = result[3] + check_list[2] + check_list[6]
            Pre_E_M = result[4] + check_list[3] + check_list[7]
            # Open the first file.
            f_name = os.path.basename(list_f[0])
            f_pass = os.path.dirname(list_f[0])
            # Read only the required columns.
            print(f_pass + '/' + f_name)
            order = pd.read_csv(
                f_pass + '/' + f_name,
                sep='\t',
                encoding=font,
                dtype=object,
                engine='python',
                error_bad_lines=False,
                usecols=[
                    '番号', '現法コード', 'グローバル番号', '受注日・見積回答日',
                    '受注時間・見積回答時間', 'JST変換受注日・JST変換見積回答日',
                    'JST変換受注時間・JST変換見積回答時間', '見積有効日', '見積有効時間',
                    'JST変換見積有効日', 'JST変換見積有効時間', 'アンフィット種別',
                    '得意先コード', '直送先コード', 'MCコード', 'インナーコード',
                    '商品コード', '実績現法コード', '実績仕入先コード',
                    '実績管理単位コード', 'ACE仕入先コード', 'ACE仕入先カテゴリコード',
                    '受注実績SSD', '見積回答SSD', '数量', '納入区分', '顧客希望納期'
                ])
            # Open the remaining files in turn and concatenate them.
            for r in range(1, len(list_f)):
                f_name = os.path.basename(list_f[r])
                print(f_name)
                order_add = pd.read_csv(
                    f_pass + '/' + f_name,
                    sep='\t',
                    encoding=font,
                    dtype=object,
                    engine='python',
                    error_bad_lines=False,
                    usecols=[
                        '番号', '現法コード', 'グローバル番号', '受注日・見積回答日',
                        '受注時間・見積回答時間', 'JST変換受注日・JST変換見積回答日',
                        'JST変換受注時間・JST変換見積回答時間', '見積有効日', '見積有効時間',
                        'JST変換見積有効日', 'JST変換見積有効時間', 'アンフィット種別',
                        '得意先コード', '直送先コード', 'MCコード', 'インナーコード',
                        '商品コード', '実績現法コード', '実績仕入先コード',
                        '実績管理単位コード', 'ACE仕入先コード', 'ACE仕入先カテゴリコード',
                        '受注実績SSD', '見積回答SSD', '数量', '納入区分', '顧客希望納期'
                    ])
                # Append this file's rows.
                order = order.append(order_add, sort=False)
            # Load the non-working-day data.
            nowork_day = pd.read_csv(local_pass + 'nowork_day.csv',
                                     encoding=font,
                                     dtype='object',
                                     index_col=None)
            # List the non-working days per subsidiary / site.
            sub_name = [
                'CHN', 'GRM', 'HKG', 'IND', 'JKT', 'KOR', 'MEX', 'MJP', 'MYS',
                'SGP', 'THA', 'TIW', 'USA', 'VNM', '0143', '7017', '3764',
                '0FCN', '0AIO', 'SPCM'
            ]
            calendar_name = [
                'CAAAA', 'GAAAA', 'NAAAA', 'DAAAA', 'JAAAA', 'KAAAA', 'QAAAA',
                '5AAAA', 'MAAAA', 'SAAAA', 'HAAAA', 'TAAAA', 'UAAAA', 'VAAAA',
                '5AAAA', '5AAAA', '5AAAA', 'C8677', 'C8677', '50SPC'
            ]
            calendar_dict = {}
            for i in range(0, len(sub_name)):
                noworkday_df = nowork_day[nowork_day['CALENDAR_CD'] ==
                                          calendar_name[i]]
                noworkday_df = noworkday_df.loc[::, ['OFF_DATE']]
                noworkday_df = noworkday_df.T
                noworkday_list = noworkday_df.values.tolist()
                calendar_dict[sub_name[i]] = noworkday_list[0]
            # Exclude quotation records (rows carrying a quote-expiry date).
            order = order[order['見積有効日'].isnull()]
            # Rename the combined order-date / quote-answer-date column.
            order = order.rename(columns={'受注日・見積回答日': '受注日'})
            order = order.astype({'受注日': int, '受注実績SSD': int})
            # Restrict to the usable period of the shipping-result SSD data;
            # the forecast itself is summarised by order date.
            condition = Tgt_S_M + ' <= 受注日 <= ' + Tgt_E_M
            order = order.query(condition)
            # DataFrame that will collect the generated forecasts.
            prediction_all = pd.DataFrame(columns=[
                'SUBSIDIARY_CD', 'SUPPLIER_CD', 'FACILITY_CD', 'BASE_DATE',
                'BASE_DATE_ADD_DAYS', 'PREDICTION_QUANTITY', 'UPD_COUNT',
                'DEL_FLG', 'REG_USR', 'REG_TIME', 'UPD_USR', 'UPD_TIME'
            ])
            for pg_name in list_pg:
                # Keep only rows for this supplier.
                order_sup = order[order['実績仕入先コード'] == pg_name].copy()
                if len(order_sup) > 0:
                    Tgt_S = copy.copy(Tgt_S_M)
                    Tgt_E = copy.copy(Tgt_E_M)
                    Pre_S = copy.copy(Pre_S_M)
                    Pre_E = copy.copy(Pre_E_M)
                    # Convert order date and ship date to ISO date strings.
                    order_sup = order_sup.astype({'受注日': str, '受注実績SSD': str})
                    order_sup[
                        '受注日'] = order_sup['受注日'].str[0:4] + '-' + order_sup[
                            '受注日'].str[4:6] + '-' + order_sup['受注日'].str[6:8]
                    order_sup['受注実績SSD'] = order_sup['受注実績SSD'].str[
                        0:4] + '-' + order_sup['受注実績SSD'].str[
                            4:6] + '-' + order_sup['受注実績SSD'].str[6:8]
                    # Add an order-weekday column.
                    order_sup['weekday'] = [
                        dt.datetime.strptime(x, "%Y-%m-%d").strftime('%a')
                        for x in order_sup['受注日']
                    ]
                    # Convert order date and order-result SSD to datetime.
                    order_sup['受注日'] = pd.to_datetime(order_sup['受注日'])
                    order_sup['受注実績SSD'] = pd.to_datetime(order_sup['受注実績SSD'])
                    # Use the order date as a DatetimeIndex and add year/month
                    # index levels.
                    order_sup = order_sup.set_index('受注日')
                    order_sup = order_sup.set_index([
                        order_sup.index.year, order_sup.index.month,
                        order_sup.index
                    ])
                    order_sup.index.names = ['year', 'month', '受注日']
                    order_sup = order_sup.reset_index()
                    # Add the lead-time-attribute column.
                    # Select this supplier's non-working-day calendar.
                    nowork_day_w = calendar_dict[pg_name][:]
                    # Convert each calendar string to a datetime.
                    for i in range(0, len(nowork_day_w)):
                        nowork_day_w[i] = dt.datetime.strptime(
                            nowork_day_w[i], "%Y-%m-%d")
                    # Lead-time attribute = calendar days between order date
                    # and ship date, minus non-working days in between.
                    order_sup['納期属性'] = [
                        (z - y).days -
                        len(list(filter(lambda x: y <= x <= z, nowork_day_w)))
                        for y, z in zip(order_sup['受注日'], order_sup['受注実績SSD'])
                    ]
                    # Clamp the attribute: >73 becomes 73, <0 becomes 0
                    # (deleting such records was also considered; not strictly
                    # correct).
                    order_sup.loc[order_sup['納期属性'] > 73, '納期属性'] = 73
                    order_sup.loc[order_sup['納期属性'] < 0, '納期属性'] = 0
                    # order_sup = order_sup.query('納期属性 >= 0')
                    order_sup = order_sup.astype({'数量': int})
                    # Aggregate results / working days per subsidiary:
                    # monthly quantity totals.
                    order_subtotal = order_sup.groupby(
                        ['year', 'month'], as_index=False)['数量'].sum()
                    order_subtotal = order_subtotal.astype({
                        'year': str,
                        'month': str
                    })
                    # Working days of the factory used to divide order_subtotal.
                    order_subtotal['開始日'] = pd.to_datetime(
                        (order_subtotal['year'] + '/' +
                         order_subtotal['month'] + '/01'),
                        format='%Y/%m/%d')
                    order_subtotal[
                        '終了日'] = order_subtotal['開始日'] + offsets.MonthBegin(1)
                    order_subtotal['月稼働日'] = [
                        (z - y).days -
                        len(list(filter(lambda x: y <= x < z, nowork_day_w)))
                        for y, z in zip(order_subtotal['開始日'],
                                        order_subtotal['終了日'])
                    ]
                    order_subtotal['月平均本数'] = (order_subtotal['数量'] /
                                                order_subtotal['月稼働日']).round(3)
                    # Each subsidiary's share of order quantity over the period.
                    order_subratio = order_sup.groupby(
                        ['year', '現法コード'], as_index=False)['数量'].sum()
                    order_subratio['合計'] = order_subtotal['数量'].sum()
                    order_subratio['ratio'] = (order_subratio['数量'] /
                                               order_subratio['合計']).round(3)
                    order_subratio.drop(['数量', '合計'], axis=1, inplace=True)
                    # Multiply the monthly totals by the subsidiary ratio;
                    # unify the dtype of 'year' first.
                    order_subtotal = order_subtotal.astype({
                        'year': int,
                        'month': int
                    })
                    order_subtotal = pd.merge(order_subtotal,
                                              order_subratio,
                                              on=['year'],
                                              how='left')
                    order_subtotal['月平均本数'] = (
                        order_subtotal['月平均本数'] *
                        order_subtotal['ratio']).round(3)
                    # For now the actuals could stand in for FC figures; if FC
                    # were built from actuals the period would be defined by
                    # ship date, while the forecast is defined by order date,
                    # so filtering by Tgt_S/E alone would be insufficient.
                    # In practice FC is supplied externally.
                    # Drop unneeded columns:
                    # order_subtotal.drop(['数量', '開始日', '終了日', '月稼働日', 'ratio'], axis=1, inplace=True)
                    # FC = order_subtotal
                    # Rename header.
                    FC = FC.rename(columns={'SUBSIDIARY_CD': '現法コード'})
                    # Select this supplier's rows.
                    FC = FC[FC['SUPPLIER_CD'] == pg_name]
                    FC = FC.astype({'year': str, 'month': str, 'FC': float})
                    # Count working days and turn FC into a per-day quantity.
                    FC['開始日'] = pd.to_datetime(
                        (FC['year'] + '/' + FC['month'] + '/01'),
                        format='%Y/%m/%d')
                    FC['終了日'] = FC['開始日'] + offsets.MonthBegin(1)
                    FC['月稼働日'] = [
                        (z - y).days -
                        len(list(filter(lambda x: y <= x < z, nowork_day_w)))
                        for y, z in zip(FC['開始日'], FC['終了日'])
                    ]
                    FC['月平均本数'] = (FC['FC'] / FC['月稼働日']).round(3)
                    FC = FC.astype({'year': int, 'month': int})
                    # Count working days per weekday over the target period.
                    Tgt_S = dt.datetime.strptime(Tgt_S, '%Y%m%d')
                    Tgt_E = dt.datetime.strptime(Tgt_E, '%Y%m%d')
                    week_count = pd.DataFrame({'count': [], 'weekday': []})
                    count = 0
                    while Tgt_S <= Tgt_E:
                        if not Tgt_S.strftime(
                                '%Y-%m-%d') in calendar_dict[pg_name]:
                            weekday = Tgt_S.strftime('%a')
                            week_count.loc[count] = [Tgt_S, weekday]
                            count += 1
                        Tgt_S = Tgt_S + dt.timedelta(days=1)
                    week_count = week_count.groupby(['weekday'],
                                                    as_index=False).count()
                    # Weekday-index calculation: aggregate quantity per date,
                    # excluding large-lot orders.
                    order_small = order_sup[order_sup['アンフィット種別'] == '0']
                    order_day = order_small.groupby(
                        ['現法コード', 'weekday', '受注日'],
                        as_index=False)['数量'].sum()
                    order_week = order_day.groupby(['現法コード', 'weekday'],
                                                   as_index=False)['数量'].sum()
                    order_week = pd.merge(order_week,
                                          week_count,
                                          on=['weekday'],
                                          how='left')
                    # Per-weekday average; weekdays that never occur get 0.
                    order_week.loc[
                        order_week['count'] != 0,
                        '数量'] = order_week['数量'] / order_week['count']
                    order_week.loc[order_week['count'] == 0, '数量'] = 0
                    order_week1 = order_week[order_week['weekday'] != 'Sun']
                    order_week1 = order_week1.groupby(['現法コード',
                                                       'weekday'])['数量'].sum()
                    order_week1 = order_week1.groupby(['現法コード'
                                                       ]).transform(qtyave)
                    order_week2 = order_week.groupby(['現法コード',
                                                      'weekday'])['数量'].sum()
                    order_week2 = order_week2.groupby(['現法コード'
                                                       ]).transform(qtyave)
                    order_week1 = order_week1.reset_index()
                    order_week2 = order_week2.reset_index()
                    # Sundays are normalised against the all-day average.
                    order_week2 = order_week2[order_week2['weekday'] == 'Sun']
                    order_week1 = order_week1.append(order_week2, sort=False)
                    order_week1 = order_week1.rename(
                        columns={'数量': 'week_ratio'})
                    # Skeleton of subsidiary x weekday x lead-time combos.
                    base_sh = pd.read_csv(local_pass + 'base_sh.csv',
                                          encoding=font,
                                          index_col=None,
                                          dtype={
                                              '数量': int,
                                              '納期属性': int
                                          })
                    # Sum quantity per subsidiary, weekday and lead-time
                    # attribute (large-lot orders excluded).
                    order_A = order_sup[order_sup['アンフィット種別'] == '0']
                    order_A = order_A.groupby(['現法コード', 'weekday',
                                               '納期属性'])['数量'].sum()
                    order_A = order_A.groupby(['現法コード',
                                               'weekday']).transform(qtyratio)
                    order_A = order_A.reset_index()
                    # Merge onto the skeleton so missing combinations become 0.
                    base_sh = pd.merge(base_sh,
                                       order_A,
                                       on=['現法コード', 'weekday', '納期属性'],
                                       how='outer')
                    base_sh.loc[base_sh['数量_y'].notnull(),
                                '数量_x'] = base_sh['数量_y']
                    base_sh = base_sh.rename(columns={'数量_x': '数量'})
                    base_sh.drop(['数量_y'], axis=1, inplace=True)
                    base_sh = base_sh.round({'数量': 4})
                    base_sh = base_sh.rename(columns={'数量': 'n_ratio'})
                    # Small-lot ratio per subsidiary.
                    small_ratio = order_sup.groupby(
                        ['現法コード'], as_index=False)['数量'].sum()
                    # Large-lot orders excluded from the numerator.
                    small_ratio_A = order_sup[order_sup['アンフィット種別'] == '0']
                    small_ratio_A = small_ratio_A.groupby(
                        ['現法コード'], as_index=False)['数量'].sum()
                    small_ratio = pd.merge(small_ratio_A,
                                           small_ratio,
                                           on=['現法コード'],
                                           how='right')
                    small_ratio.loc[small_ratio['数量_x'].isnull(), '数量_x'] = 0
                    small_ratio['small_ratio'] = small_ratio[
                        '数量_x'] / small_ratio['数量_y']
                    small_ratio.drop(['数量_x', '数量_y'], axis=1, inplace=True)
                    # Forecast period boundaries as datetimes.
                    Pre_S = dt.datetime.strptime(Pre_S, '%Y%m%d')
                    Pre_E = dt.datetime.strptime(Pre_E, '%Y%m%d')
                    # Build the list of order dates.
                    # NOTE(review): Pre_S is seeded once and appended again by
                    # the loop's first pass, so it appears twice in day_list.
                    day_list = [Pre_S]
                    day_n = Pre_S
                    while day_n <= Pre_E:
                        day_list.append(day_n)
                        day_n = day_n + datetime.timedelta(days=1)
                    # prediction = pd.date_range(start=Pre_S, end=Pre_E, freq='D', name='受注日')
                    # prediction = prediction.to_series()
                    # prediction = pd.DataFrame(prediction)
                    # Build the order-date x ship-date lists.
                    so_day_list = []
                    sd_day_list = []
                    # noukizokusei = list(range(73))
                    noukizokusei_list = []
                    for nouki in range(len(day_list)):
                        noworkday_count = 0
                        for n in range(73):
                            so_day_list.append(day_list[nouki])
                            noukizokusei_list.append(n)
                            sd_day = day_list[nouki] + datetime.timedelta(
                                days=(n + noworkday_count))
                            # Push the ship date forward past non-working days.
                            while sd_day.strftime(
                                    '%Y-%m-%d') in calendar_dict[pg_name]:
                                sd_day = sd_day + dt.timedelta(days=1)
                                noworkday_count = noworkday_count + 1
                            sd_day_list.append(sd_day)
                    # DataFrame collecting the multiprocess results.
                    prediction_sum = pd.DataFrame({
                        '現法コード': [],
                        '受注日': [],
                        '受注日稼働flg': [],
                        '出荷日': [],
                        '出荷日稼働flg': [],
                        '納期属性': []
                    })
                    # Fan out one task per subsidiary.
                    pool = Pool(multi.cpu_count() - 2)
                    list1 = [(x, so_day_list, sd_day_list, noukizokusei_list,
                              calendar_dict, sub_name) for x in range(14)]
                    pre_list = pool.map(wrapper, list1)
                    pool.close()
                    # Results come back as a list; concatenate them.
                    for x in range(14):
                        prediction_sum = prediction_sum.append(pre_list[x],
                                                               sort=False)
                    # Add the order-weekday column.
                    prediction = prediction_sum
                    prediction['weekday'] = [
                        x.strftime('%a') for x in prediction['受注日']
                    ]
                    '''
                    # 受注日をDatetimeIndexとし年、月、曜日のindexを追加
                    prediction = prediction.set_index('受注日')
                    prediction = prediction.set_index([prediction.index.year, prediction.index.month, prediction.index])
                    prediction.index.names = ['year_so', 'month_so', '受注日']
                    prediction = prediction.reset_index()
                    '''
                    # Use the ship date as a DatetimeIndex and add year/month
                    # index levels.
                    prediction = prediction.set_index('出荷日')
                    prediction = prediction.set_index([
                        prediction.index.year, prediction.index.month,
                        prediction.index
                    ])
                    prediction.index.names = ['year', 'month', '出荷日']
                    prediction = prediction.reset_index()
                    # Merge FC.
                    prediction = pd.merge(prediction,
                                          FC,
                                          on=['現法コード', 'year', 'month'],
                                          how='left')
                    # Merge the weekday ratio.
                    prediction = pd.merge(prediction,
                                          order_week1,
                                          on=['現法コード', 'weekday'],
                                          how='left')
                    # Merge the lead-time-attribute ratio.
                    prediction = pd.merge(prediction,
                                          base_sh,
                                          on=['現法コード', 'weekday', '納期属性'],
                                          how='left')
                    # Merge the small-lot ratio.
                    prediction = pd.merge(prediction,
                                          small_ratio,
                                          on=['現法コード'],
                                          how='left')
                    # Fill blanks with 0.
                    prediction.loc[prediction['月平均本数'].isnull(), '月平均本数'] = 0
                    prediction.loc[prediction['week_ratio'].isnull(),
                                   'week_ratio'] = 0
                    # Per-day quantity per subsidiary: product of the working
                    # flags and all ratio factors.
                    prediction['数量'] = prediction['受注日稼働flg'] * prediction[
                        '出荷日稼働flg'] * prediction['月平均本数'] * prediction[
                            'week_ratio'] * prediction['n_ratio'] * prediction[
                                'small_ratio']
                    prediction = prediction.round({'数量': 3})
                    # Write the per-order-date x ship-date rows.
                    f_name = pg_name + '_prediction_row.tsv'
                    prediction.to_csv(local_pass + f_name,
                                      sep='\t',
                                      encoding=font,
                                      quotechar='"',
                                      line_terminator='\n',
                                      index=False)
                    # Sum per order-date x ship-date pair
                    # (date dtype corrected just below).
                    prediction = prediction.groupby(
                        ['受注日', '出荷日'], as_index=False)['数量'].sum()
                    prediction['受注日'] = pd.to_datetime(prediction['受注日'])
                    q, mod = divmod(((Pre_E - Pre_S).days + 1), 20)
                    FACILITY_DICT = {}
                    FACILITY_DICT['0143'] = ['MJP', '0143', 'AIO']
                    FACILITY_DICT['7017'] = ['MJP', '7017', 'MAL']
                    FACILITY_DICT['3764'] = ['MJP', '3764', 'AAL']
                    FACILITY_DICT['0FCN'] = ['CHN', '0FCN', 'FAL']
                    FACILITY_DICT['0AIO'] = ['CHN', '0AIO', 'F2A']
                    FACILITY_DICT['SPCM'] = ['VNM', 'SPCM', 'SAL']
                    FACILITY_L = FACILITY_DICT[pg_name]
                    # Order forecast (d): forecast quantity accumulated as
                    # seen from each base date; multiprocess in batches of 20.
                    pre_c = pd.DataFrame({
                        'BASE_DATE': [],
                        'BASE_DATE_ADD_DAYS': [],
                        'PREDICTION_QUANTITY': []
                    })
                    pre_c = pre_c.astype({
                        'BASE_DATE_ADD_DAYS': int,
                        'PREDICTION_QUANTITY': float
                    })
                    # q full batches of 20 days each.
                    for s in range(0, q):
                        pool = Pool(multi.cpu_count() - 2)
                        list3 = [(d, Pre_S, prediction)
                                 for d in range((20 * s), (20 * s + 20))]
                        prediction_sum3 = pool.map(wrapper3, list3)
                        pool.close()
                        for d in range(0, 20):
                            pre_c = pre_c.append(prediction_sum3[d],
                                                 sort=False)
                    # Remainder (mod) batch.
                    pool = Pool(multi.cpu_count() - 2)
                    list3 = [(d, Pre_S, prediction)
                             for d in range((20 * q), (20 * q + mod))]
                    prediction_sum4 = pool.map(wrapper3, list3)
                    pool.close()
                    for d in range(0, mod):
                        pre_c = pre_c.append(prediction_sum4[d], sort=False)
                    pre_c.reset_index(drop=True, inplace=True)
                    pre_c.loc[:, 'SUBSIDIARY_CD'] = FACILITY_L[0]
                    pre_c.loc[:, 'SUPPLIER_CD'] = FACILITY_L[1]
                    pre_c.loc[:, 'FACILITY_CD'] = FACILITY_L[2]
                    prediction_all = prediction_all.append(pre_c, sort=False)
                else:
                    print(pg_name + 'に該当する実績データがありません!')
            # File output: stamp audit columns and write est_<FC code>.tsv.
            Today = "'" + dt.datetime.today().strftime("%Y-%m-%d") + "'"
            prediction_all = prediction_all.round({'PREDICTION_QUANTITY': 3})
            prediction_all.loc[:, 'UPD_COUNT'] = '0'
            prediction_all.loc[:, 'DEL_FLG'] = '0'
            prediction_all.loc[:, 'REG_USR'] = fc_cd
            prediction_all.loc[:, 'REG_TIME'] = Today
            prediction_all.loc[:, 'UPD_USR'] = fc_cd
            prediction_all.loc[:, 'UPD_TIME'] = Today
            f_name = 'est_' + fc_cd + '.tsv'
            prediction_all.to_csv(local_pass + f_name,
                                  sep='\t',
                                  encoding=font,
                                  quotechar='"',
                                  line_terminator='\n',
                                  index=False)
            print('FC_' + fc_cd + 'を元に' + f_name + 'を作成しました!')
            print('受注予測作成完了しました!')
        else:
            print('FC_*.csvファイルが複数ある、または存在しません!')
            # NOTE(review): the indentation level of the final print below is
            # ambiguous in the collapsed source; it is kept adjacent to the
            # FC-file error branch here — confirm against the original file.
            print('受注予測作成終了します')
def Orders_prediction_for_Supplier():
    """Build a single supplier's order forecast as a spreadsheet-style
    lead-time x factory-order-date table, written to
    ``<supplier>_prediction_for_Supplier.tsv``.

    Same pipeline as ``Orders_prediction`` but for one supplier selected via
    ``getFACI_CD()``, with FC read from a fixed ``FC.csv`` and an additional
    factory-order-date (工場受注日) dimension.

    NOTE(review): this function contains an incomplete statement
    (``order.loc[order[], '受注実績SSD']``) which is a syntax error — see the
    marked line below.
    """
    # from tqdm import tqdm
    csv.field_size_limit(1000000000)
    font = 'utf-8'
    # font='shift_jisx0213'
    # Absolute path of the directory containing this script.
    script_pass = os.path.dirname(os.path.abspath(__name__))
    if __name__ == '__main__':
        local_pass = script_pass + '/'
    else:
        local_pass = script_pass + '/Orders_prediction/'
    # Show the file-selection dialog.
    root = tk.Tk()
    root.withdraw()
    fTyp = [("", "*")]
    iDir = local_pass
    # Load the order-history data.
    # NOTE: changed from askopenfilename to askopenfilenames (multi-select).
    file = tkinter.filedialog.askopenfilenames(filetypes=fTyp,
                                               initialdir=iDir,
                                               title='受注実績データの取得')
    if len(file) != 0:
        # Build the list of selected files.
        list_f = list(file)
        # Run the list-box dialog (supplier / period selection).
        result = getFACI_CD()
        pg_name = result[0]
        check_list = [result[5], result[6], result[7], result[8], result[9],
                      result[10], result[11], result[12]]
        # Zero-pad single-digit month/day parts.
        for c in range(len(check_list)):
            if len(check_list[c]) == 1:
                check_list[c] = '0' + check_list[c]
        Tgt_S = result[1] + check_list[0] + check_list[4]
        Tgt_E = result[2] + check_list[1] + check_list[5]
        Pre_S = result[3] + check_list[2] + check_list[6]
        Pre_E = result[4] + check_list[3] + check_list[7]
        # Open the first file.
        f_name = os.path.basename(list_f[0])
        f_pass = os.path.dirname(list_f[0])
        # Read only the required columns.
        print(f_pass + '/' + f_name)
        order = pd.read_csv(f_pass + '/' + f_name, sep='\t', encoding=font,
                            dtype=object, engine='python',
                            error_bad_lines=False,
                            usecols=['番号', '現法コード', 'グローバル番号',
                                     '受注日・見積回答日', '受注時間・見積回答時間',
                                     'JST変換受注日・JST変換見積回答日',
                                     'JST変換受注時間・JST変換見積回答時間',
                                     '見積有効日', '見積有効時間',
                                     'JST変換見積有効日', 'JST変換見積有効時間',
                                     'アンフィット種別', '得意先コード', '直送先コード',
                                     'MCコード', 'インナーコード', '商品コード',
                                     '実績現法コード', '実績仕入先コード',
                                     '実績管理単位コード', 'ACE仕入先コード',
                                     'ACE仕入先カテゴリコード', '受注実績SSD',
                                     '見積回答SSD', '数量', '納入区分',
                                     '顧客希望納期', '置場コード1', '置場コード2'])
        # Open the remaining files in turn and concatenate them.
        for r in range(1, len(list_f)):
            f_name = os.path.basename(list_f[r])
            print(f_name)
            order_add = pd.read_csv(f_pass + '/' + f_name, sep='\t',
                                    encoding=font, dtype=object,
                                    engine='python', error_bad_lines=False,
                                    usecols=['番号', '現法コード', 'グローバル番号',
                                             '受注日・見積回答日', '受注時間・見積回答時間',
                                             'JST変換受注日・JST変換見積回答日',
                                             'JST変換受注時間・JST変換見積回答時間',
                                             '見積有効日', '見積有効時間',
                                             'JST変換見積有効日', 'JST変換見積有効時間',
                                             'アンフィット種別', '得意先コード',
                                             '直送先コード', 'MCコード', 'インナーコード',
                                             '商品コード', '実績現法コード',
                                             '実績仕入先コード', '実績管理単位コード',
                                             'ACE仕入先コード', 'ACE仕入先カテゴリコード',
                                             '受注実績SSD', '見積回答SSD', '数量',
                                             '納入区分', '顧客希望納期',
                                             '置場コード1', '置場コード2'])
            # Append this file's rows.
            order = order.append(order_add, sort=False)
        # Load the non-working-day data.
        nowork_day = pd.read_csv(local_pass + 'nowork_day.csv',
                                 encoding=font, dtype='object', index_col=None)
        # List the non-working days per subsidiary / site.
        sub_name = ['CHN', 'GRM', 'HKG', 'IND', 'JKT', 'KOR', 'MEX', 'MJP',
                    'MYS', 'SGP', 'THA', 'TIW', 'USA', 'VNM', '0143', '7017',
                    '3764', '0FCN', '0AIO', 'SPCM']
        calendar_name = ['CAAAA', 'GAAAA', 'NAAAA', 'DAAAA', 'JAAAA', 'KAAAA',
                         'QAAAA', '5AAAA', 'MAAAA', 'SAAAA', 'HAAAA', 'TAAAA',
                         'UAAAA', 'VAAAA', '5AAAA', '5AAAA', '5AAAA', 'C8677',
                         'C8677', '50SPC']
        calendar_dict = {}
        for i in range(0, len(sub_name)):
            noworkday_df = nowork_day[nowork_day['CALENDAR_CD'] == calendar_name[i]]
            noworkday_df = noworkday_df.loc[::, ['OFF_DATE']]
            noworkday_df = noworkday_df.T
            noworkday_list = noworkday_df.values.tolist()
            calendar_dict[sub_name[i]] = noworkday_list[0]
        # Exclude quotation records (rows carrying a quote-expiry date).
        order = order[order['見積有効日'].isnull()]
        # Rename the combined order-date / quote-answer-date column.
        order = order.rename(columns={'受注日・見積回答日': '受注日'})
        order = order.astype({'受注日': int, '受注実績SSD': int})
        # Restrict to the usable period of the shipping-result SSD data;
        # the forecast itself is summarised by order date.
        condition = Tgt_S + ' <= 受注日 <= ' + Tgt_E
        order = order.query(condition)
        # Keep only rows for the selected supplier.
        order = order[order['実績仕入先コード'] == pg_name]
        if len(order) > 0:
            # Convert order date and ship date to ISO date strings.
            order = order.astype({'受注日': str, '受注実績SSD': str})
            order['受注日'] = order['受注日'].str[0:4] + '-' + order['受注日'].str[4:6] + '-' + order['受注日'].str[6:8]
            order['受注実績SSD'] = order['受注実績SSD'].str[0:4] + '-' + order['受注実績SSD'].str[4:6] + '-' + order['受注実績SSD'].str[
                6:8]
            # When run at Nantong, move Guangzhou-bound SSDs one day earlier.
            # NOTE(review): this statement is incomplete — ``order[]`` is a
            # syntax error; the row-selection condition and the assignment
            # were never filled in. The function cannot run as written.
            order.loc[order[], '受注実績SSD']
            # Add an order-weekday column.
            order['weekday'] = [dt.datetime.strptime(x, "%Y-%m-%d").strftime('%a') for x in order['受注日']]
            # Convert order date and order-result SSD to datetime.
            order['受注日'] = pd.to_datetime(order['受注日'])
            order['受注実績SSD'] = pd.to_datetime(order['受注実績SSD'])
            # Use the order date as a DatetimeIndex and add year/month levels.
            order = order.set_index('受注日')
            order = order.set_index([order.index.year, order.index.month, order.index])
            order.index.names = ['year', 'month', '受注日']
            order = order.reset_index()
            # Add the lead-time-attribute column.
            # Select this supplier's non-working-day calendar.
            nowork_day_w = calendar_dict[pg_name][:]
            # Convert each calendar string to a datetime.
            for i in range(0, len(nowork_day_w)):
                nowork_day_w[i] = dt.datetime.strptime(nowork_day_w[i], "%Y-%m-%d")
            # Lead-time attribute = calendar days between order date and ship
            # date, minus non-working days in between.
            order['納期属性'] = [(z - y).days - len(list(filter(lambda x: y <= x <= z, nowork_day_w)))
                             for y, z in zip(order['受注日'], order['受注実績SSD'])]
            # Clamp the attribute: >73 becomes 73, <0 becomes 0 (deleting such
            # records was also considered; not strictly correct).
            order.loc[order['納期属性'] > 73, '納期属性'] = 73
            order.loc[order['納期属性'] < 0, '納期属性'] = 0
            # order = order.query('納期属性 >= 0')
            order = order.astype({'数量': int})
            # Aggregate results / working days per subsidiary:
            # monthly quantity totals.
            order_subtotal = order.groupby(['year', 'month'], as_index=False)['数量'].sum()
            order_subtotal = order_subtotal.astype({'year': str, 'month': str})
            # Working days of the factory used to divide order_subtotal.
            order_subtotal['開始日'] = pd.to_datetime((order_subtotal['year'] + '/' + order_subtotal['month'] + '/01'),
                                                   format='%Y/%m/%d')
            order_subtotal['終了日'] = order_subtotal['開始日'] + offsets.MonthBegin(1)
            order_subtotal['月稼働日'] = [(z - y).days - len(list(filter(lambda x: y <= x < z, nowork_day_w)))
                                     for y, z in zip(order_subtotal['開始日'], order_subtotal['終了日'])]
            order_subtotal['月平均本数'] = (order_subtotal['数量'] / order_subtotal['月稼働日']).round(3)
            # Each subsidiary's share of order quantity over the period.
            order_subratio = order.groupby(['year', '現法コード'], as_index=False)['数量'].sum()
            order_subratio['合計'] = order_subtotal['数量'].sum()
            order_subratio['ratio'] = (order_subratio['数量'] / order_subratio['合計']).round(3)
            order_subratio.drop(['数量', '合計'], axis=1, inplace=True)
            # Multiply monthly totals by the subsidiary ratio; unify the
            # dtype of 'year' first.
            order_subtotal = order_subtotal.astype({'year': int, 'month': int})
            order_subtotal = pd.merge(order_subtotal, order_subratio, on=['year'], how='left')
            order_subtotal['月平均本数'] = (order_subtotal['月平均本数'] * order_subtotal['ratio']).round(3)
            # For now the actuals could stand in for FC figures; if FC were
            # built from actuals the period would be defined by ship date,
            # while the forecast is defined by order date, so filtering by
            # Tgt_S/E alone would be insufficient. In practice FC is supplied
            # externally.
            # Drop unneeded columns:
            # order_subtotal.drop(['数量', '開始日', '終了日', '月稼働日', 'ratio'], axis=1, inplace=True)
            # FC = order_subtotal
            # Load the FC (forecast) data from the fixed FC.csv.
            FC = pd.read_csv(local_pass + 'FC.csv', encoding=font, dtype='object', index_col=None)
            # Rename header.
            FC = FC.rename(columns={'SUBSIDIARY_CD': '現法コード'})
            # Select this supplier's rows.
            FC = FC[FC['SUPPLIER_CD'] == pg_name]
            FC = FC.astype({'year': str, 'month': str, 'FC': float})
            # Count working days and turn FC into a per-day quantity.
            FC['開始日'] = pd.to_datetime((FC['year'] + '/' + FC['month'] + '/01'), format='%Y/%m/%d')
            FC['終了日'] = FC['開始日'] + offsets.MonthBegin(1)
            FC['月稼働日'] = [(z - y).days - len(list(filter(lambda x: y <= x < z, nowork_day_w)))
                         for y, z in zip(FC['開始日'], FC['終了日'])]
            FC['月平均本数'] = (FC['FC'] / FC['月稼働日']).round(3)
            FC = FC.astype({'year': int, 'month': int})
            # Count working days per weekday over the target period.
            Tgt_S = dt.datetime.strptime(Tgt_S, '%Y%m%d')
            Tgt_E = dt.datetime.strptime(Tgt_E, '%Y%m%d')
            week_count = pd.DataFrame({'count': [], 'weekday': []})
            count = 0
            while Tgt_S <= Tgt_E:
                if not Tgt_S.strftime('%Y-%m-%d') in calendar_dict[pg_name]:
                    weekday = Tgt_S.strftime('%a')
                    week_count.loc[count] = [Tgt_S, weekday]
                    count += 1
                Tgt_S = Tgt_S + dt.timedelta(days=1)
            week_count = week_count.groupby(['weekday'], as_index=False).count()
            # Weekday-index calculation: aggregate quantity per date,
            # excluding large-lot orders.
            order_small = order[order['アンフィット種別'] == '0']
            order_day = order_small.groupby(['現法コード', 'weekday', '受注日'], as_index=False)['数量'].sum()
            order_week = order_day.groupby(['現法コード', 'weekday'], as_index=False)['数量'].sum()
            order_week = pd.merge(order_week, week_count, on=['weekday'], how='left')
            # Per-weekday average; weekdays that never occur get 0.
            order_week.loc[order_week['count'] != 0, '数量'] = order_week['数量'] / order_week['count']
            order_week.loc[order_week['count'] == 0, '数量'] = 0
            order_week1 = order_week[order_week['weekday'] != 'Sun']
            order_week1 = order_week1.groupby(['現法コード', 'weekday'])['数量'].sum()
            order_week1 = order_week1.groupby(['現法コード']).transform(qtyave)
            order_week2 = order_week.groupby(['現法コード', 'weekday'])['数量'].sum()
            order_week2 = order_week2.groupby(['現法コード']).transform(qtyave)
            order_week1 = order_week1.reset_index()
            order_week2 = order_week2.reset_index()
            # Sundays are normalised against the all-day average.
            order_week2 = order_week2[order_week2['weekday'] == 'Sun']
            order_week1 = order_week1.append(order_week2, sort=False)
            order_week1 = order_week1.rename(columns={'数量': 'week_ratio'})
            # Skeleton of subsidiary x weekday x lead-time combinations.
            base_sh = pd.read_csv(local_pass + 'base_sh.csv', encoding=font,
                                  index_col=None,
                                  dtype={'数量': int, '納期属性': int})
            # Sum quantity per subsidiary, weekday and lead-time attribute
            # (large-lot orders excluded).
            order_A = order[order['アンフィット種別'] == '0']
            order_A = order_A.groupby(['現法コード', 'weekday', '納期属性'])['数量'].sum()
            order_A = order_A.groupby(['現法コード', 'weekday']).transform(qtyratio)
            order_A = order_A.reset_index()
            # Merge onto the skeleton so missing combinations become 0.
            base_sh = pd.merge(base_sh, order_A, on=['現法コード', 'weekday', '納期属性'], how='outer')
            base_sh.loc[base_sh['数量_y'].notnull(), '数量_x'] = base_sh['数量_y']
            base_sh = base_sh.rename(columns={'数量_x': '数量'})
            base_sh.drop(['数量_y'], axis=1, inplace=True)
            base_sh = base_sh.round({'数量': 4})
            base_sh = base_sh.rename(columns={'数量': 'n_ratio'})
            # Small-lot ratio per subsidiary (large-lot orders excluded from
            # the numerator).
            small_ratio = order.groupby(['現法コード'], as_index=False)['数量'].sum()
            small_ratio_A = order[order['アンフィット種別'] == '0']
            small_ratio_A = small_ratio_A.groupby(['現法コード'], as_index=False)['数量'].sum()
            small_ratio = pd.merge(small_ratio_A, small_ratio, on=['現法コード'], how='right')
            small_ratio.loc[small_ratio['数量_x'].isnull(), '数量_x'] = 0
            small_ratio['small_ratio'] = small_ratio['数量_x'] / small_ratio['数量_y']
            small_ratio.drop(['数量_x', '数量_y'], axis=1, inplace=True)
            # Forecast period boundaries as datetimes.
            Pre_S = dt.datetime.strptime(Pre_S, '%Y%m%d')
            Pre_E = dt.datetime.strptime(Pre_E, '%Y%m%d')
            # Build the list of order dates.
            # NOTE(review): Pre_S is seeded once and appended again by the
            # loop's first pass, so it appears twice in day_list.
            day_list = [Pre_S]
            day_n = Pre_S
            while day_n <= Pre_E:
                day_list.append(day_n)
                day_n = day_n + datetime.timedelta(days=1)
            # Build the factory order dates: if the factory is closed, shift
            # forward to the next working day.
            supday_list = day_list.copy()
            for supp in range(len(supday_list)):
                # Push forward past non-working days.
                while supday_list[supp].strftime('%Y-%m-%d') in calendar_dict[pg_name]:
                    supday_list[supp] = supday_list[supp] + dt.timedelta(days=1)
            # Build the order-date x ship-date lists.
            so_day_list = []
            sd_day_list = []
            # Factory-order-date list.
            sosup_day_list = []
            noukizokusei = list(range(73))
            noukizokusei_list = []
            for nouki in range(len(day_list)):
                noworkday_count = 0
                for n in range(73):
                    so_day_list.append(day_list[nouki])
                    sosup_day_list.append(supday_list[nouki])
                    noukizokusei_list.append(n)
                    sd_day = day_list[nouki] + datetime.timedelta(days=(n + noworkday_count))
                    # Push the ship date forward past non-working days.
                    while sd_day.strftime('%Y-%m-%d') in calendar_dict[pg_name]:
                        sd_day = sd_day + dt.timedelta(days=1)
                        noworkday_count = noworkday_count + 1
                    sd_day_list.append(sd_day)
            # DataFrame collecting the multiprocess results.
            prediction_sum = pd.DataFrame(
                {'現法コード': [], '受注日': [], '受注日稼働flg': [], '工場受注日': [],
                 '出荷日': [], '出荷日稼働flg': [], '納期属性': []})
            # Fan out one task per subsidiary.
            pool = Pool(multi.cpu_count() - 2)
            list1 = [(x, so_day_list, sd_day_list, sosup_day_list, noukizokusei_list,
                      calendar_dict, sub_name) for x in range(14)]
            pre_list = pool.map(wrapper, list1)
            pool.close()
            # Results come back as a list; concatenate them.
            for x in range(14):
                prediction_sum = prediction_sum.append(pre_list[x], sort=False)
            # Add the order-weekday column.
            prediction = prediction_sum
            prediction['weekday'] = [x.strftime('%a') for x in prediction['受注日']]
            '''
            # 受注日をDatetimeIndexとし年、月、曜日のindexを追加
            prediction = prediction.set_index('受注日')
            prediction = prediction.set_index([prediction.index.year, prediction.index.month, prediction.index])
            prediction.index.names = ['year_so', 'month_so', '受注日']
            prediction = prediction.reset_index()
            '''
            # Use the ship date as a DatetimeIndex and add year/month levels.
            prediction = prediction.set_index('出荷日')
            prediction = prediction.set_index([prediction.index.year, prediction.index.month, prediction.index])
            prediction.index.names = ['year', 'month', '出荷日']
            prediction = prediction.reset_index()
            # Merge FC.
            prediction = pd.merge(prediction, FC, on=['現法コード', 'year', 'month'], how='left')
            # Merge the weekday ratio.
            prediction = pd.merge(prediction, order_week1,
                                  on=['現法コード', 'weekday'], how='left')
            # Merge the lead-time-attribute ratio.
            prediction = pd.merge(prediction, base_sh, on=['現法コード', 'weekday', '納期属性'], how='left')
            # Merge the small-lot ratio.
            prediction = pd.merge(prediction, small_ratio, on=['現法コード'], how='left')
            # Fill blanks with 0.
            prediction.loc[prediction['月平均本数'].isnull(), '月平均本数'] = 0
            prediction.loc[prediction['week_ratio'].isnull(), 'week_ratio'] = 0
            # Per-day quantity per subsidiary: product of the working flags
            # and all ratio factors.
            prediction['数量'] = prediction['受注日稼働flg'] * prediction['出荷日稼働flg'] * prediction['月平均本数'] * prediction[
                'week_ratio'] * prediction['n_ratio'] * prediction['small_ratio']
            prediction = prediction.round({'数量': 3})
            # Write the per-order-date x ship-date rows.
            f_name = pg_name + '_prediction_row.tsv'
            prediction.to_csv(local_pass + f_name, sep='\t', encoding=font,
                              quotechar='"', line_terminator='\n', index=False)
            # After aggregation, pivot from list form to a spreadsheet-style
            # lead-time x factory-order-date table and write it out.
            prediction_sup = prediction.groupby(['納期属性', '工場受注日'], as_index=True)['数量'].sum()
            prediction_sup = prediction_sup.round(3)
            prediction_sup = prediction_sup.unstack()
            f_name = pg_name + '_prediction_for_Supplier.tsv'
            prediction_sup.to_csv(local_pass + f_name, sep='\t', encoding=font,
                                  quotechar='"', line_terminator='\n', index=True)
            print('受注予測作成 Finish!')
        else:
            print('受注予測作成 Finish 該当する実績データがありません!')
def get_business_days_in_month(*, end_of_month):
    """Return the business days from the start of the month up to the
    specified date of that same month."""
    # Rolling back one MonthBegin from ``end_of_month`` lands on the first
    # day of its month.
    month_start = end_of_month - pdo.MonthBegin()
    return ContractCalendar.get_business_days(start=month_start,
                                              end=end_of_month)