def test_repr(self): expected = "<QuarterEnd: startingMonth=3>" assert repr(QuarterEnd()) == expected expected = "<QuarterEnd: startingMonth=3>" assert repr(QuarterEnd(startingMonth=3)) == expected expected = "<QuarterEnd: startingMonth=1>" assert repr(QuarterEnd(startingMonth=1)) == expected
def get_max_dt():
    """Return the funds that still have at least one full quarter to fetch.

    Every ``fund.Description`` row is left-joined with the latest
    ``end_date`` stored per ``wind_code`` in ``fund.PortfolioAsset``
    (labelled ``max_dt``).  Funds without stored data fall back to their
    ``setup_date``.

    Returns:
        DataFrame with ``wind_code``, ``setup_date``, ``maturity_date``,
        ``is_initial`` and ``max_dt``, ordered by ``is_initial`` descending
        and restricted to rows where the quarter end on/after ``max_dt`` is
        strictly before one quarter end prior to
        ``min(maturity_date, get_last_td())``.
    """
    with get_session() as ss:
        asset = fund.PortfolioAsset
        desc = fund.Description

        # Latest stored report date per fund.
        latest = ss.query(
            asset.wind_code,
            sa.func.max(asset.end_date).label('max_dt'),
        ).group_by(asset.wind_code).subquery('g')

        query = ss.query(
            desc.wind_code,
            desc.setup_date,
            desc.maturity_date,
            desc.is_initial,
            latest.c.max_dt,
        ).join(
            latest,
            desc.wind_code == latest.c.wind_code,
            isouter=True  # left join
        ).order_by(desc.is_initial.desc())
        fund_list = pd.DataFrame(query)

        for c in ('setup_date', 'maturity_date', 'max_dt'):
            fund_list.loc[:, c] = pd.to_datetime(fund_list[c])

        # Funds with nothing stored yet start from their setup date.
        fund_list = fund_list.fillna({'max_dt': fund_list['setup_date']})

        next_quarter_end = fund_list['max_dt'] + QuarterEnd(n=0)
        capped_maturity = fund_list['maturity_date'].clip(upper=get_last_td())
        last_needed = capped_maturity - QuarterEnd(n=1)
        fund_list = fund_list.loc[next_quarter_end < last_needed]
        return fund_list
def run(self, *args, **kwargs):
    """Refresh the fund asset-portfolio tables from tushare, then from wind.

    Step 1: for every fund from ``get_max_dt()`` whose ``max_dt`` is before
    2018-12-31, pull ``asset_portfolio`` from tushare, keep rows from one
    quarter end before ``max_dt`` onward and insert them (rows with a
    positive ``bond_value`` also go into ``fund.PortfolioAssetBond``).

    Step 2: for funds flagged ``is_initial == 1``, query wind (``wsd``) for
    the range after ``max_dt`` and insert into ``fund.PortfolioAsset``.

    Finally duplicates on (wind_code, end_date) are removed in both tables.
    """
    pending = self.get_max_dt()
    pending = pending.loc[pending['max_dt'] < '2018-12-31']
    for _, fund_row in pending.iterrows():
        ts_data = self.get_tushare_data('asset_portfolio',
                                        ts_code=fund_row['wind_code'])
        cutoff = fund_row['max_dt'] - QuarterEnd(n=1)
        ts_data = ts_data.loc[ts_data['end_date'] >= cutoff]
        self.insert_data(ts_data, fund.PortfolioAsset)
        bond_rows = ts_data.loc[ts_data['bond_value'].gt(0)]
        self.insert_data(bond_rows, fund.PortfolioAssetBond)

    # Temporary patch that back-fills the gaps in the data.
    initial_funds = self.get_max_dt()
    initial_funds = initial_funds.loc[initial_funds['is_initial'].eq(1)]
    mapping = get_wind_conf('crawler_mf_prf')
    for _, fund_row in initial_funds.iterrows():
        w_data = self.query_wind(
            api_name='wsd',
            codes=fund_row['wind_code'],
            fields=mapping['fields'].keys(),
            col_mapping=mapping['fields'],
            beginTime=fund_row['max_dt'] + QuarterEnd(n=1),
            endTime=pd.Timestamp.now() - QuarterEnd(n=1),
            options=mapping['options'])
        w_data = w_data.assign(wind_code=fund_row['wind_code'])
        w_data['end_date'] = w_data.index
        self.insert_data(w_data, fund.PortfolioAsset,
                         msg=fund_row['wind_code'])

    asset_key = [fund.PortfolioAsset.wind_code, fund.PortfolioAsset.end_date]
    self.clean_duplicates(fund.PortfolioAsset, asset_key)
    bond_key = [
        fund.PortfolioAssetBond.wind_code, fund.PortfolioAssetBond.end_date
    ]
    self.clean_duplicates(fund.PortfolioAssetBond, bond_key)
def findHolder(mkey):
    """Look up holders whose ``holder_code`` contains ``mkey``.

    Only ``stock_holder`` rows reported on or after the quarter end at or
    before "today minus 18 months" are considered.

    Args:
        mkey: substring matched against ``holder_code`` with SQL ``LIKE``.

    Returns:
        DataFrame with one row per ``holder_code``: the first (most recent,
        because the query is ordered by ``report_date`` descending) values
        of the selected columns plus ``hold_size``, the number of matching
        rows for that holder.  Rows are sorted by ``report_date``, newest
        first.
    """
    today = datetime.now().date()
    d = today - pd.DateOffset(months=18)
    submit_date = QuarterEnd().rollback(d)
    tdf = pd.read_sql_query(
        "select sh.holder_name,sh.holder_code,sh.holder_type,sh.report_date from stock_holder sh "
        "where sh.holder_code like %(mkey)s and report_date >= %(submit_date)s order by report_date desc",
        db.engine,
        params={'mkey': '%' + mkey + '%',
                'submit_date': submit_date.strftime('%Y-%m-%d')})
    gtdf = tdf.groupby(['holder_code'])
    bdf = gtdf.first()
    bdf['hold_size'] = gtdf.size()
    # ``sort_index(by=...)`` was removed from pandas; ``sort_values`` is the
    # supported way to order rows by a column.
    bdf = bdf.reset_index().sort_values(by='report_date', ascending=False)
    return bdf
def findHolder(mkey):
    """Look up holders whose ``holder_code`` contains ``mkey``.

    Only ``stock_holder`` rows reported on or after the quarter end at or
    before "today minus 18 months" are considered.

    Args:
        mkey: substring matched against ``holder_code`` with SQL ``LIKE``.

    Returns:
        DataFrame with one row per ``holder_code``: the first (most recent,
        because the query is ordered by ``report_date`` descending) values
        of the selected columns plus ``hold_size``, the number of matching
        rows for that holder.  Rows are sorted by ``report_date``, newest
        first.
    """
    today = datetime.now().date()
    d = today - pd.DateOffset(months=18)
    submit_date = QuarterEnd().rollback(d)
    tdf = pd.read_sql_query(
        "select sh.holder_name,sh.holder_code,sh.holder_type,sh.report_date from stock_holder sh "
        "where sh.holder_code like %(mkey)s and report_date >= %(submit_date)s order by report_date desc",
        db.engine,
        params={'mkey': '%' + mkey + '%',
                'submit_date': submit_date.strftime('%Y-%m-%d')})
    gtdf = tdf.groupby(['holder_code'])
    bdf = gtdf.first()
    bdf['hold_size'] = gtdf.size()
    # ``sort_index(by=...)`` was removed from pandas; ``sort_values`` is the
    # supported way to order rows by a column.
    bdf = bdf.reset_index().sort_values(by='report_date', ascending=False)
    return bdf
def format_date(curr_date, prev_date, freq):
    """Turn a period label into the date on which that period is reported.

    Args:
        curr_date: period label.  For ``freq == '6'`` the first five
            characters must be ``YYYY/`` and everything after index 5 is the
            quarter number (any value other than 1, 2 or 3 is treated as the
            fourth quarter).  For ``'8'`` and ``'9'`` it must be ``YYYY/MM``.
        prev_date: label of the previously processed row; only used for
            ``freq == '9'``.
        freq: frequency code: ``'6'`` (quarterly), ``'8'`` or ``'9'``.
            NOTE(review): '8' looks monthly and '9' semi-monthly; confirm
            against the data source.

    Returns:
        ``'6'``: the quarter end; ``'8'``: the month end; ``'9'``: the 15th
        the first time a month is seen (``prev_date`` empty or different),
        the month end when the same month repeats.  A plain ``datetime`` is
        returned for the 15th; the other cases return the result of adding
        a pandas offset.

    Raises:
        ValueError: if ``freq`` is not one of the supported codes (the
            original fell through to an ``UnboundLocalError``), or if the
            label cannot be parsed.
    """
    if freq == '6':  # Quarterly
        quarter = int(curr_date[6:])
        first_day = {1: '01/01', 2: '04/01', 3: '07/01'}.get(quarter, '10/01')
        quarter_start = datetime.strptime(curr_date[:5] + first_day,
                                          "%Y/%m/%d")
        return quarter_start + QuarterEnd(1)
    if freq == '8':
        month_start = datetime.strptime(curr_date + '/01', "%Y/%m/%d")
        return month_start + MonthEnd(1)
    if freq == '9':
        if not prev_date or prev_date != curr_date:
            # First row for this month -> mid-month report.
            return datetime.strptime(curr_date + '/15', "%Y/%m/%d")
        # Second row for the same month -> month-end report.
        month_start = datetime.strptime(curr_date + '/01', "%Y/%m/%d")
        return month_start + MonthEnd(1)
    raise ValueError(
        "unsupported freq {!r}; expected '6', '8' or '9'".format(freq))
def _time_format(self, start: str, end: str, freq='d'):
    '''Convert the ``start``/``end`` strings according to ``freq``.

    * ``'d'`` (or any unrecognised value): both strings are returned as is.
    * ``'m'``: parsed as ``%Y%m`` and moved to the month end.
    * ``'q'``: every ``'0'`` in characters 4-5 is replaced by ``'Q'``, the
      result is appended to the first four characters and parsed by
      ``pd.to_datetime``, then moved to the quarter end.
    * ``'y'``: parsed by ``pd.to_datetime`` and moved to the year end.

    Returns a ``(start, end)`` tuple.
    '''

    def _convert(text):
        # One converter for both bounds, so they are treated identically.
        if freq == 'm':
            return pd.to_datetime(text, format='%Y%m') + MonthEnd(1)
        if freq == 'q':
            label = text[0:4] + text[4:6].replace('0', 'Q')
            return pd.to_datetime(label) + QuarterEnd(1)
        if freq == 'y':
            return pd.to_datetime(text) + YearEnd(1)
        return text

    start = _convert(start)
    end = _convert(end)
    return start, end
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
             holidays=None, calendar=None, **kwds):
    """Store the offset settings and build the helper offsets.

    Args:
        n: number of periods.
        normalize: stored on the instance and forwarded to the helper
            ``CustomBusinessDay``.
        weekmask, holidays, calendar: forwarded to ``CustomBusinessDay``.
        **kwds: merged into ``self.kwds``; ``offset`` (default
            ``timedelta(0)``) and ``startingMonth`` (default 3) are read
            from it.
    """
    self.n = n
    self.normalize = normalize
    self.kwds.update(kwds)
    self.offset = kwds.get('offset', timedelta(0))
    self.startingMonth = kwds.get('startingMonth', 3)

    # One-step custom business day used to snap dates onto business days.
    business_day = CustomBusinessDay(n=1, normalize=normalize,
                                     weekmask=weekmask, holidays=holidays,
                                     calendar=calendar)
    self.cbday = business_day
    self.calendar = business_day.calendar
    self.holidays = holidays
    # Plain one-quarter step (built without ``startingMonth``).
    self.q_offset = QuarterEnd(1)
def _get_start_dates(sess, code, class_):
    """Return the start date for refreshing ``class_`` rows of ``code``.

    The latest ``class_.date`` stored for ``code`` is looked up.  If nothing
    is stored, the listing date (``Issue.A004_上市日期``) is used; otherwise
    the day after the latest stored date.  The candidate is then advanced by
    one ``QuarterEnd`` offset.

    Args:
        sess: database session exposing ``query``.
        code: security code to look up.
        class_: mapped table class with ``date`` and ``code`` columns.

    Returns:
        A ``datetime.date``, or ``None`` when there is no listing date or
        when the candidate (before or after the quarter-end shift) lies
        after today.
    """
    last_date = sess.query(func.max(
        class_.date)).filter(class_.code == code).scalar()
    if last_date is None:
        start = sess.query(Issue.A004_上市日期).filter(
            Issue.code == code).scalar()
    else:
        # Resume on the day after the last stored date.
        start = last_date + timedelta(days=1)

    # No listing date available.
    if start is None:
        return None

    today = pd.Timestamp('today').date()
    if start > today:
        return None

    # Move to the quarter end.  ``start + offset`` replaces the removed
    # ``offset.apply(start)`` and gives the same result.
    start = (start + QuarterEnd()).date()
    if start > today:
        return None
    return start
class CustomBusinessQuaterEnd(QuarterOffset):
    """Quarter-end offset snapped back onto a custom business day.

    A ``QuarterEnd(1)`` step (``q_offset``) is combined with a
    ``CustomBusinessDay`` (``cbday``) built from the given weekmask,
    holidays and calendar.
    """
    _cacheable = False
    _prefix = 'CBQE'
    _attributes = frozenset({'holidays', 'calendar'} |
                            set(QuarterOffset._attributes))

    def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
                 holidays=None, calendar=None, **kwds):
        """Store the settings and build the two helper offsets.

        ``offset`` (default ``timedelta(0)``) and ``startingMonth``
        (default 3) are read from ``kwds``.
        """
        self.n = n
        self.normalize = normalize
        self.kwds.update(kwds)
        self.offset = kwds.get('offset', timedelta(0))
        self.startingMonth = kwds.get('startingMonth', 3)
        self.cbday = CustomBusinessDay(n=1, normalize=normalize,
                                       weekmask=weekmask, holidays=holidays,
                                       calendar=calendar)
        self.calendar = self.cbday.calendar
        self.holidays = holidays
        # NOTE(review): self-assignment, has no effect.
        self.startingMonth = self.startingMonth
        # NOTE(review): built without ``startingMonth``, so the value stored
        # above is not passed on to the quarter step - confirm intended.
        self.q_offset = QuarterEnd(1)

    @apply_wraps
    def apply(self, other):
        """Shift ``other`` by ``self.n`` custom-business quarter ends."""
        n = self.n
        # Quarter end on/after ``other`` and the custom business day that
        # quarter end rolls back to.
        cur_qend = self.q_offset.rollforward(other)
        cur_cqend = self.cbday.rollback(cur_qend)
        # n == 0 off the anchor behaves like a step of one.
        if n == 0 and other != cur_cqend:
            n += 1
        # Before the anchor, ``cur_qend`` already accounts for one forward
        # step.
        if other < cur_cqend and n >= 1:
            n -= 1
        # After the anchor, ``cur_qend`` is one quarter ahead, so a backward
        # move needs one step fewer.
        if other > cur_cqend and n <= -1:
            n += 1
        new = cur_qend + n * self.q_offset
        result = self.cbday.rollback(new)
        return result

    def onOffset(self, dt):
        """Return whether ``dt`` lies on this offset.

        ``dt`` must pass the ``normalize`` check, be on ``cbday``, and the
        next ``cbday`` step must fall in a different ``quarter`` than ``dt``.
        """
        if self.normalize and not _is_normalized(dt):
            return False
        if not self.cbday.onOffset(dt):
            return False
        return (dt + self.cbday).quarter != dt.quarter
def period_backward(dates, quarter=None, back_nyear=1, back_nquarter=None):
    """Compute an earlier reporting period; by default the same period of the previous year.

    Args:
        dates: integer dates in ``YYYYMMDD`` form (ndarray, or a list/tuple
            which is converted to an ndarray).
        quarter: ``None`` keeps each date's own month.  An ``int`` (1-4) or
            a sequence of ints selects the last month of that quarter.
        back_nyear: number of years to go back.
        back_nquarter: if not ``None``, go back this many quarter ends
            instead; ``quarter`` and ``back_nyear`` are then ignored.

    Returns:
        Integer ``YYYYMMDD`` dates; the day is the last day of the target
        month.
    """
    # Lists have neither ``astype`` nor element-wise ``//``; convert them.
    if isinstance(dates, (list, tuple)):
        dates = np.asarray(dates)
    if back_nquarter is not None:
        dates = pd.to_datetime(dates.astype('str')) - QuarterEnd(back_nquarter)
        return np.asarray(dates.strftime("%Y%m%d")).astype('int')
    year = dates // 10000
    month = dates % 10000 // 100
    c = calendar
    if quarter is not None:
        if isinstance(quarter, int):
            month = np.ones(len(dates)).astype('int') * quarter * 3
        else:
            # ``list * 3`` would repeat the list instead of scaling it.
            if isinstance(quarter, (list, tuple)):
                quarter = np.asarray(quarter)
            month = quarter * 3
    day = np.asarray([c.monthrange(y - back_nyear, m)[1]
                      for y, m in zip(year, month)])
    return (year - back_nyear) * 10000 + month * 100 + day
def add_quarter_data(df, label):
    """Add a ``quarter_<label>`` column with single-quarter values, in place.

    ``df[label]`` is treated as a figure accumulated since the start of the
    year and ``df['end_date']`` as ``YYYYMMDD`` strings.  For a row ending on
    ``0331`` the value is copied; for any other row the value of the row
    whose ``end_date`` is one quarter end earlier is subtracted (the last
    such row if there are several).  Rows without a previous-quarter row
    keep ``None``.

    Nothing is returned.
    """
    target = 'quarter_' + label
    df[target] = None
    for idx, row in df.iterrows():
        end_date = row['end_date']
        # The first quarter needs no subtraction.
        if end_date[4:] == '0331':
            df.loc[idx, target] = row[label]
            continue

        previous_end = pd.Timestamp(end_date) - QuarterEnd(n=1)
        previous_rows = df[df.end_date == previous_end.strftime("%Y%m%d")]
        if previous_rows.empty:
            continue
        df.loc[idx, target] = row[label] - previous_rows.iloc[-1][label]
def set_current(self):
    """Fill ``self.newest_date`` with the latest date per frequency.

    Keys written: ``'M'``, ``'Q'`` and ``'Y'`` (today rolled back to the
    month / quarter / year end), ``'D'`` (yesterday) and ``'H'`` (the latest
    of Dec 31 last year, Jun 30 and Dec 31 this year that is not after
    today).
    """
    now = datetime.date.today()
    newest = self.newest_date

    newest['M'] = MonthEnd().rollback(now)
    newest['Q'] = QuarterEnd().rollback(now)
    newest['D'] = now - timedelta(days=1)
    newest['Y'] = YearEnd().rollback(now)

    # Half-year boundaries of the current year.
    mid_year = datetime.date(now.year, 6, 30)
    year_end = datetime.date(now.year, 12, 31)
    if now >= year_end:
        newest['H'] = year_end
    elif now >= mid_year:
        newest['H'] = mid_year
    else:
        newest['H'] = datetime.date(now.year - 1, 12, 31)
def ecos(code='021Y125', item1='?', item2='?', item3='?', freq='Q',
         first='1900', last='2100', N='10000', detail=True, col=None):
    '''Retrieve a daily, monthly, quarterly or annual time series from ECOS.

    Run 'open_ecosapi()' to explore ECOS API codes.

    Args:
        code: statistic code placed in the request URL.
        item1, item2, item3: item codes placed in the request URL.
        freq: one of 'Y', 'Q', 'M', 'D'; doubled ('YY', 'QQ', ...) before
            being placed in the URL.
        first, last: requested period bounds.
        N: upper row index of the request (rows 1..N).
        detail: if true, print the statistic name, unit, period and item of
            the returned data.
        col: column name of the returned one-column frame.

    Returns:
        One-column DataFrame of float values indexed by ``DATE``.  Monthly,
        quarterly and yearly indexes are shifted by one MonthEnd /
        QuarterEnd / YearEnd offset.  ``None`` is returned (after printing a
        message) for an unsupported frequency.
    '''
    # NOTE(review): the API key is embedded in this URL; consider moving it
    # to configuration.
    ecos_key = "http://ecos.bok.or.kr/api/StatisticSearch/390S6FIOF95M7MHASMEA"
    freq_str = {'QQ': 'Q', 'MM': '-'}
    freq += freq  # Y, Q, M, D -> YY, QQ, MM, DD
    url = f"{ecos_key}/json/kr/1/{N}/{code}/{freq}/{first}/{last}/{item1}/{item2}/{item3}/"
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as result:
        data = json.loads(result.read())
    data = data["StatisticSearch"]["row"]
    df = pd.DataFrame(data)
    if detail:
        print(
            f"통계: {df.loc[0, 'STAT_NAME']}",
            f"단위: {df.loc[0, 'UNIT_NAME']}",
            f"기간: {df.loc[0, 'TIME']} - {df.loc[df.index[-1], 'TIME']}",
            f"항목: {df.loc[0, 'ITEM_NAME1']}",
        )
    df = df.set_index("TIME")
    df.index.names = ['DATE']
    if freq == 'MM':
        # Insert the separator after the year, e.g. YYYYMM -> YYYY-MM.
        df.index = pd.DatetimeIndex(df.index.str[:4] + freq_str[freq] +
                                    df.index.str[4:])
        df.index = df.index + MonthEnd()
    elif freq == 'QQ':
        df.index = pd.DatetimeIndex(df.index.str[:4] + freq_str[freq] +
                                    df.index.str[4:])
        df.index = df.index + QuarterEnd()
    elif freq == 'YY':
        df.index = pd.DatetimeIndex(df.index)
        df.index = df.index + YearEnd()
    elif freq == 'DD':
        df.index = pd.DatetimeIndex(df.index)
    else:
        # The accepted yearly code is 'Y' (the old message said 'A').
        print('frequency is not one of D, M, Q, Y.')
        return
    df["DATA_VALUE"] = df["DATA_VALUE"].astype("float")
    return df['DATA_VALUE'].to_frame(col)
def join_crsp_and_funda(crsp, funda, offset=QuarterEnd(2)):
    """Left-join ``funda`` onto ``crsp`` with a lag of ``offset``.

    Each CRSP row is matched to the ``funda`` row of the same ``permno``
    whose ``time_idx`` equals the CRSP ``time_idx`` minus ``offset``.  Gaps
    are forward-filled within each ``permno`` and rows that still contain
    missing values are dropped.  Record counts are printed.

    Args:
        crsp: frame with ``permno`` and ``time_idx`` (copied, not mutated).
        funda: frame with ``permno`` and ``time_idx``.
        offset: lag subtracted from the CRSP ``time_idx``.

    Returns:
        The joined, forward-filled frame without missing values.
    """
    crsp = crsp.copy()
    crsp['time_idx_p'] = crsp['time_idx'] - offset

    merged = pd.merge(crsp, funda,
                      left_on=['permno', 'time_idx_p'],
                      right_on=['permno', 'time_idx'],
                      how='left',
                      suffixes=('', '_d'))
    merged = merged.drop('time_idx_d', axis=1)
    merged = merged.sort_values(['permno', 'time_idx'])

    # Carry the last known values forward inside each permno.
    filled = merged.groupby('permno', as_index=False).fillna(method='ffill')
    joined = filled.dropna()

    n_crsp = crsp.shape[0]
    n_joined = joined.shape[0]
    print(f'CRSP recrods: {n_crsp}')
    print(f'Merged recrods: {n_joined} ({n_joined / n_crsp:.2%})')
    return joined
def __init__(self, start_year=1993):
    """Open the FTP connection and set up the quarter range and cache path.

    Args:
        start_year: first year of the quarterly period range.

    Sets ``ftp``, ``_last_quarter`` (now minus one quarter end), ``periods``
    (quarterly range from ``start_year`` to ``_last_quarter``), ``_start`` /
    ``_end`` (``<year>Q<quarter>`` labels of the first and last period),
    ``CACHE`` (path under ``ER_CORPUS_DIR`` built from ``CACHE_FORMAT``) and
    an empty ``archive`` frame with ``COLUMNS``.
    """
    self.start_year = start_year
    self.ftp = ftplib.FTP(self.FTP_ADDR)
    self._last_quarter = Timestamp('now') - QuarterEnd(1)
    self.periods = period_range(self.start_year, self._last_quarter,
                                freq='Q')

    label = '{year}Q{qtr}'
    first_period = self.periods[0]
    self._start = label.format(year=first_period.year,
                               qtr=first_period.quarter)
    last_period = self.periods[-1]
    self._end = label.format(year=last_period.year,
                             qtr=last_period.quarter)

    cache_name = self.CACHE_FORMAT.format(start=self._start, end=self._end)
    self.CACHE = os.path.join(self.ER_CORPUS_DIR, cache_name)
    self.archive = DataFrame(columns=self.COLUMNS)
def findStocksByHolder(mkey):
    """Return the latest holdings of holders whose name contains ``mkey``.

    The most recent ``report_date`` among matching ``stock_holder`` rows
    (holder type other than '自然人股') is looked up, falling back to
    2000-06-30 when there is none.  Rows reported on or after the quarter
    end at/before the day preceding that date are loaded and left-joined
    with ``dbs.get_global_data()`` on ``code``.

    Args:
        mkey: substring matched against ``holder_name`` with SQL ``LIKE``.

    Returns:
        DataFrame of the joined rows with an extra ``holder_amt`` column
        computed as ``t_cap * rate / 100``.
    """
    sql = "select max(report_date) from stock_holder sh where sh.holder_name like :mkey and sh.holder_type != '自然人股'"
    resultProxy = db.session.execute(text(sql), {'mkey': '%' + mkey + '%'})
    _max_date = resultProxy.scalar()
    # Identity test is the idiomatic (and overload-proof) check for None.
    if _max_date is None:
        _max_date = pd.to_datetime('2000-06-30')
    _next_date = QuarterEnd().rollback(_max_date - DateOffset(days=1))
    bdf = pd.read_sql_query(
        "select sh.* from stock_holder sh "
        "where sh.holder_name like %(mkey)s and sh.report_date >= %(mdate)s and sh.holder_type != '自然人股'",
        db.engine,
        params={'mkey': '%' + mkey + '%', 'mdate': _next_date})
    df3 = dbs.get_global_data()
    df = pd.merge(bdf, df3, how='left', on='code')
    df['holder_amt'] = df['t_cap'] * df['rate'] / 100
    return df
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
             holidays=None, calendar=None, **kwds):
    """Store the offset settings through ``object.__setattr__``.

    Args:
        n: number of periods.
        normalize: stored under the attribute name ``normalized`` and
            forwarded to the helper ``CustomBusinessDay``.
        weekmask, holidays, calendar: forwarded to ``CustomBusinessDay``.
        **kwds: merged into ``self.kwds``; ``offset`` (default
            ``timedelta(0)``) and ``startingMonth`` (default 3) are read
            from it.
    """
    # ``object.__setattr__`` bypasses any ``__setattr__`` defined on the
    # class.
    store = object.__setattr__
    store(self, "n", n)
    # NOTE(review): the attribute is named "normalized", not "normalize" -
    # confirm this is what readers of the attribute expect.
    store(self, "normalized", normalize)
    self.kwds.update(kwds)

    store(self, "offset", kwds.get('offset', timedelta(0)))
    store(self, "startingMonth", kwds.get('startingMonth', 3))

    business_day = CustomBusinessDay(n=1, normalize=normalize,
                                     weekmask=weekmask, holidays=holidays,
                                     calendar=calendar)
    store(self, "cbday", business_day)
    store(self, "calendar", business_day.calendar)
    store(self, "holidays", holidays)
    store(self, "q_offset", QuarterEnd(1))
def period_backward(dates, quarter=None, back_nyear=1, back_nquarter=None):
    """Return the same reporting period N years (or N quarters) earlier.

    Parameters:
    -----------
    dates: list or array of int dates
        Original dates in ``YYYYMMDD`` form.  Lists/tuples are converted to
        arrays.
    quarter: int or list/array of int, default None
        Quarter (1, 2, 3, 4).  The result is the last day of that quarter
        ``back_nyear`` years earlier.
    back_nyear: int
        Number of years to go back.
    back_nquarter: int, default None
        Number of quarter ends to go back.  If not None, ``quarter`` and
        ``back_nyear`` are ignored.

    Returns:
    --------
    Integer ``YYYYMMDD`` dates.

    Examples:
    ----------
    >>> period_backward([20101231], back_nyear=2)
    array([20081231])
    >>> period_backward([20101231], back_nquarter=2)
    array([20100630])
    >>> period_backward([20101231], quarter=1)
    array([20090331])
    """
    # The documented input is a list, which has neither ``astype`` nor
    # element-wise ``//``; convert it first.
    if isinstance(dates, (list, tuple)):
        dates = np.asarray(dates)
    if back_nquarter is not None:
        dates = pd.to_datetime(dates.astype('str')) - QuarterEnd(back_nquarter)
        return np.asarray(dates.strftime("%Y%m%d")).astype('int')
    year = dates // 10000
    month = dates % 10000 // 100
    c = calendar
    if quarter is not None:
        if isinstance(quarter, int):
            month = np.ones(len(dates)).astype('int') * quarter * 3
        else:
            # Lists have no ``astype``; convert them before scaling.
            if isinstance(quarter, (list, tuple)):
                quarter = np.asarray(quarter)
            month = quarter.astype('int') * 3
    day = np.asarray(
        [c.monthrange(y - back_nyear, m)[1] for y, m in zip(year, month)])
    return (year - back_nyear) * 10000 + month * 100 + day
def run(self, *args, **kwargs):
    """Fetch fund portfolio data from wind (``wss``) quarter by quarter.

    Starts from the earlier of the latest stored ``end_date`` and "now minus
    one quarter end" (2009-12-30 when the table is empty).  For every
    quarter end up to now, the funds with ``is_initial == 1`` that were set
    up before and mature on/after that quarter end are queried in chunks of
    1499 codes and inserted into ``fund.PortfolioAsset``.  Duplicates on
    (wind_code, end_date) are removed at the end.
    """
    mapping = get_wind_conf('crawler_mf_prf')
    chunk_size = 1499
    with get_session() as ss:
        max_dt, = ss.query(sa.func.max(fund.PortfolioAsset.end_date)).one()
        if max_dt is None:
            max_dt = pd.Timestamp('2009-12-30')
        else:
            max_dt = min((pd.to_datetime(max_dt),
                          pd.Timestamp.now() - QuarterEnd(n=1)))

        quarter_ends = pd.date_range(max_dt, pd.Timestamp.now(), freq='Q')
        for quarter_end in quarter_ends:
            rows = ss.query(fund.Description.wind_code).filter(
                fund.Description.setup_date < quarter_end,
                fund.Description.maturity_date >= quarter_end,
                fund.Description.is_initial == 1,
            ).all()
            fund_list = [row[0] for row in rows]

            chunks = utils.chunk(fund_list, chunk_size)
            for i, funds in enumerate(chunks, start=1):
                data = self.query_wind(api_name='wss',
                                       codes=funds,
                                       fields=mapping['fields'].keys(),
                                       col_mapping=mapping['fields'])
                # Progress is reported against a fixed total of 8000.
                progress = min(i * chunk_size / 8000, 1) * 100
                self.insert_data(
                    data.assign(end_date=quarter_end, wind_code=data.index),
                    fund.PortfolioAsset,
                    msg=f'{quarter_end} - {progress:.2f}%')

    self.clean_duplicates(
        fund.PortfolioAsset,
        [fund.PortfolioAsset.wind_code, fund.PortfolioAsset.end_date])
'BAS' : BYearBegin(month=1), 'BAS-FEB' : BYearBegin(month=2), 'BAS-MAR' : BYearBegin(month=3), 'BAS-APR' : BYearBegin(month=4), 'BAS-MAY' : BYearBegin(month=5), 'BAS-JUN' : BYearBegin(month=6), 'BAS-JUL' : BYearBegin(month=7), 'BAS-AUG' : BYearBegin(month=8), 'BAS-SEP' : BYearBegin(month=9), 'BAS-OCT' : BYearBegin(month=10), 'BAS-NOV' : BYearBegin(month=11), 'BAS-DEC' : BYearBegin(month=12), # Quarterly - Calendar # 'Q' : QuarterEnd(startingMonth=3), 'Q-JAN' : QuarterEnd(startingMonth=1), 'Q-FEB' : QuarterEnd(startingMonth=2), 'Q-MAR' : QuarterEnd(startingMonth=3), 'Q-APR' : QuarterEnd(startingMonth=4), 'Q-MAY' : QuarterEnd(startingMonth=5), 'Q-JUN' : QuarterEnd(startingMonth=6), 'Q-JUL' : QuarterEnd(startingMonth=7), 'Q-AUG' : QuarterEnd(startingMonth=8), 'Q-SEP' : QuarterEnd(startingMonth=9), 'Q-OCT' : QuarterEnd(startingMonth=10), 'Q-NOV' : QuarterEnd(startingMonth=11), 'Q-DEC' : QuarterEnd(startingMonth=12), # Quarterly - Calendar (Start) # 'QS' : QuarterBegin(startingMonth=1), 'QS-JAN' : QuarterBegin(startingMonth=1),
def create_data():
    """ create the pickle/msgpack data

    Returns a dict of named fixture groups (``series``, ``frame``,
    ``index``, ``scalars``, ``mi``, ``sp_series``, ``sp_frame``, ``cat``,
    ``timestamp``, ``offsets``), each mapping a label to a pandas object.
    Some entries are only added for sufficiently new ``_loose_version``.
    """
    # Raw columns reused by several Series / DataFrame fixtures below.
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'), period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10),
                 float=Index(np.arange(10, dtype=np.float64)),
                 uint=Index(np.arange(10, dtype=np.uint64)),
                 timedelta=timedelta_range('00:00:00', freq='30T', periods=10))

    # Index types that are only added from a given version on.
    if _loose_version >= LooseVersion('0.18'):
        from pandas import RangeIndex
        index['range'] = RangeIndex(10)

    if _loose_version >= LooseVersion('0.21'):
        from pandas import interval_range
        index['interval'] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(tuple(
        zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
        names=['first', 'second']))

    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=Series(np.arange(10).astype(np.int64),
                            index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(
                                zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])),
                  dt=Series(date_range('20130101', periods=5)),
                  dt_tz=Series(
                      date_range('20130101', periods=5, tz='US/Eastern')),
                  period=Series([Period('2000Q1')] * 5))

    # Frame with a duplicated column label ("A" appears twice).
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = dict(float=DataFrame({
        'A': series['float'],
        'B': series['float'] + 1
    }),
                 int=DataFrame({
                     'A': series['int'],
                     'B': series['int'] + 1
                 }),
                 mixed=DataFrame({k: data[k] for k in ['A', 'B', 'C', 'D']}),
                 mi=DataFrame(
                     {
                         'A': np.arange(5).astype(np.float64),
                         'B': np.arange(5).astype(np.int64)
                     },
                     index=MultiIndex.from_tuples(tuple(
                         zip(*[['bar', 'bar', 'baz', 'baz', 'baz'],
                               ['one', 'two', 'one', 'two', 'three']])),
                         names=['first', 'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=['A', 'B', 'A']),
                 cat_onecol=DataFrame({'A': Categorical(['foo', 'bar'])}),
                 cat_and_float=DataFrame({
                     'A': Categorical(['foo', 'bar', 'baz']),
                     'B': np.arange(3).astype(np.int64)
                 }),
                 mixed_dup=mixed_dup_df,
                 dt_mixed_tzs=DataFrame(
                     {
                         'A': Timestamp('20130102', tz='US/Eastern'),
                         'B': Timestamp('20130603', tz='CET')
                     },
                     index=range(5)),
                 dt_mixed2_tzs=DataFrame(
                     {
                         'A': Timestamp('20130102', tz='US/Eastern'),
                         'B': Timestamp('20130603', tz='CET'),
                         'C': Timestamp('20130603', tz='UTC')
                     },
                     index=range(5)))

    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    # Versions before 0.19.2 are given the ``offset=`` keyword, later ones
    # ``freq=``.
    if _loose_version < LooseVersion('0.19.2'):
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      freq='M')

    # One instance per offset class, several with non-default arguments.
    off = {
        'DateOffset': DateOffset(years=1),
        'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
        'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
        'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
        'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
        'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
        'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
        'MonthBegin': MonthBegin(1),
        'MonthEnd': MonthEnd(1),
        'QuarterBegin': QuarterBegin(1),
        'QuarterEnd': QuarterEnd(1),
        'Day': Day(1),
        'YearBegin': YearBegin(1),
        'YearEnd': YearEnd(1),
        'Week': Week(1),
        'Week_Tues': Week(2, normalize=False, weekday=1),
        'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
        'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
        'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        'Easter': Easter(),
        'Hour': Hour(1),
        'Minute': Minute(1)
    }

    return dict(series=series,
                frame=frame,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)
def test_offset_corner_case(self): # corner offset = QuarterEnd(n=-1, startingMonth=1) assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31)
def test_isAnchored(self): assert QuarterEnd(startingMonth=1).isAnchored() assert QuarterEnd().isAnchored() assert not QuarterEnd(2, startingMonth=1).isAnchored()
class TestQuarterEnd(Base):
    """Tests for the ``QuarterEnd`` offset: repr, anchoring, arithmetic and
    on-offset detection."""
    _offset = QuarterEnd

    def test_repr(self):
        """``repr`` shows ``startingMonth``; the default offset shows 3."""
        expected = "<QuarterEnd: startingMonth=3>"
        assert repr(QuarterEnd()) == expected
        expected = "<QuarterEnd: startingMonth=3>"
        assert repr(QuarterEnd(startingMonth=3)) == expected
        expected = "<QuarterEnd: startingMonth=1>"
        assert repr(QuarterEnd(startingMonth=1)) == expected

    def test_isAnchored(self):
        """Single-step offsets are anchored; ``n=2`` is not."""
        assert QuarterEnd(startingMonth=1).isAnchored()
        assert QuarterEnd().isAnchored()
        assert not QuarterEnd(2, startingMonth=1).isAnchored()

    def test_offset_corner_case(self):
        """One step back from the day after an anchor lands on the anchor."""
        # corner
        offset = QuarterEnd(n=-1, startingMonth=1)
        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31)

    # Each entry is (offset, {base datetime: expected datetime}); the pairs
    # are passed to ``assert_offset_equal`` in ``test_offset`` below.
    offset_cases = []
    offset_cases.append((QuarterEnd(startingMonth=1), {
        datetime(2008, 1, 1): datetime(2008, 1, 31),
        datetime(2008, 1, 31): datetime(2008, 4, 30),
        datetime(2008, 2, 15): datetime(2008, 4, 30),
        datetime(2008, 2, 29): datetime(2008, 4, 30),
        datetime(2008, 3, 15): datetime(2008, 4, 30),
        datetime(2008, 3, 31): datetime(2008, 4, 30),
        datetime(2008, 4, 15): datetime(2008, 4, 30),
        datetime(2008, 4, 30): datetime(2008, 7, 31)}))

    offset_cases.append((QuarterEnd(startingMonth=2), {
        datetime(2008, 1, 1): datetime(2008, 2, 29),
        datetime(2008, 1, 31): datetime(2008, 2, 29),
        datetime(2008, 2, 15): datetime(2008, 2, 29),
        datetime(2008, 2, 29): datetime(2008, 5, 31),
        datetime(2008, 3, 15): datetime(2008, 5, 31),
        datetime(2008, 3, 31): datetime(2008, 5, 31),
        datetime(2008, 4, 15): datetime(2008, 5, 31),
        datetime(2008, 4, 30): datetime(2008, 5, 31)}))

    # n=0: dates already on an anchor map to themselves.
    offset_cases.append((QuarterEnd(startingMonth=1, n=0), {
        datetime(2008, 1, 1): datetime(2008, 1, 31),
        datetime(2008, 1, 31): datetime(2008, 1, 31),
        datetime(2008, 2, 15): datetime(2008, 4, 30),
        datetime(2008, 2, 29): datetime(2008, 4, 30),
        datetime(2008, 3, 15): datetime(2008, 4, 30),
        datetime(2008, 3, 31): datetime(2008, 4, 30),
        datetime(2008, 4, 15): datetime(2008, 4, 30),
        datetime(2008, 4, 30): datetime(2008, 4, 30)}))

    offset_cases.append((QuarterEnd(startingMonth=1, n=-1), {
        datetime(2008, 1, 1): datetime(2007, 10, 31),
        datetime(2008, 1, 31): datetime(2007, 10, 31),
        datetime(2008, 2, 15): datetime(2008, 1, 31),
        datetime(2008, 2, 29): datetime(2008, 1, 31),
        datetime(2008, 3, 15): datetime(2008, 1, 31),
        datetime(2008, 3, 31): datetime(2008, 1, 31),
        datetime(2008, 4, 15): datetime(2008, 1, 31),
        datetime(2008, 4, 30): datetime(2008, 1, 31),
        datetime(2008, 7, 1): datetime(2008, 4, 30)}))

    offset_cases.append((QuarterEnd(startingMonth=1, n=2), {
        datetime(2008, 1, 31): datetime(2008, 7, 31),
        datetime(2008, 2, 15): datetime(2008, 7, 31),
        datetime(2008, 2, 29): datetime(2008, 7, 31),
        datetime(2008, 3, 15): datetime(2008, 7, 31),
        datetime(2008, 3, 31): datetime(2008, 7, 31),
        datetime(2008, 4, 15): datetime(2008, 7, 31),
        datetime(2008, 4, 30): datetime(2008, 10, 31)}))

    @pytest.mark.parametrize('case', offset_cases)
    def test_offset(self, case):
        """Check every (base, expected) pair of one ``offset_cases`` entry."""
        offset, cases = case
        for base, expected in compat.iteritems(cases):
            assert_offset_equal(offset, base, expected)

    # Each entry is (offset, date, expected flag) and is passed to
    # ``assert_onOffset`` in ``test_onOffset`` below.
    on_offset_cases = [
        (QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
        (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
        (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True),
        (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
        (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)]

    @pytest.mark.parametrize('case', on_offset_cases)
    def test_onOffset(self, case):
        """Check one ``on_offset_cases`` entry."""
        offset, dt, expected = case
        assert_onOffset(offset, dt, expected)
# The statements down to the decorator are the tail of a test whose ``def``
# line is outside this excerpt.  They compare ``rng + offset`` with
# ``offset.apply_index(rng)`` and spot-check the first and last elements.
offset = cls(n=n)
rng = pd.date_range(start='1/1/2000', periods=100000, freq='T')
ser = pd.Series(rng)
res = rng + offset
res_v2 = offset.apply_index(rng)
assert (res == res_v2).all()
assert res[0] == rng[0] + offset
assert res[-1] == rng[-1] + offset
res2 = ser + offset
# apply_index is only for indexes, not series, so no res2_v2
assert res2.iloc[0] == ser.iloc[0] + offset
assert res2.iloc[-1] == ser.iloc[-1] + offset


@pytest.mark.parametrize('offset', [QuarterBegin(), QuarterEnd(),
                                    BQuarterBegin(), BQuarterEnd()])
def test_on_offset(offset):
    """``onOffset`` must agree with the round trip ``(d + offset) - offset``.

    Checked for the first three days and for days 28-31 of October,
    November and December 2016 (skipping the non-existent November 31).
    """
    dates = [datetime(2016, m, d)
             for m in [10, 11, 12]
             for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)]
    for date in dates:
        res = offset.onOffset(date)
        # A date is on the offset exactly when stepping forward and back
        # returns to it.
        slow_version = date == (date + offset) - offset
        assert res == slow_version


# --------------------------------------------------------------------
# Months

class TestMonthBegin(Base):
# The statements down to the decorator are the tail of a test whose ``def``
# line is outside this excerpt.  They compare ``rng + offset`` with
# ``offset.apply_index(rng)`` and spot-check the first and last elements.
ser = pd.Series(rng)
res = rng + offset
assert res.freq is None  # not retained
res_v2 = offset.apply_index(rng)
assert (res == res_v2).all()
assert res[0] == rng[0] + offset
assert res[-1] == rng[-1] + offset
res2 = ser + offset
# apply_index is only for indexes, not series, so no res2_v2
assert res2.iloc[0] == ser.iloc[0] + offset
assert res2.iloc[-1] == ser.iloc[-1] + offset


@pytest.mark.parametrize(
    "offset", [QuarterBegin(), QuarterEnd(), BQuarterBegin(), BQuarterEnd()]
)
def test_on_offset(offset):
    """``is_on_offset`` must agree with the round trip ``(d + offset) - offset``.

    Checked for the first three days and for days 28-31 of October,
    November and December 2016 (skipping the non-existent November 31).
    """
    dates = [
        datetime(2016, m, d)
        for m in [10, 11, 12]
        for d in [1, 2, 3, 28, 29, 30, 31]
        if not (m == 11 and d == 31)
    ]
    for date in dates:
        res = offset.is_on_offset(date)
        # A date is on the offset exactly when stepping forward and back
        # returns to it.
        slow_version = date == (date + offset) - offset
        assert res == slow_version


# --------------------------------------------------------------------
# The statements down to the decorator are the tail of a test whose ``def``
# line is outside this excerpt.  They compare ``rng + offset`` with
# ``offset.apply_index(rng)`` and spot-check the first and last elements.
ser = pd.Series(rng)
res = rng + offset
res_v2 = offset.apply_index(rng)
assert (res == res_v2).all()
assert res[0] == rng[0] + offset
assert res[-1] == rng[-1] + offset
res2 = ser + offset
# apply_index is only for indexes, not series, so no res2_v2
assert res2.iloc[0] == ser.iloc[0] + offset
assert res2.iloc[-1] == ser.iloc[-1] + offset


@pytest.mark.parametrize(
    'offset',
    [QuarterBegin(), QuarterEnd(), BQuarterBegin(), BQuarterEnd()])
def test_on_offset(offset):
    """``onOffset`` must agree with the round trip ``(d + offset) - offset``.

    Checked for the first three days and for days 28-31 of October,
    November and December 2016 (skipping the non-existent November 31).
    """
    dates = [
        datetime(2016, m, d) for m in [10, 11, 12]
        for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)
    ]
    for date in dates:
        res = offset.onOffset(date)
        # A date is on the offset exactly when stepping forward and back
        # returns to it.
        slow_version = date == (date + offset) - offset
        assert res == slow_version


# --------------------------------------------------------------------
# Months
def time_for_next_update(last_time, freq='D', num=9, is_end=False):
    """Return the next update time that follows the previous update.

    Arguments:
        last_time {obj} -- time of the previous update

    Keyword Arguments:
        freq {str} -- update period, case-insensitive (default: {'D'})
        num {int} -- number of periods for frequencies below a day, hour of
            the day for daily frequencies (default: {9})
        is_end {bool} -- whether the end of the period is wanted
            (default: {False})

    Raises:
        ValueError: the period type is not recognised
        AssertionError: ``last_time`` is not a Timestamp or lies in the
            future

    Returns:
        Timestamp -- next update time.  ``pd.Timestamp(MARKET_START)`` is
        returned when ``last_time`` is null.

    Notes:
        1. freq < D ('MIN', 'H')
            ``num`` is the number of periods.  The previous time is floored
            to the period and moved forward ``num`` periods; start/end is
            not considered.
        2. freq in D, B
            ``num`` is an hour of the day.  If the previous time is earlier
            than ``num`` o'clock on its own (business) day, that moment is
            returned; otherwise ``num`` o'clock on the next (business) day.
        3. freq > D ('W', 'M', 'Q', 'Y')
            Only start/end matters (week, month, quarter and year start or
            end); ``num`` is ignored.  When the end is requested and the
            previous time already lies on a period end, the end of the next
            period is returned.
    """
    valid_freq = ('B', 'D', 'W', 'M', 'Q', 'Y', 'H', 'MIN')
    if pd.isnull(last_time):
        return pd.Timestamp(MARKET_START)
    assert isinstance(
        last_time, pd.Timestamp), f'类型错误,希望Timestamp,实际为{type(last_time)}'
    now = pd.Timestamp.now(tz=last_time.tz)
    assert last_time <= now, '过去时间必须小于当前时间'
    freq = freq.upper()

    # ``x + offset`` is used throughout instead of the removed
    # ``offset.apply(x)``; both give the same result.
    if freq == 'MIN':
        # Lower-case aliases are accepted by old and new pandas alike.
        return last_time.floor('min') + Minute(n=num)
    if freq == 'H':
        return last_time.floor('h') + Hour(n=num)
    if freq == 'D':
        # The hour has to be taken into account here.
        limit = last_time.floor(freq).replace(hour=num)
        if last_time < limit:
            return limit
        return (last_time.floor(freq) + Day()).replace(hour=num)
    if freq == 'B':
        midnight = last_time.normalize()
        # Working day: the hour has to be taken into account.
        if last_time.weekday() in range(0, 5):
            limit = midnight.replace(hour=num)
            if last_time < limit:
                return limit
        return (midnight + BDay()).replace(hour=num)
    if freq == 'W':
        nw = last_time.normalize() + pd.Timedelta(weeks=1)
        if is_end:
            # Last nanosecond of next week.
            return nw + pd.Timedelta(days=7 - nw.weekday()) - pd.Timedelta(
                nanoseconds=1)
        return nw - pd.Timedelta(days=nw.weekday())
    if freq == 'M':
        if is_end:
            res = last_time.normalize() + MonthEnd(n=2)
            if last_time.is_month_end:
                # Starting on a month end, two steps overshoot by one
                # month.  The previous ``rollback`` was a no-op here because
                # ``res`` is always a month end; step back explicitly, as
                # the quarter and year branches do.
                res = res + MonthEnd(n=-1)
            return res
        return last_time.normalize() + MonthBegin()
    if freq == 'Q':
        if is_end:
            res = last_time + QuarterEnd(n=2, startingMonth=3, normalize=True)
            if last_time.is_quarter_end:
                res = res + QuarterEnd(n=-1, startingMonth=3, normalize=True)
            return res
        return last_time + QuarterBegin(n=1, normalize=True, startingMonth=1)
    if freq == 'Y':
        if last_time.year == now.year:
            if is_end:
                return last_time.normalize().replace(year=now.year, month=12,
                                                     day=31)
            return last_time.normalize().replace(year=now.year, month=1,
                                                 day=1)
        if is_end:
            res = last_time + YearEnd(normalize=True, month=12, n=2)
            if last_time.is_year_end:
                res = res + YearEnd(n=-1, month=12, normalize=True)
            return res
        return last_time + YearBegin(normalize=True, month=1, n=1)
    raise ValueError('不能识别的周期类型,仅接受{}。实际输入为{}'.format(
        valid_freq, freq))
def create_data():
    """ create the pickle/msgpack data

    Returns a dict of named fixture groups (``series``, ``frame``,
    ``index``, ``scalars``, ``mi``, ``sp_series``, ``sp_frame``, ``cat``,
    ``timestamp``, ``offsets``), each mapping a label to a pandas object.
    """
    # Raw columns reused by several Series / DataFrame fixtures below.
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = dict(timestamp=Timestamp("20130101"), period=Period("2012", "M"))

    index = dict(
        int=Index(np.arange(10)),
        date=date_range("20130101", periods=10),
        period=period_range("2013-01-01", freq="M", periods=10),
        float=Index(np.arange(10, dtype=np.float64)),
        uint=Index(np.arange(10, dtype=np.uint64)),
        timedelta=timedelta_range("00:00:00", freq="30T", periods=10),
    )

    index["range"] = RangeIndex(10)

    # The interval index is only added from version 0.21 on.
    if _loose_version >= LooseVersion("0.21"):
        from pandas import interval_range
        index["interval"] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(
            zip(*[
                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                ["one", "two", "one", "two", "one", "two", "one", "two"],
            ])),
        names=["first", "second"],
    ))

    series = dict(
        float=Series(data["A"]),
        int=Series(data["B"]),
        mixed=Series(data["E"]),
        ts=Series(np.arange(10).astype(np.int64),
                  index=date_range("20130101", periods=10)),
        mi=Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(tuple(
                zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                names=["one", "two"]),
        ),
        dup=Series(np.arange(5).astype(np.float64),
                   index=["A", "B", "C", "D", "A"]),
        cat=Series(Categorical(["foo", "bar", "baz"])),
        dt=Series(date_range("20130101", periods=5)),
        dt_tz=Series(date_range("20130101", periods=5, tz="US/Eastern")),
        period=Series([Period("2000Q1")] * 5),
    )

    # Frame with a duplicated column label ("A" appears twice).
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = dict(
        float=DataFrame({
            "A": series["float"],
            "B": series["float"] + 1
        }),
        int=DataFrame({
            "A": series["int"],
            "B": series["int"] + 1
        }),
        mixed=DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        mi=DataFrame(
            {
                "A": np.arange(5).astype(np.float64),
                "B": np.arange(5).astype(np.int64)
            },
            index=MultiIndex.from_tuples(
                tuple(
                    zip(*[
                        ["bar", "bar", "baz", "baz", "baz"],
                        ["one", "two", "one", "two", "three"],
                    ])),
                names=["first", "second"],
            ),
        ),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=["A", "B", "A"]),
        cat_onecol=DataFrame({"A": Categorical(["foo", "bar"])}),
        cat_and_float=DataFrame({
            "A": Categorical(["foo", "bar", "baz"]),
            "B": np.arange(3).astype(np.int64),
        }),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        dt_mixed2_tzs=DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    )

    cat = dict(
        int8=Categorical(list("abcdefg")),
        int16=Categorical(np.arange(1000)),
        int32=Categorical(np.arange(10000)),
    )

    timestamp = dict(
        normal=Timestamp("2011-01-01"),
        nat=NaT,
        tz=Timestamp("2011-01-01", tz="US/Eastern"),
    )

    timestamp["freq"] = Timestamp("2011-01-01", freq="D")
    timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M")

    # One instance per offset class, several with non-default arguments.
    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return dict(
        series=series,
        frame=frame,
        index=index,
        scalars=scalars,
        mi=mi,
        sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()),
        sp_frame=dict(float=_create_sp_frame()),
        cat=cat,
        timestamp=timestamp,
        offsets=off,
    )
vectorizer, topic_model = train_lda_model(df_reports, train_last, n_dims) pickle.dump((vectorizer, topic_model), open(pickle_name, "wb")) # loading reports for training set df_reports_train = df_reports.loc[train_first:train_last] train_range = df_reports_train.index train_x = [] train_y = [] mean_sims = [] if model_train_sample == "whole": for date in train_range: returns_stop = date + QuarterEnd(startingMonth=3, n=time_horizon_quarters) print("training for period:") print(date + pd.DateOffset(days=1)) print(returns_stop) # loading reports and returns for period, finding the companies in which both datapoints exist reports = get_reports_for_date(df_reports, date) returns = get_returns_for_period(df_returns, date + pd.DateOffset(days=1), returns_stop) returns, reports = find_column_intersection([returns, reports]) # covariance and correlation matrix for period cov = predict_cov_sample(returns) cor = predict_cov_sample(returns, True)