예제 #1
0
 def _writer_massive(self, s_date, e_date):
     """
     Fetch block-trade ('massive') records for a date window and store new rows.

     Every page of ``ASSET_FUNDAMENTAL_URL['massive']`` is requested for the
     window ``s_date``..``e_date`` (the original note recommends keeping the
     window within one month). Rows whose ``declared_date`` is greater than
     the maximum of the deadlines returned by
     ``_retrieve_deadlines_from_sqlite('massive')`` are written to the
     ``massive`` table; when no deadline exists every row is written.
     Paging ends when a page contributes nothing new, when the reported page
     count is exhausted, or when requests fail several times in a row.

     :param s_date: start of the window, substituted into the request URL.
     :param e_date: end of the window, substituted into the request URL.
     """
     # Consecutive failures tolerated before giving up. Without this cap a
     # page that always failed was re-requested forever.
     max_failures = 3
     failures = 0
     deadline = self._retrieve_deadlines_from_sqlite('massive')
     print('massive deadline', deadline)
     page = 1
     pages = 1
     while page <= pages:
         url = ASSET_FUNDAMENTAL_URL['massive'].format(page=page,
                                                       start=s_date,
                                                       end=e_date)
         data = self._arbitrary_parser(url, encoding='utf-8')
         try:
             frame = pd.DataFrame(data['data'])
             frame.rename(columns=MassiveFields, inplace=True)
             # Normalise the dates to 'YYYY-MM-DD' strings before comparing
             # them with the stored deadline.
             frame['declared_date'] = frame['declared_date'].apply(
                 lambda x: pd.Timestamp(x).strftime('%Y-%m-%d'))
             # NOTE(review): the original docstring names '—' as the
             # missing-value marker while this replaces '-'; confirm which
             # one the feed actually uses.
             frame.replace('-', 0.0, inplace=True)
             frame.dropna(axis=0, how='all', inplace=True)
             massive = frame[frame['declared_date'] > deadline.max(
             )] if not deadline.empty else frame
             if massive.empty:
                 # Nothing newer than what is already stored: stop paging.
                 break
             print('massive', massive.head())
             db.writer('massive', massive)
             page = page + 1
             print('present massive page', page)
             pages = data['pages']
             print('massive pages', pages)
             failures = 0
             # Random pause between requests.
             time.sleep(np.random.randint(0, 3))
         except Exception as e:
             print('error', e)
             failures += 1
             if failures >= max_failures:
                 break
             # Pause before trying again.
             time.sleep(np.random.randint(0, 3))
예제 #2
0
 def calculate_mcap(self):
     """
     Compute per-asset market capitalisation and write it to ``m_cap``.

     For every asset in ``self._retrieve_ownership()`` the ownership records
     are de-duplicated per ``date`` (several records can share one change
     date; the one with the largest ``general`` value is kept), aligned to
     the index of the close data from ``self._retrieve_array(sid)``,
     gap-filled and multiplied by the close. Assets whose close data is
     empty are reported and skipped.
     """
     ownership = self._retrieve_ownership()
     for sid in set(ownership):
         print('sid', sid)
         owner = ownership[sid]
         # Largest 'general' first, so drop_duplicates keeps that record for
         # each date.
         owner.sort_values(by='general', ascending=False, inplace=True)
         owner.drop_duplicates(subset='date', keep='first', inplace=True)
         owner.set_index('date', inplace=True)
         close = self._retrieve_array(sid)
         print('close', close)
         if close.empty:
             print('%s close is empty' % sid)
             continue
         re_owner = owner.reindex(index=close.index).sort_index()
         # bfill()/ffill() are the non-deprecated spelling of
         # fillna(method='bfill') / fillna(method='ffill').
         re_owner = re_owner.bfill().ffill()
         # During a daily update the reindexed frame can be entirely NaN; fall
         # back to the first row of ``owner``. ``.iloc[0]`` makes the
         # positional access explicit (integer keys on a labelled Series are
         # deprecated).
         # NOTE(review): ``owner`` is ordered by 'general' descending here, so
         # this is the record with the largest 'general', while the original
         # comment speaks of filling from the most recent date - confirm.
         re_owner = re_owner.fillna({'float': owner['float'].iloc[0],
                                     'general': owner['general'].iloc[0]})
         print('adjust owner', re_owner)
         mcap = re_owner.apply(lambda x: x * close)
         mcap.loc[:, 'trade_dt'] = mcap.index
         mcap.loc[:, 'sid'] = sid
         mcap.loc[:, 'strict'] = mcap['general'] - mcap['float']
         mcap.rename(columns=RENAME_COLUMNS, inplace=True)
         print('mcap', mcap)
         db.writer('m_cap', mcap)
예제 #3
0
 def _writer_holder(self, *args):
     """
     Fetch shareholder increase/decrease records page by page and store new rows.

     Each page of ``ASSET_FUNDAMENTAL_URL['holder']`` is downloaded as text.
     The page count is read from its ``pages:<n>`` marker and the records
     from the bracketed JSON array it contains. Rows whose ``declared_date``
     is greater than the maximum deadline returned by
     ``_retrieve_deadlines_from_sqlite('holder')`` are written to the
     ``holder`` table; when no deadline exists every row is written. Paging
     ends when a page contributes nothing new, when the page count is
     exhausted, or when requests fail several times in a row.

     :param args: accepted and ignored.
     """
     # Consecutive failures tolerated before giving up. Without this cap a
     # page that always failed was re-requested forever.
     max_failures = 3
     failures = 0
     deadline = self._retrieve_deadlines_from_sqlite('holder')
     print('holder deadline', deadline)
     page = 1
     pages = 1
     while page <= pages:
         url = ASSET_FUNDAMENTAL_URL['holder'] % page
         text = self._arbitrary_parser(url, direct=False)
         try:
             # Raw strings avoid invalid escape sequences; requiring at least
             # one digit means a bare 'pages:' is reported as "no match"
             # instead of failing later in int('').
             pages = int(re.search(r'pages:(\d+)', text).group(1))
             print('holder pages', pages)
             match = re.search(r'\[(.*.)\]', text)
             data = json.loads(match.group())
             # Every record is one comma-separated string; the last field is
             # dropped.
             data = [item.split(',')[:-1] for item in data]
             frame = pd.DataFrame(data, columns=HolderFields)
             frame.loc[:, 'sid'] = frame['代码']
             # Empty strings become 0.0.
             frame.replace(to_replace='', value=0.0, inplace=True)
             holdings = frame[frame['declared_date'] > deadline.max(
             )] if not deadline.empty else frame
             if holdings.empty:
                 # Nothing newer than what is already stored: stop paging.
                 break
             print('holding', holdings.head())
             db.writer('holder', holdings)
             page = page + 1
             print('present holder page', page)
             failures = 0
             # Random pause between requests.
             time.sleep(np.random.randint(0, 3))
         except Exception as e:
             print('error', e)
             failures += 1
             if failures >= max_failures:
                 break
             # Pause before trying again.
             time.sleep(np.random.randint(0, 3))
예제 #4
0
 def _parse_equity_divdend(self, content, sid):
     """获取分红配股数据"""
     text = list()
     table = content.find('table', {'id': 'sharebonus_1'})
     [text.append(item.get_text()) for item in table.tbody.findAll('tr')]
     if len(text) == 1 and text[0] == '暂时没有数据!':
         print('------------code : %s has not splits and divdend' % sid, text[0])
     else:
         sep_text = [item.split('\n')[1:-2] for item in text]
         frame = pd.DataFrame(sep_text, columns=['declared_date', 'sid_bonus', 'sid_transfer', 'bonus',
                                                 'progress', 'pay_date', 'ex_date', 'effective_date'])
         frame.loc[:, 'sid'] = sid
         # deadline = self.deadlines['equity_splits'].get(sid, None)
         # divdends = frame[frame['declared_date'] > deadline] if deadline else frame
         ex_deadline = self.deadlines['equity_splits'].get(sid, None)
         divdends = frame[frame['ex_date'] > ex_deadline] if ex_deadline else frame
         db.writer('equity_splits', divdends)
예제 #5
0
 def _parse_equity_rights(self, content, symbol):
     """配股"""
     text = list()
     table = content.find('table', {'id': 'sharebonus_2'})
     [text.append(item.get_text()) for item in table.tbody.findAll('tr')]
     if len(text) == 1 and text[0] == '暂时没有数据!':
         print('------------code : %s has not 配股' % symbol, text[0])
     else:
         sep_text = [item.split('\n')[1:-2] for item in text]
         frame = pd.DataFrame(sep_text, columns=['declared_date', 'rights_bonus', 'rights_price',
                                                 'benchmark_share', 'pay_date', 'ex_date',
                                                 '缴款起始日', '缴款终止日', 'effective_date', '募集资金合计'])
         frame.loc[:, 'sid'] = symbol
         # deadline = self.deadlines['equity_rights'].get(symbol, None)
         # rights = frame[frame['declared_date'] > deadline] if deadline else frame
         ex_deadline = self.deadlines['equity_rights'].get(symbol, None)
         rights = frame[frame['ex_date'] > ex_deadline] if ex_deadline else frame
         db.writer('equity_rights', rights)
예제 #6
0
 def _parse_equity_ownership(self, content, symbol):
     """获取股票股权结构分布"""
     frame = pd.DataFrame()
     tbody = content.findAll('tbody')
     if len(tbody) == 0:
         print('cannot set a frame with no defined index and a scalar when tbody is null')
     for th in tbody:
         formatted = parse_content_from_header(th)
         frame = frame.append(formatted)
     # rename columns
     frame.rename(columns=OwnershipFields, inplace=True)
     # 调整
     frame.loc[:, 'sid'] = symbol
     frame.index = range(len(frame))
     ex_deadline = self.deadlines.get(symbol, None)
     print('ex_deadline', ex_deadline)
     equity = frame[frame['ex_date'] > ex_deadline] if ex_deadline else frame
     db.writer('ownership', equity)
예제 #7
0
 def _crawler(self, mapping, tbl, pct=False):
     """
     Download the k-line rows of one asset and write them to ``tbl``.

     The URL is built from ``ASSETS_BUNDLES_URL[tbl]``,
     ``mapping['request_sid']`` and ``self.lmt``. Each k-line entry is a
     comma-separated string that becomes one row. Rows are limited to those
     with ``trade_dt`` greater than the cached deadline for this table and
     asset when such a deadline exists. Nothing is written when the response
     carries no k-lines.

     :param mapping: provides ``'sid'`` and ``'request_sid'``.
     :param tbl: key into ``ASSETS_BUNDLES_URL`` and target table name.
     :param pct: when true, a ``'pct'`` column is appended to ``self.default``.
     """
     sid = mapping['sid']
     request_url = ASSETS_BUNDLES_URL[tbl].format(mapping['request_sid'], self.lmt)
     raw = _parse_url(request_url, bs=False)
     kline = json.loads(raw)['data']
     if pct:
         columns = self.default + ['pct']
     else:
         columns = self.default
     if not (kline and len(kline['klines'])):
         return
     rows = [entry.split(',') for entry in kline['klines']]
     frame = pd.DataFrame(rows, columns=columns)
     frame.loc[:, 'sid'] = sid
     # A missing deadline lookup is reported and treated as "no deadline".
     try:
         deadline = self._cache_deadlines[tbl][sid]
     except Exception as e:
         print('error :%s raise from sid come to market today' % e)
         deadline = None
     if deadline:
         frame = frame[frame['trade_dt'] > deadline]
     db.writer(tbl, frame)
예제 #8
0
 def _writer_release(self, s_date, e_date):
     """
     Fetch A-share lock-up release records for a date window and store new rows.

     Every page of ``ASSET_FUNDAMENTAL_URL['release']`` is requested for the
     window ``s_date``..``e_date``. The fields ``gpdm``, ``ltsj``, ``xsglx``
     and ``zb`` of each record become the columns ``sid``, ``declared_date``,
     ``release_type`` and ``zb``. Rows whose ``declared_date`` is greater than
     the maximum deadline returned by
     ``_retrieve_deadlines_from_sqlite('unfreeze')`` are written to the
     ``unfreeze`` table; when no deadline exists every row is written.
     Paging ends when a page contributes nothing new, when the reported page
     count is exhausted, or when requests fail several times in a row.

     :param s_date: start of the window, substituted into the request URL.
     :param e_date: end of the window, substituted into the request URL.
     """
     # Consecutive failures tolerated before giving up. Without this cap a
     # page that always failed was re-requested forever.
     max_failures = 3
     failures = 0
     deadline = self._retrieve_deadlines_from_sqlite('unfreeze')
     print('release deadline', deadline)
     page = 1
     pages = 1
     while page <= pages:
         url = ASSET_FUNDAMENTAL_URL['release'].format(page=page,
                                                       start=s_date,
                                                       end=e_date)
         text = self._arbitrary_parser(url, encoding='utf-8')
         try:
             info = text['data']
             data = [[
                 item['gpdm'], item['ltsj'], item['xsglx'], item['zb']
             ] for item in info]
             # The release date ('ltsj') is stored as declared_date.
             frame = pd.DataFrame(
                 data,
                 columns=['sid', 'declared_date', 'release_type', 'zb'])
             frame['declared_date'] = frame['declared_date'].apply(
                 lambda x: pd.Timestamp(x).strftime('%Y-%m-%d'))
             frame.dropna(axis=0, how='all', inplace=True)
             release = frame[frame['declared_date'] > deadline.max(
             )] if not deadline.empty else frame
             if release.empty:
                 # Nothing newer than what is already stored: stop paging.
                 break
             print('release', release.head())
             # Rebind instead of replacing in place: ``release`` may be a
             # filtered slice of ``frame``.
             release = release.replace('-', 0.0)
             db.writer('unfreeze', release)
             page = page + 1
             print('present release page', page)
             pages = text['pages']
             print('release pages', pages)
             failures = 0
             # Random pause between requests.
             time.sleep(np.random.randint(0, 3))
         except Exception as e:
             print('error', e)
             failures += 1
             if failures >= max_failures:
                 break
             # Pause before trying again.
             time.sleep(np.random.randint(0, 3))
예제 #9
0
 def _writer_margin(self, *args):
     """
     Fetch market-wide margin financing / securities lending data and store new rows.

     Each page of ``ASSET_FUNDAMENTAL_URL['margin']`` contributes the fields
     ``DIM_DATE``, ``RZYE``, ``RZYEZB`` and ``RQYE`` as the columns
     ``declared_date``, ``rzye``, ``rzyezb`` and ``rqye``; ``rzye`` and
     ``rqye`` are divided by 1e8 and missing values become 0.0. Rows whose
     ``declared_date`` is greater than the maximum deadline returned by
     ``_retrieve_deadlines_from_sqlite('margin')`` are written to the
     ``margin`` table; when no deadline exists every row is written. Paging
     ends when a page contributes nothing new, when the reported page count
     is exhausted, or when requests fail several times in a row.

     :param args: accepted and ignored.
     """
     # Consecutive failures tolerated before giving up. Without this cap a
     # page that always failed was re-requested forever.
     max_failures = 3
     failures = 0
     deadline = self._retrieve_deadlines_from_sqlite('margin')
     print('margin deadline', deadline)
     page = 1
     pages = 1
     while page <= pages:
         req_url = ASSET_FUNDAMENTAL_URL['margin'] % page
         text = self._arbitrary_parser(req_url)
         try:
             raw = [[
                 item['DIM_DATE'], item['RZYE'], item['RZYEZB'],
                 item['RQYE']
             ] for item in text['result']['data']]
             frame = pd.DataFrame(
                 raw, columns=['declared_date', 'rzye', 'rzyezb', 'rqye'])
             frame['declared_date'] = frame['declared_date'].apply(
                 lambda x: pd.Timestamp(x).strftime('%Y-%m-%d'))
             frame.loc[:, ['rzye', 'rqye']] = frame.loc[
                 :, ['rzye', 'rqye']].div(1e8)
             frame.fillna(0.0, inplace=True)
             # Use the same deadline test as the other writers: the deadline
             # object is a pandas container there (``.empty`` / ``.max()``),
             # for which ``if deadline`` raises and ``> deadline`` does not
             # compare against a single cut-off.
             # NOTE(review): assumes the 'margin' deadline has the same type
             # as the other tables' deadlines - confirm.
             margin = frame[frame['declared_date'] > deadline.max(
             )] if not deadline.empty else frame
             print('marign', margin.head())
             if margin.empty:
                 # Nothing newer than what is already stored: stop paging.
                 break
             db.writer('margin', margin)
             page = page + 1
             print('present margin page', page)
             pages = text['result']['pages']
             print('margin pages', pages)
             failures = 0
             # Random pause between requests.
             time.sleep(np.random.randint(0, 3))
         except Exception as e:
             print('error', e)
             failures += 1
             if failures >= max_failures:
                 break
             # Pause before trying again.
             time.sleep(np.random.randint(0, 3))
예제 #10
0
 def _write_df_to_table(self, tbl, df, include=True):
     """
     Write a copy of ``df`` to table ``tbl``.

     :param tbl: target table name passed to ``db.writer``.
     :param df: frame to store; it is copied before anything else is done.
     :param include: when true, the copy is first passed to
         ``self._write_assets``.
     """
     payload = df.copy()
     if include:
         self._write_assets(payload)
     db.writer(tbl, payload)
예제 #11
0
 def _write_assets(frame):
     """
     Write the router columns of ``frame`` to the ``asset_router`` table.

     The frame is reindexed to exactly ``_rename_router_cols``; columns that
     ``frame`` lacks are filled with the empty string.

     :param frame: source frame.
     """
     router_frame = frame.reindex(
         columns=_rename_router_cols,
         fill_value='',
     )
     db.writer('asset_router', router_frame)