def _writer_massive(self, s_date, e_date):
    """Fetch block-trade (大宗交易) records in [s_date, e_date] and persist them.

    The upstream API is paginated; each page is parsed, columns renamed via
    MassiveFields, rows filtered against the deadline already stored in
    sqlite, and written to the 'massive' table.  The date window should
    ideally be at most one month.  Missing values arrive as '-' and are
    replaced with 0.0.

    :param s_date: range start, passed straight into the URL template
    :param e_date: range end, passed straight into the URL template
    """
    deadline = self._retrieve_deadlines_from_sqlite('massive')
    print('massive deadline', deadline)
    page = 1
    pages = 1
    while page <= pages:
        url = ASSET_FUNDAMENTAL_URL['massive'].format(page=page, start=s_date, end=e_date)
        data = self._arbitrary_parser(url, encoding='utf-8')
        try:
            frame = pd.DataFrame(data['data'])
            frame.rename(columns=MassiveFields, inplace=True)
            frame['declared_date'] = frame['declared_date'].apply(
                lambda x: pd.Timestamp(x).strftime('%Y-%m-%d'))
            frame.replace('-', 0.0, inplace=True)
            frame.dropna(axis=0, how='all', inplace=True)
            # keep only rows newer than what sqlite already holds
            massive = frame[frame['declared_date'] > deadline.max()] \
                if not deadline.empty else frame
            if massive.empty:
                break
            print('massive', massive.head())
            db.writer('massive', massive)
            page = page + 1
            print('present massive page', page)
            pages = data['pages']
            print('massive pages', pages)
            time.sleep(np.random.randint(0, 3))
        except Exception as e:
            # BUGFIX: the original handler only printed and looped again
            # without advancing `page`, so a persistent parse error spun
            # forever with no sleep; abort the pagination instead.
            print('error', e)
            break
def calculate_mcap(self):
    """Compute daily market capitalization per security and persist it.

    A single change date can carry multiple ownership records; the record
    with the largest share count wins (sort by 'general' descending, keep
    the first per date).
    """
    ownership = self._retrieve_ownership()
    # dict keys are already unique -- wrapping in set() was redundant
    for sid in ownership:
        print('sid', sid)
        owner = ownership[sid]
        owner.sort_values(by='general', ascending=False, inplace=True)
        owner.drop_duplicates(subset='date', keep='first', inplace=True)
        owner.set_index('date', inplace=True)
        close = self._retrieve_array(sid)
        print('close', close)
        if close.empty:
            print('%s close is empty' % sid)
        else:
            re_owner = owner.reindex(index=close.index)
            re_owner.sort_index(inplace=True)
            # fillna(method='bfill'/'ffill') is deprecated (removed in
            # pandas 3.0); .bfill()/.ffill() are the exact equivalents
            re_owner = re_owner.bfill().ffill()
            # on daily updates the reindex can yield all-NaN rows; fall back
            # to the first ownership record.  .iloc[0] preserves the original
            # positional [0] access without the deprecated label/position
            # ambiguity (NOTE(review): "first" follows the 'general'-descending
            # sort order, not necessarily the most recent date -- confirm)
            re_owner = re_owner.fillna({'float': owner['float'].iloc[0],
                                        'general': owner['general'].iloc[0]})
            print('adjust owner', re_owner)
            mcap = re_owner.apply(lambda x: x * close)
            mcap.loc[:, 'trade_dt'] = mcap.index
            mcap.loc[:, 'sid'] = sid
            # strict cap = total shares minus floating shares, priced
            mcap.loc[:, 'strict'] = mcap['general'] - mcap['float']
            mcap.rename(columns=RENAME_COLUMNS, inplace=True)
            print('mcap', mcap)
            db.writer('m_cap', mcap)
def _writer_holder(self, *args):
    """Fetch shareholder increase / decrease / change records and persist them.

    The endpoint returns JS-ish text rather than JSON, so the page count and
    the data array are pulled out with regexes before json.loads.
    """
    deadline = self._retrieve_deadlines_from_sqlite('holder')
    print('holder deadline', deadline)
    page = 1
    pages = 1
    while page <= pages:
        url = ASSET_FUNDAMENTAL_URL['holder'] % page
        text = self._arbitrary_parser(url, direct=False)
        try:
            # raw strings avoid invalid-escape SyntaxWarnings; (\d+) captures
            # the whole page count instead of the repeated one-digit group
            match = re.search(r'pages:(\d+)', text)
            pages = int(match.group(1))
            print('holder pages', pages)
            match = re.search(r'\[(.*.)\]', text)
            data = json.loads(match.group())
            # drop the trailing field of each comma-joined record
            data = [item.split(',')[:-1] for item in data]
            frame = pd.DataFrame(data, columns=HolderFields)
            frame.loc[:, 'sid'] = frame['代码']
            # '' -> 0.0
            frame.replace(to_replace='', value=0.0, inplace=True)
            holdings = frame[frame['declared_date'] > deadline.max()] \
                if not deadline.empty else frame
            if holdings.empty:
                break
            print('holding', holdings.head())
            db.writer('holder', holdings)
            page = page + 1
            print('present holder page', page)
            time.sleep(np.random.randint(0, 3))
        except Exception as e:
            # BUGFIX: previously a persistent error retried the same page
            # forever because `page` was never advanced; abort instead.
            print('error', e)
            break
def _parse_equity_divdend(self, content, sid):
    """Parse dividend / bonus-share (分红配股) rows for one security and persist them.

    :param content: parsed HTML document (BeautifulSoup-like) containing
                    the table with id 'sharebonus_1'
    :param sid: security identifier stamped onto every row
    """
    table = content.find('table', {'id': 'sharebonus_1'})
    # build directly -- a comprehension run only for .append side effects
    # is an anti-pattern
    text = [item.get_text() for item in table.tbody.findAll('tr')]
    if len(text) == 1 and text[0] == '暂时没有数据!':
        print('------------code : %s has not splits and divdend' % sid, text[0])
    else:
        # each row is newline-joined cells; strip the first and last two
        sep_text = [item.split('\n')[1:-2] for item in text]
        frame = pd.DataFrame(sep_text,
                             columns=['declared_date', 'sid_bonus', 'sid_transfer',
                                      'bonus', 'progress', 'pay_date', 'ex_date',
                                      'effective_date'])
        frame.loc[:, 'sid'] = sid
        # incremental filter keyed on ex_date (not declared_date)
        ex_deadline = self.deadlines['equity_splits'].get(sid, None)
        divdends = frame[frame['ex_date'] > ex_deadline] if ex_deadline else frame
        db.writer('equity_splits', divdends)
def _parse_equity_rights(self, content, symbol):
    """Parse rights-issue (配股) rows for one security and persist them.

    :param content: parsed HTML document (BeautifulSoup-like) containing
                    the table with id 'sharebonus_2'
    :param symbol: security identifier stamped onto every row
    """
    table = content.find('table', {'id': 'sharebonus_2'})
    # build directly -- a comprehension run only for .append side effects
    # is an anti-pattern
    text = [item.get_text() for item in table.tbody.findAll('tr')]
    if len(text) == 1 and text[0] == '暂时没有数据!':
        print('------------code : %s has not 配股' % symbol, text[0])
    else:
        # each row is newline-joined cells; strip the first and last two
        sep_text = [item.split('\n')[1:-2] for item in text]
        frame = pd.DataFrame(sep_text,
                             columns=['declared_date', 'rights_bonus', 'rights_price',
                                      'benchmark_share', 'pay_date', 'ex_date',
                                      '缴款起始日', '缴款终止日', 'effective_date',
                                      '募集资金合计'])
        frame.loc[:, 'sid'] = symbol
        # incremental filter keyed on ex_date (not declared_date)
        ex_deadline = self.deadlines['equity_rights'].get(symbol, None)
        rights = frame[frame['ex_date'] > ex_deadline] if ex_deadline else frame
        db.writer('equity_rights', rights)
def _parse_equity_ownership(self, content, symbol):
    """Parse the ownership-structure (股权结构) tables for one security and persist them.

    :param content: parsed HTML document (BeautifulSoup-like)
    :param symbol: security identifier stamped onto every row
    """
    tbody = content.findAll('tbody')
    if len(tbody) == 0:
        # NOTE(review): with no tbody the downstream scalar assignment on an
        # empty frame fails, as this message records -- original behavior kept
        print('cannot set a frame with no defined index and a scalar when tbody is null')
    # DataFrame.append was removed in pandas 2.0 -- collect then concat once
    parts = [parse_content_from_header(th) for th in tbody]
    frame = pd.concat(parts) if parts else pd.DataFrame()
    # rename columns
    frame.rename(columns=OwnershipFields, inplace=True)
    frame.loc[:, 'sid'] = symbol
    frame.index = range(len(frame))
    ex_deadline = self.deadlines.get(symbol, None)
    print('ex_deadline', ex_deadline)
    equity = frame[frame['ex_date'] > ex_deadline] if ex_deadline else frame
    db.writer('ownership', equity)
def _crawler(self, mapping, tbl, pct=False):
    """Download kline rows for one asset and persist only the increment.

    :param mapping: dict with at least 'sid' and 'request_sid' keys
    :param tbl: destination table name, also the URL-template key
    :param pct: when True the kline payload carries an extra 'pct' column
    """
    sid = mapping['sid']
    url = ASSETS_BUNDLES_URL[tbl].format(mapping['request_sid'], self.lmt)
    obj = _parse_url(url, bs=False)
    kline = json.loads(obj)['data']
    cols = self.default + ['pct'] if pct else self.default
    if kline and len(kline['klines']):
        frame = pd.DataFrame([item.split(',') for item in kline['klines']],
                             columns=cols)
        frame.loc[:, 'sid'] = sid
        # a missing cache entry means the sid came to market today -- catch
        # exactly that (KeyError) instead of the original blanket Exception
        try:
            deadline = self._cache_deadlines[tbl][sid]
        except KeyError as e:
            print('error :%s raise from sid come to market today' % e)
            deadline = None
        frame = frame[frame['trade_dt'] > deadline] if deadline else frame
        db.writer(tbl, frame)
def _writer_release(self, s_date, e_date):
    """Fetch A-share lock-up release (解禁) data in [s_date, e_date] and persist it.

    The paginated payload maps gpdm/ltsj/xsglx/zb onto sid / declared_date
    (the release date) / release_type / zb before the incremental filter.

    :param s_date: range start, passed straight into the URL template
    :param e_date: range end, passed straight into the URL template
    """
    deadline = self._retrieve_deadlines_from_sqlite('unfreeze')
    print('release deadline', deadline)
    page = 1
    pages = 1
    while page <= pages:
        url = ASSET_FUNDAMENTAL_URL['release'].format(page=page, start=s_date, end=e_date)
        text = self._arbitrary_parser(url, encoding='utf-8')
        try:
            info = text['data']
            data = [[item['gpdm'], item['ltsj'], item['xsglx'], item['zb']]
                    for item in info]
            # release_date --- declared_date
            frame = pd.DataFrame(
                data, columns=['sid', 'declared_date', 'release_type', 'zb'])
            frame['declared_date'] = frame['declared_date'].apply(
                lambda x: pd.Timestamp(x).strftime('%Y-%m-%d'))
            frame.dropna(axis=0, how='all', inplace=True)
            release = frame[frame['declared_date'] > deadline.max()] \
                if not deadline.empty else frame
            if release.empty:
                break
            print('release', release.head())
            release.replace('-', 0.0, inplace=True)
            db.writer('unfreeze', release)
            page = page + 1
            print('present release page', page)
            pages = text['pages']
            print('release pages', pages)
            time.sleep(np.random.randint(0, 3))
        except Exception as e:
            # BUGFIX: previously a persistent error retried the same page
            # forever because `page` was never advanced; abort instead.
            print('error', e)
            break
def _writer_margin(self, *args):
    """Fetch market-wide margin trading / securities lending (融资融券) totals.

    rzye / rqye arrive in yuan and are scaled to 1e8 (亿) before writing.
    """
    deadline = self._retrieve_deadlines_from_sqlite('margin')
    print('margin deadline', deadline)
    page = 1
    pages = 1
    while page <= pages:
        req_url = ASSET_FUNDAMENTAL_URL['margin'] % page
        text = self._arbitrary_parser(req_url)
        try:
            raw = [[item['DIM_DATE'], item['RZYE'], item['RZYEZB'], item['RQYE']]
                   for item in text['result']['data']]
            frame = pd.DataFrame(
                raw, columns=['declared_date', 'rzye', 'rzyezb', 'rqye'])
            frame['declared_date'] = frame['declared_date'].apply(
                lambda x: pd.Timestamp(x).strftime('%Y-%m-%d'))
            frame.loc[:, ['rzye', 'rqye']] = \
                frame.loc[:, ['rzye', 'rqye']].div(1e8)
            frame.fillna(0.0, inplace=True)
            # NOTE(review): scalar truthiness here differs from the
            # .empty/.max() pattern used for massive/holder/unfreeze --
            # confirm this deadline really is a scalar, not a Series
            margin = frame[frame['declared_date'] > deadline] \
                if deadline else frame
            print('margin', margin.head())
            if margin.empty:
                break
            db.writer('margin', margin)
            page = page + 1
            print('present margin page', page)
            pages = text['result']['pages']
            print('margin pages', pages)
            time.sleep(np.random.randint(0, 3))
        except Exception as e:
            # BUGFIX: previously a persistent error retried the same page
            # forever because `page` was never advanced; abort instead.
            print('error', e)
            break
def _write_df_to_table(self, tbl, df, include=True):
    """Persist *df* into table *tbl*, optionally registering its assets first.

    :param tbl: destination table name
    :param df: frame to persist (a copy is written, never the original)
    :param include: when True, also route the frame through _write_assets
    """
    # operate on a copy so downstream writers never mutate the caller's frame
    snapshot = df.copy()
    if include:
        self._write_assets(snapshot)
    db.writer(tbl, snapshot)
@staticmethod
def _write_assets(frame):
    """Register *frame* in the asset_router table.

    BUGFIX: the method has no *self* parameter yet is invoked as
    ``self._write_assets(df)``; without @staticmethod that call would bind
    the instance to *frame* and pass *df* as an unexpected extra argument.

    :param frame: frame whose columns are projected onto _rename_router_cols
                  (missing columns filled with '')
    """
    renamed_frame = frame.reindex(columns=_rename_router_cols, fill_value='')
    db.writer('asset_router', renamed_frame)