def _request_convertible_basics(update_mapping):
    bond_mappings = update_mapping['convertible']
    if bond_mappings:
        # bond basics for convertibles that are already listed
        text = _parse_url(ASSET_SUPPLEMENT_URL['convertible_supplement'],
                          encoding=None, bs=False)
        text = json.loads(text)
        # intersect the requested bonds with the bonds returned by the page
        common_bond = set(bond_mappings) & set(
            [basic['id'] for basic in text['rows']])
        print('common', len(common_bond))
        # merge the two dicts into a single record per bond to keep the data complete
        for basic in text['rows']:
            if basic['id'] in common_bond:
                basic['cell'].update(bond_mappings[basic['id']])
        basics = [
            basic['cell'] for basic in text['rows']
            if basic['id'] in common_bond
        ]
        basics_frame = pd.DataFrame(basics)
    else:
        basics_frame = pd.DataFrame()
    return basics_frame

def _initialize_symbols():
    raw = _parse_url(BENCHMARK_URL['symbols'], encoding='utf-8')
    data = json.loads(raw.text)
    # map index name (f14) -> index code (f12)
    index_mappings = {
        item['f14']: item['f12']
        for item in data['data']['diff']
    }
    return index_mappings

def _request_funds():
    # fetch the universe of listed funds: fixed income, structured / leveraged (A/B),
    # exchange-traded ETF and QDII-ETF
    obj = _parse_url(ASSERT_URL_MAPPING['fund'])
    raw = [data.find_all('td') for data in obj.find_all(id='tableDiv')]
    text = [t.get_text() for t in raw[0]]
    # because of the page format, the last two columns are empty
    frame = pd.DataFrame(partition_all(14, text[18:]),
                         columns=text[2:16]).iloc[:, :-2]
    return frame

def _arbitrary_parser(self, url, encoding='gbk', direct=True):
    try:
        text = _parse_url(url, encoding=encoding, bs=False)
        raw = json.loads(text) if direct else text
        return raw
    except Exception as e:
        print('error %r' % e)
        time.sleep(np.random.randint(5, 10))
        # retry and propagate the result of the recursive call
        return self._arbitrary_parser(url, encoding=encoding, direct=direct)

def lookup_index_symbols(cls):
    raw = json.loads(
        _parse_url(ASSERT_URL_MAPPING['benchmark'],
                   encoding='utf-8',
                   bs=False))
    symbols = raw['data']['diff']
    frame = pd.DataFrame(symbols.values())
    frame.set_index('f12', inplace=True)
    dct = frame.iloc[:, 0].to_dict()
    return dct

def _writer_internal(self, equities):
    for sid in equities:
        try:
            content = _parse_url(OWNERSHIP % sid)
            self._parse_equity_ownership(content, sid)
            print('successfully scraped ownership of code: %s' % sid)
        except Exception as e:
            print('scraping ownership of code %s failed due to %r' % (sid, e))
            self.missed.add(sid)
        else:
            self.missed.discard(sid)

def _request_equity_basics(code):
    url = ASSET_SUPPLEMENT_URL['equity_supplement'] % code
    obj = _parse_url(url)
    table = obj.find('table', {'id': 'comInfo1'})
    tag = [item.findAll('td') for item in table.findAll('tr')]
    tag_chain = list(chain(*tag))
    raw = [item.get_text() for item in tag_chain]
    # strip the colon and surrounding whitespace left over from the page format
    raw = [i.replace(':', '') for i in raw]
    raw = [i.strip() for i in raw]
    # pair field names with their values
    brief = list(zip(raw[::2], raw[1::2]))
    mapping = {item[0]: item[1] for item in brief}
    mapping.update({'代码': code})
    return mapping

def _request_duals():
    # fetch symbols dual-listed in both the A-share and H-share markets
    dual_mappings = {}
    page = 1
    while True:
        url = ASSERT_URL_MAPPING['dual'] % page
        raw = _parse_url(url, bs=False, encoding=None)
        raw = json.loads(raw)
        diff = raw['data']
        if diff and len(diff['diff']):
            # f12 -- HK code ; f191 -- A-share code
            diff = {item['f191']: item['f12'] for item in diff['diff']}
            dual_mappings.update(diff)
            page = page + 1
        else:
            break
    return dual_mappings

def get_current_minutes(sid):
    """
        return the current day's intraday (minute-level) tick data
    """
    _url = 'http://push2.eastmoney.com/api/qt/stock/trends2/get?fields1=f1&' \
           'fields2=f51,f52,f53,f54,f55,f56,f57,f58&iscr=0&secid={}'
    # build the request secid: Eastmoney prefixes Shanghai codes (6xxxxx) with '1.'
    # and Shenzhen codes with '0.'
    req_sid = '1.' + sid if sid.startswith('6') else '0.' + sid
    req_url = _url.format(req_sid)
    obj = _parse_url(req_url, bs=False)
    d = json.loads(obj)
    raw_array = [item.split(',') for item in d['data']['trends']]
    minutes = pd.DataFrame(raw_array,
                           columns=[
                               'ticker', 'open', 'close', 'high', 'low',
                               'volume', 'turnover', 'avg'
                           ])
    return minutes

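# A minimal usage sketch for get_current_minutes (assumptions, not from the original
# source: the Eastmoney secid convention noted above and a reachable endpoint;
# '600000' is just an illustrative SSE code):
#
#     minutes = get_current_minutes('600000')
#     print(minutes[['ticker', 'close', 'volume']].tail())
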
def suspend(cls, dt):
    """
        fetch suspension information for the given date, e.g. 2020-07-13
    """
    suspend_url = 'http://datainterface.eastmoney.com/EM_DataCenter/JS.aspx?type=FD&sty=SRB&st=0&sr=-1&p=1&ps=50&'\
                  'js={"pages":(pc),"data":[(x)]}&mkt=1&fd=%s' % dt
    text = _parse_url(suspend_url, bs=False, encoding=None)
    text = json.loads(text)
    text = [t.split(',') for t in text['data']]
    # list(partition(9, text['data']))
    frame = pd.DataFrame(text,
                         columns=[
                             'sid', 'name', 'open_ticker', 'close_ticker',
                             'suspend', 'reason', 'market', 'date',
                             'market_date'
                         ])
    print('frame', frame.iloc[0, :])
    return frame

def load_raw_arrays(self, dts, assets, fields=None):
    """fetch GDP data"""
    page = 1
    gross_value = pd.DataFrame()
    while True:
        req_url = self._url % page
        obj = _parse_url(req_url)
        raw = obj.findAll('div', {'class': 'Content'})
        text = [t.get_text() for t in raw[1].findAll('td')]
        text = [item.strip() for item in text]
        data = zip(text[::9], text[1::9])
        data = pd.DataFrame(data, columns=['季度', '总值'])
        gross_value = pd.concat([gross_value, data])
        # stop once a newly appended page only repeats rows we already have
        if len(gross_value) != len(
                gross_value.drop_duplicates(ignore_index=True)):
            gross_value.drop_duplicates(inplace=True, ignore_index=True)
            break
        page = page + 1
    return gross_value

def _crawler(self, mapping, tbl, pct=False):
    sid = mapping['sid']
    url = ASSETS_BUNDLES_URL[tbl].format(mapping['request_sid'], self.lmt)
    obj = _parse_url(url, bs=False)
    kline = json.loads(obj)['data']
    cols = self.default + ['pct'] if pct else self.default
    if kline and len(kline['klines']):
        frame = pd.DataFrame([item.split(',') for item in kline['klines']],
                             columns=cols)
        frame.loc[:, 'sid'] = sid
        # keep only the rows after the cached deadline
        try:
            deadline = self._cache_deadlines[tbl][sid]
        except Exception as e:
            print('error %r: no cached deadline for sid %s (listed today)' % (e, sid))
            deadline = None
        # frame = frame[frame['trade_dt'] > self._cache_deadlines[tbl][sid]]
        frame = frame[frame['trade_dt'] > deadline] if deadline else frame
        db.writer(tbl, frame)

def _calculate_alternative_returns(self, index_name):
    """
        dt --- 1990-01-01
    """
    try:
        index = lookup_benchmark[index_name]
    except KeyError:
        raise ValueError('unknown benchmark index: %s' % index_name)
    url = BENCHMARK_URL['periphera_kline'] % (index, '3000-01-01')
    text = _parse_url(url, bs=False, encoding='utf-8')
    raw = json.loads(text)
    kline = pd.DataFrame(
        raw['data'][index]['day'],
        columns=['trade_dt', 'open', 'close', 'high', 'low', 'turnover'])
    kline.set_index('trade_dt', inplace=True)
    kline.sort_index(inplace=True)
    kline = kline.astype('float64')
    returns = kline['close'] / kline['close'].shift(1) - 1
    daily_returns = self._compute_session_returns(returns)
    return daily_returns

def _request_convertibles():
    # fetch convertible bonds that are listed on the exchange
    page = 1
    bonds = []
    while True:
        bond_url = ASSERT_URL_MAPPING['convertible'] % page
        text = _parse_url(bond_url, encoding='utf-8', bs=False)
        text = json.loads(text)
        data = text['data']
        if data:
            bonds = chain(bonds, data)
            page = page + 1
        else:
            break
    bonds = list(bonds)
    # drop convertibles that are not yet listed; mapping of bond_id -> bond_basics
    bond_mappings = {
        bond['BONDCODE']: bond
        for bond in bonds if bond['LISTDATE'] != '-'
    }
    return bond_mappings

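# A minimal usage sketch (assumption, not from the original source: the mapping
# returned by _request_convertibles is what _request_convertible_basics expects
# under the 'convertible' key, keyed by the same bond id):
#
#     bond_mappings = _request_convertibles()
#     basics_frame = _request_convertible_basics({'convertible': bond_mappings})
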
def _calculate_returns(self, sid):
    """
        date --- 19900101
    """
    symbol = '1.' + sid if sid.startswith('0') else '0.' + sid
    url = BENCHMARK_URL['kline'].format(symbol, '30000101')
    obj = _parse_url(url, bs=False)
    data = json.loads(obj)
    raw = data['data']
    if raw and len(raw['klines']):
        raw = [item.split(',') for item in raw['klines']]
        kline = pd.DataFrame(raw,
                             columns=[
                                 'trade_dt', 'open', 'close', 'high', 'low',
                                 'turnover', 'volume', 'amount'
                             ])
        kline.set_index('trade_dt', inplace=True)
        kline.sort_index(inplace=True)
        close = kline['close'].astype('float64')
        returns = close / close.shift(1) - 1
        daily_returns = self._compute_session_returns(returns)
        return daily_returns

def _request_equities():
    # fetch the full universe of equities, including delisted ones
    raw = json.loads(_parse_url(ASSERT_URL_MAPPING['equity'], bs=False))
    equities = [item['f12'] for item in raw['data']['diff']]
    return equities

def _parser_writer(self, sid):
    contents = _parse_url(DIVDEND % sid)
    # parse the page content for rights issues and dividends
    self._parse_equity_rights(contents, sid)
    self._parse_equity_divdend(contents, sid)