def fetch_csi_index_component(self, df: pd.DataFrame, timeout: float = 30.0):
    """
    Fetch the constituent stocks of SSE / CSI indices and persist them.

    For every row of ``df`` the constituent workbook is downloaded from
    csindex.com.cn, turned into (id, stock_id, index_id) records and handed
    to ``df_to_db``. An index whose download fails is logged and skipped.

    :param df: index list; every row must provide ``code`` and ``name``
    :param timeout: seconds to wait for each HTTP response
    :return: None
    """
    query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'
    code_column = '成分券代码Constituent Code'

    for _, index in df.iterrows():
        index_code = index['code']
        url = query_url.format(index_code)

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as error:
            # RequestException is the base class of HTTPError, ConnectionError
            # and Timeout, so a single unreachable file is skipped instead of
            # aborting the whole crawl.
            self.logger.error(f'{index["name"]} - {index_code} 成分股抓取错误 ({error})')
            continue

        response_df = pd.read_excel(io.BytesIO(response.content))

        index_id = f'index_cn_{index_code}'
        # Convert every code once and reuse the result for both id columns.
        stock_ids = response_df[code_column].apply(lambda x: china_stock_code_to_id(str(x)))
        component_df = pd.DataFrame({
            'id': stock_ids.apply(lambda stock_id: f'{index_id}_{stock_id}'),
            'stock_id': stock_ids,
            'index_id': index_id,
        })

        df_to_db(data_schema=self.data_schema, df=component_df, provider=self.provider)
        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        self.sleep()
def fetch_szse_index_component(self, df: pd.DataFrame, timeout: float = 30.0):
    """
    Fetch the constituent stocks of SZSE indices and persist them.

    For every row of ``df`` the constituent report is downloaded from
    szse.cn, turned into (id, stock_id, index_id) records and handed to
    ``df_to_db``. An index whose download fails is logged and skipped.

    :param df: index list; every row must provide ``code`` and ``name``
    :param timeout: seconds to wait for each HTTP response
    :return: None
    """
    query_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'

    for _, index in df.iterrows():
        index_code = index['code']
        url = query_url.format(index_code)

        try:
            response = requests.get(url, timeout=timeout)
            # Without this check an HTTP error page would be passed to
            # read_excel and fail there with a confusing parse error.
            response.raise_for_status()
        except requests.RequestException as error:
            self.logger.error(f'{index["name"]} - {index_code} 成分股抓取错误 ({error})')
            continue

        # Read every cell as text so the codes are not parsed as numbers.
        response_df = pd.read_excel(io.BytesIO(response.content), dtype='str')

        index_id = f'index_cn_{index_code}'
        # Convert every code once and reuse the result for both id columns.
        stock_ids = response_df['证券代码'].apply(lambda x: china_stock_code_to_id(str(x)))
        component_df = pd.DataFrame({
            'id': stock_ids.apply(lambda stock_id: f'{index_id}_{stock_id}'),
            'stock_id': stock_ids,
            'index_id': index_id,
        })

        df_to_db(data_schema=self.data_schema, df=component_df, provider=self.provider)
        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        self.sleep()
def download_sh_etf_component(self, df: pd.DataFrame, timeout: float = 30.0):
    """
    Download the constituent stocks of Shanghai ETFs and persist them.

    ETF_CLASS => 1. single-market ETF 2. cross-market ETF 3. cross-border ETF
                 5. bond ETF 6. gold ETF

    Only rows whose ``ETF_CLASS`` is '1' or '2' are processed. An ETF whose
    request fails or whose response carries no constituents is logged and
    skipped.

    :param df: ETF list data
    :param timeout: seconds to wait for each HTTP response
    :return: None
    """
    query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                'isPagination=false&type={}&etfClass={}'

    etf_df = df[(df['ETF_CLASS'] == '1') | (df['ETF_CLASS'] == '2')]
    etf_df = self.populate_sh_etf_type(etf_df)

    for _, etf in etf_df.iterrows():
        etf_code = etf['FUND_ID']
        url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])

        try:
            response = requests.get(url, headers=DEFAULT_SH_ETF_LIST_HEADER, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as error:
            self.logger.error(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取错误 ({error})')
            continue

        response_dict = demjson.decode(response.text)
        records = response_dict.get('result') or []
        if not records:
            # A frame built from an empty list has no columns, so selecting
            # 'instrumentId' below would raise KeyError and stop the run.
            self.logger.error(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取错误 (empty result)')
            continue

        response_df = pd.DataFrame(records)

        index_id = f'index_sh_{etf_code}'
        # Convert every code once and reuse the result for both id columns.
        stock_ids = response_df['instrumentId'].apply(lambda code: china_stock_code_to_id(code))
        component_df = pd.DataFrame({
            'id': stock_ids.apply(lambda stock_id: f'{index_id}_{stock_id}'),
            'stock_id': stock_ids,
            'index_id': index_id,
        })

        df_to_db(data_schema=self.data_schema, df=component_df, provider=self.provider)
        self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')

        self.sleep()
def download_sz_etf_component(self, df: pd.DataFrame):
    """
    Download the constituent stocks of Shenzhen ETFs and persist them.

    ``parse_sz_etf_underlying_index`` is run on ``df`` first (presumably it
    fills the '拟合指数' column - verify against that method). For each ETF
    with a non-empty underlying index, the constituent page is fetched from
    Sina, the fourth table on the page is read, and the zero-padded codes are
    stored as (id, stock_id, index_id) records via ``df_to_db``.

    :param df: ETF list; rows are read for '拟合指数', '证券代码' and '证券简称'
    :return: None
    """
    page_url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'

    self.parse_sz_etf_underlying_index(df)

    for _, etf in df.iterrows():
        underlying_index = etf['拟合指数']
        etf_code = etf['证券代码']

        if not len(underlying_index):
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
            continue

        response = requests.get(page_url.format(underlying_index))
        response.encoding = 'gbk'

        try:
            tables = pd.read_html(response.text, header=1)
        except ValueError as error:
            self.logger.error(f'HTML parse error: {error}, response: {response.text}')
            continue

        # The constituent list is taken from the fourth table on the page.
        if len(tables) < 4:
            continue

        constituents = tables[3].copy().dropna(axis=1, how='any')
        # Codes are formatted as integers, left-padded with zeros to 6 digits.
        padded_codes = constituents['品种代码'].apply(lambda raw: f'{raw:06d}')

        index_id = f'index_sz_{etf_code}'
        component_df = pd.DataFrame({
            'id': padded_codes.apply(lambda code: f'{index_id}_{china_stock_code_to_id(code)}'),
            'stock_id': padded_codes.apply(lambda code: china_stock_code_to_id(code)),
            'index_id': index_id,
        })

        df_to_db(data_schema=self.data_schema, df=component_df, provider=self.provider)
        self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')

        self.sleep()
def fetch_cni_index_component(self, df: pd.DataFrame, timeout: float = 30.0):
    """
    Fetch the constituent stocks of CNI indices and persist them.

    For every row of ``df`` the constituent workbook is downloaded from
    cnindex.com.cn, turned into (id, stock_id, index_id) records and handed
    to ``df_to_db``. An index whose download fails is logged and skipped.

    :param df: index list; every row must provide ``code`` and ``name``
    :param timeout: seconds to wait for each HTTP response
    :return: None
    :raises KeyError: if the workbook has neither '样本股代码' nor '证券代码'
    """
    query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

    for _, index in df.iterrows():
        index_code = index['code']
        url = query_url.format(index_code)

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as error:
            # RequestException is the base class of HTTPError, ConnectionError
            # and Timeout, so a single unreachable file is skipped instead of
            # aborting the whole crawl.
            self.logger.error(f'{index["name"]} - {index_code} 成分股抓取错误 ({error})')
            continue

        # Read every cell as text so the codes are not parsed as numbers.
        response_df = pd.read_excel(io.BytesIO(response.content), dtype='str')

        index_id = f'index_cn_{index_code}'
        # The code column is looked up under '样本股代码' first and
        # '证券代码' otherwise.
        code_column = '样本股代码' if '样本股代码' in response_df.columns else '证券代码'
        # Convert every code once and reuse the result for both id columns.
        stock_ids = response_df[code_column].apply(lambda x: china_stock_code_to_id(str(x)))
        component_df = pd.DataFrame({
            'id': stock_ids.apply(lambda stock_id: f'{index_id}_{stock_id}'),
            'stock_id': stock_ids,
            'index_id': index_id,
        })

        df_to_db(data_schema=self.data_schema, df=component_df, provider=self.provider)
        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        self.sleep()
def run(self):
    """
    Record the Sina stock categories as indices, then record their stocks.

    Step 1: for every (category, url) in ``self.category_map_url`` the
    category listing is fetched, and an ``Index`` row is added to the session
    for each code whose id is not already in ``self.index_ids``.

    Step 2: for every index returned by ``get_securities`` (those whose
    category is not ``StockCategory.main``), pages 1 to 4 of
    ``self.category_stocks_url`` are fetched and the listed stocks are stored
    as (id, index_id, stock_id) records via ``df_to_db``. Paging stops at the
    first 'null' response; any other failure on a page is logged and the
    next page is tried.

    :return: None
    """
    # get stock category from sina
    for category, url in self.category_map_url.items():
        resp = requests.get(url)
        resp.encoding = 'GBK'

        tmp_str = resp.text
        # Keep only the span from the first '{' through the first '}'.
        json_str = tmp_str[tmp_str.index('{'):tmp_str.index('}') + 1]
        tmp_json = json.loads(json_str)

        for code, description in tmp_json.items():
            # The name is the second comma-separated field of the value.
            name = description.split(',')[1]
            index_id = 'index_cn_{}'.format(code)
            if index_id in self.index_ids:
                continue
            self.session.add(
                Index(id=index_id, type='index', exchange='cn', code=code, name=name, category=category.value))
        self.session.commit()

    indices = get_securities(
        session=self.session, security_type=SecurityType.index, return_type='domain',
        filters=[Index.category != StockCategory.main.value], provider=self.provider)

    for index_item in indices:
        for page in range(1, 5):
            resp = requests.get(
                self.category_stocks_url.format(page, index_item.code))
            try:
                if resp.text is None or resp.text == 'null':
                    break

                category_jsons = demjson.decode(resp.text)
                the_list = []
                for stock in category_jsons:
                    stock_id = china_stock_code_to_id(stock['code'])
                    the_list.append({
                        'id': '{}_{}'.format(index_item.id, stock_id),
                        'index_id': index_item.id,
                        'stock_id': stock_id
                    })

                if the_list:
                    df = pd.DataFrame.from_records(the_list)
                    df_to_db(data_schema=self.data_schema, df=df, provider=self.provider)

                self.logger.info('finish recording index:{},{}'.format(
                    index_item.category, index_item.name))
            except Exception as e:
                # The message needs %s placeholders: without them logging
                # cannot format the extra arguments and the detail is lost.
                self.logger.error('error:%s,resp.text:%s', e, resp.text)

            self.sleep()