def spider_closed(self, spider, reason):
    """Persist the collected stock lists when the spider shuts down.

    The 'sh' and 'sz' frames are each written to their security-list path,
    but only when the frame's ``self.category_type`` column contains at
    least one truthy value.  The close reason is logged afterwards.
    """
    for exchange in ('sh', 'sz'):
        frame = getattr(self, '{}_df'.format(exchange))
        if frame[self.category_type].any():
            frame.to_csv(get_security_list_path('stock', exchange), index=False)

    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
def download_stock_list(self, response):
    """Merge newly listed stocks from a downloaded CSV into the local list.

    The response body is parsed as CSV with the columns ``Symbol``, ``Name``,
    ``IPOyear``, ``Sector`` and ``industry``.  Rows whose code is not yet in
    the stored security list are appended (in the order they appear in the
    download) and the list file is rewritten; nothing is written when there
    are no new codes.

    Parameters
    ----------
    response : object
        Download response exposing ``meta['exchange']`` and the raw ``body``.
    """
    exchange = response.meta['exchange']
    path = files_contract.get_security_list_path('stock', exchange)

    df = pd.read_csv(io.BytesIO(response.body), dtype=str)

    if os.path.exists(path):
        df_current = pd.read_csv(path, dtype=str)
        df_current = df_current.set_index('code', drop=False)
    else:
        df_current = pd.DataFrame()

    df = df.loc[:, ['Symbol', 'Name', 'IPOyear', 'Sector', 'industry']]
    df = df.dropna(subset=['Symbol', 'Name'])
    df.columns = ['code', 'name', 'listDate', 'sector', 'industry']
    df['listDate'] = df['listDate'].apply(to_time_str)

    df['exchange'] = exchange
    df['type'] = 'stock'
    # id is "<type>_<exchange>_<code>"
    df['id'] = df['type'] + '_' + df['exchange'].astype(str) + '_' + df['code'].astype(str)
    df['sinaIndustry'] = ''
    df['sinaConcept'] = ''
    df['sinaArea'] = ''

    df = df.set_index('code', drop=False)

    # Only add the increment.  A boolean mask (instead of a set difference)
    # keeps the download order, so the written file is deterministic.
    is_new = ~df.index.isin(df_current.index) & (df.index != 'nan')

    if is_new.any():
        merged = pd.concat([df_current, df[is_new]])
        # reindex tolerates meta columns that neither frame provides
        merged = merged.reindex(columns=STOCK_META_COL)
        merged.to_csv(path, index=False)
def download_stock_list(self, response):
    """Merge newly listed A-shares from an exchange download into the local list.

    ``response.meta['exchange']`` selects the parser: ``'sh'`` bodies are read
    as whitespace-separated GB2312 text, ``'sz'`` bodies as the
    ``'上市公司列表'`` Excel sheet; any other exchange is ignored.  Only codes
    that are not yet in the stored list are added (in download order), and
    the list file is rewritten only when there is something new.

    Parameters
    ----------
    response : object
        Download response exposing ``meta['exchange']`` and the raw ``body``.
    """
    exchange = response.meta['exchange']
    path = files_contract.get_security_list_path('stock', exchange)

    if exchange == 'sh':
        df = pd.read_csv(io.BytesIO(response.body), sep=r'\s+', encoding='GB2312', dtype=str)
    elif exchange == 'sz':
        df = pd.read_excel(io.BytesIO(response.body), sheet_name='上市公司列表', dtype=str)
    else:
        return

    if os.path.exists(path):
        df_current = pd.read_csv(path, dtype=str)
        df_current = df_current.set_index('code', drop=False)
    else:
        df_current = pd.DataFrame()

    df = df.loc[:, ['A股代码', 'A股简称', 'A股上市日期']]
    df.columns = ['code', 'name', 'listDate']

    df['exchange'] = exchange
    df['type'] = 'stock'
    # id is "<type>_<exchange>_<code>"
    df['id'] = df['type'] + '_' + df['exchange'].astype(str) + '_' + df['code'].astype(str)

    df = df.dropna(axis=0, how='any')
    df = df.set_index('code', drop=False)

    # Only add the increment.  A boolean mask (instead of a set difference)
    # keeps the download order, so the written file is deterministic.
    is_new = ~df.index.isin(df_current.index) & (df.index != 'nan')

    if is_new.any():
        merged = pd.concat([df_current, df[is_new]])
        # reindex tolerates meta columns that this download does not provide
        merged = merged.reindex(columns=STOCK_META_COL)
        merged.to_csv(path, index=False)
def get_security_list(security_type='stock', exchanges=['sh', 'sz'], start=STOCK_START_CODE,
                      end=STOCK_END_CODE, mode='simple', start_date=None):
    """
    get security list.

    Parameters
    ----------
    security_type : str
        {'stock', 'index'},default: stock
        any other value yields an empty DataFrame
    exchanges : list
        the exchanges whose stored stock lists are loaded,default: ['sh','sz']
        (the default list is only read, never mutated)
    start : str
        the lowest code to keep (inclusive)
    end : str
        the highest code to keep (inclusive)
    mode : str
        'simple' reads the list as is; any other value additionally converts
        the sinaIndustry/sinaConcept/sinaArea columns with
        convert_to_list_if_need
    start_date : Timestamp str or Timestamp
        keep only securities listed on or after this date,default: None

    Returns
    -------
    DataFrame
        the security list indexed by code (empty when nothing matched)

    """
    if security_type == 'stock':
        frames = []
        for exchange in exchanges:
            the_path = files_contract.get_security_list_path(security_type, exchange)
            if not os.path.exists(the_path):
                continue

            converters = {'code': str}
            if mode != 'simple':
                converters.update({'sinaIndustry': convert_to_list_if_need,
                                   'sinaConcept': convert_to_list_if_need,
                                   'sinaArea': convert_to_list_if_need})
            frames.append(pd.read_csv(the_path, converters=converters))

        df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    elif security_type == 'index':
        df = pd.DataFrame(CHINA_STOCK_INDEX)
    else:
        # unsupported type: return an empty frame instead of failing on an
        # unbound local variable
        df = pd.DataFrame()

    if df.size > 0:
        # copy so the column assignment below does not touch a filtered view
        df = df[(df["code"] >= start) & (df["code"] <= end)].copy()

        if start_date:
            df['listDate'] = pd.to_datetime(df['listDate'])
            df = df[df['listDate'] >= pd.Timestamp(start_date)]

        df = df.set_index(df['code'], drop=False)

    return df
def download_stock_list(self, response):
    """Merge newly listed A-shares from an exchange download into the local list.

    ``response.meta['exchange']`` selects the parser: ``'sh'`` bodies are read
    as whitespace-separated GB2312 text, ``'sz'`` bodies as the
    ``'上市公司列表'`` Excel sheet; any other exchange is ignored.  Each row
    gets a ``timestamp`` equal to its list date.  Only codes that are not yet
    in the stored list are added (in download order), and the list file is
    rewritten only when there is something new.

    Parameters
    ----------
    response : object
        Download response exposing ``meta['exchange']`` and the raw ``body``.
    """
    exchange = response.meta['exchange']
    path = files_contract.get_security_list_path('stock', exchange)

    if exchange == 'sh':
        df = pd.read_csv(io.BytesIO(response.body), sep=r'\s+', encoding='GB2312', dtype=str)
    elif exchange == 'sz':
        df = pd.read_excel(io.BytesIO(response.body), sheet_name='上市公司列表', dtype=str)
    else:
        return

    if os.path.exists(path):
        df_current = pd.read_csv(path, dtype=str)
        df_current = df_current.set_index('code', drop=False)
    else:
        df_current = pd.DataFrame()

    df = df.loc[:, ['A股代码', 'A股简称', 'A股上市日期']]
    df.columns = ['code', 'name', 'listDate']

    df['exchange'] = exchange
    df['type'] = 'stock'
    # id is "<type>_<exchange>_<code>"
    df['id'] = df['type'] + '_' + df['exchange'].astype(str) + '_' + df['code'].astype(str)
    df['timestamp'] = df['listDate']

    df = df.dropna(axis=0, how='any')
    df = df.set_index('code', drop=False)

    # Only add the increment.  A boolean mask (instead of a set difference)
    # keeps the download order, so the written file is deterministic.
    is_new = ~df.index.isin(df_current.index) & (df.index != 'nan')

    if is_new.any():
        merged = pd.concat([df_current, df[is_new]])
        # reindex tolerates meta columns that this download does not provide
        merged = merged.reindex(columns=STOCK_META_COL)
        merged.to_csv(path, index=False)
def init_markets(exchanges=CRYPTOCURRENCY_EXCHANGES):
    """Create the local directory layout and meta files for crypto markets.

    For every requested exchange that ccxt supports, the markets are fetched
    and, per market: the kdata directory is created, the raw market info is
    stored as JSON, and the security item is collected.  The collected items
    are finally written as the exchange's security list.  A failure for one
    exchange is logged and does not stop the remaining exchanges.

    Parameters
    ----------
    exchanges : iterable of str
        ccxt exchange ids to initialise; ids unknown to ccxt are skipped.
    """
    for exchange_str in set(ccxt.exchanges) & set(exchanges):
        exchange_dir = get_exchange_dir(security_type='cryptocurrency', exchange=exchange_str)

        # create the exchange directory
        os.makedirs(exchange_dir, exist_ok=True)

        # plain attribute lookup instead of eval() on a formatted string
        exchange = getattr(ccxt, exchange_str)()

        try:
            markets = exchange.fetch_markets()

            # some exchanges return a dict keyed by symbol, others a list
            if isinstance(markets, dict):
                market_items = markets.items()
            elif isinstance(markets, list):
                market_items = ((market['symbol'], market) for market in markets)
            else:
                logger.error("unknown return markets type {}".format(type(markets)))
                # skip only this exchange; the others are still initialised
                continue

            security_items = []

            for name, security_info in market_items:
                code = name.replace('/', "-")

                security_item = generate_security_item(security_type='cryptocurrency', exchange=exchange_str,
                                                       code=code, name=name, list_date=None)

                os.makedirs(get_kdata_dir(security_item), exist_ok=True)

                security_items.append(security_item)

                logger.info("init_markets,exchange:{} security:{}".format(exchange_str, security_item))

                # store the meta information of the cryptocurrency
                if security_info:
                    meta_path = get_security_meta_path(security_type='cryptocurrency', exchange=exchange_str,
                                                       code=code)
                    # explicit encoding: ensure_ascii=False may emit non-ASCII text
                    with open(meta_path, "w", encoding='utf-8') as f:
                        json.dump(security_info, f, ensure_ascii=False)

            # store the cryptocurrency list of this exchange
            if security_items:
                df = pd.DataFrame(security_items)
                df.to_csv(get_security_list_path(security_type='cryptocurrency', exchange=exchange_str),
                          index=False)

            logger.info("init_markets for {} success".format(exchange_str))
        except Exception:
            # logger.exception already attaches the active traceback
            logger.exception("init_markets for {} failed".format(exchange_str))
def get_security_list(security_type='stock', exchanges=['sh', 'sz'], start=None, end=None, mode='simple',
                      start_date=None, codes=None):
    """
    get security list.

    Parameters
    ----------
    security_type : str
        {'stock', 'index'},default: stock
        any other value yields an empty DataFrame
    exchanges : list
        ['sh', 'sz','nasdaq'],default: ['sh','sz']
        (the default list is only read, never mutated)
    start : str
        the lowest code to keep (inclusive),default:None
        only applied to the sh/sz (or china index) part
    end : str
        the highest code to keep (inclusive),default:None
        only applied to the sh/sz (or china index) part
    mode : str
        'simple' reads the list as is; any other value additionally converts
        the sinaIndustry/sinaConcept/sinaArea columns with
        convert_to_list_if_need
    start_date : Timestamp str or Timestamp
        keep only securities listed on or after this date,default:None
    codes : list
        the exact codes to select from the nasdaq part,default:None

    Returns
    -------
    DataFrame
        the security list

    """
    # initialised up front so an unsupported security_type returns an empty
    # frame instead of failing on an unbound local variable
    df = pd.DataFrame()
    df_usa = pd.DataFrame()

    if security_type == 'stock':
        frames = []
        for exchange in exchanges:
            the_path = files_contract.get_security_list_path(security_type, exchange)
            if not os.path.exists(the_path):
                continue

            if exchange in ('sh', 'sz'):
                converters = {'code': str}
                if mode != 'simple':
                    converters.update({'sinaIndustry': convert_to_list_if_need,
                                       'sinaConcept': convert_to_list_if_need,
                                       'sinaArea': convert_to_list_if_need})
                frames.append(pd.read_csv(the_path, converters=converters))
            elif exchange == 'nasdaq':
                df_usa = pd.read_csv(the_path, dtype=str)

        if frames:
            df = pd.concat(frames, ignore_index=True)
    elif security_type == 'index':
        df = pd.DataFrame(CHINA_STOCK_INDEX)
        if 'nasdaq' in exchanges:
            df_usa = pd.DataFrame(USA_STOCK_INDEX)

    if df.size > 0:
        # each bound is applied on its own, guarded by its own argument
        if start:
            df = df[df["code"] >= start]
        if end:
            df = df[df["code"] <= end]

        if start_date:
            df = df.copy()
            df['listDate'] = pd.to_datetime(df['listDate'])
            df = df[df['listDate'] >= pd.Timestamp(start_date)]

        df = df.set_index(df['code'], drop=False)

    if df_usa.size > 0:
        df_usa = df_usa.set_index(df_usa['code'], drop=False)

        if codes:
            df_usa = df_usa.loc[codes]

        df = pd.concat([df, df_usa], ignore_index=True)

    return df
def parse_shfe_data(force_parse=False):
    """Unpack and parse the cached SHFE history archives into per-contract kdata.

    Every ``.zip`` in the shfe cache directory that has no matching ``.xls``
    yet is extracted; when the archive holds exactly one ``.xls`` it is moved
    next to the archive and queued for parsing.  Each queued sheet is split by
    contract, new contracts are added to the stored future security list, and
    the rows are appended to the contract's kdata file.

    Parameters
    ----------
    force_parse : bool
        When true, every ``.xls`` already present in the cache directory is
        parsed again instead of only the freshly extracted ones.
    """
    the_dir = get_exchange_cache_dir(security_type='future', exchange='shfe')

    need_parse_files = []

    for file_name in os.listdir(the_dir):
        if not file_name.endswith('.zip'):
            continue

        the_zip_file = os.path.join(the_dir, file_name)
        # splitext strips only the trailing extension; str.replace would also
        # rewrite a '.zip' that occurs elsewhere in the path
        dst_dir = os.path.splitext(the_zip_file)[0]
        dst_file = dst_dir + ".xls"

        if os.path.exists(dst_file):
            continue

        # the directory may be left over from an earlier, incomplete run
        os.makedirs(dst_dir, exist_ok=True)

        unzip(the_zip_file, dst_dir)
        files = [os.path.join(dst_dir, f) for f in os.listdir(dst_dir) if f.endswith('.xls')]
        if len(files) == 1:
            os.rename(files[0], dst_file)
            # queue the sheet only when it really exists
            need_parse_files.append(dst_file)
        else:
            logger.warning("expected exactly one .xls in {},found {}".format(dst_dir, len(files)))

    if force_parse:
        need_parse_files = [os.path.join(the_dir, f) for f in os.listdir(the_dir) if f.endswith('.xls')]

    for the_file in need_parse_files:
        logger.info("parse {}".format(the_file))

        df = pd.read_excel(the_file, skiprows=2, skipfooter=4, index_col='合约', converters={'日期': str})

        # the contract cell is only filled on the first row of each contract
        df.index = pd.Series(df.index).ffill()
        df = df.loc[:, ['日期', '前收盘', '前结算', '开盘价', '最高价', '最低价', '收盘价', '结算价', '涨跌1', '涨跌2',
                        '成交量', '成交金额', '持仓量']]
        df.columns = ['timestamp', 'preClose', 'preSettlement', 'open', 'high', 'low', 'close', 'settlement',
                      'change', 'change1', 'volume', 'turnover', 'openInterest']

        unique_index = df.index.drop_duplicates()

        security_list = get_security_list(security_type='future', exchanges=['shfe'])

        for the_contract in unique_index:
            logger.info("start handling {} in {}".format(the_contract, the_file))

            contract_name = get_future_name(the_contract)
            security_id = 'future_{}_{}'.format('shfe', the_contract)
            security_item = {
                'code': the_contract,
                'name': contract_name,
                'id': security_id,
                'exchange': 'shfe',
                'type': 'future'
            }

            # check whether the contract meta still has to be saved
            if (not security_list.empty) and ('code' in security_list.columns):
                security_list = security_list.set_index(security_list['code'], drop=False)
            if the_contract not in security_list.index:
                security_list = pd.concat([security_list, pd.DataFrame([security_item])], ignore_index=True)
                security_list.to_csv(get_security_list_path('future', 'shfe'), index=False)

            # list-based selection always yields a DataFrame; copy so the new
            # columns are not written into a slice of df
            the_df = df.loc[[the_contract]].copy()
            the_df['code'] = the_contract
            the_df['name'] = contract_name
            the_df['securityId'] = security_id
            the_df['changePct'] = the_df['change'] / the_df['preClose']
            the_df['changePct1'] = the_df['change1'] / the_df['preSettlement']

            kdata_path = get_kdata_path(item=security_item, source='exchange')
            # TODO: this logic should be handled in one place
            os.makedirs(get_kdata_dir(item=security_item), exist_ok=True)

            if os.path.exists(kdata_path):
                saved_df = pd.read_csv(kdata_path, dtype=str)
            else:
                saved_df = pd.DataFrame()

            saved_df = pd.concat([saved_df, the_df], ignore_index=True)
            # reindex tolerates kdata columns the sheet does not provide
            saved_df = saved_df.reindex(columns=KDATA_FUTURE_COL)

            if not saved_df.empty:
                kdata_df_save(saved_df, kdata_path)

            logger.info("end handling {} in {}".format(the_contract, the_file))
def _shfe_day_kdata_row(the_data, the_date, code, name, security_id):
    """Build one day-kdata record from a raw ``o_curinstrument`` entry.

    A raw entry looks like::

        {'CLOSEPRICE': 11480, 'DELIVERYMONTH': '1809', 'HIGHESTPRICE': 11555,
         'LOWESTPRICE': 11320, 'OPENINTEREST': 425692, 'OPENINTERESTCHG': 3918,
         'OPENPRICE': 11495, 'ORDERNO': 0, 'PRESETTLEMENTPRICE': 11545,
         'PRODUCTID': 'ru_f ', 'PRODUCTNAME': '天然橡胶 ', 'PRODUCTSORTNO': 100,
         'SETTLEMENTPRICE': 11465, 'VOLUME': 456574, 'ZD1_CHG': -65,
         'ZD2_CHG': -80}
    """
    # empty/falsy quotes are treated as 0
    low_price = the_data['LOWESTPRICE'] or 0
    open_price = the_data['OPENPRICE'] or 0
    close_price = the_data['CLOSEPRICE'] or 0
    high_price = the_data['HIGHESTPRICE'] or 0
    volume = the_data['VOLUME'] or 0

    # non-numeric (string) change values are treated as 0
    change = 0 if isinstance(the_data['ZD1_CHG'], str) else the_data['ZD1_CHG']
    change1 = 0 if isinstance(the_data['ZD2_CHG'], str) else the_data['ZD2_CHG']

    pre_close = close_price - change
    pre_settlement = the_data['PRESETTLEMENTPRICE']

    # first trading day: there is no previous price to relate the change to
    change_pct = change / pre_close if pre_close != 0 else 0
    change_pct1 = change1 / pre_settlement if pre_settlement else 0

    return {
        "timestamp": to_time_str(the_date),
        "code": code,
        "name": name,
        "low": low_price,
        "open": open_price,
        "close": close_price,
        "high": high_price,
        "volume": volume,
        # the turnover is an estimate: mean of the four prices times volume
        "turnover": (low_price + open_price + close_price + high_price) / 4 * volume,
        "securityId": security_id,
        "preClose": pre_close,
        "change": change,
        "changePct": change_pct,
        "openInterest": the_data['OPENINTEREST'],
        "settlement": the_data['SETTLEMENTPRICE'],
        "preSettlement": the_data['PRESETTLEMENTPRICE'],
        "change1": change1,
        "changePct1": change_pct1
    }


def parse_shfe_day_data(force_parse=False):
    """Parse the cached SHFE daily quote files of the current year into kdata.

    Every file in the day-kdata cache directory is named after its trading
    date and holds a JSON document with an ``o_curinstrument`` list.  For each
    entry with a four-digit delivery month the contract is registered in the
    stored future security list (if new) and its day record is merged into the
    contract's kdata file.  Handled dates are remembered in the ``parsed``
    file inside the cache directory.

    Parameters
    ----------
    force_parse : bool
        When true, dates already recorded in ``parsed`` are handled again.
    """
    cache_dir = get_exchange_cache_dir(security_type='future', exchange='shfe',
                                       the_year=datetime.datetime.today().year,
                                       data_type="day_kdata")
    the_parsed_path = os.path.join(cache_dir, 'parsed')
    the_parsed = []
    if os.path.exists(the_parsed_path):
        with open(the_parsed_path) as data_file:
            the_parsed = json.load(data_file)

    the_dates = [f for f in os.listdir(cache_dir)
                 if f != 'parsed' and (force_parse or f not in the_parsed)]

    # date, code, name, low, open, close, high, volume (lots), turnover (yuan),
    # unique id, previous close, change, change pct, open interest, settlement,
    # previous settlement, change (by settlement), change pct (by settlement)
    kdata_columns = ['timestamp', 'code', 'name', 'low', 'open', 'close', 'high', 'volume', 'turnover',
                     'securityId', 'preClose', 'change', 'changePct', 'openInterest', 'settlement',
                     'preSettlement', 'change1', 'changePct1']

    for the_date in the_dates:
        the_path = os.path.join(cache_dir, the_date)
        logger.info("start handling {}".format(the_path))

        with open(the_path, 'r', encoding='UTF8') as f:
            the_datas = json.load(f)['o_curinstrument']

        for the_data in the_datas:
            # only entries of a concrete contract carry a 4-digit delivery month
            if not re.match(r"\d{4}", the_data['DELIVERYMONTH']):
                continue

            product_id = the_data['PRODUCTID']
            code = "{}{}".format(product_id[:product_id.index('_')], the_data['DELIVERYMONTH'])
            logger.info("start handling {} for {}".format(code, the_date))

            name = get_future_name(code)
            security_id = "future_shfe_{}".format(code)

            security_list = get_security_list(security_type='future', exchanges=['shfe'])

            security_item = {
                'code': code,
                'name': name,
                'id': security_id,
                'exchange': 'shfe',
                'type': 'future'
            }

            # check whether the contract meta still has to be saved
            if security_list is not None and 'code' in security_list.columns:
                security_list = security_list.set_index(security_list['code'], drop=False)
            if code not in security_list.index:
                security_list = pd.concat([security_list, pd.DataFrame([security_item])], ignore_index=True)
                security_list.to_csv(get_security_list_path('future', 'shfe'), index=False)

            kdata_path = get_kdata_path(item=security_item, source='exchange')
            # TODO: this logic should be handled in one place
            os.makedirs(get_kdata_dir(item=security_item), exist_ok=True)

            if os.path.exists(kdata_path):
                saved_df = pd.read_csv(kdata_path, dtype=str)
                saved_df = saved_df.set_index(saved_df['timestamp'], drop=False)
            else:
                saved_df = pd.DataFrame()

            # NOTE(review): the index holds stored timestamps while the_date is
            # the raw file name - confirm both use the same format
            if saved_df.empty or the_date not in saved_df.index:
                the_row = _shfe_day_kdata_row(the_data, the_date, code, name, security_id)

                saved_df = pd.concat([saved_df, pd.DataFrame([the_row])], ignore_index=True)
                saved_df = saved_df.loc[:, kdata_columns]
                saved_df = saved_df.drop_duplicates(subset='timestamp', keep='last')
                saved_df = saved_df.set_index(saved_df['timestamp'], drop=False)
                saved_df.index = pd.to_datetime(saved_df.index)
                saved_df = saved_df.sort_index()
                saved_df.to_csv(kdata_path, index=False)

                logger.info("end handling {} for {}".format(code, the_date))

        if the_date not in the_parsed:
            the_parsed.append(the_date)

        if the_parsed:
            result_list = sorted(drop_duplicate(the_parsed))

            with open(the_parsed_path, 'w') as outfile:
                json.dump(result_list, outfile)

        logger.info("end handling {}".format(the_path))
def get_security_list(security_type='stock', exchanges=None, start=None, end=None, mode='simple',
                      start_list_date=None, codes=None):
    """
    get security list.

    Parameters
    ----------
    security_type : str
        {'stock', 'future', 'index', 'cryptocurrency'},default: stock
    exchanges : str or list
        ['sh', 'sz','nasdaq','nyse','amex','shfe','dce','zce'],default: None
        when empty,the exchanges registered for security_type in
        SECURITY_TYPE_MAP_EXCHANGES are used
    start : str
        the start code,work with end,default:None
        if using codes,it would be ignored
    end : str
        the end code,works with start,default:None
        if using codes,it would be ignored
    mode : str
        whether parse more security info,{'simple','es'},default:'simple'
    start_list_date : Timestamp str or Timestamp
        the filter for start list date,default:None
    codes : list
        the exact codes to query,default:None

    Returns
    -------
    DataFrame
        the security list

    """
    if isinstance(exchanges, str):
        exchanges = [exchanges]

    if not exchanges:
        exchanges = SECURITY_TYPE_MAP_EXCHANGES[security_type]

    if security_type == 'index':
        # the index lists are built in, independent of the exchanges argument
        frames = [pd.DataFrame(CHINA_STOCK_SH_INDEX),
                  pd.DataFrame(CHINA_STOCK_SZ_INDEX),
                  pd.DataFrame(USA_STOCK_NASDAQ_INDEX)]
    else:
        frames = []
        for exchange in exchanges:
            the_path = get_security_list_path(security_type, exchange)
            if not os.path.exists(the_path):
                continue

            if mode == 'es' and security_type == 'stock':
                # the stock meta needs some conversion before going to es
                frames.append(pd.read_csv(the_path,
                                          converters={
                                              'code': str,
                                              'sinaIndustry': convert_to_list_if_need,
                                              'sinaConcept': convert_to_list_if_need,
                                              'sinaArea': convert_to_list_if_need
                                          }))
            else:
                frames.append(pd.read_csv(the_path, dtype=str))

    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if not df.empty:
        if start_list_date:
            df['listDate'] = pd.to_datetime(df['listDate'])
            df = df[df['listDate'] >= pd.Timestamp(start_list_date)]

        df = df.set_index(df['code'], drop=False)

        if codes:
            df = df.loc[codes]
        elif start and end:
            df = df[(df["code"] >= start) & (df["code"] <= end)]

        if security_type != 'cryptocurrency':
            df = df.drop_duplicates(subset='code', keep='last')

    return df
def get_security_list(security_type='stock', exchanges=['sh', 'sz'], start=None, end=None, mode='simple',
                      start_date=None, codes=None):
    """
    get security list.

    Parameters
    ----------
    security_type : str
        {'stock', 'index'},default: stock
        any other value yields an empty DataFrame
    exchanges : list
        ['sh', 'sz','nasdaq'],default: ['sh','sz']
        (the default list is only read, never mutated)
    start : str
        the start code (inclusive),default:None
        only applied to the sh/sz (or china index) part
    end : str
        the end code (inclusive),default:None
        only applied to the sh/sz (or china index) part
    mode : str
        whether parse more security info,{'simple','es'},default:'simple'
    start_date : Timestamp str or Timestamp
        the filter for start list date,default:None
    codes : list
        the exact codes to select from the nasdaq part,default:None

    Returns
    -------
    DataFrame
        the security list

    """
    # initialised up front so an unsupported security_type returns an empty
    # frame instead of failing on an unbound local variable
    df = pd.DataFrame()
    df_usa = pd.DataFrame()

    if security_type == 'stock':
        frames = []
        for exchange in exchanges:
            the_path = files_contract.get_security_list_path(security_type, exchange)
            if not os.path.exists(the_path):
                continue

            if exchange in ('sh', 'sz'):
                converters = {'code': str}
                if mode != 'simple':
                    converters.update({'sinaIndustry': convert_to_list_if_need,
                                       'sinaConcept': convert_to_list_if_need,
                                       'sinaArea': convert_to_list_if_need})
                frames.append(pd.read_csv(the_path, converters=converters))
            elif exchange == 'nasdaq':
                df_usa = pd.read_csv(the_path, dtype=str)

        if frames:
            df = pd.concat(frames, ignore_index=True)
    elif security_type == 'index':
        df = pd.DataFrame(CHINA_STOCK_INDEX)
        if 'nasdaq' in exchanges:
            df_usa = pd.DataFrame(USA_STOCK_INDEX)

    if df.size > 0:
        # each bound is applied on its own, guarded by its own argument
        if start:
            df = df[df["code"] >= start]
        if end:
            df = df[df["code"] <= end]

        if start_date:
            df = df.copy()
            df['listDate'] = pd.to_datetime(df['listDate'])
            df = df[df['listDate'] >= pd.Timestamp(start_date)]

        df = df.set_index(df['code'], drop=False)

    if df_usa.size > 0:
        df_usa = df_usa.set_index(df_usa['code'], drop=False)

        if codes:
            df_usa = df_usa.loc[codes]

        df = pd.concat([df, df_usa], ignore_index=True)

    return df
def spider_closed(self, spider, reason):
    """Write both collected stock lists to disk when the spider shuts down.

    The 'sh' frame is saved first, then the 'sz' frame, each to its own
    security-list path; the close reason is logged afterwards.
    """
    for exchange in ('sh', 'sz'):
        frame = getattr(self, '{}_df'.format(exchange))
        target = get_security_list_path('stock', exchange)
        frame.to_csv(target, index=False)

    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
def get_security_list(security_type='stock', exchanges=['sh', 'sz'], start=None, end=None, mode='simple',
                      start_list_date=None, codes=None):
    """
    get security list.

    Parameters
    ----------
    security_type : str
        {'stock', 'future', 'index'},default: stock
        any other value yields an empty DataFrame
    exchanges : list
        ['sh', 'sz','nasdaq','nyse','amex','shfe','dce','zce'],default: ['sh','sz']
        (the default list is only read, never mutated)
    start : str
        the start code,work with end,default:None
        if using codes,it would be ignored
    end : str
        the end code,works with start,default:None
        if using codes,it would be ignored
    mode : str
        whether parse more security info,{'simple','es'},default:'simple'
    start_list_date : Timestamp str or Timestamp
        the filter for start list date,default:None
    codes : list
        the exact codes to query,default:None

    Returns
    -------
    DataFrame
        the security list

    """
    frames = []

    if security_type == 'stock' or security_type == 'future':
        for exchange in exchanges:
            the_path = get_security_list_path(security_type, exchange)
            if not os.path.exists(the_path):
                continue

            if mode == 'es' and security_type == 'stock':
                # the stock meta needs some conversion before going to es
                frames.append(pd.read_csv(the_path,
                                          converters={
                                              'code': str,
                                              'sinaIndustry': convert_to_list_if_need,
                                              'sinaConcept': convert_to_list_if_need,
                                              'sinaArea': convert_to_list_if_need
                                          }))
            else:
                frames.append(pd.read_csv(the_path, dtype=str))
    elif security_type == 'index':
        for exchange in exchanges:
            if exchange == 'sh':
                frames.append(pd.DataFrame(CHINA_STOCK_SH_INDEX))
            elif exchange == 'sz':
                frames.append(pd.DataFrame(CHINA_STOCK_SZ_INDEX))
            elif exchange == 'nasdaq':
                frames.append(pd.DataFrame(USA_STOCK_NASDAQ_INDEX))

    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if df.size > 0:
        # skip the date filter for lists that carry no listDate column
        if start_list_date and 'listDate' in df.columns:
            df['listDate'] = pd.to_datetime(df['listDate'])
            df = df[df['listDate'] >= pd.Timestamp(start_list_date)]

        df = df.set_index(df['code'], drop=False)

        if codes:
            df = df.loc[codes]
        elif start and end:
            df = df[(df["code"] >= start) & (df["code"] <= end)]

        # FIXME:
        # the future list contains duplicated rows, this needs to be checked
        df = df.drop_duplicates(subset='code', keep='last')

    return df