def test_get_china_stock_list():
    print(settings.FOOLTRADER_STORE_PATH)

    df = quote.get_security_list('stock', exchanges=['sh', 'sz'])
    assert '000001' in df.index
    assert '金融行业' == df.loc['000001', 'sinaIndustry']

    df = quote.get_security_list('stock', exchanges=['sh'])
    assert '600000' in df.index
    assert '金融行业' == df.loc['600000', 'sinaIndustry']

    df = quote.get_security_list('stock', exchanges=['sh', 'sz'], start='000338', end='600388')
    assert '000338' in df.index
    assert '600388' in df.index
    assert '600389' not in df.index

    df = quote.get_security_list('stock', exchanges=['sh', 'sz'], codes=['300027', '000002'])
    assert len(df.index) == 2

    df = quote.get_security_list('stock', exchanges=['sh', 'sz'], mode='es')
    assert type(df.loc['600004', 'sinaArea']) == list
    assert '广州' in df.loc['600004', 'sinaArea']
    assert '广东' in df.loc['600004', 'sinaArea']
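
# Note (assumption, not in the original source): this test can be run on its own
# with pytest, e.g. `pytest -k test_get_china_stock_list`, and it needs a populated
# FOOLTRADER_STORE_PATH containing the sh/sz security lists with the sina category
# columns already crawled.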
def init_env():
    if not os.path.exists(FOOLTRADER_STORE_PATH):
        print("{} is a wrong path".format(FOOLTRADER_STORE_PATH))
        print("please set env FOOLTRADER_STORE_PATH to working path or set it in settings.py")
    else:
        # initialize the stock directories
        for _, item in get_security_list(exchanges=EXCHANGE_LIST_COL).iterrows():
            mkdir_for_stock(item)

        # initialize the index directories
        for _, item in get_security_list(security_type='index',
                                         exchanges=['sh', 'sz', 'nasdaq']).iterrows():
            kdata_dir = get_kdata_dir(item)
            if not os.path.exists(kdata_dir):
                os.makedirs(kdata_dir)

        # initialize the future directories
        for exchange in ['shfe', 'dce', 'zce']:
            exchange_cache_dir = get_exchange_cache_dir(security_type='future', exchange=exchange)
            if not os.path.exists(exchange_cache_dir):
                os.makedirs(exchange_cache_dir)

            exchange_cache_dir = get_exchange_cache_dir(security_type='future', exchange='shfe',
                                                        the_year=datetime.today().year,
                                                        data_type="day_kdata")
            if not os.path.exists(exchange_cache_dir):
                os.makedirs(exchange_cache_dir)

            exchange_dir = get_exchange_dir(security_type='future', exchange=exchange)
            if not os.path.exists(exchange_dir):
                os.makedirs(exchange_dir)
def init_env():
    if not os.path.exists(settings.FILES_STORE):
        os.makedirs(settings.FILES_STORE)

    # initialize the stock directories
    for _, item in get_security_list(exchanges=EXCHANGE_LIST_COL).iterrows():
        mkdir_for_security(item)

    # initialize the index directories
    for _, item in get_security_list(security_type='index',
                                     exchanges=['sh', 'sz', 'nasdaq']).iterrows():
        kdata_dir = get_kdata_dir(item)
        if not os.path.exists(kdata_dir):
            os.makedirs(kdata_dir)
def init_env():
    if not os.path.exists(settings.FILES_STORE):
        os.makedirs(settings.FILES_STORE)

    # initialize the stock directories
    for _, item in get_security_list().iterrows():
        mkdir_for_security(item)

    # initialize the index directories
    for _, item in get_security_list(security_type='index').iterrows():
        kdata_dir = get_kdata_dir(item)
        if not os.path.exists(kdata_dir):
            os.makedirs(kdata_dir)
def init_env():
    if not os.path.exists(settings.FOOLTRADER_STORE_PATH):
        os.makedirs(settings.FOOLTRADER_STORE_PATH)

    # initialize the stock directories
    for _, item in get_security_list(exchanges=EXCHANGE_LIST_COL).iterrows():
        mkdir_for_security(item)

    # initialize the index directories
    for _, item in get_security_list(security_type='index',
                                     exchanges=['sh', 'sz', 'nasdaq']).iterrows():
        kdata_dir = get_kdata_dir(item)
        if not os.path.exists(kdata_dir):
            os.makedirs(kdata_dir)
def newhighergenerator(start_date, fuquan='qfq', source='163', period=20):
    baseindex = 'index_sh_000001'
    df = quote.get_kdata(baseindex, start_date=start_date, source=source)
    # only a 'total' column for now; per-market totals still need to be added
    dh = pd.DataFrame(0, index=df.index, columns=['total'])

    # full market: stocklist = quote.get_security_list(security_type='stock', mode='simple')
    stocklist = quote.get_security_list(security_type='stock', start='600000', end='600030',
                                        mode='simple')
    for _, item in stocklist.iterrows():
        print("calculating {}".format(item.id))
        for ts in dh.index:
            if (ts - datetime.datetime.strptime(item.listDate, '%Y-%m-%d')).days < period:
                dh.at[ts, item.id] = 0
            else:
                ds = quote.get_kdata(item.id, fuquan=fuquan)
                indexlist = list(ds['timestamp'])
                tsstr = ts.strftime('%Y-%m-%d')
                if tsstr in indexlist:
                    pos = indexlist.index(tsstr)
                    if ds['close'][pos] >= max(ds['close'][pos - period + 1:pos + 1]):
                        dh.at[ts, item.id] = 1
                    else:
                        dh.at[ts, item.id] = 0
                else:
                    dh.at[ts, item.id] = 0

    df['total'] = dh.apply(lambda x: x.sum(), axis=1)
    df['index_c'] = df['close']
    dh.to_csv('newhigher.csv')
    return True
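
# Usage sketch (assumption, not in the original source): count, for every trading
# day since the start date, how many stocks in the configured code range closed at
# a new 20-day high; the per-stock 0/1 matrix is written to newhigher.csv.
if __name__ == '__main__':
    newhighergenerator('2017-05-10', fuquan='qfq', source='163', period=20)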
def cash_flow_statement_to_es(force=False):
    es_index_mapping('cash_flow_statement', CashFlowStatement)

    for _, security_item in get_security_list().iterrows():
        try:
            start_date = None
            if not force:
                query = {"term": {"securityId": security_item["id"]}}
                latest_record = es_get_latest_record(index='cash_flow_statement',
                                                     time_field='reportDate',
                                                     query=query)
                logger.info("latest_record:{}".format(latest_record))
                if latest_record:
                    start_date = latest_record['reportDate']

            for json_object in get_cash_flow_statement_items(security_item, start_date=start_date):
                if start_date and is_same_date(start_date, json_object['reportDate']):
                    continue

                cash_flow_statement = CashFlowStatement(meta={'id': json_object['id']})
                fill_doc_type(cash_flow_statement, json_object)
                cash_flow_statement.save()
        except Exception as e:
            logger.warning("wrong CashFlowStatement:{},error:{}".format(security_item, e))
def security_meta_to_es(security_type='cryptocurrency', force=False):
    if security_type == 'cryptocurrency':
        doc_type = CryptoCurrencyMeta

    es_index_mapping('crypto_currency_meta', doc_type)

    start_date = None
    if not force:
        latest_record = es_get_latest_record('crypto_currency_meta', time_field='listDate')
        logger.info("latest_record:{}".format(latest_record))
        if latest_record and 'listDate' in latest_record:
            start_date = latest_record['listDate']

    actions = []
    for _, item in get_security_list(security_type=security_type).iterrows():
        if start_date and item['listDate'] and is_same_date(start_date, item['listDate']):
            continue
        try:
            security_meta = doc_type(meta={'id': item['id']})
            fill_doc_type(security_meta, json.loads(item.to_json()))
            actions.append(security_meta.to_dict(include_meta=True))
        except Exception as e:
            logger.warning("wrong SecurityItem:{},error:{}".format(item, e))

    if actions:
        resp = elasticsearch.helpers.bulk(es_client, actions)
        logger.info(resp)
def stock_kdata_to_es(start='000001', end='666666', force=False):
    for _, security_item in get_security_list(start=start, end=end).iterrows():
        # create the index
        index_name = get_es_kdata_index(security_item['id'])
        es_index_mapping(index_name, StockKData)

        start_date = None
        if not force:
            latest_record = es_get_latest_record(index_name)
            logger.info("latest_record:{}".format(latest_record))
            if latest_record:
                start_date = latest_record['timestamp']

        for _, kdata_item in get_kdata(security_item, start_date=start_date).iterrows():
            if start_date and is_same_date(start_date, kdata_item['timestamp']):
                continue
            try:
                id = '{}_{}'.format(kdata_item['securityId'], kdata_item['timestamp'])
                kdata = StockKData(meta={'id': id}, id=id)
                fill_doc_type(kdata, json.loads(kdata_item.to_json()))
                kdata.save(index=index_name)
            except Exception as e:
                logger.warning("wrong KdataDay:{},error:{}".format(kdata_item, e))
def crawl_index_quote():
    for _, security_item in get_security_list(security_type='index').iterrows():
        # crawl the day kdata
        logger.info("{} get index kdata start".format(security_item['code']))
        start_date = get_latest_download_trading_date(security_item, source='163')
        end_date = pd.Timestamp.today()
        if start_date > end_date:
            logger.info("{} kdata is ok".format(security_item['code']))
        else:
            process_crawl(StockKdataSpider163, {"security_item": security_item,
                                                "start_date": start_date,
                                                "end_date": end_date})

        logger.info("{} get index kdata from 163 end".format(security_item['code']))

        # fetch the market summary data [Shanghai, Shenzhen, SME board, ChiNext]
        if security_item['id'] in ['index_sh_000001', 'index_sz_399106', 'index_sz_399005',
                                   'index_sz_399006']:
            df = get_kdata(security_item=security_item)
            df = df[df['turnoverRate'].isna() | df['tCap'].isna() | df['mCap'].isna() |
                    df['pe'].isna()]
            if not df.empty:
                dates = df.index.strftime('%Y-%m-%d').tolist()
                if dates:
                    process_crawl(StockSummarySpider, {"security_item": security_item,
                                                       "the_dates": dates})
def crawl_usa_stock_data():
    # crawl the stock list
    process_crawl(AmericaListSpider, {})

    # crawl the kdata
    for _, security_item in get_security_list(security_type='stock', exchanges=['nasdaq'],
                                              codes=US_STOCK_CODES).iterrows():
        process_crawl(AmericaStockKdataSpider, {"security_item": security_item})
def legacy_kdata_to_csv():
    for index, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            dir = get_kdata_dir_old(security_item, fuquan)

            if os.path.exists(dir):
                files = [os.path.join(dir, f) for f in os.listdir(dir) if
                         ('all' not in f and 'json' in f and os.path.isfile(os.path.join(dir, f)))]

                for f in files:
                    tmp = os.path.basename(f).split('_')
                    if fuquan:
                        csv_path = get_kdata_path(security_item, tmp[0], tmp[1], 'hfq')
                        if not os.path.exists(csv_path):
                            df = pd.read_json(f, dtype={'code': str})
                            logger.info("{} to {}".format(f, csv_path))

                            df = df.loc[:, ['timestamp', 'code', 'low', 'open', 'close', 'high',
                                            'volume', 'turnover', 'securityId', 'fuquan']]
                            df.columns = KDATA_COLUMN_FQ
                            df.to_csv(csv_path, index=False)
                    else:
                        csv_path = get_kdata_path(security_item, tmp[0], tmp[1], 'bfq')
                        if not os.path.exists(csv_path):
                            df = pd.read_json(f, dtype={'code': str})
                            logger.info("{} to {}".format(f, csv_path))

                            df = df.loc[:, KDATA_COLUMN]
                            df.to_csv(csv_path, index=False)
def stock_kdata_to_es(start='000001', end='666666', codes=US_STOCK_CODES, force=False):
    for _, security_item in get_security_list(start=start, end=end,
                                              exchanges=['sh', 'sz', 'nasdaq'],
                                              codes=codes).iterrows():
        # create the index
        index_name = get_es_kdata_index(security_item['type'], security_item['exchange'])
        es_index_mapping(index_name, StockKData)

        start_date = None
        if not force:
            query = {"term": {"securityId": security_item["id"]}}
            latest_record = es_get_latest_record(index_name, query=query)
            logger.info("latest_record:{}".format(latest_record))
            if latest_record:
                start_date = latest_record['timestamp']

        actions = []
        for _, kdata_item in get_kdata(security_item, start_date=start_date).iterrows():
            if start_date and is_same_date(start_date, kdata_item['timestamp']):
                continue
            try:
                id = '{}_{}'.format(kdata_item['securityId'], kdata_item['timestamp'])
                kdata = StockKData(meta={'id': id}, id=id)
                kdata.meta['index'] = index_name
                fill_doc_type(kdata, json.loads(kdata_item.to_json()))
                actions.append(kdata.to_dict(include_meta=True))
            except Exception as e:
                logger.warning("wrong KdataDay:{},error:{}".format(kdata_item, e))

        if actions:
            resp = elasticsearch.helpers.bulk(es, actions)
            logger.info(resp)
def start_requests(self):
    security_item = self.settings.get("security_item")
    if security_item is not None:
        item = security_item
        data_url = self.get_finance_url(item['code'])
        data_path = get_finance_path(item)
        yield Request(url=data_url,
                      meta={'path': data_path, 'item': item},
                      callback=self.download_finance_csv)
    else:
        for _, item in get_security_list(exchanges=['nasdaq']).iterrows():
            data_url = self.get_finance_url(item['code'])
            data_path = get_finance_path(item)
            yield Request(url=data_url,
                          meta={'path': data_path, 'item': item},
                          callback=self.download_finance_csv)
def income_statement_to_es(force=False):
    es_index_mapping('income_statement', IncomeStatement)

    for _, security_item in get_security_list().iterrows():
        try:
            start_date = None
            if not force:
                query = {"term": {"securityId": security_item["id"]}}
                latest_record = es_get_latest_record(index='income_statement',
                                                     time_field='reportDate',
                                                     query=query)
                logger.info("latest_record:{}".format(latest_record))
                if latest_record:
                    start_date = latest_record['reportDate']

            actions = []
            for json_object in get_income_statement_items(security_item, start_date=start_date):
                if start_date and is_same_date(start_date, json_object['reportDate']):
                    continue

                income_statement = IncomeStatement(meta={'id': json_object['id']})
                fill_doc_type(income_statement, json_object)
                actions.append(income_statement.to_dict(include_meta=True))

            if actions:
                resp = elasticsearch.helpers.bulk(es, actions)
                logger.info(resp)
        except Exception as e:
            logger.warning("wrong IncomeStatement:{},error:{}".format(security_item, e))
def check_convert_result():
    for index, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if os.path.exists(dayk_path):
                df_result = pd.read_csv(dayk_path)

                if fuquan == 'hfq':
                    df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_FQ)
                else:
                    df = pd.DataFrame(columns=data_contract.KDATA_COLUMN)

                dir = get_kdata_dir(security_item, fuquan=fuquan)
                if os.path.exists(dir):
                    files = [os.path.join(dir, f) for f in os.listdir(dir) if
                             ('day' not in f and 'csv' in f and
                              os.path.isfile(os.path.join(dir, f)))]
                    for f in files:
                        # DataFrame.append was removed in pandas 2.0; concat keeps the behavior
                        df = pd.concat([df, pd.read_csv(f)], ignore_index=True)

                    assert_df(df, df_result)
                    logger.info("{} merge as one ok".format(security_item['code']))
def cash_flow_statement_to_es(force=False):
    es_index_mapping('cash_flow_statement', CashFlowStatement)

    for _, security_item in get_security_list().iterrows():
        try:
            start_date = None
            if not force:
                query = {"term": {"securityId": security_item["id"]}}
                latest_record = es_get_latest_record(index='cash_flow_statement',
                                                     time_field='reportDate',
                                                     query=query)
                logger.info("latest_record:{}".format(latest_record))
                if latest_record:
                    start_date = latest_record['reportDate']

            actions = []
            for json_object in get_cash_flow_statement_items(security_item, start_date=start_date):
                if start_date and is_same_date(start_date, json_object['reportDate']):
                    continue

                cash_flow_statement = CashFlowStatement(meta={'id': json_object['id']})
                fill_doc_type(cash_flow_statement, json_object)
                actions.append(cash_flow_statement.to_dict(include_meta=True))

            if actions:
                resp = elasticsearch.helpers.bulk(es, actions)
                logger.info(resp)
        except Exception as e:
            logger.warning("wrong CashFlowStatement:{},error:{}".format(security_item, e))
def usa_stock_finance_to_es(force=False):
    es_index_mapping('finance_summary', FinanceSummary)

    for _, security_item in get_security_list(exchanges=['nasdaq'],
                                              codes=US_STOCK_CODES).iterrows():
        try:
            start_date = None
            if not force:
                query = {"term": {"securityId": security_item["id"]}}
                latest_record = es_get_latest_record(index='finance_summary',
                                                     time_field='reportDate',
                                                     query=query)
                logger.info("latest_record:{}".format(latest_record))
                if latest_record:
                    start_date = latest_record['reportDate']

            actions = []
            for _, json_object in get_finance_summary_items(security_item,
                                                            start_date=start_date).iterrows():
                if start_date and is_same_date(start_date, json_object['reportDate']):
                    continue

                finance_summary = FinanceSummary(meta={'id': json_object['id']})
                fill_doc_type(finance_summary, json_object.to_dict())
                actions.append(finance_summary.to_dict(include_meta=True))

            if actions:
                resp = elasticsearch.helpers.bulk(es, actions)
                logger.info(resp)
        except Exception as e:
            logger.warning("wrong FinanceSummary:{},error:{}".format(security_item, e))
def stock_meta_to_es(force=False):
    es_index_mapping('stock_meta', StockMeta)

    start_date = None
    if not force:
        latest_record = es_get_latest_record('stock_meta', time_field='listDate')
        logger.info("latest_record:{}".format(latest_record))
        if latest_record:
            start_date = latest_record['listDate']

    actions = []
    for _, item in get_security_list(mode='es', start_date=start_date,
                                     exchanges=EXCHANGE_LIST_COL).iterrows():
        if start_date and is_same_date(start_date, item['listDate']):
            continue
        try:
            stock_meta = StockMeta(meta={'id': item['id']})
            fill_doc_type(stock_meta, json.loads(item.to_json()))
            actions.append(stock_meta.to_dict(include_meta=True))
        except Exception as e:
            logger.warning("wrong SecurityItem:{},error:{}".format(item, e))

    if actions:
        resp = elasticsearch.helpers.bulk(es, actions)
        logger.info(resp)
def stock_kdata_to_es(start='000001', end='666666', force=False):
    for _, security_item in get_security_list(start=start, end=end).iterrows():
        # create the index
        index_name = get_es_kdata_index(security_item['type'], security_item['exchange'])
        es_index_mapping(index_name, StockKData)

        start_date = None
        if not force:
            query = {"term": {"securityId": security_item["id"]}}
            latest_record = es_get_latest_record(index_name, query=query)
            logger.info("latest_record:{}".format(latest_record))
            if latest_record:
                start_date = latest_record['timestamp']

        actions = []
        for _, kdata_item in get_kdata(security_item, start_date=start_date).iterrows():
            if start_date and is_same_date(start_date, kdata_item['timestamp']):
                continue
            try:
                id = '{}_{}'.format(kdata_item['securityId'], kdata_item['timestamp'])
                kdata = StockKData(meta={'id': id}, id=id)
                fill_doc_type(kdata, json.loads(kdata_item.to_json()))
                actions.append(kdata.to_dict(include_meta=True))
            except Exception as e:
                logger.warning("wrong KdataDay:{},error:{}".format(kdata_item, e))

        if actions:
            resp = elasticsearch.helpers.bulk(es, actions)
            logger.info(resp)
def remove_old_kdata():
    for index, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            dir = get_kdata_dir_old(security_item, fuquan)
            if os.path.exists(dir):
                if fuquan:
                    logger.info("remove {}".format(dir))
                    shutil.rmtree(dir)
def start_requests(self):
    item = self.settings.get("security_item")
    if item is not None:
        for request in self.yield_request(item):
            yield request
    else:
        for _, item in get_security_list().iterrows():
            for request in self.yield_request(item):
                yield request
def kdata_to_kafka(fuquan):
    for _, security_item in get_security_list().iterrows():
        for _, kdata_item in get_kdata(security_item, source='sina', fuquan=fuquan).iterrows():
            the_json = kdata_item.to_json(force_ascii=False)
            producer.send(get_kafka_kdata_topic(security_item['id'], fuquan),
                          bytes(the_json, encoding='utf8'),
                          timestamp_ms=int(datetime.datetime.strptime(kdata_item['timestamp'],
                                                                      TIME_FORMAT_DAY).timestamp()))
            logger.debug("kdata_to_kafka {}".format(the_json))
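
# A minimal sketch of the wiring kdata_to_kafka and tick_to_kafka assume: a
# module-level kafka-python producer. The broker address below is a placeholder,
# not the project's actual configuration.
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')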
def start_requests(self):
    security_item = self.settings.get("security_item")
    if security_item is not None:
        for request in self.yield_request(security_item):
            yield request
    else:
        for _, item in get_security_list().iterrows():
            for request in self.yield_request(item):
                yield request
def start_requests(self):
    for _, item in get_security_list().iterrows():
        url = self.get_forecast_url(item['code'])
        yield Request(url=url,
                      headers=DEFAULT_KDATA_HEADER,
                      meta={'item': item},
                      callback=self.download_forecast_data)
def remove_old_tick():
    for index, security_item in get_security_list().iterrows():
        dir = get_tick_dir(security_item)
        if os.path.exists(dir):
            files = [os.path.join(dir, f) for f in os.listdir(dir) if
                     ('xls' in f and 'lock' not in f and 'error' not in f
                      and os.path.isfile(os.path.join(dir, f)))]
            for f in files:
                logger.info("remove {}".format(f))
                os.remove(f)
def start_requests(self):
    item = self.settings.get("security_item")
    trading_dates = self.settings.get("trading_dates")
    if item is not None:
        for request in self.yield_request(item, trading_dates):
            yield request
    else:
        for _, item in get_security_list().iterrows():
            for request in self.yield_request(item):
                yield request
def start_requests(self):
    item = self.settings.get("security_item")
    if item is not None:
        for request in self.yield_request(item):
            yield request
    else:
        for _, item in get_security_list(start=STOCK_START_CODE, end=STOCK_END_CODE).iterrows():
            for request in self.yield_request(item):
                yield request
def start_requests(self):
    security_item = self.settings.get("security_item")
    finance_type = self.settings.get("report_type")
    if security_item is not None:
        for request in self.yield_request(security_item, finance_type):
            yield request
    else:
        for _, item in get_security_list().iterrows():
            for request in self.yield_request(item):
                yield request
def init_env():
    if not os.path.exists(FOOLTRADER_STORE_PATH):
        print("{} is a wrong path".format(FOOLTRADER_STORE_PATH))
        print("please set env FOOLTRADER_STORE_PATH to working path or set it in settings.py")
    else:
        # initialize the stock directories
        for _, item in get_security_list(exchanges=EXCHANGE_LIST_COL).iterrows():
            mkdir_for_security(item)

        # initialize the index directories
        for _, item in get_security_list(security_type='index',
                                         exchanges=['sh', 'sz', 'nasdaq']).iterrows():
            kdata_dir = get_kdata_dir(item)
            if not os.path.exists(kdata_dir):
                os.makedirs(kdata_dir)
def tick_to_kafka():
    for _, security_item in get_security_list().iterrows():
        for df in get_ticks(security_item):
            for _, tick_item in df.iterrows():
                the_json = tick_item.to_json(force_ascii=False)
                producer.send(get_kafka_tick_topic(security_item['id']),
                              bytes(the_json, encoding='utf8'),
                              timestamp_ms=int(datetime.datetime.strptime(tick_item['timestamp'],
                                                                          TIME_FORMAT_SEC).timestamp()))
                logger.debug("tick_to_kafka {}".format(the_json))
def crawl_finance_data(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE):
    for _, security_item in get_security_list(start=start_code, end=end_code).iterrows():
        try:
            # crawl the report events first; later steps depend on them
            process_crawl(StockFinanceReportEventSpider, {"security_item": security_item})

            current_report_date = get_report_date()

            # balance sheet
            path = get_balance_sheet_path(security_item)
            if not os.path.exists(path):
                process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                   "report_type": "balance_sheet"})
            else:
                for balance_sheet_item in get_balance_sheet_items(security_item):
                    # the current report period has not been crawled yet
                    if balance_sheet_item['reportDate'] != current_report_date:
                        # and the report has already been published
                        df = event.get_finance_report_event(security_item, index='reportDate')
                        if current_report_date in df.index:
                            process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                               "report_type": "balance_sheet"})
                    break

            # income statement
            path = get_income_statement_path(security_item)
            if not os.path.exists(path):
                process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                   "report_type": "income_statement"})
            else:
                for income_statement_item in get_income_statement_items(security_item):
                    if income_statement_item['reportDate'] != current_report_date:
                        # the report has already been published
                        df = event.get_finance_report_event(security_item, index='reportDate')
                        if current_report_date in df.index:
                            process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                               "report_type": "income_statement"})
                    break

            # cash flow statement
            path = get_cash_flow_statement_path(security_item)
            if not os.path.exists(path):
                process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                   "report_type": "cash_flow"})
            else:
                for cash_flow_statement_item in get_cash_flow_statement_items(security_item):
                    if cash_flow_statement_item['reportDate'] != current_report_date:
                        # the report has already been published
                        df = event.get_finance_report_event(security_item, index='reportDate')
                        if current_report_date in df.index:
                            process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                               "report_type": "cash_flow"})
                    break
        except Exception as e:
            logger.error(e)
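
# Usage sketch (assumption, not in the original source; the code range here is
# hypothetical): refresh the three financial statements for part of the market.
# Codes outside [start_code, end_code] are filtered out by get_security_list.
if __name__ == '__main__':
    crawl_finance_data(start_code='000001', end_code='002000')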
def start_requests(self):
    item = self.settings.get("security_item")
    start_date = self.settings.get("start_date")
    end_date = self.settings.get("end_date")
    if item is not None:
        for request in self.yield_request(item, start_date, end_date):
            yield request
    else:
        for _, item in get_security_list().iterrows():
            for request in self.yield_request(item):
                yield request
def kdata_to_es(start=None, end=None, security_type='stock', exchanges=['sh', 'sz'], force=False):
    if security_type == 'stock':
        doc_type = StockKData
    elif security_type == 'index':
        doc_type = IndexKData
    elif security_type == 'cryptocurrency':
        doc_type = CryptoCurrencyKData

    for _, security_item in get_security_list(security_type=security_type, exchanges=exchanges,
                                              start=start, end=end).iterrows():
        # create the index
        index_name = get_es_kdata_index(security_item['type'], security_item['exchange'])
        es_index_mapping(index_name, doc_type)

        start_date = None
        if not force:
            query = {"term": {"securityId": security_item["id"]}}
            latest_record = es_get_latest_record(index_name, query=query)
            logger.info("latest_record:{}".format(latest_record))
            if latest_record:
                start_date = latest_record['timestamp']

        actions = []
        df_kdata = get_kdata(security_item, start_date=start_date)
        for _, kdata_item in df_kdata.iterrows():
            if start_date and is_same_date(start_date, kdata_item['timestamp']):
                continue
            try:
                id = '{}_{}'.format(kdata_item['securityId'], kdata_item['timestamp'])
                kdata = doc_type(meta={'id': id}, id=id)
                kdata.meta['index'] = index_name
                kdata_json = json.loads(kdata_item.to_json())
                fill_doc_type(kdata, kdata_json)
                actions.append(kdata.to_dict(include_meta=True))
            except Exception as e:
                logger.warning("wrong KdataDay:{},error:{}".format(kdata_item, e))

        if actions:
            resp = elasticsearch.helpers.bulk(es_client, actions)
            logger.info(resp)
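
# Usage sketch (assumption, not in the original source): sync day-k data into
# Elasticsearch incrementally (resuming from the latest indexed record), then
# force a full re-sync for the indices.
if __name__ == '__main__':
    kdata_to_es(security_type='stock', exchanges=['sh', 'sz'])
    kdata_to_es(security_type='index', exchanges=['sh', 'sz'], force=True)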
def forecast_event_to_csv():
    for index, security_item in get_security_list().iterrows():
        the_path = get_forecast_event_path(security_item)
        if os.path.exists(the_path):
            df = pd.read_json(the_path)
            df = df.rename(columns={'reportDate': 'timestamp'})
            df = df.loc[:, EVENT_STOCK_FINANCE_FORECAST_COL]
            df.to_csv(get_finance_forecast_event_path(security_item), index=False)
            logger.info("transform {} forecast event".format(security_item['code']))
            os.remove(the_path)
def start_requests(self):
    self.category_type = self.settings.get("category_type")
    self.sh_df = get_security_list(exchanges=['sh'])
    self.sz_df = get_security_list(exchanges=['sz'])
    self.file_lock = threading.RLock()

    # clear the stale category data
    self.sh_df[self.category_type] = None
    self.sz_df[self.category_type] = None

    if self.category_type == 'sinaIndustry':
        url = 'http://vip.stock.finance.sina.com.cn/q/view/newSinaHy.php'
    elif self.category_type == 'sinaConcept':
        url = 'http://money.finance.sina.com.cn/q/view/newFLJK.php?param=class'
    elif self.category_type == 'sinaArea':
        url = 'http://money.finance.sina.com.cn/q/view/newFLJK.php?param=area'
    else:
        return

    yield Request(url=url, callback=self.download_sina_category)
def check_result():
    for index, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if not os.path.exists(dayk_path):
                logger.warning(get_security_dir(security_item))

        dir = get_tick_dir(security_item)
        if os.path.exists(dir):
            files = [os.path.join(dir, f) for f in os.listdir(dir) if
                     ('csv' in f and os.path.isfile(os.path.join(dir, f)))]
            if not files:
                logger.warning(get_security_dir(security_item))
def forecast_event_to_es():
    for _, security_item in get_security_list().iterrows():
        # create the index
        index_name = get_es_forecast_event_index(security_item['id'])
        es_index_mapping(index_name, ForecastEvent)

        for json_object in get_forecast_items(security_item):
            try:
                forecast_event = ForecastEvent(meta={'id': json_object['id']})
                fill_doc_type(forecast_event, json_object)
                forecast_event.save()
            except Exception as e:
                logger.warning("wrong ForecastEvent:{},error:{}".format(json_object, e))
def start_requests(self):
    # Two modes:
    # 1) item and trading_dates unspecified: full download of the data
    # 2) both specified: used to repair the data
    item = self.settings.get("security_item")
    trading_dates = self.settings.get("trading_dates")
    fuquan = self.settings.get("fuquan")
    if item is not None:
        for request in self.yield_request(item, trading_dates, fuquan):
            yield request
    else:
        for _, item in get_security_list().iterrows():
            for request in self.yield_request(item):
                yield request
def handle_error_tick():
    for index, security_item in get_security_list().iterrows():
        dir = get_tick_dir(security_item)
        if os.path.exists(dir):
            files = [os.path.join(dir, f) for f in os.listdir(dir) if
                     (('fatal' in f or 'error' in f) and os.path.isfile(os.path.join(dir, f)))]
            for f in files:
                try:
                    the_date = get_file_name(f)
                    csv_path = get_tick_path(security_item, the_date)
                    if not os.path.exists(csv_path):
                        logger.info("{} to {}".format(f, csv_path))
                        sina_tick_to_csv(security_item, f, the_date)
                except Exception as e:
                    logger.warning(e)
                    os.rename(f, f + ".fatal")
def start_requests(self):
    for _, item in get_security_list().iterrows():
        for fuquan in ['hfq', 'bfq']:
            data_path = get_kdata_path(item, fuquan=fuquan, source='ths')
            data_exist = os.path.isfile(data_path)

            # 'or True' deliberately forces a re-download even when the file exists
            if not data_exist or True:
                # get day k data
                if fuquan == 'hfq':
                    flag = 2
                else:
                    flag = 0
                url = self.get_k_data_url(item['code'], flag)
                yield Request(url=url,
                              headers=TONGHUASHUN_KDATA_HEADER,
                              meta={'path': data_path, 'item': item, 'fuquan': fuquan},
                              callback=self.download_day_k_data)
            else:
                self.logger.info("{} kdata existed".format(item['code']))
def start_requests(self):
    item = self.settings.get("security_item")
    start_date = self.settings.get("start_date")
    end_date = self.settings.get("end_date")

    today = pd.Timestamp.today()

    the_years = None
    if start_date and end_date:
        if (today - start_date).days > 5:
            the_years = range(start_date.year, end_date.year + 1)

    if item is not None and the_years:
        for request in self.yield_request(item, the_years):
            yield request
    else:
        for _, item in get_security_list(exchanges=['nasdaq'], codes=US_STOCK_CODES).iterrows():
            for request in self.yield_request(item):
                yield request
def remove_old_163_trading_dates():
    for index, security_item in get_security_list().iterrows():
        the_path = get_trading_dates_path_163(security_item)
        if os.path.exists(the_path):
            logger.info("remove {}".format(the_path))
            os.remove(the_path)
def __init__(self, name=None, **kwargs):
    super().__init__(name, **kwargs)
    self.sh_df = get_security_list(exchanges=['sh'])
    self.sz_df = get_security_list(exchanges=['sz'])
    self.file_lock = threading.RLock()