def crawl_index_quote():
    """Crawl daily k-data for every index, then backfill market-summary fields.

    For each index in the security list:
      1. Incrementally fetch daily k-data from source '163' (skipped when the
         latest downloaded date is already past today).
      2. For the four market-wide indices (Shanghai composite, SZ composite,
         SME board, ChiNext) re-crawl summary data for any dates whose
         turnoverRate/tCap/mCap/pe fields are still missing.
    """
    for _, security_item in get_security_list(security_type='index').iterrows():
        # Crawl daily k-data.
        logger.info("{} get index kdata start".format(security_item['code']))
        start_date, _ = get_latest_download_trading_date(security_item, source='163')
        end_date = pd.Timestamp.today()
        if start_date > end_date:
            # Already up to date — nothing newer than today to fetch.
            logger.info("{} kdata is ok".format(security_item['code']))
        else:
            process_crawl(StockKdata163Spider, {"security_item": security_item,
                                                "start_date": start_date,
                                                "end_date": end_date})
        logger.info("{} get index kdata from 163 end".format(security_item['code']))

        # Fetch market-summary data for [Shanghai, Shenzhen, SME board, ChiNext].
        if security_item['id'] in ['index_sh_000001', 'index_sz_399106', 'index_sz_399005',
                                   'index_sz_399006']:
            df = get_kdata(security_item=security_item)
            # Keep only rows where any summary field is still missing.
            df = df[df['turnoverRate'].isna() | df['tCap'].isna() | df['mCap'].isna() | df[
                'pe'].isna()]
            if not df.empty:
                dates = df.index.strftime('%Y-%m-%d').tolist()
                if dates:
                    process_crawl(StockSummarySpider, {"security_item": security_item,
                                                       "the_dates": dates})
def crawl_stock_meta():
    """Refresh the China A-share stock list.

    TODO: decide whether detecting new IPOs is necessary; for now the full
    list is re-crawled once a day, which is cheap enough.
    """
    # The original wrapped this in a dead `if True:` block; the conditional
    # added nothing and has been removed.
    logger.info('download stock list start')
    process_crawl(ChinaStockListSpider, {})
    logger.info('download stock list finish')
def crawl_usa_stock_data():
    """Crawl the US (NASDAQ) stock list, then daily k-data for each configured code."""
    # Refresh the stock list first so the security list below is current.
    process_crawl(AmericaListSpider, {})

    us_stocks = get_security_list(security_type='stock',
                                  exchanges=['nasdaq'],
                                  codes=US_STOCK_CODES)
    for _, item in us_stocks.iterrows():
        process_crawl(AmericaStockKdataSpider, {"security_item": item})
def crawl_ine_quote():
    """Crawl INE futures daily k-data.

    Unlike crawl_shfe_quote there is no incremental date filtering here yet:
    the spider is always asked for the full day-kdata set.
    """
    spider_params = {
        'dataType': "day_kdata"
    }
    process_crawl(FutureIneSpider, spider_params)
def crawl_shfe_quote():
    """Crawl SHFE futures daily quotes.

    First pulls the historical archive and this year's trading calendar, then
    incrementally fetches only the trading days whose daily k-data is not yet
    present in the on-disk cache.
    """
    # Historical pull — presumably a no-op for data already cached (TODO confirm).
    process_crawl(FutureShfeSpider, {})
    # Current-year trading calendar, needed to compute the increment below.
    process_crawl(ShfeTradingCalendarSpider, {})

    cache_dir = get_exchange_cache_dir(security_type='future',
                                       exchange='shfe',
                                       the_year=datetime.today().year,
                                       data_type="day_kdata")
    cached_dates = set(os.listdir(cache_dir))
    calendar_dates = get_trading_calendar(security_type='future', exchange='shfe')
    missing_dates = set(calendar_dates) - cached_dates
    process_crawl(FutureShfeSpider, {"trading_dates": missing_dates})
def crawl_stock_quote(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE, crawl_tick=True):
    """Crawl daily k-data for stocks in [start_code, end_code].

    Per stock:
      1. Incrementally fetch daily k-data from source '163'.
      2. For each adjustment mode ('bfq' raw / 'hfq' back-adjusted), fetch from
         sina any trading dates that 163 has but sina does not.
      3. (Disabled) fetch tick data — the sina tick service is gone, see FIXME.

    :param start_code: first stock code (inclusive) to process.
    :param end_code: last stock code (inclusive) to process.
    :param crawl_tick: kept for interface compatibility; tick crawling is
        currently hard-disabled below regardless of this flag.
    """
    # Crawl stock k-lines.
    for _, security_item in get_security_list(start_code=start_code, end_code=end_code).iterrows():
        # Crawl daily k-data.
        logger.info("{} get stock kdata start".format(security_item['code']))
        start_date, _ = get_latest_download_trading_date(security_item, source='163')
        end_date = pd.Timestamp.today()
        if start_date > end_date:
            # Already up to date.
            logger.info("{} stock kdata is ok".format(security_item['code']))
        else:
            process_crawl(StockKdata163Spider, {"security_item": security_item,
                                                "start_date": start_date,
                                                "end_date": end_date})
        logger.info("{} get stock kdata from 163 end".format(security_item['code']))

        # 163 is the reference set of trading dates; sina is synced against it.
        base_dates = set(get_trading_dates(security_item, source='163'))
        for fuquan in ('bfq', 'hfq'):
            sina_dates = set(get_trading_dates(security_item, source='sina', fuquan=fuquan))
            diff_dates = base_dates - sina_dates
            if diff_dates:
                logger.info("{} get {} kdata from sina start".format(security_item['code'], fuquan))
                process_crawl(StockKDataSinaSpider, {"security_item": security_item,
                                                     "trading_dates": diff_dates,
                                                     "fuquan": fuquan})
                logger.info("{} get {} kdata from sina end".format(security_item['code'], fuquan))
            else:
                logger.info("{} {} kdata from sina is ok".format(security_item['code'], fuquan))

        # Crawl tick data.
        # FIXME: the sina tick service is no longer available, so this branch
        # is deliberately dead (`and False`) until a replacement source exists.
        if crawl_tick and False:
            tick_dates = {x for x in base_dates if x >= settings.START_TICK_DATE}
            diff_dates = tick_dates - set(get_available_tick_dates(security_item))
            if diff_dates:
                logger.info("{} get tick start".format(security_item['code']))
                process_crawl(StockTickSpider, {"security_item": security_item,
                                                "trading_dates": diff_dates})
                logger.info("{} get tick end".format(security_item['code']))
            else:
                logger.info("{} tick is ok".format(security_item['code']))
def _crawl_report_if_stale(security_item, current_report_period, report_type,
                           get_path, get_items):
    """Crawl one financial statement if it is missing or behind the current period.

    :param security_item: security row being processed.
    :param current_report_period: latest report period (from get_report_period()).
    :param report_type: spider report_type value, e.g. "balance_sheet".
    :param get_path: callable(security_item) -> on-disk path of the statement.
    :param get_items: callable(security_item) -> stored statement items, ordered
        so that the last item is the most recent report.
    """
    if not os.path.exists(get_path(security_item)):
        # Never crawled — fetch the full statement.
        process_crawl(StockFinanceSpider, {"security_item": security_item,
                                           "report_type": report_type})
    else:
        items = get_items(security_item)
        # Stored data ends before the current reporting period — re-crawl.
        if current_report_period != items[-1]['reportPeriod']:
            process_crawl(StockFinanceSpider, {"security_item": security_item,
                                               "report_type": report_type})


def crawl_finance_data(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE):
    """Crawl financial statements for stocks in [start_code, end_code].

    For each stock: first crawl report events (later crawls depend on them),
    then crawl the balance sheet, income statement, and cash-flow statement,
    each only when missing or stale relative to the current report period.
    Exceptions are logged per stock so one failure does not stop the run.
    The original repeated the same missing-or-stale logic three times; it is
    now factored into _crawl_report_if_stale.
    """
    for _, security_item in get_security_list(start_code=start_code, end_code=end_code).iterrows():
        try:
            # Crawl events first: some later crawls depend on them.
            process_crawl(StockFinanceReportEventSpider, {"security_item": security_item})

            current_report_period = get_report_period()

            # (report_type, path getter, stored-items getter) per statement.
            statements = (
                ("balance_sheet", get_balance_sheet_path, get_balance_sheet_items),
                ("income_statement", get_income_statement_path, get_income_statement_items),
                ("cash_flow", get_cash_flow_statement_path, get_cash_flow_statement_items),
            )
            for report_type, get_path, get_items in statements:
                _crawl_report_if_stale(security_item, current_report_period,
                                       report_type, get_path, get_items)
        except Exception as e:
            logger.exception(e)
def craw_stock_category():
    """Crawl sina stock categorizations: industry, concept, and area.

    NOTE(review): the name is missing an 'l' ("craw" vs "crawl") but is kept
    as-is for caller compatibility.
    """
    for category_type in ('sinaIndustry', 'sinaConcept', 'sinaArea'):
        process_crawl(SinaCategorySpider, {'category_type': category_type})
def crawl_cffex_quote():
    """Crawl CFFEX futures data: daily k-data, then inventory."""
    for data_type in ("day_kdata", "inventory"):
        process_crawl(FutureCffexSpider, {'dataType': data_type})
def crawl_czce_quote():
    """Crawl CZCE futures data: daily k-data, then inventory."""
    for data_type in ("day_kdata", "inventory"):
        process_crawl(FutureCzceSpider, {'dataType': data_type})
def scheduled_job2():
    """Crawl stock forecast events and index them into Elasticsearch.

    NOTE(review): `scheduled_job2` is defined more than once in this file;
    only the last definition survives at import time — confirm which one is
    intended to win.
    """
    # Crawl first, then push the resulting events to ES.
    process_crawl(StockForecastSpider)
    es_connector.finance_event_to_es(event_type='finance_forecast')
# Module wiring: logger, APScheduler background scheduler, the two cron jobs,
# and the script entry point.
logger = logging.getLogger(__name__)

sched = BackgroundScheduler()


# Daily at 18:00: crawl finance data for codes 000001-666666, then index
# sheets and report events into Elasticsearch.
@sched.scheduled_job('cron', hour=18, minute=00)
def scheduled_job1():
    crawl_finance_data('000001', '666666')
    es_connector.finance_sheet_to_es()
    es_connector.finance_event_to_es(event_type='finance_report')


# Daily at 18:10: crawl forecast events and index them.
# NOTE(review): `scheduled_job2` is also defined elsewhere in this file;
# the last definition executed wins at module level — confirm intent.
@sched.scheduled_job('cron', hour=18, minute=10)
def scheduled_job2():
    process_crawl(StockForecastSpider)
    es_connector.finance_event_to_es(event_type='finance_forecast')


if __name__ == '__main__':
    # Run one crawl immediately, then hand off to the scheduler.
    logger.info("start crawling finance data")
    crawl_finance_data('000001', '666666')
    process_crawl(StockForecastSpider)
    logger.info("shed crawling finance data")
    sched.start()
    logger.info("I would crawl finance data at 18:00")
    # HACK: joins the scheduler's private thread to keep the process alive;
    # `_thread` is not public APScheduler API — consider a sleep loop instead.
    sched._thread.join()
def scheduled_job2():
    """Crawl stock forecast events (crawl only — no Elasticsearch indexing here).

    NOTE(review): this shadows earlier `scheduled_job2` definitions in the
    file; the last one defined wins at import time — confirm which variant
    is intended.
    """
    process_crawl(StockForecastSpider)