def crawl_index_quote():
    for _, security_item in get_security_list(security_type='index').iterrows():
        # crawl daily kdata
        logger.info("{} get index kdata start".format(security_item['code']))

        start_date, _ = get_latest_download_trading_date(security_item, source='163')
        end_date = pd.Timestamp.today()
        if start_date > end_date:
            logger.info("{} kdata is ok".format(security_item['code']))
        else:
            process_crawl(StockKdata163Spider, {"security_item": security_item,
                                                "start_date": start_date,
                                                "end_date": end_date})

        logger.info("{} get index kdata from 163 end".format(security_item['code']))

        # fetch market summary data [Shanghai, Shenzhen, SME board, ChiNext]
        if security_item['id'] in ['index_sh_000001', 'index_sz_399106', 'index_sz_399005', 'index_sz_399006']:
            # if security_item['id'] in ['index_sz_399106', 'index_sz_399005', 'index_sz_399006']:
            df = get_kdata(security_item=security_item)
            df = df[df['turnoverRate'].isna() | df['tCap'].isna() |
                    df['mCap'].isna() | df['pe'].isna()]
            if not df.empty:
                dates = df.index.strftime('%Y-%m-%d').tolist()
                # if security_item['id'] == 'index_sz_399106':
                # dates = [the_date for the_date in dates if
                #          pd.Timestamp(the_date).date().year >= 2018]
                if dates:
                    process_crawl(StockSummarySpider, {"security_item": security_item,
                                                       "the_dates": dates})
def crawl_stock_meta():
    # refresh the stock list
    # TODO: check whether we need to detect newly listed stocks; fetching the
    # full list once a day is good enough for now
    logger.info('download stock list start')
    process_crawl(ChinaStockListSpider, {})
    logger.info('download stock list finish')
def crawl_usa_stock_data():
    # crawl the stock list
    process_crawl(AmericaListSpider, {})
    # crawl the kdata
    for _, security_item in get_security_list(security_type='stock', exchanges=['nasdaq'],
                                              codes=US_STOCK_CODES).iterrows():
        process_crawl(AmericaStockKdataSpider, {"security_item": security_item})
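
# `process_crawl` itself is not shown in these examples; judging from the
# spider classes passed to it, it presumably runs a scrapy spider to
# completion. A minimal sketch under that assumption (the project's real
# implementation may differ); note that scrapy's Twisted reactor cannot be
# restarted, so calling this repeatedly in one process would need a
# subprocess or a CrawlerRunner-based variant:
def process_crawl_sketch(spider_cls, spider_kwargs):
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_cls, **spider_kwargs)
    process.start()  # blocks until the crawl finishes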
def crawl_ine_quote():
    # first crawl the historical data for past years
    # process_crawl(FutureShfeSpider, {})
    # crawl this year's trading calendar
    # process_crawl(ShfeTradingCalendarSpider, {})
    # incremental crawl
    # cache_dir = get_exchange_cache_dir(security_type='future', exchange='shfe', the_year=datetime.today().year,
    #                                    data_type="day_kdata")

    # saved_kdata_dates = os.listdir(cache_dir)
    # trading_dates = get_trading_calendar(security_type='future', exchange='shfe')

    # the_dates = set(trading_dates) - set(saved_kdata_dates)

    process_crawl(
        FutureIneSpider,
        {
            # "trading_dates": the_dates,
            'dataType': "day_kdata"
        })
def crawl_shfe_quote():
    # first crawl the historical data for past years
    process_crawl(FutureShfeSpider, {})
    # crawl this year's trading calendar
    process_crawl(ShfeTradingCalendarSpider, {})
    # incremental crawl: only fetch trading days not yet in the cache
    cache_dir = get_exchange_cache_dir(security_type='future',
                                       exchange='shfe',
                                       the_year=datetime.today().year,
                                       data_type="day_kdata")

    saved_kdata_dates = os.listdir(cache_dir)
    trading_dates = get_trading_calendar(security_type='future',
                                         exchange='shfe')

    the_dates = set(trading_dates) - set(saved_kdata_dates)

    process_crawl(FutureShfeSpider, {"trading_dates": the_dates})
def crawl_stock_quote(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE, crawl_tick=True):
    # crawl stock kdata
    for _, security_item in get_security_list(start_code=start_code, end_code=end_code).iterrows():
        # crawl daily kdata
        logger.info("{} get stock kdata start".format(security_item['code']))

        start_date, _ = get_latest_download_trading_date(security_item, source='163')
        end_date = pd.Timestamp.today()
        if start_date > end_date:
            logger.info("{} stock kdata is ok".format(security_item['code']))
        else:
            process_crawl(StockKdata163Spider, {"security_item": security_item,
                                                "start_date": start_date,
                                                "end_date": end_date})

        logger.info("{} get stock kdata from 163 end".format(security_item['code']))

        base_dates = set(get_trading_dates(security_item, source='163'))
        # fuquan: 'bfq' = unadjusted prices, 'hfq' = backward-adjusted prices
        for fuquan in ('bfq', 'hfq'):
            sina_dates = set(get_trading_dates(security_item, source='sina', fuquan=fuquan))
            diff_dates = base_dates - sina_dates
            if diff_dates:
                logger.info("{} get {} kdata from sina start".format(security_item['code'], fuquan))
                process_crawl(StockKDataSinaSpider, {"security_item": security_item,
                                                     "trading_dates": diff_dates,
                                                     "fuquan": fuquan})
                logger.info("{} get {} kdata from sina end".format(security_item['code'], fuquan))
            else:
                logger.info("{} {} kdata from sina is ok".format(security_item['code'], fuquan))

        # crawl tick data
        # FIXME: Sina no longer provides this service, so tick crawling is disabled
        if crawl_tick and False:
            tick_dates = {x for x in base_dates if x >= settings.START_TICK_DATE}
            diff_dates = tick_dates - set(get_available_tick_dates(security_item))

            if diff_dates:
                logger.info("{} get tick start".format(security_item['code']))
                process_crawl(StockTickSpider, {"security_item": security_item,
                                                "trading_dates": diff_dates})
                logger.info("{} get tick end".format(security_item['code']))
            else:
                logger.info("{} tick is ok".format(security_item['code']))
def crawl_finance_data(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE):
    for _, security_item in get_security_list(start_code=start_code, end_code=end_code).iterrows():
        try:
            # crawl report events first; some later crawls depend on them
            process_crawl(StockFinanceReportEventSpider, {"security_item": security_item})

            current_report_period = get_report_period()

            # balance sheet
            path = get_balance_sheet_path(security_item)
            if not os.path.exists(path):
                process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                   "report_type": "balance_sheet"})
            else:
                current_items = get_balance_sheet_items(security_item)
                # the current report period has not been crawled yet
                if current_report_period != current_items[-1]['reportPeriod']:
                    # the report has been published
                    # df = event.get_finance_report_event(security_item, index='reportPeriod')
                    # if current_report_period in df.index:
                    process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                       "report_type": "balance_sheet"})

            # income statement
            path = get_income_statement_path(security_item)
            if not os.path.exists(path):
                process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                   "report_type": "income_statement"})
            else:
                current_items = get_income_statement_items(security_item)
                # the current report period has not been crawled yet
                if current_report_period != current_items[-1]['reportPeriod']:
                    # the report has been published
                    # df = event.get_finance_report_event(security_item, index='reportPeriod')
                    # if current_report_period in df.index:
                    process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                       "report_type": "income_statement"})

            # cash flow statement
            path = get_cash_flow_statement_path(security_item)
            if not os.path.exists(path):
                process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                   "report_type": "cash_flow"})
            else:
                current_items = get_cash_flow_statement_items(security_item)
                # the current report period has not been crawled yet
                if current_report_period != current_items[-1]['reportPeriod']:
                    # the report has been published
                    # df = event.get_finance_report_event(security_item, index='reportPeriod')
                    # if current_report_period in df.index:
                    process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                       "report_type": "cash_flow"})
        except Exception as e:
            logger.exception(e)
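
# `get_report_period` is not shown either; from its use above it presumably
# returns the most recent quarterly report period, to be compared with the
# last saved item's 'reportPeriod'. A hedged sketch of that idea:
def latest_report_period_sketch(today=None):
    # e.g. latest_report_period_sketch(date(2018, 5, 1)) -> '2018-03-31'
    from datetime import date

    today = today or date.today()
    # quarter ends are 03-31, 06-30, 09-30 and 12-31; last year's 12-31
    # covers January through March of the current year
    quarter_ends = [date(today.year - 1, 12, 31),
                    date(today.year, 3, 31),
                    date(today.year, 6, 30),
                    date(today.year, 9, 30),
                    date(today.year, 12, 31)]
    return max(d for d in quarter_ends if d <= today).isoformat()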
def crawl_stock_category():
    process_crawl(SinaCategorySpider, {'category_type': 'sinaIndustry'})
    process_crawl(SinaCategorySpider, {'category_type': 'sinaConcept'})
    process_crawl(SinaCategorySpider, {'category_type': 'sinaArea'})
def crawl_cffex_quote():
    process_crawl(FutureCffexSpider, {'dataType': "day_kdata"})
    process_crawl(FutureCffexSpider, {'dataType': "inventory"})
def crawl_czce_quote():
    process_crawl(FutureCzceSpider, {'dataType': "day_kdata"})
    process_crawl(FutureCzceSpider, {'dataType': "inventory"})
import logging

from apscheduler.schedulers.background import BackgroundScheduler

logger = logging.getLogger(__name__)

sched = BackgroundScheduler()


@sched.scheduled_job('cron', hour=18, minute=0)
def scheduled_job1():
    crawl_finance_data('000001', '666666')
    es_connector.finance_sheet_to_es()
    es_connector.finance_event_to_es(event_type='finance_report')


@sched.scheduled_job('cron', hour=18, minute=10)
def scheduled_job2():
    process_crawl(StockForecastSpider)
    es_connector.finance_event_to_es(event_type='finance_forecast')


if __name__ == '__main__':
    logger.info("start crawling finance data")

    crawl_finance_data('000001', '666666')
    process_crawl(StockForecastSpider)

    logger.info("shed crawling finance data")

    sched.start()

    logger.info("I would crawl finance data at 18:00")
    sched._thread.join()
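
# Note: `sched._thread.join()` above keeps the main thread alive by reaching
# into a private attribute. With apscheduler, a BlockingScheduler achieves
# the same without touching internals; a minimal alternative sketch:
#
#   from apscheduler.schedulers.blocking import BlockingScheduler
#
#   sched = BlockingScheduler()
#   # ...register the same cron jobs...
#   sched.start()  # blocks the main thread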