def crawl_finance_data(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE):
    """Crawl finance report events and the three financial statements for every
    security in [start_code, end_code].

    For each security:
      1. Crawl finance report events first (later steps depend on them).
      2. For each statement type (balance sheet, income statement, cash flow
         statement): crawl it from scratch if its file is missing, otherwise
         re-crawl only when the current report period's report has been
         published but not yet stored.

    :param start_code: first security code to process (inclusive)
    :param end_code: last security code to process (inclusive)
    """
    for _, security_item in get_security_list(start=start_code, end=end_code).iterrows():
        try:
            # 先抓事件,有些后续抓取依赖事件
            # (crawl the report events first; the staleness checks below rely on them)
            process_crawl(StockFinanceReportEventSpider, {"security_item": security_item})
            current_report_date = get_report_date()

            # 资产负债表 (balance sheet)
            _crawl_statement_if_stale(security_item, current_report_date,
                                      get_balance_sheet_path, get_balance_sheet_items,
                                      "balance_sheet")
            # 利润表 (income statement)
            _crawl_statement_if_stale(security_item, current_report_date,
                                      get_income_statement_path, get_income_statement_items,
                                      "income_statement")
            # 现金流量表 (cash flow statement)
            _crawl_statement_if_stale(security_item, current_report_date,
                                      get_cash_flow_statement_path, get_cash_flow_statement_items,
                                      "cash_flow")
        except Exception as e:
            logger.error(e)


def _crawl_statement_if_stale(security_item, current_report_date, path_getter, items_getter,
                              report_type):
    """Crawl one statement type when its file is missing or out of date.

    :param security_item: security row from get_security_list()
    :param current_report_date: the report period currently expected
    :param path_getter: callable(security_item) -> local file path of the statement
    :param items_getter: callable(security_item) -> iterable of stored statement items
    :param report_type: spider report_type string ("balance_sheet" /
        "income_statement" / "cash_flow")
    """
    path = path_getter(security_item)
    if not os.path.exists(path):
        # Nothing stored yet: crawl the whole statement.
        process_crawl(StockFinanceSpider, {"security_item": security_item,
                                           "report_type": report_type})
        return
    for statement_item in items_getter(security_item):
        # 当前报告期还没抓取 (the current report period has not been crawled yet)
        if statement_item['reportDate'] != current_report_date:
            # 报告出来了 (the report has been published)
            df = event.get_finance_report_event(security_item, index='reportDate')
            if current_report_date in df.index:
                process_crawl(StockFinanceSpider, {"security_item": security_item,
                                                   "report_type": report_type})
        # NOTE(review): only the first stored item is ever inspected — the loop
        # breaks unconditionally, preserving the original behavior; confirm the
        # items are ordered newest-first.
        break
def finance_event_to_es(event_type='finance_forecast', start_code=None, end_code=None, force=False):
    """Export finance events of the given type to Elasticsearch.

    :param event_type: 'finance_forecast' or 'finance_report'
    :param start_code: first security code to process, or None for no lower bound
    :param end_code: last security code to process, or None for no upper bound
    :param force: passed through to df_to_es to force re-indexing
    :raises ValueError: if event_type is not a supported type (previously this
        fell through and raised a confusing NameError inside the loop)
    """
    if event_type == 'finance_forecast':
        doc_type = FinanceForecastEvent
    elif event_type == 'finance_report':
        doc_type = FinanceReportEvent
    else:
        raise ValueError("unsupported event_type: {}".format(event_type))

    for _, security_item in get_security_list(start_code=start_code, end_code=end_code).iterrows():
        if event_type == 'finance_forecast':
            df = get_finance_forecast_event(security_item)
        else:
            df = get_finance_report_event(security_item)
        df_to_es(df, doc_type=doc_type, security_item=security_item, force=force)
def download_fi_report_event_data(self, response):
    """Scrapy callback: parse Sina's finance-report page and append new report
    events to the security's finance_report event CSV.

    :param response: scrapy Response; meta carries 'item' (security row) and
        'period_type' (used to derive the report period from the title)
    """
    security_item = response.meta['item']
    period_type = response.meta['period_type']
    path = get_event_path(security_item, event='finance_report')
    df = event.get_finance_report_event(security_item, index='reportEventDate')
    try:
        report_event_dates = response.xpath(
            '//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
        report_event_dates = [date.strip() for date in report_event_dates if date.strip()]
        report_contents = response.xpath(
            '//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()
        for i, tr in enumerate(report_contents):
            href = Selector(text=tr).xpath('//@href').extract()[0]
            title = Selector(text=tr).xpath('//text()').extract()[0]
            report_period = self.report_period_from_title(title, period_type,
                                                          report_event_dates[i])
            # 如果最新的事件已经抓取,直接返回
            # (if the newest event is already stored, nothing new — return)
            if i == 0 and not df.empty:
                latest = pd.Timestamp(report_event_dates[0]).date()
                # pandas removed Index.contains in 1.0; membership via `in`
                if latest in df.index and (df.loc[latest, 'title'] == title):
                    self.logger.info(
                        "{} {} report has been the latest".format(security_item['code'],
                                                                  report_period))
                    return
            df = df.append({
                "id": "{}_{}_{}".format(security_item['id'], report_event_dates[i],
                                        report_period),
                "securityId": security_item['id'],
                "reportEventDate": report_event_dates[i],
                "url": "http://vip.stock.finance.sina.com.cn" + href,
                "title": title,
                "reportDate": report_period}, ignore_index=True)
        if not df.empty:
            df = df.drop_duplicates(subset=['id', 'title'], keep='last')
            df = index_df_with_time(df, index='reportEventDate')
            df.to_csv(path, index=False)
    except Exception as e:
        # Fixed copy-pasted message that wrongly said "k data".
        self.logger.error(
            'error when getting finance report event url={} error={}'.format(response.url, e))
def finance_event_to_es(event_type='finance_forecast', start_code=None, end_code=None, force=False):
    """Export finance events of the given type to Elasticsearch, restricting the
    ES lookup to the security when not forcing a full re-index.

    :param event_type: 'finance_forecast' or 'finance_report'
    :param start_code: first security code to process, or None for no lower bound
    :param end_code: last security code to process, or None for no upper bound
    :param force: when True, skip the per-security term query and re-index
    :raises ValueError: if event_type is not a supported type (previously this
        fell through and raised a confusing NameError inside the loop)
    """
    if event_type == 'finance_forecast':
        doc_type = FinanceForecastEvent
    elif event_type == 'finance_report':
        doc_type = FinanceReportEvent
    else:
        raise ValueError("unsupported event_type: {}".format(event_type))

    for _, security_item in get_security_list(start_code=start_code, end_code=end_code).iterrows():
        # Only scope the ES query to this security when not forcing.
        query = None
        if not force:
            query = {"term": {"securityId": security_item["id"]}}

        if event_type == 'finance_forecast':
            df = get_finance_forecast_event(security_item)
        else:
            df = get_finance_report_event(security_item)
        df_to_es(df, doc_type=doc_type, query=query, force=force)
def download_fi_report_event_data(self, response):
    """Scrapy callback that parses the Sina finance-report listing and appends
    any not-yet-stored report events to the finance_report event CSV.

    :param response: scrapy Response; meta carries 'item' (security row) and
        'period_type' (used when deriving the report period from a title)
    """
    security_item = response.meta['item']
    period_type = response.meta['period_type']
    path = get_event_path(security_item, event='finance_report')
    df = event.get_finance_report_event(security_item, index='reportEventDate')
    try:
        report_event_dates = response.xpath(
            '//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
        report_event_dates = [
            date.strip() for date in report_event_dates if date.strip()
        ]
        report_contents = response.xpath(
            '//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()
        for i, tr in enumerate(report_contents):
            href = Selector(text=tr).xpath('//@href').extract()[0]
            title = Selector(text=tr).xpath('//text()').extract()[0]
            report_period = self.report_period_from_title(
                title, period_type, report_event_dates[i])
            # 如果最新的事件已经抓取,直接返回
            # (return early when the newest event is already stored)
            if i == 0:
                if not df.empty:
                    latest = pd.Timestamp(report_event_dates[0]).date()
                    # pandas removed Index.contains in 1.0; use the `in` operator
                    if latest in df.index and (df.loc[latest, 'title'] == title):
                        self.logger.info(
                            "{} {} report has been the latest".format(
                                security_item['code'], report_period))
                        return
            df = df.append(
                {
                    "id": "{}_{}_{}".format(security_item['id'],
                                            report_event_dates[i],
                                            report_period),
                    "securityId": security_item['id'],
                    "reportEventDate": report_event_dates[i],
                    "url": "http://vip.stock.finance.sina.com.cn" + href,
                    "title": title,
                    "reportDate": report_period
                },
                ignore_index=True)
        if not df.empty:
            df = df.drop_duplicates(subset=['id', 'title'], keep='last')
            df = index_df_with_time(df, index='reportEventDate')
            df.to_csv(path, index=False)
    except Exception as e:
        # Fixed copy-pasted message that wrongly said "k data".
        self.logger.error(
            'error when getting finance report event url={} error={}'.format(
                response.url, e))