예제 #1
0
def _crawl_statement_if_stale(security_item, report_type, path, get_items,
                              current_report_date):
    """Crawl one statement type when its local data is missing or outdated.

    Args:
        security_item: the security row handed to the spider.
        report_type: value sent to the spider as ``report_type``.
        path: local file path of the already stored statement.
        get_items: callable returning the stored statement items for
            ``security_item``; only called when ``path`` exists.
        current_report_date: value compared against the stored ``reportDate``.
    """
    crawl_args = {"security_item": security_item, "report_type": report_type}

    # Nothing stored yet: crawl unconditionally.
    if not os.path.exists(path):
        process_crawl(StockFinanceSpider, crawl_args)
        return

    # Only the first stored item is inspected; no stored items means no crawl.
    first_item = next(iter(get_items(security_item)), None)
    if first_item is None:
        return

    # The current report period has already been crawled.
    if first_item['reportDate'] == current_report_date:
        return

    # Re-crawl only once the report event for the current period exists.
    df = event.get_finance_report_event(security_item, index='reportDate')
    if current_report_date in df.index:
        process_crawl(StockFinanceSpider, crawl_args)


def crawl_finance_data(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE):
    """Crawl report events and the three financial statements for each security.

    Iterates over ``get_security_list(start=start_code, end=end_code)``. For
    every security the finance report events are crawled first, then the
    balance sheet, income statement and cash flow statement are crawled when
    their local file is missing or its first item is not for the current
    report date while a report event for that date exists.

    An exception raised while handling one security is logged together with
    its traceback and processing continues with the next security.

    Args:
        start_code: forwarded to ``get_security_list`` as ``start``.
        end_code: forwarded to ``get_security_list`` as ``end``.
    """
    statements = (
        ("balance_sheet", get_balance_sheet_path, get_balance_sheet_items),
        ("income_statement", get_income_statement_path, get_income_statement_items),
        ("cash_flow", get_cash_flow_statement_path, get_cash_flow_statement_items),
    )

    for _, security_item in get_security_list(start=start_code, end=end_code).iterrows():
        try:
            # Crawl the events first; some of the later crawls depend on them.
            process_crawl(StockFinanceReportEventSpider, {"security_item": security_item})

            current_report_date = get_report_date()

            for report_type, get_path, get_items in statements:
                _crawl_statement_if_stale(security_item,
                                          report_type,
                                          get_path(security_item),
                                          get_items,
                                          current_report_date)
        except Exception as e:
            # Keep going with the next security, but keep the traceback in the log.
            logger.exception(e)
예제 #2
0
def finance_event_to_es(event_type='finance_forecast',
                        start_code=None,
                        end_code=None,
                        force=False):
    """Push finance events of every security to Elasticsearch via ``df_to_es``.

    Args:
        event_type: ``'finance_forecast'`` or ``'finance_report'``; selects
            both the document type and the function that loads the events.
        start_code: forwarded to ``get_security_list`` as ``start_code``.
        end_code: forwarded to ``get_security_list`` as ``end_code``.
        force: forwarded unchanged to ``df_to_es``.

    Raises:
        ValueError: if ``event_type`` is not one of the supported values.
    """
    # Resolve the event type once, up front, so an unsupported value fails
    # immediately instead of leaving doc_type / df unbound inside the loop.
    if event_type == 'finance_forecast':
        doc_type = FinanceForecastEvent
        load_events = get_finance_forecast_event
    elif event_type == 'finance_report':
        doc_type = FinanceReportEvent
        load_events = get_finance_report_event
    else:
        raise ValueError(
            "unsupported event_type {!r}; expected 'finance_forecast' or "
            "'finance_report'".format(event_type))

    securities = get_security_list(start_code=start_code, end_code=end_code)
    for _, security_item in securities.iterrows():
        df_to_es(load_events(security_item),
                 doc_type=doc_type,
                 security_item=security_item,
                 force=force)
    def download_fi_report_event_data(self, response):
        """Parse the finance report list in ``response`` and store it as CSV.

        Reads the security and period type from ``response.meta``, extracts
        the event dates and report links below the ``con02-7`` element, and
        builds one row per link. The rows are added to the already stored
        events, de-duplicated on ``id`` and ``title`` (keeping the last),
        indexed through ``index_df_with_time`` and written to the event path
        of the security.

        Returns early without writing when the first listed event is already
        stored with the same title. Any exception raised while parsing or
        writing is logged and swallowed.

        Args:
            response: crawler response whose ``meta`` holds ``item`` and
                ``period_type``.
        """
        security_item = response.meta['item']
        period_type = response.meta['period_type']

        path = get_event_path(security_item, event='finance_report')

        df = event.get_finance_report_event(security_item, index='reportEventDate')

        try:
            report_event_dates = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
            report_event_dates = [date.strip() for date in report_event_dates if date.strip()]

            report_contents = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

            # Collect the rows and concatenate once: DataFrame.append is gone
            # in current pandas and copied the whole frame on every call.
            new_rows = []
            for i, anchor_html in enumerate(report_contents):
                anchor = Selector(text=anchor_html)
                href = anchor.xpath('//@href').extract()[0]
                title = anchor.xpath('//text()').extract()[0]
                # A link without a matching date raises IndexError, which is
                # handled by the except clause below (nothing is written).
                event_date = report_event_dates[i]
                report_period = self.report_period_from_title(title, period_type, event_date)

                # If the latest event has already been crawled, return directly.
                if i == 0 and not df.empty:
                    latest = pd.Timestamp(event_date).date()
                    # `in` replaces the removed Index.contains().
                    if latest in df.index and (df.loc[latest, 'title'] == title):
                        self.logger.info(
                            "{} {} report has been the latest".format(security_item['code'], report_period))
                        return

                new_rows.append({
                    "id": "{}_{}_{}".format(security_item['id'], event_date, report_period),
                    "securityId": security_item['id'],
                    "reportEventDate": event_date,
                    "url": "http://vip.stock.finance.sina.com.cn" + href,
                    "title": title,
                    "reportDate": report_period})

            if new_rows:
                df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

            if not df.empty:
                df = df.drop_duplicates(subset=['id', 'title'], keep='last')
                df = index_df_with_time(df, index='reportEventDate')
                df.to_csv(path, index=False)
        except Exception as e:
            self.logger.error(
                'error when getting finance report event data url={} error={}'.format(response.url, e))
예제 #4
0
def finance_event_to_es(event_type='finance_forecast',
                        start_code=None,
                        end_code=None,
                        force=False):
    """Push finance events of every security to Elasticsearch via ``df_to_es``.

    Unless ``force`` is set, a term query on the security's ``securityId`` is
    passed along to ``df_to_es``; with ``force`` the query is ``None``.

    Args:
        event_type: ``'finance_forecast'`` or ``'finance_report'``; selects
            both the document type and the function that loads the events.
        start_code: forwarded to ``get_security_list`` as ``start_code``.
        end_code: forwarded to ``get_security_list`` as ``end_code``.
        force: forwarded to ``df_to_es``; also disables the per-security query.

    Raises:
        ValueError: if ``event_type`` is not one of the supported values.
    """
    # Resolve the event type once, up front, so an unsupported value fails
    # immediately instead of leaving doc_type / df unbound inside the loop.
    if event_type == 'finance_forecast':
        doc_type = FinanceForecastEvent
        load_events = get_finance_forecast_event
    elif event_type == 'finance_report':
        doc_type = FinanceReportEvent
        load_events = get_finance_report_event
    else:
        raise ValueError(
            "unsupported event_type {!r}; expected 'finance_forecast' or "
            "'finance_report'".format(event_type))

    securities = get_security_list(start_code=start_code, end_code=end_code)
    for _, security_item in securities.iterrows():
        if force:
            query = None
        else:
            query = {"term": {"securityId": security_item["id"]}}

        df_to_es(load_events(security_item),
                 doc_type=doc_type,
                 query=query,
                 force=force)
예제 #5
0
    def download_fi_report_event_data(self, response):
        """Parse the finance report list in ``response`` and store it as CSV.

        Reads the security and period type from ``response.meta``, extracts
        the event dates and report links below the ``con02-7`` element, and
        builds one row per link. The rows are added to the already stored
        events, de-duplicated on ``id`` and ``title`` (keeping the last),
        indexed through ``index_df_with_time`` and written to the event path
        of the security.

        Returns early without writing when the first listed event is already
        stored with the same title. Any exception raised while parsing or
        writing is logged and swallowed.

        Args:
            response: crawler response whose ``meta`` holds ``item`` and
                ``period_type``.
        """
        security_item = response.meta['item']
        period_type = response.meta['period_type']

        path = get_event_path(security_item, event='finance_report')

        df = event.get_finance_report_event(security_item,
                                            index='reportEventDate')

        try:
            report_event_dates = response.xpath(
                '//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
            report_event_dates = [
                date.strip() for date in report_event_dates if date.strip()
            ]

            report_contents = response.xpath(
                '//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

            # Collect the rows and concatenate once: DataFrame.append is gone
            # in current pandas and copied the whole frame on every call.
            pending_rows = []
            for i, link_html in enumerate(report_contents):
                link = Selector(text=link_html)
                href = link.xpath('//@href').extract()[0]
                title = link.xpath('//text()').extract()[0]
                # A link without a matching date raises IndexError, which is
                # handled by the except clause below (nothing is written).
                event_date = report_event_dates[i]
                report_period = self.report_period_from_title(
                    title, period_type, event_date)

                # If the latest event has already been crawled, return directly.
                if i == 0 and not df.empty:
                    latest = pd.Timestamp(event_date).date()
                    # `in` replaces the removed Index.contains().
                    already_stored = (latest in df.index
                                      and df.loc[latest, 'title'] == title)
                    if already_stored:
                        self.logger.info(
                            "{} {} report has been the latest".format(
                                security_item['code'], report_period))
                        return

                pending_rows.append({
                    "id":
                    "{}_{}_{}".format(security_item['id'], event_date,
                                      report_period),
                    "securityId":
                    security_item['id'],
                    "reportEventDate":
                    event_date,
                    "url":
                    "http://vip.stock.finance.sina.com.cn" + href,
                    "title":
                    title,
                    "reportDate":
                    report_period
                })

            if pending_rows:
                df = pd.concat([df, pd.DataFrame(pending_rows)],
                               ignore_index=True)

            if not df.empty:
                df = df.drop_duplicates(subset=['id', 'title'], keep='last')
                df = index_df_with_time(df, index='reportEventDate')
                df.to_csv(path, index=False)
        except Exception as e:
            self.logger.error(
                'error when getting finance report event data url={} error={}'.
                format(response.url, e))