def download_forecast_data(self, response):
        """
        Parse the finance forecast table of a response and store it as csv.

        Every row of the ``dataTable`` element except the first one is turned
        into one forecast record; the records are de-duplicated, restricted to
        ``EVENT_STOCK_FINANCE_FORECAST_COL`` and written to the path returned
        by ``get_finance_forecast_event_path``.

        Parameters
        ----------
        response : scrapy response
            ``response.meta['item']`` must hold the security item

        Returns
        -------
        None
            any error is logged, nothing is written in that case

        """
        security_item = response.meta['item']
        trs = response.xpath('//*[@id="dataTable"]//tr').extract()

        forecast_jsons = []

        try:
            # the first row is skipped (presumably the table header)
            for tr in trs[1:]:
                tds = Selector(text=tr).xpath('//td//text()').extract()
                tds = [x.strip() for x in tds if x.strip()]

                # convert the performance change string(s) to float ratios;
                # a range is written as "start~end"
                change_str = tds[7]
                change_start = None

                if '~' in change_str:
                    change_start, _, change = change_str.partition('~')
                else:
                    change = change_str

                if change:
                    change = float(change.strip('%')) / 100
                if change_start:
                    change_start = float(change_start.strip('%')) / 100

                # preEPS may be empty / non-numeric
                try:
                    preEPS = float(tds[6])
                except ValueError:
                    preEPS = None

                forecast_jsons.append({
                    "securityId": security_item['id'],
                    "timestamp": tds[3],
                    "reportPeriod": tds[4],
                    "type": tds[2],
                    "description": tds[5],
                    "preEPS": preEPS,
                    "changeStart": change_start,
                    "change": change,
                })

            if forecast_jsons:
                df = pd.DataFrame(forecast_jsons)
                df = df.drop_duplicates()
                # column selection must go through .loc; df[:, cols] raises
                df = df.loc[:, EVENT_STOCK_FINANCE_FORECAST_COL]
                df = index_df_with_time(df)
                df.to_csv(get_finance_forecast_event_path(security_item),
                          index=False)

        except Exception as e:
            self.logger.exception(
                'error when getting k data url={} error={}'.format(
                    response.url, e))
Exemplo n.º 2
0
def get_finance_report_event(security_item, index='timestamp'):
    """
    Load the stored finance report events of a security.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code

    index : {'timestamp','reportPeriod'} default is 'timestamp'
        the column handed to index_df_with_time as index of the result

    Returns
    -------
    DataFrame
        the events read from the csv file, or an empty DataFrame when the
        file does not exist

    """
    item = to_security_item(security_item)
    csv_path = get_finance_report_event_path(item)

    # nothing stored for this security yet
    if not os.path.exists(csv_path):
        return pd.DataFrame()

    return index_df_with_time(pd.read_csv(csv_path), index=index)
Exemplo n.º 3
0
    def on_tick(self, tick_item):
        """
        Collect the ticks of the current day and trade once at 14:50.

        Ticks are accumulated in ``self.df``. When a tick whose timestamp
        contains "14:50:" arrives, the sum of ``turnover * direction`` over
        the collected ticks decides the action: buy on a positive sum when no
        position is held, sell on a negative sum when a position is held.
        After that no further ticks are processed for the day.

        Parameters
        ----------
        tick_item : dict-like
            must provide 'timestamp', 'securityId', 'price', 'turnover' and
            'direction' (presumably direction is signed, e.g. +1 buy /
            -1 sell -- TODO confirm)

        """
        # intraday only, ultra short-term: a tick of another day resets state
        if not is_same_date(self.current_time, tick_item['timestamp']):
            self.today_traded = False
            if not self.df.empty:
                self.df = pd.DataFrame()
            return

        if self.today_traded:
            return

        if not self.df.empty and self.df.index.size == 1:
            self.df = index_df_with_time(self.df)
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        self.df = pd.concat([self.df, pd.DataFrame([tick_item])],
                            ignore_index=True)

        # at 14:50, compute the money flow of the day
        if "14:50:" not in tick_item['timestamp']:
            return

        money_flow = (self.df['turnover'] * self.df['direction']).sum()
        security_id = tick_item['securityId']

        order = None
        if money_flow > 0:
            # net inflow: buy only when nothing is held yet
            if not self.account_service.get_position(security_id):
                order = self.buy
        elif money_flow < 0:
            # net outflow: sell only when something is held
            if self.account_service.get_position(security_id):
                order = self.sell

        if order is not None:
            # use the backward-adjusted price
            factor = get_kdata(
                security_id,
                the_date=pd.Timestamp(tick_item['timestamp']).date())['factor']
            order(security_id, current_price=tick_item['price'] * factor)
        self.today_traded = True
Exemplo n.º 4
0
def get_finance_report_event(security_item, index='reportEventDate'):
    """
    Read the finance report events stored for a security.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code

    index : {'reportEventDate','reportDate'} default is 'reportEventDate'
        the column handed to index_df_with_time as index of the result

    Returns
    -------
    DataFrame
        the events read from the csv file, or an empty DataFrame when the
        file does not exist

    """
    csv_path = get_event_path(security_item, event='finance_report')

    # no event file has been written for this security
    if not os.path.exists(csv_path):
        return pd.DataFrame()

    events = pd.read_csv(csv_path)
    return index_df_with_time(events, index=index)
Exemplo n.º 5
0
    def on_tick(self, tick_item):
        """
        Collect the ticks of the current day and trade once at 14:50.

        Ticks are accumulated in ``self.df``. When a tick whose timestamp
        contains "14:50:" arrives, the sum of ``turnover * direction`` over
        the collected ticks decides the action: buy on a positive sum when no
        position is held, sell on a negative sum when a position is held.
        After that no further ticks are processed for the day.

        Parameters
        ----------
        tick_item : dict-like
            must provide 'timestamp', 'securityId', 'price', 'turnover' and
            'direction' (presumably direction is signed, e.g. +1 buy /
            -1 sell -- TODO confirm)

        """
        # intraday only, ultra short-term: a tick of another day resets state
        if not is_same_date(self.current_time, tick_item['timestamp']):
            self.today_traded = False
            if not self.df.empty:
                self.df = pd.DataFrame()
            return

        if self.today_traded:
            return

        if not self.df.empty and self.df.index.size == 1:
            self.df = index_df_with_time(self.df)
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        self.df = pd.concat([self.df, pd.DataFrame([tick_item])],
                            ignore_index=True)

        # at 14:50, compute the money flow of the day
        if "14:50:" not in tick_item['timestamp']:
            return

        money_flow = (self.df['turnover'] * self.df['direction']).sum()
        security_id = tick_item['securityId']

        order = None
        if money_flow > 0:
            # net inflow: buy only when nothing is held yet
            if not self.account_service.get_position(security_id):
                order = self.buy
        elif money_flow < 0:
            # net outflow: sell only when something is held
            if self.account_service.get_position(security_id):
                order = self.sell

        if order is not None:
            # use the backward-adjusted price
            factor = get_kdata(
                security_id,
                the_date=pd.Timestamp(tick_item['timestamp']).date())['factor']
            order(security_id, current_price=tick_item['price'] * factor)
        self.today_traded = True
Exemplo n.º 6
0
def get_finance_report_event(security_item, index='reportEventDate'):
    """
    Return the stored finance report events of a security as a DataFrame.

    The csv file located by get_event_path(..., event='finance_report') is
    read and passed through index_df_with_time with the given ``index``;
    when the file does not exist an empty DataFrame is returned.
    """
    event_file = get_event_path(security_item, event='finance_report')

    if not os.path.exists(event_file):
        return pd.DataFrame()

    loaded = pd.read_csv(event_file)
    return index_df_with_time(loaded, index=index)
Exemplo n.º 7
0
    def download_sp500_pe(self, response):
        """
        Parse the PE table of a response and add its rows to ``self.df_pe``.

        Every row of the ``datatable`` element except the first one yields a
        record with 'timestamp' (first cell, via to_time_str) and 'pe'
        (second cell, via to_float). Errors are logged with their traceback
        and leave ``self.df_pe`` unchanged.
        """
        trs = response.xpath('//*[@id="datatable"]/tr').extract()

        price_jsons = []

        try:
            # the first row is skipped (presumably the table header)
            for tr in trs[1:]:
                tds = Selector(text=tr).xpath('//td//text()').extract()
                tds = [x.strip() for x in tds if x.strip()]

                price_jsons.append({"timestamp": to_time_str(tds[0]),
                                    "pe": to_float(tds[1])})

            if price_jsons:
                # DataFrame.append was removed in pandas 2.0; concat is the
                # equivalent
                self.df_pe = pd.concat(
                    [self.df_pe, pd.DataFrame(price_jsons)],
                    ignore_index=True)
                self.df_pe = index_df_with_time(self.df_pe)
        except Exception as e:
            # exception() keeps the traceback that error() would drop
            self.logger.exception('error when getting sp500 pe url={} error={}'.format(response.url, e))
Exemplo n.º 8
0
    def download_sp500_pe(self, response):
        """
        Parse the PE table of a response and add its rows to ``self.df_pe``.

        Every row of the ``datatable`` element except the first one yields a
        record with 'timestamp' (first cell, via to_time_str) and 'pe'
        (second cell, via to_float). Errors are logged with their traceback
        and leave ``self.df_pe`` unchanged.
        """
        trs = response.xpath('//*[@id="datatable"]/tr').extract()

        price_jsons = []

        try:
            # the first row is skipped (presumably the table header)
            for tr in trs[1:]:
                tds = Selector(text=tr).xpath('//td//text()').extract()
                tds = [x.strip() for x in tds if x.strip()]

                price_jsons.append({"timestamp": to_time_str(tds[0]),
                                    "pe": to_float(tds[1])})

            if price_jsons:
                # DataFrame.append was removed in pandas 2.0; concat is the
                # equivalent
                self.df_pe = pd.concat(
                    [self.df_pe, pd.DataFrame(price_jsons)],
                    ignore_index=True)
                self.df_pe = index_df_with_time(self.df_pe)
        except Exception as e:
            self.logger.exception('error when getting sp500 pe url={} error={}'.format(response.url, e))
Exemplo n.º 9
0
def finance_sheet_to_es(sheet_type=None,
                        start_code=None,
                        end_code=None,
                        force=False):
    """
    Index finance sheets of the selected securities into elasticsearch.

    Parameters
    ----------
    sheet_type : {'balance_sheet','income_statement','cash_flow_statement'} or None
        the sheet to index, None (default) indexes all three
    start_code : str, optional
        passed to get_security_list
    end_code : str, optional
        passed to get_security_list
    force : bool
        passed through to df_to_es

    Raises
    ------
    ValueError
        if sheet_type is not one of the known sheet types

    """
    known_types = ('balance_sheet', 'income_statement', 'cash_flow_statement')
    sheet_types = list(known_types) if sheet_type is None else [sheet_type]

    # fail early and clearly instead of hitting an unbound doc_type later
    for name in sheet_types:
        if name not in known_types:
            raise ValueError("unknown sheet_type: {}".format(name))

    # sheet type -> (es document type, function loading the sheet items)
    handlers = {
        'balance_sheet': (BalanceSheet, get_balance_sheet_items),
        'income_statement': (IncomeStatement, get_income_statement_items),
        'cash_flow_statement': (CashFlowStatement,
                                get_cash_flow_statement_items),
    }

    for sheet_type in sheet_types:
        doc_type, get_items = handlers[sheet_type]

        es_index_mapping(sheet_type, doc_type)

        for _, security_item in get_security_list(
                start_code=start_code, end_code=end_code).iterrows():
            # one failing security must not stop the remaining ones
            try:
                df = pd.DataFrame(get_items(security_item))

                df = index_df_with_time(df, index='reportPeriod')

                df_to_es(df,
                         doc_type=doc_type,
                         timestamp_filed='reportPeriod',
                         security_item=security_item,
                         force=force)
            except Exception:
                # exception() already records the active exception; passing
                # it as an extra positional argument broke log formatting
                logger.exception(
                    "index {} {} failed".format(security_item['code'],
                                                sheet_type))
    def download_finance_csv(self, response):
        """
        Store a downloaded finance csv under ``response.meta['path']``.

        The response is only accepted when its content-type header is exactly
        'text/csv'. The csv columns are renamed, 'code', 'securityId' and an
        'id' of the form "<securityId>_<reportDate>" are added, missing
        values are filled with 0 and the result is written without index.
        Any other (or missing) content type is logged as an error.
        """

        content_type_header = response.headers.get('content-type', None)

        # a missing header is None and must reach the error branch instead of
        # crashing on .decode
        content_type = None
        if content_type_header is not None:
            content_type = content_type_header.decode("utf-8")

        if content_type == 'text/csv':
            path = response.meta['path']
            security_item = response.meta['item']

            df = pd.read_csv(io.BytesIO(response.body), na_values='None')
            df.columns = [
                "reportDate", "shares", "sharesAdjusted", "factor",
                "totalAssets", "totalCurrentAssets", "totalLiabilities",
                "totalCurrentLiabilities", "bookValue", "minorityBookValue",
                "preferredEquity", "goodwill", "longTermBorrowing",
                "operatingRevenue", "netProfit",
                "netProfitAttributedToParentCompanyOwner", "EPS", "dilutedEPS",
                "DPS", "netCashFlowsFromOperatingActivities",
                "netCashFlowsFromInvesting",
                "netCashFlowsFromFinancingActivities", "cashChange",
                "cashAtTheEndOfPeriod", "capitalExpenditures", "price",
                "priceHigh", "priceLow", "ROE", "ROA", "BVPS", "PB", "PE",
                "cumulativeDividendsPerShare", "dividendPayoutRatio",
                "longTermDebtToEquityRatio", "equityToAssetsRatio",
                "netMargin", "assetTurnover", "freeCashFlowPerShare",
                "currentRatio"
            ]

            df['code'] = security_item['code']
            df['securityId'] = security_item['id']
            df['id'] = df[['securityId', 'reportDate'
                           ]].apply(lambda x: '_'.join(x.astype(str)), axis=1)

            df = index_df_with_time(df, index='reportDate')

            df.fillna(0, inplace=True)

            df.to_csv(path, index=False)
        else:
            self.logger.error(
                "get finance csv error:url={} content type={} body={}".format(
                    response.url, content_type_header, response.body))
    def download_fi_report_event_data(self, response):
        """
        Parse finance report announcements and merge them into the event csv.

        The already stored events are loaded first. When the newest
        announcement on the page (first entry) is already stored under the
        same date and title, nothing is written. Otherwise all announcements
        of the page are added, duplicates (same 'id' and 'title') are dropped
        keeping the last one, and the csv is rewritten.

        ``response.meta`` must provide 'item' (the security item) and
        'period_type'. Errors are logged with their traceback.
        """
        security_item = response.meta['item']
        period_type = response.meta['period_type']

        path = get_event_path(security_item, event='finance_report')

        df = event.get_finance_report_event(security_item, index='reportEventDate')

        try:
            report_event_dates = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
            report_event_dates = [date.strip() for date in report_event_dates if date.strip()]

            report_contents = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

            new_rows = []
            for i, tr in enumerate(report_contents):
                link = Selector(text=tr)
                href = link.xpath('//@href').extract()[0]
                title = link.xpath('//text()').extract()[0]
                report_period = self.report_period_from_title(title, period_type, report_event_dates[i])

                # if the latest event has already been fetched, return directly
                if i == 0 and not df.empty:
                    latest = pd.Timestamp(report_event_dates[0]).date()
                    # Index.contains was removed from pandas; `in` is the
                    # documented replacement
                    if latest in df.index:
                        # .loc yields a scalar for one row and a Series for
                        # several rows of the same date; normalise to a Series
                        stored_titles = pd.Series(df.loc[latest, 'title'])
                        if (stored_titles == title).any():
                            self.logger.info(
                                "{} {} report has been the latest".format(security_item['code'], report_period))
                            return

                new_rows.append({
                    "id": "{}_{}_{}".format(security_item['id'], report_event_dates[i], report_period),
                    "securityId": security_item['id'],
                    "reportEventDate": report_event_dates[i],
                    "url": "http://vip.stock.finance.sina.com.cn" + href,
                    "title": title,
                    "reportDate": report_period})

            if new_rows:
                # a single concat replaces the removed per-row DataFrame.append
                df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
            if not df.empty:
                df = df.drop_duplicates(subset=['id', 'title'], keep='last')
                df = index_df_with_time(df, index='reportEventDate')
                df.to_csv(path, index=False)
        except Exception as e:
            # exception() keeps the traceback that error() would drop
            self.logger.exception('error when getting k data url={} error={}'.format(response.url, e))
Exemplo n.º 12
0
    def download_fi_report_event_data(self, response):
        """
        Parse finance report announcements of a response and store them as csv.

        Each announcement link becomes one record with 'securityId',
        'timestamp', 'url', 'title' and 'reportPeriod'. The de-duplicated
        records are written to the path returned by
        get_finance_report_event_path. ``response.meta`` must provide 'item'
        (the security item) and 'period_type'. Errors are logged with their
        traceback and nothing is written in that case.
        """
        security_item = response.meta['item']
        period_type = response.meta['period_type']

        path = get_finance_report_event_path(security_item)

        try:
            report_timestamps = response.xpath(
                '//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
            report_timestamps = [
                date.strip() for date in report_timestamps if date.strip()
            ]

            report_contents = response.xpath(
                '//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

            # collect plain dicts and build the frame once: DataFrame.append
            # was removed in pandas 2.0 and copied the frame on every row
            rows = []
            for i, tr in enumerate(report_contents):
                link = Selector(text=tr)
                href = link.xpath('//@href').extract()[0]
                title = link.xpath('//text()').extract()[0]
                report_period = self.report_period_from_title(
                    title, period_type, report_timestamps[i])

                rows.append({
                    "securityId": security_item['id'],
                    "timestamp": report_timestamps[i],
                    "url": "http://vip.stock.finance.sina.com.cn" + href,
                    "title": title,
                    "reportPeriod": report_period
                })

            if rows:
                df = pd.DataFrame(rows)
                df = df.drop_duplicates()
                df = index_df_with_time(df)
                df.to_csv(path, index=False)
        except Exception:
            self.logger.exception('error when getting k data url={}'.format(
                response.url))
Exemplo n.º 13
0
def get_finance_forecast_event(security_item):
    """
    get forecast items.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code

    Returns
    -------
    DataFrame
        the forecast events read from the csv file, or an empty DataFrame
        when the file does not exist

    """
    security_item = to_security_item(security_item)
    # the path helper must be used here; calling this function itself
    # recursed without end
    path = get_finance_forecast_event_path(security_item)

    if os.path.exists(path):
        df = pd.read_csv(path)
        df = index_df_with_time(df)
    else:
        df = pd.DataFrame()
    return df
Exemplo n.º 14
0
def finance_sheet_to_es(sheet_type='balance_sheet',
                        start_code=None,
                        end_code=None,
                        force=False):
    """
    Index one kind of finance sheet of the selected securities into
    elasticsearch.

    Parameters
    ----------
    sheet_type : {'balance_sheet','income_statement','cash_flow_statement'}
        the sheet to index, default is 'balance_sheet'
    start_code : str, optional
        passed to get_security_list
    end_code : str, optional
        passed to get_security_list
    force : bool
        passed through to df_to_es; when False a term query on the
        security id is passed along as well

    Raises
    ------
    ValueError
        if sheet_type is not one of the known sheet types

    """
    known_types = ('balance_sheet', 'income_statement', 'cash_flow_statement')
    # fail early and clearly instead of hitting an unbound doc_type later
    if sheet_type not in known_types:
        raise ValueError("unknown sheet_type: {}".format(sheet_type))

    # sheet type -> (es document type, function loading the sheet items)
    handlers = {
        'balance_sheet': (BalanceSheet, get_balance_sheet_items),
        'income_statement': (IncomeStatement, get_income_statement_items),
        'cash_flow_statement': (CashFlowStatement,
                                get_cash_flow_statement_items),
    }
    doc_type, get_items = handlers[sheet_type]

    es_index_mapping(sheet_type, doc_type)

    for _, security_item in get_security_list(start_code=start_code,
                                              end_code=end_code).iterrows():
        query = None
        if not force:
            query = {"term": {"securityId": security_item["id"]}}

        df = pd.DataFrame(get_items(security_item))

        df = index_df_with_time(df, index='reportPeriod')

        df_to_es(df,
                 doc_type=doc_type,
                 timestamp_filed='reportPeriod',
                 query=query,
                 force=force)
Exemplo n.º 15
0
    def download_fi_report_event_data(self, response):
        """
        Parse finance report announcements and merge them into the event csv.

        The already stored events are loaded first. When the newest
        announcement on the page (first entry) is already stored under the
        same date and title, nothing is written. Otherwise all announcements
        of the page are added, duplicates (same 'id' and 'title') are dropped
        keeping the last one, and the csv is rewritten.

        ``response.meta`` must provide 'item' (the security item) and
        'period_type'. Errors are logged with their traceback.
        """
        security_item = response.meta['item']
        period_type = response.meta['period_type']

        path = get_event_path(security_item, event='finance_report')

        df = event.get_finance_report_event(security_item,
                                            index='reportEventDate')

        try:
            report_event_dates = response.xpath(
                '//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
            report_event_dates = [
                date.strip() for date in report_event_dates if date.strip()
            ]

            report_contents = response.xpath(
                '//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

            new_rows = []
            for i, tr in enumerate(report_contents):
                link = Selector(text=tr)
                href = link.xpath('//@href').extract()[0]
                title = link.xpath('//text()').extract()[0]
                report_period = self.report_period_from_title(
                    title, period_type, report_event_dates[i])

                # if the latest event has already been fetched, return directly
                if i == 0 and not df.empty:
                    latest = pd.Timestamp(report_event_dates[0]).date()
                    # Index.contains was removed from pandas; `in` is the
                    # documented replacement
                    if latest in df.index:
                        # .loc yields a scalar for one row and a Series for
                        # several rows of the same date; normalise to a Series
                        stored_titles = pd.Series(df.loc[latest, 'title'])
                        if (stored_titles == title).any():
                            self.logger.info(
                                "{} {} report has been the latest".format(
                                    security_item['code'], report_period))
                            return

                new_rows.append({
                    "id":
                    "{}_{}_{}".format(security_item['id'],
                                      report_event_dates[i],
                                      report_period),
                    "securityId":
                    security_item['id'],
                    "reportEventDate":
                    report_event_dates[i],
                    "url":
                    "http://vip.stock.finance.sina.com.cn" + href,
                    "title":
                    title,
                    "reportDate":
                    report_period
                })

            if new_rows:
                # a single concat replaces the removed per-row DataFrame.append
                df = pd.concat([df, pd.DataFrame(new_rows)],
                               ignore_index=True)
            if not df.empty:
                df = df.drop_duplicates(subset=['id', 'title'], keep='last')
                df = index_df_with_time(df, index='reportEventDate')
                df.to_csv(path, index=False)
        except Exception as e:
            # exception() keeps the traceback that error() would drop
            self.logger.exception(
                'error when getting k data url={} error={}'.format(
                    response.url, e))
    def download_finance_csv(self, response):
        """
        Store a downloaded finance csv under ``response.meta['path']``.

        The response is only accepted when its content-type header is exactly
        'text/csv'. The csv columns are renamed, 'code', 'securityId' and an
        'id' of the form "<securityId>_<reportDate>" are added, missing
        values are filled with 0 and the result is written without index.
        Any other (or missing) content type is logged as an error.
        """

        content_type_header = response.headers.get('content-type', None)

        # a missing header is None and must reach the error branch instead of
        # crashing on .decode
        content_type = None
        if content_type_header is not None:
            content_type = content_type_header.decode("utf-8")

        if content_type == 'text/csv':
            path = response.meta['path']
            security_item = response.meta['item']

            df = pd.read_csv(io.BytesIO(response.body), na_values='None')
            df.columns = [
                "reportDate",
                "shares",
                "sharesAdjusted",
                "factor",
                "totalAssets",
                "totalCurrentAssets",
                "totalLiabilities",
                "totalCurrentLiabilities",
                "bookValue",
                "minorityBookValue",
                "preferredEquity",
                "goodwill",
                "longTermBorrowing",
                "operatingRevenue",
                "netProfit",
                "netProfitAttributedToParentCompanyOwner",
                "EPS",
                "dilutedEPS",
                "DPS",
                "netCashFlowsFromOperatingActivities",
                "netCashFlowsFromInvesting",
                "netCashFlowsFromFinancingActivities",
                "cashChange",
                "cashAtTheEndOfPeriod",
                "capitalExpenditures",
                "price",
                "priceHigh",
                "priceLow",
                "ROE",
                "ROA",
                "BVPS",
                "PB",
                "PE",
                "cumulativeDividendsPerShare",
                "dividendPayoutRatio",
                "longTermDebtToEquityRatio",
                "equityToAssetsRatio",
                "netMargin",
                "assetTurnover",
                "freeCashFlowPerShare",
                "currentRatio"]

            df['code'] = security_item['code']
            df['securityId'] = security_item['id']
            df['id'] = df[['securityId', 'reportDate']].apply(lambda x: '_'.join(x.astype(str)), axis=1)

            df = index_df_with_time(df, index='reportDate')

            df.fillna(0, inplace=True)

            df.to_csv(path, index=False)
        else:
            self.logger.error(
                "get finance csv error:url={} content type={} body={}".format(response.url, content_type_header,
                                                                              response.body))