Пример #1
0
    def record(self, entity, start, end, size, timestamps):
        """
        Persist the stock holdings published for one fund.

        Rows of ``finance.FUND_PORTFOLIO_STOCK`` are selected where ``code``
        equals ``entity.code`` and ``pub_date`` is on or after ``start``; the
        result is reshaped and handed to ``df_to_db``.  ``end``, ``size`` and
        ``timestamps`` are accepted but never read in this method.

        :param entity: object providing ``code``; also passed to ``portfolio_relate_stock``
        :param start: inclusive lower bound applied to ``pub_date``
        :return: always ``None``
        """
        table = finance.FUND_PORTFOLIO_STOCK
        q = query(table).filter(table.pub_date >= start).filter(table.code == entity.code)
        df = finance.run_query(q)

        # Nothing came back for this fund: there is nothing to store.
        if not pd_is_not_null(df):
            return None

        # Columns used below (per the sample recorded with this code): id, code,
        # period_end, pub_date, report_type, symbol, name, proportion.
        # `proportion` appears to be a percentage (e.g. 7.09), hence the 0.01 factor.
        df['timestamp'] = pd.to_datetime(df['pub_date'])
        df.rename(columns={'symbol': 'stock_code', 'name': 'stock_name'}, inplace=True)
        df['proportion'] = df['proportion'].mul(0.01)

        df = portfolio_relate_stock(df, entity)

        df['stock_id'] = df['stock_code'].apply(china_stock_code_to_id)

        # The row id joins entity id, stock id, publication date and the source row id.
        id_parts = df[['entity_id', 'stock_id', 'pub_date', 'id']]
        df['id'] = id_parts.apply(lambda row: '_'.join(row.astype(str)), axis=1)

        df['report_date'] = pd.to_datetime(df['period_end'])
        df['report_period'] = df['report_type'].apply(jq_to_report_period)

        df_to_db(df=df,
                 data_schema=self.data_schema,
                 provider=self.provider,
                 force_update=self.force_update)

        self.logger.info(f"persist etf {entity.code} portfolio success")
        return None
    def record(self, entity, start, end, size, timestamps):
        """
        Download the member stocks of a block and store them.

        Pages 1 through 4 of ``self.category_stocks_url`` are requested for
        ``entity.code``.  Each page is decoded with ``demjson`` and every
        entry becomes one row linking the block (``entity``) to a stock; the
        rows of a page are written with ``df_to_db(..., force_update=True)``.
        Paging stops at the first response whose body is ``'null'``.
        ``start``, ``end``, ``size`` and ``timestamps`` are not used.

        Errors raised while decoding or storing a page are logged and the
        loop continues with the next page; errors raised by ``requests.get``
        itself are not caught here.
        """
        for page in range(1, 5):
            resp = requests.get(self.category_stocks_url.format(page, entity.code))
            try:
                if resp.text is None or resp.text == 'null':
                    break
                category_jsons = demjson.decode(resp.text)
                the_list = []
                for category in category_jsons:
                    stock_code = category['code']
                    stock_id = china_stock_code_to_id(stock_code)
                    block_id = entity.id
                    the_list.append({
                        'id': '{}_{}'.format(block_id, stock_id),
                        'entity_id': block_id,
                        'entity_type': 'block',
                        'exchange': entity.exchange,
                        'code': entity.code,
                        'name': entity.name,
                        'timestamp': now_pd_timestamp(),
                        'stock_id': stock_id,
                        'stock_code': stock_code,
                        'stock_name': category['name'],

                    })
                if the_list:
                    df = pd.DataFrame.from_records(the_list)
                    df_to_db(data_schema=self.data_schema, df=df, provider=self.provider,
                             force_update=True)

                self.logger.info('finish recording BlockStock:{},{}'.format(entity.category, entity.name))

            except Exception as e:
                # %s placeholders are required: logging formats `msg % args`, and a
                # message without placeholders makes that formatting fail, so the
                # exception and the response body would never reach the log.
                self.logger.error("error:%s,resp.text:%s", e, resp.text)
            self.sleep()
Пример #3
0
    def record(self, entity, start, end, size, timestamps):
        """
        Download the member stocks of a block with a single request and store them.

        ``self.category_stocks_url`` is formatted with ``entity.code`` and
        ``'1'``; the response text is parsed by ``json_callback_param`` into
        an iterable of comma-separated strings.  Field 1 of each string is
        used as the stock code and field 2 as the stock name.  The resulting
        rows are written with ``df_to_db(..., force_update=True)``.
        ``start``, ``end``, ``size`` and ``timestamps`` are not used.

        Errors raised while parsing or storing are logged rather than
        propagated; errors raised by ``requests.get`` itself are not caught.
        """
        resp = requests.get(self.category_stocks_url.format(entity.code, '1'))
        try:
            results = json_callback_param(resp.text)
            the_list = []
            for result in results:
                items = result.split(',')
                stock_code = items[1]
                stock_id = china_stock_code_to_id(stock_code)
                block_id = entity.id

                the_list.append({
                    'id': '{}_{}'.format(block_id, stock_id),
                    'entity_id': block_id,
                    'entity_type': 'block',
                    'exchange': entity.exchange,
                    'code': entity.code,
                    'name': entity.name,
                    'timestamp': now_pd_timestamp(),
                    'stock_id': stock_id,
                    'stock_code': stock_code,
                    'stock_name': items[2],

                })
            if the_list:
                df = pd.DataFrame.from_records(the_list)
                df_to_db(data_schema=self.data_schema, df=df, provider=self.provider, force_update=True)

            self.logger.info('finish recording block:{},{}'.format(entity.category, entity.name))

        except Exception as e:
            # %s placeholders are required: logging formats `msg % args`, and a
            # message without placeholders makes that formatting fail, so the
            # exception and the response body would never reach the log.
            self.logger.error("error:%s,resp.text:%s", e, resp.text)
        self.sleep()
Пример #4
0
    def fetch_csi_index_component(self):
        """
        Fetch the constituent stocks of the SSE / CSI indices.

        For every row of ``self.all_index`` the row label is passed to
        ``get_index_stocks``; one record per returned stock is written with
        ``df_to_db(..., force_update=True)``, followed by ``self.sleep()``.
        """

        def strip_suffix(symbol):
            # Keep only the part before the first dot.
            return symbol.split(".")[0]

        for _, index_row in self.all_index.iterrows():
            # `.name` on an iterrows() row is the row's label in `self.all_index`,
            # whereas index_row["name"] below reads the column called "name".
            row_label = index_row.name
            index_code = strip_suffix(row_label)
            index_id = f'index_cn_{index_code}'

            members = pd.DataFrame()
            members['stock_code'] = get_index_stocks(row_label)
            members['stock_code'] = members['stock_code'].apply(strip_suffix)

            # Same value on every row of this index.
            constant_columns = (
                ('entity_id', index_id),
                ('entity_type', 'index'),
                ('exchange', 'cn'),
                ('code', index_code),
                ('name', index_row.display_name),
                ('timestamp', now_pd_timestamp()),
            )
            for column, value in constant_columns:
                members[column] = value

            members['stock_id'] = members['stock_code'].apply(
                lambda code: china_stock_code_to_id(str(code)))
            members['id'] = members['stock_id'].apply(
                lambda stock_id: '{}_{}'.format(index_id, stock_id))

            df_to_db(data_schema=self.data_schema,
                     df=members,
                     provider=self.provider,
                     force_update=True)
            self.logger.info(f'{index_row["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
    def format(self, entity, df):
        """
        Reshape a raw fund-portfolio frame into the columns stored for ``entity``.

        Columns read from ``df`` (per the sample recorded with this code):
        ``symbol``, ``name``, ``proportion``, ``period_end``, ``report_type``
        and either ``timestamp`` or the column named by
        ``self.get_original_time_field()``.  ``df`` is modified in place and
        also returned.

        :param entity: object providing ``id``, ``entity_type``, ``exchange``,
            ``code`` and ``name``
        :param df: raw frame to reshape
        :return: the same ``df`` with the derived columns added
        """
        if 'timestamp' not in df.columns:
            df['timestamp'] = pd.to_datetime(df[self.get_original_time_field()])
        elif not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
            # A column dtype is never an instance of `datetime`, so test the
            # dtype itself and convert only when it is not already datetime-like.
            df['timestamp'] = pd.to_datetime(df['timestamp'])

        df.rename(columns={'symbol': 'stock_code', 'name': 'stock_name'}, inplace=True)
        # `proportion` appears to be a percentage in the source data (e.g. 7.09).
        df['proportion'] *= 0.01
        df['stock_id'] = df['stock_code'].apply(lambda x: china_stock_code_to_id(x))
        df['report_date'] = pd.to_datetime(df['period_end'])
        df['report_period'] = df['report_type'].apply(lambda x: jq_to_report_period(x))

        # Entity columns are written after the rename above, so this `name` is
        # the entity's name, not the stock name that was moved to `stock_name`.
        df['entity_id'] = entity.id
        df['provider'] = self.provider.value
        df['entity_type'] = entity.entity_type
        df['exchange'] = entity.exchange
        df['code'] = entity.code
        df['name'] = entity.name

        df['id'] = self.generate_domain_id(entity, df)
        return df
Пример #6
0
    def fetch_szse_index_component(self, df: pd.DataFrame, http_session):
        """
        Fetch the constituent stocks of SZSE indices.

        For each row of ``df`` (columns ``code`` and ``name`` are read) an
        xlsx report is downloaded through ``sync_get``, turned into one record
        per listed stock and written with ``df_to_db``.  Rows for which
        ``sync_get`` returns ``None`` are skipped without sleeping.

        :param df: index list with ``code`` and ``name`` columns
        :param http_session: session object forwarded to ``sync_get``
        """
        query_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'
        renamed_headers = {'证券代码': 'stock_code', '证券简称': 'stock_name'}

        for _, index_row in df.iterrows():
            index_code = index_row['code']

            content = sync_get(http_session, query_url.format(index_code), return_type='content')
            if content is None:
                continue

            # Everything is read as text.
            members = pd.read_excel(io.BytesIO(content), dtype='str')
            members = members.rename(columns=renamed_headers)

            index_id = f'index_cn_{index_code}'
            # Same value on every row of this index.
            constant_columns = (
                ('entity_id', index_id),
                ('entity_type', EntityType.Index.value),
                ('exchange', 'cn'),
                ('code', index_code),
                ('name', index_row['name']),
                ('timestamp', now_pd_timestamp(self.region)),
            )
            for column, value in constant_columns:
                members[column] = value

            members['stock_id'] = members['stock_code'].apply(
                lambda code: china_stock_code_to_id(str(code)))
            members['id'] = members['stock_id'].apply(
                lambda stock_id: '{}_{}'.format(index_id, stock_id))

            df_to_db(df=members,
                     ref_df=None,
                     region=self.region,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{index_row["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
Пример #7
0
    def download_sz_etf_component(self, df: pd.DataFrame, http_session):
        """
        Download the constituents of each ETF in ``df`` via its underlying index.

        ``self.parse_sz_etf_underlying_index(df)`` is called first; afterwards
        every row's ``拟合指数`` value is used to request the constituent page.
        The fourth HTML table of that page is turned into one record per stock
        and written with ``df_to_db``.  Rows with an empty ``拟合指数``, pages
        that cannot be parsed and pages with fewer than four tables are skipped.

        :param df: ETF list; columns ``拟合指数``, ``证券代码`` and ``证券简称`` are read
        :param http_session: session object forwarded to ``request_get``
        """
        query_url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'
        code_header, name_header = '品种代码', '品种名称'

        self.parse_sz_etf_underlying_index(df)

        for _, etf in df.iterrows():
            underlying_index = etf['拟合指数']
            etf_code = etf['证券代码']

            if len(underlying_index) == 0:
                self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
                continue

            response = request_get(http_session, query_url.format(underlying_index))
            response.encoding = 'gbk'

            try:
                tables = pd.read_html(response.text, header=1)
            except ValueError as error:
                self.logger.error(
                    f'HTML parse error: {error}, response: {response.text}')
                continue

            # The constituent list is expected in the fourth table of the page.
            if len(tables) < 4:
                continue

            # Drop every column that contains a missing value, then zero-pad
            # the (integer) codes to six digits.
            table = tables[3].copy().dropna(axis=1, how='any')
            table[code_header] = table[code_header].apply(
                lambda raw: format(raw, '06d'))

            etf_id = f'etf_sz_{etf_code}'
            members = table[[code_header, name_header]].rename(
                columns={code_header: 'stock_code', name_header: 'stock_name'})

            # Same value on every row of this ETF.
            constant_columns = (
                ('entity_id', etf_id),
                ('entity_type', EntityType.ETF.value),
                ('exchange', 'sz'),
                ('code', etf_code),
                ('name', etf['证券简称']),
                ('timestamp', now_pd_timestamp(Region.CHN)),
            )
            for column, value in constant_columns:
                members[column] = value

            members['stock_id'] = members['stock_code'].apply(china_stock_code_to_id)
            members['id'] = members['stock_id'].apply(
                lambda stock_id: '{}_{}'.format(etf_id, stock_id))

            df_to_db(df=members,
                     region=Region.CHN,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')

            self.sleep()
Пример #8
0
    def download_sh_etf_component(self, df: pd.DataFrame, http_session):
        """
        Download the constituents of the ETFs in ``df`` and store them.

        ETF_CLASS => 1. single-market ETF  2. cross-market ETF  3. cross-border ETF
                     5. bond ETF  6. gold ETF
        Only classes ``'1'`` and ``'2'`` are processed.

        For every selected ETF the constituent query is issued through
        ``sync_get``; the ``result`` list of the decoded response becomes one
        record per instrument, written with ``df_to_db``.  ETFs whose request
        returns ``None`` or whose response carries no constituents are skipped.

        :param df: ETF list data; ``ETF_CLASS``, ``FUND_ID`` and ``FUND_NAME`` are read
        :param http_session: session object forwarded to ``sync_get`` and
            ``self.populate_sh_etf_type``
        :return: None
        """
        query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                    'isPagination=false&type={}&etfClass={}'

        etf_df = df[(df['ETF_CLASS'] == '1') | (df['ETF_CLASS'] == '2')]
        etf_df = self.populate_sh_etf_type(etf_df, http_session)

        for _, etf in etf_df.iterrows():
            url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])
            text = sync_get(http_session,
                            url,
                            headers=DEFAULT_SH_ETF_LIST_HEADER,
                            return_type='text')
            if text is None:
                continue
            response_dict = demjson.decode(text)
            response_df = pd.DataFrame(response_dict.get('result', []))

            etf_code = etf['FUND_ID']

            if response_df.empty:
                # An empty frame has no 'instrumentId' / 'instrumentName' columns;
                # selecting them below would raise KeyError and abort the whole
                # loop, so skip this ETF instead.
                self.logger.warning(
                    f'{etf["FUND_NAME"]} - {etf_code} returned no constituents, skipped')
                continue

            etf_id = f'etf_sh_{etf_code}'
            response_df = response_df[['instrumentId',
                                       'instrumentName']].copy()
            response_df.rename(columns={
                'instrumentId': 'stock_code',
                'instrumentName': 'stock_name'
            },
                               inplace=True)

            response_df['entity_id'] = etf_id
            response_df['entity_type'] = EntityType.ETF.value
            response_df['exchange'] = 'sh'
            response_df['code'] = etf_code
            response_df['name'] = etf['FUND_NAME']
            response_df['timestamp'] = now_pd_timestamp(Region.CHN)

            response_df['stock_id'] = response_df['stock_code'].apply(
                lambda code: china_stock_code_to_id(code))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{etf_id}_{x}')

            df_to_db(df=response_df,
                     ref_df=None,
                     region=Region.CHN,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')

            self.sleep()
 def numba_boost_up(category_jsons):
     """
     Build one stock record per entry of ``category_jsons``.

     Each entry must provide ``'code'`` and ``'name'``; the result is a list
     of dicts with ``stock_id``, ``stock_code`` and ``stock_name``.
     """
     return [
         {
             'stock_id': china_stock_code_to_id(entry['code']),
             'stock_code': entry['code'],
             'stock_name': entry['name'],
         }
         for entry in category_jsons
     ]
Пример #10
0
 def numba_boost_up(results):
     """
     Build one stock record per comma-separated string in ``results``.

     Field 1 of each string is used as the stock code and field 2 as the
     stock name; the result is a list of dicts with ``stock_id``,
     ``stock_code`` and ``stock_name``.
     """
     return [
         {
             'stock_id': china_stock_code_to_id(fields[1]),
             'stock_code': fields[1],
             'stock_name': fields[2],
         }
         for fields in (line.split(',') for line in results)
     ]
    def fetch_csi_index_component(self, df: pd.DataFrame, http_session):
        """
        Fetch the constituent stocks of SSE / CSI indices.

        For each row of ``df`` (columns ``code`` and ``name`` are read) the
        constituent xls file is downloaded through ``request_get``, turned
        into one record per stock and written with
        ``df_to_db(..., force_update=True)``.  A ``requests.HTTPError`` is
        logged and the index skipped; other exceptions are not handled here.

        :param df: index list with ``code`` and ``name`` columns
        :param http_session: session object forwarded to ``request_get``
        """
        query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'
        code_header = '成分券代码Constituent Code'
        name_header = '成分券名称Constituent Name'

        for _, index_row in df.iterrows():
            index_code = index_row['code']

            try:
                response = request_get(http_session, query_url.format(index_code))
                response.raise_for_status()
            except requests.HTTPError as error:
                self.logger.error(
                    f'{index_row["name"]} - {index_code} 成分股抓取错误 ({error})')
                continue

            sheet = pd.read_excel(io.BytesIO(response.content))
            members = sheet[[code_header, name_header]].rename(
                columns={code_header: 'stock_code', name_header: 'stock_name'})

            index_id = f'index_cn_{index_code}'
            # Same value on every row of this index.
            constant_columns = (
                ('entity_id', index_id),
                ('entity_type', EntityType.Index.value),
                ('exchange', 'cn'),
                ('code', index_code),
                ('name', index_row['name']),
                ('timestamp', now_pd_timestamp(Region.CHN)),
            )
            for column, value in constant_columns:
                members[column] = value

            members['stock_id'] = members['stock_code'].apply(
                lambda code: china_stock_code_to_id(str(code)))
            members['id'] = members['stock_id'].apply(
                lambda stock_id: '{}_{}'.format(index_id, stock_id))

            df_to_db(df=members,
                     region=Region.CHN,
                     data_schema=self.data_schema,
                     provider=self.provider,
                     force_update=True)
            self.logger.info(f'{index_row["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
    def fetch_cni_index_component(self, df: pd.DataFrame, http_session):
        """
        Fetch the constituent stocks of CNI indices.

        For each row of ``df`` (columns ``code`` and ``name`` are read) the
        constituent xls file is downloaded through ``request_get``.  The stock
        code column is taken from ``样本股代码`` or, when that header is absent,
        from ``证券代码``.  One record per stock is written with ``df_to_db``.
        A ``requests.HTTPError`` is logged and the index skipped; other
        exceptions are not handled here.

        :param df: index list with ``code`` and ``name`` columns
        :param http_session: session object forwarded to ``request_get``
        """
        query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

        for _, index in df.iterrows():
            index_code = index['code']

            url = query_url.format(index_code)

            try:
                response = request_get(http_session, url)
                response.raise_for_status()
            except requests.HTTPError as error:
                self.logger.error(
                    f'{index["name"]} - {index_code} 成分股抓取错误 ({error})')
                continue

            response_df = pd.read_excel(io.BytesIO(response.content),
                                        dtype='str')

            index_id = f'index_cn_{index_code}'

            # `.copy()` so the column assignments below write to an independent
            # frame rather than to a slice of the parsed sheet.
            try:
                response_df = response_df[['样本股代码']].copy()
            except KeyError:
                response_df = response_df[['证券代码']].copy()

            # Rename while the frame still has exactly one column. Assigning a
            # one-element list to `.columns` after further columns have been
            # added raises ValueError (length mismatch).
            response_df.columns = ['stock_code']

            response_df['entity_id'] = index_id
            response_df['entity_type'] = EntityType.Index.value
            response_df['exchange'] = 'cn'
            response_df['code'] = index_code
            response_df['name'] = index['name']
            response_df['timestamp'] = now_pd_timestamp(Region.CHN)

            response_df['stock_id'] = response_df['stock_code'].apply(
                lambda x: china_stock_code_to_id(str(x)))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{index_id}_{x}')

            df_to_db(df=response_df,
                     region=Region.CHN,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
Пример #13
0
    def fetch_csi_index_component(self, df: pd.DataFrame, http_session):
        """
        Fetch the constituent stocks of SSE / CSI indices.

        For each row of ``df`` (columns ``code`` and ``name`` are read) the
        constituent xls file is downloaded through ``sync_get``, turned into
        one record per stock and written with ``df_to_db``.  When ``sync_get``
        returns ``None`` an error is logged and the index is skipped.

        :param df: index list with ``code`` and ``name`` columns
        :param http_session: session object forwarded to ``sync_get``
        """
        query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'
        code_header = '成分券代码Constituent Code'
        name_header = '成分券名称Constituent Name'

        for _, index_row in df.iterrows():
            index_code = index_row['code']

            content = sync_get(http_session, query_url.format(index_code), return_type='content')
            if content is None:
                self.logger.error(f'{index_row["name"]} - {index_code} 成分股抓取错误')
                continue

            sheet = pd.read_excel(io.BytesIO(content))
            members = sheet[[code_header, name_header]].rename(
                columns={code_header: 'stock_code', name_header: 'stock_name'})

            index_id = f'index_cn_{index_code}'
            # Same value on every row of this index.
            constant_columns = (
                ('entity_id', index_id),
                ('entity_type', EntityType.Index.value),
                ('exchange', 'cn'),
                ('code', index_code),
                ('name', index_row['name']),
                ('timestamp', now_pd_timestamp(self.region)),
            )
            for column, value in constant_columns:
                members[column] = value

            members['stock_id'] = members['stock_code'].apply(
                lambda code: china_stock_code_to_id(str(code)))
            members['id'] = members['stock_id'].apply(
                lambda stock_id: '{}_{}'.format(index_id, stock_id))

            df_to_db(df=members,
                     ref_df=None,
                     region=self.region,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{index_row["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
Пример #14
0
    def record(self, entity, start, end, size, timestamps):
        """
        Synchronise the stored constituents of ``entity`` with the list that
        ``get_index_stocks`` returns for a single date.

        Steps visible in this method:

        1. ``start`` is clamped to no earlier than 2008-01-01.
        2. Rows already stored for ``entity.id`` are loaded via
           ``self.data_schema.query_data``.
        3. If stored rows exist and their latest ``timestamp`` is not after
           ``start``, ``start`` is moved forward to the last business day of
           the 24-week window following it, or to "now" when that day lies in
           the future.  In this case ``start`` becomes a string
           (``to_time_str``); otherwise it keeps its original type.
        4. The fetched stock ids are compared with the stored ones and rows
           are written with ``df_to_db`` depending on whether stocks were
           added and/or removed.

        ``end``, ``size`` and ``timestamps`` are not used.

        :return: always ``None``
        """
        if start < to_pd_timestamp("2008-01-01"):
            start = to_pd_timestamp("2008-01-01")
        # Fetch the rows that already exist in the database for this entity.
        data_schema_df = self.data_schema.query_data(entity_id=entity.id)
        if not data_schema_df.empty and data_schema_df.timestamp.max(
        ) <= start:
            from datetime import timedelta
            # bdate_range_date = pd.bdate_range(start, start + timedelta(weeks=1))
            # Business days from `start` up to 24 weeks (4 * 6) later.
            bdate_range_date = pd.bdate_range(start,
                                              start + timedelta(weeks=4 * 6))
            # Use the last business day after `start`, but never a date in the future.
            if bdate_range_date[
                    bdate_range_date > start][-1] > now_pd_timestamp():
                start = to_time_str(now_pd_timestamp())
            else:
                start = to_time_str(
                    bdate_range_date[bdate_range_date > start][-1])

        data = get_index_stocks(to_jq_entity_id(entity), date=start)

        df = pd.DataFrame(data, columns=['code'])
        if pd_is_not_null(df):
            # Split each code on '.': the first part is kept as the stock code,
            # the second part is mapped XSHG -> sh and XSHE -> sz.
            df['stock_code'] = df['code'].apply(lambda x: str(x).split('.')[0])
            df['stock_exchange'] = df['code'].apply(
                lambda x: str(x).split('.')[1])
            df['stock_exchange'] = df['stock_exchange'].replace(
                'XSHG', 'sh').replace('XSHE', 'sz')
            df['stock_id'] = df['stock_code'].apply(
                lambda x: china_stock_code_to_id(x))
            # One StockDetail lookup per row.
            # NOTE(review): `.name[0]` presumably fails when the lookup returns no row - confirm.
            df['stock_name'] = df.stock_id.apply(
                lambda x: StockDetail.query_data(entity_id=x).name[0])
            # df_old: fetched stocks that are already stored (not read again below).
            # df_new: fetched stocks that are not stored yet ("added").
            # out_data: stored stocks missing from the fetched list ("removed").
            # NOTE(review): `data_schema_df.stock_id` assumes the stored frame has a
            # `stock_id` column even when it is empty - confirm against query_data.
            df_old = df[df.stock_id.isin(data_schema_df.stock_id)].copy()
            df_new = df[~df.stock_id.isin(data_schema_df.stock_id)].copy()
            out_data = data_schema_df[~data_schema_df.stock_id.isin(df.stock_id
                                                                    )].copy()
            if df_new.empty and out_data.empty:
                # No stocks added and no stocks removed: only refresh the dates.
                data_schema_df['timestamp'] = pd.to_datetime(start)
                data_schema_df['pub_date'] = data_schema_df['timestamp']
                df_to_db(df=data_schema_df,
                         data_schema=self.data_schema,
                         provider=self.provider,
                         force_update=self.force_update)
                return None
            elif df_new.empty and not out_data.empty:
                # Nothing added, but some stocks were removed.
                data_schema_df['timestamp'] = pd.to_datetime(start)
                data_schema_df['pub_date'] = data_schema_df['timestamp']
                df2 = pd.DataFrame()
                # NOTE(review): DataFrame.append (used below) was removed in pandas 2.0;
                # confirm the pandas version this project pins.
                for index, data_old in data_schema_df.iterrows():
                    schema_details = out_data.query(
                        "id == @data_old.id").copy()
                    if not schema_details.empty:
                        # NOTE(review): this branch re-assigns columns of data_schema_df
                        # (already set before the loop), whereas the "added and removed"
                        # branch below sets schema_details['pub_date'] - confirm which
                        # frame was intended.
                        data_schema_df['timestamp'] = pd.to_datetime(start)
                        # Update the removal date.
                        schema_details['out_date'] = pd.to_datetime(start)
                        data_schema_df['pub_date'] = pd.to_datetime(start)
                        df2 = df2.append(schema_details)
                    else:
                        df2 = df2.append(data_old)
                df_to_db(df=df2,
                         data_schema=self.data_schema,
                         provider=self.provider,
                         force_update=self.force_update)
                return None
            elif not df_new.empty and not out_data.empty:
                # Some stocks added and some removed.
                data_schema_df['timestamp'] = pd.to_datetime(start)
                data_schema_df['pub_date'] = data_schema_df['timestamp']
                df2 = pd.DataFrame()
                # Handle removals.
                for index, data_old in data_schema_df.iterrows():
                    schema_details = out_data.query(
                        "id == @data_old.id").copy()
                    if not schema_details.empty:
                        # Update the removal date.
                        schema_details['out_date'] = pd.to_datetime(start)
                        schema_details['pub_date'] = pd.to_datetime(start)
                        df2 = df2.append(schema_details)
                    else:
                        # Rows that were not removed are carried over unchanged.
                        df2 = df2.append(data_old)
                # Handle additions: build the new rows.
                df_new['timestamp'] = pd.to_datetime(start)
                df_new['pub_date'] = df_new['timestamp']
                df_new['into_date'] = start
                # Far-future date written as the removal date of newly added rows.
                df_new['out_date'] = pd.to_datetime("2200-01-01")
                df_new['pub_date2'] = df_new['timestamp'].apply(
                    lambda x: to_time_str(x))
                df_new['stock_code'] = df_new['code'].apply(
                    lambda x: str(x).split('.')[0])
                df_new['entity_id'] = entity.id
                df_new['id'] = df_new[['entity_id', 'stock_id']].apply(
                    lambda x: '_'.join(x.astype(str)), axis=1)
                # From here on `code` holds the entity's code, no longer the stock's.
                df_new['code'] = entity.code
                df_new['entity_type'] = entity.entity_type
                df_new['exchange'] = entity.exchange
                df_new['name'] = entity.name
                df2 = df2.append(df_new)
                df_to_db(df=df2,
                         data_schema=self.data_schema,
                         provider=self.provider,
                         force_update=self.force_update)
                return None
            elif not df_new.empty and out_data.empty:
                # Handle additions only: build the new rows.
                df_new['timestamp'] = pd.to_datetime(start)
                df_new['pub_date'] = df_new['timestamp']
                df_new['into_date'] = start
                # Far-future date written as the removal date of newly added rows.
                df_new['out_date'] = pd.to_datetime("2200-01-01")
                df_new['pub_date2'] = df_new['timestamp'].apply(
                    lambda x: to_time_str(x))
                df_new['stock_code'] = df_new['code'].apply(
                    lambda x: str(x).split('.')[0])
                df_new['entity_id'] = entity.id
                df_new['id'] = df_new[['entity_id', 'stock_id']].apply(
                    lambda x: '_'.join(x.astype(str)), axis=1)
                # From here on `code` holds the entity's code, no longer the stock's.
                df_new['code'] = entity.code
                df_new['entity_type'] = entity.entity_type
                df_new['exchange'] = entity.exchange
                df_new['name'] = entity.name
                df_to_db(df=df_new,
                         data_schema=self.data_schema,
                         provider=self.provider,
                         force_update=self.force_update)
                return None
            else:
                # NOTE(review): the four branches above cover every combination of
                # df_new.empty / out_data.empty, so this branch cannot be reached.
                print('1')

            # self.logger.info(df.tail())
            # NOTE(review): every reachable branch above returns, so this log line
            # is never executed.
            self.logger.info(f"persist etf {entity.code} portfolio success")

        return None