Exemplo n.º 1
0
 def sz_start(self):
     self._spider_init()
     for page in range(1, 8):
         logger.info("page is {}".format(page))
         datas = self._make_sz_params(page)
         resp = requests.post(self.sz_url, headers=self.headers, data=datas)
         # print(resp)
         if resp.status_code == 200:
             ret = resp.text
             py_ret = json.loads(ret)
             announcements = py_ret.get("data")
             items = []
             for a in announcements:
                 # print(a)
                 item = dict()
                 item['market'] = 90  # 深交所
                 item['title'] = a.get("doctitle")
                 item['link'] = a.get("docpuburl")
                 item['time'] = self.trans_dt(a.get('docpubtime'))
                 # eg. http://www.szse.cn/disclosure/notice/general/t20200430_576647.json
                 content_json_url = urljoin("http://www.szse.cn", a.get("docpubjsonurl"))
                 content = self.parse_json_content(content_json_url)
                 item['content'] = content
                 items.append(item)
             print(items)
             ret = self._batch_save(self.spider_client, items, self.announcement_table, self.firelds)
         else:
             self.error_pages.append(page)
Exemplo n.º 2
0
def diff_task():
    """对比两天清单差异的任务"""
    now = lambda: time.time()
    start_time = now()
    SzGener().start()
    logger.info(
        f"用时: {now() - start_time} 秒")  # (end)大概是 80s (dispose)大概是 425s
    def crawl(self):
        # (1)建表
        self._create_table()

        # (2) 下载昨天和前天的数据文件
        _today = datetime.datetime.combine(datetime.datetime.today(),
                                           datetime.time.min)

        _yester_day = _today - datetime.timedelta(days=1)
        print(_yester_day)

        _before_yester_day = _today - datetime.timedelta(days=2)
        print(_before_yester_day)

        _yester_file = self.load_xlsx(_yester_day)
        print(">>>> ")
        _before_yester_day_file = self.load_xlsx(_before_yester_day)
        print("<<<<")

        # (3) 将数据存入数据库
        save_ret1 = self.read_xls(_yester_file, _yester_day)
        save_ret2 = self.read_xls(_before_yester_day_file, _before_yester_day)
        logger.info("入库结果: {} >>> {}, {} >>> {}".format(
            _yester_day, save_ret1, _before_yester_day, save_ret2))
        local_str = "本地" if LOCAL else "远程"
        self.ding("{}: 深交所融资融券清单爬虫入库结果: {} >>{}, {} >>{} ".format(
            local_str, _yester_day, save_ret1, _before_yester_day, save_ret2))
Exemplo n.º 4
0
    def start(self):
        msg = ''

        local_str = "本地测试: " if LOCAL else "远程: "
        msg += local_str
        msg += '上交所数据生成:\n'

        _today = datetime.datetime.combine(datetime.datetime.today(),
                                           datetime.time.min)
        _yester_day = _today - datetime.timedelta(days=1)
        _before_yester_day = _today - datetime.timedelta(days=2)
        print(_yester_day, "***", _before_yester_day)
        msg += '时间点: {} 与 {}\n'.format(_yester_day, _before_yester_day)
        _type_str_map = {
            10: "融资",
            20: "融券",
        }
        for _type in (10, 20):
            logger.info(_type)
            # 昨日的清单
            _yester_day_list = self.get_spider_dt_list(_yester_day, _type)
            # 前日的清单
            _before_yester_day_list = self.get_spider_dt_list(
                _before_yester_day, _type)
            print(len(_yester_day_list), len(_before_yester_day_list))

            if _yester_day and _before_yester_day:
                to_add = set(_yester_day_list) - set(_before_yester_day_list)
                to_delete = set(_before_yester_day_list) - set(
                    _yester_day_list)
                logger.info("需新增数据: {}, 需删除数据: {}".format(to_add, to_delete))
                msg += "{}: 需新增数据: {}, 需删除数据: {}\n".format(
                    _type_str_map.get(_type), to_add, to_delete)

                if to_add:
                    for one in to_add:
                        # 数据 时间 融资融券类型 移入移出类型
                        self._update(one, _yester_day, _type, 1)

                if to_delete:
                    for one in to_delete:
                        self._update(one, _yester_day, _type, 0)

        msg += '一致性检查: \n'

        # 状态计算清单
        dc_list_10 = set(sorted(self.product_dt_datas(83, 10)))
        # 真正清单
        spider_list_10 = set(sorted(self.get_spider_latest_list(83, 10)))
        msg += "融资一致性 check : dc_list - latest_list_spider >> {},latest_list_spider - dc_list>>{} \n".format(
            dc_list_10 - spider_list_10, spider_list_10 - dc_list_10)

        dc_list_20 = set(sorted(self.product_dt_datas(83, 20)))
        spider_list_20 = set(sorted(self.get_spider_latest_list(83, 20)))
        msg += "融券一致性 check : dc_list - latest_list_spider >> {},latest_list_spider - dc_list>>{} \n".format(
            dc_list_20 - spider_list_20, spider_list_20 - dc_list_20)

        print(msg)
        self.ding(msg)
Exemplo n.º 5
0
    def start(self):
        self._create_table()

        self.sh_start()

        self.sz_start()

        logger.info(self.error_urls)
        logger.info(self.error_pages)
Exemplo n.º 6
0
 def get_spider_dt_list(self, dt, category):
     """获取爬虫库中具体某一天的清单"""
     self._spider_init()
     sql_dt = '''select max(ListDate) as mx from {} where ListDate <= '{}' and SecuMarket =83 and TargetCategory = {}; 
     '''.format(self.spider_table_name, dt, category)
     dt_ = self.spider_client.select_one(sql_dt).get("mx")
     logger.info("距离 {} 最近的之前的一天是{}".format(dt, dt_))
     if dt_:
         sql = '''select InnerCode from {} where ListDate = '{}' and SecuMarket = 83 and TargetCategory = {};
         '''.format(self.spider_table_name, dt_, category)
         ret = self.spider_client.select_all(sql)
         ret = [r.get("InnerCode") for r in ret]
         return ret
     else:
         return []
Exemplo n.º 7
0
        def check2():
            # 融券
            latest_list_spider = self.dt_datas(_today, 2)
            latest_list_spider = set(sorted(latest_list_spider))

            target = self._init_pool(self.product_cfg)
            sql = 'select InnerCode from {} where SecuMarket = 90 and TargetFlag = 1 and  TargetCategory = 20; '.format(
                self.target_table_name)
            ret = target.select_all(sql)

            dc_list = set(sorted([r.get("InnerCode") for r in ret]))

            r1 = dc_list - latest_list_spider
            r2 = latest_list_spider - dc_list
            msg = "融券一致性 check : dc_list - latest_list_spider >> {},latest_list_spider - dc_list>>{}".format(
                r1, r2)
            logger.info(msg)
            return msg
Exemplo n.º 8
0
    def history_diff(self, dt1, dt2, type=1):
        """
        将历史数据中某两天的数据进行 diff
        dt1 是变更发生的时间 
        dt2 是变更发生的前一天
        type: 1 是融资 2 是融券
        """

        data1 = self.dt_datas(dt1, type)
        data1 = set(sorted(data1))

        data2 = self.dt_datas(dt2, type)
        data2 = set(sorted(data2))

        to_add_set = data1 - data2
        to_delete_set = data2 - data1

        logger.info("要增加的标的: {}".format(to_add_set))
        logger.info("要剔除的标的: {}".format(to_delete_set))

        return to_add_set, to_delete_set
Exemplo n.º 9
0
 def _create_table(self):
     """对公告爬虫建表 """
     self._spider_init()
     sql = '''
     CREATE TABLE IF NOT EXISTS `{}` (
       `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'ID',
       `market` int(11) DEFAULT NULL COMMENT '证券市场',  
       `title` varchar(200) DEFAULT NULL COMMENT '公告标题',
       `link` varchar(200) DEFAULT NULL COMMENT '公告链接',
       `time` datetime NOT NULL COMMENT '公告发布时间', 
       `content` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT '公告内容',
       `keyword` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT '公告关键词',
       `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
       `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
       PRIMARY KEY (`id`),
       UNIQUE KEY `un2` (`market`, `link`) USING BTREE
     ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='交易所融资融券公告信息';
     
     '''.format(self.announcement_table)
     self.spider_client.insert(sql)
     logger.info("爬虫公告表建表成功")
Exemplo n.º 10
0
    def start(self):
        msg = ''
        # 建表[远程没有建表的权限]
        if LOCAL:
            self._create_table()

        # 将聚源数据库的数据导出
        if FIRST:
            self.load_juyuan()
            logger.info("已经导出聚源数据库")
            self.parse_announcemen_byhuman()

        _today = datetime.datetime.combine(datetime.datetime.today(),
                                           datetime.time.min)
        _yester_day = _today - datetime.timedelta(days=1)
        _before_yester_day = _today - datetime.timedelta(days=2)

        # print(_yester_day)
        # print(_before_yester_day)

        logger.info("开始处理融资数据")
        msg += self.gene_records(_yester_day, _before_yester_day, 1)

        msg += '\n'

        logger.info("开始处理融券数据")
        msg += self.gene_records(_yester_day, _before_yester_day, 2)

        msg += '\n\n'
        msg += self.monitor()

        # print(msg)
        self.ding(msg)
Exemplo n.º 11
0
 def _create_table(self):
     sql = '''
     CREATE TABLE IF NOT EXISTS `{}` (
       `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'ID',
       `SecuMarket` int(11) DEFAULT NULL COMMENT '证券市场',
       `InnerCode` int(11) NOT NULL COMMENT '证券内部编码',
       `InDate` datetime NOT NULL COMMENT '调入日期',
       `OutDate` datetime DEFAULT NULL COMMENT '调出日期',
       `TargetCategory` int(11) NOT NULL COMMENT '标的类别',
       `TargetFlag` int(11) DEFAULT NULL COMMENT '标的状态',
       `ChangeReasonDesc` varchar(2000) DEFAULT NULL COMMENT '变更原因描述',
       `UpdateTime` datetime NOT NULL COMMENT '数据源更新时间',
       `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
       `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
       PRIMARY KEY (`id`),
       UNIQUE KEY `IX_MT_TargetSecurities` (`SecuMarket`, `InnerCode`,`TargetCategory`,`InDate`,`TargetFlag`)
     ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='融资融券标的证券变更记录';
     '''.format(self.target_table_name)
     target = self._init_pool(self.product_cfg)
     target.insert(sql)
     target.dispose()
     logger.info("尝试建表")
Exemplo n.º 12
0
        def check1():
            # 融资
            latest_list_spider = self.dt_datas(_today, 1)
            latest_list_spider = set(sorted(latest_list_spider))

            target = self._init_pool(self.product_cfg)
            sql = 'select InnerCode from {} where SecuMarket = 90 and TargetFlag = 1 and  TargetCategory = 10; '.format(
                self.target_table_name)
            ret = target.select_all(sql)

            dc_list = set(sorted([r.get("InnerCode") for r in ret]))

            r1 = dc_list - latest_list_spider
            r2 = latest_list_spider - dc_list
            '''
            {204806}
            set()
            '''
            # delete  from stk_mttargetsecurities where InnerCode = '204806';
            msg = "融资一致性 check : dc_list - latest_list_spider >> {},latest_list_spider - dc_list>>{}".format(
                r1, r2)
            logger.info(msg)
            return msg
Exemplo n.º 13
0
    def gene_records(self, dt1, dt2, type):
        """
        生成数据库的一条 "融资" 变更记录
        dt1 是较大的时间点
        dt2 是较小的时间点 是 dt1 的前一天
        """
        fields = [
            "SecuMarket", "InnerCode", "InDate", "OutDate", "TargetCategory",
            "TargetFlag", "ChangeReasonDesc"
        ]

        target = self._init_pool(self.product_cfg)

        to_add_set, to_delete_set = self.history_diff(dt1, dt2, type)
        logger.info("{} 和 {} 的 diff 结果: add: {} , delete: {}".format(
            dt1, dt2, to_add_set, to_delete_set))
        local_str = "本地" if LOCAL else "远程"
        type_str = "融资" if type == 1 else "融券"
        msg = "深交所: {}:{} 变更记录生成: {} 和 {} 的 diff 结果: add: {} , delete: {}".format(
            local_str, type_str, dt1, dt2, to_add_set, to_delete_set)

        if to_add_set:
            for inner_code in to_add_set:
                if type == 1:
                    item = {
                        "SecuMarket": 90,
                        "InnerCode": inner_code,
                        "InDate": dt1,
                        'TargetCategory': 10,
                        'TargetFlag': 1,
                        'ChangeReasonDesc': '',
                        'UpdateTime': datetime.datetime.now(),
                    }
                else:
                    item = {
                        "SecuMarket": 90,
                        "InnerCode": inner_code,
                        "InDate": dt1,
                        'TargetCategory': 20,
                        'TargetFlag': 1,
                        'ChangeReasonDesc': '',
                        'UpdateTime': datetime.datetime.now(),
                    }
                count = self._save(target, item, self.target_table_name,
                                   fields)
                logger.info("type: {}, add 记录条数 {}".format(type, count))

        if type == 1:
            base_sql = '''update {} set OutDate = '{}', TargetFlag = 0 where SecuMarket = 90 and InnerCode = {}\
            and TargetCategory = 10  and TargetFlag = 1; '''
        else:
            base_sql = '''update {} set OutDate = '{}', TargetFlag = 0 where SecuMarket = 90 and InnerCode = {}\
            and TargetCategory = 20  and TargetFlag = 1; '''

        if to_delete_set:
            for inner_code in to_delete_set:
                sql = base_sql.format(self.target_table_name, dt1, inner_code)
                ret = target.update(sql)
                logger.info("type: {}, update 记录条数是 {}".format(type, ret))

        try:
            target.dispose()
        except:
            logger.warning("dispose error")
            raise

        return msg
    def read_xls(self, file: str, dt: datetime.datetime):
        """
        读取单个时间点的文件
        :param file: 文件路径
        :param dt: 文件对应的日期
        :return:
        """
        wb = xlrd.open_workbook(file)
        detail = wb.sheet_by_name("融资融券标的证券信息")

        rows = detail.nrows - 1
        logger.info("总数据量 {}".format(rows))

        # 根据行数检测出没有数据的文件
        if rows < 10:
            logger.warning("当日无数据更新 {}".format(dt))
            # return True
            return '当日无数据更新'

        heads = detail.row_values(0)
        # print("表头信息", heads)
        # ['证券代码', '证券简称', '融资标的', '融券标的', '当日可融资', '当日可融券', '融券卖出价格限制']

        fields = [
            'SecuMarket',
            'SecuCode',
            'InnerCode',
            'SecuAbbr',
            'SerialNumber',
            'ListDate',
            'FinanceBool',  # 融资标的
            'FinanceBuyToday',  # 当日可融资
            'SecurityBool',  # 融券标的
            'SecuritySellToday',  # 当日可融券
            'SecuritySellLimit',  # 融券卖出价格限制
        ]

        # list_date = datetime.datetime.strptime(str(dt), "%Y%m%d")
        items = []
        for i in range(1, rows + 1):
            data = detail.row_values(i)
            # print(data)
            item = dict()
            item['SecuMarket'] = 90  # 深交所
            secu_code = data[0]
            item['SecuCode'] = secu_code
            item['InnerCode'] = self.get_inner_code(secu_code)
            item['SecuAbbr'] = data[1]
            item['SerialNumber'] = i
            item['ListDate'] = dt
            item['FinanceBool'] = 1 if data[2] == "Y" else 0  # 融资标的
            item['FinanceBuyToday'] = 1 if data[4] == "Y" else 0  # 当日可融资
            item['SecurityBool'] = 1 if data[3] == 'Y' else 0  # 融券标的
            item['SecuritySellToday'] = 1 if data[5] == "Y" else 0  # 当日可融券
            item['SecuritySellLimit'] = 1 if data[6] == "Y" else 0  # 融券卖出价格限制
            print(item)
            items.append(item)

        # client = self._init_pool(self.spider_cfg)
        for item in items:
            self.spider_conn.table_insert(self.history_table_name, item,
                                          fields)
            # self._save(client, item, self.history_table_name, fields)

        # try:
        #     client.dispose()
        # except:
        #     logger.warning("dispose error")
        #     raise

        # return True
        return '数据更新成功'
def sz_history_task():
    now = lambda: time.time()
    start_dt = now()
    SzListSpider().crawl()
    logger.info("耗时: {} 秒".format(now() - start_dt))
Exemplo n.º 16
0
    def parse_announcemen_byhuman(self):
        """从公告中提取更改信息 """
        base_sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\
        and TargetCategory in (10, 20) and TargetFlag = 1; '''

        target = self._init_pool(self.product_cfg)

        # # (1) http://www.szse.cn/disclosure/notice/general/t20200415_575996.html
        # # 本所于2020年4月16日起将 南京华东电子信息科技股份有限公司股票(证券代码:000727)  调出融资融券标的证券名单
        # inner_code = self.get_inner_code('000727')
        # print(inner_code)    # 401

        # (2) http://www.szse.cn/disclosure/notice/general/t20200428_576534.html
        # 本所于2020年4月29日起将 华映科技(集团)股份有限公司股票(证券代码:000536)  调出融资融券标的证券名单
        inner_code = self.get_inner_code("000536")  # 220
        dt = datetime.datetime(2020, 4, 29)
        sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\
 and TargetCategory in (10, 20) and TargetFlag = 1; '''.format(
            self.target_table_name, dt, inner_code)
        ret = target.update(sql)
        print(ret)
        target.end()

        # (3) http://www.szse.cn/disclosure/notice/general/t20200429_576571.html
        # 本所于2020年4月30日起将 苏州胜利精密制造科技股份有限公司股票(证券代码:002426) 调出融资融券标的证券名单。
        inner_code = self.get_inner_code('002426')
        # print(inner_code)   # 10476
        dt = datetime.datetime(2020, 4, 30)
        sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\
        and TargetCategory in (10, 20) and TargetFlag = 1; '''.format(
            self.target_table_name, dt, inner_code)
        ret = target.update(sql)
        print(ret)
        target.end()

        # (4) http://www.szse.cn/disclosure/notice/general/t20200429_576572.html
        # 本所于2020年4月30日起将  江西特种电机股份有限公司股票(证券代码:002176) 调出融资融券标的证券名单
        dt = datetime.datetime(2020, 4, 30)
        inner_code = self.get_inner_code('002176')
        # print(inner_code)   # 6139
        sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\
        and TargetCategory in (10, 20) and TargetFlag = 1; '''.format(
            self.target_table_name, dt, inner_code)
        ret = target.update(sql)
        print(ret)
        target.end()

        # # (5) http://www.szse.cn/disclosure/notice/general/t20200430_576649.html
        # # 本所于2020年5月6日起将  深圳市奋达科技股份有限公司股票(证券代码:002681)  调出融资融券标的证券名单。
        dt = datetime.datetime(2020, 5, 6)
        inner_code = self.get_inner_code('002681')
        # print(inner_code)   # 16668
        sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\
        and TargetCategory in (10, 20) and TargetFlag = 1; '''.format(
            self.target_table_name, dt, inner_code)
        ret = target.update(sql)
        print(ret)
        target.end()

        # # (6) http://www.szse.cn/disclosure/notice/general/t20200430_576648.html
        # # 本所于2020年5月6日起将 大连晨鑫网络科技股份有限公司股票(证券代码:002447) 调出融资融券标的证券名单
        dt = datetime.datetime(2020, 5, 6)
        inner_code = self.get_inner_code("002447")
        # print(inner_code)   # 10493
        sql = base_sql.format(self.target_table_name, dt, inner_code)
        ret = target.update(sql)
        print(ret)
        target.end()

        # (7) http://www.szse.cn/disclosure/notice/general/t20200430_576646.html
        # 本所于2020年5月6日起将 藏格控股股份有限公司股票(证券代码:000408) 调出融资融券标的证券名单。
        dt = datetime.datetime(2020, 5, 6)
        inner_code = self.get_inner_code("000408")
        # print(inner_code)   # 155
        sql = base_sql.format(self.target_table_name, dt, inner_code)
        ret = target.update(sql)
        print(ret)
        target.end()

        # (8) http://www.szse.cn/disclosure/notice/general/t20200430_576647.html
        # 本所于2020年5月6日起将该 深圳市同洲电子股份有限公司股票(证券代码:002052) 调出融资融券标的证券名单。
        dt = datetime.datetime(2020, 5, 6)
        inner_code = self.get_inner_code("002052")
        # print(inner_code)  # 4347
        sql = base_sql.format(self.target_table_name, dt, inner_code)
        ret = target.update(sql)
        print(ret)
        target.end()

        try:
            target.dispose()
        except:
            logger.info("dispose error")
            raise
Exemplo n.º 17
0
    SzGener().start()
    logger.info(
        f"用时: {now() - start_time} 秒")  # (end)大概是 80s (dispose)大概是 425s


if __name__ == "__main__":
    scheduler = BlockingScheduler()
    # 确保重启时可以执行一次
    diff_task()

    scheduler.add_job(diff_task,
                      'cron',
                      hour='16, 23',
                      max_instances=10,
                      id="diff_task")
    logger.info(
        'Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
    except Exception as e:
        logger.info(f"本次任务执行出错{e}")
        sys.exit(0)
'''部署 
docker build -f Dockerfile_szdiff -t registry.cn-shenzhen.aliyuncs.com/jzdev/jzdata/margin_sz_diff:v1 .
docker push registry.cn-shenzhen.aliyuncs.com/jzdev/jzdata/margin_sz_diff:v1 
sudo docker pull registry.cn-shenzhen.aliyuncs.com/jzdev/jzdata/margin_sz_diff:v1 

# remote 
sudo docker run --log-opt max-size=10m --log-opt max-file=3 -itd \
--env LOCAL=0 \