Example #1
    def __init__(self):
        super(WxSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'WxSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_WxSpider.log'))

        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))

        self.ext = Wx_extractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 100  # 800 per batch
        self.queue_maxsize = 800  # send volume
        self.sended_queue_maxsize = 1000  # already sent

        self.table_count = 1000000
        self.table_index = 0
        self.md5_table = "news_md5"
        self.s_table = "news_{}"
        self.create_table_sql = """
            create table news_{} like news_copy;
        """

        self.spider_count = 0
        self.repeat_count = 0
        self.no_china_count = 0
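
The table_count / table_index / create_table_sql fields above drive a table-rotation scheme that Example #6 performs inline in to_store_results. A minimal sketch of that rollover, assuming a db object exposing do(sql) like the StoreMysql wrapper used in these examples:

def maybe_rotate_table(db, s_id, table_index, table_count=1000000):
    # once the auto-increment id of the current news_{n} shard reaches a
    # multiple of table_count, mark the shard full and create the next one
    if s_id > 0 and s_id % table_count == 0:
        db.do("update spider_table set status = 1 "
              "where table_name = 'news_{}'".format(table_index))
        table_index += 1
        # clone the schema of news_copy into the next shard
        db.do("create table news_{} like news_copy;".format(table_index))
        db.do("insert into spider_table(table_name) "
              "values('news_{}')".format(table_index))
    return table_index
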
Example #2
    def __init__(self):
        super(ToutiaoSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'ToutiaoSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_ToutiaoSpider.log'))

        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))

        self.ext = ToutiaoExtractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 100  # 800 per batch
        self.queue_maxsize = 500  # send volume
        self.sended_queue_maxsize = 800  # already sent

        self.table_count = 1000000
        self.table_index = 0
        self.md5_table = "news_md5"
        self.s_table = "news_{}"
        self.create_table_sql = """
            create table news_{} like news_copy;
        """

        self.spider_count = 0
        self.repeat_count = 0
        self.no_china_count = 0
        self.send_url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword={}&autoload=true&count=20&cur_tab=1&from=search_tab'
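
send_url above is a search_content template paged by a 20-item offset; Example #10 expands it inline with range(0, 6). A standalone sketch of that expansion (keyword encoding is left to the request layer, as in the original):

def build_toutiao_list_urls(keyword, pages=6):
    # one URL per page, the offset stepping by 20 items as in Example #10
    template = ('https://www.toutiao.com/search_content/?offset={}&format=json'
                '&keyword={}&autoload=true&count=20&cur_tab=1&from=search_tab')
    return [template.format(i * 20, keyword) for i in range(pages)]
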
Example #3
    def __init__(self):
        super(WxSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'WxSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_WxSpider.log'))

        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))

        self.ext = Wx_extractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 500  # 800 per batch
        self.queue_maxsize = 1500
        self.sended_queue_maxsize = 3000

        self.table_count = 1000000
        self.table_index = 8
        self.md5_table = "news_md5"
        self.s_table = "news_{}"
        self.create_table_sql = """
            create table news_{} like news_copy;
        """

        # """
        # CREATE TABLE `news_{}` (
        #    `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
        #    `title` varchar(512) NOT NULL COMMENT 'title',
        #    `summary` text NOT NULL COMMENT 'summary',
        #    `content` longtext CHARACTER SET utf8mb4 NOT NULL COMMENT 'article content html',
        #    `wechat_name` varchar(255) NOT NULL COMMENT 'WeChat account name',
        #    `wechat_num` varchar(255) DEFAULT '' COMMENT 'WeChat official account id',
        #    `keyword` varchar(100) NOT NULL COMMENT 'keyword',
        #    `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
        #    PRIMARY KEY (`id`)
        #  ) ENGINE=InnoDB AUTO_INCREMENT=1043500 DEFAULT CHARSET=utf8
        # """

        self.spider_count = 0
        self.repeat_count = 0
        self.no_china_count = 0
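
queue_maxsize and sended_queue_maxsize above are backpressure caps: a new keyword batch is fetched only while every internal queue is below its limit (is_get_tasks in Example #6 is the full version). A minimal sketch of that gate; any objects with a qsize() method will do:

def is_get_tasks(sended_q, sending_q, response_q, store_q,
                 queue_maxsize=1500, sended_queue_maxsize=3000):
    # pull more keywords only while every queue is under its cap
    return (sended_q.qsize() < sended_queue_maxsize
            and sending_q.qsize() < queue_maxsize
            and response_q.qsize() < queue_maxsize
            and store_q.qsize() < queue_maxsize)
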
Example #4
    def __init__(self):
        super(WxSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'WxSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_WxSpider.log'))

        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))

        self.ext = Wx_extractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 1000
        self.sended_queue_maxsize = 2000

        self.spider_count = 0
        self.repeat_count = 0
Example #5
    def __init__(self):
        super(BaseRankSpider, self).__init__()
        # periodic sleep time, in minutes
        self.difsecond = 180
        log_path = os.path.dirname(
            os.path.dirname(os.path.dirname(
                os.path.abspath(__file__)))) + "/logs/"
        self.log = UtilLogger('PageSpider', log_path + 'log_page_spider')
        self.log_record = UtilLogger('RecordPageSpider',
                                     log_path + 'log_record_page_spider')

        self.rank_store = RankStore()
        self.history_store = RankHistoryStore()
        self.sleep_time = 60 * 15  # sleep time when there are no tasks
        self.sended_queue_maxsize = 3000  # send limit
        self.send_one_tasks = 800  # number of tasks fetched at a time

        self.reset_task_time = 60 * 60  # 1 hour
        # self.saveport = 3  # port
        self.task_table = "task"
        self.conf_finish_state = False
        self.re_send_count = 4

        self.db_pool = StoreMysqlPool(**config.baidu_spider_move)
Example #6
class WxSpider(BaseSpiderSign):
    """
    Crawl WeChat articles.
    Flow:
        1. Fetch the distinct keywords from the database.
        2. Entry point: search each keyword on Sogou WeChat search and parse the result list page.
            Entry URL: http://weixin.sogou.com/weixin?type=2&s_from=input&query=%E4%BD%A0%E5%93%88&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=2889&sst0=1511337321983&lkt=0%2C0%2C0
        3. Parse the list page's pagination to get the number of result pages for the keyword, then build the remaining page URLs.
        4. Parse the article links and summaries on each list page and pass them to the detail-page extractor.
        5. Parse the WeChat article pages and store the results in the database.

    """
    def __init__(self):
        super(WxSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'WxSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_WxSpider.log'))

        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))

        self.ext = Wx_extractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 100  # 800 per batch
        self.queue_maxsize = 800  # send volume
        self.sended_queue_maxsize = 1000  # already sent

        self.table_count = 1000000
        self.table_index = 0
        self.md5_table = "news_md5"
        self.s_table = "news_{}"
        self.create_table_sql = """
            create table news_{} like news_copy;
        """

        self.spider_count = 0
        self.repeat_count = 0
        self.no_china_count = 0

    def get_user_password(self):
        # return 'zhouhao', 'zhspider'
        # return 'xuliang', 'xlspider'
        return 'sunxiang', 'sxspider'

    def send_get_spider(self, urls):
        """
        Wrap a GET request and push it onto the download queue.
        """
        basic_request = SpiderRequest(
            headers={'User-Agent': random.choice(self.pc_user_agents)},
            urls=urls,
            config={"redirect": 1})
        self.sending_queue.put_nowait(basic_request)

    def is_get_tasks(self):
        if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.queue_maxsize \
                and self.response_queue.qsize() < self.queue_maxsize and self.store_queue.qsize() < self.queue_maxsize:
            return True
        else:
            return False

    def start_requests(self):
        try:
            while 1:
                if self.is_get_tasks():
                    db = StoreMysql(**config.local_content)
                    update_time = str(datetime.now()).split(".")[0]
                    sql = "select id, keyword from keywords where status = 1 order by update_time asc, priority desc  limit 0, {}".format(
                        self.step)
                    rows = db.query(sql)
                    self.log_record.info(
                        "datetime:{},task_results length:{}".format(
                            datetime.now(), len(rows)))
                    ids = list()
                    if rows:
                        for word in rows:
                            task_id = word[0]
                            ids.append({
                                "id": task_id,
                                "update_time": update_time
                            })

                            keyword = word[1]
                            for i in range(1, 11):
                                send_url = "http://weixin.sogou.com/weixin?query={}&_sug_type_=&s_from=input&_sug_=n&type=2&page={}&ie=utf8".format(
                                    keyword, i)
                                urls = [{
                                    "url": send_url,
                                    "type": 1,
                                    "ext_type": 1,
                                    'keyword': keyword,
                                    'task_id': task_id,
                                    'unique_key': self.get_unique_key()
                                }]
                                self.send_get_spider(urls)

                        self.stores[0].store_table(ids,
                                                   "keywords",
                                                   type=2,
                                                   field="id")
                    else:
                        time.sleep(60 * 10)
                    db.close()
                time.sleep(60 * 1)
        except Exception:
            print traceback.format_exc()

    def get_stores(self):
        """
        Multiple data stores can be defined.
        :return:
        """
        stores = list()
        stores.append(SourceStore(config.local_content))
        self.stores = stores
        return stores

    def deal_response_results_status(self, task_status, url, result, request):
        """
            Handle tasks whose task_status is 2 or 3; return a list of URLs to retry. If a retry needs different headers, define that yourself.
        :param task_status:
        :param url:
        :param result:
        :param request:
        :return:
        """
        if task_status == '2':
            ext_type = url["ext_type"]
            if ext_type == 1:
                self.deal_response_list(url, result['result'])
            elif ext_type == 2:
                self.deal_response_detail(url, result['result'])
        else:
            self.log.info("status is 3 url:{}; headers:{}; config:{}".format(
                url["url"], request.headers, request.config))

    # @fn_timer
    def deal_response_list(self, url, html):
        try:
            keyword = url['keyword']
            task_id = url['task_id']
            info_list = self.ext.list_extractor(html, keyword, task_id)
            if info_list == -1:
                self.log.info("deal_response_list exception url:{}".format(
                    url["url"]))
            else:
                self.store_queue.put({"result": info_list, "type": 1})
        except:
            print(traceback.format_exc())

    # @fn_timer
    def deal_response_detail(self, url, html):
        try:
            info = url['info']
            info.pop("we_name")
            res = self.ext.detail_extractor(html, info)
            if res != -1:
                self.store_queue.put({"result": res, "type": 2})
            else:
                self.log.info("deal_response_detail exception url:{}".format(
                    url["url"]))
        except:
            print(traceback.format_exc())

    def to_store_results(self, results, stores):
        """
            type 1: list page, dedupe on title + account name
                 2: detail page data
        :param results:
        :param stores:
        :return:
        """
        try:
            result = results["result"]
            type = results["type"]
            if type == 1:
                # log_start = time.time()
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"] + info["we_name"])
                    sql = "insert ignore into {}(md5) values('{}')".format(
                        self.md5_table, str(log_md5))
                    s_id = stores[0].insert_row(sql)
                    if s_id > 0:
                        # self.spider_count += 1
                        urls = [{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key()
                        }]
                        self.send_get_spider(urls)
                    else:
                        self.repeat_count += 1

                if self.repeat_count > 1000:
                    self.log_record.info("repeat_count:{}".format(
                        self.repeat_count))
                    self.repeat_count = 0

            elif type == 2:
                data = result
                if not self.judge_china(data["content"]):
                    # no Chinese text
                    self.no_china_count += 1
                    if self.no_china_count > 1000:
                        self.log_record.info("no_china_count:{}".format(
                            self.no_china_count))
                        self.no_china_count = 0
                    return

                weixin_content = {
                    "summary": data.get("summary", ""),
                    "content": data.get("content", ""),
                    "keyword": data.get("keyword", ""),
                    "title": data.get("title", ""),
                    "wechat_name": data.get("wechat_name", ""),
                    "wechat_num": data.get("wechat_num", "")
                }
                s_id = stores[0].store_table_one(
                    weixin_content, "news_{}".format(self.table_index))
                if s_id > 0:
                    if s_id % self.table_count == 0:
                        db = StoreMysql(**config.weixin_content)

                        update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                            self.table_index)
                        db.do(update_sql)

                        self.table_index += 1
                        db.do(self.create_table_sql.format(self.table_index))

                        insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                            self.table_index)
                        db.do(insert_sql)

                        time.sleep(1)
                        db.close()
                else:
                    time.sleep(0.1)
                time.sleep(2)
        except:
            print(traceback.format_exc())

    def judge_china(self, c_text):
        zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
        match = zhPattern.search(u"" + str(c_text))
        if match:
            return True
        else:
            return False

    def send_wait(self):
        """
        Wait between sends to throttle the rate of requests to the download center.
        """
        time.sleep(1)
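
to_store_results deduplicates list-page hits with a unique md5 column plus INSERT IGNORE: a detail request is sent only when the md5 row was actually inserted. A sketch of that check, using hashlib as an assumed stand-in for UtilMD5 and a store object exposing insert_row(sql) as SourceStore does here; title and we_name are assumed to be unicode text:

import hashlib


def is_new_article(store, title, we_name, md5_table='news_md5'):
    # fingerprint: md5 over title + account name, as in Example #6
    digest = hashlib.md5((title + we_name).encode('utf-8')).hexdigest()
    sql = "insert ignore into {}(md5) values('{}')".format(md5_table, digest)
    # INSERT IGNORE reports 0 affected rows for a duplicate,
    # so a positive count means the article has not been seen before
    return store.insert_row(sql) > 0
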
Example #7
class WxSpider(BaseSpiderSign):
    """
    Crawl WeChat articles.
    Flow:
        1. Fetch the distinct keywords from the database.
        2. Entry point: search each keyword on Sogou WeChat search and parse the result list page.
            Entry URL: http://weixin.sogou.com/weixin?type=2&s_from=input&query=%E4%BD%A0%E5%93%88&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=2889&sst0=1511337321983&lkt=0%2C0%2C0
        3. Parse the list page's pagination to get the number of result pages for the keyword, then build the remaining page URLs.
        4. Parse the article links and summaries on each list page and pass them to the detail-page extractor.
        5. Parse the WeChat article pages and store the results in the database.

    """
    def __init__(self):
        super(WxSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'WxSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_WxSpider.log'))

        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))

        self.ext = Wx_extractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 1000
        self.sended_queue_maxsize = 2000

        self.spider_count = 0
        self.repeat_count = 0

    def get_user_password(self):
        return 'zhouhao', 'zhspider'
        # return 'xuliang', 'xlspider'

    def send_get_spider(self, urls):
        """
        Wrap a GET request and push it onto the download queue.
        """
        basic_request = SpiderRequest(
            headers={'User-Agent': random.choice(self.pc_user_agents)},
            urls=urls,
            config={"redirect": 1})
        self.sending_queue.put_nowait(basic_request)

    def start_requests(self):
        try:
            while 1:
                if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                        and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                    db = StoreMysql(**config.weixin_content)
                    source = SourceStore(config.weixin_content)
                    update_time = str(datetime.now()).split(".")[0]
                    sql = "select id, keyword from content_center.keywords order by update_time asc,  priority desc  limit 0, {};".format(
                        self.step)
                    rows = db.query(sql)
                    self.log.info("datetime:{},task_results length:{}".format(
                        datetime.now(), len(rows)))
                    ids = list()
                    if rows:
                        for word in rows:
                            task_id = word[0]
                            ids.append({
                                "id": task_id,
                                "update_time": update_time
                            })

                            keyword = word[1]
                            send_url = 'http://weixin.sogou.com/weixin?type=2&s_from=input&query={}&ie=utf8&_sug_=y&_sug_type_='.format(
                                keyword)
                            urls = [{
                                "url": send_url,
                                "type": 1,
                                "ext_type": 3,
                                'keyword': keyword,
                                'task_id': task_id,
                                'unique_key': self.get_unique_key()
                            }]
                            self.send_get_spider(urls)

                        source.store_table(ids, "keywords", type=2, field="id")
                    db.close()
                time.sleep(60 * 2)
        except Exception:
            print traceback.format_exc()

    def get_stores(self):
        """
        Multiple data stores can be defined.
        :return:
        """
        stores = list()
        stores.append(SourceStore(config.weixin_spider))
        self.stores = stores
        return stores

    def deal_response_results_status(self, task_status, url, result, request):
        """
            Handle tasks whose task_status is 2 or 3; return a list of URLs to retry. If a retry needs different headers, define that yourself.
        :param task_status:
        :param url:
        :param result:
        :param request:
        :return:
        """
        if task_status == '2':
            ext_type = url["ext_type"]
            if ext_type == 3:
                self.deal_response_page(url, result['result'])
            elif ext_type == 1:
                self.deal_response_list(url, result['result'])
            elif ext_type == 2:
                self.deal_response_detail(url, result['result'])
        else:
            self.log.info("status is 3 url:{}; headers:{}; config:{}".format(
                url["url"], request.headers, request.config))

    def deal_response_page(self, url, html):
        try:
            keyword = url['keyword']
            task_id = url['task_id']
            page = self.ext.page_extractor(html)
            if page == -1:
                self.log.info("deal_response_page  exception url:{}".format(
                    url["url"]))
            else:
                # at most 10 pages
                page_c = 10
                if page < 10:
                    page_c = page
                for i in range(1, page_c + 1):
                    send_url = "http://weixin.sogou.com/weixin?query={}&_sug_type_=&s_from=input&_sug_=n&type=2&page={}&ie=utf8".format(
                        keyword, i)
                    urls = [{
                        "url": send_url,
                        "type": 1,
                        "ext_type": 1,
                        'keyword': keyword,
                        'task_id': task_id,
                        'unique_key': self.get_unique_key()
                    }]
                    self.send_get_spider(urls)
        except:
            print 'ext page count error!{}'.format(url)

    # @fn_timer
    def deal_response_list(self, url, html):
        try:
            keyword = url['keyword']
            task_id = url['task_id']
            # list-page parsing logic:
            info_list = self.ext.list_extractor(html, keyword, task_id)
            if info_list == -1:
                self.log.info("deal_response_list  exception url:{}".format(
                    url["url"]))
            else:
                self.store_queue.put({"result": info_list, "type": 1})
        except:
            print(traceback.format_exc())

    # @fn_timer
    def deal_response_detail(self, url, html):
        try:
            info = url['info']
            info.pop("we_name")
            res = self.ext.detail_extractor(html, info)
            if res != -1:
                self.store_queue.put({"result": res, "type": 2})
            else:
                self.log.info("deal_response_detail  = -1 url:{}".format(
                    url["url"]))
        except:
            print(traceback.format_exc())

    def to_store_results(self, results, stores):
        """
            type 1: list page, dedupe on title + account name
                 2: detail page data
        :param results:
        :param stores:
        :return:
        """
        try:
            result = results["result"]
            type = results["type"]
            if type == 1:
                log_start = time.time()
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"] + info["we_name"])
                    sql = "insert ignore into spider_log(md5, type) values('{}', '1')".format(
                        str(log_md5))
                    s_id = stores[0].insert_row(sql)
                    if s_id > 0:
                        self.spider_count += 1
                        urls = [{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key(),
                        }]
                        self.send_get_spider(urls)
                    else:
                        self.repeat_count += 1
                        # self.log_record.info("spider_log title:{}".format(info["title"]))

                if self.spider_count > 1000:
                    self.log_record.info("spider_count:{}".format(
                        self.spider_count))
                    self.spider_count = 0
                if self.repeat_count > 1000:
                    self.log_record.info("repeat_count:{}".format(
                        self.repeat_count))
                    self.repeat_count = 0

                t_inter = int(time.time() - log_start)
                if t_inter > 5:
                    self.log_record.info("spider_log time:{}".format(t_inter))
            elif type == 2:
                # data_start = time.time()
                data = result
                ke_id = str(data["keyword_id"])[-1:]
                spider_weixin = 'spider_weixin_{}'.format(ke_id)
                spider_weixin_content = 'spider_weixin_content_{}'.format(
                    ke_id)

                if not self.judge_china(data["content"]):
                    # self.log_record.info("spider_weixin_lang add")
                    return
                    # spider_weixin = "spider_weixin_lang"
                    # spider_weixin_content = "spider_weixin_content_lang"

                weixin_content = {
                    "summary": data.pop("summary", ""),
                    "content": data.pop("content", ""),
                    "keyword_id": data.get("keyword_id", 0),
                    "keyword": data.get("keyword", "")
                }
                s_id = stores[0].store_table_one(data, spider_weixin)
                if s_id > 0:
                    weixin_content["id"] = s_id
                    stores[0].store_table_one(weixin_content,
                                              spider_weixin_content)

                # self.log_record.info("data_weixin time:{}".format(time.time() - data_start))
        except:
            print(traceback.format_exc())

    def judge_china(self, c_text):
        zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
        match = zhPattern.search(u"" + str(c_text))
        if match:
            return True
            # print 'contains Chinese: %s' % (match.group(0),)
        else:
            return False

    def send_wait(self):
        """
        Wait between sends to throttle the rate of requests to the download center.
        """
        if self.sended_queue.qsize() > 4000:
            time.sleep(0.4)
        elif self.sending_queue.qsize() < 10000:
            time.sleep(0.4)
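
deal_response_page caps the fan-out at Sogou's 10-page limit before queuing the per-page list URLs. A standalone sketch of that step (keyword encoding is again left to the request layer, as in the original):

def build_sogou_page_urls(keyword, page_count):
    # never request more than 10 result pages, as in deal_response_page
    template = ('http://weixin.sogou.com/weixin?query={}&_sug_type_=&s_from=input'
                '&_sug_=n&type=2&page={}&ie=utf8')
    return [template.format(keyword, i)
            for i in range(1, min(page_count, 10) + 1)]
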
Example #8
class BaseRankSpider(BaseSpiderSign):
    """
        Hierarchical expansion.
        Exact-match changes:
        1. The best (highest) rank is kept in memory and sent once all URLs have been checked.
        2. Locally, return the real URL.
    """
    search_device = None
    extractor = None

    def __init__(self):
        super(BaseRankSpider, self).__init__()
        # periodic sleep time, in minutes
        self.difsecond = 180
        log_path = os.path.dirname(
            os.path.dirname(os.path.dirname(
                os.path.abspath(__file__)))) + "/logs/"
        self.log = UtilLogger('PageSpider', log_path + 'log_page_spider')
        self.log_record = UtilLogger('RecordPageSpider',
                                     log_path + 'log_record_page_spider')

        self.rank_store = RankStore()
        self.history_store = RankHistoryStore()
        self.sleep_time = 60 * 15  # sleep time when there are no tasks
        self.sended_queue_maxsize = 3000  # send limit
        self.send_one_tasks = 800  # number of tasks fetched at a time

        self.reset_task_time = 60 * 60  # 1 hour
        # self.saveport = 3  # port
        self.task_table = "task"
        self.conf_finish_state = False
        self.re_send_count = 4

        self.db_pool = StoreMysqlPool(**config.baidu_spider_move)

    def get_user_password(self):
        return 'fxt', 'fxt_spider'

    def removeCharacters(self, previou_url):
        if previou_url.startswith("https://"):
            previou_url = previou_url.replace("https://", "")
        if previou_url.startswith("http://"):
            previou_url = previou_url.replace("http://", "")
        if previou_url.endswith("/"):
            previou_url = previou_url[0:len(previou_url) - 1]
        return previou_url

    def start_requests(self):
        try:
            while True:
                print(self.sended_queue.qsize())
                if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                        and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                    device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
                    task_results = self.rank_store.find_task_lists(
                        device, self.send_one_tasks)
                    if len(task_results) > 0:
                        print "datetime:{},task_results length:{}".format(
                            datetime.now(), len(task_results))
                        for result in task_results:
                            #  id, keyword, urlAddress, device, page, searchType, keyword_id, saveport
                            task_id = result[0]
                            keyword = result[1]
                            target_url = result[2]
                            page = result[3]
                            spidertype = result[4]  # partial (non-exact) match
                            keyword_id = result[5]
                            site_name = result[6]

                            req = self.get_request_param(
                                task_id, keyword, target_url, page, spidertype,
                                keyword_id, site_name, 1)

                            basic_request = SpiderRequest(
                                headers=req['headers'],
                                urls=req['urls'],
                                config=req['configs'])
                            self.sending_queue.put(basic_request)
                        time.sleep(20)
                    else:
                        time.sleep(self.sleep_time)
                else:
                    time.sleep(self.sleep_time)
        except Exception:
            print traceback.format_exc()

    def deal_rank_spider_response(self, url, html, r_capture, request, ip):
        page = url["page"]  # 总页数
        pnum = url["pnum"]  # 当前页数
        pcount = (pnum - 1) * 10
        result = self.extractor.extractor(html,
                                          ck=url['ckurl'],
                                          site_name=url['site_name'],
                                          pcount=pcount)
        if result == 0:
            self.log_record.info("extractor failure result 0")
            self.store_rank(url, -2, html, ip)
        elif type(result) == int:
            self.store_rank(url, -1, html, ip)
            self.log_record.info(
                "extractor failure deal_baidu_response_pc url:{}   request:{}".
                format(url["url"], request.headers['User-Agent']))
            return True
        else:
            if "rank" in result:
                # for rank_result in result["rank"]:
                self.store_rank(url,
                                result["rank"],
                                html,
                                ip,
                                realaddress=result["realaddress"],
                                r_capture=r_capture)
            elif pnum <= page:
                req = self.get_request_param(task_id=url["id"],
                                             keyword=url["keyword"],
                                             target_url=url["ckurl"],
                                             page=url["page"],
                                             spidertype=url["spidertype"],
                                             keyword_id=url["keyword_id"],
                                             site_name=url['site_name'],
                                             pnum=pnum + 1)
                basic_request = SpiderRequest(headers=req['headers'],
                                              urls=req['urls'],
                                              config=req['configs'])
                self.sending_queue.put(basic_request)
            else:
                self.store_rank(url, -2, html, ip)

    @abc.abstractmethod
    def get_request_param(self, task_id, keyword, target_url, page, spidertype,
                          keyword_id, site_name, pnum):
        """{'headers':{}, 'configs':{}, 'url':''}"""
        return

    def store_rank(self,
                   url,
                   rank,
                   response_body,
                   ip,
                   realaddress="",
                   r_capture=""):
        item = dict()
        item["keyword"] = url["keyword"]
        item["rank"] = rank
        item["taskId"] = int(url["id"])
        item["device"] = url["search_device"]
        item["response_body"] = response_body
        item['ip'] = ip

        if realaddress != "":
            item["urlAddress"] = realaddress
        else:
            item["urlAddress"] = ""

        self.store_queue.put({
            "result": item,
            "task_id": url["id"],
            "type": StoreTypeEnums.mysql.value,
            "rank": rank,
            "keyword_id": url["keyword_id"],
            "r_capture": r_capture
        })

    def get_stores(self):
        stores = list()
        stores.append(SourceStore(config.baidu_spider_move))
        self.stores = stores
        return stores

    def query_status(self, id):
        """
        Query the task's status.
        """
        db = StoreMysql(**config.baidu_spider_move)
        query_status_sql = 'select `status` from {} where id = {}'.format(
            self.rank_store.table, id)
        try:
            result = db.query(query_status_sql)
            db.close()
            return result[0][0]
        except:
            print "query_status error"
            traceback.print_exc()

    def deal_response_results_status(self, task_status, url, result, request):
        try:
            status = self.query_status(url['id'])
            if status is not None and status <= 2:
                if task_status == '2':
                    r_html = ''
                    r_capture = ''
                    r_l = result["result"].split("||||")
                    if len(r_l) == 1:  # no screenshot
                        r_html = r_l[0]
                        r_capture = ""
                    elif len(r_l) == 2:  # screenshot
                        r_capture = r_l[0]
                        r_html = r_l[1]

                    ip = result['inter_pro']
                    self.deal_rank_spider_response(url, r_html, r_capture,
                                                   request, ip)

                else:
                    # handle according to the situation
                    ip = result['inter_pro']
                    self.store_rank(url, -1, result["result"], ip)
                    self.log.info('spider failure:%s' % url)
                    self.re_send(url, request)
        except:
            print "deal_response_results_status error"
            traceback.print_exc()

    def re_send(self, url, request):
        self.log_record.info("re_send url:{}, User-Agent:{}".format(
            url["url"], request.headers["User-Agent"]))
        retry_urls = list()
        if "conf_search_count" in url:
            if int(url["conf_search_count"]) < self.re_send_count:
                url["conf_search_count"] = int(url["conf_search_count"]) + 1
                retry_urls.append(url)
            else:
                self.log_record.info(
                    "datetime:{}; state_url:{}; heasers:{}; config:{}".format(
                        datetime.now(), url["url"], request.headers,
                        request.config))
                return
        else:
            url["conf_search_count"] = 1
            retry_urls.append(url)
        new_request = SpiderRequest(headers=request.headers,
                                    config=request.config)
        new_request.urls = retry_urls
        new_request.config["priority"] = 3
        new_request.headers["User-Agent"] = UserAgentUtil().random_one(
            self.search_device)
        self.sending_queue.put(new_request)

    def send_response_body_cos(self, response_body, keyword_id, device, ip):
        """
        Store response_body in Tencent Cloud COS.
        """
        try:
            region = config.qcloud_cos.get('region')
            app_id = config.qcloud_cos.get('app_id')
            secret_id = config.qcloud_cos.get('secret_id')
            secret_key = config.qcloud_cos.get('secret_key')
            token = config.qcloud_cos.get('token')
            scheme = config.qcloud_cos.get('scheme')
            bucket = config.qcloud_cos.get('bucket')
            prefix = config.qcloud_cos.get('prefix')
            db_name = config.baidu_spider_move.get("db")
            filename = "{prefix}/html/{date}/{device}/{db_name}/{keyword_id}_{ip}.txt".format(
                prefix=prefix,
                date=date.today().isoformat(),
                device=device,
                db_name=db_name,
                keyword_id=keyword_id,
                ip=ip)
            cos_config = CosConfig(Region=region,
                                   Appid=app_id,
                                   SecretId=secret_id,
                                   SecretKey=secret_key,
                                   Token=token,
                                   Scheme=scheme)
            client = CosS3Client(cos_config)
            response = client.put_object(Bucket=bucket,
                                         Body=response_body,
                                         Key=filename,
                                         StorageClass='STANDARD',
                                         EnableMD5=False)
            print response['ETag']
        except Exception as e:
            print "save_response_body_cos error: {}".format(e)

    # @timeout(10)
    def to_store_results(self, results, stores):
        """
            results type 1: delete the task normally and insert the new rank; 2, 3: needs further checks; 3: has an exact match
                    task_id: id in the task table
        """
        try:
            # start_time = time.time()
            task_id = results["task_id"]
            keyword_id = results["keyword_id"]
            result = results["result"]
            rank = result["rank"]
            response_body = result["response_body"]
            screenshot_url = ''
            device = result["device"]
            ip = result['ip']
            if config.is_send_html_to_cos:
                self.send_response_body_cos(response_body, keyword_id, device,
                                            ip)

            if results.get("r_capture", "") != "":
                r_capture = results.get("r_capture")
                r_capture_bin = base64.b64decode(r_capture)
                m = hashlib.md5()
                m.update(r_capture_bin)
                md5 = m.hexdigest()
                region = config.qcloud_cos.get('region')
                app_id = config.qcloud_cos.get('app_id')
                secret_id = config.qcloud_cos.get('secret_id')
                secret_key = config.qcloud_cos.get('secret_key')
                token = config.qcloud_cos.get('token')
                scheme = config.qcloud_cos.get('scheme')
                bucket = config.qcloud_cos.get('bucket')
                prefix = config.qcloud_cos.get('prefix')
                db_name = config.baidu_spider_move.get("db")
                filename = "{prefix}/rank_imgs/{date}/{device}/{db_name}/{keyword_id}_{ip}.png".format(
                    prefix=prefix,
                    date=date.today().isoformat(),
                    device=device,
                    db_name=db_name,
                    keyword_id=keyword_id,
                    ip=ip)
                cos_config = CosConfig(Region=region,
                                       Appid=app_id,
                                       SecretId=secret_id,
                                       SecretKey=secret_key,
                                       Token=token,
                                       Scheme=scheme)
                client = CosS3Client(cos_config)
                response = client.put_object(Bucket=bucket,
                                             Body=r_capture_bin,
                                             Key=filename,
                                             StorageClass='STANDARD',
                                             EnableMD5=False)
                print response['ETag']
                screenshot_url = "https://{}.cos.{}.myqcloud.com/{}".format(
                    bucket, region, filename)

            device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
            rank_data = [{
                "keywordid": keyword_id,
                "url": result["urlAddress"],
                "rank": rank,
                "device": device,
                "keyword": result["keyword"],
                "screenshot": '',
                "screenshot_url": screenshot_url
            }]
            send_data = {"rankLists": json.dumps(rank_data)}
            flag = self.send_rank_data(config.callback_url, send_data)
            if flag:
                if int(result["rank"]) > 0 and results.get("r_capture",
                                                           "") == "":
                    self.log.info(
                        "r_capture kong keyword_id:{}, send_url:{}".format(
                            keyword_id, config.callback_url))

                self.log_record.info(
                    "one finish keyword_id:{}, rank:{}; urlAddress: {}".format(
                        keyword_id, result["rank"],
                        result.get("urlAddress", "")))
            else:
                self.log.info(
                    "send exception keyword_id:{}, send_url:{}".format(
                        keyword_id, config.callback_url))
                # self.log.info("send exception data:{}".format(send_data))
            self.rank_store.update_status_id(task_id, result["rank"])
            self.history_store.save(self.search_device.name, keyword_id, rank)
        except:
            print traceback.format_exc()

    def send_rank_data(self, send_url, send_data):
        for i in xrange(0, 2):
            try:
                request = urllib2.Request(send_url,
                                          data=urllib.urlencode(send_data))
                response = urllib2.urlopen(request, timeout=10)
                res_content = response.read()
                if not str(res_content).find("success") > -1:
                    self.log_record.info(
                        "res_content no success send_url: {}".format(send_url))
                    return False
                return True
            except:
                self.log.info(traceback.format_exc())
                self.log.info(send_url + ",send_rank_data: " +
                              urllib.urlencode(send_data))
                time.sleep(2)
        return False

    def reset_task(self):
        """
        Reset the status of the task table.
        """
        while True:
            time.sleep(10)
            # self.log_record.info("reset:%s" % str(datetime.today()))
            self.rank_store.reset_task(self.reset_task_time)
            time.sleep(self.reset_task_time)
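
re_send bounds retries by keeping a conf_search_count counter on the task dict itself; once it reaches re_send_count the failure is only logged. A minimal sketch of that bookkeeping, detached from SpiderRequest (should_retry is a hypothetical helper, not part of the framework):

def should_retry(url, re_send_count=4):
    # the counter lives on the task dict, exactly as re_send stores it
    count = int(url.get("conf_search_count", 0))
    if count >= re_send_count:
        return False  # give up; the caller just logs the task
    url["conf_search_count"] = count + 1
    return True
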
Example #9
class BaseSugRankSpider(BaseSpiderSign):
    """
        Hierarchical expansion.
        Exact-match changes:
        1. The best (highest) rank is kept in memory and sent once all URLs have been checked.
        2. Locally, return the real URL.
    """
    search_device = None
    extractor = None

    def __init__(self):
        super(BaseSugRankSpider, self).__init__()
        # periodic sleep time, in minutes
        self.difsecond = 180
        log_path = os.path.dirname(
            os.path.dirname(os.path.dirname(
                os.path.abspath(__file__)))) + "/logs/"
        self.log = UtilLogger('SugSpider', log_path + 'log_sug_spider')
        self.log_record = UtilLogger('RecordSugSpider',
                                     log_path + 'log_record_sug_spider')

        self.rank_store = RankStore()
        self.history_store = RankHistoryStore()
        self.sleep_time = 60 * 15  # sleep time when there are no tasks
        self.sended_queue_maxsize = 1500  # send limit
        self.send_one_tasks = 800  # number of tasks fetched at a time

        self.reset_task_time = 60 * 60  # 1 hour
        # self.saveport = 3  # port
        self.task_table = "task"
        self.conf_finish_state = False
        self.re_send_count = 4

        self.db_pool = StoreMysqlPool(**config.baidu_spider_move)

    def get_user_password(self):
        return 'fxt', 'fxt_spider'

    def start_requests(self):
        try:
            while True:
                print(self.sended_queue.qsize())
                if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                        and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                    device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
                    task_results = self.rank_store.find_task_lists(
                        device, self.send_one_tasks)
                    if len(task_results) > 0:
                        print "datetime:{},task_results length:{}".format(
                            datetime.now(), len(task_results))
                        for result in task_results:
                            #  id, keyword, urlAddress, device, page, searchType, keyword_id, saveport
                            task_id = result[0]
                            keyword = result[1]
                            target_url = result[2]
                            page = result[3]
                            spidertype = result[4]
                            keyword_id = result[5]

                            req = self.get_request_param(
                                task_id, keyword, target_url, keyword_id)
                            basic_request = SpiderRequest(
                                urls=req['urls'], config=req['configs'])
                            self.sending_queue.put(basic_request)
                        time.sleep(20)
                    else:
                        time.sleep(self.sleep_time)
                else:
                    time.sleep(self.sleep_time)
        except Exception:
            print traceback.format_exc()

    def deal_rank_spider_response(self, url, html, ip):
        result = self.extractor.extractor(html, url['ckurl'])
        self.store_rank(url, result, html, ip)

    @abc.abstractmethod
    def get_request_param(self, task_id, keyword, target_url, keyword_id):
        """{'headers':{}, 'configs':{}, 'url':''}"""
        return

    def store_rank(self, url, rank, response_body, ip):
        item = dict()
        item["keyword"] = url["keyword"]
        item["rank"] = rank
        item["taskId"] = int(url["id"])
        item['target'] = url['ckurl']
        item["response_body"] = response_body
        item['device'] = url['search_device']
        item['ip'] = ip

        self.store_queue.put({
            "result": item,
            "task_id": url["id"],
            "type": StoreTypeEnums.mysql.value,
            "rank": rank,
            "keyword_id": url["keyword_id"]
        })

    def get_stores(self):
        stores = list()
        stores.append(SourceStore(config.baidu_spider_move))
        self.stores = stores
        return stores

    def deal_response_results_status(self, task_status, url, result, request):
        if task_status == '2':
            self.deal_rank_spider_response(url, result["result"],
                                           result['inter_pro'])
        else:
            # handle according to the situation
            self.store_rank(url, -1, result["result"], result['inter_pro'])
            self.log.info('spider failure:%s' % url)
            self.re_send(url, request)

    def re_send(self, url, request):
        self.log_record.info("re_send url:{}, User-Agent:{}".format(
            url["url"], request.headers["User-Agent"]))
        retry_urls = list()
        if "conf_search_count" in url:
            if int(url["conf_search_count"]) < self.re_send_count:
                url["conf_search_count"] = int(url["conf_search_count"]) + 1
                retry_urls.append(url)
            else:
                self.log_record.info(
                    "datetime:{}; state_url:{}; heasers:{}; config:{}".format(
                        datetime.now(), url["url"], request.headers,
                        request.config))
                return
        else:
            url["conf_search_count"] = 1
            retry_urls.append(url)
        new_request = SpiderRequest(headers=request.headers,
                                    config=request.config)
        new_request.urls = retry_urls
        new_request.config["priority"] = 3
        new_request.headers["User-Agent"] = UserAgentUtil().random_one(
            self.search_device)
        self.sending_queue.put(new_request)

    def send_response_body_cos(self, response_body, keyword_id, device, ip):
        """
        Store response_body in Tencent Cloud COS.
        :return:
        """
        try:
            region = config.qcloud_cos.get('region')
            app_id = config.qcloud_cos.get('app_id')
            secret_id = config.qcloud_cos.get('secret_id')
            secret_key = config.qcloud_cos.get('secret_key')
            token = config.qcloud_cos.get('token')
            scheme = config.qcloud_cos.get('scheme')
            bucket = config.qcloud_cos.get('bucket')
            prefix = config.qcloud_cos.get('prefix')
            db_name = config.baidu_spider_move.get("db")
            filename = "{prefix}/html/{date}/{device}/{db_name}/{keyword_id}_{ip}.txt".format(
                prefix=prefix,
                date=date.today().isoformat(),
                device=device,
                db_name=db_name,
                keyword_id=keyword_id,
                ip=ip)
            cos_config = CosConfig(Region=region,
                                   Appid=app_id,
                                   SecretId=secret_id,
                                   SecretKey=secret_key,
                                   Token=token,
                                   Scheme=scheme)
            client = CosS3Client(cos_config)
            response = client.put_object(Bucket=bucket,
                                         Body=response_body,
                                         Key=filename,
                                         StorageClass='STANDARD',
                                         EnableMD5=False)
            print response['ETag']
        except Exception as e:
            print "save_response_body_cos error: {}".format(e)

    # @timeout(10)
    def to_store_results(self, results, stores):
        """
            results type 1: delete the task normally and insert the new rank; 2, 3: needs further checks; 3: has an exact match
                    task_id: id in the task table
        """
        try:
            # start_time = time.time()
            task_id = results["task_id"]
            keyword_id = results["keyword_id"]
            result = results["result"]
            rank = result["rank"]
            response_body = result["response_body"]
            ip = result['ip']
            device = result['device']
            if config.is_send_html_to_cos:
                self.send_response_body_cos(response_body, keyword_id, device,
                                            ip)

            device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
            rank_data = [{
                "keywordid": keyword_id,
                "url": result["target"],
                "rank": rank,
                "device": device,
                "keyword": result["keyword"]
            }]
            send_data = {"rankLists": json.dumps(rank_data)}
            flag = self.send_rank_data(config.callback_url, send_data)
            if flag:
                self.log_record.info(
                    "one finish keyword_id:{}, rank:{}; target: {}".format(
                        keyword_id, result["rank"], result.get("target", "")))
            else:
                self.log.info(
                    "send exception keyword_id:{}, send_url:{}".format(
                        keyword_id, config.callback_url))
                # self.log.info("send exception data:{}".format(send_data))
            self.rank_store.update_status_id(task_id, result["rank"])
            self.history_store.save(self.search_device.name, keyword_id, rank)
        except:
            print traceback.format_exc()

    def send_rank_data(self, send_url, send_data):
        for i in xrange(0, 2):
            try:
                request = urllib2.Request(send_url,
                                          data=urllib.urlencode(send_data))
                response = urllib2.urlopen(request, timeout=10)
                res_content = response.read()
                if not str(res_content).find("success") > -1:
                    self.log_record.info(
                        "res_content no success send_url: {}".format(send_url))
                    return False
                return True
            except:
                self.log.info(traceback.format_exc())
                self.log.info(send_url + ",send_rank_data: " +
                              urllib.urlencode(send_data))
                time.sleep(2)
        return False

    def reset_task(self):
        """
        Reset the status of the task table.
        :return:
        """
        while True:
            time.sleep(10)
            # self.log_record.info("reset:%s" % str(datetime.today()))
            self.rank_store.reset_task(self.reset_task_time)
            time.sleep(self.reset_task_time)
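
send_rank_data posts the serialized rank list to config.callback_url and treats any response body containing "success" as acknowledged, retrying only when the request itself fails. A trimmed sketch of that callback using the same urllib2/urllib calls (Python 2, like the rest of these examples); callback_url and rank_data are placeholders:

import json
import urllib
import urllib2


def post_rank_data(callback_url, rank_data, timeout=10):
    payload = urllib.urlencode({"rankLists": json.dumps(rank_data)})
    for _ in xrange(2):  # at most two attempts, as in send_rank_data
        try:
            response = urllib2.urlopen(
                urllib2.Request(callback_url, data=payload), timeout=timeout)
            # the callback counts as delivered only if it echoes "success"
            return "success" in response.read()
        except Exception:
            pass  # the originals log the traceback and sleep 2s before retrying
    return False
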
Example #10
class ToutiaoSpider(BaseSpiderSign):
    """
    Crawl Toutiao articles.
    Flow:
        1. Fetch the distinct keywords from the database.
        2. Entry point: query each keyword against the Toutiao search_content API and parse the result list.
            Entry URL: https://www.toutiao.com/search_content/?offset=40&format=json&keyword=%E8%8B%B9%E6%9E%9CWWDC%E6%97%B6%E9%97%B4&autoload=true&count=20&cur_tab=1&from=search_tab
        3. Parse the list response to get the number of result pages for the keyword, then build the remaining page URLs.
        4. Parse the article links and summaries from each list response and pass them to the detail-page extractor.
        5. Parse the article pages and store the results in the database.

    """
    def __init__(self):
        super(ToutiaoSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'ToutiaoSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_ToutiaoSpider.log'))

        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))

        self.ext = ToutiaoExtractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 100  # 800 per batch
        self.queue_maxsize = 500  # send volume
        self.sended_queue_maxsize = 800  # already sent

        self.table_count = 1000000
        self.table_index = 0
        self.md5_table = "news_md5"
        self.s_table = "news_{}"
        self.create_table_sql = """
            create table news_{} like news_copy;
        """

        self.spider_count = 0
        self.repeat_count = 0
        self.no_china_count = 0
        self.send_url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword={}&autoload=true&count=20&cur_tab=1&from=search_tab'

    def get_user_password(self):
        # return 'zhouhao', 'zhspider'
        # return 'xuliang', 'xlspider'
        return 'sunxiang', 'sxspider'

    def send_get_spider(self, urls):
        """
        Wrap a GET request and push it onto the download queue.
        """
        basic_request = SpiderRequest(
            headers={'User-Agent': random.choice(self.pc_user_agents)},
            urls=urls,
            config={"redirect": 1})
        self.sending_queue.put_nowait(basic_request)

    def is_get_tasks(self):
        if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.queue_maxsize \
                and self.response_queue.qsize() < self.queue_maxsize and self.store_queue.qsize() < self.queue_maxsize:
            return True
        else:
            return False

    def start_requests(self):
        try:
            while 1:
                if self.is_get_tasks():
                    db = StoreMysql(**config.local_content)
                    update_time = str(datetime.now()).split(".")[0]
                    sql = "select id, keyword from keywords where status = 1 order by update_time asc, priority desc  limit 0, {}".format(
                        self.step)
                    rows = db.query(sql)
                    self.log_record.info(
                        "datetime:{},task_results length:{}".format(
                            datetime.now(), len(rows)))
                    ids = list()
                    if rows:
                        for word in rows:
                            task_id = word[0]
                            ids.append({
                                "id": task_id,
                                "update_time": update_time
                            })

                            keyword = word[1]
                            for i in range(0, 6):
                                send_url = self.send_url.format(
                                    i * 20, keyword)
                                urls = [{
                                    "url": send_url,
                                    "type": 1,
                                    "ext_type": 1,
                                    'keyword': keyword,
                                    'unique_key': self.get_unique_key()
                                }]
                                self.send_get_spider(urls)

                        self.stores[0].store_table(ids,
                                                   "keywords",
                                                   type=2,
                                                   field="id")
                    else:
                        time.sleep(60 * 10)
                    db.close()
                time.sleep(10)
        except Exception:
            print traceback.format_exc()

    def get_stores(self):
        """
        可定义多个数据源
        :return:
        """
        stores = list()
        stores.append(SourceStore(config.local_content))
        self.stores = stores
        return stores

    def deal_response_results_status(self, task_status, url, result, request):
        """
            Handle tasks whose task_status is 2 or 3; return a list of URLs to retry. If a retry needs different headers, define that yourself.
        :param task_status:
        :param url:
        :param result:
        :param request:
        :return:
        """
        if task_status == '2':
            ext_type = url["ext_type"]
            if ext_type == 1:
                self.deal_response_list(url, result['result'])
            elif ext_type == 2:
                self.deal_response_detail(url, result['result'])
        else:
            self.log.info("status is 3 url:{}; headers:{}; config:{}".format(
                url["url"], request.headers, request.config))

    # @fn_timer
    def deal_response_list(self, url, html):
        try:
            keyword = url['keyword']
            # task_id = url['task_id']
            info_list = self.ext.list_extractor(html, keyword)
            if info_list == -1:
                self.log.info("deal_response_list exception url:{}".format(
                    url["url"]))
            else:
                self.store_queue.put({"result": info_list, "type": 1})
        except:
            print(traceback.format_exc())

    # @fn_timer
    def deal_response_detail(self, url, html):
        try:
            list_info = url['info']
            # info.pop("we_name")

            res = self.ext.detail_extractor(html, list_info)
            if res != -1:
                self.store_queue.put({"result": res, "type": 2})
            else:
                self.log.info("deal_response_detail exception url:{}".format(
                    url["url"]))
        except:
            print(traceback.format_exc())

    def to_store_results(self, results, stores):
        """
            type 1: list page, dedupe on title/name
                 2: detail page data
        :param results: a wrapped dict read from the store queue, e.g. {'result': {'title': '...', 'abstract': '...'}, 'type': 2}
        :param stores:
        :return:
        """
        try:
            result = results["result"]  # 这个是真正的结果,格式:字典
            type = results["type"]  # 处理方式,type=1 处理列表页,2:处理详情页
            if type == 1:
                # log_start = time.time()
                # keyword = results["keyword"]
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"])
                    sql = "insert ignore into {}(md5) values('{}')".format(
                        self.md5_table, str(log_md5))
                    s_id = stores[0].insert_row(sql)  # returns a rowcount (number of affected rows?)
                    if s_id > 0:
                        # self.spider_count += 1
                        urls = [{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key()
                        }]
                        self.send_get_spider(urls)  # wrap the URLs gathered from the list page and request them
                    else:
                        self.repeat_count += 1

                if self.repeat_count > 1000:
                    self.log_record.info("repeat_count:{}".format(
                        self.repeat_count))
                    self.repeat_count = 0

            elif type == 2:
                data = result
                if not self.judge_china(data["content"]):
                    # no Chinese text
                    self.no_china_count += 1
                    if self.no_china_count > 1000:
                        self.log_record.info("no_china_count:{}".format(
                            self.no_china_count))
                        self.no_china_count = 0
                    return

                toutiao_content = {
                    "category": data.get("category", ""),
                    "content": data.get("content", ""),
                    "publish_time": data.get("publish_time", ""),
                    "title": data.get("title", ""),
                    "abstract": data.get("abstract", ""),
                    "tags": data.get("tags", ""),
                    'url': data.get('url', ''),
                    'keyword': data.get('keyword', '')
                }
                s_id = stores[0].store_table_one(
                    toutiao_content, "news_{}".format(self.table_index))
                if s_id > 0:
                    if s_id % self.table_count == 0:
                        db = StoreMysql(**config.toutiao_content)

                        update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                            self.table_index)
                        db.do(update_sql)

                        self.table_index += 1
                        db.do(self.create_table_sql.format(self.table_index))

                        insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                            self.table_index)
                        db.do(insert_sql)

                        time.sleep(1)
                        db.close()
                else:
                    time.sleep(0.1)
                time.sleep(4)
        except:
            print(traceback.format_exc())

    def judge_china(self, c_text):
        zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
        match = zhPattern.search(u"" + str(c_text))
        if match:
            return True
        else:
            return False

    def send_wait(self):
        """
        Wait between sends to throttle the rate of requests to the download center.
        """
        time.sleep(1)
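
judge_china drops articles that contain no Chinese text by searching for any character in the \u4e00-\u9fa5 range. A standalone version of the same check with a usage note:

import re

ZH_PATTERN = re.compile(u'[\u4e00-\u9fa5]+')


def has_chinese(text):
    # the same test judge_china applies before an article is stored
    return ZH_PATTERN.search(text) is not None

# has_chinese(u'hello') -> False
# has_chinese(u'\u4f60\u597d world') -> True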