def get_part_req_data(self, nickname):
     """
     Capture only the request data for read counts and comments.
     :param nickname: nickname of the WeChat official account
     :return: success ultimately depends on whether valid data is found in redis
     """
     TidyReqData.flush_data()
     redis_instance.set('current_nickname', nickname)
     self.home_to_gzh_search()
     self.search_gzh(nickname)
     self.click_a_message(args=1)
     self.check_comments()
     self.home()
 def get_all_req_data(self, nickname, hand=False):
     """
     Capture all request data for one official account. The current program uses the baidu API,
     which is constrained by network and concurrency limits, so the results are not entirely ideal.
     :param nickname: nickname of the WeChat official account
     :param hand: if True, capture the parameters manually instead of via UI automation
     :return: success ultimately depends on whether valid data is found in redis
     """
     TidyReqData.flush_data("*.req")
     redis_instance.set('current_nickname', nickname)
     self.home_to_gzh_search()
     self.search_gzh(nickname)
     if not hand:
         self.all_message()
         self.click_a_message()
         # self.check_comments()
     else:
         input("请一一手动或取参数 回车退出")
     self.home()
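
Both capture routines above only drive the UI; whether a run actually succeeded has to be checked in redis afterwards, as the docstrings note. A minimal sketch of such a check, assuming the captured requests land under keys matching the same "*.req" pattern that flush_data() clears (the key layout is an assumption, not confirmed by this code):

def has_valid_req_data(redis_instance):
    # assumption: captured request data is cached under keys like "<account>.req"
    return len(redis_instance.keys("*.req")) > 0

# hypothetical usage after a capture run:
# helper.get_part_req_data("SomeOfficialAccount")
# if has_valid_req_data(redis_instance):
#     print("capture ok, the scrapy spiders can start")
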
Example #3
 def process_request(self, request, spider):
     current_req_data = self.req_data_list[self.counter % self.wx_num]
     req_data = TidyReqData.req_to_dict(
         current_req_data['load_more']['req_data'])
     request.set_method(req_data['method'])
     req_data['url_param_dict']['offset'] = request.meta['list_offset']
     url = req_data['url'] + dict_to_str(req_data['url_param_dict'])
     request._set_url(url)
     request.set_headers(req_data['headers'])
     self.counter += 1
     return None
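
This middleware rewrites each outgoing request with the captured method and headers, and rebuilds the url from req_data['url'] plus dict_to_str(req_data['url_param_dict']). dict_to_str lives in tools.utils and is not shown in this listing; a plausible stand-in, inferred only from how it is called (the real helper may encode parameters differently):

from urllib.parse import quote

def dict_to_str(param_dict):
    # join the parameter dict into a query-string fragment appended to the base url
    return "&".join("%s=%s" % (k, quote(str(v))) for k, v in param_dict.items())

# e.g. dict_to_str({"offset": 10, "count": 10}) -> "offset=10&count=10"
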
Example #4
 def __init__(self, *args, **kwargs):
     """
     :param args:
     :param kwargs:
     Initialization performed when the spider is instantiated.
     """
     # generator over all articles of the current official account that have a title but no article text yet
     self.current_nickname = TidyReqData.get_nickname()
     self.articles_list = get_collection_article(self.current_nickname,
                                                 article={"$exists": False},
                                                 title={"$exists": True})
     self.crawler_begin_time = time()
     self.crawler_parse_counter = 0
Example #5
 def process_request(self, request, spider):
     current_req_data = self.req_data_list[self.counter % self.wx_num]
     req_data = TidyReqData.req_to_dict(
         current_req_data['content']['req_data'])
     url = request._get_url()
     raw_url = copy(url)
     if "https" in raw_url:
         raw_url = raw_url.replace("https", "http")
     request.set_ext_data({"raw_url": raw_url})
     if "https" not in url:
         url = url.replace("http", "https")
     request._set_url(url)
     request.set_method(req_data['method'])
     if "Cookie" in req_data['headers']:
         req_data['headers'].pop("Cookie")
     request.set_headers(req_data['headers'])
     self.counter += 1
     return None
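
As in the other middlewares, self.req_data_list[self.counter % self.wx_num] rotates through however many captured WeChat accounts are available, one request per account in turn. A stripped-down illustration (the account entries are made up):

req_data_list = [{"account": "wx_a"}, {"account": "wx_b"}, {"account": "wx_c"}]  # illustrative
wx_num = len(req_data_list)
for counter in range(6):
    current = req_data_list[counter % wx_num]
    print(counter, current["account"])   # wx_a, wx_b, wx_c, wx_a, wx_b, wx_c
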
Example #6
 def process_request(self, request, spider):
     current_req_data = self.req_data_list[self.counter % self.wx_num]
     req_data = TidyReqData.req_to_dict(
         current_req_data['getappmsgext']['req_data'])
     content_url = request._get_url()
     content_url_param_dict = str_to_dict(
         content_url.split('?')[-1], '&', '=')
     body_dict = req_data['body_dict']
     body_dict.update(content_url_param_dict)
     body_dict['comment_id'] = request.get_ext_data['comment_id']
     body_dict['is_need_reward'] = 1
     url = req_data['url'] + req_data['url_param_str']
     request._set_url(url)
     request.set_method(req_data['method'])
     request.set_headers(req_data['headers'])
     body_str = dict_to_str(body_dict)
     request._set_body(body_str)
     self.counter += 1
     return None
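
Here str_to_dict does the inverse job of dict_to_str: it splits the query string of the article url back into a dict so its parameters can be merged into the request body. A plausible sketch of the helper, inferred from the call str_to_dict(raw, '&', '=') (the real tools.utils implementation may differ):

def str_to_dict(raw, item_sep, kv_sep):
    # "mid=1&sn=abc&idx=2" -> {"mid": "1", "sn": "abc", "idx": "2"}
    result = {}
    for part in raw.split(item_sep):
        if kv_sep in part:
            key, value = part.split(kv_sep, 1)
            result[key] = value
    return result
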
Example #7
 def __init__(self, *args, **kwargs):
     """
     :param args:
     :param kwargs:
     Initialization performed when the spider is instantiated.
     """
     # articles of the current official account that have a comment_id but no read-count data yet
     self.current_nickname = TidyReqData.get_nickname()
     print(self.current_nickname)
     articles_list = get_collection_article(self.current_nickname,
                                            read_num={"$exists": False},
                                            comment_id={"$exists": True})
     self.articles_list = []
     for article in articles_list:
         self.articles_list.append(article)
     self.task_num = len(self.articles_list)
     self.task_counter = 0
     self.begin_time = time()
     self.pre_time = time()
Example #8
    def prepare_req_data(self, current_req_data, request, _type):
        """
        :param current_req_data: request parameters to use for this round of requests
        :param request: the Request object
        :return: prepared request parameters for crawling the read-count data
        """
        request_data = {}

        if _type in ['getappmsgext', 'appmsg_comment']:
            req_data = TidyReqData.req_to_dict(
                current_req_data[_type]['req_data'])
        else:
            return request_data

        # build the body parameters from the original article url
        content_url = request._get_url()
        content_url_param_dict = str_to_dict(
            content_url.split('?')[-1], '&', '=')
        body_dict = copy(req_data['body_dict'])
        from tools.utils import update_dict_by_dict
        update_dict_by_dict(body_dict, content_url_param_dict,
                            ['mid', 'sn', 'idx', 'scene'])
        body_dict['comment_id'] = request.meta['comment_id']
        body_dict['is_need_reward'] = 1
        # if this request is for the comment content
        if "comment_id" in req_data['url_param_dict']:
            url_param_dict = copy(req_data['url_param_dict'])
            url_param_dict['comment_id'] = request.meta['comment_id']
            url_param_dict['idx'] = content_url_param_dict['idx']
            from tools.utils import dict_to_str
            url_param_str = dict_to_str(url_param_dict)
            request_data['url_str'] = req_data['url'] + url_param_str
        # if this request is for the read count
        else:
            request_data['url_str'] = req_data['url'] + req_data['url_param_str']
        request_data['header_dict'] = req_data['headers']
        request_data['body_dict'] = body_dict

        return request_data
    def get_xcx_item_list(self, nickname, hand=False):
        """
        Capture all request data for the mini program.
        :param hand: whether to operate manually
        :param nickname: name of the mini program
        :return:
        """
        print(nickname)
        TidyReqData.flush_data("*.req")
        self.home_to_search()
        self.search_xcx(nickname)
        # after selecting the first search result, enter the mini program and pick the first tab
        self.oap.tap(tuple(eval(self.data['BTN']['JIU_QIAN_ZFJY'])))
        time.sleep(1)
        # self.oap.tap(tuple(eval(self.data['BTN']['JIU_QIAN_HWYJ'])))
        # take screenshots and match them against records to obtain the related information

        # option 1: pull the full article list first, then iterate over it and fetch each article
        # option 2: capture the screen and extract the information as we go
        get_list_slide_num = 0
        while redis_instance.get("xcx_get_list_stop") is None:
            self.oap.swap([60, 1000], [60, 250])
            get_list_slide_num = get_list_slide_num + 1
            time.sleep(0.5)
        # swipe back to the top
        if redis_instance.get("xcx_get_list_stop"):
            for i in range(get_list_slide_num):
                self.oap.swap([60, 250], [60, 1000])

        # fetch the mini program item list
        xcx_item_list = TidyReqData.get_xcx_req_data("*._xcx")
        # xcx_item_list = []
        for item in xcx_item_list:
            print("当前文档", item['title'])
            if xcx.doc_exist("jqzt", item['id']):
                self.oap.swap([60, 500], [60, 250])
                continue
            # walk through each item and process it via screenshots
            item_pos = self.vc.click_by_words(item['title'], tap=False)
            print(item_pos, "", item['title'])
            self.oap.tap(item_pos)
            time.sleep(3)
            self.oap.key(self.data['KEY']['BACK_KEYEVENT'])

            # limit reached, exit the loop
            if redis_instance.get("xcx_get_detail_stop"):
                break

            self.oap.swap([60, 500], [60, 250])
            # swipe to keep pulling the list; stop once it has been fully loaded
            time.sleep(1)

        self.oap.key(self.data['KEY']['BACK_KEYEVENT'])
        self.oap.key(self.data['KEY']['BACK_KEYEVENT'])
        print("原始数据进入mongo %s" % ("xcx_jqzt"))
        TidyReqData.insert_xcx_to_mongo("xcx_jqzt")
        print("原始数据进入mongo %s 完成" % ("xcx_jqzt"))
        print("正在为 %s 创建索引..." % ("jqzt"))
        index_result = xcx.index_db_docs("jqzt")
        print("索引完成", index_result)
        print("redis 相关数据设置缓存时间")
        ttl_result = TidyReqData.set_redis_ttl(60 * 60 * 5)
        print("redis 5小时失效时间设置完成")
Example #10
 def spider_opened(self, spider):
     self.wx_num, self.req_data_dict, self.req_data_list = TidyReqData.get_gzh_req_data()
     if self.wx_num == 0:
         self.wx_num = 1
     self.pre_crawl_time = time.time()
Example #11
 def spider_opened(self, spider):
     self.wx_num, self.req_data_dict, self.req_data_list = TidyReqData.get_gzh_req_data()
     if self.wx_num == 0:
         self.wx_num = 1
Example #12
class ArticleListSpider(scrapy.Spider):
    name = 'article_list'
    allowed_domains = ['mp.weixin.qq.com']
    start_url = []
    custom_settings = get_global_settings()
    wx_num, _, _ = TidyReqData.get_gzh_req_data()
    if wx_num == 0:
        wx_num = 1
    custom_settings['DOWNLOAD_DELAY'] = round(2.0 / wx_num, 2)
    custom_settings['ITEM_PIPELINES'] = {
        'crawler.crawler.pipelines.load_more.ResponseArticleListPipeline': 300,
    }
    custom_settings['DOWNLOADER_MIDDLEWARES'] = {
        'crawler.crawler.middlewares.load_more.LoadMoreMiddleware': 543,
    }
    counter = 0
    list_offset = 0

    def __init__(self, *args, **kwargs):
        """
        :param args:
        :param kwargs:
        Initialization performed when the spider is instantiated.
        """
        self.current_nickname = ''

    def start_requests(self):
        """
        :return: overridden entry point of the spider; without it, the urls in start_urls would be requested directly.
        After overriding, Request is issued manually with an explicit callback such as self.parse.
        """
        yield Request(url='http://www.aii.com',
                      meta={"list_offset": self.list_offset},
                      callback=self.parse,
                      dont_filter=True)

    def parse(self, response):
        """
        :param response:
        :return: callback invoked after the request completes
        """
        self.counter += 1
        cmc = response.get_ext_data['can_msg_continue']
        next_offset = response.get_ext_data['next_offset']
        item = LoadMoreItem()
        item['article_list'] = response.get_ext_data['data']
        item['nickname'] = response.get_ext_data['nickname']
        self.current_nickname = response.get_ext_data['nickname']
        gc.report_crawling({
            'nickname': item['nickname'],
            'percent': 'UNK',
            'more': cmc,
            'title': len(item['article_list'])
        })
        yield item
        if cmc == 1:
            yield Request(url='http://www.aii.com',
                          meta={"list_offset": next_offset},
                          callback=self.parse,
                          dont_filter=True)

    def close(self, reason):
        """
        :param reason:
        :return: callback that closes the spider after all urls have been requested
        """
        # remove entries of deleted articles; a deleted entry has an empty content_url
        from db import delete
        delete(self.current_nickname, content_url="")
        print(self.name, "spider closed")
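
The class-level DOWNLOAD_DELAY = round(2.0 / wx_num, 2) throttles the list crawl in proportion to how many WeChat accounts were captured: more accounts means a shorter per-request delay. For reference, a quick check of the resulting values:

# effective delay between requests for the article-list spider
for wx_num in (1, 2, 3, 4):
    print(wx_num, "account(s) ->", round(2.0 / wx_num, 2), "s")
# 1 -> 2.0, 2 -> 1.0, 3 -> 0.67, 4 -> 0.5
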
Example #13
def flush_req_data():
    from crawler_assist.tidy_req_data import TidyReqData
    TidyReqData.flush_data("*.req")
    return "缓存的请求数据已经删除"
Example #14
    def get_xcx_item_list_mini_batch(self, nickname, cur_phone):
        """
        Capture all request data for the mini program (mini-batch variant).
        :param cur_phone: current device
        :param nickname: name of the mini program
        :return:
        """
        print(nickname)
        # TidyReqData.flush_data("*.req")
        self.back_to_weixin_home()
        self.home_to_search()
        self.search_xcx(nickname)
        time.sleep(3)
        # listen here for whether this device's search_key needs to be refreshed
        # print("config data", OPENID_PHONE)
        open_id = OPENID_PHONE[cur_phone]
        task_device_list = device_manager.get_task_type_devices("wxzs")
        open_id_device = dict(zip(OPENID_PHONE.values(), OPENID_PHONE.keys()))
        # check whether the search_key bound to this account is still valid
        while True:
            need_update = TidyReqData.get_need_update_keys()
            # verify that each device pending update still exists; if not, drop its open_id
            if len(need_update) > 0:
                need_del_open_id_list = []
                for need_update_open_id in need_update:
                    need_update_device_num = open_id_device[
                        need_update_open_id]
                    if need_update_device_num not in task_device_list:
                        need_del_open_id = OPENID_PHONE[need_update_device_num]
                        need_del_open_id_list.append(need_del_open_id)
                if need_del_open_id_list:
                    req_res = TidyReqData.set_offline_wechat_index_accounts(
                        need_del_open_id_list)
                    print("清除不可用账号{},结果{}".format(need_del_open_id_list,
                                                  req_res))
                    time.sleep(2)
                    need_update = TidyReqData.get_need_update_keys()

            # check whether any device needs a refresh
            if len(need_update) > 0:
                print("接收到更新内容", need_update)
                print("当前设备:{}, 当前open_id: {}".format(cur_phone, open_id))
                if open_id in need_update:
                    # tap the top-right corner
                    time.sleep(1)
                    self.back_to_weixin_home()
                    time.sleep(1)
                    try:
                        device_manager.push(cur_phone)
                        print("{} 设备释放成功".format(cur_phone))
                    except TfMongoException as e:
                        # device_manager.push(cur_phone)
                        print(
                            TfMongoException(
                                -2, "device {} -- mongo device release failed, "
                                "possibly a connection timeout".format(cur_phone),
                                cur_phone).processer())
                    break
                # two possibilities here: 1. a device from another thread came in, 2. a device genuinely not responding (not handled for now)
                else:
                    # the linked device is not responding; after this step, exit the task and refresh the device
                    print("device(s) {} need a refresh or may already be disconnected; fetching pending-update records again".format(need_update))
                    # push the refreshed availability list to the server
                    self.oap.swap([60, 400], [60, 350])
                    time.sleep(randint(1, 3))
                    self.oap.swap([60, 350], [60, 400])
                    time.sleep(1)
                    # break

            else:
                print("设备{}重新获取待更新记录".format(cur_phone))
                self.oap.swap([60, 400], [60, 350])
                time.sleep(randint(3, 6))
                self.oap.swap([60, 350], [60, 400])
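
The mini-batch routine builds a reverse lookup with dict(zip(OPENID_PHONE.values(), OPENID_PHONE.keys())) so that an open_id flagged for update can be traced back to its device number. A tiny illustration with made-up values:

OPENID_PHONE = {"phone_01": "o_abc", "phone_02": "o_def"}   # device -> open_id (illustrative)
open_id_device = dict(zip(OPENID_PHONE.values(), OPENID_PHONE.keys()))
print(open_id_device["o_abc"])   # -> "phone_01"
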
Example #15
class ArticleSpider(scrapy.Spider):
    """
    Spider for the text content of official account articles
    """
    name = 'article'
    allowed_domains = ['mp.weixin.qq.com']
    start_url = []
    custom_settings = get_global_settings()
    wx_num, _, _ = TidyReqData.get_gzh_req_data()
    # set a request interval out of concern the ip might get banned
    # custom_settings['DOWNLOAD_DELAY'] = 0.5
    custom_settings['DOWNLOADER_MIDDLEWARES'] = {
        'crawler.crawler.middlewares.crawl_article.CrawlArticleMiddleware':
        543,
    }
    custom_settings['ITEM_PIPELINES'] = {
        'crawler.crawler.pipelines.crawl_article.ResponseArticlePipeline': 300,
    }
    custom_settings['DOWNLOAD_TIMEOUT'] = 10
    custom_settings['CONCURRENT_REQUESTS'] = 16

    def __init__(self, *args, **kwargs):
        """
        :param args:
        :param kwargs:
        Initialization performed when the spider is instantiated.
        """
        # generator over all articles of the current official account that have a title but no article text yet
        self.current_nickname = TidyReqData.get_nickname()
        self.articles_list = get_collection_article(self.current_nickname,
                                                    article={"$exists": False},
                                                    title={"$exists": True})
        self.crawler_begin_time = time()
        self.crawler_parse_counter = 0

    def start_requests(self):
        """
        :return: overridden entry point of the spider; without it, the urls in start_urls would be requested directly.
        After overriding, Request is issued manually with an explicit callback such as self.parse.
        """
        for article in self.articles_list:
            if "weixin" in article['content_url']:
                yield Request(url=article['content_url'], callback=self.parse)

    def parse(self, response):
        """
        :param response:
        :return: callback invoked after the request completes
        """
        item = CrawlArticleItem()
        item['article_data'] = response.get_ext_data['article_data']
        item['nickname'] = response.get_ext_data['nickname']
        item['raw_url'] = response.get_ext_data['raw_url']
        self.crawler_parse_counter += 1
        time_gap = time() - self.crawler_begin_time
        print(round(time_gap / self.crawler_parse_counter, 3),
              item['article_data']['article'].replace('\n', ''))
        # send crawl status to the front end
        crawling_item = {}
        crawling_item['nickname'] = item['nickname']
        crawling_item['percent'] = self.crawler_parse_counter
        crawling_item['more'] = round(time_gap / self.crawler_parse_counter, 3)
        crawling_item['title'] = find_one(item['nickname'],
                                          item['raw_url'])['title'][:10]
        gc.report_crawling(crawling_item)
        yield item

    def close(self, reason):
        """
        :param reason:
        :return: callback that closes the spider after all urls have been requested
        """
        time_gap = time() - self.crawler_begin_time
        if self.crawler_parse_counter != 0:
            print("%s爬虫关闭 用时%d 共计爬取%d 平均%f" %
                  (self.name, time_gap, self.crawler_parse_counter,
                   time_gap / self.crawler_parse_counter))
        from instance.global_instance import gs
        print("正在为 %s 创建索引..." % (self.current_nickname))
        index_result = gs.index_db_docs(self.current_nickname)
        print("索引完成", index_result)
        from db.meta_data import insert_article_metadata
        insert_article_metadata(
            self.current_nickname, {
                'date': datetime.datetime.now(),
                'articles_num': self.crawler_parse_counter
            })
Example #16
class ArticleReadDataSpider(scrapy.Spider):
    """
    Spider for official account article read-count data
    """
    name = 'read_data'
    allowed_domains = ['mp.weixin.qq.com']
    start_url = []
    custom_settings = get_global_settings()
    wx_num, _, _ = TidyReqData.get_gzh_req_data()
    if wx_num == 0:
        wx_num = 1
    custom_settings['DOWNLOAD_DELAY'] = round(2.5 / wx_num, 2)
    custom_settings['DOWNLOADER_MIDDLEWARES'] = {
        'crawler.crawler.middlewares.crawl_article.ArticleReadDataMiddleware':
        543,
    }
    custom_settings['ITEM_PIPELINES'] = {
        'crawler.crawler.pipelines.crawl_article.ResponseArticleReadDataPipeline':
        300,
    }
    custom_settings['CONCURRENT_REQUESTS'] = 1

    def __init__(self, *args, **kwargs):
        """
        :param args:
        :param kwargs:
        Initialization performed when the spider is instantiated.
        """
        # articles of the current official account that have a comment_id but no read-count data yet
        self.current_nickname = TidyReqData.get_nickname()
        print(self.current_nickname)
        articles_list = get_collection_article(self.current_nickname,
                                               read_num={"$exists": False},
                                               comment_id={"$exists": True})
        self.articles_list = []
        for article in articles_list:
            self.articles_list.append(article)
        self.task_num = len(self.articles_list)
        self.task_counter = 0
        self.begin_time = time()
        self.pre_time = time()

    def start_requests(self):
        """
        :return: overridden entry point of the spider; without it, the urls in start_urls would be requested directly.
        After overriding, Request is issued manually with an explicit callback such as self.parse.
        """
        for article in self.articles_list:
            if ':' in article['content_url']:
                request = Request(url=article['content_url'],
                                  callback=self.parse,
                                  dont_filter=False)
                request.set_ext_data({
                    'content_url': article['content_url'],
                    'comment_id': article['comment_id']
                })
                yield request

    def parse(self, response):
        """
        :param response:
        :return: callback invoked after the request completes
        """
        item = CrawlArticleReadDataItem()
        item['read_data'] = response.get_ext_data['read_data']
        item['nickname'] = response.get_ext_data['nickname']
        item['content_url'] = response.get_ext_data['content_url']
        # print spider status information
        self.task_counter += 1
        pre_time_gap = time() - self.pre_time
        total_time_gap = time() - self.begin_time
        time_need = (self.task_num - self.task_counter) * (total_time_gap /
                                                           self.task_counter)
        print(round(pre_time_gap, 2),
              round(total_time_gap / self.task_counter,
                    2), "%d/%d" % (self.task_counter, self.task_num),
              response.get_ext_data['read_data']['read_num'],
              response.get_ext_data['read_data']['like_num'],
              response.get_ext_data['read_data']['nick_name'],
              str(datetime.timedelta(seconds=time_need)).split('.')[0])
        self.pre_time = time()
        crawling_item = {}
        crawling_item['nickname'] = item['nickname']
        crawling_item['percent'] = '%d/%d' % (self.task_counter, self.task_num)
        crawling_item['more'] = response.get_ext_data['read_data']['read_num']
        crawling_item['title'] = find_one(item['nickname'],
                                          item['content_url'])['title'][:10]
        gc.report_crawling(crawling_item)
        yield item

    def close(self, reason):
        """
        :param reason:
        :return: callback that closes the spider after all urls have been requested
        """
        print(self.name, "spider closed")
 def spider_opened(self, spider):
     spider.logger.info('Spider opened: %s' % spider.name)
     self.wx_num, self.req_data_dict, self.req_data_list = TidyReqData.get_gzh_req_data()
     if self.wx_num == 0:
         self.wx_num = 1