Пример #1
0
class ProxyManager(object):
    """
    ProxyManager
    """
    def __init__(self):
        self.db = ProxyDBClient('proxy')
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter/getFreeProxy.py
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in conf.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # 直接存储代理, 不用在代码中排重, hash 结构本身具有排重功能
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                self.log.warn(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                continue

    def get(self):
        """
        return a useful proxy
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            if EnvUtil.PY3:
                return random.choice(list(item_dict.keys()))
            else:
                return random.choice(item_dict.keys())
        return None
        # return self.db.pop()

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if EnvUtil.PY3:
            return list(item_dict.keys()) if item_dict else list()
        return item_dict.keys() if item_dict else list()

    def getNumber(self):
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class QuestionsGenerator:
    """ 获取话题内的精华和讨论问题"""
    top_activity_url = "https://www.zhihu.com/api/v4/topics/%u/feeds/top_activity?limit=10&offset=%u"  # 讨论问题
    essence_url = "https://www.zhihu.com/api/v4/topics/%u/feeds/essence?limit=10&offset=%u"  # 精华问题

    def __init__(self):
        self.p_receiver = ProxiesReceiver()
        self.essence_dict = dict()
        self.activity_dict = dict()
        self.session = requests.Session()
        # 先从话题id中筛选出未获取问题的id放入待获取队列
        QuestionsGenerator.save_new_topic_id()
        self.logger = LogHandler("question_generator")

    @staticmethod
    def badge_parse(badge):
        res_array = []
        for b in badge:
            res_array.append({
                'author_type': b['type'],
                'author_description': b['description']
            })
        return res_array

    def topic_sticky_parse(self, topic_sticky):
        res_array = []
        try:
            for t in topic_sticky:
                if t['target'].get('title'):
                    res_array.append({
                        'id':
                        t['target']['id'],
                        'title':
                        t['target']['title'],
                        'author_name':
                        t['target']['author']['name'],
                        'author_gender':
                        t['target']['author']['gender'],
                        'author_headline':
                        t['target']['author']['headline'],
                        'author_url_token':
                        t['target']['author']['url_token'],
                        'excerpt':
                        t['target']['excerpt'],
                        'updated_time':
                        t['target']['updated'],
                        'contents':
                        t['target']['comment_count'],
                        'likes':
                        t['target']['voteup_count'],
                        'created_time':
                        t['target']['created']
                    })
                else:
                    res_array.append({
                        'id':
                        t['target']['id'],  # 热门回答
                        'title':
                        t['target']['question']['title'],
                        'author_name':
                        t['target']['author']['name'],
                        'author_gender':
                        t['target']['author']['gender'],
                        'author_headline':
                        t['target']['author']['headline'],
                        'author_url_token':
                        t['target']['author']['url_token'],
                        'excerpt':
                        t['target']['excerpt'],
                        'updated_time':
                        t['target']['updated_time'],
                        'created':
                        t['target']['question']['created'],
                        'created_time':
                        t['target']['created_time']
                    })
        except KeyError as ke:
            self.logger.error(ke, topic_sticky)
        except Exception as e:
            raise e
        return res_array

    @timethis
    def crawl_topic_message(self, tid, q_url, num, message_dict):
        """
        话题对应问题/文章获取,最终结果存放:
        message_dict: key-类型(文章/问题),value-{key-id, value-{'question/title':问题/文章名称, author':作者,
        'gender':性别,'author_badge':作者标签, 'author_headline':作者签名, 'author_url_token':作者url标识,excerpt':摘录,
        'created_time':创建时间, 'updated_time':最后更新时间, 'comment_count':'评论数', likes':点赞数}}
        :param tid:话题ID
        :param q_url:请求url
        :param num 抓取数量
        :param message_dict:存储字典
        :return:
        """
        session = requests.session()
        sticky_num = 0
        for offset in range(0, num, 10):
            url = q_url % (tid, offset)
            try:
                ques = session.get(url=url,
                                   headers=headers,
                                   proxies=self.p_receiver.one_random,
                                   timeout=3)
            except Exception as re:
                self.logger.warn((re, url))
                continue
            try:
                q_json = ques.json() if ques else {}
            except JSONDecodeError as je:
                self.logger.error((je, url, ques))
                continue
            for q in q_json.get('data', []):
                target = q.get('target', {})
                question_type = str(target.get('type', 'none_type')).lower()
                if question_type == 'none_type':
                    continue
                elif question_type == 'answer':
                    # 问题回答是双id,使用元组转成字符串
                    addtwodimdict(
                        message_dict, 'answer',
                        str((target['question']['id'], target['id'])), {
                            'question':
                            target['question']['title'],
                            'author_name':
                            target['author']['name'],
                            'author_gender':
                            target['author']['gender'],
                            'author_badge':
                            QuestionsGenerator.badge_parse(
                                target['author']['badge']),
                            'author_headline':
                            target['author']['headline'],
                            'author_url_token':
                            target['author']['url_token'],
                            'excerpt':
                            target['excerpt'],
                            'created_time':
                            target['created_time'],
                            'updated_time':
                            target['updated_time'],
                            'contents':
                            target['comment_count'],
                            'likes':
                            target['voteup_count']
                        })
                elif question_type == 'article':
                    # 文章是单id
                    addtwodimdict(
                        message_dict, 'article', target['id'], {
                            'title':
                            target['title'],
                            'author_name':
                            target['author']['name'],
                            'author_gender':
                            target['author']['gender'],
                            'author_badge':
                            QuestionsGenerator.badge_parse(
                                target['author']['badge']),
                            'author_headline':
                            target['author']['headline'],
                            'author_url_token':
                            target['author']['url_token'],
                            'excerpt':
                            target['excerpt'],
                            'created_time':
                            target['created'],
                            'updated_time':
                            target['updated'],
                            'contents':
                            target['comment_count'],
                            'likes':
                            target['voteup_count']
                        })
                elif question_type == 'question':
                    pass  # 目前抓取到的是一些没有回答的问题,这里过滤掉
                elif question_type == 'topic_sticky_module':  # 热门置顶
                    addtwodimdict(
                        message_dict, 'topic_sticky_module', sticky_num, {
                            'title': target['title'],
                            'data': self.topic_sticky_parse(target['data'])
                        })
                    sticky_num += 1
                else:
                    self.logger.error(
                        "There was a new type:{}!\n".format(question_type))
            if str(q_json.get('paging', {}).get('is_end',
                                                'none')).lower() == 'true':
                return

    def process(self):
        """
        zhTopicQuestions内包含精华(点赞比较多)和讨论(最新比较热)的问题和文章
        对应结构essence_dict和activity_dict
        :return:
        """
        tid = int(redis_cli.block_pop('zhNewTopicID'))
        self.crawl_topic_message(tid, QuestionsGenerator.essence_url,
                                 conf.essence_nums, self.essence_dict)
        self.crawl_topic_message(tid, QuestionsGenerator.top_activity_url,
                                 conf.top_activity_nums, self.activity_dict)
        with open(os.path.join(OUTPUT_PATH, "{}.json".format(int(tid))),
                  "w",
                  encoding="GB18030") as f:
            json.dump(
                {
                    'essence': self.essence_dict,
                    'top_activity': self.activity_dict
                },
                f,
                indent=4,
                ensure_ascii=False)

    @staticmethod
    def save_new_topic_id():
        """ output目录下不要手动放其他名称文件 """
        for nid in set(redis_cli.hkeys('zhTopicMessage')) - set(
            {x.split('.')[0]
             for x in os.listdir(OUTPUT_PATH)}):
            redis_cli.sadd('zhNewTopicID', nid)