예제 #1
0
class ZhiDaoGenerator:
    """Crawl keywords from Baidu Zhidao.

    Two kinds of keywords are collected for every question on the listing
    page: the tags attached to the question (the area it belongs to) and the
    keywords found on the question's own page.
    """

    def __init__(self):
        """Create the logger and the headers used for the listing request.

        A request to the Zhidao browse page is made here to read the
        ``BAIDUID`` cookie; according to the original author's note the
        listing returns no data without it.

        Raises:
            Whatever the request or the cookie lookup raises (for example a
            ``RequestException`` or a ``KeyError`` when the cookie is
            missing); nothing is swallowed here.
        """
        self.logger = LogHandler('zhidao_crawl')
        self.zhidao_headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 16 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Cookie':
            'BAIDUID=%s' % requests.get('https://zhidao.baidu.com/browse/',
                                        timeout=3).cookies['BAIDUID'],
        }

    def __zd_question_keywords(self, url):
        """Return the keywords listed on a single question page.

        Args:
            url: Address of the question page.

        Returns:
            A list with the text of every ``word-text`` element found inside
            ``li`` elements whose class contains ``'word grid'``. An empty
            list is returned when the request fails; the failure is logged
            as a warning.
        """
        try:
            resp = requests.get(url, timeout=3)
            resp.encoding = resp.apparent_encoding
            soup = BeautifulSoup(resp.text, "html.parser")
            keywords = []
            for item in soup.find_all(
                    'li',
                    class_=lambda class_: class_ and ('word grid' in class_)):
                word = item.find(class_="word-text")
                # ``.string`` is None when the element does not hold exactly
                # one text node; skip it so the caller never stores 'None'.
                if word is not None and word.string is not None:
                    keywords.append(word.string)
            return keywords
        except RequestException as exc:
            self.logger.warning(exc)
            return []

    @timethis
    def crawl_zhidao_words(self, save_keyword):
        """Crawl the Zhidao question listing and hand keywords to a callback.

        Args:
            save_keyword: Callable invoked once per keyword with the keyword
                text as its only argument.

        A ``RequestException`` raised while processing the listing is logged
        as a warning and ends the crawl; any other exception propagates.
        """
        try:
            resp = requests.get(
                url=
                'https://zhidao.baidu.com/list?_pjax=%23j-question-list-pjax-container',
                headers=self.zhidao_headers,
                timeout=3)
            resp.encoding = resp.apparent_encoding
            soup = BeautifulSoup(resp.text, "html.parser")
            for section in soup.find_all('div',
                                         class_='question-title-section'):
                # Keywords of the area the question belongs to (its tags).
                for tag in section.find_all('a', class_='tag-item'):
                    if tag.string is not None:
                        save_keyword(tag.string.replace('\n', ''))
                # Keywords contained in the question page itself. A section
                # without a usable link has no page to visit.
                link = section.a
                href = link.get('href') if link is not None else None
                if not href:
                    continue
                for keyword in self.__zd_question_keywords(href):
                    save_keyword(str(keyword))
        except RequestException as exc:
            self.logger.warning(exc)
예제 #2
0
class ZhihuTopicGenerator:
    """Collect Zhihu topics in two phases: topic-id discovery and expansion.

    Discovery searches for a keyword and records every topic found;
    expansion then follows the parent and child links of each new topic.
    """

    def __init__(self):
        """Set up the proxy source, the HTTP session and the logger."""
        # Source of proxy addresses.
        self.p_receiver = ProxiesReceiver()
        # Session whose https adapter retries failed connections (up to 3
        # connect retries with a 0.5 backoff factor).
        self.session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.logger = LogHandler('topics_generator')
        # Only let urllib3 report errors.
        logging.getLogger("urllib3").setLevel(logging.ERROR)

    def __get_topic_message(self, tid):
        """Fetch the details of one topic and store them in Redis.

        The details are written to the ``zhTopicMessage`` hash under the
        topic id as the ``str()`` of a dict with the keys ``name``,
        ``introduction``, ``questions_count``, ``best_answers_count``,
        ``followers_count`` and ``best_answerers_count``.

        Args:
            tid: Integer topic id.

        Returns:
            True when ``hset`` reports a truthy result (presumably a newly
            created field - confirm against ``redis_cli``); the id is then
            also added to the ``zhNewTopicID`` set. False otherwise,
            including when the request fails.
        """
        fields = ("name", "introduction", "questions_count",
                  "best_answers_count", "followers_count",
                  "best_answerers_count")
        try:
            j_rst = self.session.get(url=topic_message_url % tid,
                                     headers=headers,
                                     proxies=self.p_receiver.one_random,
                                     timeout=3).json()
            message = {field: j_rst.get(field) for field in fields}
            if redis_cli.hset('zhTopicMessage', tid, str(message)):
                # Remember the id so related information can be fetched.
                redis_cli.sadd('zhNewTopicID', tid)
                self.logger.info("zhNewTopicID:%d", tid)
                return True
        except RequestException as exc:
            self.logger.warning(exc)
        return False

    def __get_hot_topics(self):
        """Search for one queued keyword and yield the ids of new topics.

        A keyword is popped from ``zhTemporaryWords`` and the search result
        is paged through in steps of 10 up to offset 1000, stopping early on
        an empty page. A page that fails with a request error or a
        ``KeyError`` is logged and skipped.

        Yields:
            Integer topic ids for which ``__get_topic_message`` returned
            True.
        """
        tw = redis_cli.block_pop('zhTemporaryWords').decode('utf-8')  # pop
        for offset in range(0, 1000, 10):
            url = zh_search_url % (tw, offset)
            try:
                j_topics = self.session.get(url=url,
                                            headers=headers,
                                            proxies=self.p_receiver.one_random,
                                            timeout=3).json()
                topics = j_topics.get('data', None) if j_topics else None
                if not topics:  # reached the last page
                    return
                # Fetch the details of every topic on this page.
                for t in topics:
                    obj = t.get('object')
                    if not (obj and obj.get('id')):
                        break
                    try:
                        tid = int(obj['id'])
                    except ValueError as ve:
                        # Pass the id through a placeholder; a bare extra
                        # argument makes the logging call fail to format.
                        self.logger.warning("%s (topic id: %r)", ve,
                                            obj['id'])
                        continue
                    if self.__get_topic_message(tid):
                        yield tid
            except (RequestException, ReadTimeout, KeyError) as exc:
                self.logger.warning((exc, url))

    @staticmethod
    def __save_to_dag(child_topic_id, parent_topic_id):
        """Record a parent -> child edge in the ``zhTopicDAG`` hash.

        Each hash field is a parent topic id whose value is the ``str()`` of
        the set of its child ids.

        Args:
            child_topic_id: Id of the child topic.
            parent_topic_id: Id of the parent topic.
        """
        ids = redis_cli.hget('zhTopicDAG', parent_topic_id)
        if not ids or ids.decode() == "None":
            redis_cli.hset('zhTopicDAG', parent_topic_id,
                           str({child_topic_id}))
            return
        # NOTE(review): eval() executes whatever is stored in Redis. The
        # values written by this method are set literals, so
        # ast.literal_eval(ids.decode()) would be a safer parser - confirm
        # that no other writer stores non-literal values before switching.
        new_ids = eval(ids)
        new_ids.add(child_topic_id)
        redis_cli.hset('zhTopicDAG', parent_topic_id, str(new_ids))

    def __add_topics(self, url, topic_id, func):
        """Fetch the topics related to ``topic_id`` and register each one.

        Args:
            url: URL template that takes the integer topic id.
            topic_id: Id of the topic being expanded.
            func: Callable invoked as ``func(topic_id, related_id)`` for
                every related topic before its details are fetched.

        Request errors are logged as warnings; a falsy response or a
        response without ``data`` is treated as "no related topics".
        """
        try:
            resp = self.session.get(url=url % int(topic_id),
                                    headers=headers,
                                    proxies=self.p_receiver.one_random,
                                    timeout=3)
            if not resp:  # parent/child topics may not exist
                return
            for item in resp.json().get('data') or []:
                expand_topic_id = int(item['id'])
                func(topic_id, expand_topic_id)
                self.__get_topic_message(expand_topic_id)
        except (RequestException, ReadTimeout) as exc:
            self.logger.warning(exc)

    def __expand_topics(self, tid):
        """Expand a topic towards both its parent and its child topics."""
        # Related topics from parent_url are parents: tid is the child.
        self.__add_topics(parent_url, tid, self.__save_to_dag)
        # Related topics from child_url are children: tid is the parent.
        self.__add_topics(
            child_url, tid,
            lambda parent_id, child_id: self.__save_to_dag(
                child_id, parent_id))

    @timethis
    def process(self):
        """Discover new topics for one keyword and expand each of them."""
        for tid in self.__get_hot_topics():
            self.__expand_topics(tid)