import logging

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.exceptions import ReadTimeout, RequestException
from urllib3.util.retry import Retry

# LogHandler, timethis, ProxiesReceiver, redis_cli, headers and the *_url constants
# are project-local helpers defined elsewhere in this project.


class ZhiDaoGenerator:
    """Baidu Zhidao keyword crawler. Keywords come from two sources:
    1. keywords contained in the question body;
    2. keywords of the category the question belongs to.
    """

    def __init__(self):
        self.logger = LogHandler('zhidao_crawl')
        try:
            # Zhidao request headers: the BAIDUID cookie is required, otherwise no data is returned.
            self.zhidao_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                'Cookie': 'BAIDUID=%s' % requests.get('https://zhidao.baidu.com/browse/',
                                                      timeout=3).cookies['BAIDUID']
            }
        except Exception as e:
            raise e

    def __zd_question_keywords(self, url):
        """Extract the keywords attached to a single Zhidao question page."""
        try:
            req = requests.get(url, timeout=3)
            req.encoding = req.apparent_encoding
            kws = []
            for kw_tag in BeautifulSoup(req.text, "html.parser").find_all(
                    'li', class_=lambda class_: class_ and ('word grid' in class_)):
                kw = kw_tag.find(class_="word-text")
                if kw is not None:
                    kws.append(kw.string)
            return kws
        except RequestException as re:
            self.logger.warning(re)
            return []
        except Exception as e:
            raise e

    @timethis
    def crawl_zhidao_words(self, save_keyword):
        """Crawl the Baidu Zhidao question list and hand every keyword to save_keyword."""
        try:
            req = requests.get(
                url='https://zhidao.baidu.com/list?_pjax=%23j-question-list-pjax-container',
                headers=self.zhidao_headers,
                timeout=3)
            req.encoding = req.apparent_encoding
            for qs in BeautifulSoup(req.text, "html.parser").find_all(
                    'div', class_='question-title-section'):
                # Keywords of the category the question belongs to
                for qt in map(lambda x: x.string.replace('\n', ''),
                              qs.find_all('a', class_='tag-item')):
                    save_keyword(qt)
                # Keywords contained in the question body
                for qm in self.__zd_question_keywords(qs.a.get('href')):
                    save_keyword(str(qm))
        except RequestException as re:
            self.logger.warning(re)
        except Exception as e:
            raise e
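# --- Hedged usage sketch (not part of the original source) ---
# crawl_zhidao_words only needs a callable that accepts one keyword per call, so the
# simplest save_keyword deduplicates into an in-memory set; the real project
# presumably persists keywords instead (for example into the zhTemporaryWords
# structure that ZhihuTopicGenerator below consumes).
_seen_keywords = set()


def print_keyword(keyword):
    """Example callback: skip empty values and duplicates, then print the keyword."""
    if keyword and keyword not in _seen_keywords:
        _seen_keywords.add(keyword)
        print(keyword)

# Example call (requires network access and a reachable zhidao.baidu.com):
# ZhiDaoGenerator().crawl_zhidao_words(print_keyword)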
class ZhihuTopicGenerator:
    """Two-stage process: collect topic ids, then expand them."""

    def __init__(self):
        # Proxy IPs
        self.p_receiver = ProxiesReceiver()
        # Create a session and configure the retry count and back-off between reconnects
        self.session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.logger = LogHandler('topics_generator')
        logging.getLogger("urllib3").setLevel(logging.ERROR)

    def __get_topic_message(self, tid):
        """Fetch a topic's details and store them in the zhTopicMessage hash:
        key - topic id, value - {'name': name, 'introduction': introduction,
        'questions_count': number of questions, 'best_answers_count': number of
        highlighted answers, 'followers_count': number of followers,
        'best_answerers_count': number of top answerers}.
        """
        try:
            j_rst = self.session.get(url=topic_message_url % tid,
                                     headers=headers,
                                     proxies=self.p_receiver.one_random,
                                     timeout=3).json()
            if redis_cli.hset(
                    'zhTopicMessage', tid,
                    str({
                        "name": j_rst.get("name"),
                        "introduction": j_rst.get("introduction"),
                        "questions_count": j_rst.get("questions_count"),
                        "best_answers_count": j_rst.get("best_answers_count"),
                        "followers_count": j_rst.get("followers_count"),
                        "best_answerers_count": j_rst.get("best_answerers_count")
                    })):
                # Queue the id so its related information can be fetched later
                redis_cli.sadd('zhNewTopicID', tid)
                self.logger.info("zhNewTopicID:%d", tid)
                return True
        except RequestException as re:
            self.logger.warning(re)
        except Exception as e:
            raise e
        return False

    def __get_hot_topics(self):
        """Search a keyword popped from zhTemporaryWords and yield the related topic ids."""
        tw = redis_cli.block_pop('zhTemporaryWords').decode('utf-8')  # pop one keyword
        # Page through the search results, fetching at most 1000 entries
        for offset in range(0, 1000, 10):
            try:
                url = zh_search_url % (tw, offset)
                j_topics = self.session.get(url=url,
                                            headers=headers,
                                            proxies=self.p_receiver.one_random,
                                            timeout=3).json()
                topics = j_topics.get('data', None) if j_topics else None
                if not topics:
                    # Reached the last page
                    return
                # Fetch detailed information for every topic on this page
                for t in topics:
                    if t.get('object') and t.get('object').get('id'):
                        try:
                            tid = int(t['object']['id'])
                        except ValueError as ve:
                            self.logger.warning('%s: %s', ve, t['object']['id'])
                            continue
                        if self.__get_topic_message(tid):
                            yield tid
                        else:
                            break
            except RequestException as re:
                self.logger.warning((re, url))
            except ReadTimeout as rte:
                self.logger.warning((rte, url))
            except KeyError as ke:
                self.logger.warning((ke, url))
            except Exception as e:
                raise e

    @staticmethod
    def __save_to_dag(child_topic_id, parent_topic_id):
        """Persist the parent/child relation as a directed acyclic graph."""
        ids = redis_cli.hget('zhTopicDAG', parent_topic_id)
        if not ids or ids.decode() == "None":
            redis_cli.hset('zhTopicDAG', parent_topic_id, str({child_topic_id}))
        else:
            new_ids = eval(ids)
            new_ids.add(child_topic_id)
            redis_cli.hset('zhTopicDAG', parent_topic_id, str(new_ids))

    def __add_topics(self, url, topic_id, func):
        try:
            req = self.session.get(url=url % int(topic_id),
                                   headers=headers,
                                   proxies=self.p_receiver.one_random,
                                   timeout=3)
            if not req:
                # Parent or child topics may not exist
                return
            for p in req.json()['data']:
                expand_topic_id = int(p['id'])
                func(topic_id, expand_topic_id)
                self.__get_topic_message(expand_topic_id)
        except RequestException as re:
            self.logger.warning(re)
        except ReadTimeout as rte:
            self.logger.warning(rte)
        except Exception as e:
            raise e

    def __expand_topics(self, tid):
        """Topic expansion: walk outward to both parent and child topics."""
        self.__add_topics(parent_url, tid, lambda a, b: self.__save_to_dag(a, b))
        self.__add_topics(child_url, tid, lambda a, b: self.__save_to_dag(b, a))

    @timethis
    def process(self):
        for tid in self.__get_hot_topics():
            self.__expand_topics(tid)
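# --- Hedged read-back sketch (not part of the original source) ---
# process() leaves its results in Redis: zhTopicMessage maps a topic id to its detail
# dict (stored as a str), and zhTopicDAG maps a parent topic id to the str of a set of
# child ids. A small helper for reading the DAG back, assuming the same redis_cli
# wrapper and the same str()/eval() encoding the class itself uses:
def children_of(parent_topic_id):
    """Return the set of child topic ids recorded for a parent in zhTopicDAG."""
    ids = redis_cli.hget('zhTopicDAG', parent_topic_id)
    if not ids or ids.decode() == "None":
        return set()
    return eval(ids)  # mirrors the str(set) encoding used by __save_to_dag

# Example run (assumes zhTemporaryWords has already been seeded with a keyword, e.g.
# by ZhiDaoGenerator above, and that ProxiesReceiver can supply working proxies):
# generator = ZhihuTopicGenerator()
# generator.process()
# for tid in children_of(19550517):  # 19550517 is a hypothetical topic id
#     print(tid, redis_cli.hget('zhTopicMessage', tid))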