def parse(self):
    """Extract question metadata from the loaded question-page soup.

    Returns a dict with title, topic tags, follower count and related
    questions; '_id' and 'answers' are left empty for the caller to fill.
    Best-effort sections are wrapped in trytry() so a missing node does
    not abort the parse.
    """
    title_node = self.soup.find('h2', class_='zm-item-title zm-editable-content')
    question = {
        'title': title_node.getText().strip(),
        'topics': [
            {'topic': tag.getText().strip(), 'link': ZHIHU_URL + tag['href']}
            for tag in self.soup.find_all('a', class_='zm-item-tag')
        ],
        'follow_cnt': 0,
        'similarQues': [],
        '_id': '',
        'answers': [],
    }
    # Follower count is the first number in the side-header text.
    with trytry():
        side_text = self.soup.find(
            'div', id='zh-question-side-header-wrap').getText()
        question['follow_cnt'] = int(PATTERN_NUM.findall(side_text)[0])
    # Related questions: the id is the last path segment of the href.
    with trytry():
        related = self.soup.find('ul', itemprop='relatedQuestion')
        for item in related.find_all('li'):
            anchor = item.find('a')
            href = anchor['href']
            question['similarQues'].append({
                'id': href[1 + href.rfind('/'):],
                'title': anchor.getText().strip(),
            })
    return question
def parse_answer(cls, soup):
    """Parse a single answer node into a flat result dict.

    Extracts author info, creation/edit timestamps, comment and vote
    counts, and the raw answer content.  Optional fields keep their
    defaults when the corresponding node is missing; trytry() suppresses
    those lookup errors.
    """
    question_anchor = soup.find('a', class_='question_link')
    entry_body = soup.find('div', class_="entry-body")
    author_info = soup.find('div', class_='zm-item-answer-author-info')

    comment_cnt = 0
    a_name = u'匿名用户'
    a_desc = ''
    a_link = ''
    r_time = ''

    # Creation time is stored as a unix timestamp on the entry body.
    with trytry():
        created = int(entry_body['data-created'])
        r_time = datetime.datetime.fromtimestamp(created).strftime(
            '%Y-%m-%d %H:%M:%S')

    # Named authors carry an 'author-link' anchor; anonymous answers
    # only expose a plain name span.  The bio lookup raises (and is
    # swallowed) when the author has no bio.
    with trytry():
        author_anchor = author_info.find('a', class_='author-link')
        if author_anchor:
            a_name = author_anchor.getText().strip()
            a_link = ZHIHU_URL + author_anchor['href']
        else:
            a_name = author_info.find('span', class_='name').getText().strip()
        a_desc = author_info.find('span', class_='bio')['title']

    # NOTE: the leading space in the class string is intentional — it
    # must match the page markup exactly.
    with trytry():
        toggle = soup.find('a', class_=' meta-item toggle-comment')
        comment_cnt = int(toggle.getText().replace(u'条评论', '').strip())

    content, e_time = cls.unescape(
        unicode(soup.find('textarea', class_='content hidden')))
    if not e_time:
        # Never edited: fall back to the creation time.
        e_time = r_time

    return {
        'a_name': a_name,
        'a_link': a_link,
        'a_desc': a_desc,
        'r_time': r_time,
        'e_time': e_time,
        'comment_cnt': comment_cnt,
        'question': question_anchor.getText().strip(),
        'agree_cnt': int(
            soup.find('a', class_='zm-item-vote-count')['data-votecount']),
        '_id': ZHIHU_URL + question_anchor['href'] + '/answer/'
               + entry_body['data-atoken'],
        'content': content,
        'topics': [],
    }
def parse_imgs(self):
    """Download every image referenced by the answer body and return an
    answer record listing the saved filenames.

    Returns None when the answer was censored or contains no images.
    Side effects: fetches each image over the network (self.get) and
    writes it to disk (save).
    """
    answer_soup = self.soup.find('div', class_='zm-editable-content clearfix')
    if not answer_soup:
        # Censored answer: nothing to parse.
        if u'回答建议修改:涉及淫秽色情低俗信息' in unicode(self.soup):
            return None
        # NOTE(review): if answer_soup is None but the censorship marker
        # is absent, we fall through and unicode(None) == u'None' yields
        # no image matches, so we still return None below.
    # De-duplicate image URLs found in the rendered answer HTML.
    imgs = list(set(PATTERN_IMG.findall(unicode(answer_soup))))
    if not imgs:
        return None
    answer = {
        'url': ZHIHU_URL + self.soup.find(
            'div',
            class_='zm-item-rich-text js-collapse-body')['data-entry-url'],
        'agree_cnt': 0,
        'a_link': '',
        'a_name': u'匿名用户',  # default until a named author is found
        'r_time': '',
        'e_time': '',
        'comment_cnt': '',
        'imgs': [],
        '_id': '',
    }
    # Vote count may be abbreviated like '1.2k' — expand 'k' to '000'.
    with trytry():
        count = self.soup.find('span', class_='count').getText().strip().lower()
        if 'k' in count:
            count = count[:-1] + '000'
        answer['agree_cnt'] = int(count)
    # Fetch and persist each image; filename = content hash + original
    # extension (assumes the URL contains a '.' — TODO confirm).
    for img in imgs:
        content = self.get(img, timeout=120)
        filename = sha1(content) + img[img.rfind('.'):]
        save(content, filename)
        answer['imgs'].append(filename)
    # Anonymous answers have no 'author-link' anchor; keep the defaults.
    author = self.soup.find('div', class_='zm-item-answer-author-info')
    author_link = author.find('a', class_='author-link')
    if author_link:
        answer['a_link'] = ZHIHU_URL + author_link['href']
        answer['a_name'] = author_link.getText().strip()
    with trytry():
        answer['r_time'], answer['e_time'] = self.parse_edit_time(
            self.soup.find('a', class_='answer-date-link'))
    # The leading space in the class string matches the page markup.
    with trytry():
        comment = self.soup.find(
            'a', class_=' meta-item toggle-comment').getText().strip()
        if comment != u'添加评论':
            # Strip the trailing u'条评论' suffix (3 characters).
            answer['comment_cnt'] = comment[:-3].strip()
    # '_id' becomes '<question_id>-<answer_id>'; presumably ZHIHU_URL is
    # 'https://www.zhihu.com' for the replace to work — TODO confirm.
    answer['_id'] = answer['url'].replace(
        'https://www.zhihu.com/question/', '').replace('/answer/', '-')
    return answer
def parse_answer(cls, soup):
    """Build a flat dict describing one answer: author fields,
    timestamps, counts, question title, and the unescaped content.

    Missing optional nodes leave the defaults in place because trytry()
    swallows the resulting exceptions.
    """
    title_a = soup.find('a', class_='question_link')
    body = soup.find('div', class_="entry-body")
    author = soup.find('div', class_='zm-item-answer-author-info')

    # Defaults used when the corresponding markup is absent.
    comment_cnt = 0
    a_name, a_desc, a_link, r_time = u'匿名用户', '', '', ''

    with trytry():
        # 'data-created' is a unix timestamp.
        r_time = datetime.datetime.fromtimestamp(
            int(body['data-created'])).strftime('%Y-%m-%d %H:%M:%S')

    with trytry():
        a = author.find('a', class_='author-link')
        if not a:
            # Anonymous-style markup: plain name span, no profile link.
            a_name = author.find('span', class_='name').getText().strip()
        else:
            a_name = a.getText().strip()
            a_link = ZHIHU_URL + a['href']
        # Raises (and is swallowed) when the author has no bio span.
        a_desc = author.find('span', class_='bio')['title']

    with trytry():
        # Leading space in the class string is intentional.
        label = soup.find('a', class_=' meta-item toggle-comment').getText()
        comment_cnt = int(label.replace(u'条评论', '').strip())

    content, e_time = cls.unescape(
        unicode(soup.find('textarea', class_='content hidden')))
    e_time = e_time or r_time  # unedited answers reuse the creation time

    result = {
        'a_name': a_name,
        'a_link': a_link,
        'a_desc': a_desc,
        'r_time': r_time,
        'e_time': e_time,
        'comment_cnt': comment_cnt,
        'question': title_a.getText().strip(),
        'agree_cnt': int(
            soup.find('a', class_='zm-item-vote-count')['data-votecount']),
        '_id': ZHIHU_URL + title_a['href'] + '/answer/' + body['data-atoken'],
        'content': content,
        'topics': [],
    }
    return result
def main():
    """Run TopicHotCrawler over every topic id stored in MongoDB.

    Iterates the topic collection (batch_size(1) — presumably to keep
    the cursor fresh during long crawls; confirm against MongoDB cursor
    timeout behavior) and crawls each topic.  trytry(logger=logger)
    logs-and-continues so one failing topic does not abort the sweep.
    """
    for index, item in enumerate(MONGO[DB][TOPIC_COLL].find().batch_size(1)):
        # Lazy %-args: the message is only formatted if the record is
        # actually emitted (avoids eager string building per iteration).
        logger.info('now topic %s-%s', index, item['_id'])
        with trytry(logger=logger):
            TopicHotCrawler(item['_id']).run()