Example #1
    def parse(self):
        """Parse a question page into a dict: title, topics, follower count, related questions."""
        question = {
            'title': self.soup.find('h2', class_='zm-item-title zm-editable-content').getText().strip(),
            'topics': [],
            'follow_cnt': 0,
            'similarQues': [],
            '_id': '',
            'answers': []
        }
        # Every topic tag on the page becomes a {topic, link} entry.
        for topic in self.soup.find_all('a', class_='zm-item-tag'):
            question['topics'].append({
                'topic': topic.getText().strip(),
                'link': ZHIHU_URL + topic['href']
            })
        # Follower count and related questions are optional; trytry() swallows
        # any parse error so a missing block does not abort the whole parse.
        with trytry():
            question['follow_cnt'] = int(PATTERN_NUM.findall(
                self.soup.find('div', id='zh-question-side-header-wrap').getText())[0])

        with trytry():
            for li in self.soup.find('ul', itemprop='relatedQuestion').find_all('li'):
                a = li.find('a')
                question['similarQues'].append({
                    'id': a['href'][1+a['href'].rfind('/'):],
                    'title': a.getText().strip()
                })

        return question
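Every example on this page wraps fragile parsing steps in a trytry() context manager that is not shown here. A minimal sketch of what such a helper could look like, assuming it simply suppresses exceptions and logs them when a logger is passed (the logger keyword appears in Example #6):

import contextlib

@contextlib.contextmanager
def trytry(logger=None):
    # Assumed behaviour: swallow any exception raised inside the with-block,
    # optionally logging it, so one missing field does not abort the parse.
    try:
        yield
    except Exception:
        if logger is not None:
            logger.exception('ignored error inside trytry block')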
Example #2
    @classmethod
    def parse_answer(cls, soup):
        """Parse one answer node into a flat dict: author, timestamps, votes, content."""
        title_a = soup.find('a', class_='question_link')
        body = soup.find('div', class_='entry-body')

        author = soup.find('div', class_='zm-item-answer-author-info')
        # Defaults cover anonymous authors and fields that fail to parse.
        comment_cnt, a_name, a_desc, a_link, r_time = 0, u'匿名用户', '', '', ''

        with trytry():
            r_time = datetime.datetime.fromtimestamp(
                int(body['data-created'])).strftime('%Y-%m-%d %H:%M:%S')

        with trytry():
            a = author.find('a', class_='author-link')
            if a:
                a_name = a.getText().strip()
                a_link = ZHIHU_URL + a['href']
            else:
                a_name = author.find('span', class_='name').getText().strip()
            a_desc = author.find('span', class_='bio')['title']

        with trytry():
            comment_cnt = int(soup.find('a', class_=' meta-item toggle-comment').getText().replace(u'条评论', '').strip())

        # The hidden textarea carries the raw answer body plus its last edit time.
        content, e_time = cls.unescape(unicode(soup.find('textarea', class_='content hidden')))
        if not e_time:
            e_time = r_time
        result = {
            'a_name': a_name,
            'a_link': a_link,
            'a_desc': a_desc,
            'r_time': r_time,
            'e_time': e_time,
            'comment_cnt': comment_cnt,
            'question': title_a.getText().strip(),
            'agree_cnt': int(soup.find('a', class_='zm-item-vote-count')['data-votecount']),
            '_id': ZHIHU_URL + title_a['href'] + '/answer/' + body['data-atoken'],
            'content': content,
            'topics': []
        }
        return result
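parse_answer() is a classmethod that expects the soup of a single answer node rather than the whole page. A hypothetical call site, assuming the page is fetched with requests, that each answer sits in a div with class zm-item-answer, and that the method lives on a class named AnswerParser (the class name, CSS class and question URL are illustrative, not from the source):

import requests
from bs4 import BeautifulSoup

ZHIHU_URL = 'https://www.zhihu.com'  # base URL the parsers prepend to relative links

html = requests.get(ZHIHU_URL + '/question/19550225', timeout=30).text  # any question URL
page = BeautifulSoup(html, 'lxml')
for node in page.find_all('div', class_='zm-item-answer'):  # assumed answer container class
    answer = AnswerParser.parse_answer(node)  # AnswerParser is a hypothetical owning class
    print(answer['_id'])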
Example #3
    def parse_imgs(self):
        """Download every image in an answer and return its metadata, or None if there are none."""
        answer_soup = self.soup.find('div', class_='zm-editable-content clearfix')
        if not answer_soup:
            # Censored answers show a fixed notice instead of a body.
            if u'回答建议修改:涉及淫秽色情低俗信息' in unicode(self.soup):
                return None
        imgs = list(set(PATTERN_IMG.findall(unicode(answer_soup))))
        if not imgs:
            return None
        answer = {
            'url': ZHIHU_URL + self.soup.find('div', class_='zm-item-rich-text js-collapse-body')['data-entry-url'],
            'agree_cnt': 0, 'a_link': '', 'a_name': u'匿名用户',
            'r_time': '', 'e_time': '', 'comment_cnt': '', 'imgs': [], '_id': '',
        }
        with trytry():
            count = self.soup.find('span', class_='count').getText().strip().lower()
            # Vote counts such as '12k' or '1.2k' are abbreviated; expand them before casting.
            if count.endswith('k'):
                count = float(count[:-1]) * 1000
            answer['agree_cnt'] = int(count)

        for img in imgs:
            # Name each downloaded image by its content hash to deduplicate files.
            content = self.get(img, timeout=120)
            filename = sha1(content) + img[img.rfind('.'):]
            save(content, filename)
            # answer['imgs'].append({'local': filename, 'raw': img})
            answer['imgs'].append(filename)

        author = self.soup.find('div', class_='zm-item-answer-author-info')
        author_link = author.find('a', class_='author-link')
        if author_link:
            answer['a_link'] = ZHIHU_URL + author_link['href']
            answer['a_name'] = author_link.getText().strip()

        with trytry():
            answer['r_time'], answer['e_time'] = self.parse_edit_time(self.soup.find('a', class_='answer-date-link'))

        with trytry():
            comment = self.soup.find('a', class_=' meta-item toggle-comment').getText().strip()
            # '添加评论' (add a comment) means there are none yet; otherwise drop the '条评论' suffix.
            if comment != u'添加评论':
                answer['comment_cnt'] = comment[:-3].strip()
        answer['_id'] = answer['url'].replace('https://www.zhihu.com/question/', '').replace('/answer/', '-')
        return answer
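The download loop in parse_imgs() relies on sha1 and save helpers that the snippet does not define. A minimal sketch of what they might look like, assuming images are written to a local img/ directory and named by the hex digest of their bytes (the directory and both function bodies are assumptions):

import hashlib
import os

IMG_DIR = 'img'  # assumed output directory

def sha1(content):
    # Hex digest of the raw bytes; used as a deduplicating file name.
    return hashlib.sha1(content).hexdigest()

def save(content, filename):
    # Persist the downloaded bytes next to previously saved images.
    if not os.path.isdir(IMG_DIR):
        os.makedirs(IMG_DIR)
    with open(os.path.join(IMG_DIR, filename), 'wb') as f:
        f.write(content)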
Example #6
def main():
    # Crawl the hot list of every topic id stored in MongoDB; trytry(logger=logger)
    # logs a failed topic and moves on instead of stopping the loop.
    # TopicHotCrawler('19552832').run()
    for index, item in enumerate(MONGO[DB][TOPIC_COLL].find().batch_size(1)):
        logger.info('now topic %s-%s' % (index, item['_id']))
        with trytry(logger=logger):
            TopicHotCrawler(item['_id']).run()
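main() assumes module-level MONGO, DB, TOPIC_COLL and logger objects that the snippet does not show. A plausible setup, assuming a local MongoDB instance via pymongo; the database and collection names are illustrative:

import logging
import pymongo

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MONGO = pymongo.MongoClient('localhost', 27017)
DB = 'zhihu'           # hypothetical database name
TOPIC_COLL = 'topics'  # hypothetical collection whose _id fields are topic ids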