Пример #1
0
def get_user_by_search(user, number=1, start=0):
    """Search Zhihu for users matching ``user`` and return the first hits.

    Result pages are requested from ``zhihu_search_url`` ten entries at a
    time, beginning at offset ``start``, until ``number`` users have been
    collected or a page with fewer than ten entries is returned.

    Args:
        user: Search keyword; it is URL-quoted before being sent.
        number: Maximum number of users to return. Values <= 0 are treated
            as 1.
        start: Offset of the first search result to request.

    Returns:
        A ``(user_tokens, user_infos)`` tuple of equal-length lists.
        ``user_tokens`` holds the ``data-token`` attribute of each result's
        ``<li>``. Each entry of ``user_infos`` is a list of HTML strings:
        the ``<img>``, the ``author-link`` anchor, the ``bio`` span and the
        last (up to three) anchors of that ``<li>``.

    Raises:
        MethodParamError: If ``user`` is not a ``str`` or ``number`` is not
            an ``int``.
    """
    if not isinstance(user, str):
        raise MethodParamError('Parameter \'user\' must be an instance of \'str\'!')
    if not isinstance(number, int):
        raise MethodParamError('Parameter \'number\' must be an instance of \'int\'!')
    if number <= 0:
        number = 1

    page_size = 10
    position = start
    user_tokens = []
    user_htmls = []
    while len(user_tokens) < number:
        response = requests.get(zhihu_search_url.format(key=quote(user), offset=position),
                                headers=zhihu_header)
        # A response without an 'htmls' entry is handled like an empty page.
        htmls = response.json().get('htmls') or []
        for html in htmls:
            user_li = BeautifulSoup(html, 'lxml').li
            user_tokens.append(user_li.attrs['data-token'])
            user_htmls.append(user_li)
        if len(htmls) < page_size:
            # A short page means there are no further results.
            break
        position += page_size

    # The last page may have pushed us past the requested amount.
    user_tokens = user_tokens[:number]
    user_htmls = user_htmls[:number]

    user_infos = []
    for user_li in user_htmls:
        item = [
            str(user_li.find('img')),
            # attrs must be a dict; the former set literal {'class', '...'}
            # matched any element whose class was 'class' OR the wanted one.
            str(user_li.find('a', {'class': 'author-link'})),
            str(user_li.find('span', {'class': 'bio'})),
        ]
        item += [str(it) for it in user_li.find_all('a')[-3:]]
        user_infos.append(item)
    return user_tokens, user_infos
Пример #2
0
 def scrape_user_follows(self, id, number=0):
     """Scrape the accounts a Weibo user follows.

     Follow pages are read from ``weibo_user_follow_url``; for every
     followed account an additional request to ``weibo_user_info_url``
     fills in location and sign-up time. The result is also stored in
     ``self.scraped_follows[id]``.

     Args:
         id: Weibo user id (``int``).
         number: How many follows to scrape. Values <= 0 mean 10. The
             amount is capped by the ``count`` field of the first page
             (presumably the total number of follows - confirm against
             the API).

     Returns:
         A list of ``WeiboUserItem`` objects.

     Raises:
         MethodParamError: If ``id`` or ``number`` is not an ``int``.
     """
     if not isinstance(id, int):
         raise MethodParamError(
             'Parameter \'id\' isn\'t an instance of type \'int\'!')
     if not isinstance(number, int):
         raise MethodParamError(
             'Parameter \'number\' isn\'t an instance of type \'int\'!')
     if log_weibo:
         logging.info('Scraping follows of weibo user: %d...' % id)
     position = 1
     result = requests.get(
         weibo_user_follow_url.format(uid=id, page=position)).json()
     total = result.get('count')
     wanted = number if number > 0 else 10
     # Cap the default as well: waiting for 10 follows of a user who has
     # fewer would otherwise never terminate.
     need_count = wanted if total is None else min(wanted, total)
     follows = []
     while len(follows) < need_count:
         cards = result.get('cards') or []
         if not cards:
             # No more data even though the reported total was not reached.
             break
         for card in cards:
             if len(follows) >= need_count:
                 break
             user = card.get('user')
             if user is None:
                 # Card carries no user payload; nothing to record.
                 continue
             item = WeiboUserItem()
             item.id = user.get('id')
             item.profile_url = 'https://weibo.com/u/{uid}'.format(
                 uid=item.id)
             item.name = user.get('screen_name')
             item.gender = '男' if user.get('gender') == 'm' else '女'
             item.avatar_url = user.get('profile_image_url')
             item.description = user.get('description')
             item.weibo_count = user.get('statuses_count')
             item.follow_count = user.get('follow_count')
             item.fans_count = user.get('followers_count')
             result_info = requests.get(
                 weibo_user_info_url.format(uid1=item.id,
                                            uid2=item.id)).json()
             # Only entries of type 41 inside cards of type 11 are read.
             for info_card in result_info.get('cards') or []:
                 if info_card.get('card_type') != 11:
                     continue
                 for card_inner in info_card.get('card_group') or []:
                     if card_inner.get('card_type') != 41:
                         continue
                     item_name = card_inner.get('item_name')
                     item_content = card_inner.get('item_content')
                     if item_name == '所在地':
                         item.location = item_content
                     elif item_name == '注册时间':
                         item.signup_time = item_content
             follows.append(item)
         if len(follows) < need_count:
             position += 1
             result = requests.get(
                 weibo_user_follow_url.format(uid=id,
                                              page=position)).json()
     if log_weibo:
         logging.info('Succeed in scraping follows of weibo user: %d.' % id)
     self.scraped_follows[id] = follows
     return follows
Пример #3
0
 def scrape_user_posts(self, user, before=None, after=None, number=10,
                       max_retries=None):
     """Scrape posts of a Tieba user within a time window.

     Pages of ``tieba_user_post_url`` are requested one after another.
     Posts newer than ``before`` are skipped; the first post older than
     ``after`` ends the scrape.
     NOTE(review): stopping at the first too-old post presumes the API
     lists posts newest first - confirm.

     Args:
         user: Tieba user name (``str``).
         before: Upper bound for the post creation time; defaults to the
             current time. Converted with ``int()``.
         after: Lower bound for the post creation time; defaults to 0.
             Converted with ``int()``.
         number: Maximum number of posts; values <= 0 mean 10.
         max_retries: How often a page is re-requested (3 s apart) when
             Tieba answers with its HTML 404 page. ``None`` (default)
             retries forever, which never returns if the page stays
             unavailable.

     Returns:
         A list of ``TiebaPostItem`` objects.

     Raises:
         MethodParamError: If ``user`` is not a ``str`` or ``number`` is
             not an ``int``.
     """
     if not isinstance(user, str):
         raise MethodParamError(
             'Parameter \'user\' isn\'t an instance of type \'str\'!')
     if not isinstance(number, int):
         raise MethodParamError(
             "Parameter \'number\' isn\'t an instance of type \'int\'!")
     before = int(time.time()) if before is None else int(before)
     after = 0 if after is None else int(after)
     if log_tieba:
         logging.info('Scraping posts of tieba user: %s...' % user)
     if number <= 0:
         number = 10
     posts = []
     page = 1
     stop_flag = False
     while len(posts) < number:
         retries = 0
         while True:
             response = requests.get(
                 tieba_user_post_url.format(user=user, page=page))
             if not response.text.startswith('<!DOCTYPE html>'):
                 break
             # Got Tieba's 404 page instead of JSON.
             if max_retries is not None and retries >= max_retries:
                 response = None
                 break
             retries += 1
             time.sleep(3)
         if response is None:
             if log_tieba:
                 logging.warning(
                     'Giving up on page %d of tieba user %s: '
                     'still receiving the 404 page.' % (page, user))
             break
         result = response.json()
         # A payload without 'data' is treated as an empty last page.
         data = result.get('data') or {}
         for thread in data.get('thread_list') or []:
             if len(posts) >= number:
                 break
             item = TiebaPostItem()
             item.time = int(thread.get('create_time'))
             if item.time > before:
                 continue
             if item.time < after:
                 stop_flag = True
                 break
             item.title = thread.get('title')
             if item.title.startswith('回复:'):
                 # Drop the three-character reply prefix.
                 item.title = item.title[3:]
             item.title_url = 'https://tieba.baidu.com/p/{tid}'.format(
                 tid=thread.get('thread_id'))
             item.content = thread.get('content')
             item.content_url = 'http://tieba.baidu.com/p/{tid}?pid={pid}&cid=#{cid}'.format(
                 tid=thread.get('thread_id'),
                 pid=thread.get('post_id'),
                 cid=thread.get('post_id'))
             item.forum = thread.get('forum_name')
             item.forum_url = 'http://tieba.baidu.com/f?kw={kw}'.format(
                 kw=quote(item.forum))
             posts.append(item)
         page += 1
         if len(posts) >= number or not data.get('has_more') or stop_flag:
             break
     if log_tieba:
         logging.info('Succeed in scraping posts of tieba user: %s.' % user)
     return posts
Пример #4
0
 def scrape_answers_by_user(self, user, number=0):
     """Collect answers written by a Zhihu user.

     Answer ids are gathered from ``zhihu_user_answers_url`` in pages of
     20 and each id is then resolved via ``self.scrape_answer_by_id``.
     The result is also stored in ``self.scraped_user_answers[user]``.

     Args:
         user: Value for the ``user`` slot of the URL template (``str``).
         number: How many answers to fetch. Values <= 0 mean 10; positive
             values are capped by the total reported by the API.

     Returns:
         A list with one ``scrape_answer_by_id`` result per answer id, or
         an empty list when the first request answers with HTTP 404.

     Raises:
         MethodParamError: If ``user`` is not a ``str`` or ``number`` is
             not an ``int``.
     """
     if not isinstance(user, str):
         raise MethodParamError(
             "Parameter 'user' isn't an instance of type 'str'!")
     if not isinstance(number, int):
         raise MethodParamError(
             "Parameter 'number' isn't an instance of type 'int'!")
     if log_zhihu:
         logging.info('Scraping answers of zhihu user: %s...' % user)

     offset = 0
     first_url = zhihu_user_answers_url.format(user=user,
                                               offset=offset,
                                               limit=20)
     response = requests.get(first_url, headers=zhihu_header)
     if response.status_code == 404:
         # The user does not exist or the account has been banned.
         if log_zhihu:
             logging.warning(
                 "404 error. The user doesn't exist or has been blocked.")
         return []

     result = response.json()
     total = result.get('paging').get('totals')
     need_count = 10 if number <= 0 else (number
                                          if number < total else total)

     answer_ids = []
     while True:
         # Take ids from the current page until the quota is filled.
         for entry in result.get('data'):
             if len(answer_ids) >= need_count:
                 break
             answer_ids.append(entry.get('id'))
         reached_end = result.get('paging').get('is_end')
         if reached_end or len(answer_ids) >= need_count:
             break
         offset += 20
         page_url = zhihu_user_answers_url.format(user=user,
                                                  offset=offset,
                                                  limit=20)
         result = requests.get(page_url, headers=zhihu_header).json()

     answers = [
         self.scrape_answer_by_id(id=answer_id) for answer_id in answer_ids
     ]
     if log_zhihu:
         logging.info('Succeed in scraping answers of zhihu user: %s.' %
                      user)
     self.scraped_user_answers[user] = answers
     return answers
Пример #5
0
 def scrape_user_fans(self, user, number=0):
     """Collect the followers of a Zhihu user.

     Follower ``url_token`` values are read from
     ``zhihu_user_followers_url`` (following the ``paging.next`` links)
     and each token is resolved via ``self.scrape_user_info``. The result
     is also stored in ``self.scraped_followers[user]``.

     Args:
         user: Value for the ``user`` slot of the URL template (``str``).
         number: How many followers to fetch. Values <= 0 mean 10;
             positive values are capped by the total reported by the API.

     Returns:
         A list with one ``scrape_user_info`` result per follower, or an
         empty list when the first request answers with HTTP 404.

     Raises:
         MethodParamError: If ``user`` is not a ``str`` or ``number`` is
             not an ``int``.
     """
     if not isinstance(user, str):
         raise MethodParamError(
             "Parameter 'user' isn't an instance of type 'str'!")
     if not isinstance(number, int):
         raise MethodParamError(
             "Parameter 'number' isn't an instance of type 'int'!")
     if log_zhihu:
         logging.info('Scraping followers of zhihu user: %s...' % user)

     first_url = zhihu_user_followers_url.format(
         user=user, include=zhihu_followers_query, offset=0, limit=20)
     response = requests.get(first_url, headers=zhihu_header)
     if response.status_code == 404:
         # The user does not exist or the account has been banned.
         if log_zhihu:
             logging.warning(
                 "404 error. The user doesn't exist or has been blocked.")
         return []

     result = response.json()
     total = result.get('paging').get('totals')
     need_count = 10 if number <= 0 else (number
                                          if number < total else total)

     url_tokens = []
     while True:
         # Take tokens from the current page until the quota is filled.
         for entry in result.get('data'):
             if len(url_tokens) >= need_count:
                 break
             url_tokens.append(entry.get('url_token'))
         paging = result.get('paging')
         if paging.get('is_end') or len(url_tokens) >= need_count:
             break
         result = requests.get(paging.get('next'),
                               headers=zhihu_header).json()

     fans = [self.scrape_user_info(user=token) for token in url_tokens]
     if log_zhihu:
         logging.info('Succeed in scraping followers of zhihu user: %s.' %
                      user)
     self.scraped_followers[user] = fans
     return fans
Пример #6
0
 def scrape_answers_by_question(self, id, number=0):
     """Collect answers posted under a Zhihu question.

     Answer ids are read from ``zhihu_question_answers_url`` (following
     the ``paging.next`` links) and each id is resolved via
     ``self.scrape_answer_by_id``. The result is also stored in
     ``self.scraped_question_answers[id]``.

     Args:
         id: Question id (``int``).
         number: How many answers to fetch. Values <= 0 mean 10; positive
             values are capped by the total reported by the API.

     Returns:
         A list with one ``scrape_answer_by_id`` result per answer id, or
         an empty list when the first request answers with HTTP 404.

     Raises:
         MethodParamError: If ``id`` or ``number`` is not an ``int``.
     """
     if not isinstance(id, int):
         raise MethodParamError(
             "Parameter 'id' isn't an instance of type 'int'!")
     if not isinstance(number, int):
         raise MethodParamError(
             "Parameter 'number' isn't an instance of type 'int'!")
     if log_zhihu:
         logging.info('Scraping answers of question: %d...' % id)

     first_url = zhihu_question_answers_url.format(id=id,
                                                   offset=0,
                                                   limit=20)
     response = requests.get(first_url, headers=zhihu_header)
     if response.status_code == 404:
         # The question does not exist.
         if log_zhihu:
             logging.warning("404 error. The question doesn't exist.")
         return []

     result = response.json()
     total = result.get('paging').get('totals')
     need_count = 10 if number <= 0 else (number
                                          if number < total else total)

     answer_ids = []
     while True:
         # Take ids from the current page until the quota is filled.
         for entry in result.get('data'):
             if len(answer_ids) >= need_count:
                 break
             answer_ids.append(entry.get('id'))
         paging = result.get('paging')
         if paging.get('is_end') or len(answer_ids) >= need_count:
             break
         result = requests.get(paging.get('next'),
                               headers=zhihu_header).json()

     answers = [
         self.scrape_answer_by_id(id=answer_id) for answer_id in answer_ids
     ]
     if log_zhihu:
         logging.info('Succeed in scraping answers of question: %d.' % id)
     self.scraped_question_answers[id] = answers
     return answers
Пример #7
0
def get_user_by_homepage(url):
    """Resolve a Weibo homepage URL to a user id and its search-result HTML.

    The page is opened in PhantomJS, the text of the element with class
    ``username`` is read, and that name is passed to
    ``get_user_by_account`` to obtain the first matching account.

    Args:
        url: Homepage URL (``str``).

    Returns:
        ``(user_id, user_html)`` of the first search hit, or ``([], [])``
        when the ``username`` element does not become visible within 10
        seconds or the search yields nothing.

    Raises:
        MethodParamError: If ``url`` is not a ``str``.
    """
    # Validate before launching the browser so bad input costs nothing.
    if not isinstance(url, str):
        raise MethodParamError(
            'Parameter \'url\' must be an instance of \'str\'!')

    driver_path = 'phantomjs.exe'
    if not DEBUG:
        driver_path = settings.BASE_DIR + '/closends/spider/phantomjs.exe'
    driver = webdriver.PhantomJS(executable_path=driver_path,
                                 service_log_path=os.path.devnull)
    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                ec.visibility_of_element_located((By.CLASS_NAME, 'username')))
        except TimeoutException:  # connection too slow or wrong link
            return [], []
        username = driver.find_element_by_class_name('username').text
    finally:
        # quit() ends the browser process on every exit path; close() only
        # closed the window and was skipped whenever an exception escaped.
        driver.quit()

    user_ids, user_htmls = get_user_by_account(user=username, number=1)
    if user_ids and user_htmls:
        return user_ids[0], user_htmls[0]
    return [], []
Пример #8
0
 def scrape_answer_by_id(self, id):
     """Scrape a single Zhihu answer.

     Metadata comes from the JSON endpoint ``zhihu_answer_url``; the
     answer body is extracted from the HTML answer page. The item is also
     stored in ``self.scraped_answers[id]``.

     Args:
         id: Answer id (``int``).

     Returns:
         A ``ZhihuAnswerItem``, or ``None`` when the JSON endpoint answers
         with HTTP 404. ``content`` is left untouched when the answer body
         cannot be located in the page.

     Raises:
         MethodParamError: If ``id`` is not an ``int``.
     """
     if not isinstance(id, int):
         raise MethodParamError(
             'Parameter \'id\' isn\'t an instance of type \'int\'!')
     if log_zhihu:
         logging.info('Scraping answer of id: %d...' % id)
     response = requests.get(zhihu_answer_url.format(
         id=id, include=zhihu_answer_query),
                             headers=zhihu_header)
     if response.status_code == 404:
         if log_zhihu:
             logging.warning('404 error. The answer doesn\'t exist.')
         return None
     result = response.json()
     item = ZhihuAnswerItem()
     item.id = result.get('id')
     item.author = result.get('author').get('name')
     item.question_id = result.get('question').get('id')
     item.create_time = result.get('created_time')
     item.update_time = result.get('updated_time')
     page = requests.get('https://www.zhihu.com/question/%d/answer/%d' %
                         (item.question_id, id),
                         headers=zhihu_header)
     bs = BeautifulSoup(page.text, 'lxml')
     # Walk RichContent > div > span step by step: any level may be absent
     # (e.g. a changed layout), which used to raise AttributeError.
     rich_div = bs.find('div', {'class': 'RichContent'})
     content_span = None
     if rich_div is not None and rich_div.div is not None:
         content_span = rich_div.div.span
     if content_span is not None:
         # DOTALL lets the capture span line breaks inside the answer.
         match = re.search(r'<span.*?>(.*)</span>', str(content_span),
                           re.DOTALL)
         if match is not None:
             item.content = match.group(1)
     item.voteup_count = result.get('voteup_count')
     item.comment_count = result.get('comment_count')
     if log_zhihu:
         logging.info('Succeed in scraping answer of id: %d.' % id)
     self.scraped_answers[id] = item
     return item
Пример #9
0
def get_user_by_account(user=None, number=1):
    """Search Weibo for accounts matching ``user`` using PhantomJS.

    Args:
        user: Search keyword (``str``); URL-quoted before use.
        number: Maximum number of results to return; values <= 0 are
            treated as 1.

    Returns:
        ``(user_ids, user_htmls)``: the integer ``uid`` attribute and the
        outer HTML of each ``list_person`` element, limited to ``number``
        entries. ``([], [])`` when the result list does not become visible
        within 3 seconds or cannot be found.

    Raises:
        MethodParamError: If ``user`` is not a ``str`` or ``number`` is not
            an ``int``.
    """
    # Validate before launching the browser so bad input costs nothing.
    if not isinstance(user, str):
        raise MethodParamError(
            'Parameter \'user\' must be an instance of \'str\'!')
    if not isinstance(number, int):
        raise MethodParamError(
            'Parameter \'number\' must be an instance of \'int\'!')
    if number <= 0:
        number = 1

    driver_path = 'phantomjs.exe'
    if not DEBUG:
        driver_path = settings.BASE_DIR + '/closends/spider/phantomjs.exe'
    driver = webdriver.PhantomJS(executable_path=driver_path,
                                 service_log_path=os.path.devnull)
    try:
        wait = WebDriverWait(driver, 3)
        driver.get(weibo_search_url.format(user=quote(user)))
        try:
            wait.until(
                ec.visibility_of_element_located(
                    (By.CLASS_NAME, 'pl_personlist')))
            user_divs = driver.find_elements_by_class_name('list_person')
        except (TimeoutException, NoSuchElementException):
            # No result found, or the connection was too slow.
            return [], []

        user_ids = []
        user_htmls = []
        # Keep only the first `number` search results.
        for user_div in user_divs[:number]:
            user_id = user_div.find_element_by_class_name(
                'person_name').find_element_by_tag_name('a').get_attribute(
                    'uid')
            user_ids.append(int(user_id))
            user_htmls.append(user_div.get_attribute('outerHTML'))
        return user_ids, user_htmls
    finally:
        # quit() ends the browser process on every exit path, including
        # exceptions raised while parsing the result elements.
        driver.quit()
Пример #10
0
    def scrape_user_info(self, id):
        """Scrape profile information of a Weibo user.

        Three requests are made: the profile endpoint (follow count, fan
        count, avatar), the detail endpoint (nickname, gender, location,
        description, sign-up time) and the weibo list endpoint (weibo
        count). The item is also stored in ``self.scraped_infos[id]``.

        Args:
            id: Weibo user id (``int``).

        Returns:
            The populated ``WeiboUserItem``.

        Raises:
            MethodParamError: If ``id`` is not an ``int``.
        """
        if not isinstance(id, int):
            raise MethodParamError(
                "Parameter 'id' isn't an instance of type 'int'!")
        if log_weibo:
            logging.info('Scraping info of weibo user: %d...' % id)

        item = WeiboUserItem()
        item.id = id
        item.profile_url = 'https://weibo.com/u/{uid}'.format(uid=id)

        # Profile request: follow count, fan count and avatar URL.
        profile = requests.get(
            weibo_user_profile_url.format(uid1=id, uid2=id)).json()
        user_info = profile.get('userInfo')
        item.follow_count = user_info.get('follow_count')
        item.fans_count = user_info.get('followers_count')
        item.avatar_url = user_info.get('profile_image_url')

        # Detail request: labelled entries mapped onto item attributes.
        labelled_fields = (
            ('昵称', 'name'),
            ('性别', 'gender'),
            ('所在地', 'location'),
            ('简介', 'description'),
            ('注册时间', 'signup_time'),
        )
        details = requests.get(
            weibo_user_info_url.format(uid1=id, uid2=id)).json()
        for card in details.get('cards'):
            if card.get('card_type') == 11:
                for entry in card.get('card_group'):
                    if entry.get('card_type') == 41:
                        label = entry.get('item_name')
                        content = entry.get('item_content')
                        for known_label, attr in labelled_fields:
                            if label == known_label:
                                setattr(item, attr, content)
                                break

        # Weibo list request: total number of weibos of the user.
        weibos = requests.get(
            weibo_user_weibo_url.format(uid1=id, uid2=id, page=1)).json()
        item.weibo_count = weibos.get('cardlistInfo').get('total')

        if log_weibo:
            logging.info('Succeed in scraping info of weibo user: %d.' % id)
        self.scraped_infos[id] = item
        return item
Пример #11
0
 def scrape_user_info(self, user):
     """Scrape profile information of a Tieba user from the profile page.

     Args:
         user: Tieba user name (``str``); URL-quoted for the request.

     Returns:
         A ``TiebaUserItem`` with name, sex, Tieba age, avatar URL, follow
         count, fan count, forum count and post count filled in.
         ``forum_count`` is incremented rather than assigned, so the item
         is expected to start with a numeric value there.

     Raises:
         MethodParamError: If ``user`` is not a ``str``.
     """
     if not isinstance(user, str):
         raise MethodParamError(
             "Parameter 'user' isn't an instance of type 'str'!")
     if log_tieba:
         logging.info('Scraping info of tieba user: %s...' % user)

     profile_url = tieba_user_profile_url.format(user=quote(user))
     soup = BeautifulSoup(requests.get(profile_url).text, 'lxml')

     item = TiebaUserItem()
     item.name = user
     is_male = soup.find('span', {'class': 'userinfo_sex_male'}) is not None
     item.sex = 'male' if is_male else 'female'

     # The nested spans of the 'user_name' span carry age (index 2) and
     # post count (index 4).
     name_spans = soup.find('span', {'class': 'user_name'}).find_all('span')
     age_text = name_spans[2].get_text()
     item.tieba_age = float(re.search(r'吧龄:(.*)年', age_text).group(1))

     avatar_link = soup.find('a', {'class': 'userinfo_head'})
     item.avatar_url = avatar_link.img.attrs['src']

     concern_spans = soup.find_all('span', {'class': 'concern_num'})
     item.follow_count = int(concern_spans[0].find('a').get_text())
     item.fans_count = int(concern_spans[1].find('a').get_text())

     # Followed forums are split over a visible block and a panel that
     # has to be expanded to be shown completely.
     forum_containers = (
         soup.find('div', {'id': 'forum_group_wrap'}),
         soup.find('div', {'class': 'j_panel_content'}),
     )
     for container in forum_containers:
         if container is not None:
             item.forum_count += len(
                 container.find_all('a', {'class': 'unsign'}))

     post_text = name_spans[4].get_text()
     item.post_count = int(re.search(r'发贴:(\d+)', post_text).group(1))

     if log_tieba:
         logging.info('Succeed in scraping info of tieba user: %s.' % user)
     return item
Пример #12
0
def get_user_by_homepage(url):
    """Resolve a Zhihu homepage URL to a user token and its search-result HTML.

    The user token is taken from the URL, the display name is read from the
    user's activities page, and the search results for that name are paged
    through (ten at a time) until an entry with the same token is found.

    Args:
        url: Homepage URL of the form
            ``https://www.zhihu.com/people/<token>[/...]``.

    Returns:
        ``(user_token, user_html)`` of the matching search result, or
        ``(None, None)`` when the URL is not a valid homepage address, the
        user page answers with HTTP 404, the display name cannot be found
        on the page, or no search result carries the token.

    Raises:
        MethodParamError: If ``url`` is not a ``str``.
    """
    if not isinstance(url, str):
        raise MethodParamError('Parameter \'url\' must be an instance of \'str\'!')
    # One match both validates the address and extracts the token; the
    # token stops at '/', '?' or '#' so query strings and fragments are
    # not mistaken for part of it, and an empty token is rejected.
    match = re.match(r'https://www\.zhihu\.com/people/([^/?#]+)', url)
    if match is None:     # not a valid homepage address
        return None, None
    user = match.group(1)
    response = requests.get('https://www.zhihu.com/people/' + user + '/activities', headers=zhihu_header)
    if response.status_code == 404:     # the user does not exist
        return None, None
    bs = BeautifulSoup(response.text, 'lxml')
    name_span = bs.find('span', {'class': 'ProfileHeader-name'})
    if name_span is None:     # page without a profile header
        return None, None
    user_name = name_span.get_text()
    start = 0
    while True:
        user_tokens, user_htmls = get_user_by_search(user=user_name, number=10, start=start)
        if not user_tokens:
            break
        for user_token, user_html in zip(user_tokens, user_htmls):
            if user_token == user:
                return user_token, user_html
        start += 10
    return None, None
Пример #13
0
 def save_user_fans(self, user, directory='./products/'):
     """Write the previously scraped followers of a Zhihu user to CSV.

     The file is named ``<directory><user>-followers.csv``; ``directory``
     is concatenated as-is, so it has to end with a path separator.

     Args:
         user: Key into ``self.scraped_followers``.
         directory: Output directory prefix.

     Returns:
         ``None``. Nothing is written when no followers have been scraped
         for any user.

     Raises:
         MethodParamError: If no follower list is stored for ``user``.
     """
     if self.scraped_followers == {}:
         if log_zhihu:
             logging.warning(
                 'Haven\'t scraped followers of any zhihu user.')
         return
     infos = self.scraped_followers.get(user)
     if not isinstance(infos, list):
         raise MethodParamError(
             'Haven\'t scraped followers of zhihu user: %s' % user)
     # newline='' is what the csv module asks for (otherwise rows end in
     # '\r\r\n' on Windows); 'with' closes the file even if a row fails.
     # NOTE(review): the encoding is still the platform default.
     with open(directory + str(user) + '-followers.csv', 'w',
               newline='') as csv_file:
         writer = csv.writer(csv_file)
         writer.writerow(
             ('ID', '用户名', '性别', '头像链接', '行业', '一句话描述', '个人介绍', '提问数', '回答数',
              '文章数', '被赞同数', '被感谢数', '被收藏数', '关注数', '粉丝数', '关注话题数', '关注专栏数',
              '关注问题数', '关注收藏夹数', '教育经历', '职业经历', '居住地'))
         for info in infos:
             if not isinstance(info, ZhihuUserItem):
                 continue
             if info.gender == 0:
                 gender = '女'
             elif info.gender == 1:
                 gender = '男'
             else:
                 gender = '未知'
             writer.writerow(
                 (info.id, info.name, gender, info.avatar_url, info.business,
                  info.headline, info.description, info.question_count,
                  info.answer_count, info.article_count, info.voteup_count,
                  info.thanked_count, info.favorited_count,
                  info.following_count, info.follower_count,
                  info.following_topic_count, info.following_column_count,
                  info.following_question_count,
                  info.following_favlist_count,
                  '; '.join(str(edu) for edu in info.educations),
                  '; '.join(str(emp) for emp in info.employments),
                  '; '.join(str(loc) for loc in info.locations)))
     if log_zhihu:
         logging.info('Succeed in saving followers of zhihu user: %s.' %
                      user)
Пример #14
0
 def scrape_user_forums(self, user):
     """List the forums shown as followed on a Tieba user's profile page.

     Args:
         user: Tieba user name (``str``); URL-quoted for the request.

     Returns:
         A list of forum names: first those from the ``forum_group_wrap``
         block, then those from the ``j_panel_content`` panel.

     Raises:
         MethodParamError: If ``user`` is not a ``str``.
     """
     if not isinstance(user, str):
         raise MethodParamError(
             "Parameter 'user' isn't an instance of type 'str'!")
     if log_tieba:
         logging.info('Scraping forums of tieba user: %s...' % user)

     profile_url = tieba_user_profile_url.format(user=quote(user))
     soup = BeautifulSoup(requests.get(profile_url).text, 'lxml')
     visible_div = soup.find('div', {'id': 'forum_group_wrap'})
     # Followed forums are only shown completely once this panel is
     # expanded.
     folded_div = soup.find('div', {'class': 'j_panel_content'})

     forums = []
     if visible_div is not None:
         # Here the name sits in a <span> inside the link.
         forums.extend(
             link.span.get_text()
             for link in visible_div.find_all('a', {'class': 'unsign'}))
     if folded_div is not None:
         # Here the link text itself is the name.
         forums.extend(
             link.get_text()
             for link in folded_div.find_all('a', {'class': 'unsign'}))

     if log_tieba:
         logging.info('Succeed in scraping forums of tieba user: %s.' %
                      user)
     return forums
Пример #15
0
 def scrape_question_by_id(self, id=0):
     """Scrape a single Zhihu question.

     Metadata comes from the JSON endpoint ``zhihu_question_url``; the
     question body is extracted from the HTML question page. The item is
     also stored in ``self.scraped_questions[id]``.

     Args:
         id: Question id (``int``).

     Returns:
         A ``ZhihuQuestionItem``, or ``None`` when the JSON endpoint
         answers with HTTP 404. ``content`` is left untouched when the
         question body cannot be located in the page.

     Raises:
         MethodParamError: If ``id`` is not an ``int``.
     """
     if not isinstance(id, int):
         raise MethodParamError(
             'Parameter \'id\' isn\'t an instance of type \'int\'!')
     if log_zhihu:
         logging.info('Scraping question of id: %d...' % id)
     response = requests.get(zhihu_question_url.format(
         id=id, include=zhihu_question_query),
                             headers=zhihu_header)
     if response.status_code == 404:
         if log_zhihu:
             logging.warning('404 error. The question doesn\'t exist.')
         return None
     result = response.json()
     item = ZhihuQuestionItem()
     item.id = result.get('id')
     item.title = result.get('title')
     item.create_time = result.get('created')
     item.update_time = result.get('updated_time')
     page = requests.get('https://www.zhihu.com/question/%d' % id,
                         headers=zhihu_header)
     bs = BeautifulSoup(page.text, 'lxml')
     # Walk QuestionRichText > div > span step by step: the inner levels
     # may be absent, which used to end in an AttributeError.
     content_div = bs.find('div', {'class': 'QuestionRichText'})
     content_span = None
     if content_div is not None and content_div.div is not None:
         content_span = content_div.div.span
     if content_span is not None:
         # DOTALL lets the capture span line breaks inside the text.
         match = re.search(r'<span.*?>(.*)</span>', str(content_span),
                           re.DOTALL)
         if match is not None:
             item.content = match.group(1)
     item.follower_count = result.get('follower_count')
     item.visit_count = result.get('visit_count')
     item.comment_count = result.get('comment_count')
     topics = result.get('topics')
     if topics is not None:
         for topic in topics:
             item.topics.append(topic.get('name'))
     if log_zhihu:
         logging.info('Succeed in scraping question of id: %d.' % id)
     self.scraped_questions[id] = item
     return item
Пример #16
0
 def scrape_user_activities(self, user, before=None, after=None, number=10):
     """Scrape the activity feed of a Zhihu user.

     Pages of ten activities are requested from
     ``zhihu_user_activity_url``. ``before`` is sent as the template's
     ``after`` value for the first page; later pages use the id of the
     last collected activity. Scraping stops once ``number`` activities
     were collected, the API reports the end, a page is empty, or an
     activity id lower than ``after`` is met.
     NOTE(review): ids are compared with the time bounds, so they appear
     to be timestamps - confirm against the API.

     Args:
         user: Value for the ``user`` slot of the URL template (``str``).
         before: Upper bound; defaults to the current time. Converted
             with ``int()``.
         after: Lower bound; defaults to 0. Converted with ``int()``.
         number: Maximum number of activities; values <= 0 mean 10.

     Returns:
         A list of ``ZhihuActivityItem`` objects. Which ``target_*``
         fields are filled depends on the activity verb.

     Raises:
         MethodParamError: If ``user`` is not a ``str`` or ``number`` is
             not an ``int``.
     """
     if not isinstance(user, str):
         raise MethodParamError(
             'Parameter \'user\' isn\'t an instance of type \'str\'!')
     if not isinstance(number, int):
         raise MethodParamError(
             'Parameter \'number\' isn\'t an instance of type \'int\'!')
     before = int(time.time()) if before is None else int(before)
     after = 0 if after is None else int(after)
     if number <= 0:
         number = 10
     if log_zhihu:
         logging.info('Scraping activities of zhihu user: %s...' % user)

     def _fill_target_author(item, target):
         # Copy the author details of the activity target onto the item.
         author = target.get('author')
         item.target_user_name = author.get('name')
         item.target_user_avatar = author.get('avatar_url')
         item.target_user_headline = author.get('headline')
         item.target_user_url = 'https://www.zhihu.com/people/{user}/activities'.format(
             user=author.get('url_token'))

     response = requests.get(zhihu_user_activity_url.format(user=user,
                                                            limit=10,
                                                            after=before),
                             headers=zhihu_header)
     result = response.json()
     activities = []
     stop_flag = False
     while len(activities) < number:
         entries = result.get('data') or []
         for data in entries:
             item = ZhihuActivityItem()
             item.id = int(data.get('id'))
             if item.id < after:
                 stop_flag = True
                 break
             item.verb = data.get('verb')
             item.create_time = data.get('created_time')
             item.actor = data.get('actor').get('url_token')
             target = data.get('target')
             if item.verb in ('QUESTION_CREATE', 'QUESTION_FOLLOW'):
                 # Created / followed a question.
                 _fill_target_author(item, target)
                 item.target_title = target.get('title')
                 item.target_title_url = 'https://www.zhihu.com/question/{id}'.format(
                     id=target.get('id'))
             elif item.verb in ('ANSWER_VOTE_UP', 'ANSWER_CREATE'):
                 # Upvoted an answer / answered a question.
                 _fill_target_author(item, target)
                 question = target.get('question')
                 item.target_title = question.get('title')
                 item.target_title_url = 'https://www.zhihu.com/question/{id}'.format(
                     id=question.get('id'))
                 item.target_content = target.get('excerpt')
                 item.target_content_url = 'https://www.zhihu.com/question/{qid}/answer/{aid}'.format(
                     qid=question.get('id'),
                     aid=target.get('id'))
                 item.thumbnail = target.get('thumbnail')
             elif item.verb in ('MEMBER_VOTEUP_ARTICLE',
                                'MEMBER_CREATE_ARTICLE'):
                 # Upvoted / published an article.
                 _fill_target_author(item, target)
                 item.target_title = target.get('title')
                 item.target_title_url = 'https://zhuanlan.zhihu.com/p/{id}'.format(
                     id=target.get('id'))
                 item.target_content = target.get('excerpt')
                 item.target_content_url = 'https://zhuanlan.zhihu.com/p/{id}'.format(
                     id=target.get('id'))
                 item.thumbnail = target.get('image_url')
             elif item.verb in ('TOPIC_FOLLOW', 'TOPIC_CREATE'):
                 # Followed / created a topic (topics have no author).
                 item.target_title = target.get('name')
                 item.target_title_url = 'https://www.zhihu.com/topic/{id}'.format(
                     id=target.get('id'))
                 item.thumbnail = target.get('avatar_url')
             elif item.verb in ('MEMBER_FOLLOW_COLUMN',
                                'MEMBER_CREATE_COLUMN'):
                 # Followed / created a column.
                 _fill_target_author(item, target)
                 item.target_title = target.get('title')
                 item.target_title_url = 'https://zhuanlan.zhihu.com/{id}'.format(
                     id=target.get('id'))
                 item.thumbnail = target.get('image_url')
             elif item.verb in ('MEMBER_CREATE_PIN', 'MEMBER_FOLLOW_PIN'):
                 # Published / followed a pin.
                 _fill_target_author(item, target)
                 item.target_content = target.get('excerpt_new')
                 item.target_content_url = 'https://www.zhihu.com/pin/{id}'.format(
                     id=target.get('id'))
             item.action_text = data.get('action_text')
             activities.append(item)
             if len(activities) >= number:
                 break
         # An empty page gives no id to continue from: requesting again
         # would either index an empty list or repeat the same request
         # forever, so it ends the scrape like 'is_end' does.
         if (len(activities) >= number or stop_flag or not entries
                 or result.get('paging').get('is_end')):
             break
         response = requests.get(zhihu_user_activity_url.format(
             user=user, limit=10, after=activities[-1].id),
                                 headers=zhihu_header)
         result = response.json()
     if log_zhihu:
         logging.info('Succeed in scraping activities of zhihu user: %s.' %
                      user)
     return activities
Пример #17
0
 def scrape_user_weibo(self, id, before=None, after=None, number=0):
     """Scrape the posts (weibos) of a weibo user, page by page.

     Every card of type 9 on a page is converted into a
     ``WeiboContentItem`` or, when the post carries a
     ``retweeted_status``, into a ``WeiboRepostContentItem``.  The
     resulting list is also stored in ``self.scraped_weibos[id]``.

     Args:
         id: Uid of the weibo user; must be an ``int``.
         before: Upper time bound (timestamp); posts created later are
             skipped.  Defaults to the current time.
         after: Lower time bound (timestamp); scraping stops at the
             first non-pinned post created earlier.  Defaults to 0.
         number: Maximum number of posts to collect, capped at the total
             reported by the first page.  Values <= 0 collect up to 10.

     Returns:
         The list of scraped items in the order they were encountered.

     Raises:
         MethodParamError: If ``id`` or ``number`` is not an ``int``.
     """
     if not isinstance(id, int):
         raise MethodParamError(
             'Parameter \'id\' isn\'t an instance of type \'int\'!')
     if not isinstance(number, int):
         raise MethodParamError(
             'Parameter \'number\' isn\'t an instance of type \'int\'!')

     # Mobile article links; the captured group is the article id.
     article_pattern = r'http://media\.weibo\.cn/article\?.*id=(\d+)'

     def fill_media(target, status):
         # Copy picture urls and attached-media info from a status dict
         # (either the post itself or the post it reposts) onto the item.
         if 'pics' in status.keys():
             for pic in status.get('pics'):
                 target.pictures.append(pic.get('url'))
         if 'page_info' in status.keys():
             page_info = status.get('page_info')
             target.media_pic = page_info.get('page_pic').get('url')
             page_url = page_info.get('page_url')
             article = re.match(article_pattern, page_url)
             if article:
                 # The mobile article link cannot be opened, so it is
                 # replaced with the PC link.
                 target.media_url = 'https://weibo.com/ttarticle/p/show?id={id}'.format(
                     id=article.group(1))
             else:
                 target.media_url = page_url

     before = int(time.time()) if before is None else int(before)
     after = 0 if after is None else int(after)
     if log_weibo:
         logging.info('Scraping weibos of weibo user: %d...' % id)
     response = requests.get(
         weibo_user_weibo_url.format(uid1=id, uid2=id, page=1))
     result = response.json()
     total = result.get('data').get('cardlistInfo').get('total')
     if number <= 0:
         need_count = 10
     else:
         need_count = min(number, total)
     finish_count = 0
     weibos = []
     position = 1  # the first page has already been fetched above
     stop_flag = False
     while finish_count < need_count:
         cards = (result.get('data') or {}).get('cards')
         if not cards:
             # No more posts to read: stop instead of requesting
             # further pages forever.
             break
         for card in cards:
             try:
                 if finish_count >= need_count:
                     break
                 if card.get('card_type') != 9:
                     continue
                 res = requests.get(card.get('scheme'))
                 if '微博-出错了' in res.text:  # this weibo has been deleted
                     continue
                 time_lst = re.search(r'"created_at": "(.*?)"',
                                      res.text).group(1).split()
                 time_lst.pop(-2)  # drop the timezone field
                 time_str = ' '.join(time_lst)
                 # Convert to a timestamp; the remaining fields are
                 # interpreted by time.mktime as local time.
                 time_value = time.mktime(
                     time.strptime(time_str, '%a %b %d %H:%M:%S %Y'))
                 mblog = card.get('mblog')
                 if time_value > before:
                     continue
                 if time_value < after:
                     # A pinned post may be older than the posts that
                     # follow it, so only a non-pinned one ends the scan.
                     if not mblog.get('isTop'):
                         stop_flag = True
                         break
                     else:
                         continue
                 if 'retweeted_status' in mblog.keys():  # a repost
                     item = WeiboRepostContentItem()
                     retweet = mblog.get('retweeted_status')
                     item.content = retweet.get('text')
                     item.source_id = retweet.get('bid')
                     fill_media(item, retweet)
                     source_user = retweet.get('user')
                     if source_user is not None:  # original post may be deleted
                         item.source_url = 'https://weibo.com/{uid}/{bid}'.format(
                             uid=source_user.get('id'),
                             bid=item.source_id)
                         item.source_owner.id = source_user.get('id')
                         item.source_owner.name = source_user.get(
                             'screen_name')
                         item.source_owner.avatar_url = source_user.get(
                             'profile_image_url')
                         item.source_owner.profile_url = 'https://weibo.com/u/{uid}'.format(
                             uid=item.source_owner.id)
                     item.repost_reason = mblog.get('text')
                 else:
                     item = WeiboContentItem()
                     item.content = mblog.get('text')
                     fill_media(item, mblog)
                 item.id = mblog.get('bid')
                 item.owner.id = mblog.get('user').get('id')
                 item.owner.name = mblog.get('user').get('screen_name')
                 item.owner.avatar_url = mblog.get('user').get(
                     'profile_image_url')
                 item.owner.profile_url = 'https://weibo.com/u/{uid}'.format(
                     uid=item.owner.id)
                 item.url = 'https://weibo.com/{uid}/{bid}'.format(
                     uid=item.owner.id, bid=item.id)
                 item.time = time_value
                 item.source = mblog.get('source')
                 weibos.append(item)
                 finish_count += 1
             except AttributeError:
                 # Best effort: a card with missing fields (a lookup
                 # returned None) is skipped rather than aborting the scan.
                 continue
         if finish_count >= need_count or stop_flag:
             break
         position += 1
         response = requests.get(
             weibo_user_weibo_url.format(uid1=id, uid2=id, page=position))
         result = response.json()
     if log_weibo:
         logging.info('Succeed in scraping weibos of weibo user: %d.' % id)
     self.scraped_weibos[id] = weibos
     return weibos
Пример #18
0
    def scrape_user_info(self, user):
        """Fetch the profile of a zhihu user and convert it to a ``ZhihuUserItem``.

        The item is also stored in ``self.scraped_infos[user]``.

        Args:
            user: Identifier of the user as substituted into
                ``zhihu_user_info_url``; must be a ``str``.

        Returns:
            The populated ``ZhihuUserItem``, or ``None`` when the request
            returns 404 or the response body contains an ``error`` entry.

        Raises:
            MethodParamError: If ``user`` is not a ``str``.
        """
        if not isinstance(user, str):
            raise MethodParamError(
                'Parameter \'user\' isn\'t an instance of type \'str\'!')
        if log_zhihu:
            logging.info('Scraping info of zhihu user: %s...' % user)
        response = requests.get(zhihu_user_info_url.format(
            user=user, include=zhihu_user_query),
                                headers=zhihu_header)
        if response.status_code == 404:  # user doesn't exist or is blocked
            if log_zhihu:
                logging.warning(
                    '404 error. The user doesn\'t exist or has been blocked.')
            return None
        result = response.json()
        if result.get('error') is not None:  # identity hasn't been verified
            if log_zhihu:
                logging.warning('Your identity hasn\'t been confirmed.')
            return None

        item = ZhihuUserItem()
        item.id = result.get('id')
        item.name = result.get('name')
        item.gender = result.get('gender')
        item.avatar_url = result.get('avatar_url')
        business = result.get('business')
        if business is not None:
            item.business = business.get('name')
        item.headline = result.get('headline')
        item.description = result.get('description')
        item.question_count = result.get('question_count')
        item.answer_count = result.get('answer_count')
        item.article_count = result.get('articles_count')
        item.voteup_count = result.get('voteup_count')
        item.thanked_count = result.get('thanked_count')
        item.favorited_count = result.get('favorited_count')
        item.following_count = result.get('following_count')
        item.follower_count = result.get('follower_count')
        item.following_topic_count = result.get('following_topic_count')
        item.following_column_count = result.get('following_columns_count')
        item.following_question_count = result.get('following_question_count')
        item.following_favlist_count = result.get('following_favlists_count')

        # Nested entries are optional: each one is copied only when it is
        # present, so an incomplete record never aborts the scrape.
        educations = result.get('educations')
        if educations is not None:
            for education in educations:
                edu_item = ZhihuEducationItem()
                school = education.get('school')
                if school is not None:
                    edu_item.school = school.get('name')
                major = education.get('major')
                if major is not None:
                    edu_item.major = major.get('name')
                item.educations.append(edu_item)
        employments = result.get('employments')
        if employments is not None:
            for employment in employments:
                emp_item = ZhihuEmploymentItem()
                company = employment.get('company')
                if company is not None:
                    emp_item.company = company.get('name')
                job = employment.get('job')
                if job is not None:
                    emp_item.job = job.get('name')
                item.employments.append(emp_item)
        locations = result.get('locations')
        if locations is not None:
            for location in locations:
                item.locations.append(location.get('name'))
        if log_zhihu:
            logging.info('Succeed in scraping info of zhihu user: %s.' % user)
        self.scraped_infos[user] = item
        return item