Example #1
 def parse_all_content(self, response):
     # When there is a "Read full text" (全文) link, fetch the full text
     tree_node = etree.HTML(response.body)
     tweet_item = response.meta['item']
     content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
     tweet_html = etree.tostring(content_node, encoding='unicode')
     tweet_item['content'] = extract_weibo_content(tweet_html)
     yield tweet_item
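
For context, parse_all_content only runs after an earlier callback has stored a partially-filled item in response.meta and requested the tweet's full-text page. A minimal sketch of that hand-off, mirroring the "全文"/ckAll=1 request that Example #3 issues (the spider class name below is hypothetical, and TweetsItem is stubbed with a dict):

from lxml import etree
from scrapy import Request, Spider


class WeiboSketchSpider(Spider):  # hypothetical name, for illustration only
    name = 'weibo_sketch'
    base_url = 'https://weibo.cn'  # assumed mobile base URL

    def parse_tweet(self, response):
        tree_node = etree.HTML(response.body)
        for tweet_node in tree_node.xpath('//div[@class="c" and @id]'):
            tweet_item = {}  # stands in for TweetsItem()
            # If the tweet is truncated, follow its "全文" (full text) link
            # and let parse_all_content (Example #1) finish the item.
            link = tweet_node.xpath(
                './/a[text()="全文" and contains(@href,"ckAll=1")]/@href')
            if link:
                yield Request(self.base_url + link[0],
                              callback=self.parse_all_content,
                              meta={'item': tweet_item},
                              priority=1)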
Example #2
 def parse_all_content(self, response):
     # When there is a "Read full text" (全文) link, fetch the full text
     tree_node = etree.HTML(response.body)
     tweet_item = response.meta['item']
     content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
     tweet_html = etree.tostring(content_node, encoding='unicode')
     tweet_item['content'] = extract_weibo_content(tweet_html)
     if 'location' in tweet_item:
         tweet_item['location'] = content_node.xpath(
             './/span[@class="ctt"]/a[last()]/text()')[0]
     yield tweet_item
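
The only change from Example #1 is the conditional location fix-up: when the item already carries a location field, it is re-read from the last anchor inside the <span class="ctt"> content node. A standalone sketch of that XPath against a fabricated fragment (the HTML below is invented for illustration):

from lxml import etree

# Fabricated fragment mimicking Weibo's mobile full-text markup.
html = ('<div id="M_"><div><span class="ctt">content text '
        '<a href="/at/1">@friend</a> '
        '<a href="/place/2">Beijing</a></span></div></div>')

tree_node = etree.HTML(html)
content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
# a[last()] selects the final anchor inside the content span, which on
# these pages is the location link when one is present.
location = content_node.xpath('.//span[@class="ctt"]/a[last()]/text()')[0]
print(location)  # -> Beijing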
Example #3
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # If this is page 1, schedule requests for all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_tweet,
                                  dont_filter=True,
                                  meta=response.meta)
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search(r'\d+', like_num).group())

                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search(r'\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search(r'\d+', comment_num).group())

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]

                # Check whether there is a "Read full text" (全文) link:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # Crawl this tweet's comments
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})

            except Exception as e:
                self.logger.error(e)
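
These examples lean on two helpers imported from elsewhere in the project, extract_weibo_content and time_fix, whose bodies are not shown. As an assumption about the latter: Weibo's mobile pages render timestamps as relative strings such as "5分钟前" ("5 minutes ago"), "今天 12:30" ("today 12:30"), or "05月01日 12:30", so a plausible sketch of time_fix normalizes them to absolute "YYYY-MM-DD HH:MM" strings:

import re
from datetime import datetime, timedelta


def time_fix(time_string):
    """Hypothetical sketch only; the real helper lives elsewhere in the project."""
    now = datetime.now()
    minutes_ago = re.match(r'(\d+)分钟前', time_string)  # "N minutes ago"
    if minutes_ago:
        fixed = now - timedelta(minutes=int(minutes_ago.group(1)))
        return fixed.strftime('%Y-%m-%d %H:%M')
    if time_string.startswith('今天'):  # "today HH:MM"
        return now.strftime('%Y-%m-%d') + time_string[len('今天'):]
    month_day = re.match(r'(\d{2})月(\d{2})日 (\d{2}:\d{2})', time_string)
    if month_day:  # "MM月DD日 HH:MM"
        return '{}-{}-{} {}'.format(now.year, month_day.group(1),
                                    month_day.group(2), month_day.group(3))
    return time_string  # assume it is already an absolute timestamp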
Example #4
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # if this is page 1, record the total number of pages
            self.current_page = 1
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                self.all_page_num = all_page
        print("[INFO] Crawling Tweets Page: " + str(self.current_page))
        """
        解析本页的数据
        """
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time_utc'] = dt.utcnow()  # UTC crawl timestamp
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                # if tweet_item['user_id']:
                #     print("[DEBUG] user_id:" + str(tweet_item['user_id']))
                # else:
                #     print("[DEBUG] user_id ERROR")

                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    self.time_stop_flag = self.time_flag_compare(
                        tweet_item['created_at']
                    )  # time compare to trigger stop flag
                    tweet_item['tool'] = create_time_info.split(
                        '来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(
                        create_time_info.strip())
                    self.time_stop_flag = self.time_flag_compare(
                        tweet_item['created_at']
                    )  # time compare to trigger stop flag
                    tweet_item['tool'] = ""

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(
                    re.search(r'\d+', like_num).group())
                #print("[DEBUG] like_num:" + str(tweet_item['like_num']))
                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(
                    re.search(r'\d+', repost_num).group())
                #print("[DEBUG] repost_num:" + str(tweet_item['repost_num']))
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
                )[-1]
                tweet_item['comment_num'] = int(
                    re.search(r'\d+', comment_num).group())
                #print("[DEBUG] comment_num:" + str(tweet_item['comment_num']))
                # Grab all images: 1) check whether a multi-image ("组图") link exists; 2) if not, fall back to the single-image node below
                multi_img_link = tweet_node.xpath(
                    './/a[contains(text(),"组图")]/@href')
                if multi_img_link:
                    #print("[DEBUG] multi_img_link:" + multi_img_link[-1])
                    tweet_item['multi_imgs'] = True
                    yield Request(url=multi_img_link[-1],
                                  callback=self.parse_multi_images,
                                  meta={'_id': tweet_item['_id']},
                                  priority=1)
                else:
                    tweet_item['multi_imgs'] = False

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]
                else:
                    tweet_item['image_url'] = "NA"

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
                )
                if videos:
                    tweet_item['video_url'] = videos[0]
                else:
                    tweet_item['video_url'] = "NA"

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info
                else:
                    tweet_item['location_map_info'] = "NA"

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['retweet'] = True
                    tweet_item['origin_weibo'] = repost_node[0]
                    # crawl original weibo
                    # origin_weibo_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                    # yield Request(url=repost_url, callback=self.parse_repost, meta={'weibo_url': tweet_item['weibo_url']},priority = 2)

                else:
                    tweet_item['retweet'] = False
                    tweet_item['origin_weibo'] = "NA"
                # Check whether there is a "Read full text" (全文) link:
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[
                        0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweet_item},
                                  priority=1)

                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweet_item['content'] = extract_weibo_content(tweet_html)
                    yield tweet_item

                # Crawl this tweet's comments
                comment_url = self.base_url + '/comment/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=2)

                # Crawl tweet repost
                repost_url = self.base_url + '/repost/' + tweet_item[
                    'weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=repost_url,
                              callback=self.parse_repost,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=2)

            except Exception as e:
                self.logger.error(e)

        # keep paginating until we reach a page outside the time range or run out of pages
        self.current_page = self.current_page + 1
        if self.time_stop_flag == 0 and self.current_page < (
                self.all_page_num + 1) and self.current_page >= 2:
            next_page = self.current_page
            current_page_str = "page=" + str(next_page - 1)
            page_url = response.url.replace(current_page_str,
                                            'page={}'.format(next_page))
            yield Request(page_url,
                          self.parse_tweet,
                          dont_filter=True,
                          meta=response.meta,
                          priority=1)
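
Example #4 swaps Example #3's schedule-all-pages-up-front strategy for sequential pagination gated by self.time_stop_flag, so the crawl stops once tweets fall outside a target window. The time_flag_compare helper is not shown; a minimal sketch, assuming created_at is the "YYYY-MM-DD HH:MM" string sketched above and that the spider carries a self.time_limit cutoff datetime (both assumptions):

from datetime import datetime


def time_flag_compare(self, created_at):
    """Hypothetical sketch: return 1 (stop paginating) once a tweet
    predates self.time_limit, a cutoff chosen at spider start-up."""
    created = datetime.strptime(created_at, '%Y-%m-%d %H:%M')
    return 1 if created < self.time_limit else 0

With that contract, the pagination block at the end of parse_tweet keeps yielding the next page only while the flag stays 0 and pages remain.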