Example #1
    def parse_homepage(self, response):
        now = datetime.datetime.now()
        json_homepage = json.loads(response.text)  # body_as_unicode() is deprecated
        user_id = None
        if json_homepage['data']['cards']:
            # The original counted the cards into `count` and then looped over
            # range(count), which skipped the last card; iterating the list
            # directly avoids the off-by-one.
            for card in json_homepage['data']['cards']:
                if card['card_type'] != 9:
                    continue
                mblog = card['mblog']
                # One fresh item per tweet: re-using a single mutable item
                # lets later assignments overwrite items already yielded.
                tweetsitem = TweetsItem()
                tweetsitem['id'] = mblog['user']['id']
                user_id = tweetsitem['id']
                tweetsitem['spider_time'] = now.strftime('%Y-%m-%d %H:%M:%S')
                tweetsitem['screen_name'] = mblog['user']['screen_name']
                tweetsitem['text'] = mblog['text']
                tweetsitem['created_at'] = mblog['created_at']
                tweetsitem['comments_count'] = mblog['comments_count']
                tweetsitem['isLongText'] = mblog['isLongText']
                tweetsitem['is_paid'] = mblog['is_paid']
                tweetsitem['attitudes_count'] = mblog['attitudes_count']
                tweetsitem['mblog_id'] = mblog['id']
                tweetsitem['ve'] = mblog['user']['verified']
                if mblog['source'] is not None:
                    tweetsitem['source'] = mblog['source']
                try:
                    tweetsitem['retweeted_status'] = mblog['retweeted_status']
                except KeyError:
                    log.info('no retweeted_status')
                yield tweetsitem
            if user_id is None:
                return  # no card_type 9 on this page, nothing to paginate
            # Request the next page of the user's timeline.
            if 'page' in response.url:
                page = response.url.split("page=")[-1]
                yield Request(self.start_profile_url_next.format(
                    weiboid=user_id,
                    include=self.start_include,
                    j=int(page) + 1),
                    self.parse_homepage,
                    dont_filter=True)
            else:
                yield Request(self.start_profile_url_next.format(
                    weiboid=user_id, include=self.start_include, j=2),
                    self.parse_homepage,
                    dont_filter=True)
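
All of these snippets populate a TweetsItem without showing its definition. As a minimal sketch, a scrapy.Item covering the fields assigned in Example #1 might look like the following (field names are taken from the assignments above; each example apparently pairs with its own variant of the item, so the real project's class may declare more or different fields):

    import scrapy

    class TweetsItem(scrapy.Item):
        # Fields implied by the assignments in Example #1.
        id = scrapy.Field()
        spider_time = scrapy.Field()
        screen_name = scrapy.Field()
        text = scrapy.Field()
        created_at = scrapy.Field()
        comments_count = scrapy.Field()
        isLongText = scrapy.Field()
        is_paid = scrapy.Field()
        attitudes_count = scrapy.Field()
        mblog_id = scrapy.Field()
        ve = scrapy.Field()
        source = scrapy.Field()
        retweeted_status = scrapy.Field()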
Example #2
    def tweets_parse(self, response):
        selector = Selector(response)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            # The original snippet collected these values and then hit a bare
            # `pass`; the field names below are borrowed from the other
            # examples and may not match this project's actual TweetsItem.
            tweet_items = TweetsItem()
            tweet_items['_id'] = tweet.xpath('@id').get()
            content = tweet.xpath('div/span[@class="ctt"]/text()').getall()
            tweet_items['Content'] = ' '.join(content).strip()
            yield tweet_items
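
One caveat about Example #2's XPath: span[@class="ctt"]/text() yields only the span's direct text nodes, so text nested inside links or other child tags is dropped. When the whole visible text is wanted, string(.) is the usual alternative:

    # Direct text nodes only; text inside nested <a> etc. is skipped:
    content = tweet.xpath('div/span[@class="ctt"]/text()').getall()
    # Concatenated text of the span and all of its descendants:
    content_full = tweet.xpath('string(div/span[@class="ctt"])').get()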
Example #3
    def parse_homepage_next(self, response):
        json_homepage = json.loads(response.text)
        try:
            user_id = json_homepage['data']['cardlistInfo']['containerid'][6:]
            # Iterate the actual card list instead of a hard-coded range(10),
            # and yield one fresh item per matching card; the original built a
            # single item, overwrote it on every pass, and yielded it once.
            for card in json_homepage['data']['cards']:
                if card['card_type'] == 9:
                    mblog = card['mblog']
                    tweetsitem = TweetsItem()
                    tweetsitem['id'] = user_id
                    tweetsitem['screen_name'] = mblog['user']['screen_name']
                    tweetsitem['text'] = mblog['text']
                    tweetsitem['created_at'] = mblog['created_at']
                    tweetsitem['id_top'] = mblog['is_top']
                    tweetsitem['attitudes_count'] = mblog['attitudes_count']
                    yield tweetsitem
        except Exception:
            pass  # malformed pages are skipped silently, as in the original
Example #4
    def parse_homepage(self, response):
        json_homepage = json.loads(response.text)
        try:
            user_id = json_homepage['data']['cardlistInfo']['containerid'][6:]
            for card in json_homepage['data']['cards']:
                if card['card_type'] == 9:
                    mblog = card['mblog']
                    tweetsitem = TweetsItem()
                    tweetsitem['id'] = user_id
                    tweetsitem['screen_name'] = mblog['user']['screen_name']
                    tweetsitem['text'] = mblog['text']
                    tweetsitem['created_at'] = mblog['created_at']
                    tweetsitem['attitudes_count'] = mblog['attitudes_count']
                    yield tweetsitem
            # Queue the next nine pages of this user's timeline.
            for page in range(1, 10):
                yield Request(self.follow_profile_url_1_next.format(
                    weiboid=user_id, j=page), self.parse_homepage_next)
        except Exception:
            pass
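
Examples #1, #3, and #4 format URL templates (self.start_profile_url_next, self.follow_profile_url_1_next) whose definitions are not shown. Judging from the container API URL written out in Example #6, and from the containerid[6:] slice above (which implies a six-character prefix such as '107603'), they plausibly look like the following; treat it as a guess, not the project's actual value:

    # Hypothetical template for the m.weibo.cn container API; the real
    # spider's attribute may differ in parameter names and extra fields.
    follow_profile_url_1_next = (
        'https://m.weibo.cn/api/container/getIndex'
        '?type=uid&value={weiboid}&containerid=107603{weiboid}&page={j}')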
Example #5
    def parse_tweets(self, response):
        """Scrape Weibo post data."""
        selector = Selector(response)
        ID = re.findall(r'(\d+)/profile', response.url)[0]
        divs = selector.xpath('body/div[@class="c" and @id]')
        for div in divs:
            try:
                tweetsItems = TweetsItem()
                id = div.xpath('@id').extract_first()  # post ID
                content = div.xpath(
                    'div/span[@class="ctt"]//text()').extract()  # post text
                cooridinates = div.xpath('div/a/@href').extract()  # geo link
                # The .decode('utf8') calls here were Python 2 leftovers; in
                # Python 3 the literals below are already str.
                like = re.findall(r'赞\[(\d+)\]', div.extract())  # like count
                transfer = re.findall(r'转发\[(\d+)\]',
                                      div.extract())  # repost count
                comment = re.findall(r'评论\[(\d+)\]',
                                     div.extract())  # comment count
                others = div.xpath('div/span[@class="ct"]/text()').extract(
                )  # publish time and client tool (phone or platform)

                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["ID"] = ID
                if content:
                    # strip the trailing "[位置]" (location) marker
                    tweetsItems["Content"] = " ".join(content).strip('[位置]')
                if cooridinates:
                    cooridinates = re.findall(r'center=([\d.,]+)',
                                              cooridinates[0])
                    if cooridinates:
                        tweetsItems["Co_oridinates"] = cooridinates[0]
                if like:
                    tweetsItems["Like"] = int(like[0])
                if transfer:
                    tweetsItems["Transfer"] = int(transfer[0])
                if comment:
                    tweetsItems["Comment"] = int(comment[0])
                if others:
                    others = others[0].split('来自')  # '来自' means 'from'
                    tweetsItems["PubTime"] = others[0].replace(u"\xa0", "")
                    if len(others) == 2:
                        tweetsItems["Tools"] = others[1].replace(u"\xa0", "")
                yield tweetsItems
            except Exception:
                pass  # `except Exception, e:` was Python 2 syntax
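
To make the `others` handling at the end of Example #5 concrete, here is the split on '来自' ('from') applied to a typical ct span value (the sample string is illustrative):

    ct_text = '今天 12:30\xa0来自iPhone客户端'
    others = ct_text.split('来自')             # ['今天 12:30\xa0', 'iPhone客户端']
    pub_time = others[0].replace('\xa0', '')   # '今天 12:30'  (publish time)
    tool = others[1].replace('\xa0', '')       # 'iPhone客户端' (client tool)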
Example #6
    def parse_Tweets(self, response):
        if len(response.body) <= 50:
            return
        tweets = json.loads(response.body)
        ID = response.meta["ID"]
        containerid = ''
        if not tweets.get("cards", ""):
            return
        cards = tweets["cards"]
        if tweets["cardlistInfo"].get("page", ""):
            page = str(tweets["cardlistInfo"]["page"])
        else:
            return
        if tweets["cardlistInfo"].get("containerid", ""):
            containerid = tweets["cardlistInfo"]["containerid"]
        for card in cards:
            mblog = card.get('mblog', '')
            if mblog:
                tweetsItems = TweetsItem()
                tweetsItems["_id"] = card["itemid"]
                tweetsItems["ID"] = ID
                tweetsItems["Content"] = json.dumps(mblog)
                tweetsItems["PubTime"] = mblog["created_at"]
                tweetsItems["Like"] = mblog["attitudes_count"]
                tweetsItems["Comment"] = mblog["comments_count"]
                tweetsItems["Transfer"] = mblog["reposts_count"]
                # The yield belongs inside `if mblog:`. At the original
                # indentation, a card without an mblog re-yielded the previous
                # item, or raised NameError on the first card.
                yield tweetsItems
        # Follow the next page reported by cardlistInfo.
        url_tweets = ("https://m.weibo.cn/api/container/getIndex"
                      "?type=uid&value=%s&containerid=%s&page=%s"
                      % (ID, containerid, page))
        yield Request(url=url_tweets,
                      meta={"ID": ID},
                      callback=self.parse_Tweets,
                      dont_filter=True)
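
Example #6 assumes response.meta["ID"] was set by whoever scheduled the request. A minimal way to seed it is a start_requests like the one below (self.user_ids and the '107603' containerid prefix are assumptions; the prefix is consistent with the containerid[6:] slice in Example #1):

    def start_requests(self):
        for uid in self.user_ids:  # hypothetical list of user ids
            url = ("https://m.weibo.cn/api/container/getIndex"
                   "?type=uid&value=%s&containerid=107603%s&page=1"
                   % (uid, uid))
            yield Request(url, meta={"ID": uid}, callback=self.parse_Tweets)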
Example #7
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # On the first page, queue all remaining pages at once.
            all_page = re.search(r'>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace(
                        'page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet,
                                  dont_filter=True, meta=response.meta)
        # Parse every tweet on this page. Note the './/' prefixes below: with
        # lxml, a bare '//...' on a node still searches the whole document,
        # not just this node's subtree.
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweetItem = TweetsItem()
                tweetItem['crawl_time'] = int(time.time())
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                          tweet_repost_url)
                tweetItem['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                    user_tweet_id.group(2), user_tweet_id.group(1))
                tweetItem['user_id'] = user_tweet_id.group(2)
                tweetItem['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                  user_tweet_id.group(1))
                create_time_info_node = tweet_node.xpath(
                    './/span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweetItem['created_at'] = time_fix(
                        create_time_info.split('来自')[0].strip())
                    tweetItem['tool'] = create_time_info.split('来自')[1].strip()
                else:
                    tweetItem['created_at'] = time_fix(create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweetItem['like_num'] = int(re.search(r'\d+', like_num).group())
                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweetItem['repost_num'] = int(
                    re.search(r'\d+', repost_num).group())

                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
                tweetItem['comment_num'] = int(
                    re.search(r'\d+', comment_num).group())
                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweetItem['image_url'] = images[0]

                videos = tweet_node.xpath(
                    './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
                if videos:
                    tweetItem['video_url'] = videos[0]
                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    # './@href', not '/@href': the latter is invalid here.
                    map_node_url = map_node[0].xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweetItem['location_map_info'] = map_info

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论")]/@href')
                if repost_node:
                    tweetItem['origin_weibo'] = repost_node[0]

                # Check for a "full text" link (ckAll=1 must be quoted in XPath).
                all_content_link = tweet_node.xpath(
                    './/a[text()="全文" and contains(@href,"ckAll=1")]')
                if all_content_link:
                    all_content_url = self.base_url + \
                        all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url,
                                  callback=self.parse_all_content,
                                  meta={'item': tweetItem},
                                  priority=1)
                else:
                    tweet_html = etree.tostring(tweet_node, encoding='unicode')
                    tweetItem['content'] = extract_weibo_content(tweet_html)
                    yield tweetItem

                # # Crawl the tweet's comments:
                # comment_url = self.base_url + '/comment/' + \
                #     tweetItem['weibo_url'].split('/')[-1] + '?page=1'
                # yield Request(url=comment_url, callback=self.parse_comment,
                #               meta={'weibo_url': tweetItem['weibo_url']})

            except Exception as e:
                self.logger.error(e)
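
Example #7 relies on a time_fix() helper that is not shown. Weibo's mobile pages emit relative timestamps such as '5分钟前' ('5 minutes ago') and '今天 12:30' ('today 12:30'), so a plausible sketch of the helper, not the project's actual implementation, is:

    import datetime

    def time_fix(time_string):
        """Normalize Weibo's relative timestamps to 'YYYY-MM-DD HH:MM'."""
        now = datetime.datetime.now()
        if '分钟前' in time_string:  # "N minutes ago"
            minutes = int(time_string[:time_string.find('分钟前')])
            return (now - datetime.timedelta(minutes=minutes)
                    ).strftime('%Y-%m-%d %H:%M')
        if time_string.startswith('今天'):  # "today HH:MM"
            return now.strftime('%Y-%m-%d') + time_string[2:].rstrip()
        return time_string  # assume it is already an absolute date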
Example #8
def tweet(response):
    selector = Selector(response)
    tweet_nodes = selector.xpath('body/div[@class="c" and @id]')
    tweet_items = []
    for tweet_node in tweet_nodes:
        tweet_item = TweetsItem()
        tweet_repost_url = tweet_node.xpath(
            './/a[contains(text(),"转发[")]/@href').extract_first()
        user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                  tweet_repost_url)
        tweet_item['user_id_1'] = user_tweet_id.group(2)
        tweet_item['weibo_url_1'] = 'https://weibo.com/{}/{}'.format(
            tweet_item['user_id_1'], user_tweet_id.group(1))

        like_num = tweet_node.xpath(
            './/a[contains(text(),"赞")]/text()').extract_first()
        tweet_item['like_num'] = re.search(r'\d+', like_num).group()

        repost_num = tweet_node.xpath(
            './/a[contains(text(),"转发")]/text()').extract_first()
        tweet_item['repost_num'] = re.search(r'\d+', repost_num).group()

        comment_num = tweet_node.xpath(
            './/a[contains(text(),"评论") and not(contains(text(),"原文"))]/text()'
        ).extract_first()
        tweet_item['comment_num'] = re.search(r'\d+', comment_num).group()

        repost_node = tweet_node.xpath('.//span[@class="cmt"]')

        # './/' rather than '//' so the search stays inside this tweet node.
        imgs_node = tweet_node.xpath('.//a[text()="原图"]/@href')
        if imgs_node:
            tweet_item['imgs'] = imgs_node.extract_first()

        img_group__node = tweet_node.xpath('.//a[contains(text(),"组图")]/@href')
        if img_group__node:
            tweet_item['imgs_group'] = img_group__node.extract_first()

        if repost_node:
            repost_node = repost_node[0]
            tweet_item['is_repost'] = True
            user_url_2 = repost_node.xpath('./a/@href').extract_first()
            try:
                tweet_item['user_id_2'] = re.search(r'u?/(\d+)$',
                                                    user_url_2).group(1)
                source_tweet_comment_url = tweet_node.xpath(
                    './/a[contains(text(),"原文评论")]/@href').extract_first()
                source_tweet_id = re.search(r'comment/(.*?)\?',
                                            source_tweet_comment_url).group(1)
                tweet_item['weibo_url_2'] = 'https://weibo.com/{}/{}'.format(
                    tweet_item['user_id_2'], source_tweet_id)
            except AttributeError:
                # re.search returned None: not a recognizable repost, skip it.
                continue
            content_2_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            tweet_item['content_2'] = content_2_node.xpath(
                'string(.)').extract_first()
            repost_info = tweet_node.xpath(
                'string(./div[last()])').extract_first()
            repost_info = repost_info.replace('转发理由:', '').replace('查看图片', '')
            tweet_item['content_1'] = repost_info.split(
                '赞')[0].strip().replace('\u200b', '')
            # The original pattern r'收藏(.*?)[来自]?' always captured an empty
            # group ('[来自]?' is an optional character class, not the word);
            # capture up to '来自' or the end of the string instead.
            created_match = re.search(r'收藏(.*?)(?:来自|$)', repost_info)
            if created_match:
                tweet_item['created_at_1'] = created_match.group(1).strip()
        else:
            tweet_item['is_repost'] = False
            tweet_info_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            tweet_info = tweet_info_node.xpath('string(.)').extract_first()
            tweet_item['content_1'] = tweet_info.strip().replace('\u200b',
                                                                 '').strip()
            create_time_node = tweet_node.xpath('.//span[@class="ct"]')[0]
            create_time_info = create_time_node.xpath(
                'string(.)').extract_first()
            tweet_item['created_at_1'] = create_time_info.split(
                '\xa0')[0].strip()
            try:
                tweet_item['tool'] = create_time_info.split('\xa0')[1].replace(
                    '来自', '').strip()
            except IndexError:
                pass
        # These two lines sat inside the else-branch in the original, so
        # reposted tweets were built but never returned; they apply to both
        # branches.
        tweet_item['_id'] = tweet_item['weibo_url_1']
        tweet_items.append(tweet_item)
    next_url = selector.xpath('//a[text()="下页"]/@href').extract()
    return tweet_items, next_url
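
Because tweet() in Example #8 is a plain function returning (tweet_items, next_url) rather than a spider callback, the caller does the yielding. A sketch of such a caller, assuming a base_url attribute on the spider:

    def parse(self, response):
        tweet_items, next_url = tweet(response)
        for item in tweet_items:
            yield item
        if next_url:
            # next_url is the list returned by .extract(); take the first hit.
            yield Request(self.base_url + next_url[0], callback=self.parse)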
Example #9
    def parse2(self, response):

        selector = Selector(response)
        # Passing the ID via the Request's meta would be more direct.
        IDhref = selector.xpath(
            '//div[@class="u"]/div[@class="tip2"]/a[1]/@href').extract()
        # The original sliced IDhref[0][1:11] and noted that 9-digit IDs broke
        # it; taking the first run of digits handles both lengths (assuming
        # the href contains the numeric user ID).
        ID = re.search(r'(\d+)', IDhref[0]).group(1)
        Tweets = selector.xpath('//div[@class="c"]')

        # Slightly different from parse1: loop over the nodes to find the
        # ones we need.
        for eachtweet in Tweets:
            # Unique id of this tweet node.
            mark_id = eachtweet.xpath('@id').extract_first()
            print(mark_id)
            # Only process nodes that carry an id.
            if mark_id:
                # Dedup: skip tweets already collected. The original used
                # `while`, which only worked because the append at the bottom
                # terminated it after one pass; `if` says what is meant.
                if mark_id not in self.TweetsID:
                    # One fresh item per tweet, so earlier yields are not
                    # overwritten by later assignments.
                    tweetitems = TweetsItem()
                    content = eachtweet.xpath(
                        'div/span[@class="ctt"]/text()').extract()
                    timelocation = eachtweet.xpath(
                        'div[2]/span[@class="ct"]/text()').extract()
                    pic_url = eachtweet.xpath('div[2]/a[2]/@href').extract()
                    like = eachtweet.xpath('div[2]/a[3]/text()').extract()
                    transfer = eachtweet.xpath('div[2]/a[4]/text()').extract()
                    comment = eachtweet.xpath('div[2]/a[5]/text()').extract()

                    tweetitems['_id'] = ID
                    # Join the extracted fragments into a single string; the
                    # content may be empty, so test before assigning.
                    allcontents = ''.join(content)
                    if allcontents:
                        tweetitems['Content'] = allcontents
                    if timelocation:
                        tweetitems['Time_Location'] = timelocation[0]
                    if pic_url:
                        tweetitems['Pic_Url'] = pic_url[0]
                    # The counts sit between '[' and ']', e.g. '赞[12]'.
                    if like:
                        tweetitems['Like'] = like[0][
                            like[0].index("[") + 1:like[0].index("]")]
                    if transfer:
                        tweetitems['Transfer'] = transfer[0][
                            transfer[0].index("[") + 1:transfer[0].index("]")]
                    if comment:
                        tweetitems['Comment'] = comment[0][
                            comment[0].index("[") + 1:comment[0].index("]")]
                    # Remember this tweet id so it is not scraped again.
                    self.TweetsID.append(mark_id)
                    yield tweetitems
            else:
                # If the selector finds no id, print the node to inspect
                # what the query matched.
                print(eachtweet)

        tweet_nextLink = selector.xpath(
            '//div[@class="pa"]/form/div/a/@href').extract()
        if tweet_nextLink:
            tweet_nextLink = tweet_nextLink[0]
            print(tweet_nextLink)
            yield Request(self.url + tweet_nextLink, callback=self.parse2)
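
A small design note on Example #9: self.TweetsID is a list, so every membership test scans everything crawled so far. A set keeps the same semantics with O(1) lookups (assuming the ids are strings, as in the rewrite above):

    # In the spider's __init__ (instead of self.TweetsID = []):
    #     self.TweetsID = set()
    # ...and in parse2, replace the list membership test plus append with:
    if mark_id not in self.TweetsID:
        self.TweetsID.add(mark_id)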