Example #1
    def parse_page(self, response):
        """ Scrape Weibo data """
        selector = Selector(response)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            id = tweet.xpath('@id').extract_first()  # Weibo ID
            content = tweet.xpath(
                'div/span[@class="ctt"]/text()').extract_first()  # tweet content
            ctime = tweet.xpath(
                'div/span[@class="ct"]/text()').extract()  # tweet creation time
            cooridinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
            allinks = re.findall(u"<a.*?href=.*?[\d+].*?</a>",
                                 tweet.extract(), re.I | re.S | re.M)
            like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]',
                                  tweet.extract())  # number of reposts
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]',
                                 tweet.extract())  # number of comments
            comment_link = ""
            for link in allinks:
                like_link_list = re.findall(u'"(http.*)">\u8d5e.*', link)
                if len(like_link_list) > 0:
                    like_link = like_link_list[0]
                transfer_link_list = re.findall(u'"(http.*)">\u8f6c\u53d1.*',
                                                link)
                if len(transfer_link_list) > 0:
                    transfer_link = transfer_link_list[0]
                comment_link_list = re.findall(
                    u'"(http.*?)" class="cc".*>\u8bc4\u8bba\[\d+\]</a>', link)
                if len(comment_link_list) > 0:
                    comment_link = comment_link_list[0]
            tweetsItems = TweetsItem()
            others = tweet.xpath(
                'div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)
            tweetsItems["ID"] = "cctv"
            if content:
                tweetsItems["Content"] = content.strip(
                    u"[\u4f4d\u7f6e]")  # strip the trailing "[位置]" ("[location]")
            if ctime:
                tweetsItems["Time"] = ctime
            if cooridinates:
                cooridinates = re.findall('center=([\d|.|,]+)', cooridinates)
                if cooridinates:
                    tweetsItems["Co_oridinates"] = cooridinates[0]
            if like:
                tweetsItems["Like"] = int(like[0])
            if transfer:
                tweetsItems["Transfer"] = int(transfer[0])
            if comment:
                tweetsItems["Comment"] = int(comment[0])
            if comment_link != "":
                yield Request(url=comment_link,
                              meta={
                                  'tweetsItems': tweetsItems,
                                  'comment_link': comment_link
                              },
                              callback=self.parse_fans_name)
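
Note: all of the examples populate a TweetsItem whose definition is not shown here. A minimal reconstruction, for orientation only, using the field names that actually appear in these snippets (the class body itself is an assumption, not the authors' original file):

    import scrapy

    class TweetsItem(scrapy.Item):
        # Fields reconstructed from the assignments in the examples; details may differ.
        _id = scrapy.Field()            # "<user ID>-<tweet ID>"
        ID = scrapy.Field()             # user ID
        tweetID = scrapy.Field()        # tweet ID (Example #2)
        Content = scrapy.Field()        # tweet text
        Time = scrapy.Field()           # raw time string (Example #1)
        PubTime = scrapy.Field()        # publish time
        Tools = scrapy.Field()          # posting tool (phone or platform)
        Co_oridinates = scrapy.Field()  # coordinates; some examples spell it "Coordinates"
        Coordinates = scrapy.Field()
        Like = scrapy.Field()           # like count
        Transfer = scrapy.Field()       # repost count
        Comment = scrapy.Field()        # comment count
        Type = scrapy.Field()           # ORIGINAL or REPOST (Examples #5 and #8)
        Label = scrapy.Field()          # hashtag (Example #9)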
Example #2
    def parse2(self, response):
        """ Scrape Weibo data """
        if not response.body:  # empty page: retry the same request through a different proxy
            req = response.request
            req.meta["change_proxy"] = True
            yield req
        else:
            # logger.info("got page: %s" % response.body)

            selector = Selector(response)
            urllist = selector.xpath('//a[@href]/@href').extract()
            for text00 in urllist:
                # queue repost-page links that have not been crawled yet
                reposturl = re.findall(r'^http://weibo.cn/repost/', text00)
                if reposturl:
                    if text00 not in self.finishurllist:
                        self.starturllist.add(text00)

            # extract the user ID (uid)
            text7 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
            if text7:
                ID = re.findall('uid=(\d+)', text7)[0]

            tweets = selector.xpath('body/div[@class="c" and @id]')
            for tweet in tweets:
                tweetsItems = TweetsItem()
                id = tweet.xpath('@id').extract_first()  # Weibo ID
                content = tweet.xpath('div/span[@class="ctt"]/text()').extract_first()  # tweet content
                cooridinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
                like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
                transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]', tweet.extract())  # number of reposts
                comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]', tweet.extract())  # number of comments
                others = tweet.xpath('div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)

                tweetsItems["ID"] = ID
                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["tweetID"] = id
                if content:
                    tweetsItems["Content"] = content.strip(u"[\u4f4d\u7f6e]")  # strip the trailing "[位置]" ("[location]")
                if cooridinates:
                    cooridinates = re.findall('center=([\d|.|,]+)', cooridinates)
                    if cooridinates:
                        tweetsItems["Co_oridinates"] = cooridinates[0]
                if like:
                    tweetsItems["Like"] = int(like[0])
                if transfer:
                    tweetsItems["Transfer"] = int(transfer[0])
                if comment:
                    tweetsItems["Comment"] = int(comment[0])
                if others:
                    others = others.split(u"\u6765\u81ea")
                    tweetsItems["PubTime"] = others[0]
                    if len(others) == 2:
                        tweetsItems["Tools"] = others[1]
                yield tweetsItems
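
Example #2 re-yields the failed request with change_proxy=True in its meta, which implies a custom downloader middleware that swaps the proxy on retry. A minimal sketch of what such a middleware could look like, assuming a hypothetical PROXY_POOL list (Scrapy's HTTP proxy support reads the proxy from request.meta["proxy"]):

    import random

    PROXY_POOL = [  # hypothetical pool; the real spider presumably has its own proxy source
        "http://127.0.0.1:8001",
        "http://127.0.0.1:8002",
    ]

    class ChangeProxyMiddleware(object):
        """Assign a fresh proxy to requests flagged with change_proxy=True."""

        def process_request(self, request, spider):
            if request.meta.get("change_proxy"):
                request.meta["proxy"] = random.choice(PROXY_POOL)
                request.meta["change_proxy"] = False
            return None  # continue normal downloading

Note that re-yielding the same Request object is normally dropped by Scrapy's duplicate filter, so the original request presumably sets dont_filter=True somewhere upstream.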
Example #3
    def parse2(self, response):
        """ Scrape Weibo data """
        selector = Selector(response)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            tweetsItems = TweetsItem()
            id = tweet.xpath('@id').extract_first()  # Weibo ID
            content = tweet.xpath(
                'div/span[@class="ctt"]/text()').extract_first()  # tweet content
            cooridinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
            like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]',
                                  tweet.extract())  # number of reposts
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]',
                                 tweet.extract())  # number of comments
            others = tweet.xpath(
                'div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)

            if others:
                others = others.split(u"\u6765\u81ea")  # split time from source on "来自" ("via")
                hasMonth = re.findall(u'\u6708', others[0])  # "月" means an absolute "M月D日" date
                if hasMonth:
                    temp = others[0].split(u'\u6708')
                    month = temp[0]
                    day = temp[1].split(u'\u65e5')[0]
                    # keep only tweets from the current month, at most one day old
                    if int(month) == int(self.current_month) and (
                            int(self.current_day) - int(day)) <= 1:
                        tweetsItems["PubTime"] = others[0]
                        if len(others) == 2:
                            tweetsItems["Tools"] = others[1]
                    else:
                        continue
                else:
                    continue
            if content:
                choujiang = re.findall(u'\u62bd', content)  # keep only tweets mentioning "抽" (giveaway/lottery)
                if not choujiang:
                    continue
                tweetsItems["Content"] = content.strip(
                    u"[\u4f4d\u7f6e]")  # strip the trailing "[位置]" ("[location]")
            tweetsItems["ID"] = response.meta["ID"]
            tweetsItems["_id"] = id
            if cooridinates:
                cooridinates = re.findall('center=([\d|.|,]+)', cooridinates)
                if cooridinates:
                    tweetsItems["Co_oridinates"] = cooridinates[0]
            if like:
                tweetsItems["Like"] = int(like[0])
            if transfer:
                tweetsItems["Transfer"] = int(transfer[0])
            if comment:
                tweetsItems["Comment"] = int(comment[0])

            yield tweetsItems
Example #4
    def parse2(self, response):
        """ Scrape Weibo data """
        # print("========33333333=======")
        selector = Selector(response)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        # print(tweets)
        for tweet in tweets:
            tweetsItems = TweetsItem()
            id = tweet.xpath('@id').extract_first()  # Weibo ID
            content = tweet.xpath(
                'div/span[@class="ctt"]/text()').extract_first()  # tweet content
            cooridinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
            like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]',
                                  tweet.extract())  # number of reposts
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]',
                                 tweet.extract())  # number of comments
            others = tweet.xpath(
                'div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)

            tweetsItems["ID"] = response.meta["ID"]
            tweetsItems["_id"] = response.meta["ID"] + "-" + id
            if content:
                tweetsItems["Content"] = content.strip(
                    u"[\u4f4d\u7f6e]")  # strip the trailing "[位置]" ("[location]")
            if cooridinates:
                cooridinates = re.findall('center=([\d|.|,]+)', cooridinates)
                if cooridinates:
                    tweetsItems["Co_oridinates"] = cooridinates[0]
            if like:
                tweetsItems["Like"] = int(like[0])
            if transfer:
                tweetsItems["Transfer"] = int(transfer[0])
            if comment:
                tweetsItems["Comment"] = int(comment[0])
            if others:
                others = others.split(u"\u6765\u81ea")
                tweetsItems["PubTime"] = others[0]
                if len(others) == 2:
                    tweetsItems["Tools"] = others[1]
            # print(tweetsItems)
            yield tweetsItems
        url_next = selector.xpath(
            u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href'
        ).extract()  # the "下页" (next page) link
        if url_next:
            yield Request(url=self.host + url_next[0],
                          meta={"ID": response.meta["ID"]},
                          callback=self.parse2)
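
The like/repost/comment counts in all of these examples come from running the same three regexes over the raw HTML of a tweet block. A small self-contained illustration against an invented fragment in the style of the weibo.cn mobile markup (the fragment is made up for the demo):

    # -*- coding: utf-8 -*-
    import re

    html = (u'<div class="c" id="M_ABC123">...'
            u'<a href="https://weibo.cn/attitude/...">\u8d5e[12]</a>&nbsp;'      # 赞[12]
            u'<a href="https://weibo.cn/repost/...">\u8f6c\u53d1[3]</a>&nbsp;'   # 转发[3]
            u'<a href="https://weibo.cn/comment/..." class="cc">\u8bc4\u8bba[7]</a>...</div>')  # 评论[7]

    like = re.findall(u'\u8d5e\[(\d+)\]', html)            # likes
    transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]', html)  # reposts
    comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]', html)   # comments
    print(int(like[0]), int(transfer[0]), int(comment[0]))  # -> 12 3 7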
Example #5
    def parse_weibo1(self, response):
        selector = Selector(response)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            tweetsItem = TweetsItem()
            weibo_id = tweet.xpath('@id').extract_first()[2:]  # Weibo ID (drop the 2-character prefix of the div id)
            cmts = tweet.xpath('div/span[@class="cmt"]').extract()
            if len(cmts) > 2:  # more than two "cmt" spans: this entry is a repost
                r = tweet.xpath(u'div/span[text() = "转发理由:"]')  # the "repost reason:" span
                content = r.xpath('./../text()').extract_first()
                like = re.findall(u'\u8d5e\[(\d+)\]',
                                  ','.join(r.xpath(u'./../a/text()').extract()))  # number of likes
                transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]',
                                      ','.join(r.xpath(u'./../a/text()').extract()))  # number of reposts
                comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]',
                                     ','.join(r.xpath(u'./../a/text()').extract()))  # number of comments
                tweetsItem['Type'] = REPOST
                coordinates = None
            else:
                content = tweet.xpath('div/span[@class="ctt"]/text()').extract_first()  # tweet content
                coordinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
                tweetsItem['Type'] = ORIGINAL
                like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
                transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]', tweet.extract())  # number of reposts
                comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]', tweet.extract())  # number of comments
            others = tweet.xpath('div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)

            tweetsItem["ID"] = response.meta["ID"]

            tweetsItem["_id"] = response.meta["ID"] + "-" + weibo_id
            if content:
                tweetsItem["Content"] = content.strip(u"[\u4f4d\u7f6e]")  # strip the trailing "[位置]" ("[location]")
            if coordinates:
                coordinates = re.findall('center=([\d|.|,]+)', coordinates)
                if coordinates:
                    tweetsItem["Coordinates"] = coordinates[0]
            if like:
                tweetsItem["Like"] = int(like[0])
            if transfer:
                tweetsItem["Transfer"] = int(transfer[0])
            if comment:
                tweetsItem["Comment"] = int(comment[0])
            if others:
                others = others.split(u"\u6765\u81ea")
                tweetsItem["PubTime"] = others[0]
                if len(others) == 2:
                    tweetsItem["Tools"] = others[1]

            # yield Request(url=comment_url, meta={"weiboId": weibo_id, "current_page": 1}, callback=self.parse_comment)
            yield tweetsItem
Example #6
    def parse2(self, response):
        """ Scrape Weibo data """
        selector = Selector(response)

        # g = selector.xpath('//input[@name="mp"]/@value').extract_first()
        # mp = int(g)
        # b = response.url.split("=")
        # k = b[1]
        # j = int(k)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            tweetsItems = TweetsItem()
            id = tweet.xpath('@id').extract_first()  # Weibo ID

            ctt = tweet.xpath('div/span[@class="ctt"]')
            content = ctt.xpath('string(.)').extract()[0]  # full text of the span, including nested tags

            # content = tweet.xpath('div/span[@class="ctt"]/text()').extract_first()  # tweet content
            # cooridinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
            like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]', tweet.extract())  # number of reposts
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]', tweet.extract())  # number of comments
            others = tweet.xpath('div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)

            tweetsItems["ID"] = response.meta["ID"]
            tweetsItems["_id"] = response.meta["ID"] + "-" + id
            if content:
                tweetsItems["Content"] = content
                # tweetsItems["Content"] = content.strip(u"[\u4f4d\u7f6e]")  # strip the trailing "[位置]" ("[location]")

            # if cooridinates:
            #     cooridinates = re.findall('center=([\d|.|,]+)', cooridinates)
            #     if cooridinates:
            #         tweetsItems["Co_oridinates"] = cooridinates[0]
            if like:
                tweetsItems["Like"] = int(like[0])
            if transfer:
                tweetsItems["Transfer"] = int(transfer[0])
            if comment:
                tweetsItems["Comment"] = int(comment[0])
            if others:
                others = others.split(u"\u6765\u81ea")
                tweetsItems["PubTime"] = others[0]
                # if len(others) == 2:
                #     tweetsItems["Tools"] = others[1]
            yield tweetsItems
Example #7
    def parse2(self, response):
        """ Scrape Weibo data """
        selector = Selector(response)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            tweetsItems = TweetsItem()
            id = tweet.xpath('@id').extract_first()  # Weibo ID
            content = tweet.xpath(
                'div/span[@class="ctt"]/text()').extract_first()  # tweet content
            cooridinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
            others = tweet.xpath(
                'div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)

            tweetsItems["ID"] = response.meta["ID"]
            tweetsItems["_id"] = response.meta["ID"] + "-" + id
            if content:
                tweetsItems["Content"] = content.strip(
                    u"[\u4f4d\u7f6e]")  # strip the trailing "[位置]" ("[location]")
            else:
                tweetsItems["Content"] = u""
            if others:
                others = others.split(u"\u6765\u81ea")
                tweetsItems["PubTime"] = others[0]
                if len(others) == 2:
                    tweetsItems["Tools"] = others[1]
                else:
                    tweetsItems["Tools"] = u""
            yield tweetsItems
        url_next = selector.xpath(
            u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href'
        ).extract()  # the "下页" (next page) link
        if url_next:
            yield Request(url=self.host + url_next[0],
                          meta={"ID": response.meta["ID"]},
                          callback=self.parse2)
            print(self.host + url_next[0])
Example #8
    def parse_weibo(self, response):
        """ Scrape Weibo data """
        selector = Selector(response)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            tweetsItem = TweetsItem()
            weibo_id = tweet.xpath('@id').extract_first()[2:]  # Weibo ID (drop the 2-character prefix of the div id)
            cmts = tweet.xpath('div/span[@class="cmt"]').extract()
            if len(cmts) > 2:  # more than two "cmt" spans: this entry is a repost
                content = tweet.xpath(
                    u'div/span[text() = "转发理由:"]/../text()').extract_first()  # the "repost reason:" text
                tweetsItem['Type'] = REPOST
                coordinates = None
            else:
                content = tweet.xpath(
                    'div/span[@class="ctt"]/text()').extract_first()  # tweet content
                coordinates = tweet.xpath(
                    'div/a/@href').extract_first()  # location coordinates
                tweetsItem['Type'] = ORIGINAL
            like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]',
                                  tweet.extract())  # number of reposts
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]',
                                 tweet.extract())  # number of comments
            others = tweet.xpath(
                'div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)

            tweetsItem["ID"] = response.meta["ID"]
            comment_url = self.comment_pattern % (weibo_id, 1)

            tweetsItem["_id"] = response.meta["ID"] + "-" + weibo_id
            if content:
                tweetsItem["Content"] = content.strip(
                    u"[\u4f4d\u7f6e]")  # 去掉最后的"[位置]"
            if coordinates:
                coordinates = re.findall('center=([\d|.|,]+)', coordinates)
                if coordinates:
                    tweetsItem["Coordinates"] = coordinates[0]
            if like:
                tweetsItem["Like"] = int(like[0])
            if transfer:
                tweetsItem["Transfer"] = int(transfer[0])
            if comment:
                tweetsItem["Comment"] = int(comment[0])
            if others:
                others = others.split(u"\u6765\u81ea")
                tweetsItem["PubTime"] = others[0]
                if len(others) == 2:
                    tweetsItem["Tools"] = others[1]

            yield Request(url=comment_url,
                          meta={
                              "weiboId": weibo_id,
                              "current_page": 1
                          },
                          callback=self.parse_comment)
            yield tweetsItem
        next_page = response.meta['current_page'] + 1
        if 'max_page' not in response.meta:
            print(selector.xpath('body/div[@id="pagelist"]'))
            max_page = selector.xpath(
                'body/div[@id="pagelist"]/form/div/input[@name="mp"]/@value'
            ).extract_first()
            if max_page:
                response.meta['max_page'] = int(max_page)
            else:
                return
        print(response.meta['max_page'])
        if next_page <= response.meta['max_page']:
            response.meta['current_page'] = next_page
            yield Request(url=self.host + '/' + response.meta['ID'] +
                          '?page=' + str(next_page),
                          meta=response.meta,
                          callback=self.parse_weibo)
Example #9
    def parse2(self, response):
        """ Scrape Weibo data """
        selector = Selector(response)
        tweets = selector.xpath('body/div[@class="c" and @id]')
        ctime = datetime.now()
        crawl = True
        # lastCrawlTime = self.get_lastWeiboDate(response.meta["ID"])
        for tweet in tweets:
            tweetsItems = TweetsItem()
            id = tweet.xpath('@id').extract_first()  # Weibo ID

            content = ""  # tweet content
            for c in tweet.xpath(
                    'div/span[@class="ctt"]/text() | div/span[@class="ctt"]/a/text()'
                    '| div/a/text() | div/text()'):
                content += c.extract()  # concatenate all text fragments of the tweet
            content = content.split("\xa0")[0].replace("#", "").replace(
                "全文", "")  # drop "#" marks and the "全文" ("full text") link label
            cooridinates = tweet.xpath('div/a/@href').extract_first()  # location coordinates
            like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]',
                                  tweet.extract())  # number of reposts
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]',
                                 tweet.extract())  # number of comments
            others = tweet.xpath(
                'div/span[@class="ct"]/text()').extract_first()  # publish time and posting tool (phone or platform)

            tweetsItems["ID"] = response.meta["ID"]
            tweetsItems["_id"] = response.meta["ID"] + "-" + id
            if content:
                tweetsItems["Content"] = content.strip(
                    u"[\u4f4d\u7f6e]")  # 去掉最后的"[位置]"
            if cooridinates:
                cooridinates = re.findall('center=([\d|.|,]+)', cooridinates)
                if cooridinates:
                    tweetsItems["Co_oridinates"] = cooridinates[0]
            if like:
                tweetsItems["Like"] = int(like[0])
            if transfer:
                tweetsItems["Transfer"] = int(transfer[0])
            if comment:
                tweetsItems["Comment"] = int(comment[0])
            if others:
                others = others.split(u"\u6765\u81ea")
                others[0] = others[0].replace(u"\xa0", '')
                if "分钟前" in others[0]:
                    tweetsItems["PubTime"] = (
                        datetime.now() -
                        timedelta(minutes=int(others[0].split("分钟前")[0]))
                    ).strftime("%Y-%m-%d %H:%M:%S")
                elif "今天" in others[0]:
                    tweetsItems["PubTime"] = others[0].replace(
                        "今天",
                        datetime.now().strftime("%Y-%m-%d")) + ":00"
                elif "月" in others[0]:
                    tweetsItems["PubTime"] = (str(datetime.now().year) + "-" +
                                              others[0] + ":00").replace(
                                                  "月", "-").replace("日", '')
                else:
                    tweetsItems["PubTime"] = others[0]
                # if len(others) == 2:
                #     tweetsItems["Tools"] = others[1]
            # extract the hashtag
            tweetsItems["Label"] = tweet.xpath(
                'div/span[@class="ctt"]/a/text()').extract_first()  # hashtag
            ctime = datetime.strptime(tweetsItems["PubTime"],
                                      "%Y-%m-%d %H:%M:%S")
            if self.yetNotCrawled(tweetsItems["_id"]) == False:
                crawl = False
                break
            yield tweetsItems
        url_next = selector.xpath(
            u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href'
        ).extract()  # the "下页" (next page) link
        if url_next and crawl and self.tweetTimeDelta(ctime) <= 30:
            yield Request(url=self.host + url_next[0],
                          meta={"ID": response.meta["ID"]},
                          callback=self.parse2)
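
Example #9 normalizes the relative timestamps that weibo.cn serves ("N分钟前", "今天 HH:MM", "M月D日 HH:MM") inline. The same logic pulled out into a standalone helper, as a sketch of what that branch does rather than a drop-in replacement:

    from datetime import datetime, timedelta

    def normalize_pubtime(raw, now=None):
        """Mirror of the PubTime handling in Example #9; returns a 'YYYY-MM-DD HH:MM:SS'-style string."""
        now = now or datetime.now()
        raw = raw.replace(u"\xa0", "")
        if u"分钟前" in raw:  # "N分钟前" = N minutes ago
            minutes = int(raw.split(u"分钟前")[0])
            return (now - timedelta(minutes=minutes)).strftime("%Y-%m-%d %H:%M:%S")
        if u"今天" in raw:    # "今天 HH:MM" = today at HH:MM
            return raw.replace(u"今天", now.strftime("%Y-%m-%d")) + ":00"
        if u"月" in raw:      # "M月D日 HH:MM", assume the current year
            return (str(now.year) + "-" + raw + ":00").replace(u"月", "-").replace(u"日", "")
        return raw            # already an absolute timestamp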