def parse_page(self, response):
    """Parse a tweet-list page and request each tweet's comment page.

    Every ``div.c`` element that carries an ``id`` attribute is treated as
    one tweet.  Its text, timestamp, coordinates and like/repost/comment
    counters are collected into a ``TweetsItem``.  The item is not yielded
    directly: it travels in the ``meta`` of a ``Request`` for the tweet's
    comment link, whose callback is ``self.parse_fans_name``.  Tweets for
    which no comment link is found produce nothing.

    :param response: downloaded page to parse.
    :return: generator of ``Request`` objects.
    """
    # Literal "[location]" marker appended to geo-tagged tweets.
    location_tag = u"[\u4f4d\u7f6e]"
    # Counters are rendered as "<label>[<number>]" in the tweet markup.
    like_re = re.compile(u'\u8d5e\\[(\\d+)\\]')
    transfer_re = re.compile(u'\u8f6c\u53d1\\[(\\d+)\\]')
    comment_re = re.compile(u'\u8bc4\u8bba\\[(\\d+)\\]')
    anchor_re = re.compile(r"<a.*?href=.*?[\d+].*?<\/a>", re.I | re.S | re.M)
    # Written without the Python-2-only ``ur`` prefix; the pattern value is
    # unchanged.
    comment_link_re = re.compile(
        u'"(http.*?)" class="cc".*>\u8bc4\u8bba\\[\\d+\\]</a>')
    coordinates_re = re.compile(r'center=([\d|.|,]+)')

    selector = Selector(response)
    for tweet in selector.xpath('body/div[@class="c" and @id]'):
        html = tweet.extract()  # serialise once, reuse for every regex

        # When several anchors match, the last one wins.
        comment_link = ""
        for anchor in anchor_re.findall(html):
            found = comment_link_re.findall(anchor)
            if found:
                comment_link = found[0]
        if not comment_link:
            # Without a comment page there is nothing to request.
            continue

        content = tweet.xpath(
            'div/span[@class="ctt"]/text()').extract_first()
        # All text nodes of the timestamp span, kept as a list.
        ctime = tweet.xpath('div/span[@class="ct"]/text()').extract()
        first_href = tweet.xpath('div/a/@href').extract_first()

        item = TweetsItem()
        item["ID"] = "cctv"
        if content:
            # str.strip(chars) would remove any of these characters from
            # both ends; only the exact trailing tag must go.
            if content.endswith(location_tag):
                content = content[:-len(location_tag)]
            item["Content"] = content
        if ctime:
            item["Time"] = ctime
        if first_href:
            coordinates = coordinates_re.findall(first_href)
            if coordinates:
                item["Co_oridinates"] = coordinates[0]

        like = like_re.search(html)
        if like:
            item["Like"] = int(like.group(1))
        transfer = transfer_re.search(html)
        if transfer:
            item["Transfer"] = int(transfer.group(1))
        comment = comment_re.search(html)
        if comment:
            item["Comment"] = int(comment.group(1))

        yield Request(url=comment_link,
                      meta={
                          'tweetsItems': item,
                          'comment_link': comment_link
                      },
                      callback=self.parse_fans_name)
def parse2(self, response):
    """Collect repost links and the tweets shown on a user page.

    * An empty response body re-yields the original request with
      ``meta["change_proxy"]`` set to ``True``.
    * Every link starting with ``http://weibo.cn/repost/`` that is not in
      ``self.finishurllist`` is added to ``self.starturllist``.
    * The user id is read from the ``uid=<digits>`` text inside the
      ``div.u/div.tip2`` block, and one ``TweetsItem`` is yielded per
      ``div.c`` element that has an ``id`` attribute.

    :param response: downloaded page to parse.
    :return: generator yielding either the retried request or items.
    """
    if not response.body:
        # ``== ""`` is never true for a bytes body on Python 3; a
        # truthiness test works for both str and bytes.
        req = response.request
        req.meta["change_proxy"] = True
        yield req
        return

    # Literal "[location]" marker appended to geo-tagged tweets.
    location_tag = u"[\u4f4d\u7f6e]"
    like_re = re.compile(u'\u8d5e\\[(\\d+)\\]')
    transfer_re = re.compile(u'\u8f6c\u53d1\\[(\\d+)\\]')
    comment_re = re.compile(u'\u8bc4\u8bba\\[(\\d+)\\]')
    coordinates_re = re.compile(r'center=([\d|.|,]+)')

    selector = Selector(response)

    # Queue repost pages that have not been processed yet.
    for href in selector.xpath('//a[@href]/@href').extract():
        if (href.startswith('http://weibo.cn/repost/')
                and href not in self.finishurllist):
            self.starturllist.add(href)

    # Extract the user id.
    ID = None
    tip_html = selector.xpath(
        'body/div[@class="u"]/div[@class="tip2"]').extract_first()
    if tip_html:
        uid_match = re.search(r'uid=(\d+)', tip_html)
        if uid_match:
            ID = uid_match.group(1)

    tweets = selector.xpath('body/div[@class="c" and @id]')
    if not tweets:
        return
    if ID is None:
        # Previously this path failed with NameError/IndexError on the
        # first tweet; items cannot be keyed without a user id.
        import logging
        logging.getLogger(__name__).warning(
            "parse2: no uid found on %s; skipping its tweets", response.url)
        return

    for tweet in tweets:
        item = TweetsItem()
        tweet_id = tweet.xpath('@id').extract_first()
        content = tweet.xpath(
            'div/span[@class="ctt"]/text()').extract_first()
        first_href = tweet.xpath('div/a/@href').extract_first()
        # Publication time and client, separated by the "from" marker.
        others = tweet.xpath(
            'div/span[@class="ct"]/text()').extract_first()
        html = tweet.extract()  # serialise once, reuse for every regex

        item["ID"] = ID
        item["_id"] = ID + "-" + tweet_id
        item["tweetID"] = tweet_id
        if content:
            # str.strip(chars) would remove any of these characters from
            # both ends; only the exact trailing tag must go.
            if content.endswith(location_tag):
                content = content[:-len(location_tag)]
            item["Content"] = content
        if first_href:
            coordinates = coordinates_re.findall(first_href)
            if coordinates:
                item["Co_oridinates"] = coordinates[0]

        like = like_re.search(html)
        if like:
            item["Like"] = int(like.group(1))
        transfer = transfer_re.search(html)
        if transfer:
            item["Transfer"] = int(transfer.group(1))
        comment = comment_re.search(html)
        if comment:
            item["Comment"] = int(comment.group(1))

        if others:
            parts = others.split(u"\u6765\u81ea")
            item["PubTime"] = parts[0]
            if len(parts) == 2:
                item["Tools"] = parts[1]
        yield item
def parse2(self, response):
    """Yield recent tweets whose text contains the U+62BD character.

    A tweet from ``div.c[@id]`` is kept only when all of these hold:

    * if it has a timestamp span, the text before the "from" marker is in
      month/day form, the month equals ``self.current_month`` and
      ``self.current_day - day`` is at most 1;
    * if it has text content, that content contains ``u'\\u62bd'``.

    Tweets with no timestamp span skip the date test, and tweets with no
    text skip the content test.

    :param response: downloaded page; ``response.meta["ID"]`` is stored on
        every item as ``ID``.
    :return: generator of ``TweetsItem`` objects.
    """
    # Literal "[location]" marker appended to geo-tagged tweets.
    location_tag = u"[\u4f4d\u7f6e]"
    month_mark = u'\u6708'
    day_mark = u'\u65e5'
    keyword = u'\u62bd'
    like_re = re.compile(u'\u8d5e\\[(\\d+)\\]')
    transfer_re = re.compile(u'\u8f6c\u53d1\\[(\\d+)\\]')
    comment_re = re.compile(u'\u8bc4\u8bba\\[(\\d+)\\]')
    coordinates_re = re.compile(r'center=([\d|.|,]+)')

    selector = Selector(response)
    for tweet in selector.xpath('body/div[@class="c" and @id]'):
        item = TweetsItem()
        tweet_id = tweet.xpath('@id').extract_first()
        content = tweet.xpath(
            'div/span[@class="ctt"]/text()').extract_first()
        # Publication time and client, separated by the "from" marker.
        others = tweet.xpath(
            'div/span[@class="ct"]/text()').extract_first()

        if others:
            parts = others.split(u"\u6765\u81ea")
            stamp = parts[0]
            # NOTE(review): timestamps without a month marker (presumably
            # relative ones such as "today") are skipped -- confirm that
            # this is intended.
            if month_mark not in stamp:
                continue
            pieces = stamp.split(month_mark)
            month = pieces[0]
            day = pieces[1].split(day_mark)[0]
            if (int(month) != int(self.current_month)
                    or int(self.current_day) - int(day) > 1):
                continue
            item["PubTime"] = stamp
            if len(parts) == 2:
                item["Tools"] = parts[1]

        if content:
            if keyword not in content:
                continue
            # str.strip(chars) would remove any of these characters from
            # both ends; only the exact trailing tag must go.
            if content.endswith(location_tag):
                content = content[:-len(location_tag)]
            item["Content"] = content

        item["ID"] = response.meta["ID"]
        item["_id"] = tweet_id

        first_href = tweet.xpath('div/a/@href').extract_first()
        if first_href:
            coordinates = coordinates_re.findall(first_href)
            if coordinates:
                item["Co_oridinates"] = coordinates[0]

        html = tweet.extract()  # serialise once, reuse for every regex
        like = like_re.search(html)
        if like:
            item["Like"] = int(like.group(1))
        transfer = transfer_re.search(html)
        if transfer:
            item["Transfer"] = int(transfer.group(1))
        comment = comment_re.search(html)
        if comment:
            item["Comment"] = int(comment.group(1))
        yield item
def parse2(self, response):
    """Yield one ``TweetsItem`` per tweet on the page, then follow paging.

    Each ``div.c`` element with an ``id`` attribute becomes an item keyed
    by ``"<user ID>-<tweet id>"``.  Text, coordinates, counters,
    publication time and client are set only when found.  If the page list
    contains a "next page" link, a ``Request`` for ``self.host`` plus that
    link is yielded with the same ``ID`` in ``meta`` and this method as
    callback.

    :param response: downloaded page; ``response.meta["ID"]`` is the user id.
    :return: generator of ``TweetsItem`` objects and at most one ``Request``.
    """
    # Literal "[location]" marker appended to geo-tagged tweets.
    location_tag = u"[\u4f4d\u7f6e]"
    like_re = re.compile(u'\u8d5e\\[(\\d+)\\]')
    transfer_re = re.compile(u'\u8f6c\u53d1\\[(\\d+)\\]')
    comment_re = re.compile(u'\u8bc4\u8bba\\[(\\d+)\\]')
    coordinates_re = re.compile(r'center=([\d|.|,]+)')

    user_id = response.meta["ID"]
    selector = Selector(response)
    for tweet in selector.xpath('body/div[@class="c" and @id]'):
        item = TweetsItem()
        tweet_id = tweet.xpath('@id').extract_first()
        content = tweet.xpath(
            'div/span[@class="ctt"]/text()').extract_first()
        first_href = tweet.xpath('div/a/@href').extract_first()
        # Publication time and client, separated by the "from" marker.
        others = tweet.xpath(
            'div/span[@class="ct"]/text()').extract_first()
        html = tweet.extract()  # serialise once, reuse for every regex

        item["ID"] = user_id
        item["_id"] = user_id + "-" + tweet_id
        if content:
            # str.strip(chars) would remove any of these characters from
            # both ends; only the exact trailing tag must go.
            if content.endswith(location_tag):
                content = content[:-len(location_tag)]
            item["Content"] = content
        if first_href:
            coordinates = coordinates_re.findall(first_href)
            if coordinates:
                item["Co_oridinates"] = coordinates[0]

        like = like_re.search(html)
        if like:
            item["Like"] = int(like.group(1))
        transfer = transfer_re.search(html)
        if transfer:
            item["Transfer"] = int(transfer.group(1))
        comment = comment_re.search(html)
        if comment:
            item["Comment"] = int(comment.group(1))

        if others:
            parts = others.split(u"\u6765\u81ea")
            item["PubTime"] = parts[0]
            if len(parts) == 2:
                item["Tools"] = parts[1]
        yield item

    # Follow the "next page" link when the page list has one.
    url_next = selector.xpath(
        u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href'
    ).extract()
    if url_next:
        yield Request(url=self.host + url_next[0],
                      meta={"ID": user_id},
                      callback=self.parse2)
def parse_weibo1(self, response):
    """Yield one ``TweetsItem`` per tweet, telling reposts from originals.

    A tweet with more than two ``span.cmt`` children is recorded as
    ``REPOST``: its text is the first text node next to the repost-reason
    span, and the counters are read from the link texts in that same
    parent element.  Any other tweet is recorded as ``ORIGINAL``: its text
    comes from ``span.ctt`` and the counters from the whole tweet markup.

    The tweet id is the element's ``id`` attribute without its first two
    characters.

    :param response: downloaded page; ``response.meta["ID"]`` is the user id.
    :return: generator of ``TweetsItem`` objects.
    """
    # Literal "[location]" marker appended to geo-tagged tweets.
    location_tag = u"[\u4f4d\u7f6e]"
    like_re = re.compile(u'\u8d5e\\[(\\d+)\\]')
    transfer_re = re.compile(u'\u8f6c\u53d1\\[(\\d+)\\]')
    comment_re = re.compile(u'\u8bc4\u8bba\\[(\\d+)\\]')
    coordinates_re = re.compile(r'center=([\d|.|,]+)')

    user_id = response.meta["ID"]
    selector = Selector(response)
    for tweet in selector.xpath('body/div[@class="c" and @id]'):
        item = TweetsItem()
        weibo_id = tweet.xpath('@id').extract_first()[2:]
        cmts = tweet.xpath('div/span[@class="cmt"]').extract()

        if len(cmts) > 2:
            reason = tweet.xpath(u'div/span[text() = "转发理由:"]')
            content = reason.xpath('./../text()').extract_first()
            # Counters of the repost itself sit in the links next to the
            # reason span; query them once and share the joined text.
            counter_text = ','.join(
                reason.xpath(u'./../a/text()').extract())
            item['Type'] = REPOST
            first_href = None
        else:
            content = tweet.xpath(
                'div/span[@class="ctt"]/text()').extract_first()
            first_href = tweet.xpath('div/a/@href').extract_first()
            item['Type'] = ORIGINAL
            counter_text = tweet.extract()

        # Publication time and client, separated by the "from" marker.
        others = tweet.xpath(
            'div/span[@class="ct"]/text()').extract_first()

        item["ID"] = user_id
        item["_id"] = user_id + "-" + weibo_id
        if content:
            # str.strip(chars) would remove any of these characters from
            # both ends; only the exact trailing tag must go.
            if content.endswith(location_tag):
                content = content[:-len(location_tag)]
            item["Content"] = content
        if first_href:
            coordinates = coordinates_re.findall(first_href)
            if coordinates:
                item["Coordinates"] = coordinates[0]

        like = like_re.search(counter_text)
        if like:
            item["Like"] = int(like.group(1))
        transfer = transfer_re.search(counter_text)
        if transfer:
            item["Transfer"] = int(transfer.group(1))
        comment = comment_re.search(counter_text)
        if comment:
            item["Comment"] = int(comment.group(1))

        if others:
            parts = others.split(u"\u6765\u81ea")
            item["PubTime"] = parts[0]
            if len(parts) == 2:
                item["Tools"] = parts[1]
        yield item
def parse2(self, response):
    """Yield one ``TweetsItem`` per tweet with its full text and counters.

    The text is the XPath string value of the tweet's ``span.ctt`` element,
    so text nested in child elements is included.  Coordinates and the
    client name are not collected by this variant.

    :param response: downloaded page; ``response.meta["ID"]`` is the user id.
    :return: generator of ``TweetsItem`` objects.
    """
    like_re = re.compile(u'\u8d5e\\[(\\d+)\\]')
    transfer_re = re.compile(u'\u8f6c\u53d1\\[(\\d+)\\]')
    comment_re = re.compile(u'\u8bc4\u8bba\\[(\\d+)\\]')

    user_id = response.meta["ID"]
    selector = Selector(response)
    for tweet in selector.xpath('body/div[@class="c" and @id]'):
        item = TweetsItem()
        tweet_id = tweet.xpath('@id').extract_first()
        # extract_first() gives None when the tweet has no span.ctt, where
        # ``.extract()[0]`` raised IndexError.
        content = tweet.xpath(
            'div/span[@class="ctt"]').xpath('string(.)').extract_first()
        # Publication time and client, separated by the "from" marker.
        others = tweet.xpath(
            'div/span[@class="ct"]/text()').extract_first()
        html = tweet.extract()  # serialise once, reuse for every regex

        item["ID"] = user_id
        item["_id"] = user_id + "-" + tweet_id
        if content:
            item["Content"] = content

        like = like_re.search(html)
        if like:
            item["Like"] = int(like.group(1))
        transfer = transfer_re.search(html)
        if transfer:
            item["Transfer"] = int(transfer.group(1))
        comment = comment_re.search(html)
        if comment:
            item["Comment"] = int(comment.group(1))

        if others:
            # Only the part before the "from" marker is kept.
            item["PubTime"] = others.split(u"\u6765\u81ea")[0]
        yield item
def parse2(self, response):
    """Yield text, time and client of every tweet, then follow paging.

    ``Content`` is always set (empty string when the tweet has no text).
    When a timestamp span exists, ``PubTime`` is set and ``Tools`` is
    either the client name or an empty string.  If the page list contains
    a "next page" link, a ``Request`` for it is yielded and its URL is
    printed afterwards.

    :param response: downloaded page; ``response.meta["ID"]`` is the user id.
    :return: generator of ``TweetsItem`` objects and at most one ``Request``.
    """
    # Literal "[location]" marker appended to geo-tagged tweets.
    location_tag = u"[\u4f4d\u7f6e]"

    user_id = response.meta["ID"]
    selector = Selector(response)
    for tweet in selector.xpath('body/div[@class="c" and @id]'):
        item = TweetsItem()
        tweet_id = tweet.xpath('@id').extract_first()
        content = tweet.xpath(
            'div/span[@class="ctt"]/text()').extract_first()
        # Publication time and client, separated by the "from" marker.
        others = tweet.xpath(
            'div/span[@class="ct"]/text()').extract_first()

        item["ID"] = user_id
        item["_id"] = user_id + "-" + tweet_id
        if content:
            # str.strip(chars) would remove any of these characters from
            # both ends; only the exact trailing tag must go.
            if content.endswith(location_tag):
                content = content[:-len(location_tag)]
            item["Content"] = content
        else:
            item["Content"] = u""

        if others:
            parts = others.split(u"\u6765\u81ea")
            item["PubTime"] = parts[0]
            item["Tools"] = parts[1] if len(parts) == 2 else u""
        yield item

    # Follow the "next page" link when the page list has one.
    url_next = selector.xpath(
        u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href'
    ).extract()
    if url_next:
        next_url = self.host + url_next[0]
        yield Request(url=next_url,
                      meta={"ID": user_id},
                      callback=self.parse2)
        print(next_url)
def parse_weibo(self, response):
    """Yield tweets and their comment-page requests, then the next page.

    For every ``div.c`` element with an ``id`` attribute this yields, in
    order, a ``Request`` for page 1 of the tweet's comments (URL built from
    ``self.comment_pattern``, callback ``self.parse_comment``) and the
    populated ``TweetsItem``.  A tweet with more than two ``span.cmt``
    children is typed ``REPOST``, any other ``ORIGINAL``.

    Paging is driven by ``response.meta``: ``current_page`` must be
    present; ``max_page`` is read from the page list's ``mp`` input the
    first time and stored in ``meta``.  If it cannot be read the generator
    stops.  While the next page number does not exceed ``max_page`` a
    ``Request`` for it is yielded with the same, updated ``meta``.

    :param response: downloaded page; ``meta`` must contain ``ID`` and
        ``current_page``.
    :return: generator of ``Request`` and ``TweetsItem`` objects.
    """
    # Literal "[location]" marker appended to geo-tagged tweets.
    location_tag = u"[\u4f4d\u7f6e]"
    like_re = re.compile(u'\u8d5e\\[(\\d+)\\]')
    transfer_re = re.compile(u'\u8f6c\u53d1\\[(\\d+)\\]')
    comment_re = re.compile(u'\u8bc4\u8bba\\[(\\d+)\\]')
    coordinates_re = re.compile(r'center=([\d|.|,]+)')

    selector = Selector(response)
    for tweet in selector.xpath('body/div[@class="c" and @id]'):
        item = TweetsItem()
        # The id attribute minus its first two characters.
        weibo_id = tweet.xpath('@id').extract_first()[2:]
        cmts = tweet.xpath('div/span[@class="cmt"]').extract()

        if len(cmts) > 2:
            content = tweet.xpath(
                u'div/span[text() = "转发理由:"]/../text()').extract_first()
            item['Type'] = REPOST
            first_href = None
        else:
            content = tweet.xpath(
                'div/span[@class="ctt"]/text()').extract_first()
            first_href = tweet.xpath('div/a/@href').extract_first()
            item['Type'] = ORIGINAL

        # Publication time and client, separated by the "from" marker.
        others = tweet.xpath(
            'div/span[@class="ct"]/text()').extract_first()
        html = tweet.extract()  # serialise once, reuse for every regex

        item["ID"] = response.meta["ID"]
        comment_url = self.comment_pattern % (weibo_id, 1)
        item["_id"] = response.meta["ID"] + "-" + weibo_id
        if content:
            # str.strip(chars) would remove any of these characters from
            # both ends; only the exact trailing tag must go.
            if content.endswith(location_tag):
                content = content[:-len(location_tag)]
            item["Content"] = content
        if first_href:
            coordinates = coordinates_re.findall(first_href)
            if coordinates:
                item["Coordinates"] = coordinates[0]

        like = like_re.search(html)
        if like:
            item["Like"] = int(like.group(1))
        transfer = transfer_re.search(html)
        if transfer:
            item["Transfer"] = int(transfer.group(1))
        comment = comment_re.search(html)
        if comment:
            item["Comment"] = int(comment.group(1))

        if others:
            parts = others.split(u"\u6765\u81ea")
            item["PubTime"] = parts[0]
            if len(parts) == 2:
                item["Tools"] = parts[1]

        yield Request(url=comment_url,
                      meta={
                          "weiboId": weibo_id,
                          "current_page": 1
                      },
                      callback=self.parse_comment)
        yield item

    next_page = response.meta['current_page'] + 1
    if 'max_page' not in response.meta:
        # Single-argument print(...) prints the same on Python 2 and 3.
        print(selector.xpath('body/div[@id="pagelist"]'))
        max_page = selector.xpath(
            'body/div[@id="pagelist"]/form/div/input[@name="mp"]/@value'
        ).extract_first()
        if not max_page:
            # No page count available, so there is nothing to paginate.
            return
        response.meta['max_page'] = int(max_page)
    print(response.meta['max_page'])

    if next_page <= response.meta['max_page']:
        response.meta['current_page'] = next_page
        yield Request(url=self.host + '/' + response.meta['ID'] +
                      '?page=' + str(next_page),
                      meta=response.meta,
                      callback=self.parse_weibo)
def parse2(self, response):
    """Yield not-yet-crawled tweets with normalised publication times.

    For each ``div.c`` element with an ``id`` attribute:

    * the text is the concatenation of several text nodes of the tweet,
      cut at the first non-breaking space, with ``#`` and the "full text"
      link label removed;
    * the timestamp before the "from" marker is rewritten towards
      ``%Y-%m-%d %H:%M:%S`` for the "N minutes ago", "today HH:MM" and
      "MM-month DD-day HH:MM" forms; any other text is stored unchanged;
    * ``Label`` is the first link text inside ``span.ctt``.

    Iteration stops at the first tweet for which
    ``self.yetNotCrawled(_id)`` returns ``False``.  The next page is
    requested only if no such tweet was met and
    ``self.tweetTimeDelta(ctime) <= 30``, where ``ctime`` is the last
    successfully parsed publication time (the current time if none).

    :param response: downloaded page; ``response.meta["ID"]`` is the user id.
    :return: generator of ``TweetsItem`` objects and at most one ``Request``.
    """
    # Literal "[location]" marker appended to geo-tagged tweets.
    location_tag = u"[\u4f4d\u7f6e]"
    like_re = re.compile(u'\u8d5e\\[(\\d+)\\]')
    transfer_re = re.compile(u'\u8f6c\u53d1\\[(\\d+)\\]')
    comment_re = re.compile(u'\u8bc4\u8bba\\[(\\d+)\\]')
    coordinates_re = re.compile(r'center=([\d|.|,]+)')

    user_id = response.meta["ID"]
    selector = Selector(response)
    tweets = selector.xpath('body/div[@class="c" and @id]')
    now = datetime.now()  # one reference instant for the whole page
    ctime = now
    crawl = True

    for tweet in tweets:
        item = TweetsItem()
        tweet_id = tweet.xpath('@id').extract_first()
        content = "".join(tweet.xpath(
            'div/span[@class="ctt"]/text() | div/span[@class="ctt"]/a/text()'
            '| div/a/text() | div/text()').extract())
        content = content.split("\xa0")[0].replace("#", "").replace("全文", "")
        first_href = tweet.xpath('div/a/@href').extract_first()
        # Publication time and client, separated by the "from" marker.
        others = tweet.xpath(
            'div/span[@class="ct"]/text()').extract_first()
        html = tweet.extract()  # serialise once, reuse for every regex

        item["ID"] = user_id
        item["_id"] = user_id + "-" + tweet_id
        if content:
            # str.strip(chars) would remove any of these characters from
            # both ends; only the exact trailing tag must go.
            if content.endswith(location_tag):
                content = content[:-len(location_tag)]
            item["Content"] = content
        if first_href:
            coordinates = coordinates_re.findall(first_href)
            if coordinates:
                item["Co_oridinates"] = coordinates[0]

        like = like_re.search(html)
        if like:
            item["Like"] = int(like.group(1))
        transfer = transfer_re.search(html)
        if transfer:
            item["Transfer"] = int(transfer.group(1))
        comment = comment_re.search(html)
        if comment:
            item["Comment"] = int(comment.group(1))

        pub_time = None
        if others:
            stamp = others.split(u"\u6765\u81ea")[0].replace(u"\xa0", '')
            if "分钟前" in stamp:
                minutes = int(stamp.split("分钟前")[0])
                pub_time = (now - timedelta(minutes=minutes)).strftime(
                    "%Y-%m-%d %H:%M:%S")
            elif "今天" in stamp:
                pub_time = stamp.replace(
                    "今天", now.strftime("%Y-%m-%d")) + ":00"
            elif "月" in stamp:
                pub_time = (str(now.year) + "-" + stamp + ":00").replace(
                    "月", "-").replace("日", '')
            else:
                pub_time = stamp
            item["PubTime"] = pub_time

        # Extract the hashtag.
        item["Label"] = tweet.xpath(
            'div/span[@class="ctt"]/a/text()').extract_first()

        if pub_time:
            try:
                ctime = datetime.strptime(pub_time, "%Y-%m-%d %H:%M:%S")
            except ValueError:
                # Unrecognised timestamp text: keep the previous ctime
                # rather than aborting the whole page.
                pass

        # Kept as ``== False`` so that a non-bool return such as None is
        # still treated as "continue crawling".
        if self.yetNotCrawled(item["_id"]) == False:
            crawl = False
            break
        yield item

    # Follow the "next page" link when the page list has one.
    url_next = selector.xpath(
        u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href'
    ).extract()
    if url_next and crawl and self.tweetTimeDelta(ctime) <= 30:
        yield Request(url=self.host + url_next[0],
                      meta={"ID": user_id},
                      callback=self.parse2)