def parse_weibo(self, response):
    """Parse the weibo posts contained in a search-result page.

    Yields one ``{'weibo': WeiboItem, 'keyword': str}`` dict per retweeted
    (original) post found inside a card, followed by one for the card's own
    post.  The search keyword is carried through ``response.meta``.
    """
    keyword = response.meta.get('keyword')
    # Each search result is rendered as one "card-wrap" div.
    for sel in response.xpath("//div[@class='card-wrap']"):
        info = sel.xpath(
            "div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
        )
        # Cards without an "info" div are ads/placeholders — skip them.
        if info:
            weibo = WeiboItem()
            weibo['id'] = sel.xpath('@mid').extract_first()
            # bid is the last path segment of the post's permalink
            # (query string stripped).
            weibo['bid'] = sel.xpath(
                '(.//p[@class="from"])[last()]/a[1]/@href').extract_first(
                ).split('/')[-1].split('?')[0]
            weibo['user_id'] = info[0].xpath(
                'div[2]/a/@href').extract_first().split('?')[0].split(
                    '/')[-1]
            weibo['screen_name'] = info[0].xpath(
                'div[2]/a/@nick-name').extract_first()
            # Default text selector: the first p.txt of the card.
            txt_sel = sel.xpath('.//p[@class="txt"]')[0]
            # Presence of div.card-comment marks a retweet card.
            retweet_sel = sel.xpath('.//div[@class="card-comment"]')
            retweet_txt_sel = ''
            if retweet_sel and retweet_sel[0].xpath('.//p[@class="txt"]'):
                retweet_txt_sel = retweet_sel[0].xpath(
                    './/p[@class="txt"]')[0]
            # Long posts carry an extra "full content" paragraph; prefer it
            # over the truncated p.txt when present.
            content_full = sel.xpath(
                './/p[@node-type="feed_list_content_full"]')
            is_long_weibo = False
            is_long_retweet = False
            if content_full:
                if not retweet_sel:
                    # No retweet: the single full node is the post itself.
                    txt_sel = content_full[0]
                    is_long_weibo = True
                elif len(content_full) == 2:
                    # Both the post and the retweeted post are long.
                    txt_sel = content_full[0]
                    retweet_txt_sel = content_full[1]
                    is_long_weibo = True
                    is_long_retweet = True
                elif retweet_sel[0].xpath(
                        './/p[@node-type="feed_list_content_full"]'):
                    # Only the retweeted post is long.
                    retweet_txt_sel = retweet_sel[0].xpath(
                        './/p[@node-type="feed_list_content_full"]')[0]
                    is_long_retweet = True
                else:
                    # Only the outer post is long.
                    txt_sel = content_full[0]
                    is_long_weibo = True
            # Strip zero-width spaces and the private-use icon glyph.
            weibo['text'] = txt_sel.xpath(
                'string(.)').extract_first().replace('\u200b', '').replace(
                    '\ue627', '')
            weibo['article_url'] = self.get_article_url(txt_sel)
            weibo['location'] = self.get_location(txt_sel)
            if weibo['location']:
                # Remove the inline location fragment ('2' + place name —
                # presumably '2' renders as a map-pin glyph; confirm).
                weibo['text'] = weibo['text'].replace(
                    '2' + weibo['location'], '')
            # Drop the 2 leading chars and all spaces — NOTE(review):
            # presumably leading decoration chars; confirm against page DOM.
            weibo['text'] = weibo['text'][2:].replace(' ', '')
            if is_long_weibo:
                # Trim the trailing 6 chars — presumably the
                # "collapse"/"展开全文" link text; confirm.
                weibo['text'] = weibo['text'][:-6]
            weibo['at_users'] = self.get_at_users(txt_sel)
            weibo['topics'] = self.get_topics(txt_sel)
            reposts_count = sel.xpath(
                './/a[@action-type="feed_list_forward"]/text()'
            ).extract_first()
            try:
                reposts_count = re.findall(r'\d+.*', reposts_count)
            except TypeError:
                # extract_first() returned None: the page layout we expect
                # is absent, which here signals an invalid/expired cookie.
                print('cookie无效或已过期,请按照'
                      'https://github.com/dataabc/weibo-search#如何获取cookie'
                      ' 获取cookie')
                raise CloseSpider()
            weibo['reposts_count'] = reposts_count[
                0] if reposts_count else '0'
            comments_count = sel.xpath(
                './/a[@action-type="feed_list_comment"]/text()'
            ).extract_first()
            comments_count = re.findall(r'\d+.*', comments_count)
            weibo['comments_count'] = comments_count[
                0] if comments_count else '0'
            attitudes_count = sel.xpath(
                '(.//a[@action-type="feed_list_like"])[last()]/em/text()'
            ).extract_first()
            weibo['attitudes_count'] = (attitudes_count
                                        if attitudes_count else '0')
            # Relative timestamp, e.g. "x分钟前" — strip whitespace and the
            # "前" suffix, then normalize via util.standardize_date.
            created_at = sel.xpath(
                '(.//p[@class="from"])[last()]/a[1]/text()').extract_first(
                ).replace(' ', '').replace('\n', '').split('前')[0]
            weibo['created_at'] = util.standardize_date(created_at)
            source = sel.xpath('(.//p[@class="from"])[last()]/a[2]/text()'
                               ).extract_first()
            weibo['source'] = source if source else ''
            pics = ''
            is_exist_pic = sel.xpath(
                './/div[@class="media media-piclist"]')
            if is_exist_pic:
                pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
                # Drop the leading 2 chars (presumably the protocol-relative
                # '//'), swap the first path segment for /large/ to get the
                # full-size image, then prepend the scheme.
                pics = [pic[2:] for pic in pics]
                pics = [
                    re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics
                ]
                pics = ['http://' + pic for pic in pics]
            video_url = ''
            is_exist_video = sel.xpath(
                './/div[@class="thumbnail"]/a/@action-data')
            if is_exist_video:
                # action-data is URL-encoded; the video URL follows
                # 'video_src=//'.
                video_url = is_exist_video.extract_first()
                video_url = unquote(
                    str(video_url)).split('video_src=//')[-1]
                video_url = 'http://' + video_url
            # Media found on a retweet card belongs to the retweeted post,
            # not to the outer post.
            if not retweet_sel:
                weibo['pics'] = pics
                weibo['video_url'] = video_url
            else:
                weibo['pics'] = ''
                weibo['video_url'] = ''
            weibo['retweet_id'] = ''
            if retweet_sel and retweet_sel[0].xpath(
                    './/div[@node-type="feed_list_forwardContent"]/a[1]'):
                retweet = WeiboItem()
                # action-data has a 4-char prefix before the numeric id —
                # NOTE(review): presumably 'mid='; confirm.
                retweet['id'] = retweet_sel[0].xpath(
                    './/a[@action-type="feed_list_like"]/@action-data'
                ).extract_first()[4:]
                retweet['bid'] = retweet_sel[0].xpath(
                    './/p[@class="from"]/a/@href').extract_first().split(
                        '/')[-1].split('?')[0]
                info = retweet_sel[0].xpath(
                    './/div[@node-type="feed_list_forwardContent"]/a[1]'
                )[0]
                retweet['user_id'] = info.xpath(
                    '@href').extract_first().split('/')[-1]
                retweet['screen_name'] = info.xpath(
                    '@nick-name').extract_first()
                # Same text cleanup as for the outer post.
                retweet['text'] = retweet_txt_sel.xpath(
                    'string(.)').extract_first().replace('\u200b',
                                                         '').replace(
                                                             '\ue627', '')
                retweet['article_url'] = self.get_article_url(
                    retweet_txt_sel)
                retweet['location'] = self.get_location(retweet_txt_sel)
                if retweet['location']:
                    retweet['text'] = retweet['text'].replace(
                        '2' + retweet['location'], '')
                retweet['text'] = retweet['text'][2:].replace(' ', '')
                if is_long_retweet:
                    retweet['text'] = retweet['text'][:-6]
                retweet['at_users'] = self.get_at_users(retweet_txt_sel)
                retweet['topics'] = self.get_topics(retweet_txt_sel)
                reposts_count = retweet_sel[0].xpath(
                    './/ul[@class="act s-fr"]/li/a[1]/text()'
                ).extract_first()
                reposts_count = re.findall(r'\d+.*', reposts_count)
                retweet['reposts_count'] = reposts_count[
                    0] if reposts_count else '0'
                comments_count = retweet_sel[0].xpath(
                    './/ul[@class="act s-fr"]/li[2]/a[1]/text()'
                ).extract_first()
                comments_count = re.findall(r'\d+.*', comments_count)
                retweet['comments_count'] = comments_count[
                    0] if comments_count else '0'
                attitudes_count = retweet_sel[0].xpath(
                    './/a[@action-type="feed_list_like"]/em/text()'
                ).extract_first()
                retweet['attitudes_count'] = (attitudes_count
                                              if attitudes_count else '0')
                created_at = retweet_sel[0].xpath(
                    './/p[@class="from"]/a[1]/text()').extract_first(
                    ).replace(' ', '').replace('\n', '').split('前')[0]
                retweet['created_at'] = util.standardize_date(created_at)
                source = retweet_sel[0].xpath(
                    './/p[@class="from"]/a[2]/text()').extract_first()
                retweet['source'] = source if source else ''
                retweet['pics'] = pics
                retweet['video_url'] = video_url
                retweet['retweet_id'] = ''
                # Yield the retweeted (original) post first, then link it
                # from the outer post.
                yield {'weibo': retweet, 'keyword': keyword}
                weibo['retweet_id'] = retweet['id']
            # NOTE(review): debug print left in — consider logging instead.
            print(weibo)
            yield {'weibo': weibo, 'keyword': keyword}
def parse_weibo(self, response):
    """Parse the weibo posts contained in a search-result page.

    Simpler variant: yields bare ``WeiboItem`` objects (no keyword), with
    fields ``nick_name``/``txt`` instead of ``screen_name``/``text``, and no
    long-text, pic, or video handling.

    NOTE(review): this duplicates the earlier ``parse_weibo`` definition and
    shadows it at class-definition time — it looks like an older revision
    left in the file; confirm which version is intended and remove the other.
    """
    for sel in response.xpath("//div[@class='card-wrap']"):
        info = sel.xpath(
            "div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
        )
        # Cards without an "info" div are ads/placeholders — skip them.
        if info:
            weibo = WeiboItem()
            weibo['id'] = sel.xpath('@mid').extract_first()
            weibo['user_id'] = info[0].xpath(
                'div[2]/a/@href').extract_first().split('?')[0].split(
                    '/')[-1]
            weibo['nick_name'] = info[0].xpath(
                'div[2]/a/@nick-name').extract_first()
            # Full text of the first p.txt, with zero-width spaces and the
            # private-use icon glyph stripped.
            weibo['txt'] = sel.xpath('.//p[@class="txt"]')[0].xpath(
                'string(.)').extract_first().replace('\u200b', '').replace(
                    '\ue627', '')
            reposts_count = sel.xpath(
                './/a[@action-type="feed_list_forward"]/text()'
            ).extract_first()
            # NOTE(review): unlike the newer variant, a None here (expired
            # cookie) raises an unhandled TypeError in re.findall.
            reposts_count = re.findall(r'\d+.*', reposts_count)
            weibo['reposts_count'] = reposts_count[
                0] if reposts_count else '0'
            comments_count = sel.xpath(
                './/a[@action-type="feed_list_comment"]/text()'
            ).extract_first()
            comments_count = re.findall(r'\d+.*', comments_count)
            weibo['comments_count'] = comments_count[
                0] if comments_count else '0'
            attitudes_count = sel.xpath(
                '(.//a[@action-type="feed_list_like"])[last()]/em/text()'
            ).extract_first()
            weibo['attitudes_count'] = (attitudes_count
                                        if attitudes_count else '0')
            # Relative timestamp, e.g. "x分钟前" — strip whitespace and the
            # "前" suffix, then normalize via util.standardize_date.
            created_at = sel.xpath(
                '(.//p[@class="from"])[last()]/a[1]/text()').extract_first(
                ).replace(' ', '').replace('\n', '').split('前')[0]
            weibo['created_at'] = util.standardize_date(created_at)
            source = sel.xpath('(.//p[@class="from"])[last()]/a[2]/text()'
                               ).extract_first()
            weibo['source'] = source if source else ''
            # Presence of div.card-comment marks a retweet card.
            retweet_sel = sel.xpath('.//div[@class="card-comment"]')
            if retweet_sel:
                retweet = WeiboItem()
                # action-data has a 4-char prefix before the numeric id —
                # NOTE(review): presumably 'mid='; confirm.
                retweet_id = retweet_sel[0].xpath(
                    './/a[@action-type="feed_list_like"]/@action-data'
                ).extract_first()[4:]
                retweet['id'] = retweet_id
                info = retweet_sel[0].xpath(
                    './/div[@node-type="feed_list_forwardContent"]/a[1]'
                )[0]
                retweet['user_id'] = info.xpath(
                    '@href').extract_first().split('/')[-1]
                retweet['nick_name'] = info.xpath(
                    '@nick-name').extract_first()
                retweet['txt'] = retweet_sel[0].xpath(
                    './/p[@class="txt"]')[0].xpath(
                        'string(.)').extract_first().replace(
                            '\u200b', '').replace('\ue627', '')
                reposts_count = retweet_sel[0].xpath(
                    './/ul[@class="act s-fr"]/li/a[1]/text()'
                ).extract_first()
                reposts_count = re.findall(r'\d+.*', reposts_count)
                retweet['reposts_count'] = reposts_count[
                    0] if reposts_count else '0'
                comments_count = retweet_sel[0].xpath(
                    './/ul[@class="act s-fr"]/li[2]/a[1]/text()'
                ).extract_first()
                comments_count = re.findall(r'\d+.*', comments_count)
                retweet['comments_count'] = comments_count[
                    0] if comments_count else '0'
                attitudes_count = retweet_sel[0].xpath(
                    './/a[@action-type="feed_list_like"]/em/text()'
                ).extract_first()
                retweet['attitudes_count'] = (attitudes_count
                                              if attitudes_count else '0')
                created_at = retweet_sel[0].xpath(
                    './/p[@class="from"]/a[1]/text()').extract_first(
                    ).replace(' ', '').replace('\n', '').split('前')[0]
                retweet['created_at'] = util.standardize_date(created_at)
                source = retweet_sel[0].xpath(
                    './/p[@class="from"]/a[2]/text()').extract_first()
                retweet['source'] = source if source else ''
                # Yield the retweeted (original) post first, then link it
                # from the outer post.
                yield retweet
                weibo['retweet_id'] = retweet_id
            # NOTE(review): debug print left in — consider logging instead.
            print(weibo)
            yield weibo