def parse_item(self, response):
    article = response.xpath('//div[@class="article"]')
    # Title
    title = article.xpath('.//h1/text()').get()
    # Publish time
    publish_time = article.xpath(
        './/span[@class="publish-time"]/text()').get()
    publish_time = publish_time.strip('*')
    # Current page URL
    page_url = response.url
    # Author's home page URL
    user_home = response.urljoin(
        article.xpath('.//a[@class="avatar"]/@href').get())
    # Article body as HTML (absolute path, so it searches the whole page)
    content = article.xpath('//div[@class="show-content-free"]').get()
    data_element = response.xpath('//script[@data-name="page-data"]')
    # Author
    author = data_element.re(r'"nickname":"([^"]+)",')
    author = author[0] if author else ''
    # Word count
    words_count = data_element.re(r'"public_wordage":([^"]+?),')
    words_count = int(words_count[0]) if words_count else 0
    # Comment count
    comments_count = data_element.re(r'"comments_count":([^"]+?),')
    comments_count = int(comments_count[0]) if comments_count else 0
    # Like count
    likes_count = data_element.re(r'"likes_count":([^"]+?),')
    likes_count = int(likes_count[0]) if likes_count else 0
    # View count
    views_count = data_element.re(r'"views_count":([^"]+?),')
    views_count = int(views_count[0]) if views_count else 0
    special_id = data_element.re(r'"id":([^"]+?),')
    special_id = int(special_id[0]) if special_id else 0
    item = JianshuItem(
        title=title,
        publish_time=publish_time,
        page_url=page_url,
        user_home=user_home,
        content=content,
        author=author,
        words_count=words_count,
        comments_count=comments_count,
        likes_count=likes_count,
        views_count=views_count,
        special_id=special_id,
    )
    yield item

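# A hedged alternative to the regexes above: if the page-data script body is
# plain JSON (an assumption, not verified here), json.loads() yields the same
# counters without brittle patterns. parse_page_data is a hypothetical helper,
# not part of the spider above.
import json

def parse_page_data(response):
    raw = response.xpath('//script[@data-name="page-data"]/text()').get()
    try:
        data = json.loads(raw)  # assumes the script body is a JSON object
    except (TypeError, ValueError):
        return {}
    return {
        'author': data.get('nickname', ''),
        'words_count': data.get('public_wordage', 0),
        'comments_count': data.get('comments_count', 0),
        'likes_count': data.get('likes_count', 0),
        'views_count': data.get('views_count', 0),
    }
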
def parse_detail(self, response):
    title = response.xpath("//h1[@class='title']/text()").get()
    avatar = response.xpath("//a[@class='avatar']/img/@src").get()
    author = response.xpath("//span[@class='name']/a/text()").get()
    pub_time = response.xpath(
        "//span[@class='publish-time']/text()").get().replace("*", "")
    # Get the article id from the URL
    url = response.url
    url1 = url.split("?")[0]
    article_id = url1.split("/")[-1]
    # Article content with HTML tags included, not just the plain text
    content = response.xpath("//div[@class='show-content']").get()
    # word_count = response.xpath("//span[@class='wordage']/text()").get()
    # comment_count = response.xpath("//span[@class='comments-count']/text()").get()
    # read_count = response.xpath("//span[@class='views-count']/text()").get()
    # like_count = response.xpath("//span[@class='likes-count']/text()").get()
    # subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())
    item = JianshuItem(
        title=title,
        avatar=avatar,
        pub_time=pub_time,
        author=author,
        origin_url=response.url,
        content=content,
        article_id=article_id,
        # subjects=subjects,
        # word_count=word_count,
        # comment_count=comment_count,
        # like_count=like_count,
        # read_count=read_count
    )
    yield item

def parse(self, response):
    info = response.xpath('.//div[@class="collection-wrap"]')
    if not info:
        return
    for i in info:
        # Fresh item per collection, so earlier yields are not mutated later
        item = JianshuItem()
        name = i.xpath('a[1]/h4/text()').extract()[0]
        content = i.xpath('a[1]/p/text()').extract()
        article_num = i.xpath('div/a/text()').extract()[0]
        fans = i.xpath('div/text()').extract()[0]
        if content:
            content = content[0]
        else:
            content = ''
        item['name'] = name
        item['content'] = content
        item['article_num'] = article_num
        item['fans'] = fans
        yield item
    # Scheduled from every page; Scrapy's dupefilter drops the repeats
    base_url = 'https://www.jianshu.com/recommendations/collections?page={}&order_by=hot'
    urls = (base_url.format(i) for i in range(1, 21))
    for url in urls:
        yield Request(url, callback=self.parse)

def parse_item(self, response):
    title = response.xpath(
        "//div[@class='article']/h1[@class='title']/text()").get()
    author = response.xpath(
        "//div[@class='info']/span[@class='name']/a/text()").get()
    publish_time = response.xpath(
        "//div[@class='meta']/span[@class='publish-time']/text()").get()
    word = response.xpath(
        "//div[@class='meta']/span[@class='wordage']/text()").get()
    content = response.xpath(
        "//div[@class='show-content-free']/p//text()").getall()
    content = "".join(content).replace("\u3000", "").replace("\xa0", "").strip()
    url = response.url
    article_id = url.split("?")[0].split("/")[-1]
    view_count = response.xpath(
        "//div[@class='meta']/span[@class='view-count']/text()").get()
    comment_count = response.xpath(
        "//div[@class='meta']/span[@class='comments-count']/text()").get()
    like_count = response.xpath(
        "//div[@class='meta']/span[@class='likes-count']/text()").get()
    reward_count = response.xpath(
        "//div[@class='meta']/span[@class='rewards-count']/text()").get()
    item = JianshuItem(title=title,
                       author=author,
                       publish_time=publish_time,
                       word=word,
                       content=content,
                       url=url,
                       id=article_id,
                       view_count=view_count,
                       comment_count=comment_count,
                       like_count=like_count,
                       reward_count=reward_count)
    yield item

def parse(self, response):
    selector = Selector(response)
    infos = selector.xpath('//ul[@class="note-list"]/li')
    for info in infos:
        item = JianshuItem()
        user = info.xpath('div/div[1]/div/a/text()').extract()[0]
        time = info.xpath('div/div[1]/div/span/@data-shared-at').extract()[0]
        title = info.xpath('div/a/text()').extract()[0]
        view = info.xpath('div/div[2]/a[1]/text()').extract()[1].strip()
        comment = info.xpath('div/div[2]/a[2]/text()').extract()[1].strip()
        like = info.xpath('div/div[2]/span[1]/text()').extract()[0].strip()
        gain = info.xpath('div/div[2]/span[2]/text()').extract()
        if gain:
            gain = gain[0].strip()
        else:
            gain = '0'
        item['user'] = user
        item['time'] = time
        item['title'] = title
        item['view'] = view
        item['comment'] = comment
        item['like'] = like
        item['gain'] = gain
        yield item
    # range(2, 3) requests page 2 only; widen the range to crawl further
    urls = ['https://www.jianshu.com/c/bd08b5306eb6?order_by=added_at&page={}'
            .format(i) for i in range(2, 3)]
    for url in urls:
        yield Request(url, callback=self.parse)

def parse_item(self, response):
    title = response.xpath(
        "//div[@class='article']/h1[@class='title']/text()").get().strip()
    avatar = response.xpath(
        "//div[@class='article']/div[@class='author']/a/img/@src").get(
        ).strip()
    author = response.xpath(
        "//div[@class='article']//div[@class='info']/span/a/text()").get(
        ).strip()
    pub_time = response.xpath(
        "//div[@class='meta']/span[@class='publish-time']/text()").get(
        ).strip()
    origin_url = response.url
    # /p/7ba4ea51d56c?utm_campaign=maleskine&utm_content=note&utm_medium=seo_notes&utm_source=recommendation
    # /p/7ba4ea51d56c
    # The URL holds at most one '?': split on it and keep the first part,
    # then split on '/' and keep the last segment (the article slug)
    author_id = response.url.split('?')[0].split('/')[-1]
    content = response.xpath("//div[@class='show-content-free']").get()
    item = JianshuItem(title=title,
                       avatar=avatar,
                       author=author,
                       pub_time=pub_time,
                       origin_url=origin_url,
                       author_id=author_id,
                       content=content)
    yield item

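# The split-twice idiom above can also be written with urllib.parse, which
# drops the query string explicitly instead of assuming a single '?'.
# A minimal sketch; article_id_from is a hypothetical helper, not part of
# the spider above.
from urllib.parse import urlparse

def article_id_from(url):
    # 'https://www.jianshu.com/p/7ba4ea51d56c?utm_source=...' -> '7ba4ea51d56c'
    return urlparse(url).path.rstrip('/').split('/')[-1]
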
def parse(self, response):
    selector = Selector(response)
    infos = selector.xpath('//div[@class="collection-wrap"]')
    for info in infos:
        item = JianshuItem()
        name = info.xpath('a[1]/h4/text()').extract()[0]
        content = info.xpath('a[1]/p/text()').extract()
        article = info.xpath('div/a/text()').extract()[0]
        fans = info.xpath('div/text()').extract()[0]
        if content:
            content = content[0]
        else:
            content = ' '
        item['name'] = name
        item['content'] = content
        item['article'] = article
        item['fans'] = fans
        yield item
    urls = [
        'https://www.jianshu.com/recommendations/collections?page={}&order_by=hot'
        .format(i) for i in range(2, 21)
    ]
    for url in urls:
        yield Request(url, callback=self.parse)

def parse_item(self, response):
    # Fetch the detail page and pull out the fields
    title = response.xpath("//h1[@class='title']/text()").get()
    # Author avatar
    avatar = response.xpath("//a[@class='avatar']/img/@src").get()
    author = response.xpath("//span[@class='name']/a/text()").get()
    # Publish time
    pub_time = response.xpath("//span[@class='publish-time']/text()").get()
    # Detail page id, taken from the URL
    url = response.url
    url1 = url.split("?")[0]
    article_id = url1.split("/")[-1]
    # Article content as plain text
    content = response.xpath(
        "string(//div[@class='show-content'])").extract()
    content = [
        i.strip().replace('\n', '').replace('\xa0', '')
        for i in content if i.strip()
    ]
    item = JianshuItem(title=title,
                       avatar=avatar,
                       author=author,
                       pub_time=pub_time,
                       origin_url=response.url,
                       article_id=article_id,
                       content=content)
    return item

def parse_detail(self, response):
    title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
    author = response.xpath('//span[@class="FxYr8x"]/a/text()').get()
    avatar = response.xpath('//div[@class="_2mYfmT"]/a/img/@src').get()
    pub_time = response.xpath('//div[@class="s-dsoj"]/time/text()').get()
    # The id sits right in the URL:
    # https://www.jianshu.com/p/9713ff94c4a5
    article_url1 = response.url
    # Split on the question mark, then take the last path segment
    article_url2 = article_url1.split("?")[0]
    article_id = article_url2.split('/')[-1]
    origin_url = response.url
    content = response.xpath('//article[@class="_2rhmJa"]').get()
    subject = response.xpath(
        '//div[@class="_2Nttfz"]/a/span/text()').getall()
    # getall() returns a list; MySQL cannot store one, so join with commas
    subject = ",".join(subject)
    item = JianshuItem(title=title,
                       content=content,
                       article_id=article_id,
                       origin_url=origin_url,
                       author=author,
                       avatar=avatar,
                       pub_time=pub_time,
                       subject=subject)
    yield item

def parse_text(self, response):
    item = JianshuItem()
    item['title'] = response.css("._gp-ck ._1RuRku::text").get()
    # The author link is relative; prefix the site root
    item['user_url'] = "https://www.jianshu.com" + response.css(
        "._gp-ck ._1OhGeD::attr(href)").get()
    item['date'] = response.css("._gp-ck .s-dsoj time::text").get()
    item['dz'] = response.css("._gp-ck .s-dsoj span:last-child::text").get()
    item['text'] = " ".join(response.css("._gp-ck ._2rhmJa p::text").getall())
    yield item

def parse(self, response):
    # Article id, taken from the URL
    article_id = response.url.split('/')[-1]
    # Title
    title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
    # Content; the HTML tags are kept as well
    content = response.xpath('//article').get()
    # Author
    author = response.xpath('//a[@class="_1OhGeD"]/text()').get()
    # Avatar
    avatar = response.xpath('//img[@class="_13D2Eh"]/@src').get()
    # Publish time
    pub_time = response.xpath('//div[@class="s-dsoj"]/time/text()').get()
    # Word count and view count have nothing to filter on, and the span in
    # front of them exists on some pages but not others, so count from the end
    # Word count
    word_count = response.xpath(
        '//div[@class="s-dsoj"]/span[last()-1]/text()').get()
    word_count = word_count.split()[-1]
    # View count
    read_count = response.xpath(
        '//div[@class="s-dsoj"]/span[last()]/text()').get()
    read_count = read_count.split()[-1]
    # Comment count; the span contains a comment node <!---->, so getall()
    # is needed to reach the number that follows it
    comment_count = response.xpath(
        '//div[@class="-pXE92"]/div[1]/span//text()').getall()[-1]
    # Like count; with no likes there is no number at all, so default to 0
    like_count = response.xpath(
        '//div[@class="-pXE92"]/div[2]/span//text()').getall()
    if len(like_count) == 1:
        like_count = '0'
    else:
        like_count = like_count[-1]
    # Collections the article belongs to
    subjects = response.xpath(
        '//div[contains(@class, "_2Nttfz")]/a/span/text()').getall()
    # getall() returns a list; turn it into a comma-separated string
    subjects = ','.join(subjects)
    # URL
    origin_url = response.url
    item = JianshuItem(article_id=article_id,
                       title=title,
                       content=content,
                       author=author,
                       avatar=avatar,
                       pub_time=pub_time,
                       word_count=word_count,
                       read_count=read_count,
                       comment_count=comment_count,
                       like_count=like_count,
                       subjects=subjects,
                       origin_url=origin_url)
    yield item

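# The positional span[last()] lookups above raise AttributeError on None when
# the layout shifts again. A defensive variant under the same assumptions;
# last_token is a hypothetical helper, not part of the spider above.
def last_token(response, xpath_expr, default='0'):
    """Last whitespace-separated token of the first XPath match, or default."""
    value = response.xpath(xpath_expr).get()
    parts = value.split() if value else []
    return parts[-1] if parts else default
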
def parse_item(self, response):
    html = etree.HTML(response.text)
    item = JianshuItem()
    # The page <title> ends with a site-name suffix; keep the part before the dash
    item['title'] = html.xpath("//title/text()")[0].split("-")[0]
    item['name'] = html.xpath("//span[@class='name']/a/text()")[0]
    item['url'] = response.url.split("?")[0]
    collection = html.xpath(
        "//div[@class='include-collection']/a/div[@class='name']/text()")
    if collection:
        item['collection'] = ','.join(collection)
    yield item

def parse(self, response):
    if self.page == 1:
        articles = response.selector.xpath('//ul[@class="note-list"]/li')
    else:
        articles = response.selector.xpath('//li[@class="have-img"]')
    for article in articles:
        note_id = article.xpath('@data-note-id').extract()
        if len(note_id) > 0:
            self.note_id_list.append(note_id[0])
        title = article.xpath('div/a[@class="title"]/text()').extract()
        article_abstract = article.xpath(
            'div/p[@class="abstract"]/text()').extract()
        article_link = article.xpath(
            'div/a[@class="title"]/@href').extract()
        author = article.xpath(
            'div/div/div/a[@class="nickname"]/text()').extract()
        author_link = article.xpath(
            'div/div/div/a[@class="nickname"]/@href').extract()
        post_time = article.xpath(
            'div/div/div/span/@data-shared-at').extract()
        category = article.xpath(
            'div/div/a[@class="collection-tag"]/text()').extract()
        meta_a = article.xpath('div/div/a/text()').re(r' ([0-9]*)\n')
        meta_span = article.xpath('div/div/span/text()').re(r' ([0-9]*)')
        item = JianshuItem()
        item['title'] = title[0]
        item['article_abstract'] = article_abstract[0]
        item['article_link'] = article_link[0]
        item['author'] = author[0]
        item['author_link'] = author_link[0]
        item['post_time'] = post_time[0]
        item['category'] = ''
        item['views'] = int(meta_a[0])
        item['comments'] = int(meta_a[1])
        item['like'] = int(meta_span[0])
        item['reward'] = 0
        if len(category) > 0:
            item['category'] = category[0]
        if len(meta_span) > 1:
            item['reward'] = int(meta_span[1])
        yield item
    # Load at most 15 pages
    if self.page < 15:
        self.page = self.page + 1
        params = urllib.parse.urlencode(
            {
                "page": self.page,
                "seen_snote_ids[]": self.note_id_list
            },
            doseq=True)  # Python 3: urlencode lives in urllib.parse
        yield scrapy.Request("https://www.jianshu.com/?%s" % params,
                             headers=self.headers,
                             callback=self.parse)

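# For reference, how urlencode(..., doseq=True) expands the repeated
# "seen_snote_ids[]" key (the ids here are made up):
#
#   >>> from urllib.parse import urlencode
#   >>> urlencode({"page": 2, "seen_snote_ids[]": ["111", "222"]}, doseq=True)
#   'page=2&seen_snote_ids%5B%5D=111&seen_snote_ids%5B%5D=222'
#
# Without doseq=True the list would be serialized as one quoted Python-list
# literal instead of one query pair per id.
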
def parse_item(self, response):
    # Skeleton callback: fetch the detail page and populate the item fields here
    item = JianshuItem()
    return item

def parse(self, response):
    selector = Selector(response)
    articles = selector.xpath('//ul[@class="note-list"]/li')
    for article in articles:
        item = JianshuItem()
        title = article.xpath('div/a/text()').extract()
        url = article.xpath('div/a/@href').extract()
        author = article.xpath('div/div[1]/div/a/text()').extract()
        # Download every hot article's thumbnail; some articles have no image
        try:
            image = article.xpath("div/div[1]/a/img/@src").extract()[0]
            filename = 'images/%s-%s.jpg' % (author[0], title[0])
            print("filename: " + filename)
            print("image url: " + image)
            urllib.request.urlretrieve(image, filename)
        except Exception:
            print('--no---image--')
        # View count
        listtop = article.xpath('div/div[2]/a[1]/text()').extract()
        likeNum = article.xpath('div/div[2]/span[1]/text()').extract()
        readAndComment = article.xpath('div/div[2]/a[2]/text()')
        item['title'] = title
        item['url'] = 'http://www.jianshu.com/' + url[0]
        item['author'] = author
        item['readNum'] = listtop[1]
        # Some articles have comments disabled
        try:
            item['commentNum'] = readAndComment[1].extract()
        except IndexError:
            item['commentNum'] = ''
        item['likeNum'] = likeNum
        yield item
    # Keep paging while the current page still returned articles
    if len(articles) > 0:
        self.page = self.page + 1
        next_link = self.url + "?page=" + str(self.page)
        print("----" + next_link)
        yield Request(next_link, callback=self.parse)

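# urlretrieve above blocks Scrapy's event loop while each image downloads.
# Scrapy ships an asynchronous ImagesPipeline for exactly this; a minimal
# sketch of the wiring (the IMAGES_STORE path is an example value):
#
#   # settings.py
#   ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
#   IMAGES_STORE = 'images'
#
# The spider then only sets item['image_urls'] = [image] (the pipeline's
# default input field) and the files are fetched through the crawl's own
# downloader instead of a blocking call.
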
def parse_item(self, response):
    title = response.xpath('//h1[@title]/text()').get()  # article title
    author = response.xpath(
        '//div[@class]/a[@href]/span[@class]/text()').get()  # article author
    # Collect the article text
    x_content = response.xpath('//article//text()').getall()
    content = ' '.join(x_content)
    print(title, author, content)
    item = JianshuItem(title=title, author=author, content=content)
    yield item

def parse_item(self, response):
    """
    This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.jianshu.com/p/b851e04de659
    @returns items 1 16
    @scrapes author content title url datetime wordnum views_count
        comments_count likes_count followers_count total_likes_count rank
    """
    item = JianshuItem()
    # scrapy.log is gone in current Scrapy; use the spider's own logger
    self.logger.info('RequestURL: %s', response.url)
    contents = response.xpath('//div[contains(@class, "preview")]')[0]
    item['title'] = contents.xpath(
        'h1[contains(@class,"title")]/text()').extract()[0]
    item['author'] = contents.xpath(
        'div/a[contains(@class,"author-name")]/span/text()').extract()[0]
    item['datetime'] = contents.xpath(
        'div[contains(@class,"author-info")]/span/text()').extract()[1]
    pagecons = response.xpath('//div[contains(@class, "show-content")]/p')
    item['content'] = pagecons.extract()
    item['url'] = response.url
    # The note script holds "key":"value" pairs; split the last six fields
    # into a dict rather than parsing the whole payload
    scriptlists = response.xpath(
        '//script[contains(@data-name,"note")]/text()').extract()
    scriptlist6 = scriptlists[0].strip().split(',')[-6:]
    newscripts = []
    for script in scriptlist6:
        newscripts += script.split(':')
    newscript = [n.replace('"', '') for n in newscripts]
    newdict = dict(newscript[i:i + 2] for i in range(0, len(newscript), 2))
    item['wordnum'] = newdict.get('wordage')
    item['views_count'] = newdict.get('views_count')
    item['likes_count'] = newdict.get('likes_count')
    item['comments_count'] = newdict.get('comments_count')
    followersandtotallikes = response.xpath(
        '//script[contains(@data-name,"author")]/text()').extract()
    followersandtotallikes2 = followersandtotallikes[0].strip().split(',')[-3:-1]
    newfollowersandtotallikes2 = []
    for followersandlikes in followersandtotallikes2:
        newfollowersandtotallikes2 += followersandlikes.split(':')
    followerslikes = [
        n.replace('"', '') for n in newfollowersandtotallikes2
    ]
    followerslikesdict = dict(
        followerslikes[i:i + 2] for i in range(0, len(followerslikes), 2))
    item['followers_count'] = followerslikesdict.get('followers_count')
    item['total_likes_count'] = followerslikesdict.get('total_likes_count')
    return item

def parse_html(self, response):
    # ArticleItem here behaves like an ItemLoader wrapping a JianshuItem
    loader = ArticleItem(item=JianshuItem(), response=response)
    loader.add_xpath('title', "//div[@class='_gp-ck']//h1/text()")
    loader.add_xpath('num', "//div[@class='s-dsoj']/span[2]/text()")
    loader.add_xpath('look', "//div[@class='s-dsoj']/span[3]/text()")
    loader.add_xpath('author', "//span[@class='_22gUMi']/text()")
    loader.add_xpath('favor', "//span[@class='_1LOh_5']/text()")
    loader.add_xpath('time', "//time/text()")
    loader.add_xpath('content', "//article[@class='_2rhmJa']//text()")
    loader.add_value('url', response.url)
    article_item = loader.load_item()
    yield article_item

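# ArticleItem above is presumably an ItemLoader subclass. A sketch of what
# such a loader could look like; the class body is an assumption, only the
# loader API itself is Scrapy's:
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, Join

class ArticleItem(ItemLoader):
    default_output_processor = TakeFirst()  # collapse single-value fields
    content_out = Join('')  # keep the whole article text, not just the first node
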
def parse_item(self, response):
    item = JianshuItem()
    item['author_url'] = response.meta['author_url']
    item['author_name'] = response.meta['author_name']
    try:
        selector = Selector(response)
        fans = selector.xpath(
            '//div[@class="info"]/ul/li[2]/div/a/p/text()').extract()[0]
        articles = selector.xpath(
            '//div[@class="info"]/ul/li[3]/div/a/p/text()').extract()[0]
        word_count = selector.xpath(
            '//div[@class="info"]/ul/li[4]/div/p/text()').extract()[0]
        item['fans'] = fans
        item['articles'] = articles
        item['word_count'] = word_count
        yield item
    except IndexError:
        # Profile layout changed or fields missing; skip this author
        pass

def parse(self, response):
    """response is the page body returned for the request"""
    item = JianshuItem()
    selector = Selector(response)
    title = selector.xpath(
        '/html/body/div[4]/div/div[1]/div[2]/div[1]/h1/text()').extract()[0]
    time = re.findall(
        r'\d{4}-\d{2}-\d{2}',
        selector.xpath(
            '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[1]/span[1]/text()'
        ).extract()[0])[0]
    price = selector.xpath(
        '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/span[2]/text()'
    ).extract()[0]
    # The [4:] slices below drop the leading field label on each line
    style = selector.xpath(
        '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[2]/text()'
    ).extract()[0][4:]
    source = selector.xpath(
        '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[3]/text()'
    ).extract()[0][4:]
    house_type = selector.xpath(
        '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[4]/text()'
    ).extract()[0][4:]
    devices = selector.xpath(
        '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[5]/text()'
    ).extract()[0][4:]
    address = selector.xpath(
        '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[3]/div[1]/div[6]/text()'
    ).extract()[0][4:]
    description = selector.xpath(
        '/html/body/div[4]/div/div[1]/div[2]/div[1]/div[4]/text()'
    ).extract()[0]
    item['title'] = title
    item['time'] = time
    item['rent'] = price
    item['rent_style'] = style
    item['source'] = source
    item['type'] = house_type
    item['devices'] = devices
    item['address'] = address
    item['text'] = description
    yield item

def parse(self, response):
    selector = Selector(response)
    articles = selector.xpath('//ul[@class="note-list"]/li')
    for article in articles:
        item = JianshuItem()
        title = article.xpath('div/a/text()').extract()
        url = article.xpath('div/a/@href').extract()
        # Thumbnails could be downloaded here as well; some articles have no image
        item['ccommentLimt'] = article.xpath('div/p/text()').extract()
        listtop = article.xpath('div/div/a/text()').extract()
        likeNum = article.xpath('div/div/span/text()').extract()
        author = article.xpath(
            'div/div/div/a[@class="nickname"]/text()').extract()
        item['title'] = title
        item['url'] = 'http://www.jianshu.com/' + url[0]
        item['author'] = author
        item['readNum'] = listtop[3]
        # Some articles have comments disabled
        try:
            item['commentNum'] = listtop[5]
        except IndexError:
            item['commentNum'] = ''
        item['likeNum'] = likeNum[0]
        try:
            item['moneyNum'] = likeNum[1]
        except IndexError:
            item['moneyNum'] = ''
        yield item
    next_link = selector.xpath(
        '//*[@id="list-container"]/div/button/@data-url').extract()
    if len(next_link) == 1:
        next_link = self.url + str(next_link[0])
        yield Request(next_link, callback=self.parse)

def parse(self, response):
    selector = scrapy.Selector(response)
    articles = selector.xpath('//ul[@class="note-list"]')
    print('articles.count: ' + str(len(articles)))
    for article in articles:
        # Note: the absolute '//a' path searches the whole document,
        # not just this <ul>
        titles = article.xpath('//a[@class="title"]/text()').extract()
        print('titles.count: ' + str(len(titles)))
        for title in titles:
            print('title: ' + title)
            item = JianshuItem()
            item['title'] = title
            yield item

def parse(self, response):
    selector = Selector(response)
    # response is the returned page body. Cleaned values go into the item;
    # declare the fields in items.py. The plan: title, url, author, view
    # count, likes, and rewards, all parsed with XPath.
    # Jianshu blocks this crawler:
    # articles = selector.xpath('//ul[@class="note-list"]/li')
    # Trying the CSDN blog page instead:
    # articles = selector.xpath('//ul[@id="feedlist_id"]/li')
    # Testwo:
    articles = selector.xpath('/html/body/div[1]/div/div[1]/div[2]/div')
    for article in articles:
        # Jianshu:
        # title = article.xpath('/div/a/text()').extract()
        # url = article.xpath('/div/a/@href').extract()
        # author = article.xpath('/div/div/a/text()').extract()
        # CSDN:
        # title = article.xpath('/div/div/h2/a/text()').extract()
        # url = article.xpath('/div/div/h2/a/@href').extract()
        # author = article.xpath('/div/dl/dd[4]/a/text()').extract()
        # Testwo. A relative XPath must never start with a slash,
        # or it matches nothing:
        title = article.xpath('div[1]/div/h3/a/text()').extract()
        url = article.xpath('div/div/h3/a/@href').extract()
        # Likes and comment counts: not wired up yet
        item = JianshuItem()
        item['title'] = title
        item['url'] = url
        yield item

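# The relative-XPath warning above, demonstrated on a toy document: inside a
# loop, 'div/a' or './/a' stays scoped to the selected node, while '//a' (or
# a leading '/') escapes back to the document root.
from scrapy import Selector

sel = Selector(text='<ul><li><a>one</a></li><li><a>two</a></li></ul>')
for li in sel.xpath('//li'):
    print(li.xpath('.//a/text()').get())         # scoped: 'one', then 'two'
    print(len(li.xpath('//a/text()').getall()))  # absolute: always 2
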
def parse_item(self, response):
    title = response.xpath(
        "//div[@class='article']/h1[@class='title']/text()").get().strip()
    avatar = response.xpath(
        "//div[@class='article']/div[@class='author']/a/img/@src").get(
        ).strip()
    author = response.xpath(
        "//div[@class='article']//div[@class='info']/span/a/text()").get(
        ).strip()
    pub_time = response.xpath(
        "//div[@class='meta']/span[@class='publish-time']/text()").get(
        ).strip()
    origin_url = response.url
    # /p/7ba4ea51d56c?utm_campaign=maleskine&utm_content=note&utm_medium=seo_notes&utm_source=recommendation
    # /p/7ba4ea51d56c
    # The URL holds at most one '?': split on it and keep the first part,
    # then split on '/' and keep the last segment (the article slug)
    author_id = response.url.split('?')[0].split('/')[-1]
    content = response.xpath("//div[@class='show-content-free']").get()
    read_count = response.xpath(
        "//div[@class='meta']/span[@class='views-count']/text()").get(
        ).strip()
    like_count = response.xpath(
        "//div[@class='meta']/span[@class='likes-count']/text()").get(
        ).strip()
    word_count = response.xpath(
        "//div[@class='meta']/span[@class='wordage']/text()").get().strip()
    subjects = ",".join(
        response.xpath(
            "//div[@class='include-collection']/a/div[@class='name']/text()"
        ).getall()).strip()
    comment_count = response.xpath(
        "//div[@class='meta']/span[@class='comments-count']/text()").get(
        ).strip()
    item = JianshuItem(title=title,
                       avatar=avatar,
                       author=author,
                       pub_time=pub_time,
                       origin_url=origin_url,
                       author_id=author_id,
                       content=content,
                       read_count=read_count,
                       like_count=like_count,
                       word_count=word_count,
                       subjects=subjects,
                       comment_count=comment_count)
    yield item

def parse(self, response):
    selector = Selector(response)
    articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
    for article in articles:
        item = JianshuItem()
        title = article.xpath('div/h4/a/text()').extract()
        url = article.xpath('div/h4/a/@href').extract()
        author = article.xpath('div/p/a/text()').extract()
        # Download each article's thumbnail; some articles have no image
        try:
            image = article.xpath("a/img/@src").extract()
            urllib.request.urlretrieve(
                image[0], '/images/%s-%s.jpg' % (author[0], title[0]))
        except Exception:
            print('--no---image--')
        listtop = article.xpath('div/div/a/text()').extract()
        likeNum = article.xpath('div/div/span/text()').extract()
        readAndComment = article.xpath('div/div[@class="list-footer"]')
        data = readAndComment[0].xpath('string(.)').extract()[0]
        item['title'] = title
        item['url'] = 'http://www.jianshu.com/' + url[0]
        item['author'] = author
        item['readNum'] = listtop[0]
        try:
            item['commentNum'] = listtop[1]
        except IndexError:
            item['commentNum'] = ''
        item['likeNum'] = likeNum
        yield item
    next_link = selector.xpath(
        '//*[@id="list-container"]/div/button/@data-url').extract()
    if len(next_link) == 1:
        next_link = self.url + str(next_link[0])
        print("----" + next_link)
        yield Request(next_link, callback=self.parse,
                      headers={'User-Agent': 'Mozilla/5.0'})

def parse_detail(self, response):
    item = JianshuItem()
    item['name'] = response.meta['name']
    item['img_url'] = response.meta['url']
    summer = response.xpath(
        '//*[@id="link-report"]/span[1]/text()').extract()[0]
    # //*[@id="link-report"]/span[1] holds the summary;
    # //*[@id="link-report"]/span[1]/span/text() is the "view full text" link
    title = response.xpath(
        '//*[@id="content"]/div[3]/div[1]/div[3]/h2/i/text()').extract()
    print(title)
    print(item['name'])
    print(response)
    print(summer)
    item['name2'] = summer
    yield item

def parse_detail(self, response):
    title = response.xpath("//h1[@class='title']/text()").get()
    avatar = response.xpath("//a[@class='avatar']/img/@src").get()
    author = response.xpath("//span[@class='name']/a/text()").get()
    pub_time = response.xpath("//span[@class='publish-time']/text()").get()
    url = response.url
    url1 = url.split("?")[0]
    article_id = url1.split("/")[-1]
    content = response.xpath("//div[@class='show-content']").get()
    item = JianshuItem(title=title,
                       avatar=avatar,
                       author=author,
                       pub_time=pub_time,
                       origin_url=response.url,
                       article_id=article_id,
                       content=content)
    yield item

def parse(self, response):
    item = JianshuItem()
    item['author_uid'] = response.xpath("//a[@class='name']/@href").get()
    item['author_url'] = response.url
    item['author'] = response.xpath("//a[@class='name']/text()").get()
    item['fans'] = response.xpath(
        "//div[@class='info']/ul/li[2]//p/text()").get()
    item['concern'] = response.xpath(
        "//div[@class='info']/ul/li[1]//p/text()").get()
    item['article'] = response.xpath(
        "//div[@class='info']/ul/li[3]//p/text()").get()
    item['word_count'] = response.xpath(
        "//div[@class='info']/ul/li[4]//p/text()").get()
    item['js_diamond'] = response.xpath(
        "//div[@class='info']/ul/li[6]//p/text()").get()
    item['tag'] = response.xpath("//div[@class='js-intro']/text()").get()
    yield item
    # Queue the first page of this author's fans and followees
    uid = item['author_url'].split('/')[-1]
    page = 1
    yield scrapy.Request(url=self.fans_url.format(uid, page),
                         callback=self.parse_fans,
                         meta={'page': page})
    yield scrapy.Request(url=self.concern_url.format(uid, page),
                         callback=self.parse_concern,
                         meta={'page': page})

def parse(self, res):
    articles = res.xpath('//ul[@class="note-list"]/li')
    for article in articles:
        item = JianshuItem()
        item['author'] = article.xpath(
            './/div[@class="info"]/a[1]/text()').extract()[0]
        item['title'] = article.xpath(
            './/div[@class="content"]/a[1]/text()').extract()[0]
        item['abstract'] = article.xpath(
            'normalize-space(.//p[@class="abstract"]/text())').extract()[0]
        pulish_time = article.xpath(
            './/div[@class="info"]//span/@data-shared-at').extract()[0]
        item_url = article.xpath(
            './/div[@class="content"]/a/@href').extract()[0]
        item['item_url'] = 'http://www.jianshu.com' + str(item_url)
        read_number = article.xpath(
            './/div[@class="meta"]/a[1]/text()').extract()[1]
        comment_number = article.xpath(
            './/div[@class="meta"]/a[2]/text()').extract()[1]
        collect_number = article.xpath(
            './/div[@class="meta"]/span/text()').extract()[0]
        item['pulish_time'] = DealFunction().format_time(pulish_time)
        item['comment_number'] = re.sub(r'\s+', '', comment_number)
        item['read_number'] = re.sub(r'\s+', '', read_number)
        item['collect_number'] = re.sub(r'\s+', '', collect_number)
        yield item

def parse(self, response):
    articles = response.xpath("//ul[@class='note-list']/li")
    for article in articles:
        item = JianshuItem()
        item['author'] = article.xpath(
            './/div[@class="info"]/a/text()').extract()[0]
        item['title'] = article.xpath(
            './/div[@class="content"]/a/text()').extract()[0]
        item['times'] = article.xpath(
            './/div[@class="info"]/span/@data-shared-at').extract()[0]
        url = article.xpath(
            './/div[@class="content"]/a/@href').extract()[0]
        item['url'] = 'http://www.jianshu.com' + url
        admire = article.xpath('.//div/div[2]/span[2]/text()').extract()
        item['admire'] = ''.join(admire)
        likes = article.xpath('.//div/div[2]/span[1]/text()').extract()
        item['likes'] = ''.join(likes)
        yield item

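# Every callback in this file assumes a JianshuItem declared in items.py.
# A minimal sketch covering the fields used by the spider above; the field
# names come from that code, the declaration itself is an assumption:
import scrapy

class JianshuItem(scrapy.Item):
    author = scrapy.Field()
    title = scrapy.Field()
    times = scrapy.Field()
    url = scrapy.Field()
    admire = scrapy.Field()
    likes = scrapy.Field()
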