def parse_item(self, response):
    """Extract a Jianshu article page into an ArticleItem and yield it.

    All counts/subjects come straight from page markup; subjects are the
    concatenated collection names the article is included in.
    """
    xp = response.xpath
    # Drop any query string so the last path segment is the article id.
    canonical = response.url.split('?')[0]
    yield ArticleItem(
        title=xp('//div[@class="article"]/h1/text()').get(),
        avatar=xp('//div[@class="author"]/a/img/@src').get(),
        author=xp('//div[@class="info"]/span/a/text()').get(),
        pub_time=xp('//div[@class="meta"]/span[@class="publish-time"]/text()').get(),
        article_id=canonical.split('/')[-1],
        content=xp('//div[@class="show-content-free"]').get(),
        origin_url=response.url,
        subjects=''.join(xp('//div[@class="include-collection"]/a/div[@class="name"]/text()').getall()),
        word_count=xp('//span[@class="wordage"]/text()').get(),
        read_count=xp('//span[@class="views-count"]/text()').get(),
        like_count=xp('//span[@class="likes-count"]/text()').get(),
    )
def parse_detail(self, response):
    """Parse an article detail page and yield a populated ArticleItem.

    Fixes over the original:
    - ``.get()`` may return None when the publish-time node is absent; the
      original called ``.replace()`` on it unconditionally (AttributeError).
    - ``author`` was extracted but never placed into the yielded item.
    """
    title = response.xpath("//h1[@class = 'title']/text()").get()
    avatar = response.xpath("//a[@class='avatar']/img/@src").get()
    author = response.xpath("//span[@class = 'name']/a/text()").get()
    pub_time = response.xpath("//span[@class= 'publish-time']/text()").get()
    if pub_time is not None:
        pub_time = pub_time.replace("*", "")
    # Strip the query string so the trailing path segment is the article id.
    url = response.url
    article_id = url.split("?")[0].split('/')[-1]
    content = response.xpath("//div[@class= 'show-content']").get()
    word_count = response.xpath("//span[@class = 'wordage']/text()").get()
    comment_count = response.xpath("//span[@class = 'comments-count']/text()").get()
    read_count = response.xpath("//span[@class = 'views-count']/text()").get()
    like_count = response.xpath("//span[@class = 'likes-count']/text()").get()
    subjects = '.'.join(
        response.xpath("//div[@class = 'include-collection']/a/div/text()").getall())
    item = ArticleItem(
        title=title,
        avatar=avatar,
        author=author,
        pub_time=pub_time,
        origin_url=response.url,
        article_id=article_id,
        content=content,
        subjects=subjects,
        word_count=word_count,
        comment_count=comment_count,
        read_count=read_count,
        like_count=like_count,
    )
    yield item
def parse_detail(self, response):
    """Parse an article detail page into an ArticleItem (dict-style fields).

    Fixes over the original:
    - ``publish-time`` ``.get()`` can be None; calling ``.replace()`` on it
      unconditionally raised AttributeError on pages missing that node.
    - Removed commented-out dead code and the debug ``print(item)``.
    """
    item = ArticleItem()
    item['title'] = response.xpath(
        "//div[@class='article']/h1/text()").get()
    item['author'] = response.xpath(
        "//div[@class='info']/span[@class='name']/a/text()").get()
    item['avatar'] = response.xpath(
        "//div[@class='author']/a[@class='avatar']/img/@src").get()
    item['content'] = response.xpath(
        "//div[@class='show-content-free']").get()
    pub_time = response.xpath(
        "//span[@class='publish-time']/text()").get()
    item['pub_time'] = pub_time.replace('*', '') if pub_time is not None else pub_time
    # URLs may carry tracking query strings, e.g.
    #   https://www.jianshu.com/p/f0c5934b5d3f?utm_campaign=...
    #   https://www.jianshu.com/p/f03e0080e3e5
    # so strip at '?' before taking the last path segment as the id.
    item['article_id'] = response.url.split('?')[0].split('/')[-1]
    item['origin_url'] = response.url
    item['word_count'] = response.xpath(
        "//span[@class='wordage']/text()").get()
    item['read_count'] = response.xpath(
        "//span[@class='views-count']/text()").get()
    item['like_count'] = response.xpath(
        "//span[@class='likes-count']/text()").get()
    item['comment_count'] = response.xpath(
        "//span[@class='comments-count']/text()").get()
    subject = response.xpath(
        "//div[@class='include-collection']/a//div[contains(@class,'name')]/text()"
    ).getall()
    item['subject'] = ','.join(subject)
    yield item
def parse_detail(self, response):
    """Parse an article page (new Jianshu markup) into an ArticleItem.

    Fix: the original indexed ``getall()[3]`` unconditionally on the very
    broad ``//span`` selector, raising IndexError on pages with fewer than
    four spans; the lookup is now guarded and falls back to None.
    """
    title = response.xpath(
        "//section[@class='ouvJEz']/h1[@class='_1RuRku']/text()").get()
    # NOTE(review): positional //span lookup is fragile — the author is
    # assumed to be the 4th span on the page. Verify against live markup.
    spans = response.xpath("//span/text()").getall()
    author = spans[3] if len(spans) > 3 else None
    content = response.xpath("//article[@class ='_2rhmJa']").getall()
    item = ArticleItem(
        title=title,
        author=author,
        content=content,
    )
    yield item
def parse_detail(self, response):
    """Extract the article content page into an ArticleItem.

    Fix: the original took ``article_id`` from the raw URL's last path
    segment without stripping the query string, so a tracking URL like
    ``.../p/abc123?utm_source=...`` produced a corrupted id. The query
    string is now removed first, consistent with the other callbacks.
    """
    title = response.xpath("//h1[@class='_1RuRku']/text()").get()
    author = response.xpath("//span[@class='FxYr8x']/a/text()").get()
    img = response.xpath("//a[@class='_1OhGeD']/img/@src").get()
    pub_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
    origin_url = response.url
    # Strip any '?...' tracking suffix before taking the id segment.
    article_id = origin_url.split("?")[0].split("/")[-1]
    content = response.xpath("//article[@class='_2rhmJa']").get()
    items = ArticleItem(
        title=title,
        author=author,
        img=img,
        pub_time=pub_time,
        article_id=article_id,
        content=content,
        origin_url=origin_url,
    )
    yield items
def parse_detail(self, response):
    """Parse an article detail page and yield an ArticleItem.

    Fix: ``like_count`` and ``read_count`` selectors were swapped in the
    original (``views-count`` was assigned to likes and ``likes-count`` to
    reads), contradicting both the inline comments and the sibling
    callbacks in this file. Also guards the None-unsafe publish-time.
    """
    title = response.xpath("//h1[@class='title']/text()").get()  # title
    avatar = response.xpath("//a[@class='avatar']/img/@src").get()  # avatar URL
    author = response.xpath("//span[@class='name']/a/text()").get()  # author
    pub_time = response.xpath(
        "//span[@class='publish-time']/text()").get()  # publish time
    origin_url = response.url
    # The article id is the last path segment once the query string is gone.
    url1 = origin_url.split("?")[0]
    article_id = url1.split("/")[-1]
    content = response.xpath(
        "//div[@class='show-content']").get()  # article body (keeps HTML tags)
    # The counters below are rendered via ajax on the live site.
    word_count = response.xpath(
        "//span[@class='wordage']/text()").get()  # word count
    comment_count = response.xpath(
        "//span[@class='comments-count']/text()").get()  # comment count
    like_count = response.xpath(
        "//span[@class='likes-count']/text()").get()  # like count (fixed)
    read_count = response.xpath(
        "//span[@class='views-count']/text()").get()  # read count (fixed)
    subjects = response.xpath(
        "//div[@class='include-collection']/a/div/text()").getall(
        )  # collection names, returned as a list
    subjects = ",".join(subjects)
    item = ArticleItem(title=title,
                       avatar=avatar,
                       author=author,
                       pub_time=pub_time,
                       origin_url=origin_url,
                       article_id=article_id,
                       content=content,
                       subjects=subjects,
                       word_count=word_count,
                       comment_count=comment_count,
                       read_count=read_count,
                       like_count=like_count)
    yield item
def parse_item(self, response):
    """Parse an article page, stripping HTML tags from the body text.

    Fixes over the original:
    - ``re.match(...).group(1)`` raised AttributeError when the URL did not
      contain a 12-char ``/p/<id>``; now falls back to path-segment parsing.
    - ``re.sub`` was applied to ``content`` even when the selector returned
      None (TypeError); the tag-stripping is now guarded.
    """
    title = response.xpath("//h1[@class='title']/text()").get()
    avatar = response.xpath("//a[@class='avatar']/img/@src").get()
    author = response.xpath("//span[@class='name']/a/text()").get()
    pub_time = response.xpath("//span[@class='publish-time']/text()").get()
    url = response.url
    id_match = re.match(r'.*?/p/([0-9a-z]{12})[\?]*.*?', url)
    # Fall back to the last path segment (query string stripped) when the
    # strict 12-char id pattern does not match.
    article_id = id_match.group(1) if id_match else url.split('?')[0].split('/')[-1]
    content = response.xpath("//div[@class='show-content']").get()
    if content is not None:
        # Crude tag strip: remove every <...> run, then trim whitespace.
        content = re.sub(r"<.*?>", "", content).strip()
    word_count = response.xpath("//span[@class='wordage']/text()").get()
    comment_count = response.xpath(
        "//span[@class='comments-count']/text()").get()
    like_count = response.xpath(
        "//span[@class='likes-count']/text()").get()
    read_count = response.xpath(
        "//span[@class='views-count']/text()").get()
    subjects = ",".join(
        response.xpath(
            "//div[@class='include-collection']/a/div/text()").getall())
    item = ArticleItem(title=title,
                       avatar=avatar,
                       author=author,
                       origin_url=url,
                       pub_time=pub_time,
                       article_id=article_id,
                       content=content,
                       word_count=word_count,
                       comment_count=comment_count,
                       like_count=like_count,
                       read_count=read_count,
                       subjects=subjects)
    yield item
def parse_item(self, response):
    """Parse an article page (new Jianshu markup) into an ArticleItem.

    The numeric counters are parsed inside a try block: when any of the
    fragile class-hash selectors misses, a reduced item (text fields only)
    is yielded instead of dropping the page entirely.

    Fixes over the original:
    - bare ``except:`` (which also swallows SystemExit/KeyboardInterrupt)
      narrowed to ``except Exception``;
    - the millisecond round-trip ``time.localtime(round(time.time()*1000)/1000)``
      simplified to ``time.localtime()`` (same wall-clock result);
    - dead ``pass`` statements removed.
    """
    title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
    avatar = response.xpath('//meta[@property="og:image"]/@content').get()
    author = response.xpath('//span[@class="_22gUMi"]/text()').get()
    # TODO 用正则取有问题,暂时不取了 — the real publish time could not be
    # extracted reliably, so the crawl time is stamped instead.
    url = response.url
    article_id = url.split('?')[0].split('/')[-1]
    content = ''.join(
        response.xpath('//article[@class="_2rhmJa"]//text()').getall())
    pub_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    try:
        like_count = int(
            response.xpath('//span[@class="_1GPnWJ"]/text()').get())
        commit_count = int(
            response.xpath(
                '//div[@class="_3nj4GN"]//span[1]/text()[2]').get())
        word_count = int(
            response.xpath(
                '//div[@class="s-dsoj"]//span[2]/text()').get().replace(
                    ' ', '').replace(',', '').replace('字数', ''))
        subjects = ','.join(
            response.xpath('//span[@class="_2-Djqu"]//text()').getall())
        read_count = int(
            response.xpath(
                '//div[@class="s-dsoj"]//span[3]/text()').get().replace(
                    ' ', '').replace(',', '').replace('阅读', ''))
        item = ArticleItem(title=title,
                           avatar=avatar,
                           author=author,
                           pub_time=pub_time,
                           article_id=article_id,
                           origin_url=url,
                           content=content,
                           like_count=like_count,
                           commit_count=commit_count,
                           word_count=word_count,
                           subjects=subjects,
                           read_count=read_count)
        yield item
    except Exception:
        # Selector miss or int() failure: yield the text-only item.
        print('捕获到异常' + '*' * 30)
        item = ArticleItem(title=title,
                           avatar=avatar,
                           author=author,
                           pub_time=pub_time,
                           article_id=article_id,
                           origin_url=url,
                           content=content)
        yield item
def parse(self, response):
    """Parse one page of a member's article list and follow pagination.

    Yields one ArticleItem per article and a Request for the next page
    until the computed last page is reached.

    Fixes over the original:
    - page count used ``int(n / 9 + 0.5)`` which under-counts (e.g. 19
      articles -> 2 pages instead of 3); replaced with ceiling division;
    - removed large blocks of commented-out dead code and a dead local.
    """
    # Total article count shown in the member's profile header.
    article_count = response.css(
        'body > div.container.person > div > div.col-xs-16.main > div.main-top > div.info > ul > li:nth-child(3) > div > a > p::text'
    ).extract_first()
    # 9 articles per page; ceiling division so a partial page counts.
    count_page_number = -(-int(article_count) // 9)
    for article in response.css('ul.note-list > li'):
        article_item = ArticleItem()
        article_item['author_name'] = article.css('a.blue-link::text').extract_first()
        # Avatar src is protocol-relative in markup, so prefix the scheme.
        article_item['author_image'] = 'http:' + article.css('div.author > a > img::attr(src)').extract_first()
        article_item['article_release_time'] = article.css('div.name > span.time::attr(data-shared-at)').extract_first()
        article_item['article_title'] = article.css('a.title::text').extract_first()
        article_item['article_desc'] = article.css('p.abstract::text').extract_first().strip()
        article_item['article_link'] = JsmemberspiderSpider.jianshu + article.css('div.content > a::attr(href)').extract_first()
        # Read / reply counts: last text node inside each meta anchor.
        article_item['read_count'] = article.css('div.meta > a')[0].css('::text').extract()[-1].strip()
        article_item['reply_count'] = article.css('div.meta > a')[1].css('::text').extract()[-1].strip()
        meta_spans = article.css('div.meta > span')
        article_item['likeit_count'] = meta_spans[0].css('::text').extract_first().strip()
        # The "reward" span is absent when nobody has paid; default to 0.
        article_item['payit_count'] = meta_spans[1].css('::text').extract_first().strip() if len(meta_spans) >= 2 else 0
        JsmemberspiderSpider.article_all.append(article_item)
        yield article_item
    # Pagination: the current page index is encoded in the URL query.
    current_page = int(response.url.split('page=')[1])
    if current_page < count_page_number:
        next_page = JsmemberspiderSpider.start_url.format(current_page + 1)
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
def parse_html(self, response):
    """Fill a JianshuItem from the article page via the item loader."""
    item_loader = ArticleItem(item=JianshuItem(), response=response)
    # Field -> XPath mapping; all fields load the same way.
    field_xpaths = (
        ('title', "//div[@class='_gp-ck']//h1/text()"),
        ('num', "//div[@class='s-dsoj']/span[2]/text()"),
        ('look', "//div[@class='s-dsoj']/span[3]/text()"),
        ('author', "//span[@class='_22gUMi']/text()"),
        ('favor', "//span[@class='_1LOh_5']/text()"),
        ('time', "//time/text()"),
        ('content', "//article[@class='_2rhmJa']//text()"),
    )
    for field_name, xpath_expr in field_xpaths:
        item_loader.add_xpath(field_name, xpath_expr)
    item_loader.add_value('url', response.url)
    yield item_loader.load_item()