def parse(self, response):
    infos = response.css("div.info")
    # The page <title> looks like "豆瓣图书标签: <tag>/..."; keep only the tag name.
    title = response.css("title::text").extract_first() \
        .replace("\n", "").replace(" ", "").split("/")[0].split(":")[-1]
    for info in infos:
        # Defaults ("未知出版社" = unknown publisher) so a failed regex match still yields a complete item.
        item = DoubanBookItem(press="未知出版社", year="1000-1", price="0.00",
                              comment_num="0", score="0.0")
        item['tags'] = title
        item['book_name'] = info.css("a::attr(title)").extract_first()
        item['url'] = info.css("a::attr(href)").extract_first()
        data = info.css(".pub::text").extract_first().replace("\n", "").replace(" ", "")
        item['author'] = data.split('/')[0]
        item['score'] = info.css(".star.clearfix").css(".rating_nums::text").extract_first()
        item['comment_num'] = info.css(".star.clearfix").css(
            ".pl::text").extract_first().replace("\n", "").replace(" ", "")
        try:
            item['price'] = self.price_re.search(data).group(0)
            item['year'] = self.year_re.search(data).group(0)
            item['press'] = self.press_re.search(data).group(0)
            item['comment_num'] = self.num_re.search(item['comment_num']).group(0)
            item['score'] = self.score_re.search(item['score']).group(0)
        except AttributeError:
            print(item['book_name'] + "资料丢失!")   # "data missing!"
        except TypeError:
            print(item['book_name'] + "缺少键值")     # "missing key"
        finally:
            yield item
    # Pagination: follow the "next" link; when a tag is exhausted, move to the next tag URL.
    next_url = response.css(".paginator").css(".next a::attr(href)").extract_first()
    if next_url:
        time.sleep(random.random() * 2 + 1)
        yield scrapy.Request(self.join_url(next_url), callback=self.parse)
    else:
        yield scrapy.Request(self.all_tags_url[0], callback=self.parse)
        self.all_tags_url.pop(0)
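# The parse() above leans on a few helpers it does not define: precompiled regexes
# (price_re, year_re, press_re, num_re, score_re) and join_url(). The sketch below is
# an assumption about what they might look like, not the original source; the class
# name and the exact patterns are illustrative only.
import re
from urllib.parse import urljoin


class TagSpiderHelpers:
    # Hypothetical patterns matched against the ".pub" line, e.g. "作者/出版社/2017-11/39.00元".
    price_re = re.compile(r"\d+\.\d{2}")      # price such as "39.00"
    year_re = re.compile(r"\d{4}-\d{1,2}")    # publication date such as "2017-11"
    press_re = re.compile(r"[^/]*出版[^/]*")   # the segment that mentions a publisher
    num_re = re.compile(r"\d+")               # digits of the comment count
    score_re = re.compile(r"\d\.\d")          # rating such as "8.6"

    def join_url(self, path):
        # Resolve a relative "next page" href against the book site root.
        return urljoin("https://book.douban.com/", path)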
def parse(self, response):
    commentSels = scrapy.Selector(text=response.text).xpath('//li[@class="comment-item"]')
    for s in commentSels:
        content = s.xpath(
            './div[@class="comment"]/p[@class="comment-content"]/span[@class="short"]/text()'
        ).extract()[0]
        stars = s.xpath('.//span[contains(@class,"user-stars")]/@title').extract()
        self.cnt += 1
        item = DoubanBookItem()
        item["idx"] = self.cnt
        # Normalize for CSV output: drop line breaks and swap ASCII commas/quotes
        # for full-width ones so the delimiter stays intact.
        item["content"] = content.replace('\r', '').replace('\n', '').replace(
            ',', ',').replace('"', '“')
        item["star"] = stars[0] if stars else ""
        yield item
    # Follow the "后一页" (next page) link, if present.
    pageSels = scrapy.Selector(text=response.text).xpath(
        '//a[@class="page-btn" and contains(text(),"后一页")]/@href')
    for s in pageSels:
        page_tmpl = self.urltmpl + s.extract()
        page_url = page_tmpl % self.bookid
        print(page_url)
        yield scrapy.Request(url=page_url, callback=self.parse)
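# The comment parser above reads self.cnt, self.urltmpl and self.bookid, none of which
# are shown in this fragment. A minimal sketch of that spider state, assuming Douban's
# /subject/<id>/comments/ layout; the class name, URL template and book id are
# assumptions, not the original code.
import scrapy


class DoubanBookCommentSpider(scrapy.Spider):
    name = 'douban_book_comments'
    urltmpl = 'https://book.douban.com/subject/%s/comments/'  # %s is filled with bookid
    bookid = '1000000'  # placeholder id; the real spider presumably sets this per run
    cnt = 0             # running index assigned to each scraped comment

    def start_requests(self):
        yield scrapy.Request(url=self.urltmpl % self.bookid, callback=self.parse)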
def parse_book(self, response):
    item = DoubanBookItem()
    item['book_name'] = response.css("#wrapper").css("h1").css("span::text").extract_first()
    # Exploratory debug output of the labels and line breaks inside the #info block.
    info = response.css("div#info").css("span.pl::text")
    print(info)
    info = response.css("div#info").css("br::text")
    print(info)
    item['author'] = response.css("div#info").css("a::text").extract_first()
    # The publisher is the first non-empty text node after the "出版社" label.
    item['press'] = response.xpath(
        '//div[@id="info"]//span[contains(text(), "出版社")]'
        '/following::text()[normalize-space()][1]').extract_first(default='').strip()
    yield item
def parse1(self, response):
    m = response.xpath('//td[@valign="top"]')[0]
    # The page lists 25 books; the selectors below match the whole document, so
    # index i lines the four result lists up with the i-th book.
    for i in range(0, 25):
        book = DoubanBookItem()
        book['name'] = m.xpath('//div[@class="pl2"]/a/@title').extract()[i]
        book['nums'] = m.xpath('//span[@class="pl"]/text()').extract()[i]
        book['ratings'] = m.xpath('//span[@class="rating_nums"]/text()').extract()[i]
        book['author'] = m.xpath('//p[@class="pl"]/text()').extract()[i]
        time.sleep(1)
        yield book
def parse_page(self, response):
    for item in response.xpath('//tr[@class="item"]'):
        book = DoubanBookItem()
        book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
        book['ratings'] = item.xpath(
            'td[2]/div[2]/span[@class="rating_nums"]/text()').extract()[0]
        book_info = item.xpath('td[2]/p[1]/text()').extract()[0]
        book_info_contents = book_info.strip().split(" / ")
        book['author'] = book_info_contents[0]
        # book_info_contents has 4 or 5 segments (5 when a translator is listed),
        # so publisher/year/price are taken with negative indexes.
        book['publisher'] = book_info_contents[-3]
        book['edition_year'] = book_info_contents[-2]
        book['price'] = book_info_contents[-1]
        yield book
def parse_next(self, response):
    # Scratch XPaths noted while inspecting the page:
    # //*[@id="content"]/div/div[1]/div/table[1]/tbody/tr/td[2]/div[1]/a
    # //*[@id="content"]/div/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/a
    # //*[@id="content"]/div/div[1]/div/table[1]
    for items in response.xpath('//tr[@class="item"]'):
        book = DoubanBookItem()
        book['name'] = items.xpath('td[2]/div[1]/a/@title').extract()[0]
        # Use a relative XPath here; an absolute //span query would always return
        # the first rating on the page instead of this row's rating.
        book['ratings'] = items.xpath('.//span[@class="rating_nums"]/text()').extract()[0]
        book['info'] = items.xpath('td[2]/p[1]/text()').extract()[0]
        yield book  # return the item
def parse_page(self, response):
    for item in response.xpath('//tr[@class="item"]'):
        book = DoubanBookItem()
        book['a'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]        # title
        book['b'] = item.xpath(
            'td[2]/div[2]/span[@class="rating_nums"]/text()').extract()[0]  # rating
        # book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
        book_info = item.xpath('td[2]/p[1]/text()').extract()[0]
        # Assumes the info line has exactly 4 segments: author / publisher / year / price.
        book_info_contents = book_info.strip().split(' / ')
        book['c'] = book_info_contents[0]   # author
        book['d'] = book_info_contents[1]   # publisher
        book['e'] = book_info_contents[2]   # edition year
        book['f'] = book_info_contents[3]   # price
        yield book
def parse_book(self, response):
    sel = Selector(response=response)
    item = DoubanBookItem()
    item["name"] = sel.xpath(
        "//div[@id = 'wrapper']/h1/span/text()").extract_first().strip()
    item["score"] = sel.xpath(
        "//*[@id='interest_sectl']/div/div[2]/strong/text()").extract_first().strip()
    item["link"] = response.url
    try:
        contents = sel.xpath("//*[@id='link-report']/div[1]/div/p/text()").extract()
        item["content_description"] = "\n".join(contents).strip()
    except:
        item["content_description"] = ""
    try:
        profiles = sel.xpath(
            "//*[@id='content']/div/div[1]/div[3]/div[2]/div/div/p/text()").extract()
        item["author_profile"] = "\n".join(profiles).strip()
    except:
        item["author_profile"] = ""
    # Get the book's #info block and process the raw HTML into a token list.
    infos = response.xpath("//*[@id='info']").extract_first()
    infos = re.sub(r"\s+", "", infos)             # drop all whitespace
    infos = re.sub(r"<.*?>", " ", infos).strip()  # strip tags, keep a separator
    infos = infos.split(" ")
    infos = [
        info.replace(":", "") for info in infos
        if info != "" and info != ":" and info != " "
    ]
    # Extract each field by locating its label token and taking the token after it.
    inventory = [("author", "作者"), ("press", "出版社"), ("date", "出版年"),
                 ("page", "页数"), ("price", "定价"), ("ISBN", "ISBN")]
    for dict_name, info_name in inventory:
        item[dict_name] = infos[infos.index(info_name) + 1] if info_name in infos else ""
    return item
def parse(self, response):
    # XPath tutorial: https://www.runoob.com/xpath/xpath-tutorial.html
    # The line below grabs the list of books on the page.
    lis = response.xpath('//ul[@class="subject-list"]/li')
    # Walk through each book.
    for li in lis:
        # Parse the individual book entry.
        img = li.xpath('div[1]/a/img').attrib.get('src', '')
        info_attr = li.xpath('div[2]/h2/a').attrib
        href = info_attr.get('href', '')
        title = info_attr.get('title', '')
        about = li.xpath('div[2]/div').css('::text').get().strip()
        rate = li.xpath('div[2]/div[2]/span[2]').css('::text').get()
        rate_count = li.xpath('div[2]/div[2]/span[3]').css('::text').get().strip()
        desc = li.xpath('div[2]/p').css('::text').get()
        body = {
            'img': img,
            'href': href,
            'title': title,
            'about': about,
            'rate': rate,
            'rate_count': rate_count,
            'desc': desc,
        }
        # Scrapy Item, see: https://scrapy-cookbook.readthedocs.io/zh_CN/latest/scrapy-05.html#item
        item = DoubanBookItem()
        for k, v in body.items():
            item[k] = v
        # Hand the item back to the engine.
        yield item
    # Each tag spans many pages; resolve the next page's address here.
    next_url = response.xpath('//span[@class="next"]/a').attrib.get('href', '')
    if next_url:
        next_url = f'https://book.douban.com{next_url}'
        # Then crawl the next page.
        yield scrapy.Request(next_url, callback=self.parse)
def parse_page(self, response):
    for item in response.xpath('//tr[@class="item"]'):
        book = DoubanBookItem()
        book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
        book['ratings'] = item.xpath(
            'td[2]/div[2]/span[@class="rating_nums"]/text()').extract()[0]
        # book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
        book_info = item.xpath('td[2]/p[1]/text()').extract()[0]
        book_info_contents = book_info.strip().split(' / ')
        num = len(book_info_contents)
        if num == 4:
            book['author'] = book_info_contents[0]
            book['publisher'] = book_info_contents[1]
            book['edition_year'] = book_info_contents[2]
            book['price'] = book_info_contents[3]
        elif num == 5:
            book['author'] = book_info_contents[0]
            book['author1'] = book_info_contents[1]
            book['publisher'] = book_info_contents[2]
            book['edition_year'] = book_info_contents[3]
            book['price'] = book_info_contents[4]
        yield book
def parse(self, response):
    nodes = response.xpath('//li[@class="subject-item"]')
    for li in nodes:
        item = DoubanBookItem()
        item['title'] = self.normal(li.xpath('./div[2]/h2/a/@title').extract())
        item['author'] = self.normal(li.xpath('.//div[@class="pub"]/text()').extract())
        item['star'] = self.normal(li.xpath('.//span[@class="rating_nums"]/text()').extract())
        item['comment'] = self.normal(li.xpath('.//span[@class="pl"]/text()').extract())
        item['price'] = self.normal(li.xpath('.//span[@class="buy-info"]/a/text()').extract())
        item['describe'] = self.normal(li.xpath('./div[2]/p/text()').extract())
        yield item
    next_page = response.xpath('//link[@rel="next"]/@href').extract()
    if next_page:
        next_page = next_page[0]
        yield Request('https://book.douban.com' + next_page, callback=self.parse)
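# The parse() above funnels every extract() result through self.normal(), which is not
# defined in this fragment. A minimal sketch under the assumption that it simply
# collapses the extracted list into one cleaned string (empty when nothing matched);
# the class name is illustrative.
class NormalizeHelper:
    @staticmethod
    def normal(extracted):
        # extract() returns a list of strings; keep the first one, stripped.
        return extracted[0].strip() if extracted else ''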
def parse(self, response):
    sel = Selector(response)
    book_list = sel.xpath('//ul[@class="cover-col-4 clearfix"]/li')
    for book_ele in book_list:
        book_item = DoubanBookItem()
        # Cover image URL.
        cover_url = book_ele.xpath('./a[@class="cover"]/img/@src').extract()[0]
        # URL of the book's detail page.
        url = book_ele.xpath('./a[@class="cover"]/@href').extract()[0]
        # Book title.
        book_name = book_ele.xpath('./div[@class="detail-frame"]/h2/a/text()').extract()[0]
        # Author line. It actually bundles author, publisher and publication date,
        # e.g. "[美] 彼得·布雷瓦 / 后浪丨文化发展出版社 / 2017-11", joined with "/".
        book_author_str = book_ele.xpath(
            './div[@class="detail-frame"]//p[@class="color-gray"]/text()').extract()[0]
        book_author_array = book_author_str.split("/")
        book_author = book_author_array[0].strip()
        # Publication date.
        publish_time = book_author_array[2].strip()
        # Book blurb.
        book_detail = book_ele.xpath(
            './div[@class="detail-frame"]//p[@class="detail"]/text()').extract()[0]
        book_item["cover_url"] = cover_url.strip()
        book_item["url"] = url.strip()
        book_item["book_name"] = book_name.strip()
        book_item["book_author"] = book_author.strip()
        book_item["publish_time"] = publish_time.strip()
        book_item["book_detail"] = book_detail.strip()
        # Visit the detail page to pick up the page count and price.
        yield scrapy.Request(url=url, meta={'book_item': book_item},
                             callback=self.parse_detail)
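# parse() above defers the page count and price to a parse_detail callback that is not
# shown. A minimal sketch of what it might do, assuming the detail page's #info block
# and hypothetical item fields named 'pages' and 'price'; the regexes accept either an
# ASCII or a full-width colon after the label.
import re


def parse_detail(self, response):
    book_item = response.meta['book_item']
    info = response.xpath('//div[@id="info"]').extract_first() or ''
    pages = re.search(r'页数[::]</span>\s*(\d+)', info)
    price = re.search(r'定价[::]</span>\s*([^<]+)<', info)
    book_item['pages'] = pages.group(1) if pages else ''
    book_item['price'] = price.group(1).strip() if price else ''
    yield book_item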
def parse(self, response):
    book = DoubanBookItem()
    if response.status == 200:
        try:
            title = response.xpath("//div[@id='wrapper']/h1/span/text()").extract()
            link = response.url
            imgurl = response.xpath(
                "//div[@id='mainpic']/a[@class='nbg']/@href").extract_first()
            author = response.xpath("//div[@id='info']/a[1]/text()").extract()
            score = response.xpath(
                "//div[@id='interest_sectl']/div/div[2]/strong/text()").extract()
            # Collected but not stored on the item.
            score_num = response.xpath(
                "//div[@id='interest_sectl']/div/div[2]/div/div[2]/span/a/span/text()"
            ).extract()
            label = response.xpath("//a[@class=' tag']/text()").extract()
            bookdesc = response.xpath(
                "//*[@id='link-report']/div[1]/div/p/text()").extract()
            authordesc = response.xpath(
                "//*[@id='content']/div/div[1]/div[3]/div[2]/div/div/p/text()").extract()
            # Walk the children and text nodes of #info: a label such as "出版社:"
            # decides which field the following text node belongs to.
            infos = response.xpath("//div[@id='info']")
            curType = ""  # initialized up front so the first text node cannot hit an undefined name
            for info in infos.xpath("./*|./text()"):
                name = info.xpath("text()").extract_first()
                if name is not None:
                    curType = ""
                    if "出版社:" == name:
                        curType = "press"
                        continue
                    elif "出版年:" == name:
                        curType = "publishyear"
                        continue
                    elif "页数:" == name:
                        curType = "pagecount"
                        continue
                    elif "定价:" == name:
                        curType = "price"
                        continue
                    elif "ISBN:" == name:
                        curType = "isbn"
                        continue
                span = info.extract()
                span = span.strip()              # strip spaces
                span = span.replace("\n", "")    # strip newlines
                span = span.replace("<br>", "")  # strip <br> tags
                if len(span) != 0:
                    if curType == "press":
                        book['press'] = span
                    elif curType == "publishyear":
                        book['publishyear'] = span
                    elif curType == "pagecount":
                        # TODO: keep only the digits, dropping the colon and the unit
                        book['pagecount'] = int(re.sub(r"\D", "", span))
                    elif curType == "price":
                        book['price'] = float(re.findall(r"\d+\.?\d*", span)[0])
                    elif curType == "isbn":
                        book['isbn'] = span
            book['title'] = title
            book['link'] = link
            book['imgurl'] = imgurl
            book['author'] = author
            book['score'] = score
            book['label'] = label
            book['authordesc'] = authordesc
            book['bookdesc'] = bookdesc
            yield book
            # Queue the "readers also enjoy" recommendations for crawling.
            continueurls = response.xpath(
                "//div[@id='db-rec-section']/div[@class='content clearfix']/dl/dt/a/@href"
            ).extract()
            for url in continueurls:
                yield scrapy.Request(url)
        except:
            print('-' * 30 + 'error' + '-' * 30)
    else:
        print('*' * 99)
def parse_detail_page(self, response):
    # Handle the Mogu proxy returning a blocked-IP interstitial instead of the page.
    if 'navigator.platform' in response.text:
        print("Your IP is restricted.", response.url)
        yield scrapy.Request(url=response.url,
                             callback=self.parse_detail_page,
                             dont_filter=True)
        return
    item = DoubanBookItem()
    item['url'] = response.url
    schema = response.xpath(
        "//script[@type='application/ld+json']/text()").extract_first()
    if schema is not None:
        d = eval(schema)  # structured book metadata; json.loads would be the safer choice here
        item['title'] = d.get('name')
        item['isbn'] = d.get('isbn')
        try:
            author = d['author'][0].get('name')
        except IndexError:
            pass
        else:
            item['author'] = author
    info = response.xpath("//div[@id='info']").extract_first()
    info_map = {
        '副标题': 'subtitle',
        '出版年': 'publishing_year',
        '出版社': 'publishing_house',
        '页数': 'page_number',
        '定价': 'price',
    }
    for name, item_name in info_map.items():
        try:
            temp = re.search(rf'{name}:</span>(.*?)<br>', info)
        except:
            continue
        if temp is not None:
            item[item_name] = temp.group(1).strip()
    rating = response.xpath(
        "//strong[@class='ll rating_num ']/text()").extract_first()
    if rating is not None:
        item['rating'] = rating.strip()
    item['vote_number'] = response.xpath(
        "//span[@property='v:votes']/text()").extract_first()
    item['image'] = response.xpath("//*[@id='mainpic']/a/img/@src").extract_first()
    content_list = response.xpath(
        "//div[@id='link-report']//div[@class='intro']/p/text()").extract()
    item['content_intro'] = ' '.join(content_list)
    item['author_intro'] = response.xpath(
        "//span[text()='作者简介']/../following-sibling::div[1]//div[@class='intro']/p/text()"
    ).extract_first()
    if response.url is not None:
        book_id = re.search(r'(\d+)/$', response.url).group(1)
        directory_list = response.xpath(
            f"//div[@id='dir_{book_id}_full']/text()").extract()
        item['directory'] = ';'.join(directory_list)
    recommend_books = response.xpath(
        "//*[@id='db-rec-section']/div/dl/dd/a/text()").extract()
    if len(recommend_books) != 0:
        recommend_books = [book.strip() for book in recommend_books]
        recommend_urls = response.xpath(
            "//*[@id='db-rec-section']/div/dl/dd/a/@href").extract()
        item['douban_recommends'] = list(zip(recommend_books, recommend_urls))
    tags = response.xpath(
        "//div[@id='db-tags-section']//a[@class=' tag']/text()").extract()
    item["tags"] = ' '.join(tags)
    # Short comments and reviews.
    item['comments'] = response.xpath(
        "//*[@id='new_score']/ul/li//span[@class='short']/text()").extract()
    yield item
    m = {
        'url': response.url,
        'title': item.get('title'),
    }
    cid_list = response.xpath(
        '//div[@class="review-list "]/div/@data-cid').extract()
    headers_review = {
        'X-Requested-With': 'XMLHttpRequest',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
    }
    # Each review's full text lives behind an XHR endpoint keyed by its cid.
    for cid in cid_list:
        url = f'https://book.douban.com/j/review/{cid}/full'
        yield scrapy.Request(url=url,
                             callback=self.parse_review_page,
                             headers=headers_review,
                             meta={'data': deepcopy(m)})
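# parse_detail_page() above schedules requests to /j/review/<cid>/full with a
# parse_review_page callback that is not included here. A minimal sketch, assuming the
# endpoint answers with JSON whose 'html' field carries the full review body; the
# yielded dict keys are illustrative, not the original item fields.
import json

from w3lib.html import remove_tags


def parse_review_page(self, response):
    meta = response.meta['data']                      # book url/title carried over from the detail page
    data = json.loads(response.text)
    review_text = remove_tags(data.get('html', ''))   # drop the markup, keep the text
    yield {
        'book_url': meta['url'],
        'book_title': meta['title'],
        'review': review_text.strip(),
    }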