Example #1
 def parse(self, response):
     infos = response.css("div.info")
     # Derive the current tag's name from the page <title>.
     title = response.css("title::text").extract_first().replace("\n", "").replace(" ", "").split("/")[0].split(":")[-1]
     for info in infos:
         # Defaults used when a field can't be parsed ("未知出版社" = unknown publisher).
         item = DoubanBookItem(press="未知出版社", year="1000-1", price="0.00", comment_num="0", score="0.0")
         item['tags'] = title
         item['book_name'] = info.css("a::attr(title)").extract_first()
         item['url'] = info.css("a::attr(href)").extract_first()
         data = info.css(".pub::text").extract_first().replace("\n","").replace(" ","")
         item['author'] = data.split('/')[0]
         item['score'] = info.css(".star.clearfix").css(".rating_nums::text").extract_first()
         item['comment_num'] = info.css(".star.clearfix").css(".pl::text").extract_first().replace("\n","").replace(" ","")
         try:
             item['price'] = self.price_re.search(data).group(0)
             item['year'] = self.year_re.search(data).group(0)
             item['press'] = self.press_re.search(data).group(0)
             item['comment_num'] = self.num_re.search(item['comment_num']).group(0)
             item['score'] = self.score_re.search(item['score']).group(0)
         except AttributeError:
             print(item['book_name'] + "资料丢失!")  # a regex found no match ("data missing")
         except TypeError:
             print(item['book_name'] + "缺少键值")  # a selector returned None ("missing value")
         finally:
             yield item
     next_url = response.css(".paginator").css(".next a::attr(href)").extract_first()
     if next_url:
         time.sleep(random.random() * 2 + 1)  # throttle: wait 1-3 s between pages
         yield scrapy.Request(self.join_url(next_url), callback=self.parse)
     elif self.all_tags_url:
         # This tag is exhausted: move on to the next tag listing.
         yield scrapy.Request(self.all_tags_url.pop(0), callback=self.parse)
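Example #1 leans on several spider attributes that are defined elsewhere (price_re, year_re, press_re, num_re, score_re, join_url, all_tags_url). A minimal sketch of what they might look like; the patterns and the spider name are illustrative guesses, not the original definitions:

import re
import scrapy
from urllib.parse import urljoin

class DoubanBookSpider(scrapy.Spider):
    name = "douban_book"  # assumed name, not from the source
    # Illustrative regexes for the fields Example #1 extracts.
    price_re = re.compile(r"\d+\.\d{2}")    # "39.50" out of "39.50元"
    year_re = re.compile(r"\d{4}-\d{1,2}")  # "2017-11"
    press_re = re.compile(r"[^/]*出版社")    # text ending in "出版社" (a press)
    num_re = re.compile(r"\d+")             # digits of "(1234人评价)"
    score_re = re.compile(r"\d\.\d")        # "8.9"

    def join_url(self, path):
        # Resolve a relative next-page link against the site root.
        return urljoin("https://book.douban.com", path)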
Example #2
 def parse(self, response):
     commentSels = scrapy.Selector(
         text=response.text).xpath('//li[@class="comment-item"]')
     for s in commentSels:
         content = s.xpath(
             './div[@class="comment"]/p[@class="comment-content"]/span[@class="short"]/text()'
         ).extract()[0]
         stars = s.xpath(
             './/span[contains(@class,"user-stars")]/@title').extract()
         self.cnt += 1
         item = DoubanBookItem()
         item["idx"] = self.cnt
         # Strip line breaks and swap the ASCII comma/quote for full-width
         # equivalents so the text is safe in a comma-separated dump.
         item["content"] = content.replace('\r', '').replace('\n', '').replace(',', ',').replace('"', '“')
         item["star"] = stars[0] if stars else ""
         yield item
     # Find the "后一页" (next page) button and follow each match.
     pageSels = scrapy.Selector(text=response.text).xpath(
         '//a[@class="page-btn" and contains(text(),"后一页")]/@href')
     for s in pageSels:
         page_tmpl = self.urltmpl + s.extract()
         page_url = page_tmpl % self.bookid
         print(page_url)
         yield scrapy.Request(url=page_url, callback=self.parse)
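The replace chain in Example #2 swaps the ASCII comma and double quote for full-width ones, presumably so the comment text can be dumped straight into a comma-separated file. If the output goes through Python's csv module instead, quoting is handled automatically and the text can stay untouched; a small sketch (file name and row are illustrative):

import csv

# csv.writer escapes commas and quotes by itself.
with open("comments.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow([1, 'He said, "great book"', "力荐"])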
Example #3
 def parse_book(self, response):
     item = DoubanBookItem()
     item['book_name'] = response.css("#wrapper").css("h1").css(
         "span::text").extract_first()
     # Field labels sit in span.pl inside div#info; their values are the
     # text nodes and links that follow them.
     info = response.css("div#info").css("span.pl::text")
     print(info)  # debug: the labels ("作者", "出版社", ...)
     # Dead end kept from the original: <br> is a void element, so
     # "br::text" always yields an empty list.
     info = response.css("div#info").css("br::text")
     print(info)
     item['author'] = response.css("div#info").css(
         "a::text").extract_first()
     # The original snippet breaks off mid-expression here; a hedged
     # completion: take the text node right after the "出版社:" label.
     press = response.xpath('//div[@id="info"]/span[contains(text(), "出版社")]'
                            '/following-sibling::text()[1]').extract_first()
     item['press'] = press.strip() if press else ''
     yield item
Example #4
 def parse1(self, response):
     m = response.xpath('//td[@valign="top"]')[0]
     # Note: the XPaths below start with "//", so they query the whole
     # document and the loop pairs the four result lists up by index.
     for i in range(0, 25):
         book = DoubanBookItem()
         book['name'] = m.xpath('//div[@class="pl2"]/a/@title').extract()[i]
         book['nums'] = m.xpath('//span[@class="pl"]/text()').extract()[i]
         book['ratings'] = m.xpath(
             '//span[@class="rating_nums"]/text()').extract()[i]
         book['author'] = m.xpath('//p[@class="pl"]/text()').extract()[i]
         # A blocking sleep inside a callback stalls Scrapy's event loop;
         # the DOWNLOAD_DELAY setting is the usual way to throttle.
         time.sleep(1)
         yield book
Example #5
	def parse_page(self, response):
		for item in response.xpath('//tr[@class="item"]'):
			book = DoubanBookItem()
			book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
			book['ratings'] = item.xpath('td[2]/div[2]/span[@class="rating_nums"]/text()').extract()[0]
			book_info = item.xpath('td[2]/p[1]/text()').extract()[0]
			book_info_contents = book_info.strip().split(" / ")
			book['author'] = book_info_contents[0]
			# book_info_contents has 4 or 5 segments (5 when a translator is
			# listed), so the trailing fields are taken with negative indices.
			book['publisher'] = book_info_contents[-3]
			book['edition_year'] = book_info_contents[-2]
			book['price'] = book_info_contents[-1]
			yield book
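The comment above notes that book_info_contents comes in 4 or 5 segments (5 when a translator is listed), which is why the trailing fields use negative indices. A hedged helper (not part of the original spider) that also tolerates malformed strings instead of raising IndexError:

def split_pub_info(book_info):
    # "author [/ translator] / publisher / year / price" -> dict.
    parts = [p.strip() for p in book_info.strip().split(" / ")]
    if len(parts) < 4:
        # Malformed entry: keep the author if present, blank the rest.
        return {"author": parts[0] if parts else "",
                "publisher": "", "edition_year": "", "price": ""}
    return {"author": parts[0], "publisher": parts[-3],
            "edition_year": parts[-2], "price": parts[-1]}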
Example #6
	def parse_next(self, response):
		for items in response.xpath('//tr[@class="item"]'):
			book = DoubanBookItem()
			book['name'] = items.xpath('td[2]/div[1]/a/@title').extract()[0]
			# Use a relative path (".//"); a bare "//" would search the whole
			# page and return the first book's rating for every row.
			book['ratings'] = items.xpath('.//span[@class="rating_nums"]/text()').extract()[0]
			book['info'] = items.xpath('td[2]/p[1]/text()').extract()[0]
			yield book  # hand the populated item back to Scrapy
Example #7
 def parse_page(self, response):
     for item in response.xpath('//tr[@class="item"]'):
         book = DoubanBookItem()
         book['a'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
         book['b'] = item.xpath(
             'td[2]/div[2]/span[@class="rating_nums"]/text()').extract()[0]
         # book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
         book_info = item.xpath('td[2]/p[1]/text()').extract()[0]
         book_info_contents = book_info.strip().split(' / ')
         # Fragile: assumes exactly "author / publisher / year / price";
         # a translator segment would shift every field (see Example #10).
         book['c'] = book_info_contents[0]
         book['d'] = book_info_contents[1]
         book['e'] = book_info_contents[2]
         book['f'] = book_info_contents[3]
         yield book
Example #8
    def parse_book(self, response):
        sel = Selector(response=response)
        item = DoubanBookItem()

        item["name"] = sel.xpath(
            "//div[@id = 'wrapper']/h1/span/text()").extract_first().strip()
        item["score"] = sel.xpath(
            "//*[@id='interest_sectl']/div/div[2]/strong/text()"
        ).extract_first().strip()
        item["link"] = response.url

        try:
            contents = sel.xpath(
                "//*[@id='link-report']/div[1]/div/p/text()").extract()
            item["content_description"] = "\n".join(contents).strip()
        except Exception:
            item["content_description"] = ""

        try:
            profiles = sel.xpath(
                "//*[@id='content']/div/div[1]/div[3]/div[2]/div/div/p/text()"
            ).extract()
            item["author_profile"] = "\n".join(profiles).strip()
        except Exception:
            item["author_profile"] = ""

        # Get the book's #info block and flatten it into label/value tokens:
        # strip whitespace, turn tags into separators, split, drop colons.
        infos = response.xpath("//*[@id='info']").extract_first()
        infos = re.sub(r"\s+", "", infos)
        infos = re.sub(r"<.*?>", " ", infos).strip()
        infos = infos.split(" ")
        infos = [
            info.replace(":", "") for info in infos
            if info != "" and info != ":" and info != " "
        ]

        # Each label ("作者" = author, "出版社" = publisher, ...) is
        # immediately followed by its value in the token list.
        inventory = [("author", "作者"), ("press", "出版社"), ("date", "出版年"),
                     ("page", "页数"), ("price", "定价"), ("ISBN", "ISBN")]
        for dict_name, info_name in inventory:
            item[dict_name] = (infos[infos.index(info_name) + 1]
                               if info_name in infos else "")

        return item
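To see what the token pipeline in the second half of Example #8 produces, here is the transformation applied to a small, made-up div#info fragment:

import re

infos = ('<span class="pl">作者:</span> <a href="/author/1">张三</a><br>'
         '<span class="pl">出版社:</span> 人民文学出版社<br>')
infos = re.sub(r"\s+", "", infos)             # 1. drop all whitespace
infos = re.sub(r"<.*?>", " ", infos).strip()  # 2. tags become separators
infos = infos.split(" ")                      # 3. split on those separators
infos = [i.replace(":", "") for i in infos if i not in ("", ":", " ")]
# -> ['作者', '张三', '出版社', '人民文学出版社']
# Every label is immediately followed by its value, which is exactly
# what the inventory lookup (infos.index(label) + 1) relies on.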
Example #9
    def parse(self, response):
        # XPath tutorial: https://www.runoob.com/xpath/xpath-tutorial.html
        # This line grabs the page's list of books.
        lis = response.xpath('//ul[@class="subject-list"]/li')

        # Walk through each book.
        for li in lis:
            # Parse the individual book.
            img = li.xpath('div[1]/a/img').attrib.get('src', '')
            info_attr = li.xpath('div[2]/h2/a').attrib
            href = info_attr.get('href', '')
            title = info_attr.get('title', '')
            about = li.xpath('div[2]/div').css('::text').get(default='').strip()
            rate = li.xpath('div[2]/div[2]/span[2]').css('::text').get()
            rate_count = li.xpath('div[2]/div[2]/span[3]').css(
                '::text').get(default='').strip()
            desc = li.xpath('div[2]/p').css('::text').get()

            body = {
                'img': img,
                'href': href,
                'title': title,
                'about': about,
                'rate': rate,
                'rate_count': rate_count,
                'desc': desc,
            }

            # Scrapy Item, see: https://scrapy-cookbook.readthedocs.io/zh_CN/latest/scrapy-05.html#item
            item = DoubanBookItem()
            for k, v in body.items():
                item[k] = v
            # Yield this item.
            yield item

        # Each tag spans many pages; resolve the next-page link here.
        next_url = response.xpath('//span[@class="next"]/a').attrib.get('href', '')
        if next_url:
            next_url = f'https://book.douban.com{next_url}'

            # Then crawl the next page.
            yield scrapy.Request(next_url, callback=self.parse)
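Example #9 copies the body dict into the item field by field, so DoubanBookItem must declare a matching field for every key. A declaration consistent with this example (the original items.py is not shown):

import scrapy

class DoubanBookItem(scrapy.Item):
    img = scrapy.Field()        # cover image URL
    href = scrapy.Field()       # detail-page URL
    title = scrapy.Field()
    about = scrapy.Field()      # author / press / year line
    rate = scrapy.Field()
    rate_count = scrapy.Field()
    desc = scrapy.Field()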
Example #10
 def parse_page(self, response):
     for item in response.xpath('//tr[@class="item"]'):
         book = DoubanBookItem()
         book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
         book['ratings'] = item.xpath(
             'td[2]/div[2]/span[@class="rating_nums"]/text()').extract()[0]
         # book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
         book_info = item.xpath('td[2]/p[1]/text()').extract()[0]
         book_info_contents = book_info.strip().split(' / ')
         num = len(book_info_contents)
         # 4 segments: author/publisher/year/price; a 5th segment means a
         # translator sits between the author and the publisher.
         if num == 4:
             book['author'] = book_info_contents[0]
             book['publisher'] = book_info_contents[1]
             book['edition_year'] = book_info_contents[2]
             book['price'] = book_info_contents[3]
         elif num == 5:
             book['author'] = book_info_contents[0]
             book['author1'] = book_info_contents[1]
             book['publisher'] = book_info_contents[2]
             book['edition_year'] = book_info_contents[3]
             book['price'] = book_info_contents[4]
         yield book
Example #11
 def parse(self, response):
     nodes = response.xpath('//li[@class="subject-item"]')
     for li in nodes:
         item = DoubanBookItem()
         item['title'] = self.normal(
             li.xpath('./div[2]/h2/a/@title').extract())
         item['author'] = self.normal(
             li.xpath('.//div[@class="pub"]/text()').extract())
         item['star'] = self.normal(
             li.xpath('.//span[@class="rating_nums"]/text()').extract())
         item['comment'] = self.normal(
             li.xpath('.//span[@class="pl"]/text()').extract())
         item['price'] = self.normal(
             li.xpath('.//span[@class="buy-info"]/a/text()').extract())
         item['describe'] = self.normal(
             li.xpath('./div[2]/p/text()').extract())
         yield item
     next_page = response.xpath('//link[@rel="next"]/@href').extract()
     if next_page:
         next_page = next_page[0]
         yield Request('https://book.douban.com' + next_page,
                       callback=self.parse)
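Example #11 routes every extract() result through self.normal, whose definition is not shown. A plausible one-liner, assuming it just collapses the extracted list into a cleaned string:

def normal(self, extracted):
    # Hypothetical helper: first extracted string, stripped; "" if empty.
    return extracted[0].strip() if extracted else ""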
Example #12
    def parse(self, response):
        sel = Selector(response)
        book_list = sel.xpath('//ul[@class="cover-col-4 clearfix"]/li')
        for book_ele in book_list:
            book_item = DoubanBookItem()
            # Cover image URL.
            cover_url = book_ele.xpath(
                './a[@class="cover"]/img/@src').extract()[0]
            # Detail-page URL.
            url = book_ele.xpath('./a[@class="cover"]/@href').extract()[0]
            # Book title.
            book_name = book_ele.xpath(
                './div[@class="detail-frame"]/h2/a/text()').extract()[0]
            # The "author" line actually bundles author, publisher and
            # publish date joined by "/", e.g.
            # "[美] 彼得·布雷瓦 / 后浪丨文化发展出版社 / 2017-11".
            book_author_str = book_ele.xpath(
                './div[@class="detail-frame"]//p[@class="color-gray"]/text()'
            ).extract()[0]
            book_author_array = book_author_str.split("/")
            book_author = book_author_array[0].strip()
            # Publish date.
            publish_time = book_author_array[2].strip()
            # Book description.
            book_detail = book_ele.xpath(
                './div[@class="detail-frame"]//p[@class="detail"]/text()'
            ).extract()[0]
            book_item["cover_url"] = cover_url.strip()
            book_item["url"] = url.strip()
            book_item["book_name"] = book_name.strip()
            book_item["book_author"] = book_author.strip()
            book_item["publish_time"] = publish_time.strip()
            book_item["book_detail"] = book_detail.strip()

            # Visit the detail page to pick up page count and price.
            yield scrapy.Request(url=url,
                                 meta={'book_item': book_item},
                                 callback=self.parse_detail)
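The request above hands the half-filled item to a parse_detail callback that the excerpt does not show. A minimal sketch of such a callback, assuming the detail page carries the usual div#info block; the field names and regexes are illustrative, and it relies on `import re` at module level:

def parse_detail(self, response):
    # Recover the item passed through meta and finish filling it in.
    book_item = response.meta['book_item']
    info = response.xpath('//div[@id="info"]').extract_first() or ''
    pages = re.search(r'页数:</span>\s*(\d+)', info)
    price = re.search(r'定价:</span>([^<]+)<', info)
    book_item['pages'] = pages.group(1) if pages else ''
    book_item['price'] = price.group(1).strip() if price else ''
    yield book_item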
Example #13
    def parse(self, response):

        book = DoubanBookItem()
        if response.status == 200:
            try:

                title = response.xpath(
                    "//div[@id='wrapper']/h1/span/text()").extract()
                link = response.url
                imgurl = response.xpath(
                    "//div[@id='mainpic']/a[@class='nbg']/@href"
                ).extract_first()
                author = response.xpath(
                    "//div[@id='info']/a[1]/text()").extract()
                score = response.xpath(
                    "//div[@id='interest_sectl']/div/div[2]/strong/text()"
                ).extract()
                score_num = response.xpath(
                    "//div[@id='interest_sectl']/div/div[2]/div/div[2]/span/a/span/text()"
                ).extract()
                label = response.xpath("//a[@class='  tag']/text()").extract()
                bookdesc = response.xpath(
                    "//*[@id='link-report']/div[1]/div/p/text()").extract()
                authordesc = response.xpath(
                    "//*[@id='content']/div/div[1]/div[3]/div[2]/div/div/p/text()"
                ).extract()
                infos = response.xpath("//div[@id='info']")

                for info in infos.xpath("./*|./text()"):
                    name = info.xpath("text()").extract_first()
                    # Any element with its own text starts a new field, so
                    # reset the marker before matching the known labels.
                    if name is not None:
                        curType = ""
                    if "出版社:" == name:  # publisher
                        curType = "press"
                        continue
                    elif "出版年:" == name:  # publication year
                        curType = "publishyear"
                        continue
                    elif "页数:" == name:  # page count
                        curType = "pagecount"
                        continue
                    elif "定价:" == name:  # list price
                        curType = "price"
                        continue
                    elif "ISBN:" == name:
                        curType = "isbn"
                        continue

                    span = info.extract()
                    span = span.strip()  # trim surrounding whitespace
                    span = span.replace("\n", "")  # drop newlines
                    span = span.replace("<br>", "")  # drop <br> tags

                    if len(span) != 0:
                        if curType == "press":
                            book['press'] = span
                        elif curType == "publishyear":
                            book['publishyear'] = span
                        elif curType == "pagecount":
                            book['pagecount'] = int(re.sub(
                                "\D", "", span))  #todo 这里限制只获取数字 去掉冒号 单位
                        elif curType == "price":
                            book['price'] = float(
                                re.findall(r"\d+\.?\d*", span)[0])
                        elif curType == "isbn":
                            book['isbn'] = span

                book['title'] = title
                book['link'] = link
                book['imgurl'] = imgurl
                book['author'] = author
                book['score'] = score
                book['label'] = label
                book['authordesc'] = authordesc
                book['bookdesc'] = bookdesc

                yield book

                continueurls = response.xpath(
                    "//div[@id='db-rec-section']/div[@class='content clearfix']/dl/dt/a/@href"
                ).extract()

                for url in continueurls:
                    yield scrapy.Request(url)

            except Exception:
                print('-' * 30 + 'error' + '-' * 30)
        else:
            print('*' * 99)  # non-200 response
Example #14
    def parse_detail_page(self, response):
        # Handle the IP-restriction page served via the 蘑菇 proxy pool: retry.
        if 'navigator.platform' in response.text:
            print("Your IP is restricted.", response.url)
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_detail_page,
                                 dont_filter=True)
            return
        item = DoubanBookItem()
        item['url'] = response.url
        schema = response.xpath(
            "//script[@type='application/ld+json']/text()").extract_first()
        if schema is not None:
            # json.loads rather than eval: JSON-LD can contain true/false/
            # null, which are not Python literals (needs `import json`).
            d = json.loads(schema)
            item['title'] = d.get('name')
            item['isbn'] = d.get('isbn')
            try:
                author = d['author'][0].get('name')
            except (KeyError, IndexError):
                pass
            else:
                item['author'] = author
        info = response.xpath("//div[@id='info']").extract_first()
        info_map = {
            '副标题': 'subtitle',
            '出版年': 'publishing_year',
            '出版社': 'publishing_house',
            '页数': 'page_number',
            '定价': 'price',
        }
        for name, item_name in info_map.items():
            try:
                temp = re.search(rf'{name}:</span>(.*?)<br>', info)
            except TypeError:  # info is None when div#info is missing
                continue
            if temp is not None:
                item[item_name] = temp.group(1).strip()

        rating = response.xpath(
            "//strong[@class='ll rating_num ']/text()").extract_first()
        if rating is not None:
            item['rating'] = rating.strip()
        item['vote_number'] = response.xpath(
            "//span[@property='v:votes']/text()").extract_first()
        item['image'] = response.xpath(
            "//*[@id='mainpic']/a/img/@src").extract_first()

        content_list = response.xpath(
            "//div[@id='link-report']//div[@class='intro']/p/text()"
        ).extract()
        item['content_intro'] = ' '.join(content_list)
        item['author_intro'] = response.xpath(
            "//span[text()='作者简介']/../following-sibling::div[1]//div[@class='intro']/p/text()"
        ).extract_first()
        if response.url is not None:
            match = re.search(r'(\d+)/$', response.url)
            if match:  # guard: some URLs don't end in a numeric book id
                book_id = match.group(1)
                directory_list = response.xpath(
                    f"//div[@id='dir_{book_id}_full']/text()").extract()
                item['directory'] = ';'.join(directory_list)

        recommend_books = response.xpath(
            "//*[@id='db-rec-section']/div/dl/dd/a/text()").extract()
        if len(recommend_books) != 0:
            recommend_books = [book.strip() for book in recommend_books]
        recommend_urls = response.xpath(
            "//*[@id='db-rec-section']/div/dl/dd/a/@href").extract()
        item['douban_recommends'] = list(zip(recommend_books, recommend_urls))
        tags = response.xpath(
            "//div[@id='db-tags-section']//a[@class='  tag']/text()"
        ).extract()
        item["tags"] = ' '.join(tags)
        # Short comments; full reviews are fetched via the requests below.
        item['comments'] = response.xpath(
            "//*[@id='new_score']/ul/li//span[@class='short']/text()"
        ).extract()
        yield item

        m = {
            'url': response.url,
            'title': item.get('title'),
        }
        cid_list = response.xpath(
            '//div[@class="review-list  "]/div/@data-cid').extract()
        headers_review = {
            'X-Requested-With': 'XMLHttpRequest',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Dest': 'empty',
        }
        for cid in cid_list:
            url = f'https://book.douban.com/j/review/{cid}/full'
            yield scrapy.Request(url=url,
                                 callback=self.parse_review_page,
                                 headers=headers_review,
                                 meta={'data': deepcopy(m)})
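parse_review_page itself is not part of the excerpt. A sketch of what it could look like, assuming Douban's /j/review/<cid>/full endpoint answers with JSON whose html field holds the review body (that response shape is an assumption; needs `import json`, and the yielded dict is illustrative):

def parse_review_page(self, response):
    # The meta carries the originating book's url and title (see above).
    data = json.loads(response.text)
    yield {
        'book_url': response.meta['data']['url'],
        'book_title': response.meta['data']['title'],
        'review_html': data.get('html', ''),
    }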