def parse(self, response):
    # Extract the data: each book lives in an <article class="product_pod">
    books = response.css('article.product_pod')
    for book in books:
        item = BookItem()
        # Book title
        # name = book.css('h3 a::text').extract_first()
        name = book.xpath('./h3/a/@title').extract_first()
        # Book price
        # price = book.css('p.price_color::text').extract_first()
        price = book.xpath('./div[@class="product_price"]/p/text()').extract_first()
        # Cover image URL
        imgUrl = book.xpath('./div[@class="image_container"]/a/img/@src').extract_first()
        item['name'] = name
        item['price'] = price
        item['imgUrl'] = 'http://books.toscrape.com/' + imgUrl
        yield item
    # Extract the next-page link
    next_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
    # next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
    if next_url:
        # Build the absolute next-page URL
        next_url = response.urljoin(next_url)
        # Issue a new Request for it
        yield scrapy.Request(next_url, callback=self.parse)
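# A minimal sketch of the BookItem used above, assuming only the three fields this
# spider fills (name, price, imgUrl); the project's real items.py may define more.
import scrapy


class BookItem(scrapy.Item):
    name = scrapy.Field()    # book title
    price = scrapy.Field()   # listed price text
    imgUrl = scrapy.Field()  # absolute cover-image URL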
def parse_chapter(self, response):
    item = BookItem()
    item['c_title'] = response.css('h1::text').extract_first()
    item['content'] = response.xpath('//*[@id="content"]/p/text()').extract()
    yield item
def parse(self, response):
    try:
        node_list = response.xpath("//div[@class='book-info']")
        for node in node_list:
            item = BookItem()
            item['book_name'] = node.xpath("./h3/a/text()").extract()[0]
            item['book_type'] = node.xpath("./p[1]/span[1]/text()").extract()[0]
            item['book_stat'] = node.xpath("./p[1]/span[2]/text()").extract()[0]
            item['book_author'] = node.xpath("./p[1]/span[3]/text()").extract()[0]
            yield item
        if self.pageNum != 16:
            self.pageNum += 1
            yield scrapy.Request(self.baseURL + str(self.pageNum), callback=self.parse)
    except Exception as e:
        print(e)
def detil(self, response):
    image_url = response.xpath("//div[@class='detail']/a/img/@src").extract_first()
    book_name = response.xpath("//div[@class='detail']/div/h1/text()").extract_first()
    bool_jianjie = response.xpath("//div[@class='detail']/div/div/div/text()").extract_first()
    bool_jianjie = bool_jianjie.strip()
    read_url1 = response.xpath("//a[@class='reader']/@href").extract_first()
    book_type = response.xpath("//a[@class='c009900']/text()").extract_first()
    booker = response.xpath("//dl[@class='bookso']/dd/text()").extract_first()
    try:
        str_list = read_url1.split('/')
        book_id = str_list[-1]
    except:
        book_id = 403
    r = BookItem(image_url=image_url, book_name=book_name, bool_jianjie=bool_jianjie,
                 book_id=book_id, book_type=book_type, booker=booker)
    yield r
    text = scrapy.Request(url=read_url1, callback=self.read1, meta={'book_id': book_id})
    yield text
def parse(self, response):
    # Top-level category groups; the page holds too much data, so only the
    # fourth div (the fiction categories) is crawled
    div_list = response.xpath("//div[@class='con flq_body']/div[4]")
    for div in div_list:
        item = BookItem()
        item["b_cate"] = div.xpath("./dl/dt//text()").extract()
        item["b_cate"] = [i.strip() for i in item["b_cate"] if len(i.strip()) > 0]
        # Mid-level category groups
        dl_list = div.xpath("./div//dl[@class='inner_dl']")
        for dl in dl_list:
            item["m_cate"] = dl.xpath("./dt//text()").extract()
            item["m_cate"] = [i.strip() for i in item["m_cate"] if len(i.strip()) > 0][0]
            # Small category groups
            a_list = dl.xpath("./dd/a")
            for a in a_list:
                item["s_href"] = a.xpath("./@href").extract_first()
                item["s_cate"] = a.xpath("./text()").extract_first()
                if item["s_href"] is not None:
                    yield scrapy.Request(item["s_href"],
                                         callback=self.parse_book_list,
                                         meta={"item": deepcopy(item)})
def parse_item(self, response):
    print('bar')
    item = BookItem()
    name_info = response.xpath('//div[@class="name_info"]')
    item['title'] = name_info.xpath('./h1/@title').extract()[0]
    item['intro'] = name_info.xpath('./h2/span/@title').extract()[0]
    messbox_info = response.xpath('//div[@class="messbox_info"]')
    item['author'] = messbox_info.xpath('.//span[@id="author"]/a/text()').extract()[0]
    item['publisher'] = messbox_info.xpath('.//span[@ddt-area="003"]/a/text()').extract()[0]
    item['star'] = messbox_info.xpath('.//span[@class="star"]/@style').extract()[0]
    item['image'] = response.xpath('//div[@class="pic_info"]//img/@src').extract()[0]
    item['price'] = response.xpath('//div[@class="price_pc"]//p/text()').extract()[0]
    pro_content = response.xpath('//div[@class="pro_content"]')
    item['ISBN'] = pro_content.xpath('./ul[@class="key clearfix"]/li/text()').extract()[9]
    item['tag0'] = pro_content.xpath('.//span[@class="lie"]/a/text()').extract()[0]
    item['tag1'] = pro_content.xpath('.//span[@class="lie"]/a/text()').extract()[1]
    item['tag2'] = pro_content.xpath('.//span[@class="lie"]/a/text()').extract()[2]
    item['link'] = response.url
    item['web'] = 'Dangdang'
    self.items.append(item)
    return self.items
def parse(self, response):
    books = response.xpath('//ul[@class="bang_list"]/li')
    # Follow the detail link of every book on the bestseller list page
    urls = books.xpath('.//div[@class="pic"]/a/@href').extract()
    for url in urls:
        print('foo {}'.format(url))
        yield scrapy.Request(url=url, callback=self.parse_item)
def parse(self, response):
    '''
    start_requests has already fetched the page; this method defines how to
    extract the content we want from it. Here nothing is really extracted yet --
    the page is simply saved -- extraction with XPath, regex or CSS selectors
    comes later. The example just shows Scrapy's flow:
    1. define the links;
    2. crawl (download) the pages from those links;
    3. define rules, then extract the data.
    That's the whole flow -- simple, isn't it?
    '''
    meta = response.meta
    filename = response.xpath('//div[@class="bookname"]/h1/text()').extract_first()
    if filename is not None:
        # Strip the "正文" ("main text") prefix from the chapter title
        filename = filename.replace("正文", "").strip()
    else:
        return
    if 'next' in meta:
        next = meta['next']
    else:
        next = 0
    if 'url' in meta:
        url = meta['url']
    else:
        url = ''
    # Save the data
    item = BookItem()
    item['url'] = url
    item['title'] = filename
    contents = response.xpath('//div[@id="content"]/text()').extract()
    content = ''
    for it in contents:
        if it == '\r\n':
            continue
        content += it + '\r\n'
    item['content'] = content
    yield item
    # Do not look for the next page automatically; stop here
    if next == 0:
        return
    taga = response.xpath('//div[@class="bottem1"]/a')
    # Extract the next-chapter link with XPath
    next_page = ''
    for a in taga:
        ts = a.xpath('text()').extract_first()
        if ts == '下一章':  # link text meaning "next chapter"
            next_page = a.xpath('@href').extract_first()
            break
    logging.info(next_page)
    if next_page.strip() != '':
        # A next page exists
        next_page = response.urljoin(next_page)
        # Requesting too fast makes the site return failure pages
        yield scrapy.Request(next_page, meta={"url": next_page, "next": 1}, callback=self.parse)
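# The last comment above notes that requesting too quickly makes the site answer with
# failure pages. A minimal per-spider settings sketch (an assumption, not part of the
# original spider) that slows the crawl down with Scrapy's built-in throttling:
custom_settings = {
    'DOWNLOAD_DELAY': 1.0,         # wait about one second between requests
    'AUTOTHROTTLE_ENABLED': True,  # back off further when the site slows down
}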
def parse_page1(self, response):
    '''
    What you need to know is that the item behaves like a dictionary.
    '''
    item = BookItem()
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request
    '''For example, when crawling products on Taobao, the first-level parse gets the
    title (title) and price (price), and the remaining fields can be filled in by the
    next callback, which receives the item through request.meta.'''
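# The docstring above describes carrying a partially filled item to the next page
# through request.meta. A minimal sketch of the matching callback, assuming the
# second page only adds the remaining fields before yielding (the commented-out
# 'price' XPath is illustrative, not taken from the original spider):
def parse_page2(self, response):
    item = response.meta['item']  # the BookItem passed along by parse_page1
    # item['price'] = response.xpath('//span[@class="price"]/text()').extract_first()
    yield item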
def parse(self, response):
    for article in response.xpath("//article[@class = 'product_pod']"):
        loader = ItemLoader(item=BookItem(), selector=article)
        img_link = article.xpath(".//div[@class = 'image_container']/a/img/@src").get()
        absolute_url = response.urljoin(img_link)
        loader.add_value('image_urls', absolute_url)
        loader.add_xpath('book_name', './/h3/a/@title')
        yield loader.load_item()
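# The loader above stores cover links under 'image_urls', the field name that Scrapy's
# ImagesPipeline expects. A minimal settings sketch, assuming that pipeline is what
# downloads the covers (the storage path is illustrative):
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = 'images'  # local directory where downloaded covers are written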
def parse(self, response):
    title_xpath = "//div[@class='bookname']/h1/text()"
    body_xpath = "//div[@id='content']/text()"
    next_page_xpath = "//div[@class='bottem1']/a[3]/@href"
    body = response.body.decode('gbk')
    item = BookItem()
    item["title"] = Selector(text=body).xpath(title_xpath).extract()[0]
    item["body"] = '\n'.join(Selector(text=body).xpath(body_xpath).extract()[1:])
    yield item
    url = Selector(text=body).xpath(next_page_xpath).extract()[0]
    yield scrapy.Request("https://www.23txt.com" + url)
def parse_content(self, response, **kwargs):
    item = BookItem()
    soup = BeautifulSoup(response.text, "lxml")
    same_name = response.cb_kwargs.get('book_name')
    item['book_name'] = same_name
    item['chapter_title'] = soup.find('font', attrs={'size': '4'}).text
    item['chapter_content'] = soup.find('td', attrs={'width': '820'}).text
    yield item
    next_page = soup.find('td', attrs={'width': '28%'}).find('a')
    if next_page:
        yield response.follow(next_page.get('href'),
                              callback=self.parse_content,
                              cb_kwargs={"book_name": same_name})
def parse(self, response):
    item = BookItem()
    sel = Selector(response)
    imgs = sel.xpath('//*[@class="doulist-item"]')
    item['url'] = []
    item['name'] = []
    for img in imgs:
        site = img.xpath('div/div[2]/div[2]/a/img/@src').extract_first()
        img_name = img.xpath('div/div[2]/div[3]/a/text()').extract_first()
        img_name = img_name.split()[0]
        item['url'].append(site)
        item['name'].append(img_name)
    yield item
def parse(self, response):
    dt_list = response.xpath("//div[@class='mc']/dl/dt")  # list of top-level categories
    for dt in dt_list:
        item = BookItem()
        item["b_cate"] = dt.xpath("./a/text()").extract_first()
        em_list = dt.xpath("./following-sibling::dd[1]/em")  # list of sub-categories
        for em in em_list:
            item["s_cate"] = em.xpath("./a/text()").extract_first()
            item["s_href"] = em.xpath("./a/@href").extract_first()
            if item["s_href"] is not None:
                item["s_href"] = "https:" + item["s_href"]
                yield scrapy.Request(item["s_href"],
                                     callback=self.parse_book_list,
                                     meta={"item": deepcopy(item)})
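# Several of the category spiders above hand their item to self.parse_book_list via
# meta={"item": ...} without showing that callback. A minimal sketch, assuming it
# copies the category fields onto one item per book; the list XPath and the
# 'book_name' field are illustrative assumptions, not taken from the original project:
from copy import deepcopy


def parse_book_list(self, response):
    item = response.meta['item']
    for book in response.xpath('//li[@class="book-item"]'):  # assumed list selector
        book_item = deepcopy(item)
        book_item['book_name'] = book.xpath('.//a/@title').extract_first()  # assumed field
        yield book_item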
def get_chapterurl(self, response):
    item = BookItem()
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    soup = BeautifulSoup(response.text, 'lxml')  # parse once and reuse
    category = soup.find('table').find('a').get_text()
    author = soup.find('table').find_all('td')[1].get_text()
    bash_url = soup.find('p', class_='btnlinks').find('a', class_='read')['href']
    name_id = str(bash_url)[-6:-1].replace('/', '')
    item['category'] = str(category).replace('/', '')
    item['author'] = str(author).replace('/', '')
    item['name_id'] = name_id
    return item
def parse(self, response):
    nodes = response.xpath('//div[@class="article"]//tr[@class="item"]')
    for node in nodes:
        item = BookItem()
        name = node.xpath('td[2]/div[1]/a/text()').extract_first().strip()
        summary = node.xpath('td[2]/p[2]/span/text()').extract_first()
        item['name'] = name
        item['summary'] = summary
        yield item
    next_urls = response.xpath('//div[@class="paginator"]//span[@class="next"]/a/@href').extract_first()
    if next_urls:
        yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)
def parse_content(self, response):
    '''Extract the chapter content.'''
    arr = BookItem()
    arr['list_name'] = response.meta['list_name']
    arr['list_url'] = response.meta['list_url']
    arr['num'] = response.meta['num']
    arr['desc'] = response.meta['desc']
    arr['file_name'] = response.meta['file_name']
    title = response.xpath('//h2/text()').extract_first()
    content = response.xpath('//div[@id="box"]//p[@class="Text"]/text()').extract()
    arr['content'] = "\n".join(content)
    arr['title'] = title
    yield arr
def parse(self, response):
    if 'cid' in response.url:
        urls = response.xpath('//a[@class="a_7 fl"]/@href').extract()
        for url in urls:
            yield scrapy.Request(url)
    elif 'view' in response.url:
        item = BookItem()
        item['name'] = response.xpath('//h2[@class="h_10"]/text()').extract()[0].replace('\t', '')
        item['price'] = response.xpath('//h2[@class="h_10"]/span/text()').extract()[0]
        item['isbn'] = response.xpath('//div[@class="div_47 fix"]/span/text()').extract()[1].replace('ISBN:', '')
        item['url'] = response.url
        yield item
def parse(self, response):
    li_list = response.xpath('//ul[@class="ulwrap"]/li')
    for li in li_list:
        item = BookItem()
        item['b_cate'] = li.xpath('./div[1]/a/text()').extract_first()  # top-level category name
        a_list = li.xpath('./div[2]/a')
        for a in a_list:
            item['s_href'] = a.xpath("./@href").extract_first()   # sub-category link
            item["s_cate"] = a.xpath('./text()').extract_first()  # sub-category name
            if item["s_href"] is not None:
                item["s_href"] = "http://snbook.suning.com" + item["s_href"]  # make the link absolute
                yield scrapy.Request(
                    url=item['s_href'],
                    callback=self.parse_book_list,
                    meta=deepcopy(item)
                )
def parse_book(self, response):
    book = BookItem()
    item = response.css('div.product_main')
    book['name'] = item.css('div.product_main h1::text').extract_first()
    book['price'] = item.css('p.price_color::text').extract_first()
    book['review_rating'] = item.css('p.star-rating::attr(class)').re_first('star-rating ([A-Za-z]+)')
    book['upc'] = response.css('table.table-striped tr:nth-child(1) td::text').extract_first()
    book['stock'] = response.css('table.table-striped tr:nth-child(6) td::text').re_first(r'In stock \((\d+) available\)')
    book['review_num'] = response.css('table.table-striped tr:nth-child(7) td::text').extract_first()
    yield book
def parse_list(self, response):
    item = BookItem()
    content_title = response.css('div.cont_title')
    item['title'] = content_title.css('h1::text').extract_first()
    item['author'] = content_title.css('div.hslice>p.entry-title>a::text').extract_first()
    item['thumb'] = content_title.css('div.hslice>div.thumb>img::attr(src)').extract_first()
    details = response.css('div#book_detail')
    details.remove(details[0])  # drop the "latest chapters" block
    chapters = details.css('ol>li')
    for c_list in chapters:
        c_link = c_list.css('a::attr(href)').extract_first()
        yield response.follow(c_link, callback=self.parse_chapter)
def parse(self, response):
    print("-" * 10)
    # print("hello, world")
    item = BookItem()
    item['title'] = response.xpath("//a[@class='pic']/@title").extract()
    item['price'] = response.xpath("//span[@class='search_now_price']/text()").extract()
    item['pic'] = response.xpath("//a[@class='pic']/img/@data-original").extract()
    item['author'] = response.xpath("//a[@name='itemlist-author']/text()").extract()
    item['publish'] = response.xpath("//a[@name='P_cbs']/@title").extract()
    item['time'] = response.xpath("//p[@class='search_book_author']/span[2]/text()").extract()
    yield item
    for i in range(1, 6):
        url = "http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index=" + str(i)
        yield Request(url, callback=self.parse)
def parse_content(self, response):
    base = 'https://www.99csw.com'
    item = BookItem()
    soup = BeautifulSoup(response.text, "lxml")
    same_book_name = response.meta.get('book_name')
    item['book_name'] = same_book_name
    item['chapter_title'] = soup.find('h2').text
    item['chapter_content'] = soup.find('div', attrs={"id": "content"}).text
    yield item
    next_page = soup.find('a', attrs={'id': 'next'})
    if next_page:
        next_url = base + next_page.get('href')
        yield scrapy.Request(next_url, callback=self.parse_content,
                             meta={'book_name': same_book_name})
def parse(self, response):
    item = BookItem()
    # Extract the product name, price, link and review count with XPath
    item["name"] = response.xpath("//a[@class='pic']/@title").extract()
    item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
    item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    item["comnum"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
    # Return the item once extraction is done
    yield item
    # The key part: loop to crawl 20 pages of results automatically
    for i in range(1, 21):
        # Build each page URL from the pattern worked out above
        url = "http://search.dangdang.com/?key=python&act=input&show=big&page_index=" + str(i) + "#J_tab"
        # Yield a Request for that URL with this method as the callback,
        # so the crawl continues automatically
        yield Request(url, callback=self.parse)
def parse_book_info(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    item = BookItem()
    item['book_name'] = self.get_text(soup.select_one('h2.book-name'))
    item['author'] = soup.select_one('p.book-author').text.strip().split(
        '(')[0].strip().replace('\n', '').replace(' ', '').strip()
    item['translator'] = self.get_list_one_text(
        re.findall(re.compile('<a.*>(.*?)\\s+</a>\\s+\(译者\)'), response.text))
    item['editor'] = self.get_list_one_text(
        re.findall(re.compile('维护人:\\S+<a.*>(.*?)</a>'), response.text))
    price_text = re.findall(re.compile('price\\S>(.*?)</span>'), response.text)
    item['price'] = None if (
        len(price_text) < 2 or price_text[1].find('¥') < 0) else price_text[1]
    item['isbn'] = self.get_list_one_text(
        re.findall(re.compile('书\\s+号\\S+>(.*?)\\s+</li>'), response.text))
    item['publish_status'] = self.get_list_one_text(
        re.findall(re.compile('出版状态\\S+>(.*?)\\s+</li>'), response.text))
    item['publish_date'] = self.get_list_one_text(
        re.findall(re.compile('出版日期\\S+>(.*?)\\s+</li>'), response.text))
    item['origin_book_name'] = self.get_list_one_text(
        re.findall(re.compile('原书名\\S+n>\\s+(.*?)\\s+</li>'), response.text))
    item['origin_book_price'] = None
    item['pages'] = self.get_list_one_text(
        re.findall(re.compile('页\\s+数\\S+>(.*?)\\s+</li>'), response.text))
    item['format'] = self.get_list_one_text(
        re.findall(re.compile('开\\s+本\\S+>(.*?)\\s+</li>'), response.text))
    item['introduction'] = self.get_text(soup.select_one('div#abstract'))
    item['origin_book_isbn'] = self.get_list_one_text(
        re.findall(re.compile('原书号\\S+>(.*?)\\s+</li>'), response.text))
    item['avatar'] = soup.select_one('div.book-detail-img > a > img').get('src')
    item['tags'] = self.get_tags_text(
        soup.select('div.block-tag > div.block-body > ul > li > a'))
    item['book_url'] = response.request.url
    item['website'] = '博文视点'
    # Special case: treat an empty ISBN as missing
    if str(item['isbn']).strip() == '':
        item['isbn'] = None
    yield item
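# The helpers used above (get_text, get_list_one_text, get_tags_text) are not shown.
# Minimal sketches, assuming they only normalise BeautifulSoup / re.findall results
# and return None when nothing matched:
def get_text(self, node):
    # node is a soup element from select_one(), or None when the selector missed
    return node.get_text().strip() if node is not None else None


def get_list_one_text(self, matches):
    # matches is the list returned by re.findall(); keep only the first hit
    return matches[0].strip() if matches else None


def get_tags_text(self, nodes):
    # nodes is the list returned by soup.select(); collect the text of each tag
    return [node.get_text().strip() for node in nodes]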
def parse_detail(self, response):
    ipQuery = response.meta["ipQuery"]
    html = response.text
    item = BookItem()
    item["name"] = response.xpath('//h1[@id="itemDisplayName"]/text()').extract_first().strip()
    # The price is rendered by JavaScript; two options:
    # 1. render the page with Splash
    # 2. reverse-engineer the JS and call the price API directly (done here)
    luaUrl = "https:" + re.findall(r'"luaUrl":"(.*?)"', html)[0]
    passPartNumber = re.findall(r'"passPartNumber":"(.*?)"', html)[0]
    partNumber = re.findall(r'"partNumber":"(.*?)"', html)[0]
    vendorCode = re.findall(r'"vendorCode":"(.*?)"', html)[0]
    provinceCode = ipQuery["provinceCommerceId"]
    lesCityId = ipQuery["cityLESId"]
    lesDistrictId = ipQuery["districtLESId"]
    a = lesCityId + lesDistrictId + "01"
    category1 = re.findall(r'"category1":"(.*?)"', html)[0]
    mdmCityId = ipQuery["cityMDMId"]
    cityId = ipQuery["cityCommerceId"]
    districtId = ipQuery["districtCommerceId"]
    cmmdtyType = re.findall(r'"cmmdtyType":"(.*?)"', html)[0]
    custLevel = ""
    mountType = re.findall(r'"mountType":"(.*?)"', html)[0]
    if mountType != "03":
        b = ""
    else:
        b = mountType
    catenIds = re.findall(r'"catenIds":"(.*?)"', html)[0]
    weight = re.findall(r'"weight":"(.*?)"', html)[0]
    e = ""
    # Assemble the price-API URL from the pieces extracted above
    price_url = (luaUrl + "/nspcsale_0_" + passPartNumber + "_" + partNumber + "_" +
                 vendorCode + "_" + provinceCode + "_" + lesCityId + "_" + a + "_" +
                 category1 + "_" + mdmCityId + "_" + cityId + "_" + districtId + "_" +
                 cmmdtyType + "_" + custLevel + "_" + b + "_" + catenIds + "_" +
                 weight + "___" + e + ".html")
    yield scrapy.Request(url=price_url, callback=self.parse_price, meta={"item": item})
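# The price URL built above is fetched with callback=self.parse_price, which is not
# shown here. A minimal sketch, assuming the endpoint returns JSONP-like text from
# which the selling price can be pulled with a regex; the "netPrice" key and the
# 'price' field on BookItem are assumptions, not confirmed from the original spider:
import re


def parse_price(self, response):
    item = response.meta["item"]
    match = re.search(r'"netPrice":"(.*?)"', response.text)  # assumed key name
    item["price"] = match.group(1) if match else None
    yield item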
def parse(self, response):
    conent = response.css('div.sons:nth-child(2) > div:nth-child(1)')
    for con in conent:
        item = BookItem()
        cen = con.css('head > meta:nth-child(5)::attr(content)').extract()
        name = con.css('div.sons:nth-child(2) > div:nth-child(1) > h1:nth-child(2)::text').extract()
        time = con.css('div.sons:nth-child(2) > div:nth-child(1) > p:nth-child(3) > a:nth-child(1)::text').extract()
        zuozhe = con.css('div.sons:nth-child(2) > div:nth-child(1) > p:nth-child(3) > a:nth-child(3)::text').extract()
        item['cen'] = cen
        item['name'] = name
        item['time'] = time
        item['zuozhe'] = zuozhe
        yield item
def parse_price(self, response):
    item = BookItem()
    item['title'] = response.xpath('//div[@class="name_info"]/h1/@title').extract()
    item['comment_num'] = response.xpath('//a[@id="comm_num_down"]/text()').extract()
    item['link'] = response.url
    item['price'] = response.xpath('//p[@id="dd-price"]/text()').extract()[1]
    item['img_url'] = response.xpath('//img[@id="modalBigImg"]/@src').extract()
    item['cate_1'] = response.xpath('//div[@class="breadcrumb"]/a[@class="domain"]/b/text()').extract()
    item['cate_2'] = response.xpath('//div[@class="breadcrumb"]/a/text()').extract()[0]
    item['cate_3'] = response.xpath('//div[@class="breadcrumb"]/a/text()').extract()[1]
    yield item
def parse(self, response):
    menu_pattern = re.compile(r"menu:(\[{.*}\]).*?submenu:", re.S)
    result = menu_pattern.findall(response.text)
    pattern = re.compile(r"NAME: '(.*?)',URL: '(.*?)',id: '\d+',children:")
    cata_list = pattern.findall(result[0])
    for cata in cata_list[1:]:
        item = BookItem()
        item['cata_title'] = cata[0]
        item['cata_url'] = cata[1]
        if not item['cata_url'].startswith('//list'):
            num_pattern = re.compile(r".*/(\d+)-(\d+)-(\d+).html")
            result = num_pattern.findall(item['cata_url'])[0]
            item['cata_url'] = '//list.jd.com/list.html?cat={},{}&tid={}'.format(
                result[0], result[1], result[2])
        item['cata_url'] = 'https:' + item['cata_url']
        # print(item)
        yield scrapy.Request(item['cata_url'], callback=self.parse_list,
                             meta={'item': deepcopy(item)})
def parse_content(self, response):
    base = 'https://www.99csw.com'
    item = BookItem()
    soup = BeautifulSoup(response.text, "lxml")
    same_book_name = response.meta.get('book_name')
    item['book_name'] = same_book_name
    item['chapter_title'] = soup.find('h2').text
    item['chapter_content'] = soup.find('div', attrs={"id": "content"}).text
    yield item
    next_page = soup.find('a', attrs={'id': 'next'})
    if next_page:
        next_url = base + next_page.get('href')
        js = "window.scrollTo(0,document.body.scrollHeight)"
        yield SeleniumRequest(url=next_url,  # pass url by keyword: SeleniumRequest's
                              script=js,     # first positional parameter is wait_time
                              # handler=self._handle_js,
                              callback=self.parse_content,
                              meta={'book_name': same_book_name})
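# SeleniumRequest comes from the scrapy-selenium extension and only takes effect once
# its downloader middleware is enabled. A minimal settings sketch, assuming a local
# headless Chrome; the driver path and arguments are illustrative:
from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
SELENIUM_DRIVER_ARGUMENTS = ['--headless']
DOWNLOADER_MIDDLEWARES = {'scrapy_selenium.SeleniumMiddleware': 800}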