def getrank(self, response):
    # Receive the partially filled item passed along through meta
    # (the original also created a throwaway DangdangItem here, which
    # was immediately overwritten and has been removed).
    item = response.meta['item']
    # The ranking API returns a near-empty body when a product has no rank.
    if len(response.text) > 100:
        item['ranking'] = re.findall('"rank":"(.*?)"', response.text)[0]
    else:
        item['ranking'] = '无排名'  # "no ranking"
    return item
def single_goods(self, response):
    item = DangdangItem()
    item['title'] = response.meta['title']  # product name
    item['url'] = response.url              # purchase link
    item['comment'] = response.xpath(
        '//a[@dd_name="评论数"]/text()').extract()  # number of reviews
    item['price'] = response.meta['price']  # price
    # Extract the product id from the URL so its sales rank can be fetched.
    goods_id = str(item['url']).split('/')
    g_id = goods_id[-1].split('.')[0]
    ranking_url = ('http://product.dangdang.com/index.php'
                   '?r=callback%2Fget-bang-rank&productId=' + g_id)
    yield Request(ranking_url, self.getrank, meta={'item': item})
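# ----------------------------------------------------------------------
# A minimal sketch of the items.py these two callbacks assume. Only the
# field names come from the assignments above; the class layout and the
# per-field comments are assumptions.
# ----------------------------------------------------------------------
import scrapy

class DangdangItem(scrapy.Item):
    title = scrapy.Field()    # product name
    url = scrapy.Field()      # purchase link
    comment = scrapy.Field()  # number of reviews
    price = scrapy.Field()    # price
    ranking = scrapy.Field()  # sales rank filled in by getrank()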
def parse(self, response):
    item = DangdangItem()
    item['name'] = response.xpath("//a[@dd_name='单品标题']/text()").extract()
    item['price'] = response.xpath("//span[@class='price_n']/text()").extract()
    item['link'] = response.xpath("//a[@class='pic']/@href").extract()
    item['comnum'] = response.xpath("//a[@dd_name='单品评论']/text()").extract()
    yield item
    # Queue every list page; Scrapy's duplicate filter drops the repeats
    # that this re-yields on each page.
    for i in range(1, 100):
        url = "http://category.dangdang.com/pg" + str(i) + "-cid4011029.html"
        yield Request(url, callback=self.parse)
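# ----------------------------------------------------------------------
# A sketch of the spider scaffold this parse() assumes. The class name,
# spider name, and module path are assumptions; the cid4011029 category
# and the page-1 URL come from the loop above.
# ----------------------------------------------------------------------
import scrapy
from scrapy.http import Request
from dangdang.items import DangdangItem  # project module path assumed

class CategorySpider(scrapy.Spider):
    name = "dangdang_category"
    allowed_domains = ["category.dangdang.com"]
    start_urls = ["http://category.dangdang.com/pg1-cid4011029.html"]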
def parse(self, response):
    item = DangdangItem()
    item['title'] = response.xpath('//a[@name="itemlist-picture"]/@title').extract()
    item['link'] = response.xpath('//a[@name="itemlist-picture"]/@href').extract()
    item['comment'] = response.xpath('//a[@dd_name="单品评论"]/text()').extract()
    # Turn review strings such as "1234条评论" into plain integers.
    for i in range(len(item['comment'])):
        item['comment'][i] = int(item['comment'][i].replace('条评论', ''))
    item['price'] = response.xpath('//span[@class="search_now_price"]/text()').extract()
    # Strip the currency sign and keep only the numeric part of each price.
    for i in range(len(item['price'])):
        item['price'][i] = float(re.findall(r'[0-9.]+', item['price'][i].replace('¥', ''))[0])
    yield item
    for i in range(2, 101):
        url = 'http://category.dangdang.com/pg' + str(i) + '-cp01.54.00.00.00.00.html'
        yield Request(url, callback=self.parse)
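# ----------------------------------------------------------------------
# Illustrative check (sample strings assumed, not from the source) of
# what the two cleaning loops above do to raw extracted values:
# ----------------------------------------------------------------------
import re

raw_comment = "1234条评论"   # shape of a 单品评论 anchor text
print(int(raw_comment.replace("条评论", "")))  # -> 1234

raw_price = "¥29.80"        # shape of a search_now_price span
print(float(re.findall(r"[0-9.]+", raw_price.replace("¥", ""))[0]))  # -> 29.8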
def parse(self, response):
    try:
        # Dangdang pages may be served as utf-8 or gbk; let UnicodeDammit
        # detect the encoding before building a Selector.
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        # Note: the original predicate ['@ddt-pit'] was a string literal
        # (always true); [@ddt-pit] tests for the attribute as intended.
        lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
        for li in lis:
            title = li.xpath("./a[position()=1]/@title").extract_first()
            price = li.xpath(
                "./p[@class='price']/span[@class='search_now_price']/text()"
            ).extract_first()
            author = li.xpath(
                "./p[@class='search_book_author']/span[position()=1]/a/@title"
            ).extract_first()
            date = li.xpath(
                "./p[@class='search_book_author']/span[position()=last()-1]/text()"
            ).extract_first()
            publisher = li.xpath(
                "./p[@class='search_book_author']/span[position()=last()]/a/@title"
            ).extract_first()
            # detail is sometimes missing, so extract_first() returns None.
            detail = li.xpath("./p[@class='detail']/text()").extract_first()
            item = DangdangItem()
            item["title"] = title.strip() if title else ""
            item["author"] = author.strip() if author else ""
            item["date"] = date.strip()[1:] if date else ""  # drop leading "/"
            item["publisher"] = publisher.strip() if publisher else ""
            item["price"] = price.strip() if price else ""
            item["detail"] = detail.strip() if detail else ""
            yield item
        # On the last page there is no "next" link, so link is None.
        link = selector.xpath(
            "//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href"
        ).extract_first()
        if link:
            url = response.urljoin(link)
            yield scrapy.Request(url=url, callback=self.parse)
    except Exception as err:
        print(err)
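# ----------------------------------------------------------------------
# This variant decodes the raw body itself before parsing, so it needs
# these imports in addition to the item class (module path assumed):
# ----------------------------------------------------------------------
import scrapy
from bs4 import UnicodeDammit
from dangdang.items import DangdangItem  # project module path assumed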
def parse(self, response):
    products = json.loads(response.text)
    for product in products['products']:
        # Instantiate a fresh item per product; the original reused one
        # item object, letting later products overwrite ones already yielded.
        item = DangdangItem()
        item['book_name'] = product.get('name')                  # title
        item['author_name'] = product.get('authorname')          # author
        item['price'] = product.get('price')                     # current price
        item['original_price'] = product.get('original_price')   # list price
        item['score'] = product.get('score')                     # rating
        item['stock'] = product.get('stock')                     # stock
        item['total_review_count'] = product.get('total_review_count')  # review count
        item['shop_id'] = product.get('shop_id')                 # shop id
        item['shop_info'] = product.get('shop_info')             # shop name
        item['publisher'] = product.get('publisher')             # publisher
        item['publish_date'] = product.get('publish_date')       # publication date
        item['image_url'] = product.get('image_url')             # cover image
        item['product_url'] = product.get('product_url')         # product url
        yield item
    self.offset += 1
    if self.offset <= settings.MAX_PAGE:
        yield scrapy.Request(self.url.format(self.offset))
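# ----------------------------------------------------------------------
# A sketch of the spider state this JSON parse() relies on: self.url is
# a page-number template, self.offset the current page, and MAX_PAGE a
# project setting. The actual API endpoint is not shown in the source,
# so it stays a placeholder here; everything else is assumed.
# ----------------------------------------------------------------------
import scrapy
from dangdang import settings  # assumed location of MAX_PAGE

class JsonListSpider(scrapy.Spider):
    name = "dangdang_json"
    offset = 1
    url = "<json-api-endpoint>&page={}"  # placeholder; endpoint not shown in source

    def start_requests(self):
        yield scrapy.Request(self.url.format(self.offset))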
def parse_item(self, response):
    item = DangdangItem()  # product item
    # Breadcrumb: top category > sub-category > leaf category.
    item["category"] = (
        response.xpath('//*[@id="breadcrumb"]/a[1]/b/text()').extract_first() + '>'
        + response.xpath('//*[@id="breadcrumb"]/a[2]/text()').extract_first() + '>'
        + response.xpath('//*[@id="breadcrumb"]/a[3]/text()').extract_first())
    item["title"] = response.xpath("//*[@id='product_info']/div[1]/h1/@title").extract_first()
    item["detail"] = json.dumps(
        response.xpath("//*[@id='detail_describe']/ul//li/text()").extract(),
        ensure_ascii=False)
    item["link"] = response.url
    item["img_link"] = json.dumps(
        response.xpath("//div[@class='img_list']/ul//li/a/@data-imghref").extract())
    try:
        item["price"] = response.xpath("//*[@id='dd-price']/text()").extract()[1].strip()
    except IndexError:
        item["price"] = response.xpath("//*[@id='dd-price']/text()").extract()[0].strip()
    item["comment_num"] = response.xpath("//*[@id='comm_num_down']/text()").extract()[0]
    try:
        item["source"] = response.xpath("//*[@id='shop-geo-name']/text()").extract()[0].replace('\xa0至', '')
    except IndexError:
        item["source"] = '当当自营'  # sold by Dangdang itself
    # Extract the product id from the URL with a regular expression.
    goodsid = re.compile(r'/(\d+)\.html').findall(response.url)[0]
    item["goods_id"] = goodsid

    # --------------------------------------------------------------
    # Extract the positive-review rate from the comment API, whose
    # endpoint was found by inspecting the page's XHR traffic.
    # --------------------------------------------------------------
    # Pull categoryPath out of the inline script on the detail page.
    script = response.xpath("/html/body/script[1]/text()").extract()[0]
    categoryPath = re.compile(r'.*categoryPath":"(.*?)","describeMap').findall(script)[0]
    # Build the URL of the JSON payload carrying the review summary.
    rate_url = ("http://product.dangdang.com/index.php?r=comment%2Flist&productId="
                + str(goodsid) + "&categoryPath=" + str(categoryPath)
                + "&mainProductId=" + str(goodsid))
    # Note: requests.get() blocks Scrapy's reactor; see the sketch after
    # this function for a non-blocking alternative.
    r = requests.get(rate_url)
    data_dict = json.loads(r.text)
    summary = data_dict['data']['list']['summary']
    item["rate"] = summary['goodRate']
    item["good_comment_num"] = summary['total_crazy_count']
    item["mid_comment_num"] = summary['total_indifferent_count']
    item["bad_comment_num"] = summary['total_detest_count']
    yield item

    # --------------------------------------------------------------
    # Clean and crawl the reviews and ratings, page by page.
    # --------------------------------------------------------------
    html_str = data_dict['data']['list']['html']
    html = etree.HTML(html_str)
    comment_items = html.xpath('//div[@class="comment_items clearfix"]')
    pageIndex = 1
    while comment_items:
        pageIndex += 1
        # The loop variable is renamed from "item" so it no longer shadows
        # the product item, and a fresh CommentItem (originally a single,
        # misspelled "commment_item" reused across yields) is built per review.
        for node in comment_items:
            comment_unit = node.xpath(
                './/div[@class="describe_detail"][1]/span[not(@class="icon")]/text()')
            score = node.xpath('.//div[@class="pinglun"]/em/text()')[0]
            time = node.xpath(
                './/div[@class="items_right"]/div[@class="starline clearfix"][1]/span[1]/text()')[0]
            comment_item = CommentItem()
            comment_item['goods_id'] = goodsid
            comment_item["comment"] = ' '.join(comment_unit)
            comment_item['score'] = score
            comment_item["time"] = time
            yield comment_item
        # Fetch the next review page; the loop ends when a page has no reviews.
        rate_url = ("http://product.dangdang.com/index.php?r=comment%2Flist&productId="
                    + str(goodsid) + "&categoryPath=" + str(categoryPath)
                    + "&mainProductId=" + str(goodsid)
                    + "&pageIndex=" + str(pageIndex))
        r = requests.get(rate_url)
        data_dict = json.loads(r.text)
        html_str = data_dict['data']['list']['html']
        html = etree.HTML(html_str)
        comment_items = html.xpath('//div[@class="comment_items clearfix"]')
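# ----------------------------------------------------------------------
# Design note: the blocking requests.get() calls above stall Scrapy's
# reactor. A minimal non-blocking sketch (parse_rate is a hypothetical
# callback, not part of the original code) hands the same rate_url back
# to the scheduler instead:
# ----------------------------------------------------------------------
def parse_rate(self, response):
    # Hypothetical callback: the review-summary JSON arrives through
    # Scrapy's scheduler rather than a synchronous HTTP call.
    item = response.meta['item']
    data_dict = json.loads(response.text)
    summary = data_dict['data']['list']['summary']
    item["rate"] = summary['goodRate']
    yield item
# In parse_item() one would then yield
#   scrapy.Request(rate_url, callback=self.parse_rate, meta={'item': item})
# in place of requests.get(rate_url).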