def getrank(self, response):
    # Receive the partially filled item passed along through meta
    # (the original also created a throwaway DangdangItem here, which
    # was immediately overwritten and has been removed).
    item = response.meta['item']
    # The ranking API returns a near-empty body when a product has no rank.
    if len(response.text) > 100:
        item['ranking'] = re.findall('"rank":"(.*?)"', response.text)[0]
    else:
        item['ranking'] = '无排名'  # "no ranking"
    return item
def single_goods(self, response):
    item = DangdangItem()
    item['title'] = response.meta['title']  # product name
    item['url'] = response.url              # purchase link
    item['comment'] = response.xpath(
        '//a[@dd_name="评论数"]/text()').extract()  # number of reviews
    item['price'] = response.meta['price']  # price
    # Extract the product id from the URL so its sales rank can be fetched.
    goods_id = str(item['url']).split('/')
    g_id = goods_id[-1].split('.')[0]
    ranking_url = ('http://product.dangdang.com/index.php'
                   '?r=callback%2Fget-bang-rank&productId=' + g_id)
    yield Request(ranking_url, self.getrank, meta={'item': item})
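# ----------------------------------------------------------------------
# A minimal sketch of the items.py these two callbacks assume. Only the
# field names come from the assignments above; the class layout and the
# per-field comments are assumptions.
# ----------------------------------------------------------------------
import scrapy

class DangdangItem(scrapy.Item):
    title = scrapy.Field()    # product name
    url = scrapy.Field()      # purchase link
    comment = scrapy.Field()  # number of reviews
    price = scrapy.Field()    # price
    ranking = scrapy.Field()  # sales rank filled in by getrank()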
def parse(self, response):
    item = DangdangItem()
    item['name'] = response.xpath("//a[@dd_name='单品标题']/text()").extract()
    item['price'] = response.xpath("//span[@class='price_n']/text()").extract()
    item['link'] = response.xpath("//a[@class='pic']/@href").extract()
    item['comnum'] = response.xpath("//a[@dd_name='单品评论']/text()").extract()
    yield item
    # Queue every list page; Scrapy's duplicate filter drops the repeats
    # that this re-yields on each page.
    for i in range(1, 100):
        url = "http://category.dangdang.com/pg" + str(i) + "-cid4011029.html"
        yield Request(url, callback=self.parse)
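# ----------------------------------------------------------------------
# A sketch of the spider scaffold this parse() assumes. The class name,
# spider name, and module path are assumptions; the cid4011029 category
# and the page-1 URL come from the loop above.
# ----------------------------------------------------------------------
import scrapy
from scrapy.http import Request
from dangdang.items import DangdangItem  # project module path assumed

class CategorySpider(scrapy.Spider):
    name = "dangdang_category"
    allowed_domains = ["category.dangdang.com"]
    start_urls = ["http://category.dangdang.com/pg1-cid4011029.html"]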
def parse(self, response):
    item = DangdangItem()
    item['title'] = response.xpath('//a[@name="itemlist-picture"]/@title').extract()
    item['link'] = response.xpath('//a[@name="itemlist-picture"]/@href').extract()
    item['comment'] = response.xpath('//a[@dd_name="单品评论"]/text()').extract()
    # Turn review strings such as "1234条评论" into plain integers.
    for i in range(len(item['comment'])):
        item['comment'][i] = int(item['comment'][i].replace('条评论', ''))
    item['price'] = response.xpath('//span[@class="search_now_price"]/text()').extract()
    # Strip the currency sign and keep only the numeric part of each price.
    for i in range(len(item['price'])):
        item['price'][i] = float(re.findall(r'[0-9.]+', item['price'][i].replace('¥', ''))[0])
    yield item
    for i in range(2, 101):
        url = 'http://category.dangdang.com/pg' + str(i) + '-cp01.54.00.00.00.00.html'
        yield Request(url, callback=self.parse)
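# ----------------------------------------------------------------------
# Illustrative check (sample strings assumed, not from the source) of
# what the two cleaning loops above do to raw extracted values:
# ----------------------------------------------------------------------
import re

raw_comment = "1234条评论"   # shape of a 单品评论 anchor text
print(int(raw_comment.replace("条评论", "")))  # -> 1234

raw_price = "¥29.80"        # shape of a search_now_price span
print(float(re.findall(r"[0-9.]+", raw_price.replace("¥", ""))[0]))  # -> 29.8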
def parse(self, response):
    try:
        # Dangdang pages may be served as utf-8 or gbk; let UnicodeDammit
        # detect the encoding before building a Selector.
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        # Note: the original predicate ['@ddt-pit'] was a string literal
        # (always true); [@ddt-pit] tests for the attribute as intended.
        lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
        for li in lis:
            title = li.xpath("./a[position()=1]/@title").extract_first()
            price = li.xpath(
                "./p[@class='price']/span[@class='search_now_price']/text()"
            ).extract_first()
            author = li.xpath(
                "./p[@class='search_book_author']/span[position()=1]/a/@title"
            ).extract_first()
            date = li.xpath(
                "./p[@class='search_book_author']/span[position()=last()-1]/text()"
            ).extract_first()
            publisher = li.xpath(
                "./p[@class='search_book_author']/span[position()=last()]/a/@title"
            ).extract_first()
            # detail is sometimes missing, so extract_first() returns None.
            detail = li.xpath("./p[@class='detail']/text()").extract_first()
            item = DangdangItem()
            item["title"] = title.strip() if title else ""
            item["author"] = author.strip() if author else ""
            item["date"] = date.strip()[1:] if date else ""  # drop leading "/"
            item["publisher"] = publisher.strip() if publisher else ""
            item["price"] = price.strip() if price else ""
            item["detail"] = detail.strip() if detail else ""
            yield item
        # On the last page there is no "next" link, so link is None.
        link = selector.xpath(
            "//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href"
        ).extract_first()
        if link:
            url = response.urljoin(link)
            yield scrapy.Request(url=url, callback=self.parse)
    except Exception as err:
        print(err)
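# ----------------------------------------------------------------------
# This variant decodes the raw body itself before parsing, so it needs
# these imports in addition to the item class (module path assumed):
# ----------------------------------------------------------------------
import scrapy
from bs4 import UnicodeDammit
from dangdang.items import DangdangItem  # project module path assumed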
def parse(self, response):
    products = json.loads(response.text)
    for product in products['products']:
        # Instantiate a fresh item per product; the original reused one
        # item object, letting later products overwrite ones already yielded.
        item = DangdangItem()
        item['book_name'] = product.get('name')                  # title
        item['author_name'] = product.get('authorname')          # author
        item['price'] = product.get('price')                     # current price
        item['original_price'] = product.get('original_price')   # list price
        item['score'] = product.get('score')                     # rating
        item['stock'] = product.get('stock')                     # stock
        item['total_review_count'] = product.get('total_review_count')  # review count
        item['shop_id'] = product.get('shop_id')                 # shop id
        item['shop_info'] = product.get('shop_info')             # shop name
        item['publisher'] = product.get('publisher')             # publisher
        item['publish_date'] = product.get('publish_date')       # publication date
        item['image_url'] = product.get('image_url')             # cover image
        item['product_url'] = product.get('product_url')         # product url
        yield item
    self.offset += 1
    if self.offset <= settings.MAX_PAGE:
        yield scrapy.Request(self.url.format(self.offset))
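# ----------------------------------------------------------------------
# A sketch of the spider state this JSON parse() relies on: self.url is
# a page-number template, self.offset the current page, and MAX_PAGE a
# project setting. The actual API endpoint is not shown in the source,
# so it stays a placeholder here; everything else is assumed.
# ----------------------------------------------------------------------
import scrapy
from dangdang import settings  # assumed location of MAX_PAGE

class JsonListSpider(scrapy.Spider):
    name = "dangdang_json"
    offset = 1
    url = "<json-api-endpoint>&page={}"  # placeholder; endpoint not shown in source

    def start_requests(self):
        yield scrapy.Request(self.url.format(self.offset))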
def parse_item(self, response):
    item = DangdangItem()  # product item
    # Breadcrumb: top category > sub-category > leaf category.
    item["category"] = (
        response.xpath('//*[@id="breadcrumb"]/a[1]/b/text()').extract_first() + '>'
        + response.xpath('//*[@id="breadcrumb"]/a[2]/text()').extract_first() + '>'
        + response.xpath('//*[@id="breadcrumb"]/a[3]/text()').extract_first())
    item["title"] = response.xpath("//*[@id='product_info']/div[1]/h1/@title").extract_first()
    item["detail"] = json.dumps(
        response.xpath("//*[@id='detail_describe']/ul//li/text()").extract(),
        ensure_ascii=False)
    item["link"] = response.url
    item["img_link"] = json.dumps(
        response.xpath("//div[@class='img_list']/ul//li/a/@data-imghref").extract())
    try:
        item["price"] = response.xpath("//*[@id='dd-price']/text()").extract()[1].strip()
    except IndexError:
        item["price"] = response.xpath("//*[@id='dd-price']/text()").extract()[0].strip()
    item["comment_num"] = response.xpath("//*[@id='comm_num_down']/text()").extract()[0]
    try:
        item["source"] = response.xpath("//*[@id='shop-geo-name']/text()").extract()[0].replace('\xa0至', '')
    except IndexError:
        item["source"] = '当当自营'  # sold by Dangdang itself
    # Extract the product id from the URL with a regular expression.
    goodsid = re.compile(r'/(\d+)\.html').findall(response.url)[0]
    item["goods_id"] = goodsid

    # --------------------------------------------------------------
    # Extract the positive-review rate from the comment API, whose
    # endpoint was found by inspecting the page's XHR traffic.
    # --------------------------------------------------------------
    # Pull categoryPath out of the inline script on the detail page.
    script = response.xpath("/html/body/script[1]/text()").extract()[0]
    categoryPath = re.compile(r'.*categoryPath":"(.*?)","describeMap').findall(script)[0]
    # Build the URL of the JSON payload carrying the review summary.
    rate_url = ("http://product.dangdang.com/index.php?r=comment%2Flist&productId="
                + str(goodsid) + "&categoryPath=" + str(categoryPath)
                + "&mainProductId=" + str(goodsid))
    # Note: requests.get() blocks Scrapy's reactor; see the sketch after
    # this function for a non-blocking alternative.
    r = requests.get(rate_url)
    data_dict = json.loads(r.text)
    summary = data_dict['data']['list']['summary']
    item["rate"] = summary['goodRate']
    item["good_comment_num"] = summary['total_crazy_count']
    item["mid_comment_num"] = summary['total_indifferent_count']
    item["bad_comment_num"] = summary['total_detest_count']
    yield item

    # --------------------------------------------------------------
    # Clean and crawl the reviews and ratings, page by page.
    # --------------------------------------------------------------
    html_str = data_dict['data']['list']['html']
    html = etree.HTML(html_str)
    comment_items = html.xpath('//div[@class="comment_items clearfix"]')
    pageIndex = 1
    while comment_items:
        pageIndex += 1
        # The loop variable is renamed from "item" so it no longer shadows
        # the product item, and a fresh CommentItem (originally a single,
        # misspelled "commment_item" reused across yields) is built per review.
        for node in comment_items:
            comment_unit = node.xpath(
                './/div[@class="describe_detail"][1]/span[not(@class="icon")]/text()')
            score = node.xpath('.//div[@class="pinglun"]/em/text()')[0]
            time = node.xpath(
                './/div[@class="items_right"]/div[@class="starline clearfix"][1]/span[1]/text()')[0]
            comment_item = CommentItem()
            comment_item['goods_id'] = goodsid
            comment_item["comment"] = ' '.join(comment_unit)
            comment_item['score'] = score
            comment_item["time"] = time
            yield comment_item
        # Fetch the next review page; the loop ends when a page has no reviews.
        rate_url = ("http://product.dangdang.com/index.php?r=comment%2Flist&productId="
                    + str(goodsid) + "&categoryPath=" + str(categoryPath)
                    + "&mainProductId=" + str(goodsid)
                    + "&pageIndex=" + str(pageIndex))
        r = requests.get(rate_url)
        data_dict = json.loads(r.text)
        html_str = data_dict['data']['list']['html']
        html = etree.HTML(html_str)
        comment_items = html.xpath('//div[@class="comment_items clearfix"]')
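# ----------------------------------------------------------------------
# Design note: the blocking requests.get() calls above stall Scrapy's
# reactor. A minimal non-blocking sketch (parse_rate is a hypothetical
# callback, not part of the original code) hands the same rate_url back
# to the scheduler instead:
# ----------------------------------------------------------------------
def parse_rate(self, response):
    # Hypothetical callback: the review-summary JSON arrives through
    # Scrapy's scheduler rather than a synchronous HTTP call.
    item = response.meta['item']
    data_dict = json.loads(response.text)
    summary = data_dict['data']['list']['summary']
    item["rate"] = summary['goodRate']
    yield item
# In parse_item() one would then yield
#   scrapy.Request(rate_url, callback=self.parse_rate, meta={'item': item})
# in place of requests.get(rate_url).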