Example #1
    def parse_url(self, response):
        html = Selector(response)
        id = html.xpath(
            "//ul[@class= 'bigimg cloth_shoplist']/li/@id").extract()
        next_link = html.xpath(
            "//*[@id='12810']/div[3]/div[2]/div/ul/li/a[@title='下一页']/@href"
        ).extract_first()
        start_url = 'http://product.dangdang.com/'
        next_start_url = 'http://category.dangdang.com'
        item = response.meta['item']
        atype = item['atype']

        for aid in id:
            # one fresh item per product so queued requests do not share state
            item = DangdangItem(aid=aid, atype=atype)
            item_url = start_url + aid + '.html'
            yield scrapy.Request(url=item_url,
                                 callback=self.parse_item,
                                 headers=self.header,
                                 meta={'item': item})
        # extract_first() returns None when there is no next page
        if next_link:
            next_url = next_start_url + next_link
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_url,
                                 headers=self.header,
                                 meta={'item': item})
Example #2
 def parse(self, response):
     for atype in range(0, 3):
         item = DangdangItem(atype=atype)
         cid = "-cid400252" + str(atype) + ".html"
         base_url = "http://category.dangdang.com/pg1" + cid
         yield scrapy.Request(url=base_url,
                              callback=self.parse_url,
                              headers=self.header,
                              meta={'item': item})
Example #3
 def parse(self, response):
     lilist = response.css('ul.bigimg li')
     #print(lilist)
     for li in lilist:
         item = DangdangItem()
         title = li.css('a::attr(title)').extract_first() or ''
         # cap overly long titles at 32 characters
         item['title'] = title[:32]
         item['author'] = li.css('p.search_book_author span a::text').extract_first()
         item['price'] = li.css('p.price span::text').extract_first()
         # review counts look like "100条评论" ("100 reviews"); drop the 3-character suffix
         comment_num = li.css('p.search_star_line a::text').extract_first() or ''
         item['comment_num'] = comment_num[:-3]
         item['detail'] = li.css('p.detail::text').extract_first()
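         # lazy-loaded cover images keep the real URL in data-original; fall back to src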
         if li.css('a img::attr(data-original)').extract_first():
             item['picurl'] = li.css('a img::attr(data-original)').extract_first()
         else:
             item['picurl'] = li.css('a img::attr(src)').extract_first()
         item['picurl'] = item['picurl'].replace('_b_','_w_')
         yield item
         #print(item)
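     # time.sleep blocks Scrapy's reactor; the DOWNLOAD_DELAY setting is the usual way to throttle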
     time.sleep(3)
     self.p += 1
     if self.p < 6:
         next_url = 'http://search.dangdang.com/?key=python&page_index='+str(self.p)
         url = response.urljoin(next_url)
         yield scrapy.Request(url=url,callback=self.parse)
Example #4
 def parse(self, response):
     all_data_x = response.xpath('//div[@class="con flq_body"]')[0]
     all_first_title = all_data_x.xpath('./div[contains(@class,"level_one")]')
     for first_title_e in all_first_title:
         f = first_title_e.xpath('./dl/dt/text()').extract_first()
         if f is not None:
             item = DangdangItem()
             f = re.sub(r'\r|\n|\t| ','',f)
             if not f:
                 f = first_title_e.xpath('./dl/dt/a/text()').extract()
                 f = ''.join(f)
                 f = re.sub(r'\r|\n|\t| ','',f)
             item['first_title'] = f
             all_s_t_title = first_title_e.xpath('.//div[@class="col eject_left"]/dl')
             if all_s_t_title.extract_first() is not None:
                 for s_t in all_s_t_title:
                     second_title = s_t.xpath('./dt/text()').extract_first()
                     second_title = re.sub(r'\r|\n|\t| ','',second_title)
                     if second_title:
                         item['second_title'] = second_title
                     else:
                         item['second_title'] = s_t.xpath('./dt/a/text()').extract_first()
                     item['second_title'] = item['second_title'].replace(' ','')
                     all_third_title = s_t.xpath('./dd/a')
                     for third_e in all_third_title:
                         item['third_title'] = third_e.xpath('./@title').extract_first()
                         item['label_url'] = third_e.xpath('./@href').extract_first()
                         if ('search.dangdang.com' in item['label_url']) or ('category.dangdang.com' in item['label_url']):
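                             # deep-copy so each scheduled request keeps its own snapshot of the shared item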
                             item_copy = copy.deepcopy(item)
                             yield scrapy.Request(item_copy['label_url'],callback=self.parse_book_detail,meta={'item':item_copy})
Example #5
    def parse_item(self, response):
        item_html = Selector(response)
        item = response.meta['item']

        shop_name = item_html.xpath(
            "//*[@id='service-more']/div[2]/p[1]/span/span[2]/a/text()"
        ).extract_first()
        item['shop_name'] = shop_name

        item_name = ','.join(
            item_html.xpath(
                "//*[@id='product_info']/div[1]/h1/text()").extract()).replace(
                    ' ', '').replace('\r\n', '')
        item['item_name'] = item_name

        item_price = ','.join(
            item_html.xpath("//*[@id='dd-price']/text()").extract()).replace(
                ' ', '').replace(',', '')
        item['item_price'] = item_price

        item_from = ','.join(
            item_html.xpath(
                "//*[@id='shop-geo-name']/text()").extract()).replace(
                    ' 至', '')
        item['item_from'] = item_from

        image_url = item_html.xpath(
            "//*[@id='main-img-slider']/li/a/@data-imghref").extract()
        item['image_url'] = image_url

        image_list = ','.join(image_url)
        item['image_list'] = image_list

        yield item
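
Examples #1, #2, and #5 read like pieces of one spider that hands a single DangdangItem along through request meta. A minimal sketch of the items.py declaration they appear to assume (field names inferred from the snippets; the real project may define more) could look like this:

    import scrapy

    class DangdangItem(scrapy.Item):
        # category index set in parse() and carried through the request chain
        atype = scrapy.Field()
        # product id scraped from the listing page
        aid = scrapy.Field()
        # detail-page fields filled in parse_item()
        shop_name = scrapy.Field()
        item_name = scrapy.Field()
        item_price = scrapy.Field()
        item_from = scrapy.Field()
        image_url = scrapy.Field()
        image_list = scrapy.Field()
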
Example #6
    def parse(self, response):
        '''
        Recursively parse the response data.
        '''

        print('*'*64)

        dlist = response.selector.xpath(".//ul[@class='bigimg']/li")

        for dd in dlist:
            item = DangdangItem()
            item['name'] = dd.xpath("./a/@title").extract_first() #good
            price = dd.xpath(".//span[@class='search_now_price']").extract_first() or ''
            # pull the first decimal number out of the price span (e.g. "¥59.00")
            price = re.findall(r"[0-9]+\.[0-9]+", price)
            item['price'] = price[0] if price else None
            item['pic'] = dd.xpath(".//img/@data-original|.//img/@src").extract_first()
            item['author'] = dd.xpath(".//a[@name='itemlist-author']/@title").extract_first()
            item['publisher'] = dd.xpath(".//a[@name='P_cbs']/text()").extract_first() #good
            item['comments'] = dd.xpath(".//a[@class='search_comment_num']/text()").extract_first() #wrong
            item['pubdate'] = dd.re_first("(([0-9]{4})-([0-9]{2})-([0-9]{2}))") #good
            item['description'] = dd.xpath(".//p[@class='detail']/text()").extract_first() # good

            yield item

        self.p += 1

        # how many pages do you want to crawl?
        if self.p <= 10:
            next_url = "http://search.dangdang.com/?key=python&act=input&page_index=" + str(self.p)
            url = response.urljoin(next_url)
            yield scrapy.Request(url=url, callback=self.parse)
Example #7
File: dd.py Project: chengyuer/DangDang
 def parse(self, response):
     item = DangdangItem()
     item['title'] = response.xpath("//a[@class ='pic']/@title").extract()
     item['link'] = response.xpath("//a[@class ='pic']/@href").extract()
     item['comment'] = response.xpath("//a[@class ='search_comment_num']/text()").extract()
     yield item
     for i in range(2, 101):
         url = "http://category.dangdang.com/pg" + str(i) + "-cp01.54.06.00.00.00.html"
         yield Request(url, callback=self.parse)
Example #8
 def parse(self, response):
     item = DangdangItem()
     item["title"] = response.xpath("//a[@name='itemlist-picture']/@title").extract()
     item["link"] = response.xpath("//a[@name='itemlist-picture']/@href").extract()
     item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
     yield item
     for i in range(2,81):
         url = "http://category.dangdang.com/pg"+str(i)+"-cid4008154.html"
         yield Request(url,callback=self.parse)
Example #9
File: dd.py Project: HaJiang/scrapy..
 def parse(self, response):
     item = DangdangItem()          # the item class defined in items.py
     item["title"] = response.xpath("//a[@class='pic']/@title").extract()
     item["link"] = response.xpath("//a[@class='pic']/@href").extract()
     item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
     yield item
     for i in range(2, 10):
         url = "http://category.dangdang.com/pg"+str(i)+"-cp01.54.06.00.00.00.html"
         yield Request(url, callback = self.parse)
Example #10
 def handle_items(self, response):
     item = DangdangItem()
     item["title"] = response.xpath(
         "//a[@name='sort-big-pic']/@title").extract()
     item["link"] = response.xpath(
         "//a[@name='sort-big-pic']/@href").extract()
     item["comment"] = response.xpath(
         "//a[@name='sort-evaluate']/text()").extract()
     yield item
Example #11
 def parse(self, response):
     item = DangdangItem()
     item["title"] = response.xpath("//a[@name='sort-big-pic']/@title").extract()
     item["link"] = response.xpath("//a[@name='sort-big-pic']/@href").extract()
     item["comment"] = response.xpath("//a[@name='sort-evaluate']/text()").extract()
     yield item
     for i in range(2, 81):
         url = 'http://category.dangdang.com/pg' + str(i) + '-cid4008154.html'
         yield Request(url, callback=self.parse)
Example #12
 def parse_name(self, response):
     items = DangdangItem()
     items['title'] = response.xpath(
         '//div[@class="name_info"]/h1/@title').extract()
     items['num'] = response.xpath(
         '//a[@id="comm_num_down"]/text()').extract()
     items['link'] = response.url
     items['price'] = response.xpath('//p[@id="dd-price"]/text()').extract()
     yield items
Example #13
 def parse(self, response):
     item = DangdangItem()
     item['title'] = response.xpath("//a[@class='pic']/@title").extract()
     item['link'] = response.xpath("//a[@class='pic']/@href").extract()
     item['comment'] = response.xpath(
         "//a[@class='search_comment_num']/text()").extract()
     print(item['title'])
     print(item['link'])
     print(item['comment'])
     yield item
Example #14
 def get_info(self, response):
     item = DangdangItem()
     item['name'] = response.xpath('//a[@class="pic"]/@title').extract()
     item['price'] = response.xpath(
         '//p[@class="price"]/span/text()').extract()
     item["link"] = response.xpath("//a[@class='pic']/@href").extract()
     item["comnum"] = response.xpath(
         "//a[@name='itemlist-review']/text()").extract()
     #print('11111111111111111111')
     yield item
Example #15
 def parse_item(self, response):
     lst = response.xpath('//ul[@class="bigimg cloth_shoplist"]/li')
     for i in lst:
         item = DangdangItem()
         item['name'] = i.xpath('./p[@class="name"]/a/@title').extract()[0]
         item['price'] = i.xpath(
             './p[@class="price"]/span/text()').extract()[0][1:]
         item['link'] = i.xpath('./p[@class="link"]/a/@href').extract()[0]
         item['comment'] = i.xpath(
             './p[@class="star"]/a/text()').extract()[0].replace('条评论', '')
         yield item
Example #16
 def parse(self, response):
     item = DangdangItem()
     item["title"] = response.xpath("//a[@name='itemlist-picture']/@title").extract()
     item["link"] = response.xpath("//a[@name='itemlist-picture']/@href").extract()
     item["price"] = response.xpath("//span[@class='price_n']/text()").extract()
     item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
     # print(item["title"])
     yield item
     for i in range(2, 20):
         url = 'http://search.dangdang.com/?key=%CC%A4%B2%BD%BB%FA&page_index=' + str(i)
         yield Request(url, callback=self.parse)
Example #17
 def parse(self, response):
     html = response.text
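     # etree here is lxml's (from lxml import etree); the HTML is parsed outside Scrapy's selectors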
     tree = etree.HTML(html)
     li_list = tree.xpath('//*[@id="component_59"]/li')
     for li in li_list:
         item = DangdangItem()
         item['title'] = li.xpath('./a/@title')
         # an earlier version of this spider used the wrong selector, so title and miaoshu ended up in one field
         item['miaoshu'] = li.xpath('./p[@class="detail"]/text()')
         # print(li)
         print(item['title'])
         yield item
Example #18
 def parse_name(self, response):
     items = DangdangItem()
     items['title'] = response.xpath(
         '//*[@id="product_info"]/div[1]/h1/@title').extract()
     items['num'] = response.xpath(
         '//*[@id="comm_num_down"]/text()').extract()
     items['link'] = response.url
     items['price'] = response.xpath('//*[@id="dd-price"]/text()').extract()
     items['cbs'] = response.xpath(
         '//*[@id="product_info"]/div[2]/span[2]/a/text()').extract()
     items['pic'] = response.xpath('//*[@id="largePic"]/@src').extract()
     yield items
Example #19
    def parse_subpage(self,response,category):

        # number of books on this page
        length = len(response.xpath('//*[@id="component_0__0__8395"]/li/a/img').extract())
        # XPath li[] indices start at 1, so iterate from 1 to length
        for i in range(1, length + 1):
            item = DangdangItem()

            item['name'] = response.xpath('//*[@id="component_0__0__8395"]/li[{}]/p[2]/a/text()'.format(i)).extract()
            item['author'] = response.xpath('//*[@id="component_0__0__8395"]/li[{}]/p[5]/text()'.format(i)).extract()
            item['price'] = response.xpath('//*[@id="component_0__0__8395"]/li[{}]/p[1]/span[1]/text()'.format(i)).extract()
            item['comments'] = response.xpath('//*[@id="component_0__0__8395"]/li[{}]/p[4]/a/text()'.format(i)).extract()
            item['category']=category

            yield item
Example #20
File: dd.py Project: chenyudan003/spider
 def parse(self, response):
     item = DangdangItem()
     item['title'] = response.xpath(
         '//a[@name="itemlist-picture"]/@title').extract()
     item['link'] = response.xpath(
         '//a[@name="itemlist-picture"]/@href').extract()
     item['comment'] = response.xpath(
         '//a[@class="search_comment_num"]/text()').extract()
     yield item
     for i in range(1, 100):
         url = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index=' + str(i)
         yield Request(url, callback=self.parse)
Example #21
 def parse(self, response):
     item = DangdangItem()
     item["name"] = response.xpath("//h1/@title").extract_first()
     item["author"] = response.xpath(
         "//span[@id='author']/a[1]/text()").extract_first()
     item["price"] = response.xpath(
         "//p[@id='dd-price']/text()[2]").extract_first()
     item["ISBN"] = response.xpath(
         "//div[@id='detail_describe']/ul/li[5]/text()").extract_first()
     item["crawl_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     self.number = self.number + 1
     item["id"] = self.number
     yield item
Example #22
    def parse(self, response):
        item = DangdangItem()
        item['name'] = response.xpath('//*[@class="pic"]/@title').extract()
        item['price'] = response.xpath(
            '//*[@class="price_n"]/text()').extract()
        item['link'] = response.xpath('//*[@class="pic"]/@href').extract()
        item['comment'] = response.xpath(
            '//*[@dd_name="单品评论"]/text()').extract()
        yield item

        for i in range(2, 5):
            url = 'http://category.dangdang.com/pg{}-cid4005627.html'.format(i)
            yield Request(url, callback=self.parse)
Example #23
File: dd.py Project: wsgan001/python-4
 def parse(self, response):
     item = DangdangItem()
     item['title'] = response.xpath(
         '//a[@name="itemlist-title"]/@title').extract()
     item['link'] = response.xpath(
         '//a[@name="itemlist-title"]/@href').extract()
     item['comment'] = response.xpath(
         '//a[@class="search_comment_num"]/text()').extract()
     yield item
     for i in range(2, 101):
         url = 'http://category.dangdang.com/pg' + str(i) + '-cp01.54.06.00.00.00.html'
         yield Request(url, callback=self.parse)
Example #24
    def parse(self, response):

        #print(response.text)
        tmp = response.xpath("//div[@id='bd']//li[@id]")  # select the product <li> nodes
        # print(type(tmp))
        print(len(tmp))
        for li in tmp:
            item = DangdangItem()
            item['book_name'] = li.xpath('p[@name]/a/@title').extract_first()
            item['price'] = li.xpath(
                "p[@class='price']/span[@class='search_now_price']/text()"
            ).extract_first()
            #print(item['price'])
            item['author'] = li.xpath(
                "p[@class='search_book_author']/span[1]/a[1]/@title"
            ).extract_first()
            item['pubdate'] = li.xpath(
                "p[@class='search_book_author']/span[2]/text()").extract_first()
            item['press'] = li.xpath(
                "p[@class='search_book_author']/span[3]/a[1]/@title"
            ).extract_first()
            item['star'] = li.xpath(
                "p[@class='search_star_line']/span/span/@style").extract_first()
            item['_id'] = li.xpath("a/@href").extract_first()
            # a/@href on the same element gives the link to the reviews page
            item['comment'] = li.xpath(
                "p[@class='search_star_line']/a/text()").extract_first()
            item['comment_url'] = li.xpath(
                "p[@class='search_star_line']/a/@href").extract_first()
            #print(item['comment_url'])
            #print(item['star'],item['book_url'])
            #print(item['author'],item['press'],item['pubdate'])
            yield item
        # next = tmp.xpath("//li[@class='next']/a/@href")
        #print(next)
        # if next:
        #     print("http://category.dangdang.com"+next.extract_first())
        #     yield scrapy.Request("http://category.dangdang.com"+next.extract_first(),callback=self.parse)

    # Book title: //li[@sku]/p[@name]

    # Price: //li[@sku]/p[@class='price']/span[@class='search_now_price']  (when discounted; the no-discount case is another matter)


# Author, publisher: //li[@sku]/p[@class='search_book_author']/span/a[1]/@title
# Star rating: //li[@sku]/p[@class='search_star_line']/span/span/@style   e.g. width: 90%;
# Review count: //li[@sku]/p[@class='search_star_line']/a   e.g. "1232312条评论"
# Review URL: //li[@sku]/p[@class='search_star_line']/a/@href
# Publication date: //li[@sku]/p[@class='search_book_author']/span[2]   e.g. /2013-01-01
Example #25
    def parse_page(self, response):
        for item in response.xpath(
                '//*[@id="search_nature_rg"]/ul[@class="bigimg"]/li'):
            # 所有图书
            book = DangdangItem()
            # try:
            book['price'] = float(
                item.xpath('./p[@class="price"]/span[1]/text()').pop().extract(
                ).lstrip('¥'))
            book['type_tag'] = response.meta['type']
            book['name'] = item.xpath(
                './p[@class="name"]/a/text()').pop().extract().strip()
            book['book_tag'] = str(time.time()) + book.get('name', None)
            # book['image_url'] = item.xpath('./a/img/@src').pop().extract()
            book['link'] = item.xpath('./p[1]/a/@href').pop().extract()

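            # @style looks like "width: 90%;": keep the trailing percentage as the star level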
            book['star_level'] = \
                int(item.xpath('./p[@class="search_star_line"]/span/span/@style').pop().extract().split(' ')[-1].rstrip(
                    '%;'))
            try:
                book['time'] = item.xpath(
                    './p[@class="search_book_author"]/span[2]/text()').pop(
                    ).extract().split('/')[-1]
                book['author_tag'] = ','.join(
                    item.xpath(
                        './p[@class="search_book_author"]/span[1]/a/text()').
                    extract()).strip()
                book['publish_company'] = item.xpath(
                    './p[@class="search_book_author"]/span[3]/a/text()').pop(
                    ).extract().strip()
                book['brief'] = item.xpath(
                    './p[2]/text()').pop().extract().strip()
            except Exception:
                self.logger.error("Error: {}, url: {}".format(
                    book['name'], response.url))
            finally:
                yield book
Example #26
File: dd.py Project: ggqshr/PythonProject
 def parse(self, response):
     item = DangdangItem()
     item["title"] = response.xpath("//a[@class='pic']/@title").extract()
     item["link"] = response.xpath("//a[@class='pic']/@href").extract()
     item["comment"] = response.xpath(
         "//a[@class='search_comment_num']/text()").extract()
     item["price"] = response.xpath(
         "//p[@class='price']/span[@class='search_now_price']/text()"
     ).extract()
     yield item
     for i in range(1, 101):
         url = "http://category.dangdang.com/pg" + str(i) + "-cp01.54.06.00.00.00.html"
         yield Request(url, callback=self.parse)
Example #27
 def parse(self, response):
     item = DangdangItem()
     item["title"] = response.xpath(
         "//a[@name='itemlist-title']/@title").extract()
     item["link"] = response.xpath(
         "//a[@name='itemlist-title']/@href").extract()
     item["comment"] = response.xpath(
         "//a[@name='itemlist-review']/text()").extract()
     # print(item["title"])
     yield item
     for i in range(2, 11):  # crawl pages 2 through 10
         url = 'http://category.dangdang.com/pg' + str(i) + '-cid4008154.html'
         yield Request(url, callback=self.parse)
Example #28
    def parse(self, response):
        lst = response.xpath('//ul[@class="bigimg cloth_shoplist"]/li')
        for i in lst:
            item = DangdangItem()
            item['name'] = i.xpath('./p[@class="name"]/a/@title').extract()[0]
            item['price'] = i.xpath(
                './p[@class="price"]/span/text()').extract()[0][1:]
            item['link'] = i.xpath('./p[@class="link"]/a/@href').extract()[0]
            item['comment'] = i.xpath(
                './p[@class="star"]/a/text()').extract()[0].replace('条评论', '')
            yield item

        for i in range(2, 5):
            url = 'http://category.dangdang.com/pg{}-cid4005627.html'.format(i)
            yield Request(url, callback=self.parse)
Example #29
 def parse(self, response):
     # hax = Selector(response)
     items = []
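     # collect every link href inside the search-results container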
     titles = response.xpath(
         '//body//div[@id="search_nature_rg"]//a/@href').extract()
     for link in titles:
         item = DangdangItem()
         item['links'] = link
         items.append(item)
     for item in items:
         yield FormRequest(dont_filter=True,
                           url=item['links'],
                           meta={'item': item},
                           callback=self.parse2)
Example #30
 def parse_item(self, response):
     kind = response.xpath(
         "//div[@class='layout_location']/span[last()]/text()"
     ).extract_first()
     allitem = response.css('.bang_list.clearfix.bang_list_mode>li')
     for i in allitem:
         # one item per list entry, instead of mutating and re-yielding a single shared item
         item = DangdangItem()
         item['kind'] = kind
         item['name'] = i.css('.name>a').xpath('./@title').extract_first()
         item['link'] = i.css('.name>a').xpath('./@href').extract_first()
         item['comment'] = i.css('.star>a::text').extract_first()
         item['satisfaction'] = i.css('.tuijian::text').extract_first()
         # relative ".//" so the price comes from this list entry, not the first one on the page
         item['price'] = i.xpath(
             ".//div[@class='price']/p[1]/span[@class='price_n']/text()"
         ).extract_first()
         yield item