Пример #1
0
 def next_page(self, hxs):
     """Queue the listing's next page, if the pager offers one.

     Walks the anchors found under ``div#bottom_pagenum/span``. For the
     first anchor whose text contains the "next page" label, a request for
     ``BASE_URL`` + the anchor's ``href`` is built, handed to
     ``self.crawl`` and its URL is logged. Nothing happens when no anchor
     carries that label.
     """
     for anchor in hxs.select('//div[@id="bottom_pagenum"]/span/a'):
         label = extract_value(anchor.select('text()'))
         if label.find('下一页') == -1:
             # Not the "next page" link; keep looking.
             continue

         href = extract_value(anchor.select('@href'))
         next_url = "%s%s" % (self.BASE_URL, href)
         request = self.make_request_from_response(url=next_url)
         self.crawl(request)
         self.log('next page:%s' % request.url)
         # Only the first matching anchor is followed.
         return
Пример #2
0
    def process_listpage(self):
        """Queue one price-image request per product on a listing page.

        Every ``<li sku="...">`` element of the response is treated as one
        product. For each of them a request for the image found under
        ``div.p-price`` is queued through ``self.crawl``; the product link
        (``gurl``), its name and its SKU travel with the request as extra
        keyword arguments. Products lacking either the image or the link
        are logged and skipped instead of aborting the whole page. After
        the loop the pager is followed through ``self.next_page``.

        Returns:
            The number of products for which a request was queued.
        """
        item_num = 0
        hxs = HtmlXPathSelector(self.response)
        for sku in hxs.select('//li[@sku]'):
            skuvalue = ''.join(sku.select('@sku').extract())
            imgurls = sku.select('div[@class="p-price"]/img/@src').extract()
            urls = sku.select('div[@class="p-name"]/a/@href').extract()
            if not imgurls or not urls:
                # Indexing [0] on an empty result would raise IndexError,
                # losing every remaining product and the pagination step.
                self.log('skip sku %s: missing price image or link' % skuvalue)
                continue

            name = extract_value(sku.select('div[@class="p-name"]/a/text()'))
            # The request targets the price image, not the product page.
            # NOTE(review): presumably the callback for this request reads
            # the price out of the image -- confirm against the handler.
            request = self.make_request_from_response(
                imgurls[0],
                cur_idepth=self.basic_link_info.cur_idepth,
                gurl=urls[0], name=name, sku=skuvalue
            )
            self.crawl(request)
            item_num += 1

        self.next_page(hxs)
        return item_num
Пример #3
0
    def process(self):
        """Save every product of the listing page and follow the pager.

        Each ``<li>`` under ``div#prodlist`` yields a URL (``BASE_URL`` plus
        the item's link), a price (the ``ltprice`` text run through
        ``canonicalize_price``) and a name, which are passed to
        ``self.save`` together with an empty tuple. ``self.next_page`` is
        called once all items are handled.

        Returns:
            The number of items passed to ``self.save``.
        """
        hxs = HtmlXPathSelector(self.response)
        saved = 0
        for entry in hxs.select('//div[@id="prodlist"]/li'):
            href = extract_value(entry.select('a/@href'))
            url = "%s%s" % (self.BASE_URL, href)

            price_text = extract_value(entry.select(
                'p[@class="pimg"]/span[@class="pinfo"]/i[@class="ltprice"]/text()'
            ))
            price = canonicalize_price(price_text)

            name = extract_value(entry.select(
                'p[@class="pimg"]/span[@class="pname"]/a/text()'
            ))

            self.save(url, name, (), price)
            saved += 1

        self.next_page(hxs)
        return saved
Пример #4
0
    def process_entrypage(self):
        """Queue a request for every second-level category on the entry page.

        The page is organised in "floor" blocks. For each floor except the
        first, the floor heading gives the top-level category and every
        ``dt`` entry gives a sub-category plus a link. One request per
        sub-category is queued through ``self.crawl`` with
        ``cat=[cat1, cat2]`` attached.

        Returns:
            The number of requests queued.
        """
        hxs = HtmlXPathSelector(self.response)
        floors = hxs.select('//div[@class="sFloors l"]/div[@class="sFloor clearfix"]')
        queued = 0
        # The first floor is the book section, which is not wanted.
        for floor in floors[1:]:
            cat1 = extract_value(floor.select("h3/a/text()"))
            for heading in floor.select("ul/li/dl/dt"):
                cat2 = extract_value(heading.select("a/text()"))
                href = extract_value(heading.select("a/@href"))
                request = self.make_request_from_response(
                    "http://www.360buy.com/%s" % href,
                    cur_idepth=self.basic_link_info.cur_idepth,
                    cat=[cat1, cat2],
                )
                self.crawl(request)
                queued += 1

        return queued
Пример #5
0
    def next_page(self, hxs):
        """Queue the request for the page following the current listing page.

        The last and the current page numbers are read from the pager. When
        the current page is already the last one nothing is queued. From the
        first page (a static ``pcd_..._.html`` URL) the follow-up URL is
        assembled from the underscore-separated fields of the file name;
        from any later page the ``currentPage`` query parameter of the
        response URL is advanced by one.

        Args:
            hxs: selector built from the current response.
        """
        # NOTE(review): this selects the <a> element itself, not its text();
        # confirm extract_value yields a bare number for it.
        last_page = int(extract_value(hxs.select('//a[@id="pageLast"]')))
        # Pass the selector itself, like every other extract_value call,
        # rather than an already-extracted list.
        current_page = int(extract_value(
            hxs.select('//div[@class="snPages"]/a[@class="current"]/text()')
        ))

        if current_page >= last_page:
            # Already on the last page (this includes single-page listings,
            # for which a follow-up request used to be queued anyway).
            return

        if current_page == 1:
            # http://www.suning.com/emall/pcd_10052_10051_-7_N_20089_20002_.html
            # ['pcd', '10052', '10051', '-7', 'N', '20089', '20002', '.html']
            fs = self.response.url.split("/")[-1].split("_")
            url = (
                "http://www.suning.com/emall/secondPointSearchNewCmd?"
                "storeId=%s&catalogId=%s&categoryId=%s&topBrandName="
                "&top=N&top_category=%s&sortIndex=5&currentPage=1&isList=0" % (fs[1], fs[2], fs[-2], fs[-3])
            )
        else:
            # The URL carries the displayed page number minus one, so the
            # next page is reached by writing the displayed number into it.
            url = self.response.url.replace("currentPage=%s" % (current_page - 1), "currentPage=%s" % current_page)

        if url:
            request = self.make_request_from_response(url=url)
            # NOTE(review): the category is handed to crawl() here, not to
            # make_request_from_response(); confirm crawl() accepts it.
            self.crawl(request, cat=self.response.meta["cat"])
            self.log("next page:%s" % request.url)
Пример #6
0
    def process_entrypage(self):
        """Queue a request for every third-level category on the entry page.

        The page is walked three levels deep: each ``div`` whose id contains
        ``JDS_`` gives the top-level category, each ``dl.fore`` inside it
        the second level and each ``dd/em`` the third level plus a link.
        One request per third-level entry is queued through ``self.crawl``
        with ``cat=(cat1, cat2, cat3)`` attached. Entries without a link
        are skipped.

        Returns:
            The number of requests queued.
        """
        item_num = 0
        hxs = HtmlXPathSelector(self.response)
        for link in hxs.select('//div[contains(@id, "JDS_")]'):
            cat1 = extract_value(link.select('div[@class="mt"]/h2/a/text()'))
            for dl in link.select('div[@class="mc"]/dl[@class="fore"]'):
                cat2 = extract_value(dl.select('dt/a/text()'))
                for dd in dl.select('dd/em'):
                    hrefs = dd.select('a/@href').extract()
                    if not hrefs:
                        # Indexing [0] on an empty result would raise
                        # IndexError and abort the whole entry page.
                        continue

                    cat3 = extract_value(dd.select('a/text()'))
                    catlist = (cat1, cat2, cat3)
                    request = self.make_request_from_response(
                        "http://www.360buy.com/%s" % hrefs[0],
                        cur_idepth=self.basic_link_info.cur_idepth,
                        cat=catlist)
                    self.crawl(request)
                    item_num += 1

        return item_num
Пример #7
0
    def process_listpage(self):
        """Queue one image request per product on a listing page.

        Each ``<li>`` under ``div#proShow/ul`` is one product. Its title,
        link and image URL are read from the ``inforBg`` block. A request
        for the image URL is queued through ``self.crawl``, carrying the
        product link, the product name and the category taken from the
        current response's meta. ``self.next_page`` is called afterwards.

        Returns:
            The number of requests queued.
        """
        hxs = HtmlXPathSelector(self.response)
        count = 0
        for product in hxs.select('//div[@id="proShow"]/ul/li'):
            title = extract_value(product.select('div[@class="inforBg"]/span/a/@title'))
            link = extract_value(product.select('div[@class="inforBg"]/span/a/@href'))
            image = extract_value(product.select('div[@class="inforBg"]/p/img/@src'))

            # Everything besides the URL rides along as keyword arguments.
            extras = dict(
                cur_idepth=self.basic_link_info.cur_idepth,
                prod_url=link,
                prod_name=title,
                cat=self.response.meta["cat"],
            )
            self.crawl(self.make_request_from_response(image, **extras))
            count += 1

        self.next_page(hxs)
        return count
Пример #8
0
    def process_listpage(self):
        """Collect the products of a listing page and request their prices.

        For every ``<li>`` of the goods list the product name and the id of
        its price ``<span>`` are read. A single request is then queued
        through ``self.crawl`` for ``PRICE_REQUST_URL`` followed by the
        comma-joined ids; the id-to-name mapping travels with it as
        ``gname_dict``. When the page lists no products that request is
        not issued. ``self.next_page`` is called in either case.

        Returns:
            The number of products found on the page.
        """
        item_num = 0
        hxs = HtmlXPathSelector(self.response)
        goods = hxs.select('//div[@class="floorConn"]/div[@class="goodsList"]/ul/li')
        gname_dict = {}
        gid_list = []
        for item in goods:
            name = extract_value(item.select('.//p[@class="productName"]/@title'))
            gid = extract_value(item.select('.//span[@name="price"]/@id'))
            gid_list.append(gid)
            gname_dict[gid] = name
            item_num += 1

        if gid_list:
            # Without ids the price URL would carry an empty id list, so
            # the request is only worth making when products were found.
            request = self.make_request_from_response(
                url="%s%s" % (self.PRICE_REQUST_URL, ",".join(gid_list)),
                cur_idepth=self.basic_link_info.cur_idepth,
                gname_dict=gname_dict,
            )
            self.crawl(request)

        self.next_page(hxs)
        return item_num