def next_page(self, hxs):
    """Follow the pager's "next page" link, if one is present.

    Walks the anchors under ``div#bottom_pagenum/span`` and, for the first
    anchor whose text contains '下一页' ("next page"), builds a request for
    ``BASE_URL + href``, hands it to ``self.crawl`` and logs the target URL.

    :param hxs: selector built from the current response.
    """
    pager_links = hxs.select('//div[@id="bottom_pagenum"]/span/a')
    for anchor in pager_links:
        label = extract_value(anchor.select('text()'))
        if label.find('下一页') == -1:
            # Not the "next page" anchor; keep looking.
            continue
        href = extract_value(anchor.select('@href'))
        request = self.make_request_from_response(
            url="%s%s" % (self.BASE_URL, href),
        )
        self.crawl(request)
        self.log('next page:%s' % request.url)
        # Only the first matching anchor is followed.
        return
def process_listpage(self):
    """Schedule one image request per product on a listing page.

    For every ``li`` element carrying a ``sku`` attribute, the image found
    under the ``p-price`` div is requested; the product link, name and sku
    value are passed along as keyword arguments of the request. Afterwards
    the next listing page (if any) is scheduled via ``self.next_page``.

    :return: number of products for which a request was scheduled.
    """
    hxs = HtmlXPathSelector(self.response)
    depth = self.basic_link_info.cur_idepth
    scheduled = 0
    for node in hxs.select('//li[@sku]'):
        sku_id = ''.join(node.select('@sku').extract())
        # [0] raises IndexError when the node has no image / link.
        price_img = node.select('div[@class="p-price"]/img/@src').extract()[0]
        product_url = node.select('div[@class="p-name"]/a/@href').extract()[0]
        title = extract_value(node.select('div[@class="p-name"]/a/text()'))
        request = self.make_request_from_response(
            price_img,
            cur_idepth=depth,
            gurl=product_url,
            name=title,
            sku=sku_id,
        )
        self.crawl(request)
        # NOTE(review): a disabled inline gocr() price-recognition step used
        # to sit here; presumably the image response is now processed
        # elsewhere -- confirm.
        scheduled += 1
    self.next_page(hxs)
    return scheduled
def process(self):
    """Save every product found in ``div#prodlist`` and follow pagination.

    Each ``li`` under ``div#prodlist`` yields a url (``BASE_URL`` + link
    href), a name and a price (raw text passed through
    ``canonicalize_price``), which are handed to ``self.save`` together
    with an empty tuple. The next page is then scheduled via
    ``self.next_page``.

    :return: number of products saved.
    """
    price_xpath = 'p[@class="pimg"]/span[@class="pinfo"]/i[@class="ltprice"]/text()'
    name_xpath = 'p[@class="pimg"]/span[@class="pname"]/a/text()'

    hxs = HtmlXPathSelector(self.response)
    item_num = 0
    for item_num, entry in enumerate(hxs.select('//div[@id="prodlist"]/li'), 1):
        href = extract_value(entry.select('a/@href'))
        url = "%s%s" % (self.BASE_URL, href)
        price = canonicalize_price(extract_value(entry.select(price_xpath)))
        name = extract_value(entry.select(name_xpath))
        self.save(url, name, (), price)
    self.next_page(hxs)
    return item_num
def process_entrypage(self):
    """Schedule a crawl for every second-level category on the entry page.

    The page is split into "floor" blocks; each floor's ``h3`` link text is
    the first-level category and each ``dt`` link beneath it is a
    second-level category whose href is requested with
    ``cat=[cat1, cat2]``.

    :return: number of category requests scheduled.
    """
    hxs = HtmlXPathSelector(self.response)
    floors = hxs.select('//div[@class="sFloors l"]/div[@class="sFloor clearfix"]')
    depth = self.basic_link_info.cur_idepth
    scheduled = 0
    # The first floor is the book section, which is not wanted.
    for floor in floors[1:]:
        cat1 = extract_value(floor.select("h3/a/text()"))
        for heading in floor.select("ul/li/dl/dt"):
            cat2 = extract_value(heading.select("a/text()"))
            href = extract_value(heading.select("a/@href"))
            request = self.make_request_from_response(
                "http://www.360buy.com/%s" % href,
                cur_idepth=depth,
                cat=[cat1, cat2],
            )
            self.crawl(request)
            scheduled += 1
    return scheduled
def next_page(self, hxs):
    """Schedule the next listing page, if there is one.

    The displayed page number is read from the pager. From page 1 (the
    static category page) the first search-command URL is built from the
    underscore-separated fields of the current URL's file name. From any
    later page that is not the last, the ``currentPage`` query parameter of
    the current URL is bumped by one.

    :param hxs: selector built from the current response.
    :raises ValueError: if a page number cannot be converted by ``int()``.
    """
    last_page = int(extract_value(hxs.select('//a[@id="pageLast"]')))
    current_page = int(extract_value(
        hxs.select('//div[@class="snPages"]/a[@class="current"]/text()').extract()
    ))

    url = None
    if current_page == 1:
        # http://www.suning.com/emall/pcd_10052_10051_-7_N_20089_20002_.html
        # ['pcd', '10052', '10051', '-7', 'N', '20089', '20002', '.html']
        fs = self.response.url.split("/")[-1].split("_")
        # The parameter must be spelled "&currentPage=": the branch below
        # advances pagination by rewriting "currentPage=<n>" in this URL.
        # (The literal had been garbled to "¤tPage" because "&curren" is
        # also an HTML entity, which made that rewrite a no-op.)
        # NOTE(review): this branch fires even when last_page is 1 --
        # confirm whether a single-page listing should request page 2.
        url = (
            "http://www.suning.com/emall/secondPointSearchNewCmd?"
            "storeId=%s&catalogId=%s&categoryId=%s&topBrandName="
            "&top=N&top_category=%s&sortIndex=5&currentPage=1&isList=0"
            % (fs[1], fs[2], fs[-2], fs[-3])
        )
    elif current_page < last_page:
        # The query parameter is one less than the displayed page number,
        # so displayed page N was fetched with currentPage=N-1.
        url = self.response.url.replace(
            "currentPage=%s" % (current_page - 1),
            "currentPage=%s" % current_page,
        )

    if url:
        request = self.make_request_from_response(url=url)
        # NOTE(review): other callers pass extra keywords such as ``cat`` to
        # make_request_from_response rather than to crawl -- confirm that
        # crawl accepts ``cat``.
        self.crawl(request, cat=self.response.meta["cat"])
        self.log("next page:%s" % request.url)
def process_entrypage(self):
    """Schedule a crawl for every third-level category on the entry page.

    Walks each ``div`` whose id contains ``JDS_`` (first-level category),
    its ``dl.fore`` groups (second level) and their ``dd/em`` entries
    (third level). One request is scheduled per third-level link, carrying
    the ``(cat1, cat2, cat3)`` tuple as ``cat``.

    :return: number of category requests scheduled.
    """
    hxs = HtmlXPathSelector(self.response)
    depth = self.basic_link_info.cur_idepth
    scheduled = 0
    for block in hxs.select('//div[contains(@id, "JDS_")]'):
        cat1 = extract_value(block.select('div[@class="mt"]/h2/a/text()'))
        for group in block.select('div[@class="mc"]/dl[@class="fore"]'):
            cat2 = extract_value(group.select('dt/a/text()'))
            for leaf in group.select('dd/em'):
                cat3 = extract_value(leaf.select('a/text()'))
                # [0] raises IndexError when the entry has no link.
                href = leaf.select('a/@href').extract()[0]
                request = self.make_request_from_response(
                    "http://www.360buy.com/%s" % href,
                    cur_idepth=depth,
                    cat=(cat1, cat2, cat3),
                )
                self.crawl(request)
                scheduled += 1
    return scheduled
def process_listpage(self):
    """Schedule one image request per product in ``div#proShow``.

    For each product ``li`` the image ``src`` under the ``inforBg`` div is
    requested; the product's link, title and the current response's ``cat``
    meta value are passed as keyword arguments of the request. The next
    listing page is then scheduled via ``self.next_page``.

    :return: number of products for which a request was scheduled.
    """
    hxs = HtmlXPathSelector(self.response)
    scheduled = 0
    for entry in hxs.select('//div[@id="proShow"]/ul/li'):
        anchor = 'div[@class="inforBg"]/span/a'
        title = extract_value(entry.select(anchor + '/@title'))
        link = extract_value(entry.select(anchor + '/@href'))
        image = extract_value(entry.select('div[@class="inforBg"]/p/img/@src'))
        request = self.make_request_from_response(
            image,
            cur_idepth=self.basic_link_info.cur_idepth,
            prod_url=link,
            prod_name=title,
            cat=self.response.meta["cat"],
        )
        self.crawl(request)
        scheduled += 1
    self.next_page(hxs)
    return scheduled
def process_listpage(self):
    """Collect product ids and names, then request their prices in one call.

    Every product ``li`` contributes the ``id`` of its ``span[name=price]``
    and the ``title`` of its ``p.productName``. A single request is then
    made to ``PRICE_REQUST_URL`` followed by the comma-joined ids, with the
    id -> name mapping passed as ``gname_dict``. The next listing page is
    scheduled afterwards via ``self.next_page``.

    :return: number of products found on the page.
    """
    hxs = HtmlXPathSelector(self.response)
    product_nodes = hxs.select(
        '//div[@class="floorConn"]/div[@class="goodsList"]/ul/li'
    )

    ids = []
    names_by_id = {}
    for node in product_nodes:
        title = extract_value(node.select('.//p[@class="productName"]/@title'))
        price_id = extract_value(node.select('.//span[@name="price"]/@id'))
        ids.append(price_id)
        names_by_id[price_id] = title

    # The price request is issued even when no products were found.
    price_url = "%s%s" % (self.PRICE_REQUST_URL, ",".join(ids))
    request = self.make_request_from_response(
        url=price_url,
        cur_idepth=self.basic_link_info.cur_idepth,
        gname_dict=names_by_id,
    )
    self.crawl(request)
    self.next_page(hxs)
    return len(ids)