def parse_info(self, response):
    """Extract a TexasItem (inmate record) from a detail page.

    Reads label/value pairs from the page's main table and from auxiliary
    <p>/<span> pairs, mapping page labels to item fields via ``self.lookup``.
    Returns the populated item.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select("//table/tr")
    item = TexasItem()
    # Remove the last 5 chars from the identifying URL (presumably a
    # ".html" suffix — TODO confirm).
    item['ident'] = response.url[0:-5]
    # Rip out the info, compare using lookup table: the last two text cells
    # of each row form a (label, value) pair.
    for tr in rows:
        td = tr.select("td/text()")
        l = len(td)
        key = td[l-2].extract()
        val = td[l-1].extract()
        item[self.lookup[key]] = val
    values = hxs.select("//p/text()")
    keys = hxs.select("//p/span/text()")
    # Rip down the auxiliary data; values are offset by one from their keys.
    for i in range(len(keys)-1):
        key = keys[i].extract()
        val = values[i+1].extract()
        item[self.lookup[key]] = self.cleanString(val)
    # And lastly the mugshot (first table image, if any).
    hxs = hxs.select("//table/tr/td/img/@src")
    if len(hxs.extract()) >= 1:
        item['mugshot'] = "http://www.tdcj.state.tx.us/stat/dr_info/" + hxs.extract()[0]
    return item
def parse(self, response):
    """Yield one Product per condition grade ('Like New'/'Fair'/'Poor').

    Grade prices and deduction options are scraped from inline JavaScript
    (``productDeduc.push``/``productAns.push``/``pp1=`` fragments) via
    ``self.get_data``.
    """
    hxs = HtmlXPathSelector(response)
    deduct = self.get_data(hxs.extract(), 'productDeduc.push("', '")')
    answers = self.get_data(hxs.extract(), 'productAns.push("', '")')
    checks = hxs.select('//form[@name="customVal"]/table/tr/td[@style="padding-left:30px;"]/text()').extract()
    answers_deduct = zip(answers, deduct)
    # (option label, (answer, deduction)) triples consumed by calc_price.
    options = zip(checks, answers_deduct)
    # Gets the prices.  The last 'pp1=' match is dropped ([:-1]) — presumably
    # a sentinel in the page script; verify against the markup.
    prices = zip(['Like New', 'Fair', 'Poor'], self.get_data(hxs.extract(), 'pp1=', ';\n')[:-1])
    for grade, price in prices:
        loader = ProductLoader(item=Product(), response=response)
        name = hxs.select('//*[@id="vmMainPage"]/div/div/div/div/h1/text()').extract()[0]
        loader.add_value('name', ' '.join((name, grade)))
        loader.add_value('price', self.calc_price(float(price), options))
        loader.add_value('url', response.url)
        yield loader.load_item()
def parse_products(self, response):
    """Scrape products from a listing page, following rel="next" pagination.

    Category pages are re-requested so the default callback handles them.
    Product rows have no wrapper element, so the raw HTML is patched: the
    pagination spacer row and the separator row are rewritten into
    <table class="item"> boundaries before re-parsing.
    """
    hxs = HtmlXPathSelector(response)
    categories = hxs.select(
        '//p[@class="catname"]/strong/a/@href').extract()
    if categories:
        # Category page: re-request same URL (dont_filter bypasses dedup).
        yield Request(response.url, dont_filter=True)
    else:
        # Exact markup of the pagination spacer row used as a split marker.
        tr = ('<tr><td colspan="3" align="center" class="pagenums">' +
              '<p class="pagenums">\r\n\t\t\t\t ' +
              '<img src="images/clearpixel.gif" width="300" ' +
              'height="8" alt=""></p></td>\r\n\t\t\t </tr>')
        tr_end = '<tr>' + hxs.select('//td[@class="prodseparator"]').\
            extract()[0].decode('utf') + '</tr>'
        html = hxs.extract().replace(tr, '<table class="item">').\
            replace(tr_end, '</table><table class="item">')
        products_hxs = HtmlXPathSelector(text=html)
        products = products_hxs.select('//table[@class="item"]')
        for product in products:
            name = product.select(
                'tr/td/strong/div[@class="prodname"]/a/text()').extract()
            if name:
                name = name[0]
            url = product.select(
                'tr/td/strong/div[@class="prodname"]/a/@href').extract(
                )
            if url:
                url = url[0]
            price_options = product.select(
                'tr/td/form/script').extract()
            if price_options:
                # Multiple price/description options embedded in a script tag.
                price_values = self._get_prices(price_options[0])
                for price, desc in price_values:
                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('name', ' '.join((name, desc)))
                    loader.add_value(
                        'url', urljoin_rfc(get_base_url(response), url))
                    loader.add_value('price', price)
                    yield loader.load_item()
            else:
                price = product.select(
                    'tr/td/div[@class="prodprice"]/span/text()'
                ).extract()
                if price:
                    price = price[0]
                else:
                    price = 0.0
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('name', name)
                loader.add_value(
                    'url', urljoin_rfc(get_base_url(response), url))
                loader.add_value('price', price)
                yield loader.load_item()
        # NOTE: `next` shadows the builtin; harmless in this scope.
        next = hxs.select(
            '//a[@class="ectlink" and @ rel="next"]/@href').extract()
        if next:
            url = urljoin_rfc(get_base_url(response), next[0])
            yield Request(url, callback=self.parse_products)
def parse_product(self, response):
    """Parse a product page: fill the basic Product fields, then fire the
    site's AJAX sizes endpoint to fetch the product's options.

    If the page is actually a listing (product boxes present), recurse into
    each linked product instead.
    """
    hxs = HtmlXPathSelector(response)
    product_list = hxs.select(u'//div[@class="box"]/div/a/@href').extract()
    if product_list:
        # Listing page: follow every product link with the same callback.
        for url in product_list:
            url = urljoin_rfc(get_base_url(response), url)
            yield Request(url, callback=self.parse_product, meta=response.meta)
        return
    # Fill main product fields.
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', response.meta.get('category'))
    img = hxs.select('//div[@class="image"]/img/@src').extract()
    if img:
        product_loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))
    brand = ''.join(hxs.select('//li[contains(text(), "Brand")]/text()').extract()).replace('Brand: ', '')
    product_loader.add_value('brand', brand)
    # (Cleanup: the previous version extracted the page title into a local
    # `name` that was never used; the name is filled in later from options.)
    product = product_loader.load_item()
    # Construct the URL for the AJAX request that returns all product options.
    pid = hxs.select('//input[@name="product_id"]/@value').extract()
    if pid:
        # FIX: the old loop reset `clearance` on every non-matching line, so
        # it reflected only the LAST line of the page source.  The flag is
        # true iff ANY line declares `var isclearance ... true`.
        clearance = 'true' if any(
            'var isclearance' in line.lower() and 'true' in line.lower()
            for line in hxs.extract().split('\n')) else 'false'
        url = 'http://www.mattressnextday.co.uk/?route=api/product/sizes&timestamp={}&productId={}&callback=jQuery110206226998819969816_1389291112656&storeId=0&isClearance={}&_=1389291112657'.format(int(time.time()), pid[0], clearance)
        yield Request(url, meta={'product': product}, callback=self.get_product_options)
    else:
        self.log('ERROR! Unable to parse product ID from url: {}'.format(response.url))
def parse_game(self, response):
    """Parse a J! Archive game page into a list of JarchiveItem clues."""
    self.log("Found game page %s" % response.url)
    hxs = HtmlXPathSelector(response)
    clues = hxs.select('//td[@class="clue"]')
    jitems = []
    game = first(hxs.select('//div[@id="game_title"]/h1/text()').extract())
    # Category names in document order (first round, then double jeopardy).
    cats = hxs.select('//td[@class="category_name"]/text()').extract()
    self.log(game)
    for clue in clues:
        jitem = JarchiveItem()
        # The correct response is hidden inside the cell's onmouseover JS.
        found = clue.select('table/tr/td/div/@onmouseover').extract()
        if len(found) > 0:
            # Clue id splits as [prefix, round, column, ...] — presumably
            # e.g. "clue_DJ_3_2"; verify against the site markup.
            clueinfo = first(clue.select('.//td[@class="clue_text"]/@id').extract()).split("_")
            round = clueinfo[1]  # NOTE: shadows the builtin `round`
            cluecol = int(clueinfo[2])-1
            if round == "DJ":
                # Double-jeopardy categories follow the six first-round ones.
                cluecol += 6
            togglebox = found[0].split("', '")
            cr = HtmlXPathSelector(text=togglebox[2]).select(".//em[@class='correct_response']/text()")
            cr = first(cr.extract())
            v = first(clue.select('.//td[@class="clue_value"]/text()').extract())
            if v:
                v = v[1:]  # drop the leading character (presumably "$")
            c = first(clue.select('.//td[@class="clue_text"]/text()').extract())
            (jitem['correct_response'], jitem['value'], jitem['clue'],
             jitem['game'], jitem['category']) = cr, v, c, game, cats[cluecol]
            jitems.append(jitem)
    return jitems
def parse_product(self, response):
    """Build a Product from JSON-like fragments embedded in the page body."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    prod = hxs.extract()
    if prod:
        url = response.meta['prod_url']
        url = urljoin_rfc(self.start_urls[0], url)
        # Title appears as "title":"..." in the embedded JSON.
        name = re.search('"title":"([^"]*)"+', prod).group()
        name = name.split(":")[1].strip('"').strip()
        if name:
            name_sufix = ''
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', url)
            sku = re.search('"reference":"(?:[^\\"]+|\\.)*"', prod)  #re.search('"reference":"(\d+-\d+\S\s+\w+)', prod)
            if not sku:
                # Fallback pattern for references like "12-34Xabc".
                sku = re.search('"reference":"(\d+-\d+\S\w+)', prod)
            if sku:
                sku = sku.group().split(':')[1].strip('",')
                loader.add_value('sku', sku)
                loader.add_value('identifier', sku)
                # Everything after the first dash becomes a name suffix.
                name_sufix = '-'.join(sku.split('-')[1:])
            if name_sufix:
                loader.add_value('name', name+' ('+name_sufix+')')
            else:
                loader.add_value('name', name)
            price = re.findall('"flat_price_inc":"(\d+.\d+)', prod)
            if price:
                # NOTE(review): price[0][:-1] drops the last captured char —
                # presumably trimming a trailing digit/quote artifact; confirm.
                loader.add_value('price', price[0][:-1])
            yield loader.load_item()
def parse_game(self, response):
    """Parse a J! Archive game page and return the list of clue items."""
    self.log("Found game page %s" % response.url)
    hxs = HtmlXPathSelector(response)
    clues = hxs.select('//td[@class="clue"]')
    jitems = []
    game = first(hxs.select('//div[@id="game_title"]/h1/text()').extract())
    # All category names in document order; double-jeopardy ones come after
    # the six first-round categories.
    cats = hxs.select('//td[@class="category_name"]/text()').extract()
    self.log(game)
    for clue in clues:
        jitem = JarchiveItem()
        # Correct response lives inside the onmouseover JS of the clue cell.
        found = clue.select('table/tr/td/div/@onmouseover').extract()
        if len(found) > 0:
            # id splits as [prefix, round, column, ...] — TODO confirm format.
            clueinfo = first(
                clue.select(
                    './/td[@class="clue_text"]/@id').extract()).split("_")
            round = clueinfo[1]  # shadows builtin `round`; local only
            cluecol = int(clueinfo[2]) - 1
            if round == "DJ":
                cluecol += 6  # offset into the double-jeopardy categories
            togglebox = found[0].split("', '")
            cr = HtmlXPathSelector(text=togglebox[2]).select(
                ".//em[@class='correct_response']/text()")
            cr = first(cr.extract())
            v = first(
                clue.select('.//td[@class="clue_value"]/text()').extract())
            if v:
                v = v[1:]  # drop leading character (presumably "$")
            c = first(
                clue.select('.//td[@class="clue_text"]/text()').extract())
            (jitem['correct_response'], jitem['value'], jitem['clue'],
             jitem['game'], jitem['category']) = cr, v, c, game, cats[cluecol]
            jitems.append(jitem)
    return jitems
def parse_product(self, response):
    """Extract a Product from JSON-like fragments in the raw page body."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    prod = hxs.extract()
    if prod:
        url = response.meta['prod_url']
        url = urljoin_rfc(self.start_urls[0], url)
        # "title":"..." fragment of the embedded JSON.
        name = re.search('"title":"([^"]*)"+', prod).group()
        name = name.split(":")[1].strip('"').strip()
        if name:
            name_sufix = ''
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', url)
            sku = re.search(
                '"reference":"(?:[^\\"]+|\\.)*"',
                prod)  #re.search('"reference":"(\d+-\d+\S\s+\w+)', prod)
            if not sku:
                # Alternative reference format fallback.
                sku = re.search('"reference":"(\d+-\d+\S\w+)', prod)
            if sku:
                sku = sku.group().split(':')[1].strip('",')
                loader.add_value('sku', sku)
                loader.add_value('identifier', sku)
                # Suffix = everything after the first dash of the SKU.
                name_sufix = '-'.join(sku.split('-')[1:])
            if name_sufix:
                loader.add_value('name', name + ' (' + name_sufix + ')')
            else:
                loader.add_value('name', name)
            price = re.findall('"flat_price_inc":"(\d+.\d+)', prod)
            if price:
                # NOTE(review): [:-1] trims the last matched character —
                # verify this is intentional against sample data.
                loader.add_value('price', price[0][:-1])
            yield loader.load_item()
def test_null_bytes(self):
    """Null bytes in the input markup are stripped by both selector types."""
    markup_with_nul = '<root>la\x00la</root>'
    # HTML selector wraps the fragment in html/body and drops the NUL.
    self.assertEqual(
        HtmlXPathSelector(text=markup_with_nul).extract(),
        u'<html><body><root>lala</root></body></html>')
    # XML selector keeps the fragment as-is, minus the NUL.
    self.assertEqual(
        XmlXPathSelector(text=markup_with_nul).extract(),
        u'<root>lala</root>')
def parse(self, response):
    """Read the total result count and schedule one request per listing page.

    The count appears in the markup as ``of <b>N</b``; 96 products fit on a
    page, and the range upper bound gets +2 (exclusive bound + partial page).
    """
    hxs = HtmlXPathSelector(response)
    p = re.compile("of <b>(\d+)</b", re.IGNORECASE)
    total = p.findall(hxs.extract())
    # FIX: guard against a missing count (layout change / empty results) —
    # previously total[0] raised IndexError.
    if not total:
        self.log("No total result count found on %s" % response.url)
        return
    pages = int(int(total[0]) / 96) + 2
    for i in range(1, pages):
        next_url = self.start_urls[0] + "&page=" + str(i)
        yield Request(next_url, meta={"cur": i, "attempt": 1}, callback=self.parse_items)
def parse(self, response):
    """Schedule one request per result page based on the page's total count.

    The total is scraped from ``of <b>N</b`` in the raw markup; with 96
    items per page, ``pages`` is the exclusive range bound (+2 covers the
    partial last page).
    """
    hxs = HtmlXPathSelector(response)
    p = re.compile('of <b>(\d+)</b', re.IGNORECASE)
    total = p.findall(hxs.extract())
    if not total:
        # FIX: previously an unguarded total[0] raised IndexError whenever
        # the count was absent from the page.
        self.log('No total result count found on %s' % response.url)
        return
    pages = int(int(total[0])/96) + 2
    for i in range(1, pages):
        next_url = self.start_urls[0] + "&page=" + str(i)
        yield Request(next_url, meta={'cur': i, 'attempt': 1}, callback=self.parse_items)
def parse_categories(self, response):
    """Follow every sub-category link on a category page.

    The links have no wrapper element in the page markup, so the raw HTML
    is patched to introduce a synthetic <div id="sub_categories"> that can
    then be targeted by XPath.
    """
    raw_html = HtmlXPathSelector(response).extract()
    patched = raw_html.replace('Sub Categories', '<div id="sub_categories">')
    patched = patched.replace('<p> </p>', '</div>')
    patched_hxs = HtmlXPathSelector(text=patched)
    hrefs = patched_hxs.select('//*[@id="sub_categories"]/a/@href').extract()
    for href in hrefs:
        absolute = urljoin_rfc(get_base_url(response), href)
        yield Request(absolute, self.parse_products)
def parse(self, response):
    """Emit one Product per condition grade, priced from inline JS data."""
    hxs = HtmlXPathSelector(response)
    # Deductions and answers are pushed into JS arrays in the page source.
    deduct = self.get_data(hxs.extract(), 'productDeduc.push("', '")')
    answers = self.get_data(hxs.extract(), 'productAns.push("', '")')
    checks = hxs.select(
        '//form[@name="customVal"]/table/tr/td[@style="padding-left:30px;"]/text()'
    ).extract()
    answers_deduct = zip(answers, deduct)
    options = zip(checks, answers_deduct)
    # Gets the prices.  [:-1] drops the final 'pp1=' match — presumably a
    # sentinel; verify against the page script.
    prices = zip(['Like New', 'Fair', 'Poor'],
                 self.get_data(hxs.extract(), 'pp1=', ';\n')[:-1])
    for grade, price in prices:
        loader = ProductLoader(item=Product(), response=response)
        name = hxs.select(
            '//*[@id="vmMainPage"]/div/div/div/div/h1/text()').extract()[0]
        loader.add_value('name', ' '.join((name, grade)))
        loader.add_value('price', self.calc_price(float(price), options))
        loader.add_value('url', response.url)
        yield loader.load_item()
def test_selector_over_text(self):
    """Selectors built from raw text normalize the markup appropriately."""
    markup = '<root>lala</root>'
    # HTML parsing wraps the fragment in html/body.
    self.assertEqual(HtmlXPathSelector(text=markup).extract(),
                     u'<html><body><root>lala</root></body></html>')
    # XML parsing preserves the fragment verbatim.
    self.assertEqual(XmlXPathSelector(text=markup).extract(),
                     u'<root>lala</root>')
    # Selecting the context node ('.') yields the document as a one-item list.
    self.assertEqual(XmlXPathSelector(text=markup).select('.').extract(),
                     [u'<root>lala</root>'])
def parse_categories(self, response):
    """Follow all sub-category links found on a category page.

    There is no container element around the links, so the raw HTML is
    patched to wrap them in <div id="sub_categories"> before re-parsing.
    """
    hxs = HtmlXPathSelector(response)
    html = hxs.extract().replace('Sub Categories',
                                 '<div id="sub_categories">').replace(
                                     '<p> </p>', '</div>')
    new_hxs = HtmlXPathSelector(text=html)
    sub_categories = new_hxs.select(
        '//*[@id="sub_categories"]/a/@href').extract()
    for sub_category in sub_categories:
        url = urljoin_rfc(get_base_url(response), sub_category)
        yield Request(url, self.parse_products)
def parse_subproducts(self, response): hxs = HtmlXPathSelector(response) #Fix for the HTML code. html = hxs.extract().replace('<br></h3>','').\ replace('<h3','<div class="item"').\ replace('</p>\n <div','</p></div>\n <div').\ replace('<input type="radio"', '<div class="hd" ').\ replace('checked>','>').\ replace('</p></div>','</div></p></div>').\ replace('</p>\n', '</div></p>\n') products_hxs = HtmlXPathSelector(text=html) products = products_hxs.select('//div[@class="item"]') for product in products: sub_products = product.select('div[@class="hd"]') if sub_products: for sub_product in sub_products: value = sub_product.select('./@value').extract()[0] hd = sub_product.select('./text()').extract()[0] name = ' '.join( (product.select('p/text()').extract()[0], hd)) extracted = process.extractOne(name, self.products) try: if extracted[1] >= 98: url = 'http://sellusyourgadget.co.uk/index.php/home/getConditions/%s' yield Request(url % value.split(':')[0], callback=self.parse_options, meta={ 'id': response.meta['id'], 'name': name, 'memoryR': value, 'memory': value }) except TypeError: return else: name = product.select('p/text()').extract()[0] extracted = process.extractOne(name, self.products) try: if extracted[1] >= 98: value = product.select('p/input/@value').extract()[0] url = 'http://sellusyourgadget.co.uk/index.php/home/getConditions/%s' yield Request(url % value.split(':')[0], callback=self.parse_options, meta={ 'id': response.meta['id'], 'name': name, 'memoryR': value, 'memory': value }) except TypeError: return
def parse_detail(self, response):
    """Decode the response body using its chardet-detected charset.

    Logs a warning and bails out when no output file is configured.  On any
    decode failure, falls back to the selector's re-serialized markup.
    """
    outputfile = self.output_file
    if not outputfile:
        log.msg("download %s fail" % response.url, level=log.WARNING, spider=self)
        return
    content_type = chardet.detect(response.body)
    hxs = HtmlXPathSelector(response)
    maindoing = ''
    try:
        # Presumably chardet misdetects these GBK pages as ISO-8859-2 /
        # GB2312, so the encoding is forced — TODO confirm.
        if content_type['encoding'] in ['ISO-8859-2', 'GB2312']:
            content_type['encoding'] = 'gbk'
        maindoing = response.body.decode(content_type['encoding'])
    except Exception:
        # FIX: was `except Exception, e` — Python-2-only syntax with an
        # unused binding.  Broad catch is deliberate: any decode failure
        # (including encoding=None) falls back to the extracted markup.
        maindoing = hxs.extract()
def parse_article(self, response):
    """Return an Article when the page body matches any tracked keyword,
    else None."""
    self.log("Haciendo como que parseo el articulo %s" % response.url)
    hxs = HtmlXPathSelector(response)
    for k in keywords:
        if (re.search(k, hxs.extract())):
            self.log("El art. en %s contiene %s" % (response.url, k))
            # Title located by fixed positional indexes into nested tables —
            # brittle: breaks if the page layout changes.
            title = hxs.select('//table')[5].select('.//table')[2].select('.//font')[0].select('.//font')[2].select('text()').extract()
            item = Article()
            item['title'] = title[0]
            item['url'] = response.url
            return item
    return None
def parse_course_list(self, response, courselist):
    """Split the page into per-course HTML fragments and build course items.

    Course header rows are the <tr>s containing a blue font tag; the raw
    HTML between consecutive headers is that course's detail fragment, from
    which colour-coded annotations are extracted by regex.
    """
    retval = []
    hxs = HtmlXPathSelector(response)
    courses = hxs.select('.//tr[descendant::font[@color="#0000FF"]]')
    length = len(courses)
    if length == 0:  # no course to process
        return []
    # Everything after the first course header row.
    data = hxs.extract().split(courses[0].extract())[1]
    if length == 1:
        course_details = [data]
    else:
        course_details = []
        for course in courses[1:]:
            # Split off the fragment belonging to the previous course.
            s = data.split(course.extract())
            course_details.append(s[0])
            data = s[1]
        course_details.append(s[1])  # tail fragment = last course's detail
    # sanity check
    assert (length == len(course_details))
    flags = re.UNICODE | re.MULTILINE  #| re.DOTALL
    for course, course_detail in zip(courses, course_details):
        code_title_au_dept = course.select('.//font/text()').extract()
        # Colour-coded annotations — RED/BROWN/GREEN/#FF00FF presumed to mean
        # pass-fail / mutually-exclusive / unavailable / prerequisite from the
        # variable names; confirm against the source site.
        passfail = filter(
            None, re.findall(u'<font.*color="RED">([^<]*)', course_detail,
                             flags))
        mutex = filter(
            None, re.findall(u'<font.*color="BROWN">([^<]*)', course_detail,
                             flags))
        unavail = filter(
            None, re.findall(u'<font.*color="GREEN">([^<]*)', course_detail,
                             flags))
        prereq = filter(
            None, re.findall(u'<font.*color="#FF00FF">([^<]*)', course_detail,
                             flags))
        desc = re.search(u'<font size="2">([^<]*)', course_detail).groups()[0]
        courseitem = self._fill_in(courselist, code_title_au_dept, passfail,
                                   mutex, unavail, prereq, desc)
        if courseitem:
            retval.append(courseitem)
    return retval
def parse_products(self, response):
    """Yield products from the listing table, or recurse into category and
    sub-category links when the page has no products.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//td[@class="td"]/div[@style="width:750px;'
                          ' padding: 10px 0px 10px 20px; "]/'
                          'table [@width="80%" and @cellpadding="4" and'
                          ' @border="0" and @align="center"]')
    if products:
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            # Two alternative layouts carry the product name.
            loader.add_xpath('name', 'tr/td/table/tr/td/strong/a/text()')
            loader.add_xpath('name', 'tr/td/div/strong/a/text()')
            url = product.select(
                'tr/td/table/tr/td/strong/a/@href').extract()
            if url:
                url = urljoin_rfc(get_base_url(response), url[0])
            else:
                # Fallback layout for the link.
                url = product.select('tr/td/div/strong/a/@href').extract()
                if url:
                    url = urljoin_rfc(get_base_url(response), url[0])
            loader.add_value('url', url)
            loader.add_xpath('price', 'tr/td/div[@class="HeadingText"]/text()')
            yield loader.load_item()
    else:
        try:
            categories = hxs.select(
                '//td[@class="td"]/div[@style="width:750px;'
                ' padding: 10px 0px 10px 20px; "]/'
                'table[@cellpadding="5"]')
            if categories:
                for category in categories:
                    url = urljoin_rfc(
                        get_base_url(response),
                        category.select(
                            'tr/td/a[@class="HeadingText"]/@href').extract(
                            )[0])
                    yield Request(url, dont_filter=True,
                                  callback=self.parse_products)
        except IndexError:
            # Category row without a link: skip silently.
            pass
        # Sub-category links lack a container; patch the markup to add one.
        html = hxs.extract().replace('Sub Categories',
                                     '<div id="sub_categories">').replace(
                                         '<p> </p>', '</div>')
        new_hxs = HtmlXPathSelector(text=html)
        sub_categories = new_hxs.select(
            '//*[@id="sub_categories"]/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, dont_filter=True, callback=self.parse_products)
def parse_article(self, response):
    """Return an Article item if any tracked keyword matches the page body."""
    self.log("Haciendo como que parseo el articulo %s" % response.url)
    hxs = HtmlXPathSelector(response)
    for k in keywords:
        if (re.search(k, hxs.extract())):
            self.log("El art. en %s contiene %s" % (response.url, k))
            # Positional table/font indexing — brittle against layout changes.
            title = hxs.select('//table')[5].select('.//table')[2].select(
                './/font')[0].select('.//font')[2].select(
                    'text()').extract()
            item = Article()
            item['title'] = title[0]
            item['url'] = response.url
            return item
    return None
def parse_subproducts(self, response): hxs = HtmlXPathSelector(response) # Fix for the HTML code. html = ( hxs.extract() .replace("<br></h3>", "") .replace("<h3", '<div class="item"') .replace("</p>\n <div", "</p></div>\n <div") .replace('<input type="radio"', '<div class="hd" ') .replace("checked>", ">") .replace("</p></div>", "</div></p></div>") .replace("</p>\n", "</div></p>\n") ) products_hxs = HtmlXPathSelector(text=html) products = products_hxs.select('//div[@class="item"]') for product in products: sub_products = product.select('div[@class="hd"]') if sub_products: for sub_product in sub_products: value = sub_product.select("./@value").extract()[0] hd = sub_product.select("./text()").extract()[0] name = " ".join((product.select("p/text()").extract()[0], hd)) extracted = process.extractOne(name, self.products) try: if extracted[1] >= 98: url = "http://sellusyourgadget.co.uk/index.php/home/getConditions/%s" yield Request( url % value.split(":")[0], callback=self.parse_options, meta={"id": response.meta["id"], "name": name, "memoryR": value, "memory": value}, ) except TypeError: return else: name = product.select("p/text()").extract()[0] extracted = process.extractOne(name, self.products) try: if extracted[1] >= 98: value = product.select("p/input/@value").extract()[0] url = "http://sellusyourgadget.co.uk/index.php/home/getConditions/%s" yield Request( url % value.split(":")[0], callback=self.parse_options, meta={"id": response.meta["id"], "name": name, "memoryR": value, "memory": value}, ) except TypeError: return
def parsePage(self, response):
    """Harvest the first e-mail address on the page into the carried item,
    then crawl outgoing links recursively with the same item attached.
    """
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    emails = collectAllEmail(hxs.extract())
    if len(emails) > 0:
        item['email'] = emails[0]
        yield item
    # NOTE(review): allow_domains expects domain names, but a full URL is
    # passed here — this likely filters out every link; confirm intent.
    extractor = SgmlLinkExtractor(allow_domains=response.url)
    for entry in extractor.extract_links(response):
        if entry.url is not None:
            req = Request(entry.url, callback=self.parsePage)
            req.meta['item'] = item
            yield req
def parse_detail(self, response):
    """Decode the body with its detected charset, falling back to the
    selector's re-serialized markup on any decode failure.

    Logs a warning and returns early when no output file is configured.
    """
    outputfile = self.output_file
    if not outputfile:
        log.msg("download %s fail" % response.url,
                level=log.WARNING,
                spider=self)
        return
    content_type = chardet.detect(response.body)
    hxs = HtmlXPathSelector(response)
    maindoing = ''
    try:
        # Presumably these pages are really GBK and chardet misdetects
        # them — the encoding is forced; TODO confirm.
        if content_type['encoding'] in ['ISO-8859-2', 'GB2312']:
            content_type['encoding'] = 'gbk'
        maindoing = response.body.decode(content_type['encoding'])
    except Exception:
        # FIX: was `except Exception, e` (Python-2-only, `e` unused).
        # Deliberate broad fallback: any failure uses the extracted markup.
        maindoing = hxs.extract()
def parse_products(self, response):
    """Scrape products from a listing page (re-requesting category pages),
    following rel="next" pagination.

    Product rows have no wrapper element, so the raw HTML is patched: the
    pagination spacer row and the separator row are rewritten into
    <table class="item"> boundaries before re-parsing.
    """
    hxs = HtmlXPathSelector(response)
    categories = hxs.select('//p[@class="catname"]/strong/a/@href').extract()
    if categories:
        # Category page: re-request same URL; dont_filter bypasses dedup.
        yield Request(response.url, dont_filter=True)
    else:
        # Exact markup of the pagination spacer row used as a split marker.
        tr = ('<tr><td colspan="3" align="center" class="pagenums">'+
              '<p class="pagenums">\r\n\t\t\t\t '+
              '<img src="images/clearpixel.gif" width="300" '+
              'height="8" alt=""></p></td>\r\n\t\t\t </tr>')
        tr_end = '<tr>' + hxs.select('//td[@class="prodseparator"]').\
            extract()[0].decode('utf') + '</tr>'
        html = hxs.extract().replace(tr,'<table class="item">').\
            replace(tr_end,'</table><table class="item">')
        products_hxs = HtmlXPathSelector(text=html)
        products = products_hxs.select('//table[@class="item"]')
        for product in products:
            name = product.select('tr/td/strong/div[@class="prodname"]/a/text()').extract()
            if name:
                name = name[0]
            url = product.select('tr/td/strong/div[@class="prodname"]/a/@href').extract()
            if url:
                url = url[0]
            price_options = product.select('tr/td/form/script').extract()
            if price_options:
                # Option prices live inside an inline <script> tag.
                price_values = self._get_prices(price_options[0])
                for price, desc in price_values:
                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('name', ' '.join((name,desc)))
                    loader.add_value('url', urljoin_rfc(get_base_url(response), url))
                    loader.add_value('price', price)
                    yield loader.load_item()
            else:
                price = product.select('tr/td/div[@class="prodprice"]/span/text()').extract()
                if price:
                    price = price[0]
                else:
                    price = 0.0
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('name', name)
                loader.add_value('url', urljoin_rfc(get_base_url(response), url))
                loader.add_value('price', price)
                yield loader.load_item()
        # NOTE: `next` shadows the builtin; harmless in this local scope.
        next = hxs.select('//a[@class="ectlink" and @ rel="next"]/@href').extract()
        if next:
            url = urljoin_rfc(get_base_url(response), next[0])
            yield Request(url, callback=self.parse_products)
def parse_product_list(self, response):
    """Extract the category ``path`` from inline JS and request the full
    product list via the site's JSON API; also follow product links."""
    hxs = HtmlXPathSelector(response)
    path = ''
    pattern = r"'([A-Za-z0-9_\./\\-]*)'"
    for line in hxs.extract().split('\n'):
        # Collapse whitespace so "path = " matches regardless of spacing.
        if 'path = ' in ' '.join(line.split()):
            text = ' '.join(line.split())
            # NOTE(review): raises AttributeError if the quoted value is
            # missing from the matched line.
            path = re.search(pattern, text).group().replace("'", '')
    if path:
        category_id = hxs.select('//select[@name="sort_by"]/@id').extract()[0].replace('sort_by_','')
        # API endpoint returning up to 1000 products for the category.
        product_list_url = ('http://www.mattressnextday.co.uk/index.php?route=api/' +
                            'category/getProducts&sort_by=price_low_to_high&'+
                            'category_id='+category_id+'&price_range=all&'+
                            'layout=grid&path='+path+'&per_page=1000&page=1')
        yield Request(product_list_url, callback=self.parse_product_list, meta=response.meta)
    for url in hxs.select('//div[@class="name"]/a/@href').extract():
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url, callback=self.parse_product, meta=response.meta)
def parse(self, response):
    """Parse a Burton product page, writing the item to the XML feed.

    Redirected responses mean the product is gone and are recorded as
    NOT_AVAILABLE.  Per-URL status is tracked in ``self.products``.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BurtonItem()
    page = hxs.extract()
    # Redirected requests keep their original URL in redirect_urls[0].
    if 'redirect_urls' in response.request.meta:
        cur_url = response.request.meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)
    try:
        if 'redirect_urls' in response.request.meta:
            # Redirect => product page gone: mark unavailable.
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'] = [self.products['names'][index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            item['product_id'], item['name'] = self.get_basic_info(hxs)
            item['description'], item['features'] = self.get_description(
                hxs)
            item['variants'], thumb_urls, color_names = self.get_variants(
                page)
            item['all_sizes'] = self.get_all_sizes(page)
            item['color_json'], image_urls = self.get_colors(
                page, color_names)
            item['price'], item['old_price'] = self.get_prices(hxs)
            item['in_stock'] = ['IN_STOCK']
            item['product_link'] = [basic.cdata(response.url)]
            self.xml.create_xml(item)
            item['image_urls'] = image_urls + thumb_urls
            self.products["status"][index] = "ran"
    except Exception:
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit.  Any scraping error marks the product as errored.
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item
def parse_products(self, response):
    """Yield products from the listing table, or recurse into category and
    sub-category links when no products are present.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//td[@class="td"]/div[@style="width:750px;'
                          ' padding: 10px 0px 10px 20px; "]/'
                          'table [@width="80%" and @cellpadding="4" and'
                          ' @border="0" and @align="center"]')
    if products:
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            # Two alternative layouts carry the product name.
            loader.add_xpath('name', 'tr/td/table/tr/td/strong/a/text()')
            loader.add_xpath('name', 'tr/td/div/strong/a/text()')
            url = product.select('tr/td/table/tr/td/strong/a/@href').extract()
            if url:
                url = urljoin_rfc(get_base_url(response), url[0])
            else:
                # Fallback layout for the product link.
                url = product.select('tr/td/div/strong/a/@href').extract()
                if url:
                    url = urljoin_rfc(get_base_url(response), url[0])
            loader.add_value('url', url)
            loader.add_xpath('price', 'tr/td/div[@class="HeadingText"]/text()')
            yield loader.load_item()
    else:
        try:
            categories = hxs.select('//td[@class="td"]/div[@style="width:750px;'
                                    ' padding: 10px 0px 10px 20px; "]/'
                                    'table[@cellpadding="5"]')
            if categories:
                for category in categories:
                    url = urljoin_rfc(get_base_url(response),
                                      category.select('tr/td/a[@class="HeadingText"]/@href').extract()[0])
                    yield Request(url, dont_filter=True, callback=self.parse_products)
        except IndexError:
            # Category row without a link: skip silently.
            pass
        # Sub-category links lack a container; patch the markup to add one.
        html = hxs.extract().replace('Sub Categories', '<div id="sub_categories">').replace('<p> </p>', '</div>')
        new_hxs = HtmlXPathSelector(text=html)
        sub_categories = new_hxs.select('//*[@id="sub_categories"]/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, dont_filter=True, callback=self.parse_products)
def parse_course_list(self, response, courselist):
    """Split the page HTML into per-course fragments and build course items.

    Course headers are <tr> rows containing a blue font tag; the raw HTML
    between consecutive headers is that course's detail fragment.
    """
    retval = []
    hxs = HtmlXPathSelector(response)
    courses = hxs.select('.//tr[descendant::font[@color="#0000FF"]]')
    length = len(courses)
    if length == 0:
        # no course to process
        return []
    # Everything after the first course header row.
    data = hxs.extract().split(courses[0].extract())[1]
    if length == 1:
        course_details = [data]
    else:
        course_details = []
        for course in courses[1:]:
            # Peel off the fragment belonging to the previous course.
            s = data.split(course.extract())
            course_details.append(s[0])
            data = s[1]
        course_details.append(s[1])  # tail fragment = last course's detail
    # sanity check
    assert(length == len(course_details))
    flags = re.UNICODE | re.MULTILINE #| re.DOTALL
    for course, course_detail in zip(courses, course_details):
        code_title_au_dept = course.select('.//font/text()').extract()
        # Colour-coded annotations — meanings presumed from variable names
        # (pass-fail / mutually-exclusive / unavailable / prereq); confirm.
        passfail = filter(None, re.findall(u'<font.*color="RED">([^<]*)',
                                           course_detail, flags))
        mutex = filter(None, re.findall(u'<font.*color="BROWN">([^<]*)',
                                        course_detail, flags))
        unavail = filter(None, re.findall(u'<font.*color="GREEN">([^<]*)',
                                          course_detail, flags))
        prereq = filter(None, re.findall(u'<font.*color="#FF00FF">([^<]*)',
                                         course_detail, flags))
        desc = re.search(u'<font size="2">([^<]*)', course_detail).groups()[0]
        courseitem = self._fill_in(courselist, code_title_au_dept, passfail,
                                   mutex, unavail, prereq, desc)
        if courseitem:
            retval.append(courseitem)
    return retval
def parse(self, response):
    """Parse a Burton product page (redirects are recorded as unavailable)
    and write the item to the XML feed; status is tracked per URL.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BurtonItem()
    page = hxs.extract()
    # A redirected request keeps its original URL in redirect_urls[0].
    if 'redirect_urls' in response.request.meta:
        cur_url = response.request.meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)
    try:
        if 'redirect_urls' in response.request.meta:
            # Redirect => product gone: record as NOT_AVAILABLE.
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'] = [self.products['names'][index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            item['product_id'], item['name'] = self.get_basic_info(hxs)
            item['description'], item['features'] = self.get_description(hxs)
            item['variants'], thumb_urls, color_names = self.get_variants(page)
            item['all_sizes'] = self.get_all_sizes(page)
            item['color_json'], image_urls = self.get_colors(page, color_names)
            item['price'], item['old_price'] = self.get_prices(hxs)
            item['in_stock'] = ['IN_STOCK']
            item['product_link'] = [basic.cdata(response.url)]
            self.xml.create_xml(item)
            item['image_urls'] = image_urls + thumb_urls
            self.products["status"][index] = "ran"
    except Exception:
        # FIX: bare `except:` replaced — it also swallowed KeyboardInterrupt
        # and SystemExit.  Errors mark this product's status as "error".
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item
def parse_product(self, response):
    """Parse a product page into one Product per size (or a single Product).

    Pages whose price cell says 'Cart' carry multiple add-to-cart products
    embedded in JS and are re-requested for parse_add_products.  Otherwise
    the identifier combines b_product_id with an optional ?v= option id,
    and per-size data is parsed from inline `products[...] = new Array(...)`
    JS lines.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    cat_name = response.meta["cat_name"]
    # Fill up the Product model fields
    #identifier =
    url = response.url
    name = hxs.select("//div[contains(@class, 'details')]/h1/text()"
                      ).extract()[0].replace("-", " ").strip()
    price = hxs.select(
        "//div[@class='price-info']/strong[@id='price']/text()").extract(
        )[0].strip()
    #sku =
    #metadata =
    category = cat_name
    # Prefer the zoom link's target, fall back to the inline image, else "".
    image_url = hxs.select("//div[@class='main-image']/a/@href").extract()
    if not image_url:
        image_url = hxs.select(
            "//div[@class='main-image']/img/@src").extract()
    if not image_url:
        image_url = ""
    brand = hxs.select(
        "//div[contains(@class, 'details')]/h1/strong/text()").extract()
    shipping_cost = hxs.select(
        "//dl[@class='blue']/dd/text()").extract()[0].strip()
    self.log(price)
    if 'Cart' in price:
        # Multi-product page: collect quoted fields from each
        # "Add to Cart" JS line (fields 1 and 2 are skipped).
        products = []
        for line in hxs.extract().split('\n'):
            if "Add to Cart" in line:
                product = re.findall('"([A-Za-z0-9 _\./\\-]*)"', line)
                if product:
                    products.append(product[:1] + product[3:])
        if products:
            self.log("products: " + str(products))
            yield Request(url, dont_filter=True,
                          callback=self.parse_add_products,
                          meta={
                              'products': products,
                              'url': url,
                              'brand': brand,
                              'category': category
                          })
    else:
        b_product_id = hxs.select(
            '//input[@id="b_product_id"]/@value').extract()
        # Option variant id comes from the ?v= query parameter, if present.
        o = urlparse(url)
        params = parse_qs(o.query)
        cur_option_id = ""
        if "v" in params:
            self.log("option v found")
            cur_option_name = params["v"]
            if cur_option_name:
                cur_option_id = cur_option_name[0].strip().lower()
        product_id = hxs.select(
            '//input[@id="product_id"]/@value').extract()
        if not b_product_id:
            self.log("ERROR b_product id not found")
        else:
            res_product_id = (b_product_id[0] + " " + cur_option_id).strip()
            #l.add_value('identifier', res_product_id)
            size_option = hxs.select(
                '//fieldset/div/div[label/text()="\r\n                            \r\n                            Size:\r\n                        "]/select'
            )
            if size_option:
                # Per-size rows come from JS: products[..] = new Array(...);
                # each Array literal is parsed with ast.literal_eval.
                sizes = []
                for line in response.body.split('\n'):
                    if 'products[' in line and 'new Array' in line:
                        sizes.append(
                            ast.literal_eval(
                                line.split('Array')[-1].split(';')[0]))
                for size in sizes:
                    # size fields by position: [0]=size id, [1]=price,
                    # [3]/[4]=size labels — presumed from usage; confirm.
                    l = ProductLoader(response=response, item=Product())
                    #l.add_value('identifier', identifier)
                    l.add_value('url', url)
                    l.add_value('name', name + ' ' + size[3] + ' ' + size[4])
                    l.add_value('identifier', res_product_id + '-' + size[0])
                    l.add_value('price', size[1])
                    l.add_value('category', category)
                    l.add_value('image_url', image_url)
                    l.add_value('brand', brand)
                    l.add_value('shipping_cost', shipping_cost)
                    # Zero/empty price is treated as out of stock.
                    if size[1]:
                        l.add_value('stock', '1')
                    else:
                        l.add_value('stock', '0')
                    yield l.load_item()
            else:
                # Single-variant product.
                l = ProductLoader(response=response, item=Product())
                #l.add_value('identifier', identifier)
                l.add_value('url', url)
                l.add_value('identifier', res_product_id)
                l.add_value('name', name)
                l.add_value('price', price)
                #l.add_value('sku', sku)
                #l.add_value('metadata', metadata)
                l.add_value('category', category)
                l.add_value('image_url', image_url)
                l.add_value('brand', brand)
                l.add_value('shipping_cost', shipping_cost)
                if price:
                    l.add_value('stock', '1')
                else:
                    l.add_value('stock', '0')
                yield l.load_item()
def parse_products(self, response):
    """Parse a category page.

    When product boxes are present, yield one product-page request per
    box; otherwise crawl category and sub-category links back into this
    callback.

    Fixes over the previous revision:
    - products with no extractable URL are skipped instead of building a
      Request from an empty list;
    - the first ``sub_categories`` selection (CategoryContainer links)
      was dead code -- its result was immediately overwritten -- and has
      been removed;
    - the trailing ProductDetails link loop duplicated the earlier
      identical request loop (same URLs, same callback, no dont_filter,
      so the dupe filter dropped them) and has been removed, along with
      a large block of commented-out code.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="ProductDetails"]/div[@id="ProductDetails"]')
    if products:
        for product in products:
            url = product.select(
                'div/div[@id="ProductName"]/h2/a/@href').extract()
            if not url:
                # Older layout keeps the product link inside a table row.
                url = product.select('tr/td/div/strong/a/@href').extract()
            if not url:
                # No link found in either layout; skip this box.
                continue
            url = urljoin_rfc(get_base_url(response), url[0])
            yield Request(url, callback=self.parse_product)
    else:
        # Top-level category boxes.
        try:
            categories = hxs.select(
                '//td[@class="td"]/div[@style="width:750px;'
                ' padding: 10px 0px 10px 20px; "]/'
                'table[@cellpadding="5"]')
            for category in categories:
                url = urljoin_rfc(
                    get_base_url(response),
                    category.select(
                        'tr/td/a[@class="HeadingText"]/@href').extract(
                        )[0])
                yield Request(url,
                              dont_filter=True,
                              callback=self.parse_products)
        except IndexError:
            # A category box without a HeadingText link; ignore it.
            pass
        # Sub-category links living directly under ProductDetails.
        sub_categories = hxs.select(
            '//div[@id="ProductDetails"]/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, callback=self.parse_products)
        # The "Sub Categories" section has no usable container markup, so
        # wrap it in a synthetic <div> and re-parse to isolate its links.
        html = hxs.extract().replace('Sub Categories',
                                     '<div id="sub_categories">').replace(
                                         '<p> </p>', '</div>')
        new_hxs = HtmlXPathSelector(text=html)
        sub_categories = new_hxs.select(
            '//*[@id="sub_categories"]/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url,
                          dont_filter=True,
                          callback=self.parse_products)
def search(city, date):
    """Scrape Google Movies showtimes for *city* on *date*.

    Pages through /movies?near=<city>&start=<n> ten results at a time
    until the page reports "No showtimes were found", collecting
    (theater, movie, times) tuples, and returns them JSON-encoded.

    NOTE(review): the whole body is wrapped in a bare ``except`` that
    returns None on ANY failure -- including the IndexError the inner
    loop can raise via ``m[counter]`` once ``counter`` reaches
    ``len(m)`` -- so errors are silently swallowed.
    """
    try:
        movielist = []
        start = 0
        conn = httplib.HTTPConnection("www.google.com")
        conn.request("GET", "/movies?near="+city+"&start="+str(start)+"&date="+str(date))
        r1 = conn.getresponse()
        # Wrap the raw httplib response in an HtmlResponse so the scrapy
        # selector can run XPath over it.
        hxs = HtmlXPathSelector(HtmlResponse("www.google.com/movies?near="+city+"&date="+str(date)+"&start="+str(start), r1.status, r1.getheaders(), r1.read(), request=conn))
        st = hxs.extract()
        while("No showtimes were found" not in st):
            # Theater names, movie names and showtime strings, in page order.
            theater = hxs.select('//h2[contains(@class,"name")]/a[contains(@href,"movies")]/text()').extract()
            m = hxs.select('//div[contains(@class,"name")]/a[contains(@href,"movies")]/text()').extract()
            x = hxs.select('//div[contains(@class,"times")]/text()').extract()
            counter = 0
            # Walk consecutive theater pairs: the raw-HTML offsets of the
            # two names bracket the movies belonging to the first one.
            for i, j in zip(theater, theater[1:]):
                star = st.find(i)
                end = st.find(j)
                test = st.find(m[counter], star, end)
                while(test != -1):
                    temp = x[counter].split()
                    # Strip the "am" suffix from the first morning time;
                    # default the am index to 0 when there is none.
                    try:
                        am = next(temp1 for temp1, temp2 in enumerate(temp) if temp2.endswith("am"))
                        temp[am] = temp[am][:-2]
                    except:
                        am = 0
                    # Strip the "pm" suffix, then convert the pm time and
                    # every time between the am and pm markers to 24-hour.
                    try:
                        pm = next(temp1 for temp1, temp2 in enumerate(temp) if temp2.endswith("pm"))
                        temp[pm] = temp[pm][:-2]
                        if int(temp[pm].split(':')[0]) < 12:
                            temp[pm] = str((int(temp[pm].split(':')[0])+12)%24)+temp[pm][temp[pm].find(':'):]
                        for temp3 in range(am+1, pm):
                            if int(temp[temp3].split(':')[0]) < 12:
                                temp[temp3] = str((int(temp[temp3].split(':')[0])+12)%24)+temp[temp3][temp[temp3].find(':'):]
                    except:
                        pass
                    movielist.append((i, m[counter], temp))
                    # Advance past this movie name and look for the next
                    # one belonging to the same theater.
                    star = test+len(m[counter])
                    counter += 1
                    test = st.find(m[counter], star, end)
            # Any movies left over belong to the last theater on the page
            # (it has no following theater to bracket it).
            while(counter != len(m)):
                temp = x[counter].split()
                try:
                    am = next(temp1 for temp1, temp2 in enumerate(temp) if temp2.endswith("am"))
                    temp[am] = temp[am][:-2]
                except:
                    am = 0
                try:
                    pm = next(temp1 for temp1, temp2 in enumerate(temp) if temp2.endswith("pm"))
                    temp[pm] = temp[pm][:-2]
                    for temp3 in range(am+1, pm+1):
                        if int(temp[temp3].split(':')[0]) < 12:
                            temp[temp3] = str((int(temp[temp3].split(':')[0])+12)%24)+temp[temp3][temp[temp3].find(':'):]
                except:
                    pass
                movielist.append((theater[-1], m[counter], temp))
                counter += 1
            #pdb.set_trace()
            # Fetch the next page of ten results.
            conn.close()
            start += 10
            conn = httplib.HTTPConnection("www.google.com")
            conn.request("GET", "/movies?near="+city+"&start="+str(start)+"&date="+str(date))
            r1 = conn.getresponse()
            hxs = HtmlXPathSelector(HtmlResponse("www.google.com/movies?near="+city+"&date="+str(date)+"&start="+str(start), r1.status, r1.getheaders(), r1.read(), request=conn))
            st = hxs.extract()
        #return json.JSONEncoder().encode(movielist)
        return json.dumps(movielist, sort_keys=True, indent=4)
    except:
        pass
def parse_product(self, response):
    """Parse a product page.

    'Essentials' products with an options dropdown yield one item per
    option (label appended to the name, option price added as a delta to
    the base price); everything else yields a single item. Each item
    carries a SimplyPleasureMeta whose cost_price is looked up by
    identifier in ``self.cost_prices``.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select(
        '//div[@class="product-name"]/h1/text()').extract()[0]
    url = response.url
    # Price: main price box first (pound-sign prefixed), falling back to
    # any span.price on the page.
    price = hxs.select(
        '//div[@class="product-shop"]//div[@class="wrapper-price-share"]'
        '//div[@class="price-box"]//span[contains(@id, "product-price")]'
        '/text()').re('\xa3(.*)')
    if price:
        price = price[0]
    else:
        price = hxs.select('//span[@class="price"]/text()').re(
            '\xa3(.*)')[0]
    sku = hxs.select('//meta[@itemprop="productID"]/@content').re(
        'sku:(.*)')
    category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()[-1]
    # Brand is recovered from the brand-logo image filename, e.g.
    # "some_brand-logo.png" -> "Some Brand".
    brand = hxs.select('//div[@class="product-brand"]/img/@src').extract()
    if brand:
        brand = brand[0].split('/')[-1].lower()
        brand = brand.replace('_', ' ').replace('-logo', '#').replace(
            ' logo', '#').split('#')[0].title()
    else:
        brand = ''
    loader.add_value('brand', brand)
    loader.add_value('url', url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)
    loader.add_value('category', category)
    # Absence of the in-stock icon marks the product out of stock.
    stock = hxs.select('//div[@class="pdp-info-icon in-stock"]').extract()
    if not stock:
        loader.add_value('stock', 0)
    image_url = hxs.select(
        '//meta[@property="og:image"]/@content').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])
    options = hxs.select('//div[@class="input-box"]/select')
    # Second breadcrumb entry is treated as the main category.
    main_category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()
    main_category = main_category[1] if len(main_category) > 1 else ''
    if options and main_category == 'Essentials':
        log.msg('CRAWL PRODUCT OPTIONS')
        # Option data lives in the inline Product.Config({...}); JS call.
        options = json.loads(hxs.extract().partition('Product.Config(')
                             [-1].partition(');')[0])
        #options_number_key = '157'
        #if options_number_key not in options['attributes']:
        #options_number_key = '187'
        # The attribute key is read from the select element's id
        # ("attribute<NNN>") instead of being hard-coded.
        options_number_key = response.xpath('//select/@id').re(
            'attribute(\d+)')[0]
        options = options['attributes'][options_number_key]['options']
        for option in options:
            product = loader.load_item()
            product['identifier'] = product['identifier'] + '-' + option[
                'label']
            product['name'] = product['name'] + ' ' + option['label']
            # Option price is a surcharge added to the base price.
            product['price'] = float(product['price']) + float(
                option['price'])
            option_loader = ProductLoader(item=product, response=response)
            item = option_loader.load_item()
            metadata = SimplyPleasureMeta()
            metadata['cost_price'] = self.cost_prices.get(
                item['identifier'])
            item['metadata'] = metadata
            yield item
    else:
        item = loader.load_item()
        metadata = SimplyPleasureMeta()
        metadata['cost_price'] = self.cost_prices.get(item['identifier'])
        item['metadata'] = metadata
        yield item
def _query(xpath, response, extract=True):
    """Run *xpath* against *response*.

    Returns the extracted string list by default, or the raw selector
    result when ``extract`` is False.
    """
    selected = HtmlXPathSelector(response).select(xpath)
    if extract:
        return selected.extract()
    return selected
def parse(self, response):
    """Parse a tyre listing page whose data is embedded in an inline
    "JsonObject = {...};" JS assignment.

    Builds 'Fitted' products (the 'Delivered' variant is deliberately not
    yielded) and emits only the cheapest product per descriptive key
    (brand/name/fitting/size/xl/run-flat/mark).
    """
    try:
        hxs = HtmlXPathSelector(response)
    except AttributeError:
        # Non-HTML or broken response: record the failing CSV row.
        msg = 'Error getting selector on page for row: %s' % response.meta[
            'row']
        self.log('[ERROR] %s' % msg)
        self.errors.append(msg)
        return
    # CSV row with the tyre spec this request was generated from.
    row = response.meta['row']
    # Locate and parse the inline "JsonObject = {...}; " line.
    json_data = None
    for line in hxs.extract().split('\n'):
        if "JsonObject = " in line:
            json_data = json.loads(
                line.replace('JsonObject = ', '').replace('; \r', ''))
    # NOTE(review): if no such line exists, json_data stays None and the
    # next statement raises TypeError.
    products = json_data['Rest'] + json_data['Deals']
    collected_products = []
    for product_info in products:
        # skip winter tyres
        if product_info['WinterTyre']:
            continue
        loader = ProductLoader(item=Product(), selector=product_info)
        loader.add_value('name', product_info['ModelName'])
        brand = product_info['Manufacturer']
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        identifier = product_info['PrimaryId']
        fitting_method = 'Delivered'
        url = '/catalogue' + product_info[
            'CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
        loader.add_value('url', urljoin(get_base_url(response), url))
        # Image: prefer the large image; the field may hold a whole <img>
        # tag, so pull the src attribute out of it.
        image_url = product_info.get('ModelImageLarge')
        if not image_url:
            image_url = product_info.get('ModelImage')
        if image_url:
            image_url = image_url.split('src="')[-1].split('"')[0]
            loader.add_value('image_url',
                            urljoin(get_base_url(response), image_url))
        loader.add_value('identifier',
                         str(identifier) + '-' + fitting_method)
        price = product_info['SellingPrice']
        loader.add_value('price', price)
        spec = product_info['SpecificationName']
        metadata = MicheldeverMeta()
        # metadata['mts_stock_code'] = row['MTS Stockcode']
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        # Speed rating is the last token of the specification string.
        metadata['speed_rating'] = spec.split()[-1]
        metadata['width'] = row['Width']
        metadata['fitting_method'] = fitting_method
        load_rating = product_info['LoadRatingName']
        metadata['load_rating'] = load_rating
        metadata['alternative_speed_rating'] = ''
        xl = product_info['Reinforced']
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = product_info['RunFlat']
        metadata['run_flat'] = 'Yes' if run_flat else 'No'
        # Manufacturer mark is the first token of the variant string,
        # mapped through find_man_mark.
        manufacturer_mark = product_info['Variant']
        if manufacturer_mark:
            manufacturer_mark = manufacturer_mark.split()[0].strip()
        metadata['manufacturer_mark'] = find_man_mark(
            manufacturer_mark) if manufacturer_mark else ''
        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'],
             metadata['load_rating'], metadata['speed_rating']))
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        # Normalize speed ratings via the shared helpers; the alternative
        # rating falls back to the old rating when it changed, else ''.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating
        # Do not collect "Delivered" tyres
        # yield product
        # Re-label the same product as its 'Fitted' variant and keep it.
        product['price'] = product_info['FullyFittedPrice']
        fitting_method = 'Fitted'
        product['identifier'] = str(identifier) + '-' + fitting_method
        product['metadata']['fitting_method'] = fitting_method
        collected_products.append(product)
    # Deduplicate: keep only the cheapest product for each key.
    min_price_products = {}
    for product in collected_products:
        key = "%s-%s-%s-%s-%s-%s-%s" % (
            product['brand'], product['name'],
            product['metadata']['fitting_method'],
            product['metadata']['full_tyre_size'],
            product['metadata']['xl'], product['metadata']['run_flat'],
            product['metadata']['manufacturer_mark'])
        if key in min_price_products:
            if product['price'] < min_price_products[key]['price']:
                min_price_products[key] = product
        else:
            min_price_products[key] = product
    for product in min_price_products.values():
        yield product
def parse(self, response): hxs = HtmlXPathSelector(response) # titles = hxs.select("//ul[@class='title2 fs_14']") # testElem = hxs.select("/html/body/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[3]/div[1]/div[1]/ul[1]") # print response.url webStr = hxs.extract() # print webStr newsLinks = getLinks_before20110406(webStr) items = [] for newsLink in newsLinks: # print li.extract() title = newsLink[0] link = newsLink[1] # newsBody = getNewsBody(link) newsBody = getNewsBody_20100506_20101231(link) if len(newsBody) < 100: print '------------------------' print title print link print newsBody print 'problem!!!!!!!!!!!!!!!!!!!!' item = CctvScraperItem() # item["date"] = '20130716' # item["date"] = response.url[-14:-6] item["date"] = getDTFromUrl(response.url) item ["title"] = title.encode('utf8') item ["link"] = link.encode('utf8') item["newsBody"] = newsBody items.append(item) self.csvWriter.writerow([item['date'], item['title'], item["newsBody"]]) # print title # print link # print newsBody # def parse(self, response): # hxs = HtmlXPathSelector(response) # items = [] # sections1 = hxs.select("/html") # for li in sections1.select('//li'): # title = ''.join(li.select('a/text()').extract()) # if u"[视频]" not in title: # continue # link = li.select('a/@href').extract()[0] # newsBody = getNewsBody(link) # item = CctvScraperItem() # # item["date"] = '20130716' # item["date"] = response.url[-14:-6] # item ["title"] = title.encode('utf8') # item ["link"] = link.encode('utf8') # item["newsBody"] = newsBody # items.append(item) # self.csvWriter.writerow([item['date'], item['title'], item["newsBody"]]) # print title # print link # sections1 = hxs.select("/html/body/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[3]/div[1]/div[1]/ul[1]") # sections2 = hxs.select("/html/body/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[3]/div[1]/div[1]/ul[2]") # sections3 = 
hxs.select("/html/body/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[3]/div[1]/div[1]/ul[3]") # # items = [] # for sections in [sections1, sections2, sections3]: # for li in sections.select('.//li'): # # print li.extract() # title = ''.join(li.select('a/text()').extract()) # link = li.select('a/@href').extract()[0] # item = CctvScraperItem() # item ["title"] = title # item ["link"] = link # items.append(item) # print title # print link # return items
def parse_product(self, response):
    """Parse a Magento product page.

    Products whose breadcrumb contains "ESSENTIAL" and that expose a
    JSON '"options":[...]' block yield one item per option (label
    appended to the name, option price added to the base price);
    everything else yields a single item keyed by the hidden product id.
    Any missing required field (IndexError) silently aborts the page.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    try:
        name = hxs.select(
            '//div[@class="product-name"]/h1/text()').extract()[0]
        url = response.url
        # Regular price first, then the special-price block.
        price = hxs.select(
            '//div[@class="product-view nested-container"]'
            '//div[@class="price-box"]/span/span[@class="price"]/text()'
        ).extract()
        if not price:
            price = hxs.select(
                '//div[@class="product-view nested-container"]'
                '//div[@class="price-box"]/p[@class="special-price"]'
                '/span[@class="price"]/text()').extract()
        sku = hxs.select('//tr[th/text()="SKU"]/td/text()').extract()[0]
        brand = hxs.select(
            '//tr[th/text()="Manufacturer"]/td/text()').extract()[0]
        if price:
            price = extract_price(price[0])
        else:
            price = 0
        image_url = hxs.select('//a[@id="zoom1"]/img/@src').extract()
        if image_url:
            image_url = image_url[0]
        else:
            image_url = ''
        # Last breadcrumb entry is used as the category.
        breadcrumb = hxs.select(
            '//div[@class="grid-full breadcrumbs"]/ul/li/a/text()'
        ).extract()
        category = breadcrumb[-1]
        if "ESSENTIAL" in ''.join(breadcrumb).upper():
            # Option data is embedded in inline JS as '"options":[...]'.
            opts = []
            for line in hxs.extract().split('\n'):
                if '"options":[' in line:
                    opts = json.loads(
                        line.split('"options":')[-1].split('}}')[0])
            if opts:
                for opt in opts:
                    log.msg('CRAWL PRODUCT OPTIONS')
                    option_name = name + " - " + opt.get('label')
                    # Option price is a surcharge on the base price.
                    option_price = price + extract_price(opt.get('price'))
                    loader = ProductLoader(item=Product(), selector=hxs)
                    loader.add_value('url', url)
                    loader.add_value('name', option_name)
                    loader.add_value('price', option_price)
                    loader.add_value('sku', sku)
                    loader.add_value('brand', brand)
                    loader.add_value('image_url', image_url)
                    loader.add_value('identifier',
                                     sku + '-' + opt.get('label'))
                    loader.add_value('category', category)
                    # Missing availability marker means out of stock.
                    stock = hxs.select(
                        '//p[@class="availability in-stock"]').extract()
                    if not stock:
                        loader.add_value('stock', 0)
                    yield loader.load_item()
            else:
                # ESSENTIAL product without an options block: single item.
                loader = ProductLoader(item=Product(), selector=hxs)
                loader.add_value('url', url)
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                loader.add_value('brand', brand)
                loader.add_value('image_url', image_url)
                identifier = hxs.select(
                    '//input[@name="product"]/@value').extract()[0]
                loader.add_value('identifier', identifier)
                loader.add_value('category', category)
                stock = hxs.select(
                    '//p[@class="availability in-stock"]').extract()
                if not stock:
                    loader.add_value('stock', 0)
                yield loader.load_item()
        else:
            # Non-ESSENTIAL product: single item.
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            loader.add_value('brand', brand)
            loader.add_value('image_url', image_url)
            identifier = hxs.select(
                '//input[@name="product"]/@value').extract()[0]
            loader.add_value('identifier', identifier)
            loader.add_value('category', category)
            stock = hxs.select(
                '//p[@class="availability in-stock"]').extract()
            if not stock:
                loader.add_value('stock', 0)
            yield loader.load_item()
    except IndexError:
        # Pages missing any required field are skipped wholesale.
        return
def parse_product(self, response):
    """Parse a product page, preferring field values passed through
    ``response.meta`` (set by the listing page) and falling back to
    on-page selectors. Wraps the loaded item in a reviews-page request.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    url = response.url
    # Price hides in an inline JS line tagged MAIN:No^Refrnce; fall back
    # to the itemprop="price" span (commas stripped).
    price = ''
    for line in hxs.extract().split('\n'):
        if "MAIN:No^Refrnce" in line:
            price = line.split('");')[0].split(', "')[-1]
    if not price:
        try:
            price = hxs.select(
                '//span[@itemprop="price"]/text()').extract()[0].replace(
                    ',', '')
        except:
            pass
    identifier = meta.get('identifier')
    if not identifier:
        identifier = hxs.select(
            '//form[@name="addItemToCart"]//input[@name="sku"]/@value'
        ).extract()[0]
    image_url = meta.get('image_url')
    if not image_url:
        image_url = hxs.select('//img[@id="mainImage"]/@src').extract()
    brand = meta.get('brand')
    if not brand:
        brand = hxs.select(
            '//div[@id="tMain"]//div[@class="mfrLogo"]//img[1]/@alt'
        ).extract()
    category = meta.get('category')
    if not category:
        try:
            category = hxs.select('//ul[@id="breadcrumbs"]/li/a/text()'
                                  ).extract()[-1].strip()
        except:
            pass
    sku = meta.get('sku')
    if not sku:
        # SKU is taken from the productID meta tag's "mpn:..." content;
        # when found, the Bushnell products file may override the
        # category for that SKU.
        sku = hxs.select(
            '//meta[@itemprop="productID" and contains(@content, "mpn:")]/@content'
        ).re(r'mpn:(\w+)')
        if sku:
            bushnell_product = self.bushnell_products.get(
                sku[0].upper().strip(), None)
            if bushnell_product:
                category = bushnell_product['Class']
                log.msg(
                    'Extracts category "%s" from bushnell file, URL: %s' %
                    (category, response.url))
    name = meta.get('name')
    if not name:
        name = ''.join(
            hxs.select(
                '//h1[@itemprop="name"]//text()').extract()).strip()
    # Each product URL is emitted at most once per crawl run.
    if url not in self.urls_list:
        self.urls_list.append(url)
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('url', url)
        loader.add_value('sku', sku)
        loader.add_value('name', name)
        loader.add_value('price', price)
        product = loader.load_item()
        yield self._get_reviews_url(product)
def parse(self, response):
    """Yield the page's raw HTML, then re-request the same URL.

    NOTE(review): yielding a bare string from a spider callback is not a
    valid scrapy output (Request/item expected), and re-requesting
    response.url with dont_filter=True into this same callback loops
    indefinitely -- this looks like a polling/debug stub; confirm intent.
    """
    hxs = HtmlXPathSelector(response)
    yield hxs.extract()
    yield Request(response.url, callback=self.parse, dont_filter=True)