def parse(self, response):
    """Crawl a fixed list of vendor listing pages, follow listing
    pagination, and schedule each product page for ``parse_product``.

    The original category/subcategory discovery was disabled in favour of
    this hard-coded vendor list; the dead code (a no-op triple-quoted
    string) has been removed.
    """
    hxs = HtmlXPathSelector(response)
    # Hard-coded vendor listing pages to seed the crawl.
    categories = [
        'http://www.webstaurantstore.com/vendor/CAR150/cardinal-international.html',
        'http://www.webstaurantstore.com/vendor/LIB500/libbey.html',
        'http://www.webstaurantstore.com/vendor/VOL300/vollrath.html',
        'http://www.webstaurantstore.com/vendor/RUS600/dexter-russell.html',
        'http://www.webstaurantstore.com/vendor/GET600/get-enterprises.html',
        'http://www.webstaurantstore.com/vendor/BEV500/beverage-air.html',
    ]
    for cat in categories:
        yield Request(cat)
    # Follow listing pagination back into this same callback.
    next_page = hxs.select('//a[@title="Next page"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]))
    # Individual product detail pages.
    products = hxs.select('//td[@class="search_product_title"]/a/@href').extract()
    for product in products:
        yield Request(urljoin_rfc(get_base_url(response), product),
                      callback=self.parse_product)
def parse_products(self, response):
    """Scrape product tiles, then either paginate or descend into
    subcategories.

    Fixes: ``response.meta['do_pagination']`` raised KeyError when this
    callback was entered without the flag (e.g. from a start URL); it now
    defaults to falsy via ``meta.get``. An unused ``ProductLoader``
    (all of its usage was commented out) has been removed.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    for product in hxs.select('//div[@class="prod"]'):
        name = product.select('div/form/fieldset/div/h5/a/span/text()').extract()[0].strip()
        url = product.select('div/form/fieldset/div/h5/a/@href').extract()
        if url:
            url = urljoin_rfc(base_url, url[0])
        price = product.select('div/form/fieldset/div/span[@class="productPrice priceExVAT"]/text()').extract()[0].strip()
        # Hand name/price to the product page so parse_product need not rescrape them.
        yield Request(url, callback=self.parse_product,
                      meta={'name': name, 'price': price})
    pages = hxs.select('//span[@class="pagingButton"]/a/@href').extract()
    if pages:
        # Only the first page of a listing fans out to the other pages;
        # the follow-up requests carry do_pagination=False to stop recursion.
        if response.meta.get('do_pagination'):
            for page in pages:
                yield Request(urljoin_rfc(base_url, page),
                              callback=self.parse_products,
                              meta={'do_pagination': False})
    else:
        # No pagination widget: treat the page as a category index.
        sub_categories = hxs.select('//div[@class="subcat"]/div/a/@href').extract()
        for sub_category in sub_categories:
            yield Request(urljoin_rfc(base_url, sub_category),
                          callback=self.parse_products,
                          meta={'do_pagination': True})
def parse_item(self, response):
    """Extract job postings from the listing table into TencentItem objects.

    Table rows alternate between ``tr.even`` and ``tr.odd`` CSS classes
    with identical cell layouts, so one loop over both classes replaces
    the previously duplicated scraping code (even rows first, preserving
    the original output order).
    """
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    for row_class in ('even', 'odd'):
        for site in sel.css('table.tablelist tr.%s' % row_class):
            item = TencentItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
            items.append(item)
    info('parsed ' + str(response))
    return items
def parse_products(self, response):
    """Crawl brand links (once per spider run), sub-subcategory links, and
    load any products found on the current page.

    Bug fixed: ``base_url`` was referenced without ever being defined, so
    joining any relative link raised NameError; it is now derived from the
    response via ``get_base_url``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    if not self.brand_crawled:
        # Brand sidebar is only fanned out once for the whole crawl.
        brands = hxs.select('//*[@class="infoBox-categories"]//a/@href').extract()
        for url in brands:
            if not re.search('^http', url):
                url = urljoin_rfc(base_url, url)
            yield Request(url, callback=self.parse_products)
        self.brand_crawled = True
    # Is it another subcategory page?
    sub_sub_categories = hxs.select('//div[@id="catView"]//a/@href').extract()
    for url in sub_sub_categories:
        if not re.search('^http', url):
            url = urljoin_rfc(base_url, url)
        yield Request(url, callback=self.parse_products)
    # Is it a products page?
    products = hxs.select('//div[@id="productView"]/ul/li[@class="product"]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/h2/a/text()')
        loader.add_xpath('price', './/h3/a/text()')
        loader.add_xpath('url', './/h2/a/@href')
        yield loader.load_item()
def parse(self, response):
    """Follow listing pagination (XPath first, BeautifulSoup fallback for
    markup the selector cannot handle), then scrape products in place."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    # Parsed a second time with BeautifulSoup: it tolerates broken HTML
    # and serves as the fallback pager lookup below.
    soup = BeautifulSoup(response.body, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # categories (disabled in an earlier revision)
    # categories = hxs.select(u'//td[@id="left"]//a/@href').extract()
    # try:
    #     categories = [a['href'] for a in soup.find('td', id='left').findAll('a')]
    # except AttributeError:
    #     categories = []
    # for url in categories:
    #     url = urljoin_rfc(get_base_url(response), url)
    #     yield Request(url)
    # pagination
    next_page = hxs.select(u'//div[@class="pager"]/a[contains(text(),"Next")]/@href').extract()
    if next_page:
        next_page = urljoin_rfc(get_base_url(response), next_page[0])
        # The site sometimes emits localhost links; rewrite to the public host.
        if '127.0.0.1' in next_page:
            next_page = next_page.replace('127.0.0.1', 'argonautliquor.com')
        yield Request(next_page, dont_filter=True)
    else:
        # Fallback: find the "Next" pager anchor with BeautifulSoup.
        next_page = soup.find(lambda tag: tag.name == 'a' and 'Next' in tag.text and tag.findParent('div', 'pager'))
        if next_page:
            next_page = urljoin_rfc(get_base_url(response), next_page['href'])
            if '127.0.0.1' in next_page:
                next_page = next_page.replace('127.0.0.1', 'argonautliquor.com')
            yield Request(next_page, dont_filter=True)
    # products on the current page
    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Follow subcategory and "Next Page" links, then scrape products from
    the current page via ``parse_products``."""
    hxs = HtmlXPathSelector(response)
    # Dead code from an earlier revision: category discovery from the
    # start page (kept as an inert string by the original author).
    '''
    if response.url == self.start_urls[0]:
        cats = hxs.select('//font[@size="2.5"]/../@href').extract()
        for cat in cats:
            url = urljoin_rfc(get_base_url(response), cat)
            yield Request(url)
    '''
    # Subcategory links: arrow-image menus plus category table cells.
    subcats = hxs.select('//img[contains(@src, "orange-arrow.gif")]/../font/a/@href').extract()
    subcats += hxs.select('//table[@class="categorytable"]//td[@class="categorymodelcell"]//a/@href').extract()
    for subcat in subcats:
        yield Request(urljoin_rfc(get_base_url(response), subcat))
    # Dead code: price-list page discovery (also inert).
    '''
    price_list = hxs.select('//a[contains(text(), "Price List")]/@href').extract()
    if not price_list:
        price_list = hxs.select('//a[contains(@href, "PriceList")]/@href').extract()
    if price_list:
        yield Request(urljoin_rfc(get_base_url(response), price_list[0]))
    '''
    # "Next Page" link (anchor wrapping a <b> element).
    next_page = hxs.select('//a/b[contains(text(), "Next Page")]/../@href').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]))
    # Products on the current page.
    for product in self.parse_products(hxs, response):
        yield product
def parse(self, response):
    """Discover category, pagination and product links on a listing page."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)
    base = get_base_url(response)
    # Top-navigation links followed by the sidebar category list.
    category_xpaths = (
        u'//a[@class="siteNavLink"]/@href',
        u'//ul[@id="categoryList"]/li/a/@href',
    )
    for xpath in category_xpaths:
        for href in sel.select(xpath).extract():
            yield Request(urljoin_rfc(base, href))
    # "Next page" arrow button.
    pagination = sel.select(u'//a[child::span[@class="blueArrowRightBtn"]]/@href').extract()
    if pagination:
        yield Request(urljoin_rfc(base, pagination[0]))
    # Product detail pages.
    for href in sel.select(u'//div[@class="show"]/ul/li//h1[@class="showName"]/a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Scrape product rows, then descend one sidebar level deeper per
    request, tracking tree depth in ``response.meta['level']``."""
    hxs = HtmlXPathSelector(response)
    for item in hxs.select(u'//tr[contains(@class,"product-item")]'):
        product_loader = ProductLoader(item=Product(), selector=item)
        product_loader.add_xpath('name', u'.//td[@class="productListingNewName"]/b/a/text()')
        price = item.select(u'.//span[@class="js_price_tax"]/text()').extract()[0]
        # European number format: drop thousands dots, comma -> decimal point.
        price = price.strip().replace('.', '').replace(',', '.')
        product_loader.add_value('price', price)
        url = item.select(u'.//td[@class="productListingNewName"]/b/a/@href').extract()[0]
        url = urljoin_rfc(get_base_url(response), url)
        product_loader.add_value('url', url)
        # If quantity field is not present on page, there are subproducts
        qty = item.select(u'.//input[@name="products_qty"]').extract()
        if qty:
            yield product_loader.load_item()
        else:
            yield Request(url, callback=self.parse_sub)
    # The sidebar nests one <ul>/<li> pair per category level, so the
    # subcategory XPath is rebuilt with `level` repetitions of "ul/li".
    level = response.meta.get('level', 1)
    sub_url = u'//div[@class="box-content"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
    subcategories = hxs.select(sub_url).extract()
    for subcategory in subcategories:
        url = urljoin_rfc(get_base_url(response), subcategory)
        yield Request(url, meta={'level': level+1})
    # Pagination stays at the current depth.
    next_url = hxs.select(u'//li[@class="page-next"]/a/@href').extract()
    if next_url:
        next_url = urljoin_rfc(get_base_url(response), next_url[0])
        yield Request(next_url, meta={'level': level})
def parse_product(self, response):
    """Scrape product grid cells and follow the image-based pagination link."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    # pages: the next-page control is an <a> wrapping an <img>
    next_page = hxs.select(u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href').extract()
    if next_page:
        next_page = urljoin_rfc(get_base_url(response), next_page[0])
        yield Request(next_page, callback=self.parse_product)
    products = hxs.select(u'//div[contains(@class,"itemGrid")]')
    for product in products:
        product_loader = ProductLoader(item=Product(), selector=product)
        url = product.select(u'.//a[@class="oesLink"]/@href').extract()[0]
        url = urljoin_rfc(get_base_url(response), url)
        product_loader.add_value('url', url)
        # Product name is split between the anchor's <span> and its own text.
        name = product.select(u'.//a[@class="oesLink"]/span/text()').extract()[0]
        name += ' ' + product.select(u'.//a[@class="oesLink"]/text()').extract()[0]
        product_loader.add_value('name', name)
        # Two possible price layouts are tried; which value wins depends on
        # the loader's output processor — NOTE(review): verify precedence.
        product_loader.add_xpath('price', u'.//span[@class="PlistOfferPrice"]/text()', re=u'\$(.*)')
        product_loader.add_xpath('price', u'.//div[@class="pricing"]/span/div/span/text()', re=u'\$(.*)')
        # Emit only products with both a name and a price.
        loaded = product_loader.get_output_value('name') and product_loader.get_output_value('price')
        if not loaded:
            continue
        yield product_loader.load_item()
def parse(self, response):
    """Scrape spotlight products from the shop home page.

    Two nearly identical branches handle two slightly different nesting
    depths of the promo markup (the second has one extra wrapper <div>).
    """
    BASE_URL = 'http://www.virginmobile.com/vm/'
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="webapp_shophome_3col_spotlight"]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        # Shallower markup variant.
        xpath = 'div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()'
        if product.select(xpath):
            loader.add_xpath('name', xpath)
            loader.add_xpath('price', 'div/div/div/div/div/div/p/span/text()')
            relative_url = product.select('div/div/div/div/div/p/a/@href')
            if relative_url:
                url = urljoin_rfc(BASE_URL, relative_url.extract()[0], response.encoding)
                loader.add_value('url', url)
        else:
            # Same fields, one wrapper <div> deeper.
            xpath = 'div/div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()'
            if product.select(xpath):
                loader.add_xpath('name', xpath)
                loader.add_xpath('price', 'div/div/div/div/div/div/div/p/span/text()')
                relative_url = product.select('div/div/div/div/div/div/p/a/@href')
                if relative_url:
                    url = urljoin_rfc(BASE_URL, relative_url.extract()[0], response.encoding)
                    loader.add_value('url', url)
        yield loader.load_item()
def parse(self, response):
    """Follow brand, category, pagination and product links from a listing."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)
    base = get_base_url(response)
    # Brand links, then category links — both re-enter the default callback.
    for xpath in ('//div[@id="trucks"]//a/@href',
                  '//div[@id="categories"]//a/@href'):
        for href in sel.select(xpath).extract():
            yield Request(urljoin_rfc(base, href))
    # "Next" pagination link.
    next_page = sel.select('//ul[contains(@class, "pagination")]/li/a[contains(text(), "Next")]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))
    # Product detail pages.
    for href in sel.select(u'//div[contains(@class,"products_content")]/ul/li/h4/a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Queue top-level categories (only from the main page), follow
    pagination, and schedule product detail pages."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    # categories — entered only from the main page to avoid re-queuing the
    # whole navigation from every listing page
    if response.url == self.URL_BASE:
        #categories = hxs.select('//div[@id="col-left"]//a/@href').extract()
        categories = hxs.select('//div[@id="pNav"]//li[starts-with(@id, "pn")]//a/@href').extract()
        for url in categories:
            url = urljoin_rfc(self.URL_BASE, url)
            yield Request(url)
    # subcategories (disabled)
    #subcategories = hxs.select('//ul/li[contains(text(), "Category")]//a/@href').extract()
    #for url in subcategories:
    #    url = urljoin_rfc(self.URL_BASE, url)
    #    yield Request(url)
    # next page
    next_page = hxs.select('//div[@class="pagination"]//a[contains(text(),"Next")]/@href').extract()
    if next_page:
        next_page = urljoin_rfc(self.URL_BASE, next_page[0])
        yield Request(next_page)
    # products
    #products = hxs.select('//div[contains(@class,"chunk")]')
    products = hxs.select('//h4/a[contains(@href, "product.cfm")]/..')
    for product in products:
        try:
            url = product.select('.//a/@href').extract()[0]
            url = urljoin_rfc(self.URL_BASE, url)
            yield Request(url, callback=self.parse_product)
        except IndexError:
            # Anchor without an href — skip this product block.
            continue
def parse(self, response):
    """Extract category and product URLs, including URLs embedded in
    JavaScript ``location.assign('...')`` onclick handlers."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    # categories: URLs are hidden inside onclick="...assign('<url>')"
    categories = hxs.select(u'//div[@class="menuItem"]/@onclick').re('\.assign\(\'(.*)\'')
    for url in categories:
        url = urljoin_rfc(get_base_url(response), '/' + url)
        # Skip javascript: pseudo-links.
        if ('javascript' not in url) and ('Javascript' not in url):
            yield Request(url)
    # pages (no pagination selector known for this site yet)
    # next_pages = hxs.select(u'').extract()
    # for next_page in next_pages:
    #     url = urljoin_rfc(get_base_url(response), next_page)
    #     yield Request(url)
    # products: onclick handlers plus two regular link layouts
    products = hxs.select(u'//div/img/../@onclick').re('assign\(\'(.*)\'')
    products += hxs.select(u'//div[@class="catpadding"]//div[@class="DefaultFont"]/a/@href').extract()
    products += hxs.select(u'//table[@id="Table_01"]//div/a[child::img]/@href').extract()
    for url in products:
        url = urljoin_rfc(get_base_url(response), url)
        if ('javascript' not in url) and ('Javascript' not in url):
            yield Request(url, callback=self.parse_product)
    # The current page may itself be a product page.
    for product in self.parse_product(response):
        yield product
def parse_categories(self, response):
    """Descend the category tree; on leaf (product listing) pages follow
    pagination and schedule product detail pages.

    Fixes: the original indexed the listing <div> with ``[0]``
    unconditionally (IndexError on pages without it), and passed
    subcategory hrefs to Request without joining them against the base URL
    (relative hrefs would be rejected; joining is a no-op for absolute ones).
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # A page whose catalog listing grid still links deeper is an
    # upper-level category, not a product listing.
    listing = hxs.select("//div[@class='listing-type-grid catalog-listing']")
    sub_cat_urls = listing[0].select(".//a/@href").extract() if listing else []
    if sub_cat_urls:
        for sub_cat_url in sub_cat_urls:
            yield Request(urljoin_rfc(base_url, sub_cat_url),
                          callback=self.parse_categories)
    else:
        # Leaf page: follow "next page" — the link appears in two different
        # markup variants across the site.
        next_page_link = hxs.select("//img[@alt='Next Page']/../@href").extract()
        if not next_page_link:
            next_page_link = hxs.select("//a[text()='Next']/@href").extract()
        if next_page_link:
            yield Request(urljoin_rfc(base_url, next_page_link[0]),
                          callback=self.parse_categories)
        # Product detail pages.
        productUrls = hxs.select("//li[@class='item']/div[@class='product-image']/a/@href").extract()
        for productUrl in productUrls:
            yield Request(urljoin_rfc(base_url, productUrl),
                          callback=self.parse_product)
def parseItem(self, response):
    """Yield a MirrorItem for the page itself and for every image, script
    and non-CSS <link> asset; CSS files are fetched for further parsing.

    Fixes: the non-CSS branch stored the raw Selector object instead of the
    href string; a single shared item instance was mutated between yields
    (a fresh item is now built per yield); and ``url_has_any_extension``
    is given a set of extensions rather than a bare string (the string
    only worked by accidental substring matching).
    """
    base = get_base_url(response)
    # The page URL itself.
    item = MirrorItem()
    item['item'] = response.url
    yield item
    # Images.
    for img in response.xpath('//img/@src'):
        item = MirrorItem()
        item['item'] = urljoin_rfc(base, img.extract())
        yield item
    # Scripts.
    for js in response.xpath('//script/@src'):
        item = MirrorItem()
        item['item'] = urljoin_rfc(base, js.extract())
        yield item
    # Stylesheets are crawled for nested assets; other <link>s are recorded.
    for css in response.xpath('//link/@href'):
        href = css.extract()
        if url_has_any_extension(href, {'.css'}):
            yield Request(url=urljoin_rfc(base, href), meta={},
                          callback=self.parseStyle)
        else:
            item = MirrorItem()
            item['item'] = href
            yield item
def parse(self, response):
    """Crawl printer brands, series/model wizard pages, pagination and
    product rows."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)

    def absolute(path):
        # Every on-site link is resolved against the configured base URL.
        return urljoin_rfc(self.URL_BASE, path)

    # Brand tiles on the catalogue directory page.
    for href in sel.select('//div[@id="catalogueDirectory"]//img[@class="catalogueImage"]/../@href').extract():
        yield Request(absolute(href))
    # Printer family / model wizard links.
    for href in sel.select('//div[@id="printerWizardFamilyContainer" or @id="printerWizardModelContainer"]//a/@href').extract():
        yield Request(absolute(href))
    # "Next" link in the numbered pagination.
    next_page = sel.select('//a[@class="AXISPageNumber" and contains(text(),"Next")]/@href').extract()
    if next_page:
        yield Request(absolute(next_page[0]))
    # Product rows.
    for href in sel.select('//tr[contains(@class,"productList")]//h3[@class="productListItemHeader"]/a/@href').extract():
        yield Request(absolute(href), callback=self.parse_product)
def parse(self, response):
    """Scrape product tiles (handling both in-stock and out-of-stock price
    markup), follow pagination, then pop the next category URL from the
    queue carried in ``response.meta['cats']``."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    products = hxs.select(u'//div[@class="prodInfo"]')
    for product in products:
        product_loader = ProductLoader(item=Product(), selector=product)
        url = product.select(u'.//a[contains(@class,"prodLink")]/@href').extract()[0]
        url = urljoin_rfc(get_base_url(response), url)
        product_loader.add_value('url', url)
        name = product.select(u'.//a[contains(@class,"prodLink")]/text()').extract()[0].strip()
        product_loader.add_value('name', name)
        # Price is split across "big" (dollars) and "small" (cents) spans;
        # out-of-stock products use the *OutStock2 class variants instead.
        try:
            price = product.select(u'.//div[@class="PriceContent"]//div[@class="camelPrice"]/span[@class="bigPriceText2"]/text()').re('\$(.*)')[0]
            price += product.select(u'.//div[@class="PriceContent"]//div[@class="camelPrice"]/span[@class="smallPriceText2"]/text()').extract()[0]
        except IndexError:
            price_big = product.select(u'.//div[@class="PriceContent"]//div[@class="camelPrice"]/span[@class="bigPriceTextOutStock2"]/text()').re('\$(.*)')
            price_small = product.select(u'.//div[@class="PriceContent"]//div[@class="camelPrice"]/span[@class="smallPriceTextOutStock2"]/text()').extract()
            if price_big and price_small:
                price = price_big[0] + price_small[0]
            else:
                # No price in either layout: skip this product.
                continue
        product_loader.add_value('price', price)
        yield product_loader.load_item()
    # pages: follow "next" within the current category, copying the
    # remaining category queue; once exhausted, start the next category.
    next_page = hxs.select(u'//a[@class="jump next"]/@href').extract()
    if next_page:
        next_page = urljoin_rfc(get_base_url(response), next_page[0])
        yield Request(next_page, callback=self.parse, meta={'cats': response.meta['cats'][:]})
    elif response.meta.get('cats'):
        yield Request(response.meta['cats'][0], meta={'cats': response.meta['cats'][1:]})
def parse_course_list(self, response):
    """Accumulate field-of-study names per course URL across paginated
    search results, then crawl each course once with all its field names.

    meta contract (shared across requests):
      - 'courses': dict mapping course URL -> list of field names
      - 'field_name': the field currently being listed
      - 'fields': remaining (name, url) field pairs to process
    """
    hxs = HtmlXPathSelector(response)
    programs = hxs.select(u'//ul[@id="SearchResults"]/li/h2/a/@href').extract()
    if programs:
        for url in programs:
            url = urljoin_rfc(get_base_url(response), url)
            # Tag this course with the field currently being listed.
            response.meta['courses'].setdefault(url, []).append(response.meta['field_name'])
    else:
        # No result list: the page itself may be a course description.
        response.meta['field_names'] = [response.meta['field_name']]
        for x in self.parse_course(response):
            yield x
    # Pagination
    next_url = hxs.select(u'//div[@id="PageNumbers"]/a[@class="next"]/@href').extract()
    if next_url:
        url = urljoin_rfc(get_base_url(response), next_url[0])
        yield Request(url, meta=response.meta, callback=self.parse_course_list)
    # Next field
    elif response.meta['fields']:
        field_name, field_url = response.meta['fields'].pop()
        response.meta['field_name'] = field_name
        yield Request(field_url, meta=response.meta, callback=self.parse_course_list)
    # All fields processed, do courses
    else:
        for course, fields in response.meta['courses'].items():
            yield Request(course, meta={'field_names': fields}, callback=self.parse_course)
def parse_item(self, response):
    """Extract job postings from the listing table into TutorialItem objects.

    Table rows alternate between ``tr.even`` and ``tr.odd`` CSS classes
    with identical cell layouts, so one loop over both classes replaces
    the previously duplicated scraping code (even rows first, preserving
    the original output order).
    """
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    for row_class in ("even", "odd"):
        for site in sel.css("table.tablelist tr.%s" % row_class):
            item = TutorialItem()
            item["name"] = site.css(".l.square a").xpath("text()").extract()
            relative_url = site.css(".l.square a").xpath("@href").extract()[0]
            item["detailLink"] = urljoin_rfc(base_url, relative_url)
            item["catalog"] = site.css("tr > td:nth-child(2)::text").extract()
            item["workLocation"] = site.css("tr > td:nth-child(4)::text").extract()
            item["recruitNumber"] = site.css("tr > td:nth-child(3)::text").extract()
            item["publishTime"] = site.css("tr > td:nth-child(5)::text").extract()
            items.append(item)
    info("parsed " + str(response))
    return items
def parse(self, response):
    """Load products directly from the listing grid, then follow pagination."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)
    base = get_base_url(response)
    names = sel.select('//h4/a/@title').extract()
    urls = sel.select('//h4/a/@href').extract()
    # Prices carry surrounding whitespace and a leading pound sign.
    prices = [p.strip().strip(u'\xa3')
              for p in sel.select('//td[@class="ProductPrice"]/h4/text()').extract()]
    for name, href, price in zip(names, urls, prices):
        full_url = urljoin_rfc(base, href)
        if not full_url:
            continue
        loader = ProductLoader(item=Product(), selector=sel)
        loader.add_value('url', full_url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        yield loader.load_item()
    # pages
    next_page = sel.select('//a[@class="NextPage"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))
def parse(self, response):
    """Queue menu categories, table subcategories and product listings."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)
    base = get_base_url(response)
    # Top menu bar first, then the text listing table of subcategories.
    for xpath in (u'//ul[@id="MenuBar1"]/li/a/@href',
                  u'//table[@class="TextListingTable"]//a/@href'):
        for href in sel.select(xpath).extract():
            yield Request(urljoin_rfc(base, href))
    # No pagination selector is known for this site.
    # Product listing table.
    for href in sel.select(u'//table[@class="ListingTable"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Crawl categories, pagination and products, propagating response.meta."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)
    base = get_base_url(response)
    meta = response.meta
    # Category links come from two different sidebar widgets.
    category_links = (sel.select(u'//div[@class="Category"]//a/@href').extract()
                      + sel.select(u'//h3[@class="Org LeftNavMenu"]//a/@href').extract())
    for href in category_links:
        yield Request(urljoin_rfc(base, href), meta=meta)
    # Pagination: the site sometimes stacks multiple "&page=" parameters;
    # drop the first occurrence so only one remains.
    next_page = sel.select(u'//div[@class="pager"]//a[contains(text(),"Next")]/@href').extract()
    if next_page:
        target = urljoin_rfc(base, next_page[0])
        if target.count(u'&page=') > 1:
            target = re.sub(u'&page=\d+', u'', target, 1)
        yield Request(target, meta=meta)
    # Product detail pages.
    for href in sel.select(u'//div[contains(@class,"ProDes1")]/div/a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product, meta=meta)
def parse(self, response):
    """Walk brand, series and printer-list pages, then hand the current
    page to parse_product."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)
    link_xpaths = (
        '//nav[@id="sidebar"]/ul/li/a/@href',                # printer brands
        '//div[@class="thumbs"]//a[@class="button"]/@href',  # printer series
        '//section[@class="printer-list"]//a/@href',         # printer list
    )
    for xpath in link_xpaths:
        for href in sel.select(xpath).extract():
            yield Request(urljoin_rfc(self.URL_BASE, href))
    # No pagination selector is known for this site yet.
    # Products may be listed directly on the current page.
    for item in self.parse_product(response):
        yield item
def parse(self, response):
    """Follow brand, category-refinement, "View More" and pagination links,
    then emit any products found on the current page."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)
    base = get_base_url(response)
    crawl_xpaths = (
        # Brand links (skipping the "first" placeholder item).
        u'//div[@id="shop_content"]//li[not(@class="first")]//a/@href',
        # Category refinement form.
        '//h2[text()="Product Category"]/following-sibling::div[1]/form[@id="product-refinement"]//a/@href',
        # "View More" expanders.
        '//p[@class="arrow_sym"]/a[@class="SearchLinkBold" and starts-with(@title, "View More")]/@href',
    )
    for xpath in crawl_xpaths:
        for href in sel.select(xpath).extract():
            yield Request(urljoin_rfc(base, href))
    # "Next Page" image link in the pagination bar.
    next_page = sel.select(u'//div[@id="pagination" and @class="pagination"]//a[child::img and @title="Next Page"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))
    # Products on the current page.
    for product in self.parse_product(response):
        yield product
def parse_page(self, response):
    """Scrape product tiles, then either follow the last pagination link or
    (when no pagination exists) recurse into the refinement sidebar."""
    sel = HtmlXPathSelector(response)
    site_root = 'http://www.dolphinmusic.co.uk/'
    enc = response.encoding
    for tile in sel.select('//div[@class="item"]'):
        loader = ProductLoader(item=Product(), selector=tile)
        loader.add_xpath('name', 'h2/a/text()')
        href = tile.select('h2/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(site_root, href, enc))
        loader.add_xpath('price', 'div[@class="pricing"]/p[@class="price"]/text()')
        yield loader.load_item()
    pagination = sel.select('//*[@id="categoryMain"]/div[@class="pagination"]/ul/li/a/@href').extract()
    if pagination:
        # The last pagination anchor is the "next" candidate.
        candidate = pagination[-1]
        if self._is_next(candidate):
            yield Request(urljoin_rfc(site_root, candidate, enc),
                          callback=self.parse_page)
    else:
        # Leaf category without pages: descend into the refinement sidebar.
        for href in sel.select('//*[@id="sidebar"]/ul[@id="refineCat"]/li/a/@href').extract():
            yield Request(urljoin_rfc(site_root, href, enc),
                          callback=self.parse_page)
def parse(self, response):
    """Crawl category, subcategory and pagination links, then emit any
    products found on the current page."""
    if not isinstance(response, HtmlResponse):
        return
    site_root = 'http://www.getinthemix.co.uk'
    sel = HtmlXPathSelector(response)
    link_xpaths = (
        '//div[contains(@class,"brands_sub")]//a/@href',  # categories
        '//div[@class="cat_list"]//a/@href',              # subcategories
        '//div[@id="page_number"]//a/@href',              # page numbers
    )
    for xpath in link_xpaths:
        for href in sel.select(xpath).extract():
            yield Request(urljoin_rfc(site_root, href))
    # Products listed on the current page.
    for item in self.parse_product(response):
        yield item
def parse(self, response):
    """Crawl the three category columns, catalogue subcategories, and
    product cells.

    The three per-column loops were identical except for the div id, so
    they are collapsed into a single loop (same request order).
    """
    if not isinstance(response, HtmlResponse):
        return
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    # Category links live in three structurally identical sidebar columns.
    for column_id in ('col1', 'col2', 'col3'):
        cats = hxs.select('//div[@id="%s"]/ul/li/a/@href' % column_id).extract()
        for cat in cats:
            yield Request(urljoin_rfc(base_url, cat), callback=self.parse)
    # Subcategories inside the catalogue list.
    scats = hxs.select('//ul[@class="catalogueList"]/li/div/h4/a/@href').extract()
    for scat in scats:
        yield Request(urljoin_rfc(base_url, scat), callback=self.parse)
    # Product cells.
    product_urls = hxs.select('//td[@class="productItem"]/div/div/a/@href').extract()
    for product in product_urls:
        yield Request(urljoin_rfc(base_url, product), callback=self.parse_product)
def process_response(self, request, response, spider):
    """Redirect-middleware hook.

    HEAD requests keep their method across every redirect status; for
    other methods, 302/303 redirects are re-issued as GET (mirroring
    browser behaviour) while 301/307 preserve the original request.
    Finally, HTML meta-refresh redirects faster than
    ``max_metarefresh_delay`` are followed as GET.
    """
    if request.method.upper() == 'HEAD':
        # Preserve the HEAD method for all redirect flavours.
        if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers['location'])
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)
        else:
            return response
    if response.status in [302, 303] and 'Location' in response.headers:
        # 302/303: downgrade to GET, as browsers do.
        redirected_url = urljoin_rfc(request.url, response.headers['location'])
        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
    if response.status in [301, 307] and 'Location' in response.headers:
        # 301/307: method and body are preserved.
        redirected_url = urljoin_rfc(request.url, response.headers['location'])
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)
    if isinstance(response, HtmlResponse):
        # <meta http-equiv="refresh"> redirect, followed only when fast enough.
        interval, url = get_meta_refresh(response)
        if url and interval < self.max_metarefresh_delay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, 'meta refresh')
    return response
def parse(self, response):
    """Follow site navigation; paginate only when the page yielded products."""
    if not isinstance(response, HtmlResponse):
        return
    site_root = 'http://www.decks.co.uk'
    sel = HtmlXPathSelector(response)
    #category_urls = hxs.select('//div[@class="products-nav"]/ul/li/a/@href').extract()
    # Radial navigation categories.
    for href in sel.select('//div[starts-with(@class,"radnav")]//a/@href').extract():
        yield Request(urljoin_rfc(site_root, href))
    # Subcategories, for categories that do not list products directly.
    for href in sel.select('//ul[@class="smaller"]//p[@class="go"]/a/@href').extract():
        yield Request(urljoin_rfc(site_root, href))
    # Products on the current page.
    found_products = False
    for product in self.parse_product(response):
        found_products = True
        yield product
    # Follow "Next" only when this page actually produced products.
    next_page = sel.select('//a[contains(text(),"Next")]/@href').extract()
    if next_page and found_products:
        yield Request(urljoin_rfc(site_root, next_page[0]))
def parse(self, response):
    """Traverse the category menu, subcategory boxes, pagination and
    product links."""
    if not isinstance(response, HtmlResponse):
        return
    sel = HtmlXPathSelector(response)
    base = get_base_url(response)
    # Top menu categories, then "item" boxes outside the products
    # container (these are subcategories).
    link_xpaths = (
        u'//div[@id="ctl00_menu_products_pnlsmenu"]//a/@href',
        u'//div[@class="item" and not(parent::div[@class="catprods"])]//a/@href',
    )
    for xpath in link_xpaths:
        for href in sel.select(xpath).extract():
            yield Request(urljoin_rfc(base, href))
    # Next-page arrow.
    next_page = sel.select(u'//a[@class="next i-next"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base, next_page[0]))
    # "item" boxes inside the "catprods" container are products.
    for href in sel.select(u'//div[@class="item" and parent::div[@class="catprods"]]//a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_product)
def parse(self, response):
    """Queue every list-item link on the page as a category to crawl."""
    sel = HtmlXPathSelector(response)
    base = get_base_url(response)
    for href in sel.select('//ul/li/a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_category)
def parse_item(self, response):
    """Yield one Product per SKU embedded in the page's inline JavaScript.

    The page carries per-variant data in blocks shaped like::

        skuArray.push({
            productexternalid: 72833,
            colour: 'Light Grey/Grey',
            size: '49',
            skuId: 227272,
            priceAsDecimal: 90.0000,
            stockquantity: 0,
            ...
        });

    Each block becomes one item named "<title> - <colour> - <size>".
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products_data = self.parse_sku_array(response.body)
    main_name = hxs.select(
        '//h1[@itemprop="name"]/text()').extract()[0].strip()
    # First breadcrumb entry is the site root, so drop it.
    categories = hxs.select(
        '//div[@id="breadcrumb"]//span[@itemprop="title"]/text()').extract(
        )[1:]
    for p in products_data:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath(
            'image_url', '//img[@itemprop="image"]/@src',
            lambda a: urljoin_rfc(base_url, a[0]) if a else '')
        loader.add_value('identifier', p['skuId'])
        loader.add_value('sku', p['productexternalid'])
        loader.add_value('price', p['priceAsDecimal'])
        loader.add_value('stock', p['stockquantity'])
        loader.add_value('category', categories)
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('url', response.url)
        loader.add_value(
            'name', main_name + ' - ' + p['colour'] + ' - ' + p['size'])
        yield loader.load_item()

def parse_sku_array(self, body):
    """Extract attribute dicts from ``skuArray.push({...});`` JS blocks.

    Returns a list of dicts, one per push-block, mapping attribute names
    to Python values. Trailing commas on a line make the evaluated value
    a 1-tuple; the single element is unwrapped in that case.
    """
    products_data = []
    collect_product = False
    current_product = {}
    for line in body.split('\n'):
        if 'skuArray.push({' in line:
            collect_product = True
            current_product = {}
            continue
        if '});' in line and collect_product:
            collect_product = False
            products_data.append(current_product)
            continue
        # Guard on ':' so blank/odd lines inside a block are skipped
        # instead of raising IndexError; split only on the FIRST colon so
        # values containing colons (dates, URLs) are not truncated.
        if collect_product and ':' in line:
            key, _sep, raw_value = line.partition(':')
            key = key.strip()
            # SECURITY: eval() of page-supplied content — kept for
            # compatibility, but this executes arbitrary expressions from
            # the remote page.  NOTE: the true/false replacement also
            # rewrites those words inside quoted string values.
            value = eval(raw_value.strip().replace(
                'false', 'False').replace('true', 'True'))
            if isinstance(value, tuple):
                value = value[0]
            current_product[key] = value
    return products_data
def parse_products(self, response):
    # Parse a listing page: each table cell under ContentPage is either a
    # product (name + identifier + price) or a link deeper into the
    # catalogue.  When nothing parses as a product, fall back to crawling
    # category links.
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        "//div[@class='rightbox']//div[@id='ContentPage']/table[last()]/tr/td"
    )
    products_found = False
    if products:
        for product in products:
            # Name may live in an h2 or h3 depending on page template.
            name = product.select(".//h2//text()").extract()
            if not name:
                name = product.select(".//h3//text()").extract()
            # Identifier: prefer the prices element, fall back to the
            # image-button name.
            identifier = product.select(".//prices/@prod_ref").extract()
            if not identifier:
                identifier = product.select(
                    './/input[@type="image"]/@name').extract()
            url = urljoin_rfc(
                get_base_url(response),
                ''.join(
                    product.select(
                        'div/h2[@class="product"]/a/@href').extract()))
            price = product.select(
                './/td[1]/span[@class="actlarge"]/text()').extract()
            # Neither identifier nor price: treat the cell as a link to a
            # deeper listing page and recurse.
            if not identifier and not price:
                url = urljoin_rfc(
                    get_base_url(response),
                    ''.join(product.select('.//h4/a/@href').extract()))
                yield Request(url, callback=self.parse_products)
            if not name:
                logging.error("NO NAME!!! %s" % response.url)
                continue
            name = " ".join(name[0].split())  # fix whitespaces
            if not identifier:
                logging.error("NO IDENTIFIER!!! %s - %s" %
                              (name, response.url))
                continue
            identifier = identifier[0]
            # Identifiers come embedded as "...!id" or "..._id" — keep
            # only the trailing id part.
            if "!" in identifier:
                identifier = identifier.split('!')[-1]
            if "_" in identifier:
                identifier = identifier.split('_')[-1]
            if not price:
                logging.error("NO PRICE!!! %s - %s" % (name, response.url))
                continue
            price = price[0]
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('name', name)
            loader.add_value('sku', identifier)
            loader.add_value('identifier', identifier)
            loader.add_value('url', url)
            loader.add_value('price', price)
            yield loader.load_item()
            products_found = True
    if not products_found:
        # No products on this page: crawl category links instead.
        categories = hxs.select('//span/table/tr//td/a/@href').extract()
        for category in categories:
            url = urljoin_rfc(get_base_url(response), category)
            yield Request(url, callback=self.parse_products)
    # Product-list style pages link straight to product pages.
    for product_link in hxs.select(
            '//div[@class="product_list"]/div/h2/a/@href').extract():
        url = urljoin_rfc(get_base_url(response), product_link)
        yield Request(url, callback=self.parse_products)
def get_full_url(response, url):
    """Resolve *url* against the response's base URL and return the
    absolute form."""
    base = get_base_url(response)
    return urljoin_rfc(base, url)
def parse_product(self, response):
    # Scrape a ShadeStation product page into a Product item.  Pages that
    # lack the identifier element are assumed blocked and retried (up to
    # 20 times) with cookies discarded.
    hxs = HtmlXPathSelector(response)
    base_url = 'https://www.shadestation.co.uk/'
    loader = ProductLoader(item=Product(), response=response)
    identifier = hxs.select(self.product_page_identifier_xpath).extract()
    if identifier:
        identifier = identifier[0].strip()
    else:
        # Possible block page: retry with fresh cookies.
        retry_no = int(response.meta.get('retry_no', 0))
        if retry_no < 20:
            retry_no += 1
            yield Request(response.url,
                          meta={'dont_merge_cookies': True,
                                'retry_no': retry_no},
                          dont_filter=True,
                          callback=self.parse_product)
        else:
            self.log('WARNING: possible blocking in => %s' % response.url)
        return
    sku = hxs.select(u'//span[@itemprop="productID"]/text()').extract()
    sku = sku[0] if sku else ''
    # Breadcrumb minus the first (root) entry.
    category = hxs.select(
        u'//div[@itemprop="breadcrumb"]/a/text()').extract()[1:]
    loader.add_value('identifier', identifier)
    name = hxs.select(u'//h1[@itemprop="name"]/text()').extract()[0]
    # Append selected size, lens/frame info and colour so each variant
    # gets a distinguishable name.
    size = response.xpath(
        '//select[@name="sizeSelector"]/option[@selected]/text()').extract()
    if size:
        name = name + ' ' + size[0].strip()
    extra_info = hxs.select(
        '//div[@class="product_extra_info_area"]//text()').extract()
    if extra_info:
        extra_info = ' '.join(map(lambda x: x.strip(), extra_info)).strip()
        if 'lens' in extra_info.lower() or 'frame' in extra_info.lower():
            name = name + ' ' + extra_info
    colour = hxs.select(
        '//li[contains(@class, "small_colour_selected")]/@rel').extract()
    if colour:
        # Only append when frame info is not already part of the name.
        if 'frame:' not in name.lower() and 'frame' in colour[0].lower():
            colour = colour[0].replace('<br/>', ' ').strip()
            name = name + ' ' + colour
    loader.add_value('name', name)
    # Brand: second breadcrumb entry with its last word dropped.
    brand = ''
    if len(category) > 1:
        brand = category[1].split(' ')[:-1]
        brand = ' '.join(brand)
    loader.add_value('brand', brand)
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('url', response.url)
    # Price text is "Our Price <amount>" when discounted; otherwise the
    # raw element text.
    price = hxs.select(
        u'//div[@itemprop="price"]/text()').re('Our Price (.*)')
    if not price:
        price = hxs.select(u'//div[@itemprop="price"]/text()').extract()
    price = price[0] if price else '0.00'
    loader.add_value('price', price)
    image = hxs.select(
        u'//div[@id="product_image_crop"]/div/@imageurl').extract()
    image = image[0] if image else ''
    image = urljoin_rfc(base_url, image)
    loader.add_value('image_url', image)
    in_stock = response.xpath(
        '//div[@itemprop="availability" and contains(text(), "In Stock")]')
    if in_stock:
        # None means "in stock, exact quantity unknown".
        stock_level = response.xpath(
            '//div[@class="furtherdetails"]/text()').re('\d+')
        stock = int(stock_level[0]) if stock_level else None
    else:
        stock = 0
    loader.add_value('stock', stock)
    yield loader.load_item()
def parse(self, response):
    # Selenium-driven parse: open the page, expand the "chosen" dropdown
    # and either fan out one request per option (first pass, no
    # meta['option_id']) or select the option and scrape one item per
    # tariff (second pass).
    base_url = get_base_url(response)
    selected_option_id = response.meta.get('option_id', None)
    self._browser.get(response.url)
    container = self._browser.driver.find_element_by_xpath(
        '//div[@class="chosen-container chosen-container-single chosen-container-single-nosearch"]'
    )
    container.click()
    hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
    if not selected_option_id:
        # First pass: one follow-up request per dropdown option.
        options = hxs.select(
            '//ul[@class="chosen-results"]/li/@data-option-array-index'
        ).extract()
        for option_id in options:
            yield Request(response.url,
                          dont_filter=True,
                          meta={'option_id': option_id})
        return
    option = self._browser.driver.find_element_by_xpath(
        '//ul[@class="chosen-results"]/li[@data-option-array-index="' +
        selected_option_id + '"]')
    option.click()
    # Re-read the DOM after clicking so prices reflect the chosen option.
    hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
    tariffs = hxs.select('//li[contains(@class, "rate-element")]')
    # Device id is embedded in the URL between "0," and "-_".
    device_identifier = re.search('0,(.*?)-_', response.url).group(1)
    for tariff in tariffs:
        loader = ProductLoader(item=Product(), response=response)
        duration = '24'
        identifier = tariff.select('@data-shop-id').extract()
        # Identifier combines device, selected option and tariff.
        loader.add_value(
            'identifier', device_identifier + '-' + selected_option_id +
            '-' + identifier[0])
        phone_name = ' '.join(
            tariff.select(
                './/div[@class="configuration-output"]//p[not(span)]//text()'
            ).extract())
        tariff_name = ' '.join(
            tariff.select(
                './/div[@class="heading-2"]/span[@class="title-1" or @class="title-2"]//text()'
            ).extract())
        # Site uses comma decimal separators; normalise to dots.
        phone_price = ''.join(
            tariff.select(
                './/div[@class="configuration-output"]//p/span//text()').
            extract()).replace(',', '.')
        image_url = hxs.select(
            '//div[@id="device-image-slider"]//li/img/@src').extract()
        if image_url:
            image_url = urljoin_rfc(base_url, image_url[0])
        monthly_cost = ''.join(
            tariff.select('.//p[@class="price monthly-price"]/span//text()'
                          ).extract()).replace(',', '.')
        normalized_name = self.get_normalized_name(phone_name)
        loader.add_value('name', normalized_name + ' - ' + tariff_name)
        loader.add_value('url', response.url)
        loader.add_value('brand', phone_name.split()[0])
        loader.add_value('price', phone_price)
        loader.add_value('image_url', image_url)
        product = loader.load_item()
        metadata = VodafoneMeta()
        metadata['device_name'] = phone_name
        metadata['monthly_cost'] = re.search('(\d+.\d+)',
                                             monthly_cost).group(1)
        metadata['tariff_name'] = tariff_name
        metadata['contract_duration'] = duration
        # NOTE(review): `operator` and `channel` are not defined in this
        # method — presumably module-level constants; confirm they exist.
        metadata['operator'] = operator
        metadata['channel'] = channel
        metadata['promotional_text'] = ''
        metadata['network_generation'] = '4G'
        product['metadata'] = metadata
        yield product
def parse_product(self, response):
    # Build a Product plus SonaeMeta metadata; promotion start/end dates
    # are carried over between runs via self.meta_df (previous crawl's
    # per-identifier metadata).
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response=response)
    name = hxs.select('//div[@class="prod-nome"]/text()').extract()
    price = hxs.select('//div[@class="prod-price "]/text()').extract()
    if not price:
        # Promotional products use the "campanha" price block instead.
        price = hxs.select(
            '//div[@class="prod-price campanha"]/text()').extract()
    price = price[0]
    brand = ''
    # Breadcrumb minus the first (home) entry.
    categories = hxs.select(
        '//div[@id="breadcrumb"]/ul/li/a/text()').extract()[1:]
    l = ProductLoader(item=Product(), response=response)
    image_url = hxs.select('//div[@id="prod-imagem"]/img/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
    l.add_value('image_url', image_url)
    l.add_value('url', response.url)
    l.add_value('name', name)
    l.add_value('price', extract_price_eu(price))
    l.add_value('brand', brand)
    for category in categories:
        l.add_value('category', category)
    ean = hxs.select('//script[@data-flix-ean]/@data-flix-ean').extract()
    l.add_value('sku', ean)
    identifier = re.findall('idprod=(.*)', response.url)[0]
    l.add_value('identifier', identifier)
    product = l.load_item()
    metadata = SonaeMeta()
    promotion_price = hxs.select(
        '//div[@class="prod-price-old"]/del/text()').re(r'[\d,.]+')
    if promotion_price:
        # Convert EU "1.234,56" format to "1234.56".
        metadata['promotion_price'] = promotion_price[0].replace(
            '.', '').replace(',', '.')
    if response.meta.get('exclusive_online', 'No') == 'Yes':
        metadata['exclusive_online'] = 'Yes'
    # Previous crawl's metadata for this product, if available.
    if self.meta_df is not None and not self.meta_df.empty and identifier in self.meta_df.index:
        prev_meta = self.meta_df.loc[identifier]
    else:
        prev_meta = {}
    promo = hxs.xpath(
        '//div[@id="prod-data"]//div[@class="prod-price campanha"]')
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M')
    if promo:
        # Promo active: keep the original start date unless the previous
        # promo already ended; an active promo has no end date yet.
        metadata[
            'promo_start'] = promo_start if promo_start and not promo_end else today
        metadata['promo_end'] = ''
    else:
        if promo_start:
            # Promo over: preserve the start date and stamp the end date
            # once (keep a previously recorded end date if present).
            metadata['promo_start'] = promo_start
            metadata['promo_end'] = today if not promo_end else promo_end
    product['metadata'] = metadata
    yield product
def parse(self, response):
    # Lookers: this single callback serves both listing pages (brand,
    # model and car links are queued back into parse) and car detail
    # pages (when a variant name is present an item is produced).
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    yield Request('http://www.lookers.co.uk/ford/new-offers/',
                  callback=self.parse_offers_ford)
    brands = hxs.select(
        '//div[@class="container"]//div[contains(@class, "jump")]/div/a/@href'
    ).extract()
    for url in brands:
        yield Request(urljoin_rfc(base_url, url))
    models = hxs.select(
        '//div[@class="cycle-inner"]/div/a/@href').extract()
    for url in models:
        yield Request(urljoin_rfc(base_url, url))
    cars_urls = hxs.select(
        '//div[@class="inset"]//a[@class="btn"]/@href').extract()
    for url in cars_urls:
        yield Request(urljoin_rfc(base_url, url))
    name = hxs.select(
        '//div[contains(@class, "title")]/h3/span[@class="variant"]/text()'
    ).extract()
    if name:
        # Detail page: "<model> <variant>" when the model span exists.
        model = hxs.select(
            '//div[contains(@class, "title")]/h3/span[@class="model"]/text()'
        ).extract()
        if model:
            name = model[0] + ' ' + name[0]
        else:
            name = name[0]
        # The CAP id is embedded somewhere in the page body as a query
        # parameter.
        cap_id = re.findall('CAPID=(\d+)&', response.body)
        if not cap_id:
            log.msg('PRODUCT WITHOUT IDENTIFIER: ' + response.url)
            return
        brand = hxs.select(
            '//div[contains(@class, "title")]/h3/span[@class="make"]/text()'
        ).extract()[0]
        loader = ProductLoader(item=Product(), response=response)
        cap_id = cap_id[0]
        loader.add_value('identifier', cap_id)
        loader.add_value('name', name)
        loader.add_value('brand', brand)
        loader.add_value('category', 'New cars')
        loader.add_value('url', response.url)
        image_url = hxs.select(
            '//div[@class="span8"]/div[contains(@class, "custom-imag")]/div[@class="inner"]/img/@src'
        ).extract()
        if image_url:
            loader.add_value('image_url', image_url[0])
        # Missing price element -> price 0.
        try:
            price = hxs.select(
                '//div[@class="price-now"]/span[@class="value"]/text()'
            ).extract()[0]
        except IndexError:
            price = 0
        loader.add_value('price', price)
        product = loader.load_item()
        yield product
def parse_product(self, response):
    # Parse a product page with optional variants: each variant becomes
    # its own Product.  Items are collected and either yielded directly
    # (no reviews) or passed to parse_review via meta.
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    name = hxs.select('//div[@class="item"]/h1/text()')[0].extract().strip()
    category = hxs.select(
        '//span[@class="breadcrumbs"]/span/a/span/text()').extract()
    if category:
        category = category[-1].strip()
    image_url = hxs.select(
        '//div[@class="gallery-image"]/img/@data-src').extract()
    if image_url:
        image_url = image_url[0]
    brand = response.xpath(
        '//meta[@itemprop="brand manufacturer"]/@content').extract()
    products = []
    options = hxs.select(
        '//div[@id="fancy-options-variants"]/div[@class!="e-filtration-result-empty"]')
    if options:
        for option in options:
            loader = ProductLoader(item=Product(), response=response,
                                   selector=option)
            identifier = option.select(
                './/@data-variant-id')[0].extract().strip()
            loader.add_value('identifier', identifier)
            loader.add_value('brand', brand)
            # NOTE(review): relative XPath is run against `hxs` (page
            # root), not `option`, so every variant receives the same
            # page-level MPN — confirm this is intended.
            sku = hxs.select(
                './/div[@itemprop="mpn"]/span/text()').extract()
            if sku:
                loader.add_value('sku', sku[0])
                # Prefer the category from the Bushnell products file
                # when the SKU is listed there.
                bushnell_product = self.bushnell_products.get(
                    sku[0].upper().strip(), None)
                if bushnell_product:
                    category = bushnell_product['Class']
                    log.msg('Extracts category "%s" from bushnell file, URL: %s'
                            % (category, response.url))
            loader.add_value('url', response.url)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            loader.add_value(
                'name', option.select('.//@data-variant-name')[0].extract())
            price = option.select(
                './/span[@class="variant-price"]/@content')[0].extract()
            loader.add_value('price', price)
            product = loader.load_item()
            metadata = KeterMeta()
            metadata['reviews'] = []
            metadata['brand'] = brand[0] if brand else ''
            product['metadata'] = metadata
            products.append(product)
    else:
        # Single-variant page: same fields read from the page root.
        loader = ProductLoader(item=Product(), response=response,
                               selector=hxs)
        loader.add_value('name', name)
        identifier = hxs.select('.//@data-variant-id')[0].extract().strip()
        loader.add_value('identifier', identifier)
        loader.add_value('brand', brand)
        sku = hxs.select('.//div[@itemprop="mpn"]/span/text()').extract()
        if sku:
            loader.add_value('sku', sku[0])
            bushnell_product = self.bushnell_products.get(
                sku[0].upper().strip(), None)
            if bushnell_product:
                category = bushnell_product['Class']
                log.msg('Extracts category "%s" from bushnell file, URL: %s'
                        % (category, response.url))
        loader.add_value('url', response.url)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        price = hxs.select(
            './/span[@class="variant-price"]/@content')[0].extract()
        loader.add_value('price', price)
        product = loader.load_item()
        metadata = KeterMeta()
        metadata['brand'] = brand[0] if brand else ''
        metadata['reviews'] = []
        product['metadata'] = metadata
        products.append(product)
    # "Not rated" badge present: no reviews to fetch, emit immediately.
    if hxs.select(u'//span[@id="product-social-header-ratings-text"]/span[@id="product-social-header-review-not-rated"]'):
        for product in products:
            yield product
        return
    # Otherwise follow the reviews page, carrying the items in meta.
    try:
        reviews_url = hxs.select(
            u'//div[@id="product-customer-reviews"]/span[@class="all-reviews"]/a/@href'
        ).extract()[0]
    except:
        reviews_url = hxs.select(
            u'//div[@id="product-customer-reviews"]//span[@class="all-reviews"]/a/@href'
        ).extract()[0]
    yield Request(urljoin_rfc(base_url, reviews_url),
                  meta={'products': products,
                        'product_url': response.url},
                  callback=self.parse_review)
def extract_products(self, hxs, url):
    # Yield one Product (with MicheldeverMeta) per tyre container on an
    # ASDA Tyres results page; winter tyres and entries without a price
    # are skipped.
    for el in hxs.select(
            '//div[starts-with(@class,"tyre_container round")]'):
        tyre_options = fix_spaces("".join(
            el.select(
                './/p[@class="tyre_details"]//text()').extract())).strip()
        if not tyre_options:
            msg = 'Could not extract tyre options from element from %s' % url
            self.log('ERROR: %s' % msg)
            # self.errors.append(msg)
            continue
        # Parse "width/ratio/rim load speed name" out of the details text.
        res = parse_pattern(tyre_options)
        if not res:
            msg = "ERROR parsing: %s on %s" % (tyre_options, url)
            self.log(msg)
            # self.errors.append(msg)
            continue
        width, ratio, rim, load_rating, speed_rating, name = res
        # skip winter tyres
        if el.select(".//div[@class='tyre_winter']"):
            continue
        name = name.strip()
        identifier = el.select("./@id").extract()[0]
        price = "".join(
            el.select(
                ".//p[@class='tyre_price']//text()").extract()).strip()
        if not price:
            continue
        brand = el.select(
            ".//span[@class='tyre_brand_text']/text()").extract()[0]
        image_url = el.select(
            './/img[contains(@class, "tyre_image")]/@src').extract()[0]
        image_url = urljoin_rfc('http://asdatyres.co.uk', image_url)
        # Run-flat / XL flags come from page badges or from the name.
        run_flat_found = is_run_flat(name)
        run_flat = 'Yes' if len(
            el.select(".//div[@class='tyre_rf']").extract()
        ) > 0 or run_flat_found else 'No'
        xl = 'Yes' if len(
            el.select(".//div[@class='tyre_xl']").extract()) > 0 else 'No'
        if xl == 'Yes':
            # "XL" is a flag, not part of the product name.
            name = name.replace("XL", "").strip()
        loader = ProductLoader(Product(), selector=hxs)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier)
        loader.add_value('price', price)
        loader.add_value('url', 'http://www.asdatyres.co.uk/')
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        metadata = MicheldeverMeta()
        metadata['width'] = width
        metadata['aspect_ratio'] = ratio
        metadata['rim'] = rim
        metadata['load_rating'] = load_rating
        metadata['speed_rating'] = speed_rating
        metadata['fitting_method'] = 'Fitted'
        metadata['run_flat'] = run_flat
        metadata['xl'] = xl
        # NOTE(review): duplicate of the fitting_method assignment above.
        metadata['fitting_method'] = 'Fitted'
        # First manufacturer code found in the name wins.
        man_code = ''
        for code, man_mark in self.all_man_marks.iteritems():
            if code in name:
                man_code = man_mark
                break
        metadata['manufacturer_mark'] = man_code
        metadata['full_tyre_size'] = '/'.join(
            (width, ratio, rim, load_rating, speed_rating))
        # EU tyre label ratings (fuel efficiency, wet grip, noise dB).
        fuel = el.select(
            './/div[@class="label_ratings"]/div[@class="fuel_rating"]//span[contains(@class, "label_rating_")]/text()'
        ).extract()
        grip = el.select(
            './/div[@class="label_ratings"]/div[@class="wet_rating"]//span[contains(@class, "label_rating_")]/text()'
        ).extract()
        noise = el.select(
            './/div[@class="label_ratings"]/div[contains(@class, "noise_rating")]/@data-decibels'
        ).extract()
        metadata['fuel'] = fuel[0] if fuel else ''
        metadata['grip'] = grip[0] if grip else ''
        metadata['noise'] = noise[0] if noise else ''
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product
def parse_product(self, response):
    # Scrape a Yeomans product page; afterwards POST the page form back
    # once per size option so every variant is parsed.  meta['formpost']
    # marks re-posted requests and stops infinite recursion.
    hxs = HtmlXPathSelector(response)
    name = hxs.select('//*[@id="producttitle2"]/text()').extract()
    if name:
        loader = ProductLoader(item=Product(), response=response)
        # The product code after the "-" serves as both sku and
        # identifier.
        loader.add_xpath(
            'sku',
            'normalize-space(substring-after(//p[@class="productcode"]/text(),"-"))'
        )
        loader.add_xpath(
            'identifier',
            'normalize-space(substring-after(//p[@class="productcode"]/text(),"-"))'
        )
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        size_selected = hxs.select(
            '//select[@id="SizeDropDown"]//option[@selected]/@value'
        ).extract()
        if size_selected:
            loader.add_value('name', ' ' + size_selected[0])
        # Sometimes name already contains option name
        if ''.join(
                hxs.select(
                    '//select[@id="MainContent_SizeDropDown"]//option[@selected="selected"]/@value'
                ).extract()) not in name:
            loader.add_xpath(
                'name',
                '//select[@id="MainContent_SizeDropDown"]//option[@selected="selected"]/@value'
            )
        loader.add_xpath(
            'price',
            '//*[@id="titleSection"]/p[@class="productprice"]/text()')
        # Category: first nav link whose href appears in the current URL.
        for cat in hxs.select('//ul/li/ul/li/a'):
            if ''.join(cat.select('./@href').extract()) in response.url:
                loader.add_value('category',
                                 ''.join(cat.select('./text()').extract()))
                break
        img = hxs.select(
            '//img[@id="MainContent_prodimage"]/@src').extract()
        if img:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img[0]))
        # Known brand logos map to fixed names; otherwise derive the
        # brand from the logo file name.
        brand = ''.join(
            hxs.select(
                '//img[@id="MainContent_imgBrandLogo"]/@src').extract())
        brands = {
            '/images/brands/hi-tec.png': 'hi-tec',
            '/images/brands/progressive.png': 'progressive safety',
        }
        loader.add_value(
            'brand', brands.get(brand, brand.split('/')[-1].split('.')[0]))
        stock = hxs.select(
            '//span[@id="MainContent_RemStockLabel"]/text()').re('\d+')
        if stock:
            loader.add_value('stock', stock)
        elif hxs.select(
                '//*[@id="MainContent_RemStockLabel" and contains(@class, "remstockgreen")]'
        ):
            # Green stock label with no number: in stock, quantity
            # unknown — record 1.
            loader.add_value('stock', 1)
        else:
            loader.add_value('stock', '0')
        # Delivery is charged below a 50 price threshold.
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', '4.50')
        else:
            loader.add_value('shipping_cost', '0')
        yield loader.load_item()
    if not response.meta.get('formpost'):
        # Copy every named input's value into the form payload, then
        # re-submit once per available size.
        formdata = {}
        for input_elem in hxs.select('//input'):
            if input_elem.select('./@name') and input_elem.select(
                    './@value'):
                formdata[''.join(
                    input_elem.select('./@name').extract())] = ''.join(
                        input_elem.select('./@value').extract())
        for size in hxs.select(
                '//select[@id="SizeDropDown"]//option/@value').extract():
            # Fresh copy per request so each carries its own size value.
            formdata = dict(formdata)
            formdata['ctl00$MainContent$SizeDropDown'] = size
            self.log('Request size %s for %s' % (size, response.url))
            yield FormRequest(
                'http://www.yeomansoutdoors.co.uk/Product.aspx',
                formdata=formdata,
                dont_filter=True,
                callback=self.parse_product,
                meta={
                    'formpost': True,
                    'size': size
                })
def parse_site(self, response):
    """Follow every main-navigation link into ``parse_subcats``."""
    base = get_base_url(response)
    for href in response.xpath('//div[@class="main-nav"]//a/@href').extract():
        yield Request(urljoin_rfc(base, href), callback=self.parse_subcats)
def parse(self, response):
    # Parse one tyre search-results page for the size in meta['row'];
    # emits a Product per (non-winter) tyre and fans out one extra
    # request per brand filter to widen coverage.
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    row = response.meta['row']
    products = hxs.select('//div[contains(@class, "tyre_container")]')
    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)
        brand = product_el.select(
            'form/span[@class="tyre_brand_text"]/text()').extract()
        brand = brand[0] if brand else ''
        winter_tyre = product_el.select(
            'div[@class="tyre_type"]/div[@class="tyre_winter"]').extract()
        # skip winter tyres
        if winter_tyre:
            continue
        # Normalise brand casing against the known brand list.
        for tyre_brand in self.brands:
            if tyre_brand.upper() == brand.strip().upper():
                brand = tyre_brand
        full_name = ' '.join(
            map(
                lambda x: x.strip(),
                product_el.select(
                    'form/p[@class="tyre_details"]//text()').extract()))
        if not full_name:
            continue
        # The first two tokens are the tyre size; drop them for the name.
        loader.add_value('name', ' '.join(full_name.split()[2:]))
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        identifier = product_el.select('@id').extract()
        if identifier:
            identifier = identifier[0]
        else:
            # No identifier: log the offending search and abandon the
            # whole page.
            log.msg('Product without identifier')
            search_params = '/'.join([
                row['Aspect Ratio'], row['Rim'], row['Width'],
                row['Alt Speed']
            ])
            log.msg('Search parameters: ' + search_params)
            return
        loader.add_value('url', response.url)
        image_url = product_el.select(
            'img[contains(@class, "tyre_image")]/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        loader.add_value('identifier', identifier)
        price = ''.join(
            product_el.select(
                'div/p[@class="tyre_price"]//text()').extract())
        if not price:
            continue
        loader.add_value('price', price)
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        # Load/speed rating token, e.g. " 91V " -> load 91, speed V.
        speed = re.search('(\s\d+\w+\s)', full_name)
        speed_rating = speed.group().strip()[-1] if speed else ''
        load_rating = speed.group().strip()[:-1] if speed else ''
        metadata['speed_rating'] = speed_rating
        metadata['load_rating'] = load_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''
        xl = product_el.select(
            'div[@class="tyre_type"]/div[@class="tyre_xl"]').extract()
        metadata['xl'] = 'Yes' if xl else 'No'
        # Run-flat comes from either a page badge or the name itself.
        run_flat_found = is_run_flat(full_name)
        run_flat = product_el.select(
            'div[@class="tyre_type"]/div[@class="tyre_rf"]').extract()
        metadata[
            'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'
        metadata['manufacturer_mark'] = self._get_manufacturer_code(
            full_name)
        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'],
             metadata['load_rating'], metadata['speed_rating']))
        # EU tyre label ratings (fuel efficiency, wet grip, noise dB).
        fuel = product_el.select(
            './/div[@class="label_ratings"]/div[@class="fuel_rating"]//span[contains(@class, "label_rating_")]/text()'
        ).extract()
        grip = product_el.select(
            './/div[@class="label_ratings"]/div[@class="wet_rating"]//span[contains(@class, "label_rating_")]/text()'
        ).extract()
        noise = product_el.select(
            './/div[@class="label_ratings"]/div[contains(@class, "noise_rating")]/@data-decibels'
        ).extract()
        metadata['fuel'] = fuel[0] if fuel else ''
        metadata['grip'] = grip[0] if grip else ''
        metadata['noise'] = noise[0] if noise else ''
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product
    # One follow-up request per brand filter on this page.
    brand_filters = hxs.select(
        '//div[@class="filter-wrapper"]/div[div/input[@name="brand_filter"]]/p/text()'
    ).extract()
    for brand_filter in brand_filters:
        url = response.url.split('&')[0] + '&brand=' + brand_filter.lower()
        yield Request(url, meta=response.meta, callback=self.parse)
def parse_brand(self, response):
    # Crawl a brand listing: collect product links, handle JSON-embedded
    # product lists, pagination, "SHOP NOW" banners, and retry up to 3
    # times when a page comes back empty.
    hxs = HtmlXPathSelector(response)
    products = hxs.select("//h2[@class='product-name fn']")
    brand = response.meta['brand']
    base_url = get_base_url(response)
    if not products:
        # No products in the HTML: try SHOP ALL / SHOP NOW links, then
        # fall back to the JSON payload some responses carry.
        for url in hxs.select(
                '//a[text()="SHOP ALL" or text()="SHOP NOW"]/@href'
        ).extract():
            yield Request(urljoin(base_url, url),
                          meta=response.meta,
                          callback=self.parse_brand)
        try:
            hxs = HtmlXPathSelector(
                text=json.loads(response.body)['productList'])
            products = hxs.select("//h2[@class='product-name fn']")
        except Exception as e:
            # Best-effort: non-JSON bodies are expected here.
            print e
    for product in products:
        name = product.select("./a/text()").extract()[0]
        url = product.select("./a/@href").extract()[0]
        # De-duplicate product requests across the whole crawl.
        if not url in self.seen:
            self.seen.append(url)
            yield Request(url=url,
                          meta={
                              'name': name,
                              'brand': brand
                          },
                          callback=self.parse_item)
    if 'pageSize' in response.url:
        next_page = hxs.select(
            '//li[@class="next last-child"]/a[@class="ir"]/@href'
        ).extract()
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[0]))
    # shop now link
    shop_now_link = hxs.select(
        '//div[@class="whole-banner-brands-e"]/div[contains(@class,"circle-link")]/a/@href'
    ).extract()
    if shop_now_link:
        log.msg(shop_now_link)
        yield Request(urljoin_rfc(base_url, shop_now_link[0]))
    if not products and not shop_now_link:
        # Empty page: retry a few times before giving up.
        retries = response.meta.get('retries', 0)
        if retries < 3:
            meta = response.meta
            meta['retries'] = retries + 1
            log.msg('[{}] Retry attempt {}'.format(response.url, retries))
            yield Request(response.url,
                          callback=self.parse_brand,
                          meta=meta,
                          dont_filter=True)
    view_all = hxs.select('//a[text()="View All"]/@href')
    if view_all:
        url = urljoin_rfc(self.base_url, view_all[0].extract())
        # NOTE(review): `meta` aliases response.meta here, so passing
        # meta=response.meta below is equivalent after the mutation.
        meta = response.meta
        meta['brand'] = brand
        yield Request(url, callback=self.parse_brand, meta=response.meta)
def parse_product(self, response):
    # Parse a Harveys product page whose variant data is embedded as
    # inline JS ("Harveys.DATA.CDP.Products = {...};").  One item per
    # variant; category comes from old crawl data, the category-products
    # map, or a URL/name keyword fallback chain, in that order.
    hxs = HtmlXPathSelector(response)
    product_titles = hxs.select(
        '//div[@class="product-header"]/h2/text()').extract()
    product_urls = hxs.select(
        '//div[@data-product-id]/@class').re(r'js-product-([\w-]+)')
    # Collect the inline JS lines carrying the product JSON.
    products = []
    for l in response.body.split('\n'):
        if 'Harveys.DATA.CDP.Products' in l:
            products.append(l.strip())
    for i, product in enumerate(products):
        # Strip the "<var> = " prefix and the trailing ";".
        data = json.loads(product.split(' = ')[1][:-1])
        product_id = data['product_id']
        product_url = response.url
        for value in data['variants'].values():
            product_name = product_titles[i] + ' - ' + ' - '.join(
                value['attributes'].values())
            product_price = value['prices']['price']['value']
            variant_id = value[u'variant_id']
            product_identifier = '%s:%s' % (product_id, variant_id)
            product_url = urljoin_rfc(product_url,
                                      '#/%s' % product_urls[i])
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', product_url)
            loader.add_value('name', product_name)
            loader.add_value('identifier', product_identifier)
            loader.add_value('price', product_price)
            loader.add_value('shipping_cost', '59')
            # Category/brand/sku carried over from a previous crawl when
            # the URL is known.
            if product_url in self.old_data:
                loader.add_value('category',
                                 self.old_data[product_url]['category'])
                loader.add_value('brand',
                                 self.old_data[product_url]['brand'])
                loader.add_value('sku', self.old_data[product_url]['sku'])
            category_found = bool(loader.get_output_value('category'))
            if not category_found:
                # Second chance: the category -> URLs map.
                for category, urls in self.category_products.items():
                    if product_url in urls or product_url + '/' in urls:
                        loader.add_value('category', category.split(','))
                        category_found = True
                        break
            if not category_found:
                # Last resort: infer the category from URL/name keywords.
                if 'lily-loveseat' in product_url:
                    loader.add_value('category',
                                     ['Sofa', 'Fabric', 'armchair'])
                elif 'lean-to-shelf' in product_url:
                    loader.add_value('category', ['Cabinets', 'Bookcases'])
                elif 'bench' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Tables'])
                elif 'console-table' in product_url:
                    loader.add_value('category',
                                     ['Cabinets', 'Console Tables'])
                elif 'coffee-table' in product_url:
                    loader.add_value('category', ['Living', 'Coffee Tables'])
                elif 'nest-of-table' in product_url:
                    loader.add_value('category',
                                     ['Living', 'Nest of Tables'])
                elif '-sofa' in product_url or 'sofa' in product_name.lower():
                    if 'leather' in product_url or 'leather' in product_name.lower():
                        category = ['Sofa', 'Leather']
                    else:
                        category = ['Sofa', 'Fabric']
                    if '2-seater' in product_url:
                        category.append('2 seater')
                    elif '2.5 seater' in product_name.lower():
                        category.append('2.5 seater')
                    elif '3-seater' in product_url:
                        category.append('3 seater')
                    elif '4-seater' in product_url:
                        category.append('4 seater')
                    elif 'corner' in product_url:
                        category.append('Corner sofas')
                    elif 'recliner' in product_url:
                        category.append('Recliner sofas')
                    # Only record the category when a seater/shape
                    # subcategory matched (3 elements total).
                    if len(category) == 3:
                        loader.add_value('category', category)
                elif '-corner' in product_url:
                    if 'leather' in product_url or 'leather' in product_name.lower():
                        category = ['Sofa', 'Leather', 'Corner sofas']
                    else:
                        category = ['Sofa', 'Fabric', 'Corner sofas']
                    loader.add_value('category', category)
                elif '-recliner-chair' in product_url or (('chair' in product_name.lower() or 'seat' in product_name.lower()) and ('recliner' in product_name.lower() or ' no recline' in product_name.lower())) or 'relaxer-chair' in product_url or 'hand-facing' in product_url:
                    if 'leather' in product_url or 'leather' in product_name.lower() or 'reid-hedgemoor' in product_url:
                        category = ['Sofa', 'Leather', 'armchair']
                    else:
                        category = ['Sofa', 'Fabric', 'armchair']
                    loader.add_value('category', category)
                elif '-footstool' in product_url and not ('chair' in product_url):
                    if 'millan-' in product_url or 'leather' in product_url or 'leather' in product_name.lower():
                        loader.add_value('category',
                                         ['Sofa', 'Leather', 'Footstools'])
                    else:
                        loader.add_value('category',
                                         ['Sofa', 'Fabric', 'Footstools'])
                elif '-table' in product_url and '-chairs' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Sets'])
                elif '-dining-table' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Tables'])
                elif '-bookcase' in product_url:
                    loader.add_value('category', ['Cabinets', 'Bookcases'])
                elif '-lamp-table' in product_url:
                    loader.add_value('category', ['Living', 'Lamp Tables'])
                elif '-sideboard' in product_url:
                    loader.add_value('category', ['Cabinets', 'Sideboards'])
                elif '-display-unit' in product_url:
                    loader.add_value('category',
                                     ['Cabinets', 'Display Units'])
                elif 'tv unit' in product_name.lower():
                    loader.add_value('category',
                                     ['Cabinets', 'Entertainment units'])
                elif '-shelving-unit' in product_url:
                    loader.add_value('category',
                                     ['Cabinets', 'Display Units'])
                elif '-wine-storage' in product_url:
                    loader.add_value('category',
                                     ['Cabinets', 'Display Units'])
            self.products_collected.append(
                set_product_type(loader.load_item()))
def parse_product(self, response):
    """Parse a product page into a Product item plus ToyMonitor metadata,
    then chain a Bazaarvoice request to collect the product's reviews.

    Logs and aborts when the name, identifier or price cannot be found;
    missing image/category/sku are logged but non-fatal.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select(u'//div[@class="prod_title"]/h1/text()').extract()
    if not name:
        self.log('ERROR: no product NAME found! URL:{}'.format(response.url))
        return
    name = name[0].strip()
    loader.add_value('name', name)
    prod_id = hxs.select('//input[@name="productCode"]/@value').extract()
    if not prod_id:
        # BUGFIX: prod_id[0] used to be indexed unconditionally, raising
        # IndexError on pages without a productCode input.
        self.log('ERROR: no product IDENTIFIER found! URL:{}'.format(response.url))
        return
    loader.add_value('identifier', prod_id[0])
    loader.add_value('url', response.url)
    # BUGFIX: the old code did extract()[0].strip() BEFORE the emptiness
    # check, so a missing price raised IndexError and the intended
    # "no product PRICE found" log line could never execute.
    price = hxs.select(u'//h3[@class="prod_price"]/text()').extract()
    if not price:
        self.log('ERROR: no product PRICE found! URL:{}'.format(response.url))
        return
    loader.add_value('price', price[0].strip())
    product_image = hxs.select(u'//a[@id="imageLink"]/img/@src').extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        image = urljoin_rfc(get_base_url(response), product_image[0].strip())
        loader.add_value('image_url', image)
    # Breadcrumb trail minus the first (home) and last (product) crumbs.
    categories = hxs.select(
        u'//nav[@id="breadcrumb"]/ol/li/a/text()').extract()[1:-1]
    if not categories:
        self.log('ERROR: category not found! URL:{}'.format(response.url))
    else:
        for category in categories:
            loader.add_value('category', category.strip())
    sku = hxs.select(
        '//dl[dt/text()="Our Product Number"]/dd/text()').extract()
    if not sku:
        self.log('ERROR: no SKU found! URL:{}'.format(response.url))
    else:
        loader.add_value('sku', sku[0].strip())
    loader.add_value('brand', response.meta.get('brand', ''))
    item = loader.load_item()
    metadata = ToyMonitorMeta()
    ean = ''.join(
        hxs.select('//dl[dt/text()="Manufacturer Number"]/dd/text()').
        extract()).strip()
    if ean:
        metadata['ean'] = ean
    promo = response.xpath(
        '//div[@class="prod_details_main"]/span[@class="badge"]/img/@alt'
    ).extract()
    if promo:
        metadata['promotions'] = promo[0]
    metadata['reviews'] = []
    item['metadata'] = metadata
    # Reviews are served by Bazaarvoice, keyed on the product identifier;
    # the item travels in meta until the reviews are attached.
    reviews_url = 'http://theentertainer.ugc.bazaarvoice.com/6038-en_gb/%s/reviews.djs?format=embeddedhtml&page=1&scrollToTop=true'
    yield Request(reviews_url % item['identifier'],
                  callback=self.parse_review_page,
                  meta={'item': item})
def parse(self, response):
    """Crawl category listings for two department menus, paginate the
    results, and yield one Product per listed product.

    Large listings (> 100000 products) are split per "Type de produit"
    filter; empty listings are retried because the site intermittently
    returns no products when pages are traversed quickly.
    """
    if not isinstance(response, HtmlResponse):
        self.log('ERROR: BAD HtmlResponse!!! URL:{}'.format(response.url))
        return
    hxs = HtmlXPathSelector(response)
    # logic to find categories
    # find subcats for Outilage Jardin (garden tools department)
    categories = hxs.select(
        '//div[contains(@class,"bg_U15 menugroup") and contains(@alt,"Jardin") and contains(@alt,"Outillage")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    # find subcats for Aspirateurs (vacuum cleaners / home care department)
    categories += hxs.select(
        '//div[contains(@class,"bg_U4 menugroup") and contains(@alt,"Entretien") and contains(@alt,"maison")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    for url in categories:
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url)
    totalproducts = hxs.select('//span[@class="SearchBig"]/text()').re(
        r'(\d+)')
    # pagination ("Suivant" = next page)
    next_page = hxs.select(
        u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]/@href'
    ).extract()
    # NOTE(review): totalproducts may be an empty list here, which would make
    # totalproducts[0] raise IndexError — verify the counter span is always
    # present when a next-page link exists.
    if next_page and int(totalproducts[0]) <= 100000:
        if not 'filter_active' in response.meta:
            next_page = urljoin_rfc(get_base_url(response), next_page[0])
            yield Request(next_page, meta={
                'next_page_retry': 1,
                'dont_redirect': True
            })
        else:
            # With a filter active, pagination is driven by a form POST:
            # the next-page anchor's id, suffixed with '.OnClick', is the
            # form field that triggers the page change.
            next_page = hxs.select(
                u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]'
            )
            next_page_onclick_id = next_page.select(
                '@id').extract()[-1] + '.OnClick'
            req = FormRequest.from_response(
                response,
                formname='PageForm',
                formdata={next_page_onclick_id: u'1'},
                meta={'filter_active': True})
            req.dont_filter = True
            yield req
    if totalproducts and int(
            totalproducts[0]) > 100000 and not response.meta.get(
                'filter_active'):
        # Listing too large to paginate directly: re-request it once per
        # "Type de produit" checkbox so each filtered listing stays small.
        filters = hxs.select(
            '//div[@class="blocFilter" and contains(strong/text(), "Type de produit")]//input/@name'
        ).extract()
        req_base = FormRequest.from_response(response,
                                             formname='PageForm',
                                             meta={'filter_active': True},
                                             dont_click=True)
        for filter in filters:
            req = replace_formdata(req_base, formdata={filter: u'1'})
            req.dont_filter = True
            yield req
    products = hxs.select(
        u'//div[@id="productList"]//div[contains(@class,"plProductView")]')
    if products:
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath(
                'url', './/a[contains(@class,"plPrName")]/@href')
            product_loader.add_xpath(
                'name', './/a[contains(@class,"plPrName")]/text()')
            product_loader.add_xpath(
                'category', '//div[@class="productListTitle"]/h1/text()')
            product_loader.add_xpath(
                'image_url',
                './/div[contains(@class, "plProductImg")]//img/@data-src')
            product_loader.add_xpath('sku', './@data-sku')
            product_loader.add_xpath(
                'identifier',
                './/input[contains(@name, "ProductPostedForm.ProductId")]/@value'
            )
            # Integer part and decimals are rendered in separate nodes;
            # stitch them back into "int.dec" when decimals exist.
            price = product.select(
                u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/text()'
            ).extract()
            if price:
                decimals = product.select(
                    u'//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/sup/text()'
                ).re(u'(\d+)')
                if decimals:
                    price = price[0] + '.' + decimals[0]
                product_loader.add_value('price', price)
                product_loader.add_value('stock', 1)
            if product_loader.get_output_value(
                    'name') and product_loader.get_output_value('price'):
                identifier = product_loader.get_output_value('identifier')
                if identifier and identifier.strip():
                    yield product_loader.load_item()
                else:
                    self.log('PRODUCT WITH NO IDENTIFIER => %s' % response.url)
    else:
        # this site is buggy (it returns no products when we traverse thru the pages at random rate)
        # so this is a kind of retry code
        if 'next_page_retry' in response.meta:
            self.log('ERROR - NO PRODUCTS FOUND, retrying...')
            count = response.meta['next_page_retry']
            if count < self.RETRY_TIMES:
                self.log(
                    'ERROR - NO PRODUCTS FOUND, retry #{} url: {}'.format(
                        count, response.url))
                if not 'filter_active' in response.meta:
                    yield Request(response.url,
                                  meta={
                                      'next_page_retry': count + 1,
                                      'dont_redirect': True
                                  },
                                  dont_filter=True)
                else:
                    # TODO: FormRequest?
                    pass
            else:
                self.log(
                    'ERROR - NO PRODUCTS FOUND, retry limit reached, giving up, url: {}'
                    .format(response.url))
def parse_product(self, response): hxs = HtmlXPathSelector(response) base_url = get_base_url(response) product_loader = ProductLoader(item=Product(), selector=hxs) product_loader.add_value('url', response.url) name = hxs.select('//div[@class="catBanner"]/h2/text()').extract()[0] price = hxs.select( '//span[@id="variant-price-header"]/text()').extract() if price: price = extract_price(price[0]) else: return sku = hxs.select('//div[@class="prod"]/p[@class="code"]').re( "Code: ([0-9]+)")[0] brand = hxs.select( '//td[@class="attrib" and text()="Manufacturer"]/following-sibling::td/text()' ).extract() product_loader.add_value('sku', sku) category = " ".join( hxs.select('//div[@id="breadcrumb"]/ul/li/a/text()').extract() [2:-1])[2:] product_loader.add_value('category', category) product_loader.add_value('brand', brand) image_url = hxs.select( '//div[@id="primary_image"]/a/img/@src').extract() if image_url: product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0])) identifier = hxs.select( '//input[@name="productCodePost"]/@value').extract() product = product_loader.load_item() variants = hxs.select('//select[@id="variant"]/option') if variants: for option in variants: value = option.select('./@value').extract() if value: variant = parse_variant(value[0]) title = option.select('./text()').extract()[0] price = extract_price(variant.get('price', "0")) subid = variant.get('code') if subid: prod = Product(product) prod['identifier'] = "%s_%s" % (identifier[0], subid) prod['price'] = price subname = title.split(u"£") if subname: subname = subname[0].strip().replace(u"\xa0", " ") if subname.endswith(","): subname = subname[:-1] prod['name'] = "%s %s" % (name, subname) yield prod else: # one option product prod = Product(product) prod['name'] = name o = hxs.select( '//div[@class="options_not_available"]/text()').extract() if o: prod['name'] += ' ' + o[0].strip() prod['identifier'] = identifier[0] prod['price'] = price yield prod
def parse(self, response):
    """Category listing parser.

    Follows top-nav and sidebar sub-category links, normalises paging to
    48 products per page, validates the detected geo-location, extracts
    products from schema.org microdata, chains an AJAX options request
    per product, and follows pagination.
    """
    base_url = get_base_url(response)
    # Skip the first main-nav entry (hence the [1:] slice).
    categories = response.xpath('//ul[@class="main-nav"]/li/a/@href').extract()[1:]
    for url in categories:
        yield Request(urljoin_rfc(base_url, url), cookies=self.additional_cookies)
    sub_categories = response.xpath('//div[@class="sidenav-title" and span/text()="Browse Categories"]'
                                    '/following-sibling::div[@class="inner"]//a/@href').extract()
    for url in sub_categories:
        yield Request(urljoin_rfc(base_url, url), cookies=self.additional_cookies)
    per_page = set(response.xpath('//div[contains(@class, "showing-per-page")]//option/@value').extract())
    if per_page:
        # Force 48 products per page (restarting at page 0) before doing
        # any parsing; the re-request replaces this response entirely.
        per_page_param = url_query_parameter(response.url, 'productsPerPage')
        if per_page_param != '48':
            url = add_or_replace_parameter(response.url, 'productsPerPage', '48')
            url = add_or_replace_parameter(url, 'page', '0')
            yield Request(url, cookies=self.additional_cookies)
            return
    # Check for valid location
    is_valid, country_detected = self._is_valid_location(response)
    if not is_valid:
        reason = 'Wrong country detected: %s' % country_detected
        new_request = self._retry_request(response, self.parse, reason)
        if new_request:
            yield new_request
        return
    # Parse products
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if data:
        product_ids = response.xpath('//div[@itemtype="http://schema.org/Product"]/@data-id').extract()
        product_urls = map(lambda u: urljoin_rfc(base_url, u),
                           response.xpath('//div[@itemtype="http://schema.org/Product"]'
                                          '/div[@class="product-info"]/div[@class="title"]/a/@href').extract())
        product_imgs = map(lambda u: urljoin_rfc(base_url, u),
                           response.xpath('//div[@itemtype="http://schema.org/Product"]//a[@class="product-image"]'
                                          '//img[@class="product-image-file"]/@src').extract())
        # RRP (original) prices keyed by product id, where present.
        rrp_prices = {}
        for product_id in product_ids:
            rrp_price = response.xpath('//div[@data-id="%s"]//div/@data-tc-original-price' % product_id).extract()
            if rrp_price:
                rrp_prices[product_id] = rrp_price[0]
        # url/image per product id, passed along to the options callback.
        products_extra_data = {}
        for product_id, product_url, product_img \
                in zip(product_ids, product_urls, product_imgs):
            products_extra_data[product_id] = {
                'url': product_url,
                'image_url': product_img,
            }
        category = ''
        categories = filter(lambda item: item['type'] == 'http://data-vocabulary.org/Breadcrumb', data['items'])
        if categories:
            category = categories[0]['properties']['title'][1]
        # Brand names scraped from the brand filter labels ("Brand (N)").
        brands = set(response.xpath('//div[@class="filter-brand-wrapper"]'
                                    '//label[contains(@for, "product-listings__filter-top-brands-")]/a[@disabled]/text()')\
                     .re(r'(.*) \('))
        products = filter(lambda item: item.get('type', '') == 'http://schema.org/Product', data['items'])
        for product in products:
            product_id = product['properties']['productId']
            ajax_url = self.AJAX_URL % product_id
            headers = {'X-Requested-With': 'XMLHttpRequest'}
            req = Request(ajax_url, headers=headers, callback=self.parse_options,
                          meta={'main_product': product['properties'],
                                'category': category,
                                'products_extra': products_extra_data,
                                'brands': brands,
                                'rrp_prices': rrp_prices,
                                'proxy': response.meta.get('proxy'),
                                'proxy_service_disabled': True},
                          cookies=self.additional_cookies)
            yield req
    # Check for next page and follow this if exists
    next_page = response.xpath('//li[@class="next"]/a/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                      cookies=self.additional_cookies)
def parse(self, response): hxs = HtmlXPathSelector(response) for url in hxs.select('//ul[@id="MainMenu"]//a/@href').extract(): url = urljoin_rfc(get_base_url(response), url) yield Request(url, callback=self.parse_product_list)
def parse(self, response): hxs = HtmlXPathSelector(response) categories = hxs.select('//td[@class="produkt_menu"]/div/table/tr/td/a/@href').extract() for category in categories: url = urljoin_rfc(get_base_url(response), category) yield Request(url, callback=self.parse_products)
def parse_product(self, response):
    """Parse a product page, following colour variants and bundle items,
    and yield one item per product type — or a single item when the page
    has no multi-product-type section.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Each colour variant is its own product page; crawl them too.
    colour_options = hxs.select(
        '//ul[@class="selection-grid"]/li/a/@href').extract()
    for colour_option in colour_options:
        yield Request(urljoin_rfc(base_url, colour_option),
                      callback=self.parse_product)
    name = hxs.select(
        'normalize-space(//*[@itemprop="name"]/text())').extract()[0]
    ext_name = ''.join(
        hxs.select('//h1[@id="prod-title"]/text()').extract()).strip().replace(
            u'\xa0', ' ')
    name = name + ' ' + ext_name if ext_name else name
    brand = hxs.select(
        'normalize-space(//*[@itemprop="brand"]/span/text())').extract()
    try:
        image_url = urljoin_rfc(
            base_url,
            hxs.select('//div[@id="prod-media-player"]'
                       '//img/@src').extract()[0].strip())
    except IndexError:
        image_url = ''
    # Bundle ("set") pages list their component items; crawl those instead
    # of emitting the bundle itself.
    set_items = hxs.select('//div[@class="item-details"]/a/@href').extract()
    if set_items:
        for item in set_items:
            yield Request(urljoin_rfc(base_url, item), self.parse_product)
        return
    options = hxs.select('//div[@id="prod-multi-product-types"]')
    if options:
        products = options.select('.//div[@class="product-type"]')
        for product in products:
            opt_name = product.select('.//h3/text()').extract()[0].strip()
            try:
                # BUGFIX: this XPath was absolute ('//div[...]'), so every
                # option read the FIRST stock node of the whole page instead
                # of its own; made it relative to this product block.
                stock = product.select(
                    './/div[contains(@class, "mod-stock-availability")]'
                    '//p/strong/text()').re(r'\d+')[0]
            except IndexError:
                stock = 0
            loader = ProductLoader(item=Product(), selector=product)
            sku = product.select(
                './/div[contains(@class, "mod-product-code")]/p/text()'
            ).extract()
            if sku:
                loader.add_value('sku', sku[0].strip())
            loader.add_xpath(
                'identifier',
                './/div[contains(@class, "mod-product-code")]/p/text()')
            loader.add_value('name', '%s %s' % (name, opt_name))
            loader.add_xpath(
                'category',
                '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('url', response.url)
            loader.add_xpath('price', './/p[@class="price"]/strong/text()')
            loader.add_value('stock', stock)
            yield loader.load_item()
    else:
        price = ''.join(
            hxs.select('//ul/li/strong[@class="price"]/text()').extract()
        ).strip()
        if not price:
            # NOTE: the fallbacks yield a list of tokens (str.split());
            # the loader accepts either a string or a list here.
            price = ''.join(
                hxs.select('//div[@id="prod-price"]//strong/text()').
                extract()).split()
        if not price:
            price = ''.join(
                hxs.select('//span[@class="now-price"]/text()').
                extract()).split()
        stock = hxs.select(
            '//div[contains(@class, "mod-stock-availability")]/p[not(contains(@class, "hidden"))]//strong/text()'
        ).extract()
        stock = stock[0].strip() if stock else ''
        loader = ProductLoader(item=Product(), response=response)
        sku = hxs.select(
            '//div[@id="prod-product-code"]/p/text()').extract()
        if not sku:
            sku = ''
        loader.add_xpath('identifier',
                         '//div[@id="prod-product-code"]/p/text()')
        loader.add_value('name', name)
        loader.add_xpath(
            'category',
            '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        loader.add_value('sku', sku)
        if 'OUT OF STOCK' in stock.upper():
            loader.add_value('stock', 0)
        else:
            # When the text carries a count (e.g. "Only 3 left"), use it;
            # a plain "IN STOCK" message sets no explicit stock level.
            stock_value = extract_price(stock)
            if stock_value > 0 and 'IN STOCK' not in stock:
                loader.add_value('stock', stock_value)
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', 3)
        yield loader.load_item()
def _urljoin(self, response, url): """Helper to convert relative urls to absolute""" return urljoin_rfc(response.url, url, response.encoding)
def parse_product(self, response):
    """Parse a product page.

    Single-option products are re-priced via a 'products.options' AJAX
    POST (handled by reload_price); multi-option products yield one item
    per option, adding the option surcharge to the base price, with the
    identifier resolved later by parse_identifier.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select(u'//h1[@class="mainbox-title"]/text()')[0].extract()
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    price = hxs.select(
        u'//div[@id="product_info"]//span[@class="price"]/span[@class="price" and @id]/text()'
    )
    if not price:
        price = hxs.select(
            u'//*[@itemprop="price"]/span[@class="price" and @id]/text()')
    # Strip thousands separators before Decimal arithmetic below.
    price = price[0].extract().replace(',', '')
    loader.add_value('price', price)
    image_url = hxs.select(
        u'//a[contains(text(),"View larger image")]/@href')
    if image_url:
        image_url = urljoin_rfc(get_base_url(response),
                                image_url[0].extract())
        loader.add_value('image_url', image_url)
    category = hxs.select(
        u'//div[@class="breadcrumbs"]/a[1]/following-sibling::a[1]/text()'
    ).extract()
    if category:
        loader.add_value('category', category[0])
    sku = hxs.select(
        u'//div[@class="product-main-info" or @id="product_info"]//p[@class="sku"]//span[starts-with(@id,"product_code")]/text()'
    )
    # "N/A" SKUs are ignored on purpose.
    if sku and sku[0].extract().lower() != 'n/a':
        sku = sku[0].extract().lower()
        loader.add_value('sku', sku)
    loader.add_xpath('identifier',
                     '//input[contains(@name, "product_id")]/@value')
    options = hxs.select(u'//div[starts-with(@id,"opt_")]//select/option')
    select_name = hxs.select(
        u'//div[starts-with(@id,"opt_")]//select/@name').extract()
    if len(options) == 1:
        # Exactly one option: fetch the detailed price from the cart
        # endpoint before emitting; the loader travels in meta.
        formdata = {
            'additional_info[get_detailed]': '1',
            'additional_info[get_discounts]': '1',
            'additional_info[get_features]': '',
            'additional_info[get_icon]': '1',
            'additional_info[get_options]': '1',
            'additional_info[info_type]': 'D',
            'appearance[but_role]': 'action',
            'appearance[capture_options_vs_qty]': '',
            'appearance[details_page]': '1',
            'appearance[separate_buttons]': '',
            'appearance[show_add_to_cart]': '1',
            'appearance[show_list_buttons]': '1',
            'appearance[show_price]': '1',
            'appearance[show_price_values]': '1',
            'appearance[show_product_amount]': '1',
            'appearance[show_product_options]': '1',
            'appearance[show_qty]': '1',
            'appearance[show_sku]': '1',
            'dispatch': 'products.options',
            select_name[0]: options[0].select(u'./@value').extract()[0]
        }
        yield FormRequest('http://www.eglobaldigitalstore.co.uk/index.php',
                          formdata=formdata,
                          meta={'loader': loader},
                          callback=self.reload_price,
                          dont_filter=True)
        return
    else:
        out_stock = hxs.select('//span[contains(@class, "out-of-stock")]')
        if out_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
    for option in options:
        option_text = option.select(u'./text()')[0].extract()
        opt_value = option.select(u'./@value').extract()[0]
        if not opt_value:
            continue
        loader = ProductLoader(item=Product(), selector=hxs)
        # Option text may carry a surcharge like "Name (+£12.34)".
        res = re.search('(.*?) \(\+\xa3([\d\.,]+)\)', option_text)
        if res:
            option_name, option_price = res.groups()
        else:
            option_name = re.search('(.*)', option_text).groups()[0]
            option_price = u'0.00'
        loader.add_value('name', u'%s %s' % (name, option_name))
        loader.add_value('url', response.url)
        if category:
            loader.add_value('category', category[0])
        loader.add_value('price',
                         str(Decimal(price) + Decimal(option_price)))
        if image_url:
            loader.add_value('image_url', image_url)
        formdata = {
            'additional_info[get_detailed]': '1',
            'additional_info[get_discounts]': '1',
            'additional_info[get_features]': '',
            'additional_info[get_icon]': '1',
            'additional_info[get_options]': '1',
            'additional_info[info_type]': 'D',
            'appearance[but_role]': 'action',
            'appearance[capture_options_vs_qty]': '',
            'appearance[details_page]': '1',
            'appearance[separate_buttons]': '',
            'appearance[show_add_to_cart]': '1',
            'appearance[show_list_buttons]': '1',
            'appearance[show_price]': '1',
            'appearance[show_price_values]': '1',
            'appearance[show_product_amount]': '1',
            'appearance[show_product_options]': '1',
            'appearance[show_qty]': '1',
            'appearance[show_sku]': '1',
            'dispatch': 'products.options',
            select_name[0]: opt_value
        }
        yield FormRequest('http://www.eglobaldigitalstore.co.uk/index.php',
                          formdata=formdata,
                          meta={
                              'loader': loader,
                              'opt_value': opt_value
                          },
                          callback=self.parse_identifier,
                          dont_filter=True)
def basePageParse(self,response): url = response.css(".dep-nav > li > a::attr(href)")[1].extract() url = urljoin_rfc(get_base_url(response), url) yield scrapy.Request(url,self.xiangXiJieShaoParse)
def _get_prices(self, price_response):
    """Recover per-variant prices from the site's pricing JavaScript and
    yield one Product per priced variant.

    The response body contains JS assignments into eBzp (variant ids) and
    eBzpp (prices); they are replayed with Python 2 `exec` after swapping
    the site's JS helper tokens for plain string literals.
    NOTE(review): exec on scraped content is dangerous — the regexes
    constrain the statements, but verify this is acceptable.
    """
    # Comma-separated ids the site flags as having no price.
    unpriced = re.search('eBunpriced="(.*)"', price_response.body)
    if unpriced:
        unpriced = [
            prod_id for prod_id in unpriced.groups()[0].split(',')
            if prod_id.strip()
        ]
    eBzp = [None] * 200
    eBzpp = [None] * 200
    eBzp_assignments = re.findall('(eBzp\[\d+\]=.*);', price_response.body)
    for assignment in eBzp_assignments:
        exec assignment.replace('eBop', "''").replace('eBspl', "'&'").replace(
            'eBsp', "'&'")
    eBzpp_assignments = re.findall('(eBzpp\[\d+\]=.*);',
                                   price_response.body)
    for assignment in eBzpp_assignments:
        exec assignment.replace('eBop', "'&'").replace('eBspl', "'&'").replace(
            'eBsp', "'&'")
    # Map variant id -> price using the parallel arrays.
    prices = {}
    for i, prod in enumerate(eBzp):
        if prod:
            prices[prod] = eBzpp[i]
    # The selector of the original product page travels in meta.
    hxs = price_response.meta['hxs']
    main_name = hxs.select('//h1/text()').extract()[0].strip()
    image_url = hxs.select('//div[@id="gallery1"]/a/img/@src').extract()
    image_url = urljoin_rfc('http://www.petertyson.co.uk/',
                            image_url[0]) if image_url else ''
    category = price_response.meta['category']
    products = hxs.select('//form[@id="eBvariant1"]//option')
    subprods = hxs.select(
        '//div[@id="TabbedPanels1"]//em/strong[contains(text(), "//")]/text()'
    ).extract()
    '''
    if not products and subprods:
        subprods = subprods[0].split('//')
        for prod in subprods:
            r = prod.split(':')
            if len(r) == 2:
                p = Product()
                loader = ProductLoader(response=price_response.meta['main_response'], item=p)
                loader.add_value('name', main_name + ' ' + r[0].strip())
                loader.add_value('price', r[1])
                loader.add_value('image_url', image_url)
                loader.add_value('category', category)
                loader.add_value('url', price_response.meta['main_response'].url)
                yield loader.load_item()
        return
    '''
    # No variant <select>: emit the single product with its direct price.
    if not products and prices:
        product_id = hxs.select(
            "//form[@class='eBbuybutton']/input[@name='product']/@value"
        ).extract()
        if product_id:
            price = prices.get(product_id[0]) or eBzpp[0]
            p = Product()
            loader = ProductLoader(
                response=price_response.meta['main_response'], item=p)
            loader.add_value('name', main_name)
            loader.add_value('price', price)
            loader.add_value('identifier', product_id)
            loader.add_value('sku', product_id)
            loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            loader.add_value('url',
                             price_response.meta['main_response'].url)
            yield loader.load_item()
    for product in products:
        # Option @value is a comma-separated list: a single id, or a
        # leading group token followed by sub-variant ids.
        subprods = product.select('./@value').extract()[0].split(',')
        if len(subprods) == 1 and subprods[0] in prices and subprods[
                0] not in unpriced:
            p = Product()
            loader = ProductLoader(
                response=price_response.meta['main_response'], item=p)
            subname = product.select('./text()').extract()[0].strip()
            identifier = subprods[0]
            loader.add_value('name', main_name + ' ' + subname)
            loader.add_value('price', prices[subprods[0]])
            loader.add_value('identifier', identifier)
            loader.add_value('image_url', image_url)
            loader.add_value('sku', identifier)
            loader.add_value('category', category)
            loader.add_value('url',
                             price_response.meta['main_response'].url)
            yield loader.load_item()
        elif len(subprods) > 1:
            subprods = subprods[1:]
            for i, subprod in enumerate(subprods):
                if subprod in prices and subprod not in unpriced:
                    p = Product()
                    loader = ProductLoader(
                        response=price_response.meta['main_response'],
                        item=p)
                    loader.add_value(
                        'url', price_response.meta['main_response'].url)
                    first_subname = product.select(
                        './text()').extract()[0].strip()
                    # NOTE(review): subprods[i - 1] wraps to the LAST entry
                    # when i == 0 — confirm this offset is intentional.
                    subname = subprods[i - 1].strip()
                    loader.add_value(
                        'name',
                        unquote(main_name + ' ' + first_subname + ' ' +
                                subname))
                    loader.add_value('price', prices[subprod])
                    loader.add_value('image_url', image_url)
                    loader.add_value('category', category)
                    loader.add_value('sku', subprod)
                    loader.add_value('identifier', subprod)
                    yield loader.load_item()
'''
def parse(self,response): sel = Selector(response) user_id = response.url.split('/')[-1] user_url = urljoin_rfc('http://www.medhelp.org/friendships/list/',user_id) print user_url
def parse_product(self, response):
    """Parse a tyre product page into a Product with MicheldeverMeta.

    Missing identifier or brand triggers a retry request; unparsable
    names are logged and dropped.
    """
    loader = ProductLoader(item=Product(), response=response)
    # the full name of the tyre (name variable) is used to extract metadata (i.e. run flat, xl),
    # the pattern should be set as the product's name
    fitting_method = 'Delivered'
    loader.add_value('url', response.url)
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_url[0]))
    identifier = response.xpath('//form[@name="form1"]/@action').extract()
    if not identifier:
        yield self.retry_request(response)
        return
    identifier = identifier[0]
    # Keep identifiers stable across runs: strip or restore the leading
    # './' depending on which spelling self.identifiers already holds.
    if identifier.startswith('./') and identifier not in self.identifiers:
        identifier = identifier[2:]
    elif u'./' + identifier in self.identifiers:
        identifier = u'./' + identifier
    loader.add_value('identifier', identifier)
    price = response.xpath(
        '//*[@class="price"]/*[@class="mainPrice"]/text()')[0].extract()
    loader.add_value('price', price)
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)
    brand = response.xpath(
        '//div[@class="hidden"]/input[@class="producerName"]/@value'
    ).extract()
    if not brand:
        yield self.retry_request(response)
        return
    brand = brand[0].strip()
    loader.add_value('brand', unify_brand(brand))
    loader.add_value('category',
                     find_brand_segment(loader.get_output_value('brand')))
    # Normalise Polish e-with-ogonek so the brand string matches the name.
    brand = re.sub(u'\u0119', u'e', brand)
    product_name = response.xpath(
        '//h1[@itemprop="name"]/text()')[0].extract().strip()
    # Drop colons and trademark signs, then remove the brand from the name.
    product_name = re.sub(u'[:\u2122]', u'', product_name)
    product_name = product_name.replace(brand, '').strip()
    data = parse_pattern(product_name)
    if not data:
        self.log('ERROR parsing "{}" [{}]'.format(product_name,
                                                  response.url))
        return
    loader.add_value('name', data['Name'])
    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = data['Aspect_Ratio']
    metadata['rim'] = data['Rim']
    metadata['speed_rating'] = data['Speed_Rating']
    metadata['width'] = data['Width']
    metadata['fitting_method'] = fitting_method
    metadata['load_rating'] = data['Load_Rating'] or ''
    metadata['alternative_speed_rating'] = ''
    xl = 'XL' in product_name
    metadata['xl'] = 'Yes' if xl else 'No'
    run_flat_found = is_run_flat(product_name)
    run_flat = 'run on flat' in product_name.lower(
    ) or 'run flat' in product_name.lower() or run_flat_found
    metadata['run_flat'] = 'Yes' if run_flat else 'No'
    # Manufacturer marks are matched as whole space-separated tokens.
    manufacturer_mark = [
        mark for mark in self.all_man_marks.keys()
        if mark in product_name.split(' ')
    ]
    manufacturer_mark = manufacturer_mark[0].strip(
    ) if manufacturer_mark else []
    metadata['manufacturer_mark'] = self.all_man_marks.get(
        manufacturer_mark, '') if manufacturer_mark else ''
    metadata['full_tyre_size'] = '/'.join(
        (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
         metadata['load_rating'], metadata['speed_rating']))
    # metadata['alternative_speed_rating']))
    # Label values are used only when exactly three are present —
    # presumably fuel/grip/noise in that order (variable names suggest
    # an EU tyre label); TODO confirm against a live page.
    label_info = map(
        unicode.strip,
        response.xpath(
            '//div[@class="labelInfo"]/div[@class="labelIco"]/span[contains(@class, "paramValue")]/text()'
        ).extract())[:3]
    metadata['fuel'] = label_info[0] if label_info and len(
        label_info) == 3 else ''
    metadata['grip'] = label_info[1] if label_info and len(
        label_info) == 3 else ''
    metadata['noise'] = label_info[2] if label_info and len(
        label_info) == 3 else ''
    product = loader.load_item()
    product['metadata'] = metadata
    if not is_product_correct(product):
        return
    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)
    yield product
def parse(self,response): sel = Selector(response) user_id = sel.xpath('//div[@class="user_info"]/div/span/a/@href').extract()[0].split('/')[-1] user_url = urljoin_rfc('http://www.medhelp.org/notes/list/',user_id) print user_url
def parse_product(self, response):
    """Parse a product page, emit the base product, and expand every
    <select> option into its own item with adjusted identifier, name,
    price (surcharge added) and stock.

    Deduplicates on identifier via self.id_seen; in simple_run mode only
    identifiers in self.matched_identifiers are emitted.
    """
    if not isinstance(response, HtmlResponse):
        return
    identifier = response.xpath(
        '//form[@id="pdAddToCart"]//input[@name="product"]/@value'
    ).extract()
    if not identifier:
        return
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)
    # Normalize URL
    product_url = url_query_cleaner(response.url,
                                    parameterlist=('content', 'product'),
                                    sep=';')
    loader.add_value('url', product_url)
    loader.add_value('identifier', identifier[0])
    sku = response.xpath(
        '//td[text()="Item Code:"]/following-sibling::td[1]/text()'
    ).extract()
    if sku:
        loader.add_value('sku', sku[0])
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    price = response.xpath('//span[@itemprop="price"]/text()').extract()
    if price:
        # Continental number format ("1.234,56"): drop thousands dots,
        # turn the decimal comma into a dot, then convert to pounds.
        price = extract_price(price[0].strip().replace('.', '').replace(
            ',', '.'))
        loader.add_value('price', self.convert_to_pounds(str(price)))
    else:
        loader.add_value('price', '0.0')
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    brand = response.xpath(
        '//td[text()="Manufacturer:"]/following-sibling::td[1]/text()'
    ).extract()
    if brand:
        loader.add_value('brand', brand[0])
    category = response.xpath(
        '//main//span[@class="text-title"]/text()').extract()
    if category:
        loader.add_value('category', category[0].split(':')[0].strip())
    availability = response.xpath(
        '//div[@class="pd-availability"]/span[contains(text(),"Delivery")]/text()'
    ).extract()
    if availability and 'unknown' in availability[0].lower():
        loader.add_value('stock', 0)
    product = loader.load_item()
    options = response.xpath('//form[@id="pdAddToCart"]//select')
    if not options:
        if not (getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
                and (product['identifier'] not in self.matched_identifiers)):
            if not product['identifier'] in self.id_seen:
                self.id_seen.append(product['identifier'])
                yield product
        return
    for sel in options:
        opt = ''
        select_name = sel.xpath('@name').extract()
        if select_name:
            opt = select_name[0].replace('opt_', '')
        # Value "-2" is the placeholder option and is skipped.
        for option in sel.xpath('option[@value!="-2"]'):
            item = Product(product)
            opt_id = option.xpath('@value').extract()
            if opt_id:
                item['identifier'] += '-' + opt + '-' + opt_id[0]
            item['stock'] = 1
            # BUGFIX: the old check compared a SelectorList directly to
            # '100' (always False), so out-of-stock options were never
            # flagged; extract the attribute value before comparing.
            data_av = option.xpath('@data-av').extract()
            if data_av and data_av[0] == '100':
                item['stock'] = 0
            opt_name = option.xpath('text()').extract()
            if opt_name:
                item['name'] += ' - ' + opt_name[0]
            opt_surcharge = option.xpath('@data-surcharge').extract()
            if opt_surcharge:
                item['price'] += extract_price(opt_surcharge[0])
            if getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
                    and (item['identifier'] not in self.matched_identifiers):
                continue
            if not item['identifier'] in self.id_seen:
                self.id_seen.append(item['identifier'])
                yield item