def parse_product(self, response):
    browser = PhantomJS()
    self.log('>>> BROWSER: GET => %s' % response.url)
    browser.get(response.url)
    self.log('>>> BROWSER: OK!')
    hxs = HtmlXPathSelector(text=browser.driver.page_source)
    browser.close()
    self.log('>>> BROWSER: Closed')

    sku = hxs.select(u'//*[@class="displaySkuCode"]//text()').extract()
    sku = sku[0].replace('#', '')

    # Note: add_xpath() runs against the raw response body, while `hxs`
    # holds the browser-rendered source.
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath(
        'name', u'//div[contains(@class,"title")]//h1/text()')
    product_loader.add_value('sku', sku)
    product_loader.add_xpath(
        'category', u'//ul[contains(@class, "pd-breadcrumbs")]/li[2]/a/text()')
    product_loader.add_value('identifier', sku)
    price = hxs.select(
        u'//div[contains(@class, "product-price__reg-price")]/text()'
    ).extract()
    product_loader.add_value('price', price[0].replace('Reg.', ''))
    product_loader.add_value('brand', response.meta['brand'].lower())
    product_loader.add_value('url', response.url)
    image_url = hxs.select(
        u'/html/head/link[@rel="image_src"]/@href').extract()
    if image_url:
        product_loader.add_value('image_url', image_url[0])

    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    # Lowercase the name before comparing; `brand` was lowercased above,
    # so the original case-sensitive check could never match a title-case name.
    brand = response.meta['brand'].lower()
    if brand not in product['name'].lower() and brand not in response.body.lower():
        return

    # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
    # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
    # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
    part1 = hxs.select(
        u'//script[starts-with(@src,"http://canadiantire.ugc.bazaarvoice.com/static/")]/@src'
    ).extract()[0].split('/')[-2]
    part2 = hxs.select(
        '//div[@id="bazaarVoiceConfig"]/@data-product-code').extract()[0]
    yield Request(
        'http://canadiantire.ugc.bazaarvoice.com/%s/%s/reviews.djs?format=embeddedhtml'
        % (part1, part2),
        meta=response.meta,
        callback=self.parse_review_js)
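# Every snippet in this section drives pages through a small PhantomJS
# wrapper that exposes the underlying selenium driver as `.driver` and
# proxies get()/close(). The wrapper itself is not shown anywhere in this
# section; the sketch below is an assumption about its shape, not the
# project's actual code.
from selenium import webdriver

class PhantomJS(object):
    """Minimal hypothetical wrapper matching the usage seen above."""

    def __init__(self):
        self.driver = webdriver.PhantomJS()

    def get(self, url):
        self.driver.get(url)

    def close(self):
        self.driver.quit()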
def start_requests(self):
    browser = PhantomJS()
    url = 'http://www.nisbets.co.uk/Homepage.action'
    self.log('>>> BROWSER: GET => %s' % url)
    browser.get(url)
    self.log('>>> BROWSER: OK')
    time.sleep(120)
    page_source = browser.driver.page_source
    browser.close()
    for req in self.parse(url, page_source):
        yield req
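# The parse(url, page_source) callee is not shown in this section. Judging
# from the other spiders here, it would wrap the rendered source in a
# selector and yield follow-up requests. This sketch is an assumption; the
# XPath and callback name are illustrative, not taken from the real spider.
def parse(self, url, page_source):
    hxs = HtmlXPathSelector(text=page_source)
    for href in hxs.select('//div[@id="nav"]//a/@href').extract():
        yield Request(urljoin_rfc(url, href), callback=self.parse_category)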
def parse(self, response):
    base_url = get_base_url(response)
    browser = PhantomJS()
    browser.get(response.url)
    hxs = HtmlXPathSelector(text=browser.driver.page_source)
    browser.close()
    categories = hxs.select('//div[@id="nav-full"]//a')
    for category in categories:
        url = category.select('./@href').extract()
        if url:
            # Copy the meta dict per request: the original aliased
            # response.meta, so every yielded Request shared one dict and
            # ended up carrying whichever category was written last.
            meta = dict(response.meta)
            category_name = category.select('./span/text()').extract()
            meta['category'] = category_name[0] if category_name else ''
            yield Request(urljoin_rfc(base_url, url[0]), meta=meta,
                          callback=self.parse_pagination)
def parse(self, response):
    browser = PhantomJS()
    url = self.start_urls[0]
    self.log('>>> BROWSER: GET => %s' % url)
    browser.get(url)
    self.log('>>> BROWSER: OK')
    time.sleep(120)
    browser.driver.find_element_by_xpath(
        '//p[@class="style-inc"]//input').click()
    time.sleep(30)
    page_source = browser.driver.page_source
    browser.close()
    hxs = HtmlXPathSelector(text=page_source)
    for cat in hxs.select('//ul[@class="clear-after"]/li/ul/li/a'):
        yield Request(
            urljoin_rfc(url, cat.select('./@href').extract()[0]),
            callback=self.parse_cat,
            meta={'category': cat.select('./text()').extract()[0]})
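# The fixed time.sleep(120)/time.sleep(30) pauses above are the simplest
# approach but waste time when the page renders early. A possible refinement
# (an assumption, not part of the original spider) is selenium's explicit
# wait, which polls until the target element is clickable:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_when_ready(driver, xpath, timeout=120):
    # Block until the element is present and clickable, then click it.
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    element.click()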
def parse(self, response):
    # get the latest link
    client = imaplib.IMAP4_SSL('imap.gmail.com', 993)
    client.login('totalfeedcompetitormonitor', 'uyWTStB6')
    client.select('INBOX')
    # Search all UIDs, then walk them newest-first
    mails = client.uid('search', 'ALL')[1][0].split()[::-1]
    for mail_uid in mails:
        mail = client.uid('fetch', mail_uid, '(RFC822)')
        mail = email.message_from_string(mail[1][0][1])
        subject = email.header.decode_header(mail['Subject'])[0][0]
        if 'Nouveau message' not in subject:
            continue
        body = ' '.join([m.get_payload() for m in mail.get_payload()])
        url = re.search('(http.*?DownloadToken.*)',
                        body).group(1).replace('\r', '')
        break

    browser = PhantomJS()
    # url = 'https://poseidon.hubtotal.net/zephyr/DownloadToken.jsp?token=iQ4rBu6SBKEB8KdOLpeO0JplfDhqJPqiIgOQrjsfuKedCnYC'
    self.log('>>> BROWSER: GET => %s' % url)
    browser.get(url)
    self.log('>>> BROWSER: OK')
    time.sleep(180)
    page_source = browser.driver.page_source
    browser.close()

    token = urlparse.parse_qs(urlparse.urlparse(url).query)['token'][0]
    hxs = HtmlXPathSelector(text=page_source)
    link_id = hxs.select('//h3[@class="unit-name"]/a/@id').re('file_(.*)')
    download_link = 'https://poseidon.hubtotal.net/zephyr/MFTWebAppDownloadToken/Download?file={}&token={}'.format(
        link_id[0], token)
    yield Request(download_link, callback=self.parse_feed)
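# The body join above assumes every MIME part is plain text. If the mail
# arrives base64- or quoted-printable-encoded, get_payload() without
# arguments returns the raw transfer encoding and the URL regex would miss.
# A hedged helper (hypothetical, not in the original spider):
def mail_body(msg):
    # Flatten a (possibly multipart) message into one decoded text blob.
    parts = msg.get_payload() if msg.is_multipart() else [msg]
    return ' '.join(part.get_payload(decode=True) or '' for part in parts)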
class JDSportsSpider(BaseSpider):
    name = u'jdsports.co.uk'
    allowed_domains = ['www.jdsports.co.uk']
    start_urls1 = [
        'http://www.jdsports.co.uk/men/mens-footwear/brand/nike/',
        'http://www.jdsports.co.uk/women/womens-footwear/brand/nike/'
    ]
    start_urls2 = [
        'http://www.jdsports.co.uk/featured/kids+nike+footwear?pageSize=9999',
        'http://www.jdsports.co.uk/search/nike-skateboarding?pageSize=9999'
    ]

    def __init__(self, *args, **kwargs):
        super(JDSportsSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Browser
        self.log('>>> BROWSER: Open browser')
        self._browser = PhantomJS()
        self.log('>>> BROWSER: OK')

    def spider_closed(self, spider):
        # Browser close
        self.log('>>> BROWSER: close')
        self._browser.close()
        self.log('>>> BROWSER: OK')

    def start_requests(self):
        product_urls = []
        for url in self.start_urls1:
            self.log('>>> BROWSER: GET => %s' % url)
            self._browser.get(url)
            self.log('>>> BROWSER: OK')
            find_more = True
            while find_more:
                hxs = HtmlXPathSelector(
                    text=self._browser.driver.page_source)
                product_urls += hxs.select(
                    '//a[@data-perf-id="product"]/@href').extract()
                try:
                    self.log('>>> BROWSER: CLICK NEXT PAGE LINK')
                    self._browser.driver.find_element_by_xpath(
                        '//ul[@data-component-name="pagination"]/li[contains(@class, "next")]/a'
                    ).click()
                    self.log('>>> BROWSER: OK')
                except NoSuchElementException:
                    self.log('>>> BROWSER: NEXT PAGE NOT FOUND')
                    find_more = False
                else:
                    time.sleep(5)

        for url in product_urls:
            yield Request(url, callback=self.parse_product)

        for url in self.start_urls2:
            yield Request(url, callback=self.parse_products_list,
                          meta={'category': ''})

    def parse_categories(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        urls = hxs.select('//*[@id="Sport/Activity"]/li/a/@href').extract()
        categories = hxs.select(
            '//*[@id="Sport/Activity"]/li/a/@id').extract()
        for url, category in zip(urls, categories):
            url = url.replace('fh_view_size%3d20', 'fh_view_size%3d9999')
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_products_list,
                          meta={'category': category})

    def parse_products_list(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        for url in hxs.select('//a[@data-perf-id="product"]/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product, meta=response.meta)

    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)
        # page 404
        if hxs.select("//img[@class='image-404']"):
            self.log("[WARNING] Product not found on page: %s" % response.url)
            return
        base_url = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        name = hxs.select(
            '//*[@id="infoPanel"]/h1/text()').extract()[0].strip()
        url = response.url
        loader.add_value('url', urljoin_rfc(base_url, url))
        loader.add_value('name', name)
        image_url = hxs.select('//*[@id="main"]/noscript/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        price = hxs.select(
            '//*[@id="productSummaryPrice"]/text()').extract()[0]
        if price == 'No price available.':
            return
        price = extract_price(price.replace(u'\xa3', ''))
        loader.add_value('price', price)
        if 'category' in response.meta:
            loader.add_value('category', response.meta.get('category'))
        else:
            categories = hxs.select(
                '//div[@class="breadcrumbs"]/a[not(contains(@class, "current"))]/text()'
            ).extract()
            if categories:
                loader.add_value('category', categories[-1])
        identifier = hxs.select(
            '//div[@id="productPage"]/@data-plu').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('brand', 'Nike')
        if price < 60:
            loader.add_value('shipping_cost', 3.99)
        else:
            loader.add_value('shipping_cost', 0)
        yield loader.load_item()
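# extract_price() is a shared helper that is not defined in this snippet.
# Below is a hypothetical implementation consistent with how it is called
# above (string in, Decimal out); the project's real version may differ.
import re
from decimal import Decimal

def extract_price(text):
    # Pull the first number out of a price string such as '59.99'.
    match = re.search(r'\d+(?:\.\d+)?', text.replace(',', ''))
    return Decimal(match.group(0)) if match else Decimal(0)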
class VodafoneSpider(VodafoneBaseSpider):
    name = 'vodafone-t-mobile.de'
    allowed_domains = ['vodafone.co.uk']
    start_urls = (
        'https://www.t-mobile.de/apple-iphone/iphone-6/0,26907,28800-_,00.html',
        'https://www.t-mobile.de/apple-iphone/iphone-6-plus/0,26908,28801-_,00.html?WT.svl=100',
        'https://www.t-mobile.de/samsung-galaxy/samsung-galaxy-s5-lte/0,27026,28852-_,00.html?WT.svl=100'
    )

    def __init__(self, *args, **kwargs):
        super(VodafoneSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Browser
        self.log('>>> BROWSER: Open browser')
        self._browser = PhantomJS()
        self.log('>>> BROWSER: OK')

    def spider_closed(self, spider):
        self._browser.close()

    def parse(self, response):
        base_url = get_base_url(response)
        selected_option_id = response.meta.get('option_id', None)
        self._browser.get(response.url)
        container = self._browser.driver.find_element_by_xpath(
            '//div[@class="chosen-container chosen-container-single chosen-container-single-nosearch"]'
        )
        container.click()
        hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
        if not selected_option_id:
            # First pass: enumerate the dropdown options, then re-request
            # the same URL once per option so each variant is scraped.
            options = hxs.select(
                '//ul[@class="chosen-results"]/li/@data-option-array-index'
            ).extract()
            for option_id in options:
                yield Request(response.url, dont_filter=True,
                              meta={'option_id': option_id})
            return
        option = self._browser.driver.find_element_by_xpath(
            '//ul[@class="chosen-results"]/li[@data-option-array-index="'
            + selected_option_id + '"]')
        option.click()
        hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
        tariffs = hxs.select('//li[contains(@class, "rate-element")]')
        device_identifier = re.search('0,(.*?)-_', response.url).group(1)
        for tariff in tariffs:
            loader = ProductLoader(item=Product(), response=response)
            duration = '24'
            identifier = tariff.select('@data-shop-id').extract()
            loader.add_value(
                'identifier',
                device_identifier + '-' + selected_option_id + '-'
                + identifier[0])
            phone_name = ' '.join(tariff.select(
                './/div[@class="configuration-output"]//p[not(span)]//text()'
            ).extract())
            tariff_name = ' '.join(tariff.select(
                './/div[@class="heading-2"]/span[@class="title-1" or @class="title-2"]//text()'
            ).extract())
            phone_price = ''.join(tariff.select(
                './/div[@class="configuration-output"]//p/span//text()'
            ).extract()).replace(',', '.')
            image_url = hxs.select(
                '//div[@id="device-image-slider"]//li/img/@src').extract()
            if image_url:
                image_url = urljoin_rfc(base_url, image_url[0])
            monthly_cost = ''.join(tariff.select(
                './/p[@class="price monthly-price"]/span//text()'
            ).extract()).replace(',', '.')
            normalized_name = self.get_normalized_name(phone_name)
            loader.add_value('name', normalized_name + ' - ' + tariff_name)
            loader.add_value('url', response.url)
            loader.add_value('brand', phone_name.split()[0])
            loader.add_value('price', phone_price)
            loader.add_value('image_url', image_url)
            product = loader.load_item()

            metadata = VodafoneMeta()
            metadata['device_name'] = phone_name
            # The dot must be escaped; the original pattern '(\d+.\d+)' let
            # the dot match any character.
            metadata['monthly_cost'] = re.search(
                r'(\d+\.\d+)', monthly_cost).group(1)
            metadata['tariff_name'] = tariff_name
            metadata['contract_duration'] = duration
            # `operator` and `channel` are not defined in this snippet;
            # presumably they are module-level constants in the original.
            metadata['operator'] = operator
            metadata['channel'] = channel
            metadata['promotional_text'] = ''
            metadata['network_generation'] = '4G'
            product['metadata'] = metadata
            yield product
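# The Product and VodafoneMeta items are defined elsewhere in the project.
# The field lists below are inferred from the add_value()/metadata[...]
# calls in this section; treat them as a sketch, not the actual definitions.
from scrapy.item import Item, Field

class Product(Item):
    name = Field()
    url = Field()
    price = Field()
    sku = Field()
    identifier = Field()
    category = Field()
    brand = Field()
    image_url = Field()
    shipping_cost = Field()
    metadata = Field()

class VodafoneMeta(Item):
    device_name = Field()
    monthly_cost = Field()
    tariff_name = Field()
    contract_duration = Field()
    operator = Field()
    channel = Field()
    promotional_text = Field()
    network_generation = Field()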