Example #1
0
    def parse_product(self, response):
        """Render a product page in PhantomJS, load a Product item and
        request its Bazaarvoice reviews page.

        Yields nothing when the expected brand (``response.meta['brand']``)
        appears neither in the product name nor in the raw response body.
        """
        browser = PhantomJS()
        self.log('>>> BROWSER: GET => %s' % response.url)
        try:
            browser.get(response.url)
            self.log('>>> BROWSER: OK!')
            page_source = browser.driver.page_source
        finally:
            # Always release the PhantomJS process, even if the GET fails —
            # otherwise every failed page leaks a headless browser.
            browser.close()
            self.log('>>> BROWSER: Closed')

        hxs = HtmlXPathSelector(text=page_source)

        sku = hxs.select(u'//*[@class="displaySkuCode"]//text()').extract()

        # Strip the '#' prefix from the displayed SKU code.
        sku = sku[0].replace('#', '')

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath(
            'name', u'//div[contains(@class,"title")]//h1/text()')
        product_loader.add_value('sku', sku)
        product_loader.add_xpath(
            'category',
            u'//ul[contains(@class, "pd-breadcrumbs")]/li[2]/a/text()')
        product_loader.add_value('identifier', sku)
        price = hxs.select(
            u'//div[contains(@class, "product-price__reg-price")]/text()'
        ).extract()
        # The price text carries a "Reg." prefix that must be removed.
        product_loader.add_value('price', price[0].replace('Reg.', ''))
        product_loader.add_value('brand', response.meta['brand'].lower())
        product_loader.add_value('url', response.url)
        image_url = hxs.select(
            u'/html/head/link[@rel="image_src"]/@href').extract()
        if image_url:
            product_loader.add_value('image_url', image_url[0])
        product = product_loader.load_item()

        metadata = KeterMeta()
        metadata['brand'] = response.meta['brand']
        metadata['reviews'] = []
        product['metadata'] = metadata
        # Stash the product in meta so the reviews callback can attach to it.
        response.meta['product'] = product

        # Skip products that don't mention the target brand at all.
        brand = response.meta['brand'].lower()
        if brand not in product['name'] and brand not in response.body.lower():
            return

        # The reviews URL is assembled from the Bazaarvoice site id (taken
        # from the bvapi.js <script> URL) and the product code, e.g.:
        # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
        # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
        # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
        part1 = hxs.select(
            u'//script[starts-with(@src,"http://canadiantire.ugc.bazaarvoice.com/static/")]/@src'
        ).extract()[0].split('/')[-2]
        part2 = hxs.select(
            '//div[@id="bazaarVoiceConfig"]/@data-product-code').extract()[0]

        yield Request(
            'http://canadiantire.ugc.bazaarvoice.com/%s/%s/reviews.djs?format=embeddedhtml'
            % (part1, part2),
            meta=response.meta,
            callback=self.parse_review_js)
Example #2
0
    def start_requests(self):
        """Render the homepage in PhantomJS and delegate to ``parse``
        with the rendered HTML (the page is built client-side).
        """
        browser = PhantomJS()
        url = 'http://www.nisbets.co.uk/Homepage.action'
        self.log('>>> BROWSER: GET => %s' % url)
        try:
            browser.get(url)
            self.log('>>> BROWSER: OK')

            # Give the client-side scripts time to finish building the page.
            time.sleep(120)

            page_source = browser.driver.page_source
        finally:
            # Release the PhantomJS process even when the GET/render fails.
            browser.close()

        for req in self.parse(url, page_source):
            yield req
Example #3
0
    def parse(self, response):
        """Render the page in PhantomJS and yield one pagination request
        per top-navigation category link.
        """
        base_url = get_base_url(response)

        browser = PhantomJS()
        try:
            browser.get(response.url)
            page_source = browser.driver.page_source
        finally:
            # Release the PhantomJS process even if the GET fails.
            browser.close()

        hxs = HtmlXPathSelector(text=page_source)

        categories = hxs.select('//div[@id="nav-full"]//a')
        for category in categories:
            url = category.select('./@href').extract()
            if url:
                # Copy the meta for each request.  Reusing response.meta
                # would hand the same dict to every Request, so all of
                # them would end up with the LAST category name.
                meta = dict(response.meta)
                category_name = category.select('./span/text()').extract()
                meta['category'] = category_name[0] if category_name else ''
                yield Request(urljoin_rfc(base_url, url[0]), meta=meta, callback=self.parse_pagination)
Example #4
0
    def parse(self, response):
        """Render the start page in PhantomJS, click through the
        ``style-inc`` input (appears to dismiss a prompt — confirm), and
        yield one request per category link in the rendered menu.
        """
        browser = PhantomJS()
        url = self.start_urls[0]
        self.log('>>> BROWSER: GET => %s' % url)
        try:
            browser.get(url)
            self.log('>>> BROWSER: OK')

            # Long waits: the page builds its menu client-side; wait,
            # click, then wait again for the re-render to settle.
            time.sleep(120)
            browser.driver.find_element_by_xpath(
                '//p[@class="style-inc"]//input').click()
            time.sleep(30)

            page_source = browser.driver.page_source
        finally:
            # Always release the PhantomJS process, even on failure.
            browser.close()

        hxs = HtmlXPathSelector(text=page_source)
        for cat in hxs.select('//ul[@class="clear-after"]/li/ul/li/a'):
            yield Request(
                urljoin_rfc(url,
                            cat.select('./@href').extract()[0]),
                callback=self.parse_cat,
                meta={'category': cat.select('./text()').extract()[0]})
Example #5
0
    def parse(self, response):
        """Find the newest feed-download email in the monitoring Gmail
        inbox, resolve its token URL via PhantomJS and request the feed.
        """
        # get the lastest link
        # SECURITY: hard-coded credentials in source — move them to
        # spider settings / environment configuration.
        client = imaplib.IMAP4_SSL('imap.gmail.com', 993)
        url = None
        try:
            client.login('totalfeedcompetitormonitor', 'uyWTStB6')
            client.select('INBOX')
            # Search returns UIDs oldest-first; reverse to scan newest-first.
            mails = client.uid('search', 'ALL')[1][0].split()[::-1]
            for mail_uid in mails:
                mail = client.uid('fetch', mail_uid, '(RFC822)')
                mail = email.message_from_string(mail[1][0][1])
                subject = email.header.decode_header(mail['Subject'])[0][0]
                if 'Nouveau message' not in subject:
                    continue
                body = ' '.join([m.get_payload() for m in mail.get_payload()])
                url = re.search('(http.*?DownloadToken.*)',
                                body).group(1).replace('\r', '')
                break
        finally:
            # Don't leak the IMAP connection when anything above fails.
            client.logout()

        if url is None:
            # Previously fell through to a NameError on `url`; fail clearly.
            self.log('>>> No DownloadToken email found in INBOX')
            return

        browser = PhantomJS()
        # url = 'https://poseidon.hubtotal.net/zephyr/DownloadToken.jsp?token=iQ4rBu6SBKEB8KdOLpeO0JplfDhqJPqiIgOQrjsfuKedCnYC'
        self.log('>>> BROWSER: GET => %s' % url)
        try:
            browser.get(url)
            self.log('>>> BROWSER: OK')

            # Give the download page time to render the file listing.
            time.sleep(180)

            page_source = browser.driver.page_source
        finally:
            # Release the PhantomJS process even on failure.
            browser.close()

        token = urlparse.parse_qs(urlparse.urlparse(url).query)['token'][0]

        hxs = HtmlXPathSelector(text=page_source)
        link_id = hxs.select('//h3[@class="unit-name"]/a/@id').re('file_(.*)')

        download_link = 'https://poseidon.hubtotal.net/zephyr/MFTWebAppDownloadToken/Download?file={}&token={}'.format(
            link_id[0], token)
        yield Request(download_link, callback=self.parse_feed)
Example #6
0
class JDSportsSpider(BaseSpider):
    """Spider for Nike products on jdsports.co.uk.

    The listings in ``start_urls1`` are paginated client-side, so they
    are rendered in a shared PhantomJS browser and the "next" link is
    clicked until it disappears.  ``start_urls2`` are plain search URLs
    fetched directly through Scrapy.
    """

    name = u'jdsports.co.uk'
    allowed_domains = ['www.jdsports.co.uk']
    # Browser-rendered listing pages (client-side pagination).
    start_urls1 = [
        'http://www.jdsports.co.uk/men/mens-footwear/brand/nike/',
        'http://www.jdsports.co.uk/women/womens-footwear/brand/nike/'
    ]
    # Plain search listings; pageSize=9999 avoids pagination entirely.
    start_urls2 = [
        'http://www.jdsports.co.uk/featured/kids+nike+footwear?pageSize=9999',
        'http://www.jdsports.co.uk/search/nike-skateboarding?pageSize=9999'
    ]

    def __init__(self, *args, **kwargs):
        super(JDSportsSpider, self).__init__(*args, **kwargs)
        # Close the shared browser when the spider finishes.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        # Browser
        self.log('>>> BROWSER: Open browser')
        # One PhantomJS instance shared across all start_requests pages.
        self._browser = PhantomJS()
        self.log('>>> BROWSER: OK')

    def spider_closed(self, spider):
        """Release the shared PhantomJS instance on spider shutdown."""
        # Browser close
        self.log('>>> BROWSER: close')
        self._browser.close()
        self.log('>>> BROWSER: OK')

    def start_requests(self):
        """Walk the browser-paginated listings collecting product URLs,
        then yield product requests plus the plain search-listing requests."""
        product_urls = []
        for url in self.start_urls1:
            self.log('>>> BROWSER: GET => %s' % url)
            self._browser.get(url)
            self.log('>>> BROWSER: OK')
            find_more = True
            while find_more:
                # Re-parse the rendered DOM after every pagination click.
                hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
                product_urls += hxs.select(
                    '//a[@data-perf-id="product"]/@href').extract()
                try:
                    self.log('>>> BROWSER: CLICK NEXT PAGE LINK')
                    self._browser.driver.find_element_by_xpath(
                        '//ul[@data-component-name="pagination"]/li[contains(@class, "next")]/a'
                    ).click()
                    self.log('>>> BROWSER: OK')
                except NoSuchElementException:
                    # No "next" link left: last page reached.
                    self.log('>>> BROWSER: NEXT PAGE NOT FOUND')
                    find_more = False
                else:
                    # Give the next page time to render before re-reading it.
                    time.sleep(5)

        for url in product_urls:
            yield Request(url, callback=self.parse_product)
        for url in self.start_urls2:
            yield Request(url,
                          callback=self.parse_products_list,
                          meta={'category': ''})

    def parse_categories(self, response):
        """Yield one products-list request per category link.

        NOTE(review): nothing visible in this class schedules this
        callback — presumably wired up elsewhere or legacy; confirm.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        urls = hxs.select('//*[@id="Sport/Activity"]/li/a/@href').extract()
        categories = hxs.select('//*[@id="Sport/Activity"]/li/a/@id').extract()
        for url, category in zip(urls, categories):
            # Bump the encoded fh_view_size param so each category page
            # lists everything at once.
            url = url.replace('fh_view_size%3d20', 'fh_view_size%3d9999')
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_products_list,
                          meta={'category': category})

    def parse_products_list(self, response):
        """Yield a parse_product request for every product on a listing page."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        for url in hxs.select('//a[@data-perf-id="product"]/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product,
                          meta=response.meta)

    def parse_product(self, response):
        """Extract a single Product item from a product page.

        Skips 404 placeholder pages and products with no price.
        """
        hxs = HtmlXPathSelector(response)
        # page 404
        if hxs.select("//img[@class='image-404']"):
            self.log("[WARNING] Product not found on page: %s" % response.url)
            return
        base_url = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        name = hxs.select(
            '//*[@id="infoPanel"]/h1/text()').extract()[0].strip()
        url = response.url
        loader.add_value('url', urljoin_rfc(base_url, url))
        loader.add_value('name', name)
        image_url = hxs.select('//*[@id="main"]/noscript/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        price = hxs.select(
            '//*[@id="productSummaryPrice"]/text()').extract()[0]
        if price == 'No price available.':
            return
        # Strip the pound sign (u'\xa3') before parsing the number.
        price = extract_price(price.replace(u'\xa3', ''))
        loader.add_value('price', price)
        if 'category' in response.meta:
            loader.add_value('category', response.meta.get('category'))
        else:
            # Fall back to the deepest (last) breadcrumb entry.
            categories = hxs.select(
                '//div[@class="breadcrumbs"]/a[not(contains(@class, "current"))]/text()'
            ).extract()
            if categories:
                loader.add_value('category', categories[-1])
        identifier = hxs.select(
            '//div[@id="productPage"]/@data-plu').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('brand', 'Nike')
        # Looks like a free-delivery threshold at 60 — TODO confirm.
        if price < 60:
            loader.add_value('shipping_cost', 3.99)
        else:
            loader.add_value('shipping_cost', 0)
        yield loader.load_item()
Example #7
0
class VodafoneSpider(VodafoneBaseSpider):
    """Spider for device/tariff bundles on t-mobile.de device pages.

    Each device page has a dropdown of variants; the page is visited
    once to enumerate the dropdown options, then re-requested once per
    option so the tariffs for that variant can be scraped.
    """

    name = 'vodafone-t-mobile.de'
    # NOTE(review): allowed_domains does not match the t-mobile.de start
    # URLs — re-yielded requests may be dropped by the offsite
    # middleware; confirm this is intentional.
    allowed_domains = ['vodafone.co.uk']
    start_urls = (
        'https://www.t-mobile.de/apple-iphone/iphone-6/0,26907,28800-_,00.html',
        'https://www.t-mobile.de/apple-iphone/iphone-6-plus/0,26908,28801-_,00.html?WT.svl=100',
        'https://www.t-mobile.de/samsung-galaxy/samsung-galaxy-s5-lte/0,27026,28852-_,00.html?WT.svl=100'
    )

    def __init__(self, *args, **kwargs):
        super(VodafoneSpider, self).__init__(*args, **kwargs)
        # Close the shared browser when the spider finishes.
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Browser
        self.log('>>> BROWSER: Open browser')
        # One PhantomJS instance shared by every parse() call.
        self._browser = PhantomJS()
        self.log('>>> BROWSER: OK')

    def spider_closed(self, spider):
        # Release the shared PhantomJS instance on spider shutdown.
        self._browser.close()

    def parse(self, response):
        """Render a device page; on the first visit enumerate the variant
        dropdown and re-request once per option, otherwise select the
        requested option and yield one product per tariff row.
        """
        base_url = get_base_url(response)
        # Set when this is a per-variant re-request (see below).
        selected_option_id = response.meta.get('option_id', None)
        self._browser.get(response.url)

        # Open the custom "chosen" dropdown so its options are rendered.
        container = self._browser.driver.find_element_by_xpath(
            '//div[@class="chosen-container chosen-container-single chosen-container-single-nosearch"]'
        )
        container.click()

        hxs = HtmlXPathSelector(text=self._browser.driver.page_source)

        if not selected_option_id:
            # First visit: fan out one request per dropdown option.
            options = hxs.select(
                '//ul[@class="chosen-results"]/li/@data-option-array-index'
            ).extract()
            for option_id in options:
                yield Request(response.url,
                              dont_filter=True,
                              meta={'option_id': option_id})
            return

        # Per-variant visit: click the requested option and re-read the DOM.
        option = self._browser.driver.find_element_by_xpath(
            '//ul[@class="chosen-results"]/li[@data-option-array-index="' +
            selected_option_id + '"]')
        option.click()

        hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
        tariffs = hxs.select('//li[contains(@class, "rate-element")]')

        # Device id is the first comma-delimited number in the URL path.
        device_identifier = re.search('0,(.*?)-_', response.url).group(1)

        for tariff in tariffs:

            loader = ProductLoader(item=Product(), response=response)
            duration = '24'

            identifier = tariff.select('@data-shop-id').extract()
            # Identifier combines device, variant and tariff ids.
            loader.add_value(
                'identifier', device_identifier + '-' + selected_option_id +
                '-' + identifier[0])
            phone_name = ' '.join(
                tariff.select(
                    './/div[@class="configuration-output"]//p[not(span)]//text()'
                ).extract())
            tariff_name = ' '.join(
                tariff.select(
                    './/div[@class="heading-2"]/span[@class="title-1" or @class="title-2"]//text()'
                ).extract())
            # German decimal comma -> dot.
            phone_price = ''.join(
                tariff.select(
                    './/div[@class="configuration-output"]//p/span//text()').
                extract()).replace(',', '.')
            image_url = hxs.select(
                '//div[@id="device-image-slider"]//li/img/@src').extract()
            if image_url:
                image_url = urljoin_rfc(base_url, image_url[0])
            monthly_cost = ''.join(
                tariff.select('.//p[@class="price monthly-price"]/span//text()'
                              ).extract()).replace(',', '.')

            normalized_name = self.get_normalized_name(phone_name)
            loader.add_value('name', normalized_name + ' - ' + tariff_name)
            loader.add_value('url', response.url)
            # First word of the device name is treated as the brand.
            loader.add_value('brand', phone_name.split()[0])
            loader.add_value('price', phone_price)
            loader.add_value('image_url', image_url)

            product = loader.load_item()
            metadata = VodafoneMeta()
            metadata['device_name'] = phone_name
            # NOTE(review): unescaped '.' in the pattern matches any
            # character — likely meant r'(\d+\.\d+)'; confirm.
            metadata['monthly_cost'] = re.search('(\d+.\d+)',
                                                 monthly_cost).group(1)
            metadata['tariff_name'] = tariff_name
            metadata['contract_duration'] = duration
            # NOTE(review): 'operator' and 'channel' are not defined in
            # this method — presumably module-level constants; verify
            # (a bare 'operator' could otherwise resolve to the stdlib
            # module if it is imported).
            metadata['operator'] = operator
            metadata['channel'] = channel
            metadata['promotional_text'] = ''
            metadata['network_generation'] = '4G'
            product['metadata'] = metadata

            yield product