예제 #1
0
 def __init__(self, *args, **kwargs):
     """Initialise the spider and open a PhantomJS browser for it.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(VodafoneSpider, self).__init__(*args, **kwargs)
     # Register self.spider_closed as the receiver of the spider_closed signal.
     # NOTE(review): presumably that handler shuts down self._browser -- it is
     # not visible here, confirm.
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     # Open one browser up front and keep it on the instance as self._browser.
     self.log('>>> BROWSER: Open browser')
     self._browser = PhantomJS()
     self.log('>>> BROWSER: OK')
예제 #2
0
    def parse_product(self, response):
        """Build a product from a product page and request its reviews script.

        The page at ``response.url`` is loaded again in a PhantomJS browser and
        the SKU, price, image URL and review identifiers are read from the
        browser's page source. Name and category are read from the original
        ``response`` through the product loader.

        Reads ``response.meta['brand']`` and stores the built product in
        ``response.meta['product']``.

        Yields a single ``Request`` for the Bazaarvoice ``reviews.djs`` URL with
        ``self.parse_review_js`` as callback, or nothing when the lower-cased
        brand is found neither in the product name nor in the lower-cased
        response body.
        """
        browser = PhantomJS()
        try:
            self.log('>>> BROWSER: GET => %s' % response.url)
            browser.get(response.url)
            self.log('>>> BROWSER: OK!')

            hxs = HtmlXPathSelector(text=browser.driver.page_source)
        finally:
            # Close the browser even when loading the page or reading its
            # source fails, so a failed product does not leave a browser open.
            browser.close()
            self.log('>>> BROWSER: Closed')

        sku = hxs.select(u'//*[@class="displaySkuCode"]//text()').extract()

        sku = sku[0].replace('#', '')

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath(
            'name', u'//div[contains(@class,"title")]//h1/text()')
        product_loader.add_value('sku', sku)
        product_loader.add_xpath(
            'category',
            u'//ul[contains(@class, "pd-breadcrumbs")]/li[2]/a/text()')
        # The SKU doubles as the product identifier.
        product_loader.add_value('identifier', sku)
        price = hxs.select(
            u'//div[contains(@class, "product-price__reg-price")]/text()'
        ).extract()
        product_loader.add_value('price', price[0].replace('Reg.', ''))
        product_loader.add_value('brand', response.meta['brand'].lower())
        product_loader.add_value('url', response.url)
        image_url = hxs.select(
            u'/html/head/link[@rel="image_src"]/@href').extract()
        if image_url:
            product_loader.add_value('image_url', image_url[0])
        product = product_loader.load_item()

        metadata = KeterMeta()
        metadata['brand'] = response.meta['brand']
        metadata['reviews'] = []
        product['metadata'] = metadata
        response.meta['product'] = product

        # Drop pages that mention the brand neither in the product name nor
        # anywhere in the (lower-cased) raw response body.
        brand = response.meta['brand'].lower()
        if brand not in product['name'] and brand not in response.body.lower():
            return

        # Example product page:
        # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
        # Example reviews URL built below:
        # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
        # Script tag the first URL segment is taken from:
        # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
        part1 = hxs.select(
            u'//script[starts-with(@src,"http://canadiantire.ugc.bazaarvoice.com/static/")]/@src'
        ).extract()[0].split('/')[-2]
        part2 = hxs.select(
            '//div[@id="bazaarVoiceConfig"]/@data-product-code').extract()[0]

        yield Request(
            'http://canadiantire.ugc.bazaarvoice.com/%s/%s/reviews.djs?format=embeddedhtml'
            % (part1, part2),
            meta=response.meta,
            callback=self.parse_review_js)
예제 #3
0
    def start_requests(self):
        """Load the Nisbets home page in PhantomJS and hand it to ``parse``.

        The home page is opened in a browser, the method waits 120 seconds,
        and the resulting page source is passed together with the URL to
        ``self.parse``. Everything ``self.parse`` produces is yielded.
        """
        url = 'http://www.nisbets.co.uk/Homepage.action'
        browser = PhantomJS()
        try:
            self.log('>>> BROWSER: GET => %s' % url)
            browser.get(url)
            self.log('>>> BROWSER: OK')

            # Fixed wait before reading the page source.
            # NOTE(review): presumably gives the page time to finish
            # loading/rendering -- confirm whether 120 s is really needed.
            time.sleep(120)

            page_source = browser.driver.page_source
        finally:
            # Close the browser even if navigation or reading the source
            # fails, so a failed start does not leave a browser open.
            browser.close()

        for req in self.parse(url, page_source):
            yield req
예제 #4
0
    def parse(self, response):
        """Yield one pagination request per category link in the navigation.

        The page at ``response.url`` is loaded again in a PhantomJS browser and
        the links under ``div#nav-full`` are read from the browser's page
        source. For every link that has an ``href`` a ``Request`` with
        ``self.parse_pagination`` as callback is yielded; the link's ``span``
        text (or an empty string) is stored in ``meta['category']``.
        """
        base_url = get_base_url(response)

        browser = PhantomJS()
        try:
            browser.get(response.url)
            hxs = HtmlXPathSelector(text=browser.driver.page_source)
        finally:
            # Close the browser even if loading the page fails, so an error
            # does not leave a browser open.
            browser.close()

        categories = hxs.select('//div[@id="nav-full"]//a')
        for category in categories:
            url = category.select('./@href').extract()
            if url:
                # NOTE: this is the response's own meta dict, so every yielded
                # request shares (and mutates) the same object.
                meta = response.meta
                category_name = category.select('./span/text()').extract()
                meta['category'] = category_name[0] if category_name else ''
                yield Request(urljoin_rfc(base_url, url[0]), meta=meta, callback=self.parse_pagination)
예제 #5
0
 def _get_new_browser(self):
     """Return a new PhantomJS browser with a random user agent.

     One 'https' proxy is requested from the proxy service for
     ``self.proxy_target_id``. When the service returns one, its URL is split
     into type and host and passed to the browser; otherwise the browser is
     created with ``proxy=None``. Images are loaded.
     """
     service = ProxyServiceAPI(host=PROXY_SERVICE_HOST,
                               user=PROXY_SERVICE_USER,
                               password=PROXY_SERVICE_PSWD)
     candidates = service.get_proxy_list(self.proxy_target_id,
                                         types='https',
                                         log=self.log,
                                         length=1)

     proxy = None
     if candidates:
         # The proxy URL has the form "<type>://<host>".
         scheme, address = candidates[0]['url'].split('://')
         proxy = {
             'host': address,
             'type': scheme,
         }

     chosen_agent = random.choice(self._all_user_agents)
     return PhantomJS(load_images=True, proxy=proxy, user_agent=chosen_agent)
예제 #6
0
    def renew_browser(self, browser_profile=None, browser_blocked=False):
        """Start a fresh PhantomJS browser and record it in a browser profile.

        When ``browser_profile`` is given, its current webdriver is quit and
        the profile is updated in place; otherwise a new profile dict is
        created. When ``browser_blocked`` is true, the profile's current proxy
        id is passed to the proxy service as blocked and the profile's
        ``retry_no`` is incremented. When it is false, ``retry_no`` is reset to
        0 and the profile is appended to ``self._browsers``.
        """
        service = ProxyServiceAPI(host=PROXY_SERVICE_HOST, user=PROXY_SERVICE_USER, password=PROXY_SERVICE_PSWD)

        excluded_proxy_ids = []
        if not browser_profile:
            browser_profile = {}
        else:
            # Shut down the old browser before replacing it.
            browser_profile['webdriver'].quit()
            if browser_blocked:
                excluded_proxy_ids.append(browser_profile['proxy_id'])

        candidates = service.get_proxy_list(
            self.proxy_service_target_id,
            locations=self.proxy_service_location,
            types='https',
            blocked=excluded_proxy_ids,
            log=self.log,
            length=1)

        if candidates:
            proxy_data = candidates[0]
            # The proxy URL has the form "<type>://<host>".
            scheme, address = proxy_data['url'].split('://')
            proxy = {
                'host': address,
                'type': scheme,
            }
        else:
            # No proxy available: run without one and record empty values.
            proxy_data = {'id': '', 'url': ''}
            proxy = None

        user_agent = self.user_agents.next()
        browser = PhantomJS(proxy=proxy, user_agent=user_agent, load_images=False)

        browser_profile.update({
            'webdriver': browser.driver,
            'useragent': user_agent,
            'proxy': proxy_data['url'],
            'proxy_id': proxy_data['id'],
        })
        browser_profile['retry'] = browser_blocked

        if browser_blocked:
            browser_profile['retry_no'] = int(browser_profile.get('retry_no', 0)) + 1
        else:
            browser_profile['retry_no'] = 0
            # Only non-retry renewals are added to the browser list.
            self._browsers.append(browser_profile)
예제 #7
0
    def parse(self, response):
        """Yield one category request per link found on the start page.

        ``self.start_urls[0]`` is loaded in a PhantomJS browser (``response``
        itself is not used). After a 120 second wait the input inside
        ``p.style-inc`` is clicked, and after a further 30 seconds the page
        source is read. For every matching category link a ``Request`` with
        ``self.parse_cat`` as callback is yielded, carrying the link text in
        ``meta['category']``.
        """
        url = self.start_urls[0]
        browser = PhantomJS()
        try:
            self.log('>>> BROWSER: GET => %s' % url)
            browser.get(url)
            self.log('>>> BROWSER: OK')

            # Fixed waits around the click.
            # NOTE(review): presumably they give the page time to load and
            # then to react to the click -- confirm the durations.
            time.sleep(120)
            browser.driver.find_element_by_xpath(
                '//p[@class="style-inc"]//input').click()
            time.sleep(30)

            page_source = browser.driver.page_source
        finally:
            # Close the browser even if navigation or the click fails (for
            # example when the element is missing), so no browser is left open.
            browser.close()

        hxs = HtmlXPathSelector(text=page_source)
        for cat in hxs.select('//ul[@class="clear-after"]/li/ul/li/a'):
            yield Request(
                urljoin_rfc(url,
                            cat.select('./@href').extract()[0]),
                callback=self.parse_cat,
                meta={'category': cat.select('./text()').extract()[0]})
예제 #8
0
    def parse(self, response):
        """Find the download link in the mailbox and request the feed file.

        The Gmail INBOX is searched, starting from the last UID returned, for
        the first message whose subject contains 'Nouveau message'. The
        DownloadToken link in its body is opened in a PhantomJS browser; after
        a 180 second wait the file id is read from the page and a ``Request``
        for the download URL is yielded with ``self.parse_feed`` as callback.

        Raises:
            ValueError: if no matching message exists, the message contains no
                DownloadToken link, or the page shows no file id.
        """
        # NOTE(review): the mailbox credentials are hard-coded in source; they
        # should be moved to configuration/settings and rotated.
        url = None
        client = imaplib.IMAP4_SSL('imap.gmail.com', 993)
        try:
            client.login('totalfeedcompetitormonitor', 'uyWTStB6')
            client.select('INBOX')
            # Reverse the UID list so iteration starts from the last entry.
            mails = client.uid('search', 'ALL')[1][0].split()[::-1]
            for mail_uid in mails:
                mail = client.uid('fetch', mail_uid, '(RFC822)')
                mail = email.message_from_string(mail[1][0][1])
                subject = email.header.decode_header(mail['Subject'])[0][0]
                if 'Nouveau message' not in subject:
                    continue
                body = ' '.join([m.get_payload() for m in mail.get_payload()])
                match = re.search('(http.*?DownloadToken.*)', body)
                if match is None:
                    raise ValueError(
                        'No DownloadToken link in message UID %s' % mail_uid)
                url = match.group(1).replace('\r', '')
                break
        finally:
            # Always end the IMAP session, also when login or fetch fails.
            client.logout()

        if url is None:
            # Previously this surfaced as an unbound-variable error below.
            raise ValueError('No "Nouveau message" e-mail found in INBOX')

        # Example link:
        # https://poseidon.hubtotal.net/zephyr/DownloadToken.jsp?token=iQ4rBu6SBKEB8KdOLpeO0JplfDhqJPqiIgOQrjsfuKedCnYC
        browser = PhantomJS()
        try:
            self.log('>>> BROWSER: GET => %s' % url)
            browser.get(url)
            self.log('>>> BROWSER: OK')

            # Fixed wait before reading the page source.
            # NOTE(review): presumably lets the download page finish
            # rendering -- confirm the duration.
            time.sleep(180)

            page_source = browser.driver.page_source
        finally:
            # Close the browser even if loading the page fails.
            browser.close()

        token = urlparse.parse_qs(urlparse.urlparse(url).query)['token'][0]

        hxs = HtmlXPathSelector(text=page_source)
        # The anchor id has the form "file_<id>"; keep only the <id> part.
        link_id = hxs.select('//h3[@class="unit-name"]/a/@id').re('file_(.*)')
        if not link_id:
            raise ValueError('No file id found on download page %s' % url)

        download_link = 'https://poseidon.hubtotal.net/zephyr/MFTWebAppDownloadToken/Download?file={}&token={}'.format(
            link_id[0], token)
        yield Request(download_link, callback=self.parse_feed)