Exemplo n.º 1
0
class WoolWorthsSpider(BaseSpider):
    """Spider for the woolworths.co.uk toy brand store.

    Reads the brands of interest from a local spreadsheet, follows only
    the matching brand links from the brand-store page, paginates through
    each brand's listings and yields one Product (with optional EAN
    metadata) per product page.
    """
    name = 'toymonitor-woolworths.co.uk'
    allowed_domains = ['woolworths.co.uk']
    start_urls = ['http://www.woolworths.co.uk/brand-store.page?end=5132']
    errors = []
    brand_selector = BrandSelector(errors)

    #field_modifiers = {'brand': brand_selector.get_brand}

    def parse(self, response):
        """Match site brands against the spreadsheet and queue brand pages."""
        selector = HtmlXPathSelector(response)

        workbook = xlrd.open_workbook(HERE + '/Brandstomonitor.xlsx')
        sheet = workbook.sheet_by_index(0)

        # Normalise every monitored brand (header row skipped): uppercase,
        # stripped, all non-word characters removed — so site labels and
        # spreadsheet entries compare reliably.
        brands_to_monitor = [
            re.sub(r'\W+', '', sheet.row_values(row_idx)[0].upper().strip())
            for row_idx in xrange(1, sheet.nrows)
        ]

        for anchor in selector.select('//div[@class="columns"]/ul/li/a'):
            # Brand label looks like "Name (count)" — keep only the name.
            label = anchor.select('text()').extract()[0].split('(')[0].strip()
            href = anchor.select('@href').extract()[0]
            if re.sub(r'\W+', '', label.upper()) not in brands_to_monitor:
                continue
            yield Request(urljoin_rfc(get_base_url(response), href),
                          callback=self.parse_brand,
                          meta={'brand': label})

    def parse_brand(self, response):
        """Queue every product on a brand listing page, then the next page."""
        selector = HtmlXPathSelector(response)

        product_urls = selector.select(
            '//a[@class="productTitle"]/@href').extract()
        for product_url in product_urls:
            yield Request(product_url,
                          callback=self.parse_product,
                          meta=response.meta)

        next_page = selector.select(
            '//a[@class="paginationNext"]/@href').extract()
        if next_page:
            yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                          callback=self.parse_brand,
                          meta=response.meta)

    def parse_product(self, response):
        """Build a Product item (plus EAN metadata) from a product page."""
        selector = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', ''.join(selector.select(
            '//h1[@class="productHeading"]//text()').extract()))
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta.get('brand', ''))

        # The category only appears inside an inline script block.
        category_match = re.findall(u',\\ncategory: "(.*)",', response.body)
        loader.add_value('category',
                         category_match[0] if category_match else '')

        loader.add_xpath('sku', '//span[@id="catalogueNumber"]/text()')
        loader.add_xpath('identifier', '//span[@id="catalogueNumber"]/text()')

        images = selector.select(
            '//div[@id="amp-originalImage"]/img/@src').extract()
        if images:
            loader.add_value('image_url', images[0])

        loader.add_value('price', ''.join(selector.select(
            '//div[@class="priceNow"]//text()').extract()))

        availability = ''.join(selector.select(
            '//meta[@property="product:availability"]/@content').extract())
        if 'IN STOCK' not in availability.upper():
            loader.add_value('stock', '0')

        item = loader.load_item()
        metadata = ToyMonitorMeta()
        ean = ''.join(selector.select(
            '//span[@id="productEAN"]/text()').extract()).strip()
        if ean:
            metadata['ean'] = ean
        item['metadata'] = metadata

        yield item
Exemplo n.º 2
0
class JohnLewisSpider(BaseSpider):
    """Spider for johnlewis.com toy listings.

    Crawls brand and toy-type facets, scrapes products (including
    multi-variant product pages) and attaches Bazaarvoice reviews to each
    item before yielding it.
    """
    name = 'toymonitor-johnlewis.com'
    allowed_domains = ['johnlewis.com', 'johnlewis.ugc.bazaarvoice.com']
    start_urls = ['http://www.johnlewis.com/browse/toys/toys/toys-by-brand/_/N-fev',
                  'http://www.johnlewis.com/toys/toys-by-type/c60000243?rdr=1']
    errors = []
    brand_selector = BrandSelector(errors)
    #field_modifiers = {'brand': brand_selector.get_brand}

    def start_requests(self):
        """Switch the site to the GB country/locale before crawling."""
        country_url = "http://www.johnlewis.com/store/international/ajax/changeCountryAjaxRequest.jsp"
        formdata = {'country': 'GB',
                    'sourceUrl': 'http://www.johnlewis.com/',
                    'switchToggle': 'Change Country Overlay'}
        yield FormRequest(country_url, formdata=formdata, callback=self.parse_country)

    def parse_country(self, response):
        """After the country switch, start the real crawl from start_urls."""
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        """Queue brand listing pages, and (only once) toy-type subcategories."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        site_brands = response.xpath('//section[@id="facet-brand"]/div/ul/li/a')

        for brand in site_brands:
            # Facet label looks like "Name (count)" — keep only the name.
            brand_name = brand.select('text()').extract()[0].split("(")[0].strip()
            brand_url = brand.select('@href').extract()[0]
            brand_url = urljoin_rfc(base_url, brand_url)
            yield Request(brand_url, callback=self.parse_brand)

        # Subcategory pages are parsed by this same callback; stop here so
        # we never recurse into subcategories of subcategories.
        if response.meta.get('subcategory'):
            return

        subcats = response.xpath('//strong[contains(., "Featured Toy Types")]/following-sibling::ul//@href').extract()
        subcats += response.xpath('//section[@id="facet-toysbytype"]/div/ul/li/a/@href').extract()
        subcats += response.xpath('//header[contains(h2, "Toys by Type")]/following-sibling::div//@href').extract()
        # This category is not reachable through the facets above.
        subcats.append('http://www.johnlewis.com/browse/toys/toys/toys-by-type/games-puzzles/view-all-games-puzzles/_/N-6hxe')
        for url in subcats:
            yield Request(response.urljoin(url), meta={'subcategory': True})

    def parse_brand(self, response):
        """Accumulate product URLs across pagination, then scrape them.

        Product links found so far are carried in request meta; the full,
        de-duplicated URL set is only scheduled once the last page of the
        listing has been reached.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select("//div[@class='products']/div/article//a[@class='product-link'][1]/@href").extract()
        products += response.meta.get('products', [])

        # '#' hrefs are inert placeholders on the last page.
        # NOTE(review): the truthiness test below relies on Python 2
        # filter() returning a list; a Python 3 filter object is always
        # truthy — confirm the target interpreter.
        next_page = filter(lambda link: link != '#', hxs.select('//li[@class="next"]//a/@href').extract())

        if next_page:
            self.log('PARTIAL => %s products found' % len(products))
            yield Request(url=urljoin_rfc(base_url, next_page[0]), meta={'products': list(products)}, callback=self.parse_brand)
        else:
            self.log('TOTAL PRODUCTS FOUND: %s' % len(products))
            products = set(products)
            self.log('TOTAL UNIQUE PRODUCTS URLS: %s' % len(products))
            for url in products:
                yield Request(urljoin_rfc(base_url, url), self.parse_product)


    def parse_product(self, response):
        """Scrape one product page into one or more Product items.

        Pages with a multi-product-type section produce one item per
        option; otherwise a single item is built. All items are then sent
        to the Bazaarvoice endpoint to collect reviews before being
        yielded from parse_review_page.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = hxs.select('normalize-space(//*[@itemprop="name"]/text())').extract()[0]

        brand = hxs.select('normalize-space(//*[@itemprop="brand"]/span/text())').extract()

        try:
            image_url = urljoin_rfc(base_url,
                                    hxs.select('//div[@id="prod-media-player"]'
                                               '//img/@src').extract()[0].strip())
        except IndexError:
            # No product image present on the page.
            image_url = ''

        options = hxs.select('//div[@id="prod-multi-product-types"]')

        items = []
        if options:
            # Multi-variant page: one item per product type.
            products = options.select('.//div[@class="product-type"]')
            for product in products:
                opt_name = product.select('.//h3/text()').extract()[0].strip()
                try:
                    stock = product.select('//div[contains(@class, "mod-stock-availability")]'
                                           '//p/strong/text()').re(r'\d+')[0]
                except IndexError:
                    stock = 0

                loader = ProductLoader(item=Product(), selector=product)
                # SKU label varies between "Model name" and "Model Number".
                sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model name")]/following-sibling::dd/text()').extract()
                if not sku:
                    sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model Number")]/following-sibling::dd/text()').extract()
                if sku:
                    loader.add_value('sku', sku[0].strip())
                loader.add_xpath('identifier', './/div[contains(@class, "mod-product-code")]/p/text()')
                loader.add_value('name', '%s %s' % (name, opt_name))
                loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
                loader.add_value('image_url', image_url)
                loader.add_value('brand', brand)
                loader.add_value('url', response.url)
                loader.add_xpath('price', './/p[@class="price"]/strong/text()')
                loader.add_value('stock', stock)
                item = loader.load_item()
                metadata = ToyMonitorMeta()
                metadata['reviews'] = []
                item['metadata'] = metadata
                items.append(item)
        else:
            # Single-variant page: several price markups are tried in turn.
            price = ''.join(hxs.select('//ul/li/strong[@class="price"]/text()').extract()).strip()
            if not price:
                price = ''.join(hxs.select('//span[@class="now-price"]/text()').extract()).split()
                if not price:
                    price = ''.join(hxs.select('//div[@id="prod-price"]//strong/text()').extract()).split()

            try:
                stock = hxs.select('//div[contains(@class, "mod-stock-availability")]'
                                   '//p/strong/text()').re(r'\d+')[0]
            except IndexError:
                stock = 0

            loader = ProductLoader(item=Product(), response=response)
            sku = hxs.select(u'//div[@id="prod-product-code"]//h2[contains(text(),"Product code")]/following-sibling::p/text()').extract()
            if sku:
                loader.add_value('sku', sku[0].strip())
            loader.add_xpath('identifier', '//div[@id="prod-product-code"]/p/text()')
            loader.add_value('name', name)
            loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('url', response.url)
            loader.add_value('price', price)
            loader.add_value('stock', stock)

            item = loader.load_item()
            metadata = ToyMonitorMeta()
            metadata['reviews'] = []
            item['metadata'] = metadata

            if item.get('identifier'):
                items.append(item)

        if items:
            # All variants share the page-level Bazaarvoice product id.
            product_id = response.xpath('//div/@data-product-id').extract()[0]
            reviews_url = 'http://johnlewis.ugc.bazaarvoice.com/7051redes-en_gb/%s/reviews.djs?format=embeddedhtml&page=1&scrollToTop=true'
            yield Request(reviews_url % product_id, callback=self.parse_review_page, meta={'items': items, 'url': response.url})

    def parse_review_page(self, response):
        """Append each review on this Bazaarvoice page to every item.

        Follows review pagination; the items themselves are only yielded
        once the last review page has been consumed.
        """
        items = response.meta.get('items', '')
        url = response.meta.get('url', '')
        # The reviews arrive as HTML embedded in a JS payload.
        hxs = HtmlXPathSelector(text=self._extract_html(response))
        reviews = hxs.xpath('//div[@class="BVRRReviewDisplayStyle5"]')
        for review in reviews:
            l = ReviewLoader(item=Review(), response=response, date_format='%d/%m/%Y')
            rating = review.select(".//span[contains(@class,'BVRRRatingNumber')]/text()").extract()[0]
            date = review.select(".//span[contains(@class,'BVRRValue BVRRReviewDate')]/text()").extract()[0]
            title = review.select(".//span[contains(@class,'BVRRReviewTitle')]/text()").extract()
            review_text = ' '.join(review.select(".//span[contains(@class,'BVRRReviewText')]//text()").extract())

            if title:
                full_text = title[0].strip() + '\n' + review_text.strip()
            else:
                full_text = review_text.strip()

            l.add_value('rating', rating)
            l.add_value('url', url)
            # NOTE(review): assumes `from datetime import datetime` import
            # style; other spiders in this project use datetime.datetime —
            # confirm against the file's imports.
            l.add_value('date', datetime.strptime(date, '%d %B %Y').strftime('%d/%m/%Y'))
            l.add_value('full_text', full_text)
            for item in items:
                item['metadata']['reviews'].append(l.load_item())

        next = hxs.xpath('//span[@class="BVRRPageLink BVRRNextPage"]/a/@data-bvjsref').extract()
        if next:
            yield Request(next[0], callback=self.parse_review_page, meta={'items': items, 'url': url})
        else:
            for item in items:
                yield item

    def _extract_html(self, response):
        """Pull the embedded review HTML out of the Bazaarvoice JS payload.

        HACK: string-splits the "var materials=" line instead of parsing
        the JS; brittle against upstream format changes.
        """
        review_html = ''
        for line in response.body.split('\n'):
            if 'var materials=' in line:
                review_html = line.split('"BVRRSecondaryRatingSummarySourceID":" ')[-1].split('\n}')[0].replace('\\', '')
        return review_html
Exemplo n.º 3
0
class LittleWoodsSpider(BaseSpider):
    """Spider for littlewoods.com toy listings.

    Walks the navigation categories, paginates listings, and builds one
    Product per page (or one per option when the page embeds a
    multi-option stock matrix). Reviews are fetched from the Bazaarvoice
    batch API and attached to each item's metadata before yielding.
    """
    name = 'toymonitor-littlewoods.com'
    allowed_domains = ['littlewoods.com', 'api.bazaarvoice.com']
    start_urls = ['http://www.littlewoods.com/toys/e/b/5132.end']
    errors = []
    brand_selector = BrandSelector(errors)
    #field_modifiers = {'brand': brand_selector.get_brand}

    def parse(self, response):
        """Queue every navigation category for listing extraction."""
        categories = response.xpath('//div[@id="navigation"]//a/@href').extract()
        for category in categories:
            yield Request(response.urljoin(category), callback=self.parse_category)

    def parse_category(self, response):
        """Queue every product on the listing page, then the next page."""
        products = response.xpath('//a[@class="productTitle"]/@href').extract()
        for product in products:
            yield Request(product, callback=self.parse_product, meta=response.meta)

        next = response.xpath('//a[@class="paginationNext"]/@href').extract()
        if next:
            next = response.urljoin(next[0])
            yield Request(next, callback=self.parse_category, meta=response.meta)

    def parse_product(self, response):
        """Build base Product data, expand per-option variants, then fetch reviews."""
        import ast  # local import: the file's import block is defined elsewhere

        loader = ProductLoader(item=Product(), response=response)
        name = ''.join(response.xpath('//h1[@class="productHeading"]//text()').extract())
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta.get('brand', ''))
        # The category only appears inside an inline script block.
        category = re.findall(u',\\ncategory: "(.*)",', response.body)
        category = category[0] if category else ''
        loader.add_value('category', category)
        loader.add_xpath('sku', '//span[@id="catalogueNumber"]/text()')
        loader.add_xpath('identifier', '//span[@id="catalogueNumber"]/text()')
        image_url = response.xpath('//div[@id="amp-originalImage"]/img/@src').extract()
        promotion = None
        if image_url:
            loader.add_value('image_url', image_url[0])
            # Promotions are only signalled through the image filename.
            if '3for2' in image_url[0]:
                promotion = '3 for 2'

        price = ''.join(response.xpath('//div[@class="priceNow"]//text()').extract())
        loader.add_value('price', price)

        out_of_stock = 'IN STOCK' not in ''.join(response.xpath('//meta[@property="product:availability"]/@content').extract()).upper()
        if out_of_stock:
            loader.add_value('stock', '0')

        item = loader.load_item()
        metadata = ToyMonitorMeta()
        ean = ''.join(response.xpath('//span[@id="productEAN"]/text()').extract()).strip()
        if ean:
            metadata['ean'] = ean
        metadata['reviews'] = []
        if promotion is not None:
            metadata['promotions'] = promotion
        item['metadata'] = metadata

        items = []

        # Each customerSelection list is one option dimension (e.g. colour,
        # size); option rows start with that many display values.
        amount_options = len(response.xpath('//ul[@class="customerSelection"]'))
        options = []
        # The option matrix is embedded in an inline script as a JS array:
        # "stockMatrix = [ [...], [...] ]; sdg.productOptions".
        options_text = re.findall('stockMatrix = \[(.*) \]; sdg.productOptions', ' '.join(response.body.split()))
        if options_text:
            options_text = re.findall('(.*)]; sdg.productOptions', options_text[0])
            for line in options_text[0].split(' , '):
                if '"sku' in line:
                    option = re.search('\[(.*)\]', line)
                    if option:
                        # ast.literal_eval (not eval) so scraped page content
                        # can only yield Python literals, never run code.
                        literal = option.group(0).replace('null', 'None')
                        options.append(ast.literal_eval(literal))

        if len(options) > 1:
            for option in options:
                option_item = deepcopy(item)

                # Row layout: display values, then the variant id; the price
                # sits at a fixed offset from the end.
                name = ' '.join(option[:amount_options])
                identifier = option[amount_options]
                price = option[-5]

                option_item['name'] += ' ' + name
                option_item['identifier'] += '-' + identifier
                option_item['price'] = extract_price(price)
                out_of_stock = [value for value in option if value and 'out of stock' in value.lower()]
                if out_of_stock:
                    option_item['stock'] = 0
                items.append(option_item)
        else:
            items.append(item)

        product_id = re.findall('productId: "(.*)"', response.body)[0]

        reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=2x4wql4zeys4t8mu5x3x4rb1a&apiversion=5.5&displaycode=10628-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A'+product_id+'&filter.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&sort.q0=isfeatured%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_18822'

        request = Request(reviews_url, meta={'items': items, 'offset': 0, 'url': response.url},
                              callback=self.parse_reviews)
        yield request

    def parse_reviews(self, response):
        """Attach a page of Bazaarvoice reviews to every item; paginate.

        Items are only yielded once a page returns fewer than the
        100-review page size (i.e. the last page).
        """
        items = response.meta['items']
        url = response.meta['url']
        # Strip the JSONP wrapper ("bv_...({...});") down to bare JSON.
        body = response.body.strip().partition('(')[-1].replace('});', '}').replace('})', '}')
        json_body = json.loads(body)

        reviews = json_body['BatchedResults']['q0']['Results']
        for review in reviews:
            review_loader = ReviewLoader(item=Review(), response=response, date_format="%d/%m/%Y")
            # NOTE(review): assumes `import datetime` style; JohnLewis spider
            # uses `datetime.strptime` directly — confirm the file's imports.
            review_date = datetime.datetime.strptime(review['SubmissionTime'].split('.')[0], '%Y-%m-%dT%H:%M:%S')
            review_loader.add_value('date', review_date.strftime('%d/%m/%Y'))

            title = review['Title']
            text = review['ReviewText']

            if title:
                full_text = title + '\n' + text
            else:
                full_text = text

            pros = review['Pros']
            cons = review['Cons']
            if pros:
                full_text += '\nPros: ' + ', '.join(pros)
            if cons:
                full_text += '\nCons: ' + ', '.join(cons)

            review_loader.add_value('full_text', full_text)
            rating = review['Rating']
            review_loader.add_value('rating', rating)
            review_loader.add_value('url', url)

            for item in items:
                item['metadata']['reviews'].append(review_loader.load_item())

        if len(reviews) == 100:
            offset = response.meta['offset'] + 100

            next_reviews = add_or_replace_parameter(response.url, "offset.q0", str(offset))
            request = Request(next_reviews, meta={'items': items, 'offset': offset, 'url': url},
                              callback=self.parse_reviews)
            yield request
        else:
            for item in items:
                yield item
Exemplo n.º 4
0
class RakutenCoUk(BaseSpider):
    """Spider for toy listings on the rakuten.co.uk marketplace.

    Walks the toy subcategories, paginates product listings, builds one
    Product per variant from the page's embedded variant JSON, and
    attaches on-site reviews to each item before yielding.
    """
    name = 'toymonitor-rakuten.co.uk'
    allowed_domains = ['www.rakuten.co.uk']
    start_urls = ['http://www.rakuten.co.uk/category/931/?l-id=gb_product_allcat_17',]
    errors = []
    brand_selector = BrandSelector(errors)
    #field_modifiers = {'brand': brand_selector.get_brand}

    def parse(self, response):
        """Queue every subcategory, then this page itself for products."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        subcategory_urls = hxs.select('//li[@class="b-open"]//li/a/@href').extract()
        for url in subcategory_urls:
          yield Request(urljoin(base_url, url))

        # Re-request the same URL so its own listings are also parsed.
        yield Request(response.url, callback = self.parse_products, dont_filter=True)

    def parse_products(self, response):
        """Queue pagination links and individual product pages."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # parse pages
        pages = hxs.select('//div[contains(@class, "b-pagination")]/ul/li/a/@href').extract()
        for page in pages:
            yield Request(urljoin_rfc(base_url, page), meta=response.meta, callback=self.parse_products)

        # parse products
        items = hxs.select('//li[@class="b-item"]/div/div[@class="b-img"]/div/a/@href').extract()
        for item in items:
            yield Request(urljoin_rfc(base_url, item), callback=self.parse_product, meta=response.meta)

    def parse_product(self, response):
        """Build one Product per variant, then fetch reviews if present."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        mpn = hxs.select('//span[@class="b-item"]').re("MPN: ([0-9]+)")
        ean = hxs.select('//span[@class="b-item"]').re("EAN: ([0-9]+)")
        sku = hxs.select('//input[@name="sku"]/@value').extract()
        name = hxs.select('//h1[@class="b-ttl-main"]/text()').extract()[0]
        dealer_name = "".join(hxs.select('//h2[@id="auto_shop_info_name"]//text()').extract()).strip()
        brand = hxs.select('.//span[@itemprop="brand"]/text()').extract()
        if brand:
          brand = brand[0].strip()
        else:
          brand = response.meta.get('brand')

        categories = hxs.select('//ul[@class="b-breadcrumb"]/li/a/text()').extract()
        image_url = hxs.select('//img[@itemprop="image"]/@data-frz-src').extract()

        # Variant data is embedded as JSON in an inline script block.
##        options = hxs.select('//script[contains(text(), "var variant_details")]/text()').re('var variant_details = (.*);\n')
        options = hxs.select('//script[contains(text(), "var variant_details")]/text()').extract()
        if options:
            options = options[0].replace('&quot;', "'")
            options = re.findall('var variant_details = (.*);\n', options)
            variants = json.loads(options[0])
        else:
            # No variants: synthesise a single-variant record with the same
            # shape as the embedded JSON so the loop below works unchanged.
            identifier = hxs.select('//input[@name="item_id"]/@value').extract()[0]
            price = hxs.select('//div[@class="b-product-main"]//meta[@itemprop="price"]/@content').extract()[0]
            variants = [{'itemVariantId': identifier, 'sku': sku, 'variantValues': [], 'defaultPricing': {'price': price}}]

        items = []
        for variant in variants:
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', variant['itemVariantId'])
            loader.add_value('name', " ".join([name] + variant.get('variantValues', [])))
            loader.add_value('sku', variant['sku'])
            loader.add_value('url', response.url)
            loader.add_value('price', variant['defaultPricing']['price'])
            loader.add_value('dealer', dealer_name)
            loader.add_value('category', categories)
            if brand:
                loader.add_value('brand', brand)
            if image_url:
                loader.add_value('image_url', image_url[0])
            product = loader.load_item()

            metadata = ToyMonitorMeta()
            metadata['reviews'] = []
            product['metadata'] = metadata

            # NOTE(review): this second metadata assignment is redundant —
            # `metadata` is already attached above and mutated in place.
            if mpn or ean:
                if mpn:
                    metadata['mpn'] = mpn[0]
                if ean:
                    metadata['ean'] = ean[0]
                product['metadata'] = metadata
            items.append(product)

        reviews_url = response.xpath('//a[contains(text(), "See All Reviews")]/@href').extract()
        if reviews_url:
            yield Request(reviews_url[0], callback=self.parse_reviews, meta={'items': items, 'url': response.url})
        else:
            for item in items:
                yield item


    def parse_reviews(self, response):
        """Append each on-site review to every item; follow pagination.

        Items are only yielded once the last review page (no right-arrow
        link) has been consumed.
        """
        items = response.meta.get('items', '')
        url = response.meta.get('url', '')

        reviews = response.xpath('//div[contains(@class, "b-review")]')
        for review in reviews:
            l = ReviewLoader(item=Review(), response=response, date_format='%d/%m/%Y')
            # Rating equals the number of filled star icons.
            rating = len(review.xpath('.//span/span[contains(@class, "b-star-full")]'))
            date = review.xpath('.//div[@class="b-content"]/span[@class="b-text-sub"]/text()').re('\d+/\d+/\d+')[0]
            title = review.xpath('.//div[@class="b-head"]/div/text()').extract()
            review_text = ' '.join(review.xpath('.//div[@class="b-content" and not(child::*)]/text()').extract())

            if title:
                full_text = title[0].strip() + '\n' + review_text.strip()
            else:
                full_text = review_text.strip()

            l.add_value('rating', rating)
            l.add_value('url', url)
            l.add_value('date', date)
            l.add_value('full_text', full_text)
            for item in items:
                item['metadata']['reviews'].append(l.load_item())

        next = response.xpath('//a[@id="right_arrow"]/@href').extract()
        if next:
            yield Request(next[0], callback=self.parse_reviews, meta={'items': items, 'url': url})
        else:
            for item in items:
                yield item