Exemplo n.º 1
0
    def _scrape_product_links(self, response):
        """Yield ``(product_url, SiteProductItem)`` pairs for every product
        in the JSON payload embedded in the listing response.

        Maps raw JSON keys onto item fields via ``_PRODUCT_TO_DATA_KEYS``,
        normalizes the image URL, parses the GBP price (the site reports
        either pound amounts with '£' or pence amounts suffixed 'p'), and
        absolutizes relative product URLs.
        """
        data = WaitroseProductsSpider._get_data(response)
        for product_data in data['products']:
            product = SiteProductItem()

            # Copy each mapped field, skipping the literal string 'null'
            # the site uses to mark missing values.
            for product_key, data_key in self._PRODUCT_TO_DATA_KEYS.items():
                value = product_data.get(data_key, 'null')
                if value != 'null':
                    product[product_key] = value

            # BUG FIX: the default used to be the *string* 'None', which is
            # truthy, so a product without an image ended up with a bogus
            # 'http:///None' image_url. Default to a real None instead.
            image_url = product.get('image_url')
            if image_url:
                product['image_url'] = urlparse.urljoin('http://', image_url)

            # This one is not in the mapping since it requires transformation.
            #product['upc'] = int(product_data['productid'])

            if product.get('price', None):
                price = product['price']
                # Normalize '£' to the same 'p' marker used for pence so a
                # single regex handles both formats.
                price = price.replace('£', 'p')
                price = re.findall(r'(p? *[\d ,.]+ *p?) *', price)
                price = price[0] if price else ''
                if price.endswith('p'):
                    # Trailing 'p' means a pence amount, e.g. '50p' -> 0.50.
                    price = '0.' + price.strip()
                if 'p' in price:
                    price = re.sub(r'[p ,]', '', price)
                    product['price'] = Price(priceCurrency='GBP', price=price)
                else:
                    self.log('Unknown price format at %s' % response)

            if not product.get('url', '').startswith('http'):
                product['url'] = urlparse.urljoin('http://www.waitrose.com',
                                                  product['url'])

            yield product['url'], product
Exemplo n.º 2
0
 def _parse_single_product(self, response):
     """Parse the ``productdata`` meta tag of a single product page into a
     populated ``SiteProductItem``.

     The tag holds pipe-separated JSON objects; they are rewritten into a
     JSON array ("a|b|" -> "[a,b]") and the first entry is used.
     """
     productdata = "[" + is_empty(
         response.xpath('//meta[@name="productdata"]/@content').extract(),
         "")[:-1].replace("|", ",") + "]"
     productdata = is_empty(json.loads(productdata))
     product = SiteProductItem()
     if productdata:
         product["title"] = productdata["name"]
         product["is_out_of_stock"] = not productdata["available"]
         product["url"] = "http://www.tesco.com/groceries/product/details/"\
             "?id=" + str(productdata["productId"])
         # Pull the reseller id back out of the URL we just built.
         regex = r"id=([A-Z0-9\-]+)"
         reseller_id = re.findall(regex, product.get('url', ''))
         reseller_id = reseller_id[0] if reseller_id else None
         cond_set_value(product, "reseller_id", reseller_id)
         try:
             product["price"] = Price(price=productdata["price"],
                                      priceCurrency="GBP")
         except (KeyError, TypeError, ValueError):
             # BUG FIX: this was a bare ``except:`` which also swallowed
             # SystemExit/KeyboardInterrupt. Keep the deliberate
             # best-effort behaviour, but only for plausible data errors.
             pass
         product["image_url"] = productdata["mediumImage"]
         product["search_term"] = ""
         product["brand"] = is_empty(self.brand_from_title(
             product["title"]))
         product["site"] = is_empty(self.allowed_domains)
     if self.product_url:
         product['is_single_result'] = True
         # A direct product-URL request has no search term to report.
         if product.get("search_term"):
             del product['search_term']
     return product
Exemplo n.º 3
0
    def _scrape_product_links(self, response):
        """Yield ``(url_or_None, SiteProductItem)`` pairs for the products
        on a listing page.

        Mobile user agents get a link-only extraction (items are filled in
        later from the product pages); desktop pages carry pipe-separated
        JSON blobs in a ``productdata`` meta tag that are parsed here.
        """
        # To populate the description, fetching the product page is necessary.

        if self.user_agent_key not in ["desktop", "default"]:
            links = response.xpath(
                '//section[contains(@class,"product_listed")]'
                '//div[contains(@class,"product_info")]//a/@href').extract()

            if not links:
                self.log(
                    "[Mobile] Found no product data on: %s" % response.url,
                    ERROR)

            for link in links:
                yield urlparse.urljoin(response.url, link), SiteProductItem()
        else:
            url = response.url

            # This will contain everything except for the URL and description.
            product_jsons = response.xpath(
                '//meta[@name="productdata"]/@content').extract()

            if not product_jsons:
                self.log("Found no product data on: %s" % url, ERROR)
                # BUG FIX: bail out after logging — the original fell
                # through and crashed with IndexError on product_jsons[0].
                return

            product_links = response.css(
                ".product > .desc > h2 > a ::attr('href')").extract()
            if not product_links:
                self.log("Found no product links on: %s" % url, ERROR)

            # One pipe-separated JSON blob per product, paired positionally
            # with the extracted links; zip stops at the shorter sequence.
            for product_json, product_link in zip(product_jsons[0].split('|'),
                                                  product_links):
                prod = SiteProductItem()
                cond_set_value(prod, 'url',
                               urlparse.urljoin(url, product_link))

                product_data = json.loads(product_json)

                cond_set_value(prod, 'price', product_data.get('price'))
                cond_set_value(prod, 'image_url',
                               product_data.get('mediumImage'))

                #prod['upc'] = product_data.get('productId')
                if prod.get('price', None):
                    prod['price'] = Price(price=str(prod['price']).replace(
                        ',', '').strip(),
                                          priceCurrency='GBP')

                try:
                    brand, title = self.brand_from_title(product_data['name'])
                    cond_set_value(prod, 'brand', brand)
                    cond_set_value(prod, 'title', title)
                except KeyError:
                    raise AssertionError(
                        "Did not find title or brand from JS for product: %s" %
                        product_link)

                yield None, prod
Exemplo n.º 4
0
 def _scrape_product_links(self, response):
     """Yield ``(Request, SiteProductItem)`` pairs for each product box on
     the listing page.

     Items are pre-populated from the box markup; pages whose box yields
     no brand are recorded via ``dump_url_to_file``. Requests rotate
     through ``USER_AGENT_LIST`` round-robin.
     """
     for box in self._fetch_product_boxes(response):
         link = urlparse.urljoin(response.url, self._link_from_box(box))
         item = SiteProductItem()
         self._populate_from_box(response, box, item)
         if not item.get('brand', None):
             dump_url_to_file(response.url)
         request_meta = response.meta.copy()
         request_meta['product'] = item
         # Round-robin user-agent rotation: take from the front, put back
         # at the end.
         agent = USER_AGENT_LIST.pop(0)
         USER_AGENT_LIST.append(agent)
         req = Request(link, callback=self.parse_product, meta=request_meta)
         req.headers.setdefault('User-Agent', agent)
         yield req, item
Exemplo n.º 5
0
    def _scrape_product_links(self, response):
        """Yield ``(product_url, SiteProductItem)`` pairs from the search
        results list.

        Extracts title and a EUR price per result; the locale comes from
        the page's ``lang`` attribute with 'fr-FR' as a fallback.
        """
        products = response.xpath('//ol[contains(@class, "search-results")]'
                                  '//div[contains(@class, "sc_result_list")]')
        if not products:
            self.log("Found no product links.", ERROR)

        for product in products:
            prod_links = product.xpath(
                './/div[contains(@class, "sc_result_title")]//a/@href'
            ).extract()
            if not prod_links:
                self.log(
                    "Failed to extract product link for item: %r" %
                    (product.extract(), ), ERROR)
                continue
            prod_link = prod_links[0]

            item = SiteProductItem()

            cond_set(
                item,
                'title',
                product.css('div.sc_result_title a::text').extract(),
                conv=string.strip,
            )

            cond_set(
                item,
                'price',
                product.css('div.sc_result_price::text').re(r'(\d.+)'),
            )
            if item.get('price', None):
                if '€' not in item['price']:
                    # BUG FIX: the original "'Unknown currency at' %
                    # response.url" had no %s placeholder and raised
                    # TypeError instead of logging.
                    self.log('Unknown currency at %s' % response.url)
                else:
                    item['price'] = Price(price=item['price'].replace(
                        ',', '').replace('€', '').strip(),
                                          priceCurrency='EUR')

            cond_set(
                item,
                'locale',
                response.xpath('//html/@lang').extract(),
                conv=string.strip,
            )
            cond_set_value(item, 'locale', 'fr-FR')  # Default.

            yield prod_link, item
Exemplo n.º 6
0
    def _scrape_product_links(self, response):
        """Yield ``(api_url, SiteProductItem)`` pairs, one per entry in the
        JSON listing body of the response.
        """
        payload = json.loads(response.body_as_unicode())
        for entry in payload['items']:
            product = SiteProductItem()
            product['title'] = entry['itemName']
            product['brand'] = entry['brandName']
            product['site'] = 'http://www.asda.com/'

            # Hardcoded, store seems not to have out of stock products
            product['is_out_of_stock'] = False
            raw_price = entry['price']
            product['price'] = raw_price
            if raw_price:
                cleaned = raw_price.replace('£', '').replace(',', '').strip()
                product['price'] = Price(price=cleaned, priceCurrency='GBP')

            # FIXME Verify by comparing a prod in another site.
            review_count = int(entry['totalReviewCount'])
            stars = float(entry['avgStarRating'])
            product['buyer_reviews'] = BuyerReviews(
                num_of_reviews=review_count,
                average_rating=stars,
                rating_by_star={})
            product['model'] = entry['cin']

            picture = entry.get('imageURL')
            if not picture and "images" in entry:
                picture = entry.get('images').get('largeImage')
            product['image_url'] = picture

            item_id = is_empty(re.findall("itemid=(\d+)", entry['productURL']))
            if item_id and "search_term" in response.meta:
                product['url'] = self.PRODUCT_LINK % (
                    urllib.quote(response.meta["search_term"]), item_id)
            elif "imageURL" in entry:
                # NOTE(review): falls back to the *image* URL as the product
                # URL — looks suspicious; confirm this is intentional.
                product["url"] = entry['imageURL']

            product['locale'] = "en-GB"

            yield self.API_URL.format(id=entry['id']), product