示例#1
0
    def parse_products(self, response):
        """Parse a product-detail JSON response and yield one grocery item.

        The payload is decoded with ``parse_json_response``. The lane whose
        ``type`` equals ``'ProductDetailLane'`` supplies the product data.
        Nothing is yielded when that lane is absent, has no items, or its
        first item is empty.

        Args:
            response: Response carrying the JSON payload; ``response.url`` is
                stored on the yielded item.

        Yields:
            The object returned by ``create_grocery_bot_item`` for the
            product, tagged with the store name ``'albert heijn ah'``.
        """
        json_res = parse_json_response(response)
        page_title = json_res['title']
        lanes = json_res[u'_embedded'][u'lanes']

        # next() needs an explicit default here: without one, a missing lane
        # raises StopIteration, which a generator turns into RuntimeError.
        product_lane = next(
            (lane for lane in lanes if lane['type'] == 'ProductDetailLane'),
            None)
        if product_lane is None:
            return

        # Check for an empty list before indexing, then for a falsy entry.
        items = product_lane['_embedded']['items']
        product_details = items[0] if items else None
        if not product_details:
            return

        product = product_details['_embedded']['product']
        product_name = product['description']

        description = product['details']['summary']
        if description is not None:
            # Drop the '[list]' and '[*]' markup tokens from the summary.
            description = description.replace('[list]', '').replace('[*]', '')

        # 'unitSize' holds either a piece count (contains "stuk") or a value
        # that is handed to WeightStandardizer.
        size = weight_q = weight_ind = None
        size_or_weight = product['unitSize']
        if size_or_weight is not None:
            if "stuk" in size_or_weight:
                size = size_or_weight
            else:
                weight_q = WeightStandardizer.standardize_quantity(
                    size_or_weight)
                weight_ind = WeightStandardizer.standardize_indicator(
                    size_or_weight)

        # Only the first image is used; it may be missing or empty.
        img_src = None
        images = product['images']
        if images and images[0]:
            img_src = images[0]['link']['href']

        price = product['priceLabel']['now']

        yield create_grocery_bot_item(product_name, page_title,
                                      description,
                                      'albert heijn ah', response.url,
                                      dt.now(), weight_q, weight_ind, size,
                                      '', price, img_src)
示例#2
0
    def parse_product(self, response):
        """Extract one product from a product page and yield it as an item.

        Every field is read with CSS/XPath selectors. A field whose element
        is missing from the page is passed on as ``None`` instead of aborting
        the whole callback.

        Args:
            response: Response for the product page; must support ``css()``
                and expose ``url``.

        Yields:
            The object returned by ``create_grocery_bot_item`` for the
            product, tagged with the store name ``'vomar'``.
        """
        title = response.css('div.container div.container h1::text').get()

        page_title = response.css('title::text').get()
        if page_title is not None:
            page_title = page_title.replace('/', '').replace('_', '')

        img_src = response.css('div#productImage').css("img").xpath(
            "@src").get()
        if img_src is not None:
            img_src = 'www.vomar.nl' + img_src

        # The first product-info column holds h5 headers and p paragraphs;
        # the paragraph at the index of the 'Beschrijving' header is taken
        # as the description.
        description = None
        info_columns = response.css(
            'div.container div.container div.productInfo div.col-md-6')
        if info_columns:
            info = info_columns[0]
            headers = info.css('h5::text').getall()
            paragraphs = info.css('p::text')
            if 'Beschrijving' in headers:
                desc_index = headers.index('Beschrijving')
                if desc_index < len(paragraphs):
                    description = paragraphs[desc_index].get()

        # The None check must come before any string method is called on
        # the scraped text.
        size = weight_q = weight_ind = None
        weight_or_size = response.css('div.unitQuantity span::text').get()
        if weight_or_size is not None:
            weight_or_size = weight_or_size.strip()
            weight_or_size = weight_or_size.replace("Stuks",
                                                    "").replace("Stuk", "")
            if "Per" in weight_or_size:
                size = weight_or_size
            else:
                weight_q = WeightStandardizer.standardize_quantity(
                    weight_or_size)
                weight_ind = WeightStandardizer.standardize_indicator(
                    weight_or_size)

        category = ' '.join(response.css('ol.breadcrumb li ::text').getall())
        price = ''.join(
            response.css(
                'div.priceUnitQuantity div.price span::text').getall())

        yield create_grocery_bot_item(title, page_title,
                                      description, 'vomar', response.url,
                                      dt.now(), weight_q, weight_ind, size,
                                      category, price, img_src)
示例#3
0
    def save_product(self, response):
        """Extract one product from a product page and yield it as an item.

        The product name is the last breadcrumb link text and the category is
        the list of the first two breadcrumb link texts. The description is
        always ``None`` for this store.

        Args:
            response: Response for the product page; must support ``css()``
                and expose ``url``.

        Yields:
            The object returned by ``create_grocery_bot_item`` for the
            product, tagged with the store name ``'plus'``.

        Raises:
            IndexError: If the page has no breadcrumb links, since the
                product name cannot be determined.
        """
        breadcrumbs = response.css("li.page-header__breadcrumb").css(
            "a::text").getall()
        product_name = breadcrumbs[-1]
        # Slicing never raises, so no error handling is needed here.
        category = breadcrumbs[:2]

        page_title = response.css("title::text").get()

        # Build the absolute image URL only when the attribute is present.
        img_path = response.css("img.lazy").xpath("@data-src").get()
        img_src = None
        if img_path is not None:
            img_src = "https://www.plus.nl/" + img_path

        description = None

        # The None check must come before the substring test, which would
        # raise TypeError on None.
        size = weight_q = weight_ind = None
        number_of_units = response.css(
            'div.product-detail-packing::text').get()
        if number_of_units is not None:
            if ' \n' in number_of_units:
                number_of_units = number_of_units.strip(' \n')
            if 'stuks' in number_of_units:
                size = number_of_units
            else:
                weight_q = WeightStandardizer.standardize_quantity(
                    number_of_units)
                weight_ind = WeightStandardizer.standardize_indicator(
                    number_of_units)

        # Price is "<last span text>.<sup text>"; when either part is missing
        # fall back to the first span text (or None if there is none).
        euro_texts = response.css('span.price span::text').getall()
        cents = response.css('span.price sup::text').get()
        if euro_texts and cents is not None:
            price = euro_texts[-1] + '.' + cents
        else:
            print("COULD NOT GET TRUE PRICE")
            price = euro_texts[0] if euro_texts else None

        yield create_grocery_bot_item(product_name, page_title,
                                      description, 'plus', response.url,
                                      dt.now(), weight_q, weight_ind, size,
                                      category, price, img_src)
示例#4
0
    def parse_product(self, response):
        """Save a product page to disk, yield its item, and follow tile links.

        The raw response body is written to ``data/lidl/lidl-<title>.html``.
        One item is yielded for the page, followed by one follow-up request
        (handled by this same method) per product-tile link on the page.

        Args:
            response: Response for the product page; must support ``css()``
                and ``follow()`` and expose ``url`` and ``body``.

        Yields:
            The object returned by ``create_grocery_bot_item``, then the
            requests returned by ``response.follow``.
        """
        title = response.css('h1.attributebox__headline--h1::text').get()

        # The title is scraped text: neutralise path separators so it cannot
        # point outside data/lidl/ or into a non-existent sub-directory.
        # str() keeps the previous '%s' rendering of a missing (None) title.
        safe_title = str(title).replace('/', '_').replace('\\', '_')
        filename = 'data/lidl/lidl-%s.html' % safe_title
        with open(filename, 'wb') as f:
            f.write(response.body)

        yield create_grocery_bot_item(title, response.url, filename, dt.now())

        tile_links = response.css(
            'div.product.product--tile a.product__body::attr(href)'
        ).getall()
        for href in tile_links:
            yield response.follow(href, callback=self.parse_product)
示例#5
0
    def parse_products(self, response):
        """Extract one product from a product page and yield it as an item.

        The category is always ``None`` for this store. The price is ``None``
        when the page has no price text.

        Args:
            response: Response for the product page; must support ``css()``
                and expose ``url``.

        Yields:
            The object returned by ``create_grocery_bot_item`` for the
            product, tagged with the store name ``'jumbo'``.
        """
        product_name = response.css("div.jum-column-main").css("h1::text").get()
        page_title = response.css('title::text').get()

        # First paragraph text inside the summary-description block.
        description = response.css('div.jum-summary-description p::text').get()

        image_url = response.css('div.jum-product-image-figure').css(
            "img").xpath("@data-jum-src").get()

        # A pack size with a weight/volume unit goes through
        # WeightStandardizer; anything else is kept verbatim as the size.
        size = weight_q = weight_ind = None
        number_of_units = response.css(
            'div.jum-sale-price-info span.jum-pack-size::text').get()
        if number_of_units is not None:
            unit_markers = (" g", " l", " kg", " ml")
            if any(marker in number_of_units for marker in unit_markers):
                weight_q = WeightStandardizer.standardize_quantity(
                    number_of_units)
                weight_ind = WeightStandardizer.standardize_indicator(
                    number_of_units)
            else:
                size = number_of_units

        category = None

        # Query the price texts once. The previously computed (and unused)
        # price-per-unit lookup indexed element [1] and could abort the whole
        # item with IndexError, so it is no longer evaluated.
        price = None
        price_texts = response.css('span.jum-price-format::text').getall()
        if price_texts:
            price_euro = price_texts[0]
            # NOTE(review): the cents were read with .get() on the same
            # selector, which returns the same first text node as the euros,
            # so both halves of the price are identical. The cents probably
            # need their own selector -- confirm against the page markup.
            price_cent = price_texts[0]
            price = price_euro + '.' + price_cent

        yield create_grocery_bot_item(product_name, page_title, description,
                                      'jumbo', response.url, dt.now(),
                                      weight_q, weight_ind, size, category,
                                      price, image_url)
示例#6
0
    def save_product(self, response):
        """Extract one product from a product page and yield it as an item.

        Image URL, category and price are each passed on as ``None`` when the
        elements they are read from are missing, matching the existing
        handling of a missing weight/size block.

        Args:
            response: Response for the product page; must support ``css()``
                and expose ``url``.

        Yields:
            The object returned by ``create_grocery_bot_item`` for the
            product, tagged with the store name ``'coop'``.
        """
        product_name = response.css('h1.altHead::text').get()
        page_title = response.css('title::text').get()
        # Only a p that is an immediate child of dd.active is a description.
        description = response.css('dl.definitionList dd.active>p::text').get()

        # Weight and size share one element on this site; text containing
        # "stuk" is kept as the size, anything else goes to WeightStandardizer.
        size = weight_q = weight_ind = None
        weight_or_size = response.css('header.gi h2.subHead::text').get()
        if weight_or_size is not None:
            if "stuk" in weight_or_size:
                size = weight_or_size
            else:
                weight_q = WeightStandardizer.standardize_quantity(
                    weight_or_size)
                weight_ind = WeightStandardizer.standardize_indicator(
                    weight_or_size)

        # Keep the part of the data-srcset value before the first space.
        img_src = None
        srcset = response.css('div.b0_12').css('img').xpath(
            '@data-srcset').get()
        if srcset is not None:
            img_src = srcset.split(' ')[0]

        # The category is the second-to-last breadcrumb text, if there is one.
        category = None
        crumbs = response.css('ol.cf span::text')
        if len(crumbs) >= 2:
            category = crumbs[-2].get()

        # Join both parts of the price and replace commas by dots to make
        # for easy float parsing; a half-missing price is reported as None.
        price = None
        price_main = response.css('ins.price::text').get()
        price_sup = response.css('ins.price span.sup::text').get()
        if price_main is not None and price_sup is not None:
            price = (price_main + price_sup).replace(',', '.')

        yield create_grocery_bot_item(product_name, page_title, description,
                                      'coop', response.url, dt.now(),
                                      weight_q, weight_ind, size, category,
                                      price, img_src)