예제 #1
0
    def parse_box(self, response):
        """Build and yield one box item from a Tarra-pack product page.

        The item is filled from three places on the page: the ``<h1>`` title
        text, the first paragraph of the product description block, and the
        tier-price list. The price handler is constructed with the item and
        its return values are not stored here.
        NOTE(review): presumably ``PriceHandler2`` writes the prices onto the
        item itself -- confirm against its implementation.
        """
        description_xpath = (
            '//*[@class="product attribute description"]//p[1]/text()')

        item = ScrapingboxesItem()
        updater = ItemUpdaterTarra(item=item, measured_in='mm')

        # Fields derived from the page title.
        title_text = response.xpath('//h1/span/text()')
        updater.update_item(
            'description',
            'all_inner_dimensions',
            'tags',
            'standard_size',
            'product_type',
            'color',
            text_element=title_text,
        )

        # The description paragraph is handed over twice: as selectors to the
        # table handler and as plain strings to the row analysis.
        handler = TableHandlerTarra(
            text_elements=response.xpath(description_xpath))
        description_lines = response.xpath(description_xpath).getall()
        updater.analyse_table_rows(string_list=description_lines,
                                   table_handler=handler)

        # Tier prices, using the "Excl. BTW" labelled price of each tier.
        prices = PriceHandler2(item=item)
        tiers = response.xpath('//li[@class="item"]')
        tier_prices = response.xpath(
            '//li[@class="item"]//span[@data-label="Excl. BTW"]')
        prices.create_price_table(tier_elements=tiers,
                                  price_elements=tier_prices)
        prices.get_base_price_from_price_table()

        item['url'] = response.request.url
        item['company'] = 'Tarra-pack'

        yield item
예제 #2
0
    def parse_box(self, response):
        """Build and yield one box item from a Verpakkingsindustrie Veenendaal page.

        Fills stock status, minimum purchase, base price, tier price table,
        description-derived fields, specification-table fields and the product
        image URL.

        ``in_stock`` is set to ``None`` when the stock image is not present on
        the page, instead of failing the whole page.
        """
        box = ScrapingboxesItem()

        # Stock status is read from the alt text of an image inside the
        # add-to-cart form. `.get()` returns None when nothing matches, so
        # guard before lower-casing.
        in_stock_alt = response.xpath(
            '//*[@id="product_addtocart_form"]/div[3]/img/@alt').get()
        if in_stock_alt is None:
            box['in_stock'] = None
        else:
            box['in_stock'] = find_in_stock(in_stock_alt.lower())

        # retrieve prices and bundle size
        box_data = ItemUpdaterViv(item=box, measured_in="mm")

        box_data.update_item('minimum_purchase',
                             text_element=response.xpath(
                                 '//*[@class="product-shop"]/text()[4]'))

        price_handler = PriceHandlerViv(box)

        # The scraped price is scaled by 100/121 and rounded to 2 decimals.
        # NOTE(review): presumably this strips 21% VAT from a price that is
        # listed including tax -- confirm.
        box['price_ex_BTW'] = round((price_handler.create_base_price_manually(
            response.xpath(
                '//div[@class="product-shop"]//*[@class="per-one"]//*[@class="price-excluding-tax"]/*[@class="price"]'
            )) / 121) * 100, 2)

        box['price_table'] = price_handler.create_price_table(
            tier_elements=response.xpath(
                "//*[@class='tier-prices product-pricing']/li"),
            price_elements=response.xpath(
                "//*[@class='tier-prices product-pricing']/li/span[1]"))

        box['url'] = response.request.url

        # update item with product description
        box_data.update_item(
            "tags",
            "description",
            "wall_thickness",
            "standard_size",
            'product_type',
            'bottles',
            description_element=response.xpath('//*[@id="product-name"]/h1'))

        # analyse specs table
        indices_object = TableHandlerViv(
            header_elements=response.xpath('//tbody/tr/th'))

        box_data.analyse_table_rows(
            table_handler=indices_object,
            row_elements=response.xpath('//tbody/tr/td'))

        # indices_dict for testing purposes; stored as a 3-tuple
        box['indices_dict'] = (
            indices_object.indices_dict,
            indices_object.multiple_inner_dimensions_words,
            indices_object.column_names,
        )
        box['company'] = 'Verpakkingsindustrie Veenendaal'

        # Product image
        box['image_urls'] = [
            response.xpath('//p[@class="product-image"]/a/img/@src').get()
        ]

        yield box
예제 #3
0
    def parse_box(self, response):
        """Build and yield one box item from a PacoVerpakkingen product page.

        Fields come from the product title, the two-column feature table
        (first column as headers, second column as values) and the price and
        discount tables. The price handler is created with the item's
        ``minimum_purchase`` as multiplier, so that field must have been set
        by the preceding steps.
        """
        item = ScrapingboxesItem()
        feature_handler = TableHandlerPaco(
            header_elements=response.xpath(
                '//table[@class="featurestable"]//td[1]'))
        updater = ItemUpdaterPaco(item=item, measured_in="cm")

        # Fields derived from the product title.
        title = response.xpath('//h1[@itemprop="name"]')
        updater.update_item('description',
                            'tags',
                            'standard_size',
                            'product_type',
                            description_element=title)

        # Fields derived from the feature table values.
        feature_values = response.xpath(
            '//table[@class="featurestable"]//td[2]')
        updater.analyse_table_rows(row_elements=feature_values,
                                   table_handler=feature_handler)

        prices = PriceHandlerPaco(item=item,
                                  price_multiplier=item['minimum_purchase'])

        base_price_element = response.xpath(
            '//div[@class="product-prices"]//*[@itemprop="price"]')
        item['price_ex_BTW'] = prices.create_base_price_manually(
            price_element=base_price_element)

        # Discount table: skip the header row, column 1 = tier, column 2 = price.
        discount_tiers = response.xpath(
            '//table[@class="table-product-discounts"]//tr[position() >1]/td[1]')
        discount_prices = response.xpath(
            '//table[@class="table-product-discounts"]//tr[position() >1]/td[2]')
        item['price_table'] = prices.create_price_table(
            tier_elements=discount_tiers,
            price_elements=discount_prices)

        item['url'] = response.request.url
        item['company'] = 'PacoVerpakkingen'

        yield item
예제 #4
0
    def parse(self, response):
        """Yield one item per product link, carrying only the link's text.

        The ``description`` field is the first text node of each element with
        class ``product-item-link`` (``None`` when the element has no text).
        """
        for product_link in response.xpath('//*[@class="product-item-link"]'):
            item = ScrapingboxesItem()
            item['description'] = product_link.xpath('./text()').get()
            yield item
예제 #5
0
    def parse_category(self, response):
        """Yield one box item per row of a Dozen.nl category table.

        Pages without a ``from-price`` element are skipped entirely. For each
        table row the item is filled from the row cells, the tier-price data
        attribute, the category page title and the alt text in the first
        cell. The product URL is derived from the category URL and the box
        dimensions; it is the literal string ``"error"`` when the item has
        neither ``inner_dim3`` nor ``inner_variable_dimension_MIN``.
        """
        # check if page has products or needs to be skipped
        if not response.xpath("//*[@class='from-price']").get():
            return

        product_rows = response.xpath(
            "//*[@class='table products-view']/tbody/tr")

        for row_number, row in enumerate(product_rows):
            # Row limit that only applies while testing.
            if row_number > TestSettings.MAX_ROWS and TESTING:
                break

            item = ScrapingboxesItem()
            updater = ItemUpdaterDozen(item=item, measured_in="mm")
            header_handler = TableHandlerDozen(
                header_elements=response.xpath("//thead/tr/th"))

            # Fields derived from the row cells.
            updater.analyse_table_rows(table_handler=header_handler,
                                       row_elements=row.xpath(".//td"))

            tier_data = row.xpath(
                ".//*[@id='tierprices']/@data-content").get()
            item['price_table'] = create_price_table_dozenNL(string=tier_data)

            # The base price is the price stored under the first key of the
            # price table.
            price_table = item['price_table']
            first_tier = list(item['price_table'])[0]
            item['price_ex_BTW'] = price_table[first_tier]

            # Fields derived from the category page title.
            page_title = response.xpath(
                "//*[@class='page-title category-title']/h1")
            updater.update_item('description',
                                'tags',
                                'standard_size',
                                'product_type',
                                description_element=page_title)

            # Extra info found in the image alt attribute of the first cell.
            updater.update_item("color",
                                "tags",
                                text_element=row.xpath('.//td[1]//@alt'))

            # Build the product URL by replacing the "show all" suffix, e.g.
            # https://www.dozen.nl/gekleurde-dozen/gekleurde-vouwdozen/breedte/155/hoogte/80/lengte/210.html
            if 'inner_dim3' in item:
                url_tail = f"/breedte/{int(item['inner_dim2'])}/hoogte/{int(item['inner_dim3'])}/lengte/{int(item['inner_dim1'])}.html"
            elif 'inner_variable_dimension_MIN' in item:
                url_tail = f"/breedte/{int(item['inner_dim2'])}/lengte/{int(item['inner_dim1'])}.html"
            else:
                url_tail = None

            if url_tail is None:
                item['url'] = "error"
            else:
                item['url'] = response.request.url.replace(
                    "/show/all.html", url_tail)

            item['company'] = "Dozen.nl"
            yield item
예제 #6
0
    def parse_box(self, response):
        """Build and yield one box item from a Europresto product page.

        Fields come from the ``specifics`` list, the product title and the
        bulk-price list. ``price_table`` is an empty dict when either the
        bulk prices or the bulk tiers are missing from the page.
        """
        bulk_price_xpath = "//*[@class='bulk']/li/*[@class='price']"
        bulk_tier_xpath = "//*[@class='bulk']/li/*[@class='from']"

        item = ScrapingboxesItem()
        updater = ItemUpdaterEuropresto(item=item, measured_in="mm")

        # Fields derived from the specifications list.
        spec_handler = TableHandlerEuropresto(
            header_elements=response.xpath("//*[@class='specifics']/li"))
        spec_values = response.xpath("//*[@class='specifics']/li/span")
        updater.analyse_table_rows(row_elements=spec_values,
                                   table_handler=spec_handler)

        # Fields derived from the main title.
        title = response.xpath("//*[@class='product-description']/*/h1")
        updater.update_item(
            'description',
            'tags',
            'all_inner_dimensions',
            'standard_size',
            'wall_thickness',
            'product_type',
            'color',
            description_element=title,
        )

        prices = PriceHandler()

        # Only build a price table when both halves of it are on the page.
        found_prices = response.xpath(bulk_price_xpath)
        found_tiers = response.xpath(bulk_tier_xpath)
        if found_prices and found_tiers:
            item['price_table'] = prices.create_price_table(
                price_elements=response.xpath(bulk_price_xpath),
                tier_elements=response.xpath(bulk_tier_xpath))
        else:
            item['price_table'] = {}

        base_price_element = response.xpath(
            "//*[@class='product-price']//*[@class='euro']")
        item['price_ex_BTW'] = prices.create_base_price_manually(
            price_element=base_price_element)

        item["url"] = response.request.url
        item["company"] = "Europresto"

        # Kept for testing; stored as a 2-tuple.
        item["indices_dict"] = (spec_handler.indices_dict,
                                spec_handler.column_names)
        yield item
예제 #7
0
    def parse_box(self, response):
        """Yield one box item per body row of every table on a Tupak page.

        Each item is filled from the page ``<h1>``, the row cells and the
        row's price cells. The item URL is the row's own link when it has
        one, otherwise the URL of the page being parsed.
        """
        for product_table in response.xpath('//table'):
            for row in product_table.xpath('tbody/tr'):
                item = ScrapingboxesItem()
                header_handler = TableHandlerTupak(
                    header_elements=product_table.xpath(
                        'thead/tr[@class="rij-2"][2]/th'))
                updater = ItemUpdater2(item=item, measured_in="mm")

                # Fields derived from the product description heading.
                heading = response.xpath('//h1')
                updater.update_item("tags",
                                    "color",
                                    "wall_thickness",
                                    "description",
                                    'product_type',
                                    description_element=heading)

                # Fields derived from the row cells.
                updater.analyse_table_rows(table_handler=header_handler,
                                           row_elements=row.xpath('td'))

                # Product URL: prefer the row's own (relative) link.
                href = row.xpath('./td/a/@href').get()
                if not href:
                    item['url'] = response.request.url
                else:
                    item["url"] = 'https://www.tupak.com' + href

                # Tier headers come from the table head, prices from the row.
                prices = PriceHandler2(item=item)
                tier_headers = product_table.xpath(
                    './/tr[@class="rij-2"][2]/th[@class="staffel"]')
                row_prices = row.xpath('./td[contains(@class, "prijs")]')
                item["price_table"] = prices.create_price_table(
                    tier_elements=tier_headers,
                    price_elements=row_prices)

                item['price_ex_BTW'] = prices.get_base_price_from_price_table()

                # Fields set manually.
                item["company"] = 'Tupak'
                item["in_stock"] = None

                # Kept for testing.
                item["indices_dict"] = header_handler.indices_dict

                yield item
예제 #8
0
    def parse(self, response):
        """Follow every category link, passing along a pre-filled item.

        For each anchor inside the ``Shop01catOuterWrapper`` element a new
        item is created, its ``product_type`` is derived from the anchor's
        ``title`` attribute, and the category page is requested with
        ``?page=1&perPage=300`` appended. The item travels to
        ``parse_category`` in ``meta['item']``.

        Anchors without an ``href`` attribute are skipped.
        """
        category_elements = response.xpath(
            '//*[@class="Shop01catOuterWrapper"]//a')
        for element in category_elements:
            link = element.xpath('@href').get()
            if link is None:
                # `.get()` returns None for an anchor without href; appending
                # the query string to it would raise TypeError and abort the
                # remaining categories, so skip this anchor instead.
                continue

            box = ScrapingboxesItem()
            box_data = ItemUpdater2(box, measured_in='cm')
            box_data.update_item('product_type',
                                 text_element=element.xpath('@title'))

            # Query string appended to every category link.
            # NOTE(review): presumably requests a page size of 300 so a
            # category fits on one page -- confirm.
            link += "?page=1&perPage=300"
            yield response.follow(link,
                                  self.parse_category,
                                  meta={'item': box})
예제 #9
0
    def parse_box(self, response):
        """Scaffold callback that yields raw probe dicts instead of box items.

        Yields one ``{'test_field', 'url'}`` dict per text node matched by
        the header XPath, followed by a final dict whose ``test_field`` is
        ``None``. The item, updater and price handler are constructed but not
        used further, and the remaining element variables are ``None``
        placeholders.
        """
        item = ScrapingboxesItem()
        # A table handler (TableHandlerTest) is not created yet.
        updater = ItemUpdaterTest(item=item, measured_in="mm")
        prices = PriceHandler(price_multiplier=None)

        # Product description probe: not wired up yet.

        # Table header probe: emit every matched text node.
        yield from (
            {'test_field': header_text, 'url': response.request.url}
            for header_text in response.xpath('/text()').getall()
        )

        # Placeholders for the selectors that still have to be filled in.
        table_row_elements = price_element = None
        price_tier_elements = price_tierprice_elements = None

        yield {'test_field': None, 'url': response.request.url}
예제 #10
0
    def parse_box(self, response):
        """Build and yield one box item from a Vermeij product page.

        Fields come from the product title, the ``extraspecs-row`` table and
        the price section. Prices are quoted either per box ("doos") or per
        piece ("stuk"); for per-box prices the handler is created with the
        item's ``minimum_purchase`` as multiplier, so that field must have
        been set by the preceding steps.

        Raises:
            ValueError: if no price-unit text is found on the page, or the
                text found mentions neither "doos" nor "stuk".
        """
        box = ScrapingboxesItem()
        box_data = ItemUpdaterVermeij(item=box, measured_in="cm")

        product_description_element = response.xpath(
            '//div[@class="mobile-title-nr"]/h1[@itemprop="name"]/text()')

        box_data.update_item(
            'description', 'all_inner_dimensions', 'tags', 'box_type', 'standard_size',
            text_element=product_description_element
        )

        table_handler = TableHandlerVermeij(
            header_elements=response.xpath('//*[@class="extraspecs-row"]//td[1]')
        )

        box_data.analyse_table_rows(
            row_elements=response.xpath('//*[@class="extraspecs-row"]//tr'),
            table_handler=table_handler
        )

        # create PriceHandler, check if prices are per piece or per box.
        # The unit is looked up in the detail price first, then in the header
        # of the tier-discount table.
        box_or_piece_text = response.xpath('//*[@class="Shop01DetailPrijs"]/span[1]/text()').get()
        other_box_or_piece_text = response.xpath('//table[@class="staffelkortingen"]//tr[1]/th[4]').get()

        if box_or_piece_text:
            if 'doos' in box_or_piece_text:
                print('per doos', response.request.url)
                price_handler = PriceHandler2(box, price_multiplier=box['minimum_purchase'])
            elif 'stuk' in box_or_piece_text:
                print('per stuk', response.request.url)
                price_handler = PriceHandler2(box)
            else:
                raise ValueError("No pricehandler, there is something wrong with the box_or_piece_text")

        elif other_box_or_piece_text:
            if 'doos' in other_box_or_piece_text:
                price_handler = PriceHandler2(box, price_multiplier=box['minimum_purchase'])
            elif 'stuk' in other_box_or_piece_text:
                price_handler = PriceHandler2(box)
            else:
                raise ValueError("No pricehandler, there is something wrong with the box_or_piece_text")

        else:
            # Without this branch price_handler stays unbound and the code
            # below fails with an UnboundLocalError that hides the real cause.
            raise ValueError("No pricehandler, no box_or_piece_text was found on the page")

        # create price table; fall back to the single detail price when the
        # page has no tier-discount rows
        tier_elements = response.xpath('//table[@class="staffelkortingen"]//tr[position() >1]/th')
        if tier_elements:
            price_handler.create_price_table(
                tier_elements=tier_elements,
                price_elements=response.xpath('//table[@class="staffelkortingen"]//tr[position() >1]/td[3]')
            )
            price_handler.get_base_price_from_price_table()

        else:
            price_handler.create_base_price_manually(
                price_element=response.xpath('//*[@class="Shop01DetailPrijs"]/span[2]')
            )

        box['url'] = response.request.url
        box['company'] = 'Vermeij'

        yield box
예제 #11
0
    def parse_box_table(self, response):
        # iterate over box rows
        boxes_rows = response.xpath('//*[@id="tbody_1"]/tr')
        for idx, row in enumerate(boxes_rows):

            # initialize Item and data object
            box = ScrapingboxesItem()
            table_handler = TableHandlerRajapack(
                header_elements=response.xpath('//thead[@id="thead_1"]/tr/th')
            )

            # crate box data updater
            box_data = ItemUpdater2(item=box, measured_in=table_handler.get_measurement_unit())

            # create data from product description
            box_data.update_item(
                "tags",
                "wall_thickness",  # todo driedubbelgolf wordt niet gepakt, palletdozen
                "description",
                "color",
                'product_type',
                description_element=response.xpath('//*[@test-ihm="ProductName"]')
            )

            # iterate over row indices and update box
            box_data.analyse_table_rows(
                table_handler=table_handler,
                row_elements=row.xpath("./td")
            )

            # create product url
            base_url = response.request.url.split("_")[0]
            product_code = row.xpath('.//*[@class="tooltip-img"]/text()').get()
            try:
                product_url = base_url + "_sku" + product_code + ".html"
                box["url"] = product_url
            except TypeError:
                raise TypeError(base_url, product_code, box)

            # use PriceHandler
            price_handler = PriceHandlerRajapack(item=box,
                                                 # price_multiplier=box['minimum_purchase']
                                                 )

            box["price_table"] = price_handler.create_price_table(
                tier_elements=response.xpath('//*[@id="thead_1"]/tr[2]/th'),
                price_elements=row.xpath('./td[contains(@class, "nobdr")]')
            )

            # HANDLE 'Prijs per doos/pak'
            per_text = response.xpath('//th[contains(@class, "promo")]/b[1]/text()').get()
            multiplier = box.get('minimum_purchase', 1)

            if 'pak' in per_text:
                new_price_table = {}
                for key, value in box['price_table'].items():
                    new_price_table[ key * multiplier ] = round(value / multiplier, 2)
                box['price_table'] = new_price_table
                price_handler.price_table = new_price_table

            box['price_ex_BTW'] = price_handler.get_base_price_from_price_table()

            # add item fields manually
            box["company"] = "Rajapack"
            box["in_stock"] = None
            box['indices_dict'] = table_handler.indices_dict