Пример #1
0
    def parse_product_list_page(self, html_page):
        """Read a product-list HTML page and register each available product.

        Every row of the ``tablaproductos`` table is inspected.  Rows carrying
        the 'PRODUCTOS NO DISPONIBLES' image are skipped.  For the remaining
        rows the id, name, price and unitary price/unit are extracted and a
        ``Product`` is handed to ``Shop.add_product``.

        :param html_page: HTML source of the product list page.
        """
        utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
        document = lxml.html.fromstring(html_page, parser=utf8_parser)

        for row in document.xpath("//table[@class='tablaproductos']/tbody/tr"):
            # Unavailable products are flagged with a dedicated image; skip them.
            if row.xpath("./td[1]/img[@alt='PRODUCTOS NO DISPONIBLES']"):
                continue

            # The id is whatever precedes the first ';' in the input value.
            raw_value = row.xpath("./td[4]/input/@value")[0]
            product_id = raw_value.partition(';')[0]

            # Drop the recommendation marker the shop appends to some names.
            label_text = row.xpath("./td[1]//label")[0].text
            product_name = label_text.replace(' ***LE RECOMENDAMOS***', '').encode('utf-8')

            # Price text: take the part before the first space and turn the
            # comma into a dot so float() accepts it.
            price_text = row.xpath("./td[2]/span")[0].text
            product_price = float(price_text.partition(' ')[0].replace(',', '.'))

            unit_price_nodes = row.xpath("./td[2]/span[contains(concat(' ', "
                                         "normalize-space(@class), ' '), ' precio_ud ')]")

            if not unit_price_nodes:
                # The page gives no unitary price: derive it from price and name.
                product_unitary_price, product_unit = self.get_unitary_price(product_price, product_name)
            else:
                # Text looks like '<unit>: <amount> ...'; split on the first ': '.
                product_unit, _, amount_text = unit_price_nodes[0].text.partition(': ')
                product_unitary_price = float(amount_text.partition(' ')[0].replace(',', '.'))

                # Per-product overrides that normalize shop "bugs".
                if product_id == '43401':
                    product_unit = '1 UNIDAD'
                    product_unitary_price = product_price
                elif product_id == '40805':
                    # The number of washes is taken from the product name.
                    washes = re.search(r'(\d+) LAVADOS', product_name).group(1)
                    product_unitary_price = round(round(product_price / float(washes), 4), 2)
                    product_unit = '1 LAVADO'

                product_unitary_price, product_unit = self.normalize_unitary_price(product_unitary_price, product_unit)

            new_product = Product(product_id, product_name, product_price, product_unitary_price, product_unit)
            Shop.add_product(self, new_product)
Пример #2
0
    def parse_product_list_page(self, html_page):
        """Parse the HTML page which has the product list and register its products.

        The rows of the ``conte`` table whose id starts with ``prod_`` or
        ``categ_`` are walked in document order:

        * ``categ_`` rows update the current category (taken from the row's
          ``<p>`` text with surrounding spaces and '>' stripped), except rows
          whose id ends in ``_2``, which are ignored.  Category rows never
          produce a product.
        * ``prod_`` rows that offer a 'Busca Sustituto' link are treated as
          unavailable and skipped.  For all other product rows a ``Product``
          is built and passed to ``Shop.add_product``.

        The current category starts as ``None`` so that a product row seen
        before any category row no longer fails with ``UnboundLocalError``.
        NOTE(review): this assumes ``get_unitary_price`` accepts ``None`` as
        the category -- confirm against its definition.

        :param html_page: HTML source of the product list page.
        """

        html_tree = lxml.html.fromstring(html_page, parser=lxml.html.HTMLParser(encoding='utf-8'))

        product_list = html_tree.xpath("//table[@id='conte']/form/tr[starts-with(@id, 'prod_') or "
                                       "starts-with(@id, 'categ_')]")

        # Common prefix of every per-product lookup; it does not change per row.
        base_xpath = "./td/table/tr/td/table/tr/td[3]/table"

        # Category of the most recent 'categ_' row; None until one is seen.
        product_category = None

        for product_item in product_list:
            row_id = product_item.attrib['id']

            # Category rows only update the running category.
            if row_id.startswith('categ_'):
                if not row_id.endswith('_2'):
                    product_category = product_item.xpath("./td/table/tr/td[2]/p")[0].text.strip(' >')
                continue

            # Check whether the product is available
            if product_item.xpath(base_xpath + "/tr[3]/td/table/tr/td[@class='sub_menu_11']/a/"
                                               "strong[text()='Busca Sustituto']"):
                continue

            # The product id is everything after the first '_' of the row id.
            product_id = row_id.partition('_')[2]

            name_cell = product_item.xpath(base_xpath + "/tr/td/table/tr/td[@class='menu_sup11']")[0]
            product_name = name_cell.text_content().strip()
            product_name = product_name.encode('utf-8')

            # Price text: keep the part before the first space and replace the
            # comma with a dot so float() accepts it.
            price_node = product_item.xpath(base_xpath +
                                            "/tr[3]/td/table/tr/td[@class='menu_12_rojo_sin']/strong")[0]
            product_price = float(price_node.text.partition(' ')[0].replace(',', '.'))

            # The following are fixings to normalize shop "bugs"
            if product_id == '900782_2058535':
                product_name = product_name.replace('3x80', '3x60')

            product_unitary_price, product_unit = self.get_unitary_price(product_price, product_name, product_category)

            Shop.add_product(self,
                             Product(product_id, product_name, product_price, product_unitary_price, product_unit))
Пример #3
0
    def parse_product_list_page(self, html_page):
        """Read a shopping-cart style product page and register each available product.

        Rows of the ``shopping-cart-table`` body that have no ``class``
        attribute are inspected.  Rows showing 'Producto no disponible' are
        skipped.  For the others the id, name, price and unitary price/unit
        are extracted and a ``Product`` is handed to ``Shop.add_product``.

        :param html_page: HTML source of the product list page.
        """
        utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
        document = lxml.html.fromstring(html_page, parser=utf8_parser)

        for row in document.xpath("//table[@id='shopping-cart-table']/tbody/tr[not(@class)]"):
            # Unavailable products carry an explicit notice in the third cell.
            if row.xpath("./td[3]/span[text()='Producto no disponible']"):
                continue

            # The id is a segment of the image URL.  The index is computed as
            # len - 2 (not -2) so a URL without any '/' yields the whole string.
            src_parts = row.xpath(".//div[@class='cart_product_img']//img/@src")[0].split('/')
            product_id = src_parts[len(src_parts) - 2]

            name_node = row.xpath(".//div[@class='cart_product_txt']/h3/a/span")[0]
            product_name = name_node.text.encode('utf-8')

            # Price text: strip, keep the part before the first space, and
            # replace the comma with a dot so float() accepts it.
            price_text = row.xpath(".//p[@class='ahora']/span")[0].text.strip()
            product_price = float(price_text.partition(' ')[0].replace(',', '.'))

            unit_price_nodes = row.xpath(".//div[contains(concat(' ', normalize-space(@class), "
                                         "' '), ' precio_kg ')]")

            if not unit_price_nodes:
                # The page gives no unitary price: derive it from price and name.
                product_unitary_price, product_unit = self.get_unitary_price(product_price, product_name)
            else:
                # Text looks like '(<amount> ... / <unit>)' once trimmed of
                # spaces and parentheses; split on the first ' / '.
                trimmed = unit_price_nodes[0].text.strip(' ()')
                amount_text, _, product_unit = trimmed.partition(' / ')
                product_unitary_price = float(amount_text.partition(' ')[0].replace(',', '.'))

                # Per-product override that normalizes a shop "bug".
                if product_id == '0201030800187':
                    product_unitary_price *= 2

                product_unitary_price, product_unit = self.normalize_unitary_price(product_unitary_price, product_unit)

            new_product = Product(product_id, product_name, product_price, product_unitary_price, product_unit)
            Shop.add_product(self, new_product)