def parse_product_list_page(self, html_page):
    """Register every available product found on a product-list page.

    The markup is parsed with lxml as UTF-8.  Each row of the
    ``tablaproductos`` table becomes a ``Product`` that is passed to
    ``Shop.add_product``.  Rows carrying the 'PRODUCTOS NO DISPONIBLES'
    image are skipped.

    When the row shows a unit price (``precio_ud`` span) that value is
    used, after the per-product corrections below, and run through
    ``self.normalize_unitary_price``.  Otherwise the unit price is
    obtained from ``self.get_unitary_price``.

    html_page -- markup of the product-list page.
    """
    utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
    document = lxml.html.fromstring(html_page, parser=utf8_parser)

    for row in document.xpath("//table[@class='tablaproductos']/tbody/tr"):
        # Unavailable products are flagged with a dedicated image.
        if row.xpath("./td[1]/img[@alt='PRODUCTOS NO DISPONIBLES']"):
            continue

        # The id is whatever precedes the first ';' in the input value.
        input_value = row.xpath("./td[4]/input/@value")[0]
        product_id = input_value.partition(';')[0]

        # Drop the shop's recommendation marker from the name.
        label = row.xpath("./td[1]//label")[0]
        product_name = label.text.replace(' ***LE RECOMENDAMOS***', '')
        product_name = product_name.encode('utf-8')

        # Price text uses a decimal comma and is followed by a space.
        price_text = row.xpath("./td[2]/span")[0].text
        product_price = float(price_text.partition(' ')[0].replace(',', '.'))

        unit_price_nodes = row.xpath(
            "./td[2]/span[contains(concat(' ', normalize-space(@class), ' '), "
            "' precio_ud ')]")

        if not unit_price_nodes:
            # No unit price on the page: let the helper work it out.
            unit_price, unit = self.get_unitary_price(product_price,
                                                      product_name)
        else:
            # Text is split as "<unit>: <amount> ...".
            unit, _, amount_text = unit_price_nodes[0].text.partition(': ')
            unit_price = float(amount_text.partition(' ')[0].replace(',', '.'))

            # Corrections for products whose unit price is wrong in the shop.
            if product_id == '43401':
                unit_price = product_price
                unit = '1 UNIDAD'
            elif product_id == '40805':
                washes = re.search(r'(\d+) LAVADOS', product_name).group(1)
                per_wash = round(product_price / float(washes), 4)
                unit_price = round(per_wash, 2)
                unit = '1 LAVADO'

            unit_price, unit = self.normalize_unitary_price(unit_price, unit)

        product = Product(product_id, product_name, product_price,
                          unit_price, unit)
        Shop.add_product(self, product)
def parse_product_list_page(self, html_page):
    """Register every available product found on a product-list page.

    The markup is parsed with lxml as UTF-8.  Rows of the ``conte`` table
    whose id starts with ``prod_`` or ``categ_`` are walked in document
    order.  Category rows only update the current category; product rows
    become a ``Product`` that is passed to ``Shop.add_product``.  Product
    rows offering 'Busca Sustituto' are skipped.

    The unit price always comes from ``self.get_unitary_price``, which
    also receives the current category.

    html_page -- markup of the product-list page.
    """
    # All product details hang off the same nested table.
    details = "./td/table/tr/td/table/tr/td[3]/table"
    substitute_xpath = (details + "/tr[3]/td/table/tr/td[@class='sub_menu_11']"
                                  "/a/strong[text()='Busca Sustituto']")
    name_xpath = details + "/tr/td/table/tr/td[@class='menu_sup11']"
    price_xpath = (details +
                   "/tr[3]/td/table/tr/td[@class='menu_12_rojo_sin']/strong")

    utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
    document = lxml.html.fromstring(html_page, parser=utf8_parser)
    rows = document.xpath(
        "//table[@id='conte']/form/tr[starts-with(@id, 'prod_') or "
        "starts-with(@id, 'categ_')]")

    # NOTE(review): product_category is only bound once a category row
    # whose id does not end in '_2' has been seen; a product row appearing
    # before that would raise UnboundLocalError below.
    for row in rows:
        row_id = row.attrib['id']

        if row_id.startswith('categ_'):
            # Category rows ending in '_2' leave the current category as is.
            if not row_id.endswith('_2'):
                heading = row.xpath("./td/table/tr/td[2]/p")[0]
                product_category = heading.text.strip(' >')
            continue

        # A "find substitute" link marks the product as unavailable.
        if row.xpath(substitute_xpath):
            continue

        # Everything after the first '_' of the row id is the product id.
        product_id = row_id.partition('_')[2]

        name_cell = row.xpath(name_xpath)[0]
        product_name = name_cell.text_content().strip().encode('utf-8')

        # Price text uses a decimal comma and is followed by a space.
        price_text = row.xpath(price_xpath)[0].text
        product_price = float(price_text.partition(' ')[0].replace(',', '.'))

        # Correction for a product whose name is wrong in the shop.
        if product_id == '900782_2058535':
            product_name = product_name.replace('3x80', '3x60')

        unit_price, unit = self.get_unitary_price(product_price, product_name,
                                                  product_category)
        product = Product(product_id, product_name, product_price,
                          unit_price, unit)
        Shop.add_product(self, product)
def parse_product_list_page(self, html_page):
    """Register every available product found on a product-list page.

    The markup is parsed with lxml as UTF-8.  Each class-less row of the
    ``shopping-cart-table`` table becomes a ``Product`` that is passed to
    ``Shop.add_product``.  Rows showing 'Producto no disponible' are
    skipped.

    When the row shows a unit price (``precio_kg`` div) that value is
    used, after the per-product correction below, and run through
    ``self.normalize_unitary_price``.  Otherwise the unit price is
    obtained from ``self.get_unitary_price``.

    html_page -- markup of the product-list page.
    """
    utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
    document = lxml.html.fromstring(html_page, parser=utf8_parser)
    rows = document.xpath(
        "//table[@id='shopping-cart-table']/tbody/tr[not(@class)]")

    for row in rows:
        # Unavailable products carry an explicit notice in the third cell.
        if row.xpath("./td[3]/span[text()='Producto no disponible']"):
            continue

        # The product id is taken from the image URL: the path segment at
        # position len - 2 (kept as written; a literal -2 index would fail
        # for a single-segment path where this form does not).
        image_src = row.xpath(".//div[@class='cart_product_img']//img/@src")[0]
        src_parts = image_src.split('/')
        product_id = src_parts[len(src_parts) - 2]

        name_node = row.xpath(".//div[@class='cart_product_txt']/h3/a/span")[0]
        product_name = name_node.text.encode('utf-8')

        # Price text uses a decimal comma and is followed by a space.
        price_text = row.xpath(".//p[@class='ahora']/span")[0].text.strip()
        product_price = float(price_text.partition(' ')[0].replace(',', '.'))

        unit_price_nodes = row.xpath(
            ".//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' precio_kg ')]")

        if not unit_price_nodes:
            # No unit price on the page: let the helper work it out.
            unit_price, unit = self.get_unitary_price(product_price,
                                                      product_name)
        else:
            # Text is split as "(<amount> ... / <unit>)".
            unit_text = unit_price_nodes[0].text.strip(' ()')
            amount_text, _, unit = unit_text.partition(' / ')
            unit_price = float(amount_text.partition(' ')[0].replace(',', '.'))

            # Correction for a product whose unit price is wrong in the shop.
            if product_id == '0201030800187':
                unit_price = unit_price * 2

            unit_price, unit = self.normalize_unitary_price(unit_price, unit)

        product = Product(product_id, product_name, product_price,
                          unit_price, unit)
        Shop.add_product(self, product)