import json
from StringIO import StringIO

from lxml import etree


def first_layer_processor(layer_description, exceptions_log_file):
    next_layer_url_list = list()
    url = root_site_url + layer_description["items_urls_and_descriptions"]
    """
    Earlier implementation that parsed the page HTML directly:

    html = get_html(url, user_agent_header, post_data={},
                    exceptions_log_file=exceptions_log_file)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    info_container = tree.xpath(layer_description['info_container_xpath'])
    if info_container != []:
        print 'first_layer_processor: info_container != [] ', info_container
        urls_container = info_container[0].xpath(layer_description['info_tag_xpath'])
        for url_container in urls_container:
            next_layer_url_list.append(url_container.attrib['href'])
        next_layer_url_list = list(set([next_layer_url
                                        for next_layer_url in next_layer_url_list
                                        if next_layer_url != '#']))
        for item in next_layer_url_list:
            print item
    else:
        print 'first_layer_processor: info_container == []: ', info_container
    """
    # The endpoint answers with JSON; the HTML fragment sits under the "d" key.
    content = get_content(url, headers_json, post_data={})
    json_dict = json.loads(content)
    html = json_dict["d"]
    all_same_type_nodes = get_all_same_type_nodes(html, layer_description["info_container_xpath"])
    # print all_same_type_nodes
    get_attrib = generate_attrib_getter("href")
    result_list = node_list_processor(all_same_type_nodes, get_attrib)
    # Keep only URLs that carry a "subcatid" query parameter.
    result_list = [item for item in result_list if item.find("subcatid") != -1]
    for result in result_list:
        print "\n" * 2, "-" * 10
        print result
        next_layer_url_list.append(result)
    # Deduplicate and drop placeholder "#" anchors.
    next_layer_url_list = list(set([next_layer_url
                                    for next_layer_url in next_layer_url_list
                                    if next_layer_url != "#"]))
    return next_layer_url_list
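# The helpers called above (and reused below) are defined elsewhere in the
# script; get_content and get_html are plain fetch helpers and are not
# sketched here. The bodies below are an assumption for illustration,
# mirroring the parsing shown in the commented-out block above: the names
# and call signatures match the usage, but the implementations are not the
# original code.
from urlparse import urljoin


def get_all_same_type_nodes(html, container_xpath):
    # Parse an HTML string and return every node matching the XPath.
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    return tree.xpath(container_xpath)


def generate_attrib_getter(attrib_name):
    # Build a closure that reads one attribute from an lxml element.
    def get_attrib(node):
        return node.attrib.get(attrib_name, '')
    return get_attrib


def node_list_processor(node_list, node_processor):
    # Apply a per-node processor to each node and collect the results.
    return [node_processor(node) for node in node_list]


def next_layer_url_maker(node):
    # Turn an <a> node's (possibly relative) href into an absolute URL;
    # using root_site_url as the base is an assumption.
    return urljoin(root_site_url, node.attrib.get('href', ''))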
def second_layer_info_processor(product_description):
    product_name = product_description.xpath(second_layer_trade_name_xpath)
    print product_name[0].text.encode('utf-8')
    # product_description_dict['Produkt'] = product_name[0].text.encode('utf-8')
    product_description_text_container = product_description.xpath(second_layer_product_description_text_xpath)
    product_description_text_subcontainer = product_description_text_container[0]
    # print 'product_description_text_subcontainer: ', product_description_text_subcontainer
    child_list = product_description_text_subcontainer.getchildren()
    product_description_text = ''
    if child_list == []:
        # No child tags: the description is the node's own text.
        product_description_text = product_description_text_subcontainer.text
    else:
        # With child tags, the description is the tail text after the first child.
        product_description_text = child_list[0].tail
    # Drop the leading "name, " prefix so only form and strength remain;
    # the truthiness check also guards against a None .text or .tail.
    if product_description_text and product_description_text.find(', ') != -1:
        product_description_text = product_description_text[product_description_text.find(', ') + 2:]
    print 'product_description_text: ', product_description_text
    # product_description_dict['Beredningsform Styrka'] = product_description_text.encode('utf-8')
    product_links_nodes = product_description.xpath(second_layer_product_link_xpath)
    product_links = node_list_processor(product_links_nodes, next_layer_url_maker)
    for link in product_links:
        print link
        print third_layer_processor(link)
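# HTTProcessor and generate_second_layer_info_processor are used by the
# driver below but defined elsewhere. Minimal sketches follow, assuming
# HTTProcessor is a thin session wrapper (the warm-up request on site_url
# below suggests cookies persist between calls) and that the factory simply
# closes over the XPaths and the third-layer processor. The description-text
# handling is elided here; it would mirror second_layer_info_processor above.
import requests


class HTTProcessor(object):
    def __init__(self, headers):
        # One session for the whole crawl so cookies persist between requests.
        self.session = requests.Session()
        self.session.headers.update(headers)

    def send_request(self, url, response_type):
        response = self.session.get(url)
        return response.text if response_type == 'text' else response.content


def generate_second_layer_info_processor(trade_name_xpath,
                                         description_text_xpath,
                                         product_link_xpath,
                                         third_layer_processor):
    def second_layer_info_processor(product_description):
        product_name = product_description.xpath(trade_name_xpath)
        print product_name[0].text.encode('utf-8')
        product_links_nodes = product_description.xpath(product_link_xpath)
        product_links = node_list_processor(product_links_nodes, next_layer_url_maker)
        for link in product_links:
            print link
            print third_layer_processor(link)
    return second_layer_info_processor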
result_data = list()

first_layer_info_container_xpath = '//div[@class="abcpanel"]/div/ul/li/a'
second_layer_info_container_xpath = '//li[@class="tradeNameList"]'
second_layer_trade_name_xpath = 'a/span[@class="innerlabel"]'
second_layer_product_link_xpath = 'ul/li[@class="linkList"]/a'
second_layer_product_description_text_xpath = 'ul/li[@class="linkList"]/a/div/span[@class="innerlabel"]'
third_layer_info_container_xpath_1 = '//div[@class="list-box substance"]/ul/li/a/span'
third_layer_info_container_xpath_2 = '//div[@class="list-box substance"]/span'

http_processor = HTTProcessor(headers=headers)
# The first request primes the session; only the second response is used.
content = http_processor.send_request(site_url, 'text')
content = http_processor.send_request(first_layer_url, 'text')

second_layer_link_nodes = get_all_same_type_nodes(content, first_layer_info_container_xpath)
second_layer_urls = node_list_processor(second_layer_link_nodes, next_layer_url_maker)

# Build the processor chain from the innermost layer outwards.
third_layer_info_processor = generate_third_layer_info_processor(third_layer_info_container_xpath_1,
                                                                 third_layer_info_container_xpath_2)
third_layer_processor = generate_third_layer_processor(http_processor, third_layer_info_processor)
second_layer_info_processor = generate_second_layer_info_processor(second_layer_trade_name_xpath,
                                                                   second_layer_product_description_text_xpath,
                                                                   second_layer_product_link_xpath,
                                                                   third_layer_processor)

# Process only the first three second-layer URLs for now.
for second_layer_url in second_layer_urls[:3]:
    print second_layer_url
    content = http_processor.send_request(second_layer_url, 'text')
    second_layer_info_containing_nodes = get_all_same_type_nodes(content, second_layer_info_container_xpath)
    node_list_processor(second_layer_info_containing_nodes, second_layer_info_processor)
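# For reference: the third-layer factories called above are also defined
# elsewhere. A sketch under the assumption that the two XPaths cover two page
# variants (a substance list and a single span) and that the first match wins;
# the bodies are illustrative, not the original code.
def generate_third_layer_info_processor(info_container_xpath_1, info_container_xpath_2):
    def third_layer_info_processor(html):
        # Try the list variant first, fall back to the single-span variant.
        nodes = get_all_same_type_nodes(html, info_container_xpath_1)
        if nodes == []:
            nodes = get_all_same_type_nodes(html, info_container_xpath_2)
        return [node.text for node in nodes]
    return third_layer_info_processor


def generate_third_layer_processor(http_processor, third_layer_info_processor):
    def third_layer_processor(url):
        # Fetch the product page and extract its substance information.
        html = http_processor.send_request(url, 'text')
        return third_layer_info_processor(html)
    return third_layer_processor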