def first_layer_processor(layer_description, exceptions_log_file):
    """Collect the unique second-layer category URLs for a first-layer page.

    Fetches ``layer_description["items_urls_and_descriptions"]`` relative to
    the module-level ``root_site_url``. The endpoint answers with a JSON
    envelope whose ``"d"`` key carries an HTML fragment (classic ASP.NET
    web-service shape); the ``href`` attributes of the nodes matched by
    ``layer_description["info_container_xpath"]`` are extracted and filtered.

    Parameters:
        layer_description: dict with at least the keys
            ``"items_urls_and_descriptions"`` (relative URL of the page) and
            ``"info_container_xpath"`` (XPath selecting the link nodes).
        exceptions_log_file: kept for interface compatibility; the active
            code path does not use it.

    Returns:
        list of unique hrefs containing ``"subcatid"`` (order unspecified);
        ``"#"`` placeholder links are dropped.
    """
    url = root_site_url + layer_description["items_urls_and_descriptions"]

    # The real HTML lives under the "d" key of the JSON response.
    content = get_content(url, headers_json, post_data={})
    html = json.loads(content)["d"]

    link_nodes = get_all_same_type_nodes(
        html, layer_description["info_container_xpath"])
    get_attrib = generate_attrib_getter("href")
    hrefs = node_list_processor(link_nodes, get_attrib)

    # Keep only category links (they carry a "subcatid" query parameter),
    # drop javascript "#" placeholders, and collapse duplicates via the set.
    next_layer_url_list = list(set(
        href for href in hrefs if "subcatid" in href and href != "#"))

    # Debug trace of the URLs that will be handed to the next layer.
    for next_layer_url in next_layer_url_list:
        print("\n" * 2 + "-" * 10)
        print(next_layer_url)

    return next_layer_url_list
# NOTE(review): this chunk begins mid-definition. The leading
# `if self._response ...` block is the tail of a method whose `def` line (and
# the origin of `content_type`) lies outside this view; tokens kept verbatim.
if self._response != None:
    # Dispatch on the requested representation of the stored HTTP response.
    # `self._response` looks like a requests.Response — TODO confirm.
    if content_type == 'text':
        return self._response.text
    elif content_type == 'binary':
        return self._response.content
    elif content_type == 'json':
        # NOTE(review): on a requests.Response, `.json` is a method, so this
        # returns the bound method object rather than the parsed body.
        # Probably meant `.json()` — confirm against callers before changing.
        return self._response.json
    elif content_type == 'raw':
        return self._response.raw
    else:
        # Any unrecognized content_type falls back to the decoded text body.
        return self._response.text
else:
    # Nothing has been fetched/stored yet.
    return None

# Module-level helpers for turning <a> link nodes into next-layer URLs.
get_href_attrib = generate_attrib_getter('href')
# Drops the first character of the href (presumably a leading '/' or similar
# relative-path prefix — TODO confirm against the scraped markup) before
# prefixing the module-level `base_url`.
next_layer_url_maker = lambda link_node: base_url + get_href_attrib(link_node)[1:]

def generate_second_layer_info_processor(second_layer_trade_name_xpath, second_layer_product_description_text_xpath, next_layer_link_xpath, third_layer_processor):
    # Factory: binds the XPath expressions and the third-layer callback into a
    # per-product processor closure.
    # NOTE(review): truncated — the rest of `second_layer_info_processor` and
    # the factory's return statement continue beyond the visible source.
    def second_layer_info_processor(product_description):
        # Product/trade name: first node matched by the bound name XPath.
        product_name = product_description.xpath(second_layer_trade_name_xpath)
        print product_name[0].text.encode('utf-8')
        #product_description_dict['Produkt'] = product_name[0].text.encode('utf-8')
        # The description text lives inside the first matched container node.
        product_description_text_container = product_description.xpath(second_layer_product_description_text_xpath)
        product_description_text_subcontainer = product_description_text_container[0]
        #print 'product_description_text_subcontainer: ', product_description_text_subcontainer
        child_list = product_description_text_subcontainer.getchildren()
        product_description_text = ''
        # Leaf element (no child elements): its own .text is the description.
        if child_list == []:
            product_description_text = product_description_text_subcontainer.text