def first_layer_processor(layer_description, exceptions_log_file):
    next_layer_url_list = list()
    url = root_site_url + layer_description["items_urls_and_descriptions"]

    """
    html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    info_container = tree.xpath(layer_description['info_container_xpath'])
    if info_container != []:
        print 'first_layer_processor: info_container != [] ', info_container 
        urls_container = info_container[0].xpath(layer_description['info_tag_xpath'])
        for url_container in urls_container:
            next_layer_url_list.append(url_container.attrib['href'])
        next_layer_url_list = list(set([next_layer_url for next_layer_url in next_layer_url_list if next_layer_url != '#']))
        for item in next_layer_url_list:
            print item
    else:
        print 'first_layer_processor: info_container == []: ', info_container 

    """
    content = get_content(url, headers_json, post_data={})

    json_dict = json.loads(content)
    html = json_dict["d"]

    all_same_type_nodes = get_all_same_type_nodes(html, layer_description["info_container_xpath"])

    # print all_same_type_nodes

    get_attrib = generate_attrib_getter("href")
    result_list = node_list_processor(all_same_type_nodes, get_attrib)
    result_list = [item for item in result_list if item.find("subcatid") != -1]
    for result in result_list:
        print "\n" * 2, "-" * 10
        print result
        next_layer_url_list.append(result)

    next_layer_url_list = list(set([next_layer_url for next_layer_url in next_layer_url_list if next_layer_url != "#"]))
    return next_layer_url_list
示例#2
0
    def second_layer_info_processor(product_description):
        product_name = product_description.xpath(second_layer_trade_name_xpath)
        print product_name[0].text.encode('utf-8')
        #product_description_dict['Produkt'] = product_name[0].text.encode('utf-8')
        product_description_text_container = product_description.xpath(second_layer_product_description_text_xpath)
        product_description_text_subcontainer = product_description_text_container[0]
        #print 'product_description_text_subcontainer: ', product_description_text_subcontainer
        child_list = product_description_text_subcontainer.getchildren()
        product_description_text = ''
        if child_list == []:
            product_description_text = product_description_text_subcontainer.text
        else:
            product_description_text = child_list[0].tail

        if product_description_text != '' and product_description_text.find(', ') != -1:
            product_description_text = product_description_text[product_description_text.find(', ') + 2:]

        print 'product_description_text: ', product_description_text
        #product_description_dict['Beredningsform Styrka'] = product_description_text.encode('utf-8')
        product_links_nodes = product_description.xpath(next_layer_link_xpath)
        product_links = node_list_processor(product_links_nodes, next_layer_url_maker)
        for link in product_links:
            print link
            print third_layer_processor(link)
示例#3
0
    
    # Accumulator for scraped results; nothing is appended to it in the lines
    # visible here.
    result_data = list()

    
    # XPath expressions for the three crawl layers.
    # First layer: <a> elements under div.abcpanel/div/ul/li; they are turned
    # into second-layer URLs below.
    first_layer_info_container_xpath = '//div[@class="abcpanel"]/div/ul/li/a'
    # Second layer: every <li class="tradeNameList"> node of a second-layer page.
    second_layer_info_container_xpath = '//li[@class="tradeNameList"]'
    # The next three expressions are relative (no leading '//'); presumably
    # they are evaluated against each tradeNameList node - TODO confirm in
    # generate_second_layer_info_processor.
    second_layer_trade_name_xpath = 'a/span[@class="innerlabel"]'
    second_layer_product_link_xpath = 'ul/li[@class="linkList"]/a'
    second_layer_product_description_text_xpath = 'ul/li[@class="linkList"]/a/div/span[@class="innerlabel"]'
    # Third layer: two alternative locations inside div.list-box.substance,
    # both handed to generate_third_layer_info_processor.
    third_layer_info_container_xpath_1 = '//div[@class="list-box substance"]/ul/li/a/span'
    third_layer_info_container_xpath_2 = '//div[@class="list-box substance"]/span'
    

    # One HTTP helper, configured with the shared headers, is reused for every
    # request below.
    http_processor = HTTProcessor(headers=headers)

    # NOTE(review): this response is overwritten by the next request before it
    # is read; presumably the call is made only for its side effects (e.g.
    # session cookies) - confirm.
    content = http_processor.send_request(site_url, 'text')

    # Fetch the first-layer page and derive the second-layer URLs from its links.
    content = http_processor.send_request(first_layer_url, 'text')
    second_layer_link_nodes = get_all_same_type_nodes(content, first_layer_info_container_xpath)
    second_layer_urls = node_list_processor(second_layer_link_nodes, next_layer_url_maker)

    # Build the processors from the innermost layer outwards: the third-layer
    # info processor feeds the third-layer processor, which in turn is passed
    # to the second-layer info processor.
    third_layer_info_processor = generate_third_layer_info_processor(third_layer_info_container_xpath_1, third_layer_info_container_xpath_2)
    third_layer_processor = generate_third_layer_processor(http_processor, third_layer_info_processor)

    second_layer_info_processor = generate_second_layer_info_processor(second_layer_trade_name_xpath, second_layer_product_description_text_xpath, second_layer_product_link_xpath, third_layer_processor)
    # NOTE(review): only the first three second-layer URLs are processed
    # ([:3]); this looks like a debugging limit - confirm before a full run.
    for second_layer_url in second_layer_urls[:3]:
        print second_layer_url
        content = http_processor.send_request(second_layer_url, 'text')
        second_layer_info_containing_nodes = get_all_same_type_nodes(content, second_layer_info_container_xpath)
        # The return value of node_list_processor is discarded here.
        node_list_processor(second_layer_info_containing_nodes, second_layer_info_processor)