예제 #1
0
def looking_for_next_page(root, href_sets):
    """Collect candidate "next page" links from a parsed page.

    Args:
        root: Parsed document, passed unchanged to
            ``grouped_hrefs_from_page``.
        href_sets: Iterable of iterables of sequences. The last item of each
            sequence must expose ``.get("href")`` (presumably an ``<a>``
            element -- confirm against the caller).

    Returns:
        dict with two keys:

        * ``'urls_with_next_somehow'`` -- whatever
          ``grouped_hrefs_from_page`` returns for anchors whose ``id`` or
          ``class`` contains ``next``, or whose text contains ``Next`` or
          ``next``.
        * ``'urls_grouped_by_levei'`` -- the ``[element, href]`` pairs built
          from *href_sets*, grouped per set by
          ``grouping_list_by_levenstein`` keyed on the href, with falsy
          groups dropped and the result passed through
          ``reduce_list_nesting``.
    """
    # Adjacent literals concatenate to exactly the original expression,
    # including its trailing space.
    next_link_xpath = (
        "//a[contains(@id, 'next')] | "
        "//a[contains(@class, 'next')] | "
        "//a[contains(text(), 'Next')] | "
        "//a[contains(text(), 'next')] "
    )
    ret = {}
    ret['urls_with_next_somehow'] = grouped_hrefs_from_page(
        root, xpath=next_link_xpath)

    # Build real lists rather than relying on map()/filter(): on Python 3
    # those are lazy, and a lazy map object is always truthy, so the
    # "drop empty groups" step below would silently keep everything.
    pair_sets = [
        [[entry[-1], entry[-1].get("href")] for entry in href_set]
        for href_set in href_sets
    ]
    grouped = [
        grouping_list_by_levenstein(pairs, key=lambda pair: pair[1])
        for pairs in pair_sets
    ]
    grouped = [group for group in grouped if group]
    grouped = reduce_list_nesting(grouped)

    # Called only for its side effect of appending create_xpath() output to
    # each item it visits.
    # NOTE(review): presumably map_list_list applies the callback eagerly to
    # every inner [element, href] pair -- confirm; if it is lazy the append
    # never runs.
    map_list_list(lambda x: x.append(EInfo(x[0]).create_xpath()), grouped)
    ret['urls_grouped_by_levei'] = grouped
    return ret
예제 #2
0
파일: multiple.py 프로젝트: kittle/htmlwipe
 def xpath_for_categories(self):
     """Compute ``self.categories_xpath`` from the categories page.

     Fetches ``self.categories_url`` via ``wget_root``, takes the grouped
     hrefs reported by ``grouped_hrefs_from_page_sets``, reduces every
     entry to its last item, keeps only the groups for which
     ``is_grouped_hrefs_has_urls(group, self.products)`` is truthy, and
     stores one ``xpath_for_es(group, root)`` result per surviving group.

     Returns:
         None. The result is stored on ``self.categories_xpath`` as a list.
     """
     # Only the parsed root is needed; the first value returned by
     # wget_root is unused.
     _text, root = wget_root(self.categories_url)
     grouped_hrefs = grouped_hrefs_from_page_sets(root)
     grouped_hrefs = map_list_list(lambda x: x[-1], grouped_hrefs)
     # List comprehensions instead of filter()/map(): on Python 3 those
     # are lazy, which would leave a one-shot iterator on the instance.
     grouped_hrefs = [
         group for group in grouped_hrefs
         if is_grouped_hrefs_has_urls(group, self.products)
     ]
     self.categories_xpath = [xpath_for_es(es, root) for es in grouped_hrefs]
예제 #3
0
 def xpath_for_categories(self):
     """Derive the category XPaths and store them on the instance.

     Steps, in order:

     1. Download and parse ``self.categories_url`` with ``wget_root``.
     2. Ask ``grouped_hrefs_from_page_sets`` for the grouped hrefs of the
        parsed root and reduce every entry to its last item.
     3. Discard groups for which
        ``is_grouped_hrefs_has_urls(group, self.products)`` is falsy.
     4. Call ``xpath_for_es(group, root)`` for each remaining group.

     Returns:
         None. ``self.categories_xpath`` is set to the list from step 4.
     """
     # The first value returned by wget_root is not used here.
     _text, root = wget_root(self.categories_url)
     candidate_groups = map_list_list(
         lambda x: x[-1], grouped_hrefs_from_page_sets(root))
     # Materialise lists explicitly: filter()/map() are lazy on Python 3,
     # and a map object stored on self could be consumed only once.
     matching_groups = [
         group for group in candidate_groups
         if is_grouped_hrefs_has_urls(group, self.products)
     ]
     self.categories_xpath = [
         xpath_for_es(es, root) for es in matching_groups
     ]
예제 #4
0
def looking_for_next_page(root, href_sets):
    """Gather links that may lead to the next page of a listing.

    Args:
        root: Parsed document handed to ``grouped_hrefs_from_page``.
        href_sets: Iterable of iterables of sequences; the last item of
            every sequence must provide ``.get("href")`` (presumably a link
            element -- confirm against the caller).

    Returns:
        dict with:

        * ``'urls_with_next_somehow'`` -- the ``grouped_hrefs_from_page``
          result for anchors whose ``id``/``class`` contains ``next`` or
          whose text contains ``Next``/``next``.
        * ``'urls_grouped_by_levei'`` -- ``[element, href]`` pairs from
          *href_sets*, grouped set by set with
          ``grouping_list_by_levenstein`` (keyed on the href), stripped of
          falsy groups and passed through ``reduce_list_nesting``.
    """
    ret = {}
    # The XPath literal is kept exactly as before, trailing space included.
    ret['urls_with_next_somehow'] = grouped_hrefs_from_page(
        root,
        xpath="//a[contains(@id, 'next')] | //a[contains(@class, 'next')] | //a[contains(text(), 'Next')] | //a[contains(text(), 'next')] ",
    )

    # Explicit lists replace the nested map()/filter() calls. On Python 3
    # those return lazy objects that are always truthy, so filter(None, ...)
    # would no longer remove empty groups.
    element_href_pairs = []
    for href_set in href_sets:
        element_href_pairs.append(
            [[item[-1], item[-1].get("href")] for item in href_set])

    groups = []
    for pairs in element_href_pairs:
        group = grouping_list_by_levenstein(pairs, key=lambda y: y[1])
        if group:
            groups.append(group)

    groups = reduce_list_nesting(groups)

    # Invoked for its side effect only: append the create_xpath() result to
    # each visited item.
    # NOTE(review): assumes map_list_list eagerly visits every inner pair --
    # confirm against its definition.
    map_list_list(lambda x: x.append(EInfo(x[0]).create_xpath()), groups)
    ret['urls_grouped_by_levei'] = groups
    return ret