Exemplo n.º 1
0
def looking_for_next_page(root, href_sets):
    """Collect candidate "next page" links for a parsed page.

    Args:
        root: Document root, passed straight through to
            ``grouped_hrefs_from_page``.
        href_sets: Iterable of groups. Each item of a group must be an
            indexable sequence whose last entry offers ``.get("href")``
            (presumably an lxml element -- TODO confirm against callers).

    Returns:
        dict with two keys:

        * ``'urls_with_next_somehow'`` -- the result of
          ``grouped_hrefs_from_page`` for anchors whose ``id`` or ``class``
          contains ``next`` or whose text contains ``Next``/``next``.
        * ``'urls_grouped_by_levei'`` -- the ``[element, href]`` pairs of
          every group after ``grouping_list_by_levenstein`` (keyed on the
          href), with empty results dropped and the nesting flattened by
          ``reduce_list_nesting``.
    """
    # Adjacent literals are concatenated at compile time; the resulting
    # expression (including its trailing space) is unchanged.
    next_link_xpath = (
        "//a[contains(@id, 'next')] | "
        "//a[contains(@class, 'next')] | "
        "//a[contains(text(), 'Next')] | "
        "//a[contains(text(), 'next')] "
    )
    ret = {
        'urls_with_next_somehow': grouped_hrefs_from_page(
            root, xpath=next_link_xpath),
    }

    # List comprehensions instead of map()/filter(): on Python 3 those
    # return lazy, single-use iterators that are always truthy, so the
    # empty-group filter below would silently stop filtering.
    element_href_pairs = [
        [[item[-1], item[-1].get("href")] for item in href_set]
        for href_set in href_sets
    ]
    grouped = [
        grouping_list_by_levenstein(pairs, key=lambda pair: pair[1])
        for pairs in element_href_pairs
    ]
    grouped = [group for group in grouped if group]
    grouped = reduce_list_nesting(grouped)

    # Called for its side effect: the callback appends the xpath of a
    # group's first entry to that group (presumably map_list_list applies
    # it to every inner list -- verify against the helper).
    map_list_list(
        lambda group: group.append(EInfo(group[0]).create_xpath()), grouped)
    ret['urls_grouped_by_levei'] = grouped
    return ret
Exemplo n.º 2
0
def grouped_hrefs_from_page_sets(root, xpath="//a"):
    """Group the parent paths of a page's URL-bearing elements.

    Elements are collected with ``e_with_url_from_root(root, xpath=xpath)``,
    each one is reduced to ``EInfo(e).parentpath()``, the paths are bucketed
    by ``len()``, every bucket is passed to
    ``grouping_sorted_paths_by_xpath`` and the per-bucket results are
    flattened with ``reduce_list_nesting``.

    Args:
        root: Document root, passed through to ``e_with_url_from_root``.
        xpath: XPath expression selecting the candidate elements; defaults
            to every ``<a>`` element.

    Returns:
        The value produced by ``reduce_list_nesting`` for the grouped paths
        (presumably a flat list of href sets -- confirm against that helper).
    """
    elements = e_with_url_from_root(root, xpath=xpath)
    paths = [EInfo(element).parentpath() for element in elements]

    # groupby() only merges adjacent items, so sort by the same key first.
    paths_by_length = sorted(paths, key=len)

    # A comprehension replaces the tuple-unpacking lambda, which is a
    # SyntaxError on Python 3. Each group is materialised immediately,
    # because groupby() invalidates a group once the iterator advances.
    same_length_groups = [
        list(group)
        for _, group in itertools.groupby(paths_by_length, key=len)
    ]

    grouped = [
        grouping_sorted_paths_by_xpath(group) for group in same_length_groups
    ]
    return reduce_list_nesting(grouped)
Exemplo n.º 3
0
def grouped_hrefs_from_page_sets(root, xpath="//a"):
    """Group the parent paths of a page's URL-bearing elements.

    Elements are collected with ``e_with_url_from_root(root, xpath=xpath)``,
    each one is reduced to ``EInfo(e).parentpath()``, the paths are bucketed
    by ``len()``, every bucket is passed to
    ``grouping_sorted_paths_by_xpath`` and the per-bucket results are
    flattened with ``reduce_list_nesting``.

    Args:
        root: Document root, passed through to ``e_with_url_from_root``.
        xpath: XPath expression selecting the candidate elements; defaults
            to every ``<a>`` element.

    Returns:
        The value produced by ``reduce_list_nesting`` for the grouped paths
        (presumably a flat list of href sets -- confirm against that helper).
    """
    elements = e_with_url_from_root(root, xpath=xpath)
    paths = [EInfo(element).parentpath() for element in elements]

    # groupby() only merges adjacent items, so sort by the same key first.
    paths_by_length = sorted(paths, key=len)

    # A comprehension replaces the tuple-unpacking lambda, which is a
    # SyntaxError on Python 3. Each group is materialised immediately,
    # because groupby() invalidates a group once the iterator advances.
    same_length_groups = [
        list(group)
        for _, group in itertools.groupby(paths_by_length, key=len)
    ]

    grouped = [
        grouping_sorted_paths_by_xpath(group) for group in same_length_groups
    ]
    return reduce_list_nesting(grouped)
Exemplo n.º 4
0
def looking_for_next_page(root, href_sets):
    """Collect candidate "next page" links for a parsed page.

    Args:
        root: Document root, passed straight through to
            ``grouped_hrefs_from_page``.
        href_sets: Iterable of groups. Each item of a group must be an
            indexable sequence whose last entry offers ``.get("href")``
            (presumably an lxml element -- TODO confirm against callers).

    Returns:
        dict with two keys:

        * ``'urls_with_next_somehow'`` -- the result of
          ``grouped_hrefs_from_page`` for anchors whose ``id`` or ``class``
          contains ``next`` or whose text contains ``Next``/``next``.
        * ``'urls_grouped_by_levei'`` -- the ``[element, href]`` pairs of
          every group after ``grouping_list_by_levenstein`` (keyed on the
          href), with empty results dropped and the nesting flattened by
          ``reduce_list_nesting``.
    """
    # Adjacent literals are concatenated at compile time; the resulting
    # expression (including its trailing space) is unchanged.
    next_link_xpath = (
        "//a[contains(@id, 'next')] | "
        "//a[contains(@class, 'next')] | "
        "//a[contains(text(), 'Next')] | "
        "//a[contains(text(), 'next')] "
    )
    ret = {
        'urls_with_next_somehow': grouped_hrefs_from_page(
            root, xpath=next_link_xpath),
    }

    # List comprehensions instead of map()/filter(): on Python 3 those
    # return lazy, single-use iterators that are always truthy, so the
    # empty-group filter below would silently stop filtering.
    element_href_pairs = [
        [[item[-1], item[-1].get("href")] for item in href_set]
        for href_set in href_sets
    ]
    grouped = [
        grouping_list_by_levenstein(pairs, key=lambda pair: pair[1])
        for pairs in element_href_pairs
    ]
    grouped = [group for group in grouped if group]
    grouped = reduce_list_nesting(grouped)

    # Called for its side effect: the callback appends the xpath of a
    # group's first entry to that group (presumably map_list_list applies
    # it to every inner list -- verify against the helper).
    map_list_list(
        lambda group: group.append(EInfo(group[0]).create_xpath()), grouped)
    ret['urls_grouped_by_levei'] = grouped
    return ret