예제 #1
0
def grouping_sorted_paths_by_xpath(paths):
    """Group paths whose final element yields the same XPath.

    Paths are sorted by the XPath of their last element first, because
    itertools.groupby only merges *adjacent* equal keys.

    :param paths: iterable of element-path sequences; the last entry of
        each path is wrapped in EInfo to derive its XPath.
    :return: list of lists, one sub-list per distinct XPath.
    """
    # The original used a Py2-only tuple-unpacking lambda
    # (``lambda (k, i)``, removed by PEP 3113) and a lazy-on-Py3 ``map``;
    # this form behaves identically on Python 2 and 3.
    def key(path):
        return EInfo(path[-1]).create_xpath(with_attribs=False)

    paths_by_xpath = sorted(paths, key=key)
    return [list(group) for _, group in itertools.groupby(paths_by_xpath, key=key)]
예제 #2
0
def xpath_for_es(es, root):
    """Derive a common XPath for the elements *es* and verify it.

    :param es: elements to describe.
    :param root: document root used when verifying the derived XPath.
    :return: ``(xpath, errstring)`` where *errstring* carries any problem
        reported by ``verify_xpath``.
    """
    parent_paths = [EInfo(element).parentpath_tuples() for element in es]
    xpath = xpath_by(parent_paths)
    errstring = verify_xpath(xpath, root, parent_paths)
    return xpath, errstring
예제 #3
0
def looking_for_next_page(root, href_sets):
    """Collect candidate "next page" links from *root*.

    Two strategies are combined:
      * a heuristic XPath matching anchors whose id/class/text contains
        "next"/"Next";
      * grouping the pre-collected *href_sets* by Levenshtein similarity
        of their URLs.

    :param root: parsed document root.
    :param href_sets: grouped lists of element paths; the last entry of
        each path is the anchor element itself.
    :return: dict with keys 'urls_with_next_somehow' and
        'urls_grouped_by_levei'.
    """
    ret = {}
    ret['urls_with_next_somehow'] = grouped_hrefs_from_page(
        root,
        xpath=
        "//a[contains(@id, 'next')] | //a[contains(@class, 'next')] | //a[contains(text(), 'Next')] | //a[contains(text(), 'next')] "
    )
    # Pair every anchor with its URL.  Explicit comprehensions replace the
    # original nested ``map``/``filter`` calls, which only worked because
    # Python 2's map/filter are eager (on Python 3 the pipeline would stay
    # lazy and the side-effecting helper below would never run).
    pairs_per_set = [[[path[-1], path[-1].get("href")] for path in href_set]
                     for href_set in href_sets]
    grouped = [grouping_list_by_levenstein(pairs, key=lambda y: y[1])
               for pairs in pairs_per_set]
    r = reduce_list_nesting([group for group in grouped if group])
    # Appends each group's XPath in place -- kept verbatim from the
    # original; map_list_list semantics not visible here, TODO confirm.
    map_list_list(lambda x: x.append(EInfo(x[0]).create_xpath()), r)
    ret['urls_grouped_by_levei'] = r
    return ret
예제 #4
0
파일: htmlwipe.py 프로젝트: kittle/htmlwipe
def item_info_from_es(es):
    """Print diagnostic information about the first element of *es*.

    Dumps EInfo details for ``es[0]`` and prints every descendant whose
    text contains a currency marker ($, EUR sign, EUR, USD, GBP).

    :param es: list of elements; empty/None input is silently ignored.
    """
    if not es:
        return

    e = es[0]
    i = EInfo(e)
    i.info()
    # Dead branch kept from the original (``and False``) -- presumably a
    # debugging toggle; TODO confirm before removing.
    if i.urls and False:
        c = EInfo(i.urls[0][0])
        # print(i.childpath(i.urls[0][0]))
        # Single-argument call form works on both Python 2 and 3; the
        # original Py2 ``print`` statement is a syntax error on Py3.
        print(c.search_for_xpath(e, 1, relroot=e))

    # pprint(i.e.findall(".//*"))
    _es = i.e.xpath(
        u".//*[contains(text(), '$')] | .//*[contains(text(), '€')] | .//*[contains(text(), 'EUR')] | .//*[contains(text(), 'USD')] | .//*[contains(text(), 'GBP')]"
    )
    # print(i.e.xpath(".//*[contains(text(), 'Filet')]"))
    print_es(_es)
예제 #5
0
def grouped_hrefs_from_page_sets(root, xpath="//a"):
    """Collect anchors from *root* and group their parent paths.

    Grouping happens in two passes: first by path length, then (within
    each length bucket) by the XPath of the anchor itself.

    :param root: parsed document root.
    :param xpath: XPath used to select candidate anchors.
    :return: flat list of path groups.
    """
    # es = root.xpath(xpath)
    es = e_with_url_from_root(root, xpath=xpath)

    paths = [EInfo(e).parentpath() for e in es]

    # Group by path length.  groupby only merges adjacent runs, hence the
    # sort by the same key first.  (The original used a Py2-only
    # tuple-unpacking lambda, ``lambda (k, i)``, removed by PEP 3113.)
    paths_by_length = sorted(paths, key=len)
    length_groups = [list(group)
                     for _, group in itertools.groupby(paths_by_length, key=len)]
    # pprint(length_groups)

    # Within each length bucket, group further by XPath.
    xpath_groups = [grouping_sorted_paths_by_xpath(bucket)
                    for bucket in length_groups]

    href_sets = reduce_list_nesting(xpath_groups)
    return href_sets
예제 #6
0
    def xpath_for_products_next(self):
        """Find an XPath navigating from one product page to the next.

        Uses the first three product URLs: candidate anchors on page 1
        must point at URL 2; each candidate XPath is then re-verified on
        page 2, where it must select only anchors pointing at URL 3.  The
        first surviving XPath is stored in ``self.next_products_xpath``.
        """

        def is_only_url_in_es(url, es):
            # True when every anchor in *es* links to *url* (vacuously
            # true for an empty list -- same as the original
            # ``not bool(filter(...))`` check, which additionally broke on
            # Python 3 where filter() returns an always-truthy iterator).
            # cmp_urls is assumed truthy on a URL match -- TODO confirm.
            return all(cmp_urls(a.get("href"), url) for a in es)

        first_url, second_url, third_url = self.products_first_second_third[
            0:3]
        _, first = wget_root(first_url)
        candidates = e_by_url_from_page(first, second_url)

        if self.debug:
            # print() call form replaces the Py2-only print statement.
            print("XPath raw candidates: ", pformat(candidates))
            # print("XPATHs")
            # pprint([EInfo(e).search_for_xpath_ng(first, lambda z: True) for e in candidates])

        # Pair every candidate element with the XPaths that select only
        # links to second_url on the first page.
        candidates = [
            (e, EInfo(e).search_for_xpath_ng(
                first, lambda es: is_only_url_in_es(second_url, es)))
            for e in candidates
        ]
        if self.debug:
            print("XPath first candidates: ", pformat(candidates))

        # Re-verify: on the second page, a surviving XPath must select
        # only anchors that lead to third_url.
        _, second = wget_root(second_url)

        if self.debug:
            for e, xpaths in candidates:
                for xpath in xpaths:
                    print(xpath)
                    print(third_url)
                    es = second.xpath(xpath)
                    print_es(es)

        # The original used a Py2-only tuple-unpacking lambda here
        # (``lambda (e, xpaths)``); len(filter(...)) truthiness is
        # equivalent to any(...).
        candidates = [
            (e, xpaths) for e, xpaths in candidates
            if any(is_only_url_in_es(third_url, second.xpath(xpath))
                   for xpath in xpaths)
        ]
        # print("XPath last candidates: ", pformat(candidates))

        if candidates:
            self.next_products_xpath = candidates[0][1][0]
예제 #7
0
def grouped_hrefs_from_page(root, xpath="//a", max_urls_out=10):
    """Return grouped link summaries from *root*, largest groups first.

    Each summary carries the group size, a derived XPath, up to
    *max_urls_out* ``(href, text)`` pairs, and ``len_by_xpath`` -- how
    many nodes the derived XPath actually selects, as a round-trip sanity
    check (None when no XPath was derived).

    :param root: parsed document root.
    :param xpath: XPath used to select candidate anchors.
    :param max_urls_out: cap on URLs listed per group.
    :return: ``(summaries, href_sets)`` where *href_sets* is the raw
        grouping from grouped_hrefs_from_page_sets.
    """
    href_sets = grouped_hrefs_from_page_sets(root, xpath=xpath)

    # One summary dict per group.  Comprehension + explicit loop replace
    # the original ``map`` calls; the side-effecting map that set
    # len_by_xpath was a silent no-op under Python 3's lazy map().
    ret = [
        {
            'len': len(group),
            'xpath': EInfo(group[0][-1]).search_for_xpath(root, len(group))[0],
            'urls': [(path[-1].get('href'), path[-1].text)
                     for path in group][:max_urls_out],
        }
        for group in href_sets
    ]
    ret = sorted(ret, key=lambda summary: summary['len'], reverse=True)

    # Verify the xpath round-trips and record how many nodes it selects.
    for summary in ret:
        summary['len_by_xpath'] = (len(root.xpath(summary['xpath']))
                                   if summary['xpath'] else None)

    return ret, href_sets
예제 #8
0
파일: htmlwipe.py 프로젝트: kittle/htmlwipe
def item_info_from_es(es):
    """Print diagnostic information about the first element of *es*.

    Dumps EInfo details for ``es[0]`` and prints every descendant whose
    text contains a currency marker ($, EUR sign, EUR, USD, GBP).

    :param es: list of elements; empty/None input is silently ignored.
    """
    if not es:
        return

    e = es[0]
    i = EInfo(e)
    i.info()
    # Dead branch kept from the original (``and False``) -- presumably a
    # debugging toggle; TODO confirm before removing.
    if i.urls and False:
        c = EInfo(i.urls[0][0])
        # print(i.childpath(i.urls[0][0]))
        # Single-argument call form works on both Python 2 and 3; the
        # original Py2 ``print`` statement is a syntax error on Py3.
        print(c.search_for_xpath(e, 1, relroot=e))

    # pprint(i.e.findall(".//*"))
    _es = i.e.xpath(u".//*[contains(text(), '$')] | .//*[contains(text(), '€')] | .//*[contains(text(), 'EUR')] | .//*[contains(text(), 'USD')] | .//*[contains(text(), 'GBP')]")
    # print(i.e.xpath(".//*[contains(text(), 'Filet')]"))
    print_es(_es)
예제 #9
0
파일: htmlwipe.py 프로젝트: kittle/htmlwipe
def collect_info_by_es(es):
    """Return the ``EInfo.info()`` result for every element in *es*."""
    return [EInfo(element).info() for element in es]