import itertools
from pprint import pformat, pprint

# Helpers such as EInfo, wget_root, xpath_by, verify_xpath, print_es,
# e_with_url_from_root, e_by_url_from_page, cmp_urls, reduce_list_nesting,
# map_list_list and grouping_list_by_levenstein are defined elsewhere in
# this module.


def grouping_sorted_paths_by_xpath(paths):
    """Group parent-paths whose last element yields the same attribute-free xpath."""
    #pprint(paths)
    f = lambda e: EInfo(e[-1]).create_xpath(with_attribs=False)
    paths_by_xpath = sorted(paths, key=f)
    p = itertools.groupby(paths_by_xpath, key=f)
    p = map(lambda (k, i): list(i), p)
    return p
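# Aside: itertools.groupby only merges *adjacent* equal-keyed items, which is
# why the list is sorted by the same key function first. A stdlib-only
# illustration (grouping strings by length instead of paths by xpath):
#
#   >>> import itertools
#   >>> data = ["bb", "a", "cc", "d"]
#   >>> [list(g) for _, g in itertools.groupby(sorted(data, key=len), key=len)]
#   [['a', 'd'], ['bb', 'cc']]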
def xpath_for_es(es, root):
    """Build one xpath covering all elements in `es` and verify it against `root`."""
    #pprint(es)
    paths = map(lambda e: EInfo(e).parentpath_tuples(), es)
    #pprint(paths)
    xpath = xpath_by(paths)
    errstring = verify_xpath(xpath, root, paths)
    return xpath, errstring
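# Hedged usage sketch (wget_root is this module's fetch-and-parse helper, as
# used in xpath_for_products_next below; the URL and the item selector are
# hypothetical):
#
#   _, root = wget_root("http://example.com/catalog")
#   es = root.xpath("//a[@class='item']")
#   xpath, err = xpath_for_es(es, root)
#   if err:
#       print err
#   else:
#       print xpath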
def looking_for_next_page(root, href_sets):
    ret = {}
    ret['urls_with_next_somehow'] = grouped_hrefs_from_page(
        root,
        xpath="//a[contains(@id, 'next')] | //a[contains(@class, 'next')] | "
              "//a[contains(text(), 'Next')] | //a[contains(text(), 'next')]")
    # pair each group's terminal element with its href
    r = map(lambda x: map(lambda y: [y[-1], y[-1].get("href")], x), href_sets)
    # cluster the pairs by Levenshtein distance over the href strings
    r = map(lambda x: grouping_list_by_levenstein(x, key=lambda y: y[1]), r)
    r = filter(None, r)
    r = reduce_list_nesting(r)
    # append an xpath built from each pair's element
    map_list_list(lambda x: x.append(EInfo(x[0]).create_xpath()), r)
    ret['urls_grouped_by_levei'] = r
    return ret
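# Shape of the return value, as derivable from the code above:
#   ret['urls_with_next_somehow'] -- the (groups, href_sets) pair produced by
#       grouped_hrefs_from_page for "next"-looking anchors
#   ret['urls_grouped_by_levei']  -- nested [element, href, xpath] lists,
#       clustered by Levenshtein distance over the hrefs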
def item_info_from_es(es):
    if not es:
        return
    e = es[0]
    i = EInfo(e)
    i.info()
    if i.urls and False:  # debugging branch, intentionally disabled by `and False`
        c = EInfo(i.urls[0][0])
        #print i.childpath(i.urls[0][0])
        print c.search_for_xpath(e, 1, relroot=e)
    #pprint(i.e.findall(".//*"))
    # elements whose text mentions a currency symbol or code
    _es = i.e.xpath(u".//*[contains(text(), '$')] | .//*[contains(text(), '€')] | "
                    u".//*[contains(text(), 'EUR')] | .//*[contains(text(), 'USD')] | "
                    u".//*[contains(text(), 'GBP')]")
    #print i.e.xpath(".//*[contains(text(), 'Filet')]")
    print_es(_es)
def grouped_hrefs_from_page_sets(root, xpath="//a"):
    """Collect link elements and group their parent-paths, first by path
    length, then by attribute-free xpath within each length bucket."""
    #es = root.xpath(xpath)
    es = e_with_url_from_root(root, xpath=xpath)
    paths = [EInfo(e).parentpath() for e in es]
    # grouping by path length (groupby needs the pre-sorted input)
    paths_by_length = sorted(paths, key=len)
    p = itertools.groupby(paths_by_length, key=len)
    p = map(lambda (k, i): list(i), p)
    #pprint(p)
    # subdivide each length bucket by xpath
    p = map(grouping_sorted_paths_by_xpath, p)
    href_sets = reduce_list_nesting(p)
    return href_sets
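# Hedged usage sketch. The .xpath()/.get("href") calls above imply lxml-style
# elements, so an lxml.html parse should feed this directly (html_text is
# assumed to hold the page source):
#
#   import lxml.html
#   root = lxml.html.fromstring(html_text)
#   href_sets = grouped_hrefs_from_page_sets(root, xpath="//a")
#   print len(href_sets), "link groups"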
def xpath_for_products_next(self):
    """Find an xpath that selects exactly the next-page link on a listing page.

    A method of a scraper class: relies on self.products_first_second_third,
    self.debug and self.next_products_xpath.
    """
    def is_only_url_in_es(url, es):
        # True iff every element in `es` links to `url`
        return not bool(
            filter(lambda a: not cmp_urls(a.get("href"), url), es))

    first_url, second_url, third_url = self.products_first_second_third[0:3]
    _, first = wget_root(first_url)
    candidates = e_by_url_from_page(first, second_url)
    if self.debug:
        print "XPath raw candidates: ", pformat(candidates)
    #print "XPATHs"
    #pprint(map(lambda e: EInfo(e).search_for_xpath_ng(first, lambda z: True), candidates))
    candidates = map(
        lambda e: (e, EInfo(e).search_for_xpath_ng(
            first,
            lambda es: is_only_url_in_es(second_url, es))),
        candidates)
    if self.debug:
        print "XPath first candidates: ", pformat(candidates)
    # re-verify each candidate on the second page: it must single out third_url
    _, second = wget_root(second_url)
    if self.debug:
        for e, xpaths in candidates:
            for xpath in xpaths:
                print xpath
                print third_url
                es = second.xpath(xpath)
                print_es(es)
    candidates = filter(
        lambda (e, xpaths): len(
            filter(
                lambda xpath: is_only_url_in_es(third_url, second.xpath(xpath)),
                xpaths)),
        candidates)
    #print "XPath last candidates: ", pformat(candidates)
    if candidates:
        # keep the first xpath of the first surviving candidate
        self.next_products_xpath = candidates[0][1][0]
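# Idea sketch for the verification above: a candidate xpath survives only if,
# on page N, every element it selects links to page N+1 -- checked once with
# (first page -> second_url) and re-checked with (second page -> third_url).
# This assumes cmp_urls follows cmp() conventions (returns 0, i.e. falsy, when
# the two URLs are equivalent), which is what is_only_url_in_es relies on:
#
#   es = second.xpath(xpath)              # elements the xpath selects on page 2
#   is_only_url_in_es(third_url, es)      # True -> xpath selects only the next link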
def grouped_hrefs_from_page(root, xpath="//a", max_urls_out=10):
    href_sets = grouped_hrefs_from_page_sets(root, xpath=xpath)
    ret = map(
        lambda e: {
            'len': len(e),
            'xpath': EInfo(e[0][-1]).search_for_xpath(root, len(e))[0],
            'urls': map(lambda x: (x[-1].get('href'), x[-1].text), e)[:max_urls_out],
        },
        href_sets)
    ret = sorted(ret, key=lambda x: x['len'], reverse=True)
    # verify each xpath back against the tree and record how many elements it hits
    map(lambda e: e.update(
        len_by_xpath=len(root.xpath(e['xpath'])) if e['xpath'] else None), ret)
    return ret, href_sets
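# Hedged usage sketch (root as in the sketch after grouped_hrefs_from_page_sets).
# A mismatch between 'len' and 'len_by_xpath' flags an xpath that over- or
# under-selects relative to the group it was derived from:
#
#   groups, href_sets = grouped_hrefs_from_page(root)
#   for g in groups[:3]:
#       print g['len'], g['len_by_xpath'], g['xpath']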
def collect_info_by_es(es):
    return map(lambda x: EInfo(x).info(), es)
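# End-to-end sketch under the same assumptions as above (hypothetical URL;
# wget_root returns a (response, root) pair as in xpath_for_products_next):
#
#   _, root = wget_root("http://example.com/catalog")
#   groups, href_sets = grouped_hrefs_from_page(root)
#   pprint(looking_for_next_page(root, href_sets))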