def looking_for_next_page(root, href_sets):
    """Gather candidate "next page" links for a page.

    Args:
        root: Parsed page; handed unchanged to ``grouped_hrefs_from_page``.
        href_sets: Iterable of groups. Every entry of a group is a sequence
            whose last item exposes ``.get("href")`` (presumably a link
            element -- confirm against the caller).

    Returns:
        A dict with two keys:

        * ``'urls_with_next_somehow'`` -- whatever
          ``grouped_hrefs_from_page`` yields for ``<a>`` elements whose
          ``id`` or ``class`` contains ``next`` or whose text contains
          ``Next``/``next``.
        * ``'urls_grouped_by_levei'`` -- the ``[item, href]`` pairs built
          from ``href_sets``, grouped per set by
          ``grouping_list_by_levenstein`` (keyed on the href), with empty
          results dropped and the outcome passed through
          ``reduce_list_nesting``.
    """
    # Adjacent literals concatenate to the exact original expression,
    # including its trailing space.
    next_link_xpath = (
        "//a[contains(@id, 'next')] | "
        "//a[contains(@class, 'next')] | "
        "//a[contains(text(), 'Next')] | "
        "//a[contains(text(), 'next')] "
    )

    ret = {}
    ret['urls_with_next_somehow'] = grouped_hrefs_from_page(
        root, xpath=next_link_xpath)

    # add url
    # Real lists are built here (instead of chained map()/filter()) so the
    # helpers receive sequences on Python 3 as well, where map/filter return
    # lazy one-shot iterators.
    pairs_per_set = [
        [[entry[-1], entry[-1].get("href")] for entry in href_set]
        for href_set in href_sets
    ]
    grouped = [
        grouping_list_by_levenstein(pairs, key=lambda pair: pair[1])
        for pairs in pairs_per_set
    ]
    # Drop falsy (e.g. empty) grouping results, as filter(None, ...) did.
    grouped = [groups for groups in grouped if groups]

    r = reduce_list_nesting(grouped)
    # Called for its side effect: each visited item gets the xpath of its
    # first element appended.
    # NOTE(review): this relies on map_list_list applying the callable
    # eagerly -- confirm against its definition.
    map_list_list(lambda x: x.append(EInfo(x[0]).create_xpath()), r)
    ret['urls_grouped_by_levei'] = r
    return ret
def xpath_for_categories(self):
    """Compute ``self.categories_xpath`` from the page at ``self.categories_url``.

    The page is fetched with ``wget_root``, its grouped hrefs are reduced to
    the last item of every entry, groups are kept only when
    ``is_grouped_hrefs_has_urls(group, self.products)`` is truthy, and
    ``xpath_for_es`` is evaluated for each surviving group.

    Side effects:
        Sets ``self.categories_xpath`` to a list holding one
        ``xpath_for_es`` result per kept group. Returns ``None``.
    """
    # Only the parsed root is needed; the first value returned by
    # wget_root is unused here.
    _text, root = wget_root(self.categories_url)

    grouped_hrefs = grouped_hrefs_from_page_sets(root)
    grouped_hrefs = map_list_list(lambda x: x[-1], grouped_hrefs)

    # Concrete lists rather than filter()/map(): on Python 3 those return
    # one-shot iterators, so the stored attribute would be empty after its
    # first traversal and would support neither len() nor indexing.
    matching_groups = [
        group for group in grouped_hrefs
        if is_grouped_hrefs_has_urls(group, self.products)
    ]
    self.categories_xpath = [xpath_for_es(es, root) for es in matching_groups]
def xpath_for_categories(self):
    """Compute ``self.categories_xpath`` from the page at ``self.categories_url``.

    The page is fetched with ``wget_root``, its grouped hrefs are reduced to
    the last item of every entry, groups are kept only when
    ``is_grouped_hrefs_has_urls(group, self.products)`` is truthy, and
    ``xpath_for_es`` is evaluated for each surviving group.

    Side effects:
        Sets ``self.categories_xpath`` to a list holding one
        ``xpath_for_es`` result per kept group. Returns ``None``.
    """
    # Only the parsed root is needed; the first value returned by
    # wget_root is unused here.
    _text, root = wget_root(self.categories_url)

    grouped_hrefs = grouped_hrefs_from_page_sets(root)
    grouped_hrefs = map_list_list(lambda x: x[-1], grouped_hrefs)

    # Concrete lists rather than filter()/map(): on Python 3 those return
    # one-shot iterators, so the stored attribute would be empty after its
    # first traversal and would support neither len() nor indexing.
    matching_groups = [
        group for group in grouped_hrefs
        if is_grouped_hrefs_has_urls(group, self.products)
    ]
    self.categories_xpath = [xpath_for_es(es, root) for es in matching_groups]
def looking_for_next_page(root, href_sets):
    """Gather candidate "next page" links for a page.

    Args:
        root: Parsed page; handed unchanged to ``grouped_hrefs_from_page``.
        href_sets: Iterable of groups. Every entry of a group is a sequence
            whose last item exposes ``.get("href")`` (presumably a link
            element -- confirm against the caller).

    Returns:
        A dict with two keys:

        * ``'urls_with_next_somehow'`` -- whatever
          ``grouped_hrefs_from_page`` yields for ``<a>`` elements whose
          ``id`` or ``class`` contains ``next`` or whose text contains
          ``Next``/``next``.
        * ``'urls_grouped_by_levei'`` -- the ``[item, href]`` pairs built
          from ``href_sets``, grouped per set by
          ``grouping_list_by_levenstein`` (keyed on the href), with empty
          results dropped and the outcome passed through
          ``reduce_list_nesting``.
    """
    # Adjacent literals concatenate to the exact original expression,
    # including its trailing space.
    next_link_xpath = (
        "//a[contains(@id, 'next')] | "
        "//a[contains(@class, 'next')] | "
        "//a[contains(text(), 'Next')] | "
        "//a[contains(text(), 'next')] "
    )

    ret = {}
    ret['urls_with_next_somehow'] = grouped_hrefs_from_page(
        root, xpath=next_link_xpath)

    # add url
    # Real lists are built here (instead of chained map()/filter()) so the
    # helpers receive sequences on Python 3 as well, where map/filter return
    # lazy one-shot iterators.
    pairs_per_set = [
        [[entry[-1], entry[-1].get("href")] for entry in href_set]
        for href_set in href_sets
    ]
    grouped = [
        grouping_list_by_levenstein(pairs, key=lambda pair: pair[1])
        for pairs in pairs_per_set
    ]
    # Drop falsy (e.g. empty) grouping results, as filter(None, ...) did.
    grouped = [groups for groups in grouped if groups]

    r = reduce_list_nesting(grouped)
    # Called for its side effect: each visited item gets the xpath of its
    # first element appended.
    # NOTE(review): this relies on map_list_list applying the callable
    # eagerly -- confirm against its definition.
    map_list_list(lambda x: x.append(EInfo(x[0]).create_xpath()), r)
    ret['urls_grouped_by_levei'] = r
    return ret