def hn_turn_page(url: URL, response_body: Mapping) -> Optional[URL]:
    """Return the URL of the next results page, or ``None`` on the last page.

    ``response_body`` must provide ``"nbPages"`` (the number of pages) and
    ``"page"`` (the index of the page that was just fetched).  The last page
    index is taken to be ``nbPages - 1``, i.e. page indices are treated as
    zero-based.

    The next URL is produced by re-encoding the query string of ``url`` with
    its ``page`` parameter set to the following index, and following that
    query-only reference from ``url``.

    Raises:
        KeyError: if ``response_body`` lacks ``"nbPages"`` or ``"page"``.
    """
    last_page = response_body["nbPages"] - 1
    this_page = response_body["page"]
    has_next = this_page < last_page
    if not has_next:
        return None

    # NOTE(review): parse_qs drops parameters with blank values by default,
    # so such parameters are not carried over to the next URL - confirm that
    # this is intended.
    params = parse_qs(url.query)
    # parse_qs yields lists of strings; a bare int is fine here because
    # urlencode(doseq=True) stringifies values that are not sequences.
    params["page"] = this_page + 1
    return url.follow("?" + urlencode(params, doseq=True))
def extract_links(root, url: URL) -> Set[URL]:
    """Collect the targets of every ``<a href=...>`` found under ``root``.

    Each ``href`` is resolved by following it from ``url``.  Anchors without
    an ``href`` attribute are ignored, and an ``href`` that raises
    ``URLException`` when followed is logged at debug level and skipped
    rather than aborting the scan.

    Returns:
        The set of resolved link URLs (duplicates collapse naturally).
    """
    found: Set[URL] = set()
    for anchor in root.xpath("//a"):
        href = anchor.attrib.get("href")
        if href is None:
            continue
        try:
            found.add(url.follow(href, coerce_canonicalisation=True))
        except URLException:
            log.debug("bad link: %s (from: %s)", href, url)
    return found
def extract_canonical_link(root, url: URL) -> Optional[URL]:
    """Return the page's declared canonical URL, resolved from ``url``.

    Only the first ``<link rel="canonical">`` under ``<head>`` is considered.

    Returns:
        The canonical URL, or ``None`` when there is no such element, when
        it has no ``href`` attribute, or when following the ``href`` from
        ``url`` raises ``URLException``.  Each of those outcomes emits
        exactly one debug message naming the reason.
    """
    rel_canonicals = root.xpath("//head/link[@rel='canonical']")
    if not rel_canonicals:
        log.debug("no canonical link found for %s", url)
        return None

    href = rel_canonicals[0].attrib.get("href")
    if href is None:
        log.debug("canonical link with no href on %s", url)
        return None

    try:
        return url.follow(href, coerce_canonicalisation=True)
    except URLException:
        log.debug("bad canonical link: %s (from %s)", href, url)
        # Return here so an invalid canonical link is not additionally
        # reported as "no canonical link found".
        return None
def extract_icons(root, url: URL) -> Sequence[Icon]:
    """Build an ``Icon`` for each icon ``<link>`` declared in ``<head>``.

    A ``<link>`` is treated as an icon when its ``rel`` attribute is exactly
    one of ``icon``, ``shortcut icon``, ``apple-touch-icon`` or
    ``alternate icon``.  Its ``href`` is resolved by following it from
    ``url``; ``type`` and ``sizes`` are passed through when present
    (``None`` otherwise) and every icon gets ``IconScope.PAGE``.

    Elements without an ``href``, or whose ``href`` raises ``URLException``
    when followed, are logged at debug level and skipped so that one bad
    ``<link>`` does not discard the page's other icons.

    Returns:
        The icons in document order; empty when none are usable.
    """
    icon_elements = root.xpath(
        "//head/link[(@rel='icon' or @rel='shortcut icon' or @rel='apple-touch-icon' or @rel='alternate icon')]"
    )
    icons = []
    for icon_element in icon_elements:
        attrib = icon_element.attrib
        href = attrib.get("href")
        if href is None:
            # Nothing to resolve - there is no icon URL to record.
            log.debug("icon link with no href on %s", url)
            continue
        try:
            icon_url = url.follow(href, coerce_canonicalisation=True)
        except URLException:
            log.debug("bad icon link: %s (from: %s)", href, url)
            continue
        icons.append(
            Icon(
                url=icon_url,
                scope=IconScope.PAGE,
                type=attrib.get("type"),
                # The XPath above only matches elements that carry a rel.
                rel_text=attrib["rel"],
                sizes=attrib.get("sizes"),
            )
        )
    return icons