def deduplicate_links_enumerated(
        self, links: Iterable[Dict], seen_urls: Optional[Set] = None,
) -> Iterator[Tuple[int, Dict]]:
    """Yield ``(index, link)`` pairs for links whose canonical URL has not
    been seen before, skipping duplicates.

    See :meth:`deduplicate_links`.

    :param links: iterable of link dicts; each must carry a ``'url'`` key.
    :param seen_urls: set used to track canonical URLs already emitted.
        Defaults to ``self.seen_urls``, so state persists across calls.
        The set is mutated in place.
    """
    # Fall back to the instance-level set so deduplication is shared
    # across repeated calls on the same object.
    tracked = self.seen_urls if seen_urls is None else seen_urls
    for position, link in enumerate(links):
        canonical = canonicalize_url(link['url'])
        if canonical not in tracked:
            tracked.add(canonical)
            yield position, link
def _clean_page_url_keep_domain(link: Dict) -> str:
    """Return the canonical form of *link*'s ``'page_url'`` value.

    :param link: link dict; read via ``link.get('page_url')``.
        NOTE(review): if the key is absent this passes ``None`` to
        ``canonicalize_url`` — confirm callers always supply ``page_url``.
    """
    page_url = link.get('page_url')
    return canonicalize_url(page_url)