Example #1
0
 def deduplicate_links_enumerated(
         self,
         links: Iterable[Dict],
         seen_urls: Optional[Set] = None) -> Iterator[Tuple[int, Dict]]:
     """
     Yield ``(index, link)`` pairs for links whose canonicalized URL has
     not been encountered before. See :meth:`deduplicate_links`.
     """
     # Default to the instance-wide set so duplicates are tracked across calls.
     seen = self.seen_urls if seen_urls is None else seen_urls
     for position, candidate in enumerate(links):
         canonical_url = canonicalize_url(candidate['url'])
         if canonical_url not in seen:
             seen.add(canonical_url)
             yield position, candidate
Example #2
0
def _clean_page_url_keep_domain(link: Dict) -> str:
    """Return the canonical form of a link's ``page_url`` value."""
    # NOTE(review): `.get` yields None when 'page_url' is absent — presumably
    # callers guarantee the key; confirm canonicalize_url tolerates None.
    page_url = link.get('page_url')
    return canonicalize_url(page_url)