def get_follow_http_html_redirects(self, url: str) -> Response:
    """GET an URL while resolving HTTP / HTML redirects."""
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McGetFollowHTTPHTMLRedirectsException("URL is None.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        raise McGetFollowHTTPHTMLRedirectsException("URL is not HTTP(s): %s" % url)

    if self.max_redirect() == 0:
        raise McGetFollowHTTPHTMLRedirectsException(
            "User agent's max_redirect is 0, subroutine might loop indefinitely."
        )

    response = self.get(url)

    response_after_redirects = self.__get_follow_http_html_redirects(
        response_=response,
        meta_redirects_left=self.max_redirect()
    )

    if response_after_redirects is None:
        # One of the redirects failed -- return original response
        return response
    else:
        return response_after_redirects
def __init__(self, url: str, recursion_level: int, ua: Optional[UserAgent] = None):
    if recursion_level > self.__MAX_RECURSION_LEVEL:
        raise McSitemapsException(
            "Recursion level exceeded {} for URL {}.".format(self.__MAX_RECURSION_LEVEL, url))

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(url))

    try:
        url = normalize_url(url)
    except Exception as ex:
        raise McSitemapsException("Unable to normalize URL {}: {}".format(url, ex))

    if not ua:
        ua = sitemap_useragent()

    self._url = url
    self._ua = ua
    self._recursion_level = recursion_level
def sitemap_tree_for_homepage(homepage_url: str) -> AbstractSitemap:
    """Using a homepage URL, fetch the tree of sitemaps and its stories."""

    if not is_http_url(homepage_url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(homepage_url))

    try:
        url = normalize_url(homepage_url)
    except Exception as ex:
        raise McSitemapsException("Unable to normalize URL {}: {}".format(homepage_url, ex))

    try:
        uri = furl(url)
    except Exception as ex:
        raise McSitemapsException("Unable to parse URL {}: {}".format(url, ex))

    if not is_homepage_url(homepage_url):
        try:
            uri = uri.remove(path=True, query=True, query_params=True, fragment=True)
            log.warning("Assuming that the homepage of {} is {}".format(homepage_url, uri.url))
        except Exception as ex:
            raise McSitemapsException("Unable to determine homepage URL for URL {}: {}".format(homepage_url, ex))

    uri.path = '/robots.txt'
    robots_txt_url = str(uri.url)

    robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, recursion_level=0)
    sitemap_tree = robots_txt_fetcher.sitemap()

    return sitemap_tree
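# Hedged, standalone illustration (not part of the module above): how the robots.txt URL is
# derived with furl in sitemap_tree_for_homepage(). The example URL is made up.
from furl import furl

uri = furl("https://www.example.com/some/page?a=1#frag")
uri = uri.remove(path=True, query=True, query_params=True, fragment=True)
uri.path = '/robots.txt'
print(uri.url)  # https://www.example.com/robots.txt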
def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
    download = decode_object_from_bytes_if_needed(download)

    url = self._download_url(download=download)
    if not is_http_url(url):
        raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

    download['download_time'] = sql_now()
    download['state'] = 'fetching'

    try:
        db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
    except McTupleAlreadyMovedError as ex:
        # Some attempts to set the download's row to "fetching" fail with:
        #
        #   "tuple to be locked was already moved to another partition due to concurrent update"
        #
        # If that happens, we assume that some other fetcher instance somehow got to the download first and do
        # nothing
        log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
        return None
    except Exception as ex:
        # Raise further on misc. errors
        raise ex

    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)

    return response
def url_path_has_mp3_extension(self) -> bool:
    """Return True if URL's path has .mp3 extension."""
    if is_http_url(self.url):
        uri = furl(self.url)
        if '.mp3' in str(uri.path).lower():
            return True

    return False
def __get_meta_refresh_url_from_tag(inner_tag: str, inner_base_url: Optional[str] = None) -> Optional[str]:
    """Given a <meta ...> tag, return the URL from the content="url=XXX" attribute.

    Return None if no such URL is found.
    """
    if not re.search(r'http-equiv\s*?=\s*?["\']\s*?refresh\s*?["\']', inner_tag, re.I):
        return None

    # content="0; url='http://foo.bar'"
    inner_url = None

    match = re.search(r'content\s*?=\s*?"\d*?\s*?;?\s*?URL\s*?=\s*?\'(.+?)\'', inner_tag, re.I)
    if match:
        inner_url = match.group(1)
    else:
        # content='0; url="http://foo.bar"'
        match = re.search(r'content\s*?=\s*?\'\d*?\s*?;?\s*?URL\s*?=\s*?"(.+?)"', inner_tag, re.I)
        if match:
            inner_url = match.group(1)
        else:
            # Fallback: unquoted URL
            match = re.search(r'content\s*?=\s*?["\']\d*?\s*?;?\s*?URL\s*?=\s*?(.+?)["\']', inner_tag, re.I)
            if match:
                inner_url = match.group(1)

    if inner_url is None:
        return None

    if is_http_url(inner_url):
        return inner_url

    if inner_base_url is not None:
        return urljoin(base=str(inner_base_url), url=inner_url)

    return None
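# Hedged, standalone check of the first pattern above (illustrative tag only): pull the target out
# of a double-quoted content attribute whose URL is single-quoted.
import re

tag = '<meta http-equiv="refresh" content="0; url=\'http://foo.bar/\'">'
match = re.search(r'content\s*?=\s*?"\d*?\s*?;?\s*?URL\s*?=\s*?\'(.+?)\'', tag, re.I)
print(match.group(1))  # http://foo.bar/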
def get_links_from_html(html: str) -> typing.List[str]:
    """Return a list of all links that appear in the html.

    Only return absolute urls, because we would rather get fewer internal media source links.
    Also include embedded youtube video urls.

    Arguments:
    html - html to parse

    Returns:
    list of string urls
    """
    soup = BeautifulSoup(html, 'lxml')

    links = []

    # get everything with an href= element rather than just <a /> links
    for tag in soup.find_all(href=True):
        url = tag['href']

        if re.search(IGNORE_LINK_PATTERN, url, flags=re.I) is not None:
            continue

        if not is_http_url(url):
            continue

        url = re.sub(r'(https)?://www[a-z0-9]+.nytimes', r'\1://www.nytimes', url, flags=re.I)

        links.append(url)

    return links
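# Hedged, standalone illustration of the nytimes.com rewrite above (example URL only). On
# Python 3.5+, the unmatched optional (https) group substitutes as an empty string, so plain
# http URLs keep their scheme.
import re

url = 'http://www3.nytimes.com/foo/bar'
print(re.sub(r'(https)?://www[a-z0-9]+.nytimes', r'\1://www.nytimes', url, flags=re.I))
# http://www.nytimes.com/foo/bar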
def link_canonical_url_from_html(html: str, base_url: Optional[str] = None) -> Optional[str]:
    """From the provided HTML, determine the <link rel="canonical" /> URL (if any)."""
    html = decode_object_from_bytes_if_needed(html)
    base_url = decode_object_from_bytes_if_needed(base_url)

    link_elements = re.findall(r'(<\s*?link.+?>)', html, re.I)
    for link_element in link_elements:
        if re.search(r'rel\s*?=\s*?["\']\s*?canonical\s*?["\']', link_element, re.I):
            url = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element, re.I)
            if url:
                url = url.group(1)
                if not is_http_url(url):
                    # Maybe it's an absolute path?
                    if base_url is not None:
                        return urljoin(base=base_url, url=url)
                    else:
                        log.debug(
                            "HTML <link rel='canonical'/> found, but the new URL '%s' doesn't seem to be valid." % url
                        )
                else:
                    # Looks like a URL, so return it
                    return url

    return None
def sitemap(self) -> AbstractSitemap:
    # Serves as an ordered set because we want to deduplicate URLs but keep the order in which they were found
    sitemap_urls = OrderedDict()

    for robots_txt_line in self._content.splitlines():
        robots_txt_line = robots_txt_line.strip()
        # robots.txt is supposed to be case sensitive but who cares in these Node.js times?
        robots_txt_line = robots_txt_line.lower()

        sitemap_match = re.search(r'^sitemap: (.+?)$', robots_txt_line, flags=re.IGNORECASE)
        if sitemap_match:
            sitemap_url = sitemap_match.group(1)
            if is_http_url(sitemap_url):
                sitemap_urls[sitemap_url] = True
            else:
                log.debug("Sitemap URL {} doesn't look like an URL, skipping".format(sitemap_url))

    sub_sitemaps = []

    for sitemap_url in sitemap_urls.keys():
        fetcher = SitemapFetcher(url=sitemap_url, recursion_level=self._recursion_level, ua=self._ua)
        fetched_sitemap = fetcher.sitemap()
        sub_sitemaps.append(fetched_sitemap)

    # noinspection PyArgumentList
    index_sitemap = IndexRobotsTxtSitemap(url=self._url, sub_sitemaps=sub_sitemaps)

    return index_sitemap
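# Hedged, standalone illustration of the robots.txt parsing above (example line only). Note that
# lowercasing the whole line also lowercases the sitemap URL itself.
import re

line = 'Sitemap: https://www.example.com/sitemap_index.xml'.strip().lower()
match = re.search(r'^sitemap: (.+?)$', line, flags=re.IGNORECASE)
print(match.group(1))  # https://www.example.com/sitemap_index.xml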
def meta_refresh_url_from_html(html: str, base_url: Optional[str] = None) -> Optional[str]:
    """From the provided HTML, determine the <meta http-equiv="refresh" /> URL (if any)."""

    def __get_meta_refresh_url_from_tag(inner_tag: str, inner_base_url: Optional[str] = None) -> Optional[str]:
        """Given a <meta ...> tag, return the URL from the content="url=XXX" attribute.

        Return None if no such URL is found.
        """
        if not re.search(r'http-equiv\s*?=\s*?["\']\s*?refresh\s*?["\']', inner_tag, re.I):
            return None

        # content="0; url='http://foo.bar'"
        inner_url = None

        match = re.search(r'content\s*?=\s*?"\d*?\s*?;?\s*?URL\s*?=\s*?\'(.+?)\'', inner_tag, re.I)
        if match:
            inner_url = str(match.group(1))
        else:
            # content='0; url="http://foo.bar"'
            match = re.search(r'content\s*?=\s*?\'\d*?\s*?;?\s*?URL\s*?=\s*?"(.+?)"', inner_tag, re.I)
            if match:
                inner_url = str(match.group(1))
            else:
                # Fallback: unquoted URL
                match = re.search(r'content\s*?=\s*?["\']\d*?\s*?;?\s*?URL\s*?=\s*?(.+?)["\']', inner_tag, re.I)
                if match:
                    inner_url = str(match.group(1))

        # None of the patterns matched
        if inner_url is None:
            return None

        if is_http_url(str(inner_url)):
            return inner_url

        if inner_base_url is not None:
            return urljoin(base=str(inner_base_url), url=str(inner_url))

        return None

    html = str(decode_object_from_bytes_if_needed(html))
    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    if not is_http_url(str(base_url)):
        log.info("Base URL is not HTTP(s): %s" % base_url)

    tags = re.findall(r'(<\s*meta[^>]+>)', html, re.I)
    for tag in tags:
        url = __get_meta_refresh_url_from_tag(tag, base_url)
        if url is not None:
            return url

    return None
def link_canonical_url_from_html(html: str, base_url: Optional[str] = None) -> Optional[str]:
    """From the provided HTML, determine the <link rel="canonical" /> URL (if any)."""
    html = str(decode_object_from_bytes_if_needed(html))
    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    link_elements = re.findall(r'(<\s*?link.+?>)', html, re.I)
    for link_element in link_elements:
        if re.search(r'rel\s*?=\s*?["\']\s*?canonical\s*?["\']', link_element, re.I):
            match = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element, re.I)
            if match:
                url = match.group(1)
                if not is_http_url(url):
                    # Maybe it's absolute path?
                    if base_url is not None:
                        return urljoin(base=base_url, url=url)
                    else:
                        log.debug(
                            "HTML <link rel='canonical'/> found, but the new URL '%s' doesn't seem to be valid." % url
                        )
                else:
                    # Looks like URL, so return it
                    return url

    return None
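# Hedged, standalone illustration of the <link rel="canonical"> extraction above (example HTML
# only), including the urljoin() fallback for a relative href.
import re
from urllib.parse import urljoin

html = '<head><link rel="canonical" href="/canonical-page" /></head>'
link_element = re.findall(r'(<\s*?link.+?>)', html, re.I)[0]
href = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element, re.I).group(1)
print(urljoin(base='http://example.com/a/b', url=href))  # http://example.com/canonical-page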
def _get_links_from_html(html: str) -> List[str]:
    """Return a list of all links that appear in the html.

    Only return absolute urls, because we would rather get fewer internal media source links.
    Also include embedded youtube video urls.

    Arguments:
    html - html to parse

    Returns:
    list of string urls
    """
    soup = BeautifulSoup(html, 'lxml')

    links = []

    # get everything with an href= element rather than just <a /> links
    for tag in soup.find_all(href=True):
        url = tag['href']

        if re.search(IGNORE_LINK_PATTERN, url, flags=re.I) is not None:
            continue

        if not is_http_url(url):
            continue

        url = re.sub(r'(https)?://www[a-z0-9]+.nytimes', r'\1://www.nytimes', url, flags=re.I)

        links.append(url)

    return links
def test_get_links_from_html():
    def test_links(html_: str, links_: list) -> None:
        assert _get_links_from_html(html_) == links_

    test_links('<a href="http://foo.com">', ['http://foo.com'])
    test_links('<link href="http://bar.com">', ['http://bar.com'])
    test_links('<img src="http://img.tag">', [])

    test_links('<a href="http://foo.com"/> <a href="http://bar.com"/>', ['http://foo.com', 'http://bar.com'])

    # transform nyt urls
    test_links('<a href="http://www3.nytimes.com/foo/bar">', ['http://www.nytimes.com/foo/bar'])

    # ignore relative urls
    test_links('<a href="/foo/bar">', [])

    # ignore invalid urls
    test_links(r'<a href="http:\\foo.bar">', [])

    # ignore urls from ignore pattern
    test_links('<a href="http://www.addtoany.com/http://foo.bar">', [])
    test_links('<a href="https://en.unionpedia.org/c/SOE_F_Section_timeline/vs/Special_Operations_Executive">', [])
    test_links('<a href="http://digg.com/submit/this">', [])
    test_links('<a href="http://politicalgraveyard.com/>', [])
    test_links('<a href="http://api.bleacherreport.com/api/v1/tags/cm-punk.json">', [])
    test_links('<a href="http://apidomain.com">', ['http://apidomain.com'])
    test_links('<a href="http://www.rumormillnews.com/cgi-bin/forum.cgi?noframes;read=54990">', [])
    test_links('<a href="http://tvtropes.org/pmwiki/pmwiki.php/Main/ClockTower">', [])
    test_links('<a href="https://twitter.com/account/suspended">', [])
    test_links('<a href="https://misuse.ncbi.nlm.nih.gov/error/abuse.shtml">', [])
    test_links('<a href="https://assets.feedblitzstatic.com/images/blank.gif">', [])
    test_links('<a href="https://accounts.google.com/ServiceLogin">', [])
    test_links('<a href="http://network.wwe.com/video/v92665683/milestone/526767283">', [])

    # sanity test to make sure that we are able to get all of the links from a real html page
    filename = '/opt/mediacloud/tests/data/html-strip/strip.html'
    with open(filename, 'r', encoding='utf8') as fh:
        html = fh.read()

    links = _get_links_from_html(html)
    assert len(links) == 310
    for link in links:
        assert is_http_url(link)
def _fetch_url(db: DatabaseHandler,
               url: str,
               network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
               network_down_port: int = DEFAULT_NETWORK_DOWN_PORT,
               network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
               domain_timeout: Optional[int] = None) -> FetchLinkResponse:
    """Fetch a url and return the content.

    If fetching the url results in a 400 error, check whether the network_down_host is accessible. If so, return
    the errored response. Otherwise, wait network_down_timeout seconds and try again.

    This function catches McGetException and returns a dummy 400 Response object.

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    Response object
    """
    if url_has_binary_extension(url):
        return _make_dummy_bypassed_response(url)

    while True:
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)

        if is_http_url(url):
            ua_response = ua.get_follow_http_html_redirects(url)
            response = FetchLinkResponse.from_useragent_response(url, ua_response)
        else:
            log.warning(f"URL is not HTTP(s), returning dummy response: {url}")
            response = FetchLinkResponse(
                url=url,
                is_success=False,
                code=HTTPStatus.BAD_REQUEST.value,
                message=HTTPStatus.BAD_REQUEST.phrase,
                content='bad url',
                last_requested_url=None,
            )

        if response.is_success:
            return response

        if response.code == HTTPStatus.BAD_REQUEST.value and not tcp_port_is_open(port=network_down_port,
                                                                                  hostname=network_down_host):
            log.warning("Request for %s failed and the network is down. Waiting to retry ..." % (url,))
            time.sleep(network_down_timeout)
        else:
            return response
def xml_element_end(self, name: str) -> None:
    if name == 'sitemap:loc':
        sub_sitemap_url = html_unescape_strip(self._last_char_data)
        if not is_http_url(sub_sitemap_url):
            log.warning("Sub-sitemap URL does not look like one: {}".format(sub_sitemap_url))
        else:
            if sub_sitemap_url not in self._sub_sitemap_urls:
                self._sub_sitemap_urls.append(sub_sitemap_url)

    super().xml_element_end(name=name)
def target_request_from_alarabiya_url(content: str, archive_site_url: str) -> Union[Request, None]:
    """alarabiya uses an interstitial that requires JavaScript.

    If the download URL matches alarabiya and returns the 'requires JavaScript' page, manually parse out the
    necessary cookie and add it to the UserAgent so that the request will work.
    """
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if not is_http_url(archive_site_url):
        log.error("Archive site URL is not HTTP(s): %s" % archive_site_url)
        return None

    if content is None:
        return None

    if not re.search(pattern='alarabiya', string=archive_site_url, flags=re.IGNORECASE):
        return None

    if not re.search(pattern='This site requires JavaScript and Cookies to be enabled',
                     string=content,
                     flags=re.IGNORECASE):
        return None

    matches = re.search(
        pattern=r"setCookie\('(?P<cookie_name>[^']+)', '(?P<cookie_value>[^']+)'",
        string=content,
        flags=re.IGNORECASE,
    )
    if matches:
        cookie_name = matches.group('cookie_name')
        cookie_value = matches.group('cookie_value')

        request = Request(method='GET', url=archive_site_url)
        request.set_header(name='Cookie', value="%s=%s" % (cookie_name, cookie_value,))

        return request

    else:
        log.warning("Unable to parse cookie from alarabiya URL %s: %s" % (archive_site_url, content,))

    return None
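# Hedged, standalone illustration of the setCookie() parsing above; the cookie name and value
# are made up.
import re

content = "setCookie('interstitial_token', 'abc123'); location.reload();"
matches = re.search(r"setCookie\('(?P<cookie_name>[^']+)', '(?P<cookie_value>[^']+)'", content, re.I)
print("%s=%s" % (matches.group('cookie_name'), matches.group('cookie_value')))  # interstitial_token=abc123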
def set_url(self, url: str) -> None:
    """Set URL, e.g. https://www.mediacloud.org/page.html"""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        raise McUserAgentRequestException("URL is None.")
    if len(url) == 0:
        raise McUserAgentRequestException("URL is empty.")

    # Might be coming from "requests" which managed to fetch a bogus URL but we deem it to be invalid
    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        raise McUserAgentRequestException("URL is not HTTP(s): %s" % str(url))

    self.__url = url
def meta_refresh_url_from_html(html: str, base_url: Optional[str] = None) -> Optional[str]:
    """From the provided HTML, determine the <meta http-equiv="refresh" /> URL (if any)."""
    html = str(decode_object_from_bytes_if_needed(html))
    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    if not is_http_url(str(base_url)):
        log.info("Base URL is not HTTP(s): %s" % base_url)

    tags = re.findall(r'(<\s*meta[^>]+>)', html, re.I)
    for tag in tags:
        url = __get_meta_refresh_url_from_tag(tag, base_url)
        if url is not None:
            return url

    return None
def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
    download = decode_object_from_bytes_if_needed(download)

    url = self._download_url(download=download)
    if not is_http_url(url):
        raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

    download['download_time'] = sql_now()
    download['state'] = 'fetching'

    db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)

    return response
def get(self, url: str) -> Response:
    """GET an URL."""
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McGetException("URL is None.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        raise McGetException("URL is not HTTP(s): %s" % url)

    # Add HTTP authentication
    url = self.__url_with_http_auth(url=url)

    request = Request(method='GET', url=url)

    return self.request(request)
def get(self, url: str) -> Response:
    """GET an URL."""
    log.debug("mediawords.util.web.user_agent.get: %s" % url)
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McGetException("URL is None.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        raise McGetException("URL is not HTTP(s): %s" % url)

    # Add HTTP authentication
    url = self.__url_with_http_auth(url=url)

    request = Request(method='GET', url=url)

    return self.request(request)
def target_request_from_meta_refresh_url(content: str, archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from website with META refresh, return a request for the original URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    target_url = meta_refresh_url_from_html(html=content, base_url=archive_site_url)
    if target_url is None:
        return None

    if not is_http_url(target_url):
        log.error("URL matched, but is not HTTP(s): %s" % target_url)
        return None

    return Request(method='GET', url=target_url)
def target_request_from_archive_is_url(content: str, archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from archive.is, return a request for the original URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    if re.match(pattern=r'^https?://archive\.is/(.+?)$', string=archive_site_url, flags=re.IGNORECASE):
        canonical_link = link_canonical_url_from_html(html=content)
        if canonical_link is not None:
            matches = re.match(
                pattern=r'^https?://archive\.is/\d+?/(?P<target_url>https?://.+?)$',
                string=canonical_link,
                flags=re.IGNORECASE,
            )
            if matches:
                target_url = matches.group('target_url')
                if is_http_url(target_url):
                    return Request(method='GET', url=target_url)
                else:
                    log.error("URL matched, but is not HTTP(s): %s" % target_url)
            else:
                log.error("Unable to parse original URL from archive.is response '%s': %s" % (
                    archive_site_url, canonical_link,
                ))
        else:
            log.error("Unable to parse original URL from archive.is response '%s'" % archive_site_url)

    return None
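# Hedged, standalone illustration of the archive.is canonical-link pattern above (example URL only).
import re

canonical_link = 'https://archive.is/20190101123456/http://www.example.com/page.html'
matches = re.match(r'^https?://archive\.is/\d+?/(?P<target_url>https?://.+?)$', canonical_link, re.IGNORECASE)
print(matches.group('target_url'))  # http://www.example.com/page.html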
def target_request_from_archive_org_url(content: Union[str, None], archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from archive.org, return a request for the original URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    matches = re.match(
        pattern=r'^https?://web\.archive\.org/web/(?P<date>\d+?/)?(?P<target_url>https?://.+?)$',
        string=archive_site_url,
        flags=re.IGNORECASE,
    )
    if matches:
        target_url = matches.group('target_url')
        if is_http_url(target_url):
            return Request(method='GET', url=target_url)
        else:
            log.error("URL matched, but is not HTTP(s): %s" % target_url)

    return None
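# Hedged, standalone illustration of the Wayback Machine pattern above (example URL only); the
# optional "date" group absorbs the snapshot timestamp.
import re

url = 'https://web.archive.org/web/20150101000000/http://www.example.com/page.html'
matches = re.match(r'^https?://web\.archive\.org/web/(?P<date>\d+?/)?(?P<target_url>https?://.+?)$', url, re.IGNORECASE)
print(matches.group('target_url'))  # http://www.example.com/page.html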
def __get_url_domain(url_: str) -> str:
    if not is_http_url(url_):
        return url_

    host = get_url_host(url_)

    name_parts = host.split('.')
    n = len(name_parts) - 1

    # for country domains, use last three parts of name
    if re.search(pattern=r"\...$", string=host):
        domain = '.'.join([name_parts[n - 2], name_parts[n - 1], name_parts[0]])
    elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)", string=host):
        domain = url_
    else:
        domain = '.'.join([name_parts[n - 1], name_parts[n]])

    return domain.lower()
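# Hedged, standalone walk-through of the generic branch above for host 'www.example.com'
# (get_url_host() itself is not re-implemented here).
name_parts = 'www.example.com'.split('.')
n = len(name_parts) - 1
print('.'.join([name_parts[n - 1], name_parts[n]]).lower())  # example.com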
def sitemap(self) -> AbstractSitemap:
    story_urls = OrderedDict()

    for story_url in self._content.splitlines():
        story_url = story_url.strip()
        if not story_url:
            continue

        if is_http_url(story_url):
            story_urls[story_url] = True
        else:
            log.warning("Story URL {} doesn't look like an URL, skipping".format(story_url))

    pages = []
    for page_url in story_urls.keys():
        page = SitemapPage(url=page_url)
        pages.append(page)

    # noinspection PyArgumentList
    text_sitemap = PagesTextSitemap(url=self._url, pages=pages)

    return text_sitemap
def parallel_get(urls: List[str]) -> List[Response]:
    """GET multiple URLs in parallel."""

    # FIXME doesn't respect timing() and other object properties

    urls = decode_object_from_bytes_if_needed(urls)

    # Original implementation didn't raise on undefined / empty list of URLs
    if urls is None:
        return []
    if len(urls) == 0:
        return []

    # Remove duplicates from list while maintaining order because:
    # 1) We don't want to fetch the same URL twice
    # 2) URLs are being used as unique dictionary IDs later on
    urls_before_removing_duplicates = urls.copy()
    urls = list(OrderedDict.fromkeys(urls))
    if len(urls) != len(urls_before_removing_duplicates):
        log.warning("Some of the URLs are duplicate; URLs: %s" % str(urls_before_removing_duplicates))

    # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
    # get() in a fork should be able to come up with a reasonable Response object for it
    for url in urls:
        if not is_http_url(url):
            raise McParallelGetException("URL %s is not a valid URL; URLs: %s" % (url, str(urls),))

    config = py_get_config()

    if 'web_store_num_parallel' not in config['mediawords']:
        raise McParallelGetException('"web_store_num_parallel" is not set.')
    num_parallel = config['mediawords']['web_store_num_parallel']

    if 'web_store_timeout' not in config['mediawords']:
        raise McParallelGetException('"web_store_timeout" is not set.')
    timeout = config['mediawords']['web_store_timeout']

    if 'web_store_per_domain_timeout' not in config['mediawords']:
        raise McParallelGetException('"web_store_per_domain_timeout" is not set.')
    per_domain_timeout = config['mediawords']['web_store_per_domain_timeout']

    url_stack = UserAgent.__get_scheduled_urls(urls_=urls, per_domain_timeout_=per_domain_timeout)

    start_time = time.time()

    url_blocks = {}
    while len(url_stack) > 0:
        block_i = len(url_stack) % num_parallel

        if block_i not in url_blocks:
            url_blocks[block_i] = []

        url_blocks[block_i].append(url_stack.pop())

    pool = multiprocessing.Pool(processes=num_parallel)

    all_results = []
    for i, url_block in url_blocks.items():
        result = pool.apply_async(_parallel_get_web_store, args=(url_block, start_time, timeout,))
        all_results.append(result)

    all_responses = []
    for result in all_results:
        responses = result.get()
        all_responses = all_responses + responses

    # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
    pool.close()
    pool.join()
    pool.terminate()

    # Sort URLs in parameter order
    # (if URLs weren't split into blocks, we could probably use map_async)
    response_url_map = {}
    for response in all_responses:
        url = response.scheduled_url.url
        response_url_map[url] = response.response

    sorted_responses = []
    for url in urls:
        if url not in response_url_map:
            raise McParallelGetException("URL %s is not in the response URL map %s." % (url, response_url_map,))
        sorted_responses.append(response_url_map[url])

    if len(urls) != len(sorted_responses):
        raise McParallelGetException(
            "Response count doesn't match URL count; responses: %s; URLs: %s" % (sorted_responses, urls,)
        )

    return sorted_responses
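# Hedged, standalone illustration of the round-robin blocking above: URLs are grouped by
# `len(url_stack) % num_parallel` as they are popped off the stack (example URLs only).
url_stack = ['u1', 'u2', 'u3', 'u4', 'u5']
num_parallel = 2
url_blocks = {}
while len(url_stack) > 0:
    block_i = len(url_stack) % num_parallel
    url_blocks.setdefault(block_i, []).append(url_stack.pop())
print(url_blocks)  # {1: ['u5', 'u3', 'u1'], 0: ['u4', 'u2']}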
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links
    """
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {
        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(html=data_after_redirects,
                                                              base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(url_link_rel_canonical) > 0:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": url_link_rel_canonical,
                }
            )

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {key: urls[key] for key in urls.keys() if not is_homepage_url(urls[key])}

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [x for x in distinct_urls if not re.search(pattern=invalid_url_variant_regex, string=x)]

    return distinct_urls
def test_is_http_url():
    # noinspection PyTypeChecker
    assert not mc_url.is_http_url(None)
    assert not mc_url.is_http_url('')

    assert not mc_url.is_http_url('abc')

    assert not mc_url.is_http_url('/abc')
    assert not mc_url.is_http_url('//abc')
    assert not mc_url.is_http_url('///abc')

    assert not mc_url.is_http_url('gopher://gopher.floodgap.com/0/v2/vstat')
    assert not mc_url.is_http_url('ftp://ftp.freebsd.org/pub/FreeBSD/')

    assert mc_url.is_http_url('http://cyber.law.harvard.edu/about')
    assert mc_url.is_http_url('https://github.com/berkmancenter/mediacloud')

    funky_url = ('http://Las%20Vegas%20mass%20shooting%20raises'
                 '%20new%20doubts%20about%20safety%20of%20live%20entertainment')
    assert mc_url.is_http_url(funky_url) is False

    # URLs with port, HTTP auth, localhost
    assert mc_url.is_http_url('https://*****:*****@domain.com:12345/path?query=string#fragment')
    assert mc_url.is_http_url('http://*****:*****@') is False
def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> Optional[StoryEnclosure]:
    """Fetch all enclosures, find and return the one that looks like a podcast episode the most (or None)."""
    story_enclosures_dicts = db.query("""
        SELECT *
        FROM story_enclosures
        WHERE stories_id = %(stories_id)s

        -- Returning by insertion order so the enclosures listed earlier will have a better chance of being considered
        -- episodes
        ORDER BY story_enclosures_id
    """, {
        'stories_id': stories_id,
    }).hashes()

    if not story_enclosures_dicts:
        log.warning(f"Story {stories_id} has no enclosures to choose from.")
        return None

    story_enclosures = []

    for enclosure_dict in story_enclosures_dicts:
        if is_http_url(enclosure_dict['url']):
            story_enclosures.append(StoryEnclosure.from_db_row(db_row=enclosure_dict))

    chosen_enclosure = None

    # Look for MP3 files in MIME type
    for enclosure in story_enclosures:
        if enclosure.mime_type_is_mp3():
            log.info(f"Choosing enclosure '{enclosure}' by its MP3 MIME type '{enclosure.mime_type}'")
            chosen_enclosure = enclosure
            break

    # If that didn't work, look into URL's path
    if not chosen_enclosure:
        for enclosure in story_enclosures:
            if enclosure.url_path_has_mp3_extension():
                log.info(f"Choosing enclosure '{enclosure}' by its URL '{enclosure.url}'")
                chosen_enclosure = enclosure
                break

    # If there are no MP3s in sight, try to find any kind of audio enclosure because it's a smaller download than video
    # and faster to transcode
    if not chosen_enclosure:
        for enclosure in story_enclosures:
            if enclosure.mime_type_is_audio():
                log.info(f"Choosing enclosure '{enclosure}' by its audio MIME type '{enclosure.mime_type}'")
                chosen_enclosure = enclosure
                break

    # In case there are no audio enclosures, look for videos then
    if not chosen_enclosure:
        for enclosure in story_enclosures:
            if enclosure.mime_type_is_video():
                log.info(f"Choosing enclosure '{enclosure}' by its video MIME type '{enclosure.mime_type}'")
                chosen_enclosure = enclosure
                break

    # Return either the best option that we've found so far, or None if there were no (explicitly declared)
    # audio / video enclosures
    return chosen_enclosure
def _api_request_url_with_signature(cls,
                                    api_url: str,
                                    client_id: str,
                                    client_secret: str,
                                    http_method: str = 'GET') -> str:
    """Return API URL with request signature appended."""

    api_url = decode_object_from_bytes_if_needed(api_url)
    client_id = decode_object_from_bytes_if_needed(client_id)
    client_secret = decode_object_from_bytes_if_needed(client_secret)
    http_method = decode_object_from_bytes_if_needed(http_method)

    if not (api_url and client_id and client_secret):
        raise McCrawlerFetcherHardError("One or more required parameters are unset.")

    if not is_http_url(api_url):
        raise McCrawlerFetcherHardError(f"API URL '{api_url}' is not a HTTP(S) URL")

    if not http_method:
        http_method = 'GET'

    http_method = http_method.upper()

    uri = furl(api_url)

    if uri.args.get('client_id', None):
        raise McCrawlerFetcherHardError("Query already contains 'client_id'.")

    uri.args.add('client_id', client_id)

    if not str(uri.path):
        # Set to slash if it's unset
        uri.path.segments = ['']

    # Sort query params as per API spec
    sorted_args = []
    for key in sorted(uri.args.keys()):
        values = uri.args.getlist(key)
        for value in sorted(values):
            sorted_args.append({key: value})

    uri.args.clear()
    for sorted_arg in sorted_args:
        key, value = sorted_arg.popitem()
        uri.args.add(key, value)

    log.debug(f"Sorted query params: {uri.args}")
    log.debug(f"URI: {str(uri)}")

    api_url_path = str(uri.path)
    api_url_query = str(uri.query)

    unhashed_secret_key = f"{http_method}{client_id}{api_url_path}?{api_url_query}{client_secret}"
    log.debug(f"Unhashed secret key: {unhashed_secret_key}")

    signature = hashlib.sha1(unhashed_secret_key.encode('utf-8')).hexdigest()
    log.debug(f"Signature (hashed secret key): {signature}")

    uri.args.add('signature', signature)
    log.debug(f"API request URL: {str(uri)}")

    return str(uri)
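# Hedged, standalone sketch of the signature step above; the client ID / secret and the path and
# query values are illustrative only.
import hashlib

http_method, client_id, client_secret = 'GET', 'CLIENT_ID', 'CLIENT_SECRET'
api_url_path, api_url_query = '/v1/items', 'client_id=CLIENT_ID&limit=10'

unhashed_secret_key = f"{http_method}{client_id}{api_url_path}?{api_url_query}{client_secret}"
signature = hashlib.sha1(unhashed_secret_key.encode('utf-8')).hexdigest()
print(signature)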
def get_follow_http_html_redirects(self, url: str) -> Response:
    """GET an URL while resolving HTTP / HTML redirects."""

    def __inner_follow_redirects(response_: Response, meta_redirects_left: int) -> Union[Response, None]:

        from mediawords.util.web.user_agent.html_redirects import (
            target_request_from_meta_refresh_url,
            target_request_from_archive_org_url,
            target_request_from_archive_is_url,
            target_request_from_linkis_com_url,
            target_request_from_alarabiya_url,
        )

        if response_ is None:
            raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

        if response_.is_success():

            base_url = get_base_url(response_.request().url())

            html_redirect_functions = [
                target_request_from_meta_refresh_url,
                target_request_from_archive_org_url,
                target_request_from_archive_is_url,
                target_request_from_linkis_com_url,
                target_request_from_alarabiya_url,
            ]
            for html_redirect_function in html_redirect_functions:
                request_after_meta_redirect = html_redirect_function(
                    content=response_.decoded_content(),
                    archive_site_url=base_url,
                )
                if request_after_meta_redirect is not None:
                    if not urls_are_equal(url1=response_.request().url(), url2=request_after_meta_redirect.url()):

                        log.debug("URL after HTML redirects: %s" % request_after_meta_redirect.url())

                        orig_redirect_response = self.request(request=request_after_meta_redirect)
                        redirect_response = orig_redirect_response

                        # Response might have its previous() already set due to HTTP redirects,
                        # so we have to find the initial response first
                        previous = None
                        for x in range(self.max_redirect() + 1):
                            previous = redirect_response.previous()
                            if previous is None:
                                break
                            redirect_response = previous

                        if previous is not None:
                            raise McGetFollowHTTPHTMLRedirectsException(
                                "Can't find the initial redirected response; URL: %s" %
                                request_after_meta_redirect.url()
                            )

                        log.debug("Setting previous of URL %(url)s to %(previous_url)s" % {
                            'url': redirect_response.request().url(),
                            'previous_url': response_.request().url(),
                        })
                        redirect_response.set_previous(response_)

                        meta_redirects_left = meta_redirects_left - 1

                        return __inner(
                            response_=orig_redirect_response,
                            meta_redirects_left=meta_redirects_left,
                        )

            # No <meta /> refresh, the current URL is the final one
            return response_

        else:
            log.debug("Request to %s was unsuccessful: %s" % (
                response_.request().url(),
                response_.status_line(),
            ))

            # Return the original URL and give up
            return None

    def __inner_redirects_exhausted(response_: Response) -> Union[Response, None]:

        if response_ is None:
            raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

        # If one of the URLs that we've been redirected to contains another encoded URL, assume
        # that we're hitting a paywall and the URLencoded URL is the right one
        urls_redirected_to = []

        for x in range(self.max_redirect() + 1):

            previous = response_.previous()
            if previous is None:
                break

            url_redirected_to = previous.request().url()
            encoded_url_redirected_to = quote(url_redirected_to)

            for redir_url in urls_redirected_to:
                if re.search(pattern=re.escape(encoded_url_redirected_to),
                             string=redir_url,
                             flags=re.IGNORECASE | re.UNICODE):
                    log.debug(
                        """
                        Encoded URL %(encoded_url_redirected_to)s is a substring of another URL %(matched_url)s,
                        so I'll assume that %(url_redirected_to)s is the correct one.
                        """ % {
                            'encoded_url_redirected_to': encoded_url_redirected_to,
                            'matched_url': redir_url,
                            'url_redirected_to': url_redirected_to,
                        }
                    )
                    return previous

            urls_redirected_to.append(url_redirected_to)

        # Return the original URL (unless we find a URL being a substring of another URL, see above)
        return None

    def __inner(response_: Response, meta_redirects_left: int) -> Union[Response, None]:

        if response_ is None:
            raise McGetFollowHTTPHTMLRedirectsException("Response is None.")

        if meta_redirects_left > 0:
            return __inner_follow_redirects(
                response_=response_,
                meta_redirects_left=meta_redirects_left,
            )
        else:
            return __inner_redirects_exhausted(response_=response_)

    # ---

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McGetFollowHTTPHTMLRedirectsException("URL is None.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        raise McGetFollowHTTPHTMLRedirectsException("URL is not HTTP(s): %s" % url)

    if self.max_redirect() == 0:
        raise McGetFollowHTTPHTMLRedirectsException(
            "User agent's max_redirect is 0, subroutine might loop indefinitely."
        )

    response = self.get(url)

    response_after_redirects = __inner(
        response_=response,
        meta_redirects_left=self.max_redirect()
    )

    if response_after_redirects is None:
        # One of the redirects failed -- return original response
        return response
    else:
        return response_after_redirects
def parallel_get(urls: List[str]) -> List[Response]:
    """GET multiple URLs in parallel."""

    # FIXME doesn't respect timing() and other object properties

    def __get_url_domain(url_: str) -> str:

        if not is_http_url(url_):
            return url_

        host = get_url_host(url_)

        name_parts = host.split('.')
        n = len(name_parts) - 1

        # for country domains, use last three parts of name
        if re.search(pattern=r"\...$", string=host):
            domain = '.'.join([name_parts[n - 2], name_parts[n - 1], name_parts[0]])
        elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)", string=host):
            domain = url_
        else:
            domain = '.'.join([name_parts[n - 1], name_parts[n]])

        return domain.lower()

    def __get_scheduled_urls(urls_: List[str], per_domain_timeout_: int) -> List[_ParallelGetScheduledURL]:
        """Schedule the URLs by adding a { time => $time } field to each URL to make sure we obey the
        'per_domain_timeout'. Sort requests by ascending time."""
        domain_urls = {}

        for url_ in urls_:
            domain = __get_url_domain(url_=url_)
            if domain not in domain_urls:
                domain_urls[domain] = []
            domain_urls[domain].append(url_)

        scheduled_urls = []

        for domain, urls_in_domain in domain_urls.items():
            time_ = 0
            for domain_url in urls_in_domain:
                domain_url = _ParallelGetScheduledURL(url=domain_url, time_=time_)
                scheduled_urls.append(domain_url)

                if time_ % 5 == 0:  # FIXME why 5?
                    time_ = time_ + per_domain_timeout_

        scheduled_urls = sorted(scheduled_urls, key=lambda x: x.time)

        return scheduled_urls

    # ---

    urls = decode_object_from_bytes_if_needed(urls)

    # Original implementation didn't raise on undefined / empty list of URLs
    if urls is None:
        return []
    if len(urls) == 0:
        return []

    # Remove duplicates from list while maintaining order because:
    # 1) We don't want to fetch the same URL twice
    # 2) URLs are being used as unique dictionary IDs later on
    urls_before_removing_duplicates = urls.copy()
    urls = list(OrderedDict.fromkeys(urls))
    if len(urls) != len(urls_before_removing_duplicates):
        log.warning("Some of the URLs are duplicate; URLs: %s" % str(urls_before_removing_duplicates))

    # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
    # get() in a fork should be able to come up with a reasonable Response object for it
    for url in urls:
        if not is_http_url(url):
            raise McParallelGetException("URL %s is not a valid URL; URLs: %s" % (url, str(urls),))

    config = py_get_config()

    if 'web_store_num_parallel' not in config['mediawords']:
        raise McParallelGetException('"web_store_num_parallel" is not set.')
    num_parallel = config['mediawords']['web_store_num_parallel']

    if 'web_store_timeout' not in config['mediawords']:
        raise McParallelGetException('"web_store_timeout" is not set.')
    timeout = config['mediawords']['web_store_timeout']

    if 'web_store_per_domain_timeout' not in config['mediawords']:
        raise McParallelGetException('"web_store_per_domain_timeout" is not set.')
    per_domain_timeout = config['mediawords']['web_store_per_domain_timeout']

    url_stack = __get_scheduled_urls(urls_=urls, per_domain_timeout_=per_domain_timeout)

    start_time = time.time()

    url_blocks = {}
    while len(url_stack) > 0:
        block_i = len(url_stack) % num_parallel

        if block_i not in url_blocks:
            url_blocks[block_i] = []

        url_blocks[block_i].append(url_stack.pop())

    pool = multiprocessing.Pool(processes=num_parallel)

    all_results = []
    for i, url_block in url_blocks.items():
        result = pool.apply_async(_parallel_get_web_store, args=(url_block, start_time, timeout,))
        all_results.append(result)

    all_responses = []
    for result in all_results:
        responses = result.get()
        all_responses = all_responses + responses

    # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
    pool.close()
    pool.join()
    pool.terminate()

    # Sort URLs in parameter order
    # (if URLs weren't split into blocks, we could probably use map_async)
    response_url_map = {}
    for response in all_responses:
        url = response.scheduled_url.url
        response_url_map[url] = response.response

    sorted_responses = []
    for url in urls:
        if url not in response_url_map:
            raise McParallelGetException("URL %s is not in the response URL map %s." % (url, response_url_map,))
        sorted_responses.append(response_url_map[url])

    if len(urls) != len(sorted_responses):
        raise McParallelGetException(
            "Response count doesn't match URL count; responses: %s; URLs: %s" % (sorted_responses, urls,)
        )

    return sorted_responses
def target_request_from_linkis_com_url(content: str, archive_site_url: str) -> Union[Request, None]:
    """Given the content of a linkis.com web page, find the original URL in the content, which may be in one of
    several places in the DOM, and return a request for said URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    if not re.match(pattern='^https?://[^/]*linkis.com/', string=archive_site_url, flags=re.IGNORECASE):
        return None

    # list of dom search patterns to find nodes with a url and the
    # attributes to use from those nodes as the url.
    #
    # for instance the first item matches:
    #
    #     <meta property="og:url" content="http://foo.bar">
    #
    try:
        html_parser = etree.HTMLParser()
        html_tree = etree.parse(StringIO(content), html_parser)

        dom_maps = [
            ('//meta[@property="og:url"]', 'content'),
            ('//a[@class="js-youtube-ln-event"]', 'href'),
            ('//iframe[@id="source_site"]', 'src'),
        ]

        for xpath, url_attribute in dom_maps:
            nodes = html_tree.xpath(xpath)

            if len(nodes) > 0:
                first_node = nodes[0]
                matched_url = first_node.get(url_attribute)
                if matched_url is not None:
                    if not re.match(pattern='^https?://linkis.com', string=matched_url, flags=re.IGNORECASE):

                        if is_http_url(matched_url):
                            return Request(method='GET', url=matched_url)
                        else:
                            log.error("URL matched, but is not HTTP(s): %s" % matched_url)

    except Exception as ex:
        log.warning("Unable to parse HTML for URL %s: %s" % (archive_site_url, str(ex),))

    # As a last resort, look for the longUrl key in a JavaScript array
    matches = re.search(pattern=r'"longUrl":\s*"(?P<target_url>[^"]+)"', string=content, flags=re.IGNORECASE)
    if matches:
        target_url = matches.group('target_url')

        # kludge to de-escape \'d characters in javascript -- 99% of urls
        # are captured by the dom stuff above, we shouldn't get to this
        # point often
        target_url = target_url.replace('\\', '')

        if not re.match(pattern='^https?://linkis.com', string=target_url, flags=re.IGNORECASE):
            if is_http_url(target_url):
                return Request(method='GET', url=target_url)
            else:
                log.error("URL matched, but is not HTTP(s): %s" % target_url)

    log.warning("No URL found for linkis URL: %s" % archive_site_url)

    return None
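# Hedged, standalone illustration of the "longUrl" fallback above; the JSON snippet is made up and
# the backslash-stripping kludge mirrors the function's own behavior.
import re

content = '{"longUrl":"http:\\/\\/www.example.com\\/story"}'
matches = re.search(r'"longUrl":\s*"(?P<target_url>[^"]+)"', content, re.IGNORECASE)
print(matches.group('target_url').replace('\\', ''))  # http://www.example.com/story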