def test_fix_common_url_mistakes():
    """Verify that fix_common_url_mistakes() repairs malformed URLs and is idempotent."""
    test_cases = [
        # "http://http://"
        ('http://http://www.al-monitor.com/pulse',
         'http://www.al-monitor.com/pulse'),

        # With only one slash ("http:/www.")
        ('http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled',
         'http://www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled'),

        # missing / before ?
        ('http://foo.bar?baz=bat',
         'http://foo.bar/?baz=bat'),

        # Whitespace
        (' http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html ',
         'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html'),

        # Missing port
        ('https://www.gpo.gov:/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf',
         'https://www.gpo.gov/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf'),

        # Non-URLencoded space
        ('http://www.ldeo.columbia.edu/~peter/ site/Home.html',
         'http://www.ldeo.columbia.edu/~peter/%20site/Home.html'),
    ]

    for orig_url, fixed_url in test_cases:
        # Fixing once must produce the expected URL
        fixed_once = mc_url.fix_common_url_mistakes(orig_url)
        assert mc_url.urls_are_equal(url1=fixed_once, url2=fixed_url)

        # Fixing a second time must not damage an already-fixed URL (idempotence)
        fixed_twice = mc_url.fix_common_url_mistakes(fixed_once)
        assert mc_url.urls_are_equal(url1=fixed_twice, url2=fixed_url)
def test_fix_common_url_mistakes():
    """Check that fix_common_url_mistakes() repairs broken URLs and that re-running it is harmless."""
    urls = {
        # "http://http://"
        'http://http://www.al-monitor.com/pulse':
            'http://www.al-monitor.com/pulse',

        # With only one slash ("http:/www.")
        'http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled':
            'http://www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled',

        # missing / before ?
        'http://foo.bar?baz=bat':
            'http://foo.bar/?baz=bat',

        # Whitespace
        ' http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html ':
            'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html',

        # Missing port
        'https://www.gpo.gov:/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf':
            'https://www.gpo.gov/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf'
    }

    for orig_url, expected_url in urls.items():
        # A single pass should fix the URL
        result = mc_url.fix_common_url_mistakes(orig_url)
        assert mc_url.urls_are_equal(url1=result, url2=expected_url)

        # A second pass over an already-fixed URL should change nothing
        result_again = mc_url.fix_common_url_mistakes(result)
        assert mc_url.urls_are_equal(url1=result_again, url2=expected_url)
def get_follow_http_html_redirects(self, url: str) -> Response:
    """GET an URL while resolving HTTP / HTML redirects."""
    url = decode_object_from_bytes_if_needed(url)

    # Guard clauses: validate the URL before doing any network work
    if url is None:
        raise McGetFollowHTTPHTMLRedirectsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        raise McGetFollowHTTPHTMLRedirectsException("URL is not HTTP(s): %s" % url)

    # A redirect budget of zero could make the recursive helper loop forever
    if self.max_redirect() == 0:
        raise McGetFollowHTTPHTMLRedirectsException(
            "User agent's max_redirect is 0, subroutine might loop indefinitely."
        )

    initial_response = self.get(url)

    final_response = self.__get_follow_http_html_redirects(
        response_=initial_response,
        meta_redirects_left=self.max_redirect(),
    )

    # If any redirect in the chain failed, fall back to the original response
    return initial_response if final_response is None else final_response
def get_follow_http_html_redirects(self, url: str) -> Response:
    """GET an URL while resolving HTTP / HTML redirects."""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        raise McGetFollowHTTPHTMLRedirectsException("URL is None.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        raise McGetFollowHTTPHTMLRedirectsException("URL is not HTTP(s): %s" % url)

    # Refuse to start if no redirects are allowed -- the helper could loop forever
    if self.max_redirect() == 0:
        raise McGetFollowHTTPHTMLRedirectsException(
            "User agent's max_redirect is 0, subroutine might loop indefinitely."
        )

    response = self.get(url)

    resolved = self.__get_follow_http_html_redirects(
        response_=response,
        meta_redirects_left=self.max_redirect(),
    )

    if resolved is not None:
        return resolved

    # One of the redirects failed -- return the original response instead
    return response
def _country_tld_from_url(url: str) -> Optional[str]:
    """
    Extract country TLD from URL; if URL looks weird, don't sweat about it.

    :param url: URL, e.g. "https://www.bbc.co.uk/news/politics/eu-regions/vote2014_sitemap.xml".
    :return: Country TLD of URL without the prefix period, e.g. "uk", or None if there's no TLD.
    """
    if not url:
        return None

    url = fix_common_url_mistakes(url)

    try:
        url = canonical_url(url)
    except Exception as ex:
        log.error(f"Unable to get canonical URL from URL {url}: {ex}")
        return None

    try:
        parsed_url = urlparse(url)
    except Exception as ex:
        log.warning(f"Unable to parse URL {url}: {ex}")
        return None

    # BUG FIX: urlparse() returns hostname=None for URLs with no network
    # location (e.g. "mailto:" links), which used to crash with AttributeError
    # on .split() below; treat that as "no TLD" instead
    if not parsed_url.hostname:
        log.warning(f"No hostname found in URL {url}")
        return None

    hostname_parts = parsed_url.hostname.split('.')
    if len(hostname_parts) < 2:
        log.warning(f"No TLD found in URL {url}")
        return None

    # Last hostname label is the (country) TLD, lowercased for consistency
    return hostname_parts[-1].lower()
def __init__(self, url: str, recursion_level: int, ua: Optional[UserAgent] = None):
    """Validate and normalize the sitemap URL, then store fetcher state."""
    # Refuse to recurse past the hard depth limit
    if recursion_level > self.__MAX_RECURSION_LEVEL:
        raise McSitemapsException(
            "Recursion level exceeded {} for URL {}.".format(self.__MAX_RECURSION_LEVEL, url)
        )

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(url))

    try:
        url = normalize_url(url)
    except Exception as ex:
        raise McSitemapsException("Unable to normalize URL {}: {}".format(url, ex))

    self._url = url
    # Fall back to the default sitemap user agent when none was supplied
    self._ua = ua if ua else sitemap_useragent()
    self._recursion_level = recursion_level
def set_url(self, url: str) -> None:
    """Set URL, e.g. https://www.mediacloud.org/page.html"""
    decoded_url = decode_object_from_bytes_if_needed(url)

    if decoded_url is None:
        raise McUserAgentRequestException("URL is None.")
    if not len(decoded_url):
        raise McUserAgentRequestException("URL is empty.")

    # Might be coming from "requests" which managed to fetch a bogus URL but we deem it to be invalid
    decoded_url = fix_common_url_mistakes(decoded_url)

    if not is_http_url(decoded_url):
        raise McUserAgentRequestException("URL is not HTTP(s): %s" % str(decoded_url))

    self.__url = decoded_url
def get(self, url: str) -> Response:
    """GET an URL."""
    decoded_url = decode_object_from_bytes_if_needed(url)
    if decoded_url is None:
        raise McGetException("URL is None.")

    fixed_url = fix_common_url_mistakes(decoded_url)
    if not is_http_url(fixed_url):
        raise McGetException("URL is not HTTP(s): %s" % fixed_url)

    # Embed HTTP authentication credentials (if any) into the URL
    authenticated_url = self.__url_with_http_auth(url=fixed_url)

    return self.request(Request(method='GET', url=authenticated_url))
def get(self, url: str) -> Response:
    """GET an URL."""
    log.debug("mediawords.util.web.user_agent.get: %s" % url)

    target_url = decode_object_from_bytes_if_needed(url)
    if target_url is None:
        raise McGetException("URL is None.")

    target_url = fix_common_url_mistakes(target_url)
    if not is_http_url(target_url):
        raise McGetException("URL is not HTTP(s): %s" % target_url)

    # Add HTTP authentication (if credentials are configured for this URL)
    target_url = self.__url_with_http_auth(url=target_url)

    get_request = Request(method='GET', url=target_url)
    return self.request(get_request)
def __init__(self, url: str, recursion_level: int, ua: Optional[UserAgent] = None):
    """Store a validated, normalized sitemap URL together with fetch state."""
    if recursion_level > self.__MAX_RECURSION_LEVEL:
        # Hard limit on sitemap-index recursion depth
        raise McSitemapsException(
            "Recursion level exceeded {} for URL {}.".format(self.__MAX_RECURSION_LEVEL, url)
        )

    fixed_url = fix_common_url_mistakes(url)
    if not is_http_url(fixed_url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(fixed_url))

    try:
        normalized_url = normalize_url(fixed_url)
    except Exception as ex:
        raise McSitemapsException("Unable to normalize URL {}: {}".format(fixed_url, ex))

    self._url = normalized_url
    self._ua = ua if ua else sitemap_useragent()
    self._recursion_level = recursion_level
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)

    # Non-HTTP URLs have no computable variants
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [url]

    # Resolve HTTP / HTML redirects to find where the URL actually leads
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {
        'normal': url,                                                # as passed in
        'after_redirects': url_after_redirects,                       # post-redirect URL
        'normalized': normalize_url(url),                             # canonicalized original
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present in the fetched page, add it too
    if data_after_redirects is not None:
        canonical = link_canonical_url_from_html(
            html=data_after_redirects,
            base_url=url_after_redirects,
        )
        if canonical:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": canonical,
                }
            )
            urls['after_redirects_canonical'] = canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {name: u for name, u in urls.items() if not is_homepage_url(u)}

    distinct_urls = list(set(urls.values()))

    # Merge in any alternative URLs recorded in the topic tables
    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)
    distinct_urls = list(set(distinct_urls + topic_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [
            u for u in distinct_urls
            if not re.search(pattern=invalid_url_variant_regex, string=u)
        ]

    return distinct_urls
def _get_url_stats(url: str, config: Optional[FacebookConfig] = None) -> FacebookURLStats:
    """
    Get Facebook statistics for an URL.

    Return URL stats on success, throw an exception on failure.

    :param url: URL to fetch the stats for.
    :param config: (optional) Facebook configuration object.
    :return FacebookURLStats object, or None if stats for this URL couldn't be fetched.
    """
    url = decode_object_from_bytes_if_needed(url)

    if not url:
        # Treat unset URLs as a soft failure
        raise McFacebookInvalidURLException(url=url, error_message="URL is not set.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        # BUG FIX: this log message used to be just f": {url}" (the actual text was missing)
        log.error(f"URL is not HTTP(s): {url}")
        raise McFacebookInvalidURLException(url=url, error_message="URL is not HTTP(s).")

    try:
        url = canonical_url(url)
    except Exception as ex:
        raise McFacebookInvalidURLException(url=url, error_message=f"Unable to canonicalize URL: {ex}")

    # Some URL shapes are known to fail against the Graph API; reject them upfront
    for pattern in __URL_PATTERNS_WHICH_WONT_WORK:
        if re.search(pattern, url):
            raise McFacebookInvalidURLException(
                url=url,
                error_message="URL matches one of the patterns for URLs that won't work against Facebook API.",
            )

    if not config:
        config = FacebookConfig()

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    # Make API request (https://developers.facebook.com/docs/graph-api/reference/v5.0/url)
    try:
        data = _api_request(
            node='',
            params={
                'id': url,
                'fields': 'engagement',
            },
            config=config,
        )
    except McFacebookException as ex:
        # Pass the known exception back to the caller for them to deal with
        log.error(f"Unable to fetch stats for URL '{url}': {ex}")
        raise ex
    except Exception as ex:
        # If an unknown exception was raised while making an API call, consider it a fatal error
        raise McFacebookErrorAPIResponseException(
            f"Unknown error happened while fetching stats for URL '{url}': {ex}"
        )

    if 'error' in data:
        log.error(f"Facebook API responded with error while fetching stats for URL '{url}': {data}")

        error = data['error']
        error_type = error.get('type', 'unknown type')
        error_message = error.get('message', 'unknown message')

        if error_type == 'GraphMethodException' and 'Unsupported get request' in error_message:
            # Non-fatal permissions error for this specific URL
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        elif error_type == 'OAuthException' and error_message == 'An unknown error has occurred.':
            # some urls consistently return this error. true permissions errors don't return 'unknown error' message.
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        elif error_type == 'OAuthException' and 'facebook.com' in error_message:
            # facebook urls require permissions we don't have
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        else:
            # Everything else is considered a fatal error by us as we don't know what exactly happened
            raise McFacebookErrorAPIResponseException(
                f"Error response while fetching stats for URL '{url}': {error_type} {error_message}"
            )

    response_url = data.get('id', None)
    if response_url is None:
        # Facebook API is expected to always return URL that we got the stats for
        raise McFacebookUnexpectedAPIResponseException(
            response=data,
            error_message="Response doesn't have 'id' key",
        )

    response_url = str(response_url)

    # Facebook API returns a numeric ID for a URL that's a Facebook page
    if not response_url.isdigit():
        # Verify that we got stats for the right URL
        # FIXME for whatever reason 'url' does get un-canonicalized at this point
        if response_url != url and canonical_url(response_url) != canonical_url(url):
            raise McFacebookUnexpectedAPIResponseException(
                response=data,
                error_message=f"Response URL ({response_url}) is not the same as request URL ({url})",
            )

    engagement = data.get('engagement', None)
    if engagement is None:
        # We expect 'engagement' to be at least set to an empty dict
        raise McFacebookUnexpectedAPIResponseException(
            response=data,
            error_message="Response doesn't have 'engagement' key",
        )

    # While 'engagement' is expected to always be set, all URL stats are not required to be present
    # because Facebook might not have ever seen this URL before
    stats = FacebookURLStats(
        share_count=engagement.get('share_count', None),
        comment_count=engagement.get('comment_count', None),
        reaction_count=engagement.get('reaction_count', None),
    )

    # If none of the stats are set, treat it as an invalid URL rather than returning empty stats
    if stats.share_count is None and stats.comment_count is None and stats.reaction_count is None:
        raise McFacebookInvalidURLException(url=url, error_message="No statistics were returned for URL.")

    log.debug(f"Facebook statistics for URL '{url}': {stats}")

    return stats
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""
    url = decode_object_from_bytes_if_needed(url)
    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        # Nothing else we can derive from a non-HTTP URL
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Follow HTTP / HTML redirects first
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    variants = {
        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(
            html=data_after_redirects,
            base_url=url_after_redirects,
        )
        if url_link_rel_canonical is not None and len(url_link_rel_canonical) > 0:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": url_link_rel_canonical,
                }
            )
            variants['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        non_homepage_variants = {}
        for variant_name, variant_url in variants.items():
            if not is_homepage_url(variant_url):
                non_homepage_variants[variant_name] = variant_url
        variants = non_homepage_variants

    distinct_urls = list(set(variants.values()))

    # Add alternative URLs recorded in topic_merged_stories / topic_links
    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)
    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [
            variant_url for variant_url in distinct_urls
            if not re.search(pattern=invalid_url_variant_regex, string=variant_url)
        ]

    return distinct_urls