def _country_tld_from_url(url: str) -> Optional[str]: """ Extract country TLD from URL; it's URL looks weird, don't sweat about it. :param url: URL, e.g. "https://www.bbc.co.uk/news/politics/eu-regions/vote2014_sitemap.xml". :return: Country TLD of URL without the prefix period, e.g. "uk", or None if there's no TLD. """ if not url: return None url = fix_common_url_mistakes(url) try: url = canonical_url(url) except Exception as ex: log.error(f"Unable to get canonical URL from URL {url}: {ex}") return None try: parsed_url = urlparse(url) except Exception as ex: log.warning(f"Unable to parse URL {url}: {ex}") return None hostname_parts = parsed_url.hostname.split('.') if len(hostname_parts) < 2: log.warning(f"No TLD found in URL {url}") return None return hostname_parts[-1].lower()
def test_canonical_url():
    """Check that canonical_url() rejects bad input and normalizes a valid URL."""

    # Bad input: both unset and empty URLs must be rejected
    for bad_input in (None, ''):
        with pytest.raises(mc_url.McCanonicalURLException):
            # noinspection PyTypeChecker
            mc_url.canonical_url(bad_input)

    # Invalid URL: a sentence that was mistakenly used as a hostname
    headline_as_url = (
        'http://Las%20Vegas%20mass%20shooting%20raises%20new%20'
        'doubts%20about%20safety%20of%20live%20entertainment'
    )
    with pytest.raises(mc_url.McCanonicalURLException):
        mc_url.canonical_url(headline_as_url)

    # No urls_are_equal() because we want to compare them as strings here
    expected_url = 'http://cyber.law.harvard.edu/node/9244'
    actual_url = mc_url.canonical_url('HTTP://CYBER.LAW.HARVARD.EDU:80/node/9244')
    assert actual_url == expected_url
def _get_url_stats(url: str, config: Optional[FacebookConfig] = None) -> FacebookURLStats:
    """
    Get Facebook statistics for a URL.

    Returns URL stats on success and raises an exception on every failure; this function never
    returns None.

    :param url: URL to fetch the stats for.
    :param config: (optional) Facebook configuration object; a default FacebookConfig() is created if unset.
    :return: FacebookURLStats object.
    :raise McFacebookInvalidURLException: URL is unset, not HTTP(s), can't be canonicalized, matches
        one of the patterns known not to work, the API responded with a URL-specific error, or no
        statistics were returned for the URL.
    :raise McFacebookInvalidConfigurationException: Facebook API is not enabled in the configuration.
    :raise McFacebookErrorAPIResponseException: API request failed with an unknown exception, or the
        API responded with an error that isn't recognized as URL-specific.
    :raise McFacebookUnexpectedAPIResponseException: API response is missing the 'id' or
        'engagement' key, or the response URL doesn't match the request URL.
    """

    url = decode_object_from_bytes_if_needed(url)

    if not url:
        # Treat unset URLs as a soft failure
        raise McFacebookInvalidURLException(url=url, error_message="URL is not set.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        log.error(f"URL is not HTTP(s): {url}")
        raise McFacebookInvalidURLException(url=url, error_message="URL is not HTTP(s).")

    try:
        url = canonical_url(url)
    except Exception as ex:
        # Chain the original exception so that its traceback isn't lost
        raise McFacebookInvalidURLException(
            url=url,
            error_message=f"Unable to canonicalize URL: {ex}",
        ) from ex

    for pattern in __URL_PATTERNS_WHICH_WONT_WORK:
        if re.search(pattern, url):
            raise McFacebookInvalidURLException(
                url=url,
                error_message="URL matches one of the patterns for URLs that won't work against Facebook API.",
            )

    if not config:
        config = FacebookConfig()

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    # Make API request (https://developers.facebook.com/docs/graph-api/reference/v5.0/url)
    try:
        data = _api_request(
            node='',
            params={
                'id': url,
                'fields': 'engagement',
            },
            config=config,
        )
    except McFacebookException as ex:
        # Pass the known exception back to the caller for them to deal with
        log.error(f"Unable to fetch stats for URL '{url}': {ex}")
        raise
    except Exception as ex:
        # If an unknown exception was raised while making an API call, consider it a fatal error
        raise McFacebookErrorAPIResponseException(
            f"Unknown error happened while fetching stats for URL '{url}': {ex}"
        ) from ex

    if 'error' in data:
        log.error(f"Facebook API responded with error while fetching stats for URL '{url}': {data}")

        error = data['error']
        error_type = error.get('type', 'unknown type')
        error_message = error.get('message', 'unknown message')

        if error_type == 'GraphMethodException' and 'Unsupported get request' in error_message:
            # Non-fatal permissions error for this specific URL
            raise McFacebookInvalidURLException(url=url, error_message=error_message)

        elif error_type == 'OAuthException' and error_message == 'An unknown error has occurred.':
            # Some URLs consistently return this error; true permissions errors don't return the
            # 'unknown error' message
            raise McFacebookInvalidURLException(url=url, error_message=error_message)

        elif error_type == 'OAuthException' and 'facebook.com' in error_message:
            # Facebook URLs require permissions we don't have
            raise McFacebookInvalidURLException(url=url, error_message=error_message)

        else:
            # Everything else is considered a fatal error by us as we don't know what exactly happened
            raise McFacebookErrorAPIResponseException(
                f"Error response while fetching stats for URL '{url}': {error_type} {error_message}"
            )

    response_url = data.get('id', None)
    if response_url is None:
        # Facebook API is expected to always return URL that we got the stats for
        raise McFacebookUnexpectedAPIResponseException(
            response=data,
            error_message="Response doesn't have 'id' key",
        )

    response_url = str(response_url)

    # Facebook API returns a numeric ID for a URL that's a Facebook page
    if not response_url.isdigit():

        # Verify that we got stats for the right URL
        # FIXME for whatever reason 'url' does get un-canonicalized at this point
        if response_url != url and canonical_url(response_url) != canonical_url(url):
            raise McFacebookUnexpectedAPIResponseException(
                response=data,
                error_message=f"Response URL ({response_url}) is not the same as request URL ({url})",
            )

    engagement = data.get('engagement', None)
    if engagement is None:
        # We expect 'engagement' to be at least set to an empty dict
        raise McFacebookUnexpectedAPIResponseException(
            response=data,
            error_message="Response doesn't have 'engagement' key",
        )

    # While 'engagement' is expected to always be set, all URL stats are not required to be present
    # because Facebook might not have ever seen this URL before
    stats = FacebookURLStats(
        share_count=engagement.get('share_count', None),
        comment_count=engagement.get('comment_count', None),
        reaction_count=engagement.get('reaction_count', None),
    )

    # If none of the stats are set, report it as a soft (URL-specific) failure
    if stats.share_count is None and stats.comment_count is None and stats.reaction_count is None:
        raise McFacebookInvalidURLException(url=url, error_message="No statistics were returned for URL.")

    log.debug(f"Facebook statistics for URL '{url}': {stats}")

    return stats