Example #1
def test_fix_common_url_mistakes():
    urls = {
        # "http://http://"
        'http://http://www.al-monitor.com/pulse': 'http://www.al-monitor.com/pulse',

        # With only one slash ("http:/www.")
        'http:/www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled':
            'http://www.theinquirer.net/inquirer/news/2322928/net-neutrality-rules-lie-in-tatters-as-fcc-overruled',

        # Missing "/" before "?"
        'http://foo.bar?baz=bat': 'http://foo.bar/?baz=bat',

        # Whitespace
        '  http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html  ':
            'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html',

        # Missing port
        'https://www.gpo.gov:/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf':
            'https://www.gpo.gov/fdsys/pkg/PLAW-107publ289/pdf/PLAW-107publ289.pdf',

        # Non-URLencoded space
        'http://www.ldeo.columbia.edu/~peter/ site/Home.html': 'http://www.ldeo.columbia.edu/~peter/%20site/Home.html',
    }

    for orig_url, fixed_url in urls.items():
        # Fix once
        assert mc_url.urls_are_equal(url1=mc_url.fix_common_url_mistakes(orig_url), url2=fixed_url)

        # Fixing the same URL twice should be idempotent and yield the same result
        assert mc_url.urls_are_equal(
            url1=mc_url.fix_common_url_mistakes(mc_url.fix_common_url_mistakes(orig_url)),
            url2=fixed_url,
        )
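The function under test is not shown on this page. A minimal sketch of the transformations the asserts above expect could look like this (an illustration only, not the actual Media Cloud implementation of fix_common_url_mistakes()):

import re


def fix_common_url_mistakes_sketch(url: str) -> str:
    """Illustrative re-implementation of the fixes exercised above."""
    # Surrounding whitespace
    url = url.strip()

    # Doubled scheme: "http://http://..."
    url = re.sub(r'^(https?://)https?://', r'\1', url)

    # Single slash after the scheme: "http:/www."
    url = re.sub(r'^(https?):/([^/])', r'\1://\2', url)

    # Missing "/" before the query string: "http://foo.bar?baz=bat"
    url = re.sub(r'^(https?://[^/?]+)\?', r'\1/?', url)

    # Colon with the port number missing: "https://www.gpo.gov:/..."
    url = re.sub(r'^(https?://[^/:]+):(?=/|$)', r'\1', url)

    # Non-URL-encoded spaces
    url = url.replace(' ', '%20')

    return url

All of these substitutions are no-ops on an already-fixed URL, which is what the second assert (fixing twice) verifies.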
Example #2
    def get_follow_http_html_redirects(self, url: str) -> Response:
        """GET an URL while resolving HTTP / HTML redirects."""

        url = decode_object_from_bytes_if_needed(url)

        if url is None:
            raise McGetFollowHTTPHTMLRedirectsException("URL is None.")

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McGetFollowHTTPHTMLRedirectsException("URL is not HTTP(s): %s" % url)

        if self.max_redirect() == 0:
            raise McGetFollowHTTPHTMLRedirectsException(
                "User agent's max_redirect is 0, subroutine might loop indefinitely."
            )

        response = self.get(url)

        response_after_redirects = self.__get_follow_http_html_redirects(
            response_=response,
            meta_redirects_left=self.max_redirect()
        )
        if response_after_redirects is None:
            # One of the redirects failed -- return original response
            return response

        else:
            return response_after_redirects
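A hedged usage sketch; the call pattern mirrors Example #7 below, and the article URL is made up:

ua = UserAgent()
response = ua.get_follow_http_html_redirects('http://www.example.com/some-article')

# URL of the final request, after all HTTP / HTML redirects have been followed
final_url = response.request().url()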
Example #3
def _country_tld_from_url(url: str) -> Optional[str]:
    """
    Extract the country TLD from a URL; if the URL looks weird, don't sweat it.

    :param url: URL, e.g. "https://www.bbc.co.uk/news/politics/eu-regions/vote2014_sitemap.xml".
    :return: Country TLD of the URL without the leading period, e.g. "uk", or None if there's no TLD.
    """
    if not url:
        return None

    url = fix_common_url_mistakes(url)

    try:
        url = canonical_url(url)
    except Exception as ex:
        log.error(f"Unable to get canonical URL from URL {url}: {ex}")
        return None

    try:
        parsed_url = urlparse(url)
    except Exception as ex:
        log.warning(f"Unable to parse URL {url}: {ex}")
        return None

    hostname = parsed_url.hostname
    if not hostname:
        # Weird URL without a hostname; per the docstring, don't sweat it
        log.warning(f"No hostname in URL {url}")
        return None

    hostname_parts = hostname.split('.')

    if len(hostname_parts) < 2:
        log.warning(f"No TLD found in URL {url}")
        return None

    return hostname_parts[-1].lower()
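Illustrative calls (not part of the source); they assume canonical_url() preserves the hostname:

# Last hostname label, lowercased: "www.bbc.co.uk" -> "uk"
assert _country_tld_from_url('https://www.bbc.co.uk/news/politics/eu-regions/vote2014_sitemap.xml') == 'uk'

# Falsy input short-circuits to None
assert _country_tld_from_url('') is None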
Example #4
    def __init__(self,
                 url: str,
                 recursion_level: int,
                 ua: Optional[UserAgent] = None):

        if recursion_level > self.__MAX_RECURSION_LEVEL:
            raise McSitemapsException(
                "Recursion level exceeded {} for URL {}.".format(
                    self.__MAX_RECURSION_LEVEL, url))

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McSitemapsException(
                "URL {} is not a HTTP(s) URL.".format(url))

        try:
            url = normalize_url(url)
        except Exception as ex:
            raise McSitemapsException("Unable to normalize URL {}: {}".format(
                url, ex))

        if not ua:
            ua = sitemap_useragent()

        self._url = url
        self._ua = ua
        self._recursion_level = recursion_level
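Several examples on this page repeat the same fix, validate, normalize pipeline; pulled out on its own it reads as follows (an illustrative helper, not part of the source):

def _prepared_sitemap_url(url: str) -> str:
    # Order matters: repair obvious mistakes first, only then validate and normalize
    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(url))
    return normalize_url(url)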
Example #5
    def set_url(self, url: str) -> None:
        """Set URL, e.g. https://www.mediacloud.org/page.html"""
        url = decode_object_from_bytes_if_needed(url)
        if url is None:
            raise McUserAgentRequestException("URL is None.")
        if len(url) == 0:
            raise McUserAgentRequestException("URL is empty.")

        # The URL might be coming from "requests", which managed to fetch a bogus URL that we deem invalid
        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McUserAgentRequestException("URL is not HTTP(s): %s" % str(url))

        self.__url = url
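A hypothetical call; the Request constructor is the one used in Example #6 below, and both URLs here are made up:

request = Request(method='GET', url='https://www.mediacloud.org/page.html')

# set_url() re-runs fix_common_url_mistakes() and the HTTP(s) check on the new URL
request.set_url('https://www.mediacloud.org/another-page.html')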
Example #6
    def get(self, url: str) -> Response:
        """GET an URL."""
        url = decode_object_from_bytes_if_needed(url)

        if url is None:
            raise McGetException("URL is None.")

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McGetException("URL is not HTTP(s): %s" % url)

        # Add HTTP authentication
        url = self.__url_with_http_auth(url=url)

        request = Request(method='GET', url=url)

        return self.request(request)
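A minimal usage sketch; UserAgent() and decoded_content() are taken from the other examples on this page, and the URL is made up:

ua = UserAgent()
response = ua.get('http://www.example.com/page.html')
html = response.decoded_content()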
Example #7
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(
            html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(
                url_link_rel_canonical) > 0:
            log.debug(
                ('Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                 '(original URL: %(url)s): %(url_link_rel_canonical)s') % {
                     "url_after_redirects": url_after_redirects,
                     "url": url,
                     "url_link_rel_canonical": url_link_rel_canonical,
                 })

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {
            key: urls[key]
            for key in urls.keys() if not is_homepage_url(urls[key])
        }

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [
            x for x in distinct_urls
            if not re.search(pattern=invalid_url_variant_regex, string=x)
        ]

    return distinct_urls
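A hypothetical invocation; `db` would be an already-connected DatabaseHandler (connection setup is not shown in these examples), and the URL comes from the homepage-redirect comment above:

variants = all_url_variants(
    db=db,
    url='http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/',
)

# `variants` now holds the original, post-redirect, normalized and
# <link rel="canonical" /> forms, minus any variants that are plain homepages.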
Example #8
def _get_url_stats(url: str, config: Optional[FacebookConfig] = None) -> FacebookURLStats:
    """
    Get Facebook statistics for an URL.

    Return URL stats on success, raise an exception on failure.

    :param url: URL to fetch the stats for.
    :param config: (optional) Facebook configuration object.
    :return: FacebookURLStats object; an exception is raised if stats for this URL couldn't be fetched.
    """
    url = decode_object_from_bytes_if_needed(url)

    if not url:
        # Treat unset URLs as a soft failure
        raise McFacebookInvalidURLException(url=url, error_message="URL is not set.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        log.error(f": {url}")
        raise McFacebookInvalidURLException(url=url, error_message="URL is not HTTP(s).")

    try:
        url = canonical_url(url)
    except Exception as ex:
        raise McFacebookInvalidURLException(url=url, error_message=f"Unable to canonicalize URL: {ex}")

    for pattern in __URL_PATTERNS_WHICH_WONT_WORK:
        if re.search(pattern, url):
            raise McFacebookInvalidURLException(
                url=url,
                error_message=f"URL matches one of the patterns for URLs that won't work against Facebook API.",
            )

    if not config:
        config = FacebookConfig()

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    # Make API request (https://developers.facebook.com/docs/graph-api/reference/v5.0/url)
    try:
        data = _api_request(
            node='',
            params={
                'id': url,
                'fields': 'engagement',
            },
            config=config,
        )
    except McFacebookException as ex:
        # Pass the known exception back to the caller for them to deal with
        log.error(f"Unable to fetch stats for URL '{url}': {ex}")
        raise ex

    except Exception as ex:
        # If an unknown exception was raised while making an API call, consider it a fatal error
        raise McFacebookErrorAPIResponseException(
            f"Unknown error happened while fetching stats for URL '{url}': {ex}"
        )

    if 'error' in data:
        log.error(f"Facebook API responded with error while fetching stats for URL '{url}': {data}")

        error = data['error']
        error_type = error.get('type', 'unknown type')
        error_message = error.get('message', 'unknown message')

        if error_type == 'GraphMethodException' and 'Unsupported get request' in error_message:
            # Non-fatal permissions error for this specific URL
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        elif error_type == 'OAuthException' and error_message == 'An unknown error has occurred.':
            # Some URLs consistently return this error; true permissions errors don't return an 'unknown error' message
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        elif error_type == 'OAuthException' and 'facebook.com' in error_message:
            # Facebook URLs require permissions that we don't have
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        else:
            # Everything else is considered a fatal error by us as we don't know what exactly happened
            raise McFacebookErrorAPIResponseException(
                f"Error response while fetching stats for URL '{url}': {error_type} {error_message}"
            )

    response_url = data.get('id', None)
    if response_url is None:
        # Facebook API is expected to always return URL that we got the stats for
        raise McFacebookUnexpectedAPIResponseException(
            response=data,
            error_message="Response doesn't have 'id' key",
        )

    response_url = str(response_url)

    # Facebook API returns a numeric ID for a URL that's a Facebook page
    if not response_url.isdigit():

        # Verify that we got stats for the right URL
        # FIXME for whatever reason 'url' does get un-canonicalized at this point
        if response_url != url and canonical_url(response_url) != canonical_url(url):
            raise McFacebookUnexpectedAPIResponseException(
                response=data,
                error_message=f"Response URL ({response_url}) is not the same as request URL ({url})",
            )

    engagement = data.get('engagement', None)
    if engagement is None:
        # We expect 'engagement' to be at least set to an empty dict
        raise McFacebookUnexpectedAPIResponseException(
            response=data,
            error_message="Response doesn't have 'engagement' key",
        )

    # While 'engagement' is expected to always be set, all URL stats are not required to be present because Facebook
    # might not have ever seen this URL before
    stats = FacebookURLStats(
        share_count=engagement.get('share_count', None),
        comment_count=engagement.get('comment_count', None),
        reaction_count=engagement.get('reaction_count', None),
    )

    # If none of the stats are set, just return None
    if stats.share_count is None and stats.comment_count is None and stats.reaction_count is None:
        raise McFacebookInvalidURLException(url=url, error_message="No statistics were returned for URL.")

    log.debug(f"Facebook statistics for URL '{url}': {stats}")

    return stats
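A hedged usage sketch; it assumes the default FacebookConfig is enabled and carries valid API credentials, neither of which is shown here:

try:
    stats = _get_url_stats(url='https://www.mediacloud.org/page.html')
    log.info(f"Shares: {stats.share_count}, comments: {stats.comment_count}, reactions: {stats.reaction_count}")
except McFacebookInvalidURLException as ex:
    # Soft failure: the URL itself (not the API or its configuration) is the problem
    log.warning(f"Invalid URL: {ex}")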