Example #1
def infer_redirection(url):
    """
    Function returning the url that the given url will redirect to. This is done
    by finding obvious hints in the GET parameters that the given url is in
    fact a redirection.

    Args:
        url (string): Target url.

    Returns:
        string: Redirected url or the original url if nothing was found.
    """

    redirection_split = REDIRECTION_DOMAINS_RE.split(url, 1)

    if len(redirection_split) > 1:
        return infer_redirection('https://' + redirection_split[1])

    obvious_redirect_match = re.search(OBVIOUS_REDIRECTS_RE, url)

    if obvious_redirect_match is not None:
        target = unquote(obvious_redirect_match.group(1))

        if target.startswith('http://') or target.startswith('https://'):
            return target

        if target.startswith('/'):
            return urljoin(url, target)

    return url
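
A minimal usage sketch (assuming this function is ural's public infer_redirection and that OBVIOUS_REDIRECTS_RE recognizes common `u=`/`url=` GET parameters; the exact results depend on those regexes):

from ural import infer_redirection

# A Google redirect carrying its target in the `url` GET parameter
infer_redirection('https://www.google.com/url?sa=t&url=https%3A%2F%2Fexample.com%2Farticle')
# expected: 'https://example.com/article'

# A url without any redirection hint is returned untouched
infer_redirection('https://example.com/article')
# expected: 'https://example.com/article'
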
Example #2
def extract_url_from_facebook_link(url):
    m = URL_EXTRACT_RE.search(url)

    if m is None:
        return None

    return unquote(m.group(1))
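
A hedged usage sketch, assuming URL_EXTRACT_RE captures the quoted `u` GET parameter of Facebook's l.php redirector, as the function's name suggests (the link below is hypothetical):

# Facebook share link wrapping an external url in its `u` parameter
link = 'https://l.facebook.com/l.php?u=https%3A%2F%2Fwww.lemonde.fr%2Farticle&h=AT0x'
extract_url_from_facebook_link(link)
# expected: 'https://www.lemonde.fr/article'

# Links that do not wrap another url yield None
extract_url_from_facebook_link('https://www.facebook.com/some.page')
# expected: None
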
Example #3
def infer_redirection(url, recursive=True):
    """
    Function returning the url that the given url will redirect to. This is done
    by finding obvious hints in the GET parameters that the given url is in
    fact a redirection.

    Args:
        url (string): Target url.
        recursive (bool): Whether to apply the function recursively until
            no redirection can be inferred. Defaults to `True`.

    Returns:
        string: Redirected url or the original url if nothing was found.
    """

    redirection_split = REDIRECTION_DOMAINS_RE.split(url, 1)

    target = None

    if len(redirection_split) > 1:
        target = 'https://' + redirection_split[1]

    else:
        obvious_redirect_match = re.search(OBVIOUS_REDIRECTS_RE, url)

        if obvious_redirect_match is not None:
            potential_target = unquote(obvious_redirect_match.group(1))

            if potential_target.startswith(('http://', 'https://')):
                target = potential_target
            elif potential_target.startswith('/'):
                target = urljoin(url, potential_target)

    if target is None:
        return url

    if recursive:
        return infer_redirection(target, recursive=True)

    return target
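
A sketch of what the recursive flag changes, using a hypothetical url whose inferred target is itself a redirection (actual results depend on the module's regexes):

nested = 'https://www.google.com/url?url=https%3A%2F%2Fl.facebook.com%2Fl.php%3Fu%3Dhttps%253A%252F%252Fexample.com%252F'

# recursive=True (the default) keeps unwrapping until no redirection is inferred
infer_redirection(nested)
# expected: 'https://example.com/'

# recursive=False stops after the first inferred hop
infer_redirection(nested, recursive=False)
# expected: 'https://l.facebook.com/l.php?u=https%3A%2F%2Fexample.com%2F'
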
Example #4
def normalize_url(url,
                  unsplit=True,
                  sort_query=True,
                  strip_authentication=True,
                  strip_trailing_slash=True,
                  strip_index=True,
                  strip_protocol=True,
                  strip_irrelevant_subdomains=True,
                  strip_lang_subdomains=False,
                  strip_lang_query_items=False,
                  strip_fragment='except-routing',
                  normalize_amp=True,
                  fix_common_mistakes=True,
                  infer_redirection=True,
                  quoted=True):
    """
    Function normalizing the given url by stripping it of usually
    non-discriminant parts such as irrelevant query items or sub-domains etc.

    This is a very useful utility when attempting to match similar urls
    written slightly differently when shared on social media etc.

    Args:
        url (str): Target URL as a string.
        unsplit (bool, optional): Whether to join the normalized result back
            into a string or return it as a urllib `SplitResult`. Defaults
            to `True`.
        sort_query (bool, optional): Whether to sort query items or not.
            Defaults to `True`.
        strip_authentication (bool, optional): Whether to drop authentication.
            Defaults to `True`.
        strip_trailing_slash (bool, optional): Whether to drop the trailing slash.
            Defaults to `True`.
        strip_index (bool, optional): Whether to drop trailing index at the end
            of the url. Defaults to `True`.
        strip_irrelevant_subdomains (bool, optional): Whether to strip
            irrelevant subdomains such as www etc. Defaults to `True`.
        strip_lang_subdomains (bool, optional): Whether to drop language
            subdomains (ex: 'fr-FR.lemonde.fr' becomes 'lemonde.fr' because
            'fr-FR' only indicates the language and the country). Defaults
            to `False`.
        strip_lang_query_items (bool, optional): Whether to drop language-related
            query items. Defaults to `False`.
        strip_protocol (bool, optional): Whether to strip the url's protocol.
            Defaults to `True`.
        strip_fragment (bool|str, optional): Whether to drop the fragment from
            the url. If set to `'except-routing'`, only non-routing fragments
            (i.e. fragments that do not contain a "/") will be dropped.
            Defaults to `'except-routing'`.
        normalize_amp (bool, optional): Whether to attempt to normalize Google
            AMP urls. Defaults to True.
        fix_common_mistakes (bool, optional): Whether to attempt solving common mistakes.
            Defaults to True.
        infer_redirection (bool, optional): Whether to attempt resolving common
            redirects by leveraging well-known GET parameters. Defaults to `True`.
        quoted (bool, optional): Whether to normalize to a quoted or unquoted
            version of the url. Defaults to `True`.

    Returns:
        string: The normalized url.

    """
    original_url_arg = url

    if infer_redirection:
        url = resolve(url)

    if isinstance(url, SplitResult):
        splitted = url
        has_protocol = bool(splitted.scheme)
    else:
        has_protocol = PROTOCOL_RE.match(url)

        # Ensuring scheme so parsing works correctly
        if not has_protocol:
            url = 'http://' + url

        # Parsing
        try:
            splitted = urlsplit(url)
        except ValueError:
            return original_url_arg

    scheme, netloc, path, query, fragment = splitted

    # Fixing common mistakes
    if fix_common_mistakes:
        if query:
            query = re.sub(MISTAKES_RE, '&', query)

    # Handling punycode
    netloc = decode_punycode(netloc)

    # Dropping :80 & :443
    if netloc.endswith(':80'):
        netloc = netloc[:-3]
    elif netloc.endswith(':443'):
        netloc = netloc[:-4]

    # Normalizing the path
    if path:
        trailing_slash = False
        if path.endswith('/') and len(path) > 1:
            trailing_slash = True
        path = normpath(path)
        if trailing_slash and not strip_trailing_slash:
            path = path + '/'

    # Handling Google AMP suffixes
    if normalize_amp:
        path = AMP_SUFFIXES_RE.sub('', path)

    # Dropping index:
    if strip_index:
        segments = path.rsplit('/', 1)

        if len(segments) != 0:
            last_segment = segments[-1]
            filename, ext = splitext(last_segment)

            if filename == 'index':
                segments.pop()
                path = '/'.join(segments)

    # Dropping irrelevant query items
    if query:
        domain_filter = None

        if splitted.hostname:
            domain_filter = next((f for d, f in PER_DOMAIN_QUERY_FILTERS
                                  if splitted.hostname.endswith(d)), None)

        qsl = parse_qsl(query, keep_blank_values=True)
        qsl = [
            stringify_qs(item) for item in qsl if not should_strip_query_item(
                item,
                normalize_amp=normalize_amp,
                strip_lang_query_items=strip_lang_query_items,
                domain_filter=domain_filter)
        ]

        if sort_query:
            qsl = sorted(qsl)

        query = '&'.join(qsl)

    # Dropping fragment if it's not routing
    if fragment and strip_fragment:
        if strip_fragment is True or not should_strip_fragment(fragment):
            fragment = ''

    # Always dropping trailing slash with empty query & fragment
    if path == '/' and not fragment and not query:
        path = ''

    # Dropping irrelevant subdomains
    if strip_irrelevant_subdomains:
        netloc = re.sub(
            IRRELEVANT_SUBDOMAIN_AMP_RE
            if normalize_amp else IRRELEVANT_SUBDOMAIN_RE, '', netloc)

    # Dropping language as subdomains
    if strip_lang_subdomains:
        netloc = strip_lang_subdomains_from_netloc(netloc)

    # Dropping scheme
    if strip_protocol or not has_protocol:
        scheme = ''

    # Dropping authentication
    if strip_authentication:
        netloc = netloc.split('@', 1)[-1]

    # Normalizing AMP subdomains
    if normalize_amp and netloc.startswith('amp-'):
        netloc = netloc[4:]

    # Dropping trailing slash
    if strip_trailing_slash and path.endswith('/'):
        path = path.rstrip('/')

    # Quoting or not
    if quoted:
        path = quote(path)
        query = quote(query, RESERVED_CHARACTERS)
        fragment = quote(fragment, SAFE_CHARACTERS)
    else:
        path = unquote(path)
        query = unquote(query)
        fragment = unquote(fragment)

    # Result
    result = SplitResult(scheme, netloc.lower(), path, query, fragment)

    if not unsplit:
        return result

    # TODO: check if works with `unsplit=False`
    if strip_protocol or not has_protocol:
        result = urlunsplit(result)[2:]
    else:
        result = urlunsplit(result)

    return result
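
A hedged usage sketch of normalize_url, assuming it is ural's public function of the same name (the exact output depends on the library's query-item filters and subdomain rules):

from ural import normalize_url

normalize_url('https://www.lemonde.fr/index.html?utm_source=twitter&id=3')
# expected: 'lemonde.fr?id=3' -- protocol, www subdomain, trailing index and
# tracking query items stripped; the remaining query items are kept and sorted

# Keeping the protocol and getting a SplitResult back instead of a string
normalize_url('https://www.lemonde.fr/article/', strip_protocol=False, unsplit=False)
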