예제 #1
0
def test_normalization():
    '''Check basic URL normalization on a table of input/expected pairs.'''
    expectations = (
        # upper-case scheme and host are expected in lower case
        ('HTTPS://WWW.DWDS.DE/', 'https://www.dwds.de/'),
        # a plain fragment is expected to be dropped
        ('http://test.net/foo.html#bar', 'http://test.net/foo.html'),
        # a text-fragment directive is expected to be dropped as well
        ('http://test.net/foo.html#:~:text=night-,vision',
         'http://test.net/foo.html'),
        # an explicit port 80 on http is expected to be removed
        ('http://www.example.org:80/test.html',
         'http://www.example.org/test.html'),
    )
    for raw_url, expected in expectations:
        assert normalize_url(raw_url) == expected
예제 #2
0
def test_examples():
    '''Run the usage examples shown in the README.'''
    # check_url is compared against a (url, domain) pair
    repo_url = 'https://github.com/adbar/courlan'
    checked = check_url(repo_url)
    assert checked == (repo_url, 'github.com')

    # under strict=True the query string is expected to be removed
    redirect_url = 'https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org'
    checked_strict = check_url(redirect_url, strict=True)
    assert checked_strict == ('https://httpbin.org/redirect-to', 'httpbin.org')

    cleaned = clean_url('HTTPS://WWW.DWDS.DE:80/')
    assert cleaned == 'https://www.dwds.de'

    # validate_url is compared against a (flag, parsed result) pair
    rejected = validate_url('http://1234')
    assert rejected == (False, None)
    accepted = validate_url('http://www.example.org/')
    assert accepted[0] is True

    tracked_url = 'http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment'
    normalized = normalize_url(tracked_url, strict=True)
    assert normalized == 'http://test.net/foo.html?page=2&post=abc'
예제 #3
0
def extract_url(tree, default_url=None):
    '''Extract the page URL from the links found in the document head.

    Candidates are tried in this order: the canonical link, then any
    alternate link whose hreflang is "x-default" (the last usable one
    wins), then default_url. A candidate starting with "/" is completed
    with the scheme and host taken from the first og:/twitter: meta tag
    whose content starts with an absolute http(s) URL.

    Args:
        tree: parsed HTML tree supporting find() and iterfind().
        default_url: fallback used when the head provides no usable link.

    Returns:
        The output of normalize_url() applied to the validated URL, or
        None if there is no candidate or validate_url() rejects it.
    '''
    # https://www.tutorialrepublic.com/html-reference/html-base-tag.php
    # default url as fallback
    url = default_url
    # try canonical link first
    element = tree.find('.//head//link[@rel="canonical"]')
    if element is not None and 'href' in element.attrib and URL_COMP_CHECK.match(
            element.attrib['href']):
        url = element.attrib['href']
    # try default language link
    else:
        for element in tree.iterfind('.//head//link[@rel="alternate"]'):
            if element.attrib.get('hreflang') != 'x-default':
                continue
            # an alternate link may come without href: skip it instead of
            # failing with a KeyError
            href = element.attrib.get('href')
            if href is None or not URL_COMP_CHECK.match(href):
                continue
            LOGGER.debug(
                html.tostring(element,
                              pretty_print=False,
                              encoding='unicode').strip())
            url = href
    # add domain name if it's missing
    if url is not None and url.startswith('/'):
        for element in tree.iterfind('.//head//meta[@content]'):
            # the name attribute takes precedence over property
            attrtype = element.attrib.get('name',
                                          element.attrib.get('property'))
            if attrtype is None:
                continue
            if attrtype.startswith(('og:', 'twitter:')):
                domain_match = re.match(r'https?://[^/]+',
                                        element.attrib['content'])
                if domain_match:
                    # prepend scheme and host to the relative URL
                    url = domain_match.group(0) + url
                    break
    # sanity check: don't return invalid URLs
    if url is not None:
        validation_result, parsed_url = validate_url(url)
        if validation_result is False:
            url = None
        else:
            url = normalize_url(parsed_url)
    return url
예제 #4
0
def test_qelems():
    '''Test the handling of query elements by normalize_url.

    Covers the expected ordering of parameters, the removal of the
    utm_source parameter under strict=True, and the ValueError expected
    when a language parameter in the URL differs from the target language.
    '''
    assert normalize_url('http://test.net/foo.html?utm_source=twitter'
                         ) == 'http://test.net/foo.html?utm_source=twitter'
    assert normalize_url('http://test.net/foo.html?utm_source=twitter',
                         strict=True) == 'http://test.net/foo.html'
    assert normalize_url(
        'http://test.net/foo.html?utm_source=twitter&post=abc&page=2'
    ) == 'http://test.net/foo.html?page=2&post=abc&utm_source=twitter'
    assert normalize_url(
        'http://test.net/foo.html?utm_source=twitter&post=abc&page=2',
        strict=True) == 'http://test.net/foo.html?page=2&post=abc'
    assert normalize_url(
        'http://test.net/foo.html?page=2&itemid=10&lang=en'
    ) == 'http://test.net/foo.html?itemid=10&lang=en&page=2'
    # One pytest.raises block per call: once a statement inside the block
    # raises, the statements after it never run, so sharing a block would
    # leave the second URL untested.
    with pytest.raises(ValueError):
        normalize_url('http://test.net/foo.html?page=2&lang=en',
                      language='de')
    with pytest.raises(ValueError):
        normalize_url(
            'http://www.evolanguage.de/index.php?page=deutschkurse_fuer_aerzte&language=ES',
            language='de')