Exemplo n.º 1
0
def find_new_links(htmlstring,
                   base_url,
                   known_links,
                   language=None,
                   rules=None):
    """Return the links of a page that are new, allowed and crawlable.

    If a target language is given and LANGID_FLAG is True, the text
    produced by baseline() is handed to cld3 first; when a language is
    detected and it differs from the target, no link is extracted.

    Args:
        htmlstring: HTML code of the page.
        base_url: passed through to extract_links().
        known_links: collection of already seen links; every accepted
            link is added to it in place.
        language: optional target language code.
        rules: optional robots.txt rules object offering can_fetch().

    Returns:
        A tuple (new_links, known_links): the list of links accepted by
        this call and the updated collection of known links.
    """
    fresh = []
    # optional gate: give up early if the page is in another language
    if language is not None and LANGID_FLAG is True:
        _, text, _ = baseline(htmlstring)
        detected = cld3.get_language(text)
        if detected is not None and detected.language != language:
            return fresh, known_links
    candidates = extract_links(htmlstring,
                               base_url,
                               False,
                               language=language,
                               with_nav=True)
    for candidate in candidates:
        # robots.txt: without rules everything is allowed
        allowed = rules is None or rules.can_fetch("*", candidate)
        if not allowed:
            continue
        # drop links that were already seen
        if is_known_link(candidate, known_links) is True:
            continue
        # drop links that should not be crawled
        if is_not_crawlable(candidate):
            continue
        fresh.append(candidate)
        known_links.add(candidate)
    return fresh, known_links
Exemplo n.º 2
0
def process_links(htmlstring, base_url, known_links, todo):
    """Queue the unseen links found in the HTML code.

    Links matching NAVIGATION_FILTER are pushed to the left end of the
    todo deque, all other new links are appended to its right end.
    Every queued link is also added to known_links in place.

    Returns:
        A tuple (todo, known_links) holding the updated containers.
    """
    regular, navigation = [], []
    for url in extract_links(htmlstring, base_url, False):
        if url not in known_links:
            # sort the new link into the matching bucket
            bucket = navigation if NAVIGATION_FILTER.search(url) else regular
            bucket.append(url)
            known_links.add(url)
    # ordinary links go to the end of the queue
    todo.extend(regular)
    # navigation links go to the front so they are handled first
    todo.extendleft(navigation)
    return todo, known_links
Exemplo n.º 3
0
def process_links(htmlstring, base_url, known_links, todo, language=None):
    """Queue the unseen links found in the HTML code.

    Links for which is_navigation_page() is true are pushed to the left
    end of the todo deque, all other new links are appended to its right
    end. Every queued link is also added to known_links in place. A new
    deque is created when todo is None.

    Returns:
        A tuple (todo, known_links) holding the updated containers.
    """
    regular, navigation = [], []
    found = extract_links(htmlstring,
                          base_url,
                          False,
                          language=language,
                          with_nav=True)
    for url in found:
        if url not in known_links:
            # sort the new link into the matching bucket
            bucket = navigation if is_navigation_page(url) else regular
            bucket.append(url)
            known_links.add(url)
    # start a fresh queue if the caller did not supply one
    if todo is None:
        todo = deque()
    # ordinary links go to the end of the queue
    todo.extend(regular)
    # navigation links go to the front so they are handled first
    todo.extendleft(navigation)
    return todo, known_links
Exemplo n.º 4
0
def test_extraction():
    """Check how hreflang attributes interact with the language filter."""
    base = 'https://test.com/'
    # a single link flagged as German
    html = '<html><a href="https://test.com/example" hreflang="de-DE"/></html>'
    assert len(extract_links(html, base, False)) == 1
    # nothing is expected when the third argument is True
    assert len(extract_links(html, base, True)) == 0
    # the link is kept for 'de' and dropped for 'en'
    for lang, expected in (('de', 1), ('en', 0)):
        assert len(extract_links(html, base, False,
                                 language=lang)) == expected
    # a second link without hreflang: both links are expected either way
    html = '<html><a hreflang="de-DE" href="https://test.com/example"/><a href="https://test.com/example2"/></html>'
    for lang in (None, 'de'):
        assert len(extract_links(html, base, False, language=lang)) == 2
Exemplo n.º 5
0
def test_extraction():
    """Exercise extract_links() on empty input, duplicate links, language
       hints, navigation pages, CMS placeholders and unquoted attributes."""
    com = 'https://test.com/'
    # missing or empty input yields no links
    for nothing in (None, ''):
        assert len(extract_links(nothing, com, False)) == 0
    # the same link written in two forms is counted once
    html = '<html><a href="https://test.org/example"/><a href="https://test.org/example/&"/></html>'
    assert len(extract_links(html, 'https://test.org', False)) == 1
    # hreflang: kept for the matching language, dropped otherwise
    html = '<html><a href="https://test.com/example" hreflang="de-DE"/></html>'
    assert len(extract_links(html, com, False)) == 1
    assert len(extract_links(html, com, True)) == 0
    for lang, expected in (('de', 1), ('en', 0)):
        assert len(extract_links(html, com, False,
                                 language=lang)) == expected
    # same markup with unquoted attribute values
    html = '<html><a href=https://test.com/example hreflang=de-DE/></html>'
    assert len(extract_links(html, com, False, language='de')) == 1
    # x-default is expected to pass for any target language
    html = '<html><a href="https://test.com/example" hreflang="x-default"/></html>'
    for lang in ('de', 'en'):
        assert len(extract_links(html, com, False, language=lang)) == 1
    # language hint next to plain links, one of them with trailing content
    html = '<html><a hreflang="de-DE" href="https://test.com/example"/><a href="https://test.com/example2"/><a href="https://test.com/example2 ADDITIONAL"/></html>'
    found = extract_links(html, com, False)
    assert sorted(found) == [
        'https://test.com/example', 'https://test.com/example2'
    ]
    assert len(extract_links(html, com, False, language='de')) == 2
    # the /page/2 link is only expected when with_nav is True
    html = '<html><a hreflang="de-DE" href="https://test.com/example"/><a href="https://test.com/page/2"/></html>'
    for nav, expected in ((False, 1), (True, 2)):
        assert len(extract_links(html, com, False,
                                 with_nav=nav)) == expected
    # relative links are expected to be resolved against the base URL
    html = "<html><head><title>Links</title></head><body><a href='/links/2/0'>0</a> <a href='/links/2/1'>1</a> </body></html>"
    found = extract_links(html,
                          'https://httpbin.org',
                          False,
                          with_nav=True)
    assert sorted(found) == [
        'https://httpbin.org/links/2/0', 'https://httpbin.org/links/2/1'
    ]
    # placeholders left undeveloped by the CMS are never links
    html = '<html><a href="{privacy}" target="_privacy">{privacy-link}</a></html>'
    for flag in (False, True):
        assert len(extract_links(html, com, flag)) == 0
    # href values without quotes
    html = '<html><a href=/contact>Link</a></html>'
    assert extract_links(html, com, False) == {'https://test.com/contact'}
    assert extract_links(html, com, True) == set()
    # unquoted href followed by another attribute
    html = '<html><a href=/contact attribute=value>Link</a></html>'
    assert extract_links(html, com, False) == {'https://test.com/contact'}