Python extract_links 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: trafilatura.feeds

메소드/함수: extract_links

hotexamples.com에서의 예제들: 7

Python extract_links - 7개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, 평가가 가장 높은 Python trafilatura.feeds.extract_links의 실제 사용 예제입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: feeds_tests.py 프로젝트: phongtnit/trafilatura

def test_atom_extraction():
    '''Test link extraction from an Atom feed.

    Reads the ``feed1.atom`` file under ``RESOURCES_DIR`` and checks that
    ``feeds.extract_links`` returns at least one link for it, then checks
    that a single ``<link rel="self" ...>`` snippet yields no links.
    '''
    filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
    # Explicit encoding: open() otherwise falls back to the locale-dependent
    # default, so the fixture could decode differently across platforms.
    # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(feeds.extract_links(teststring)) > 0
    assert len(feeds.extract_links('<link type="application/atom+xml" rel="self" href="https://www.dwds.de/api/feed/themenglossar/Corona"/>')) == 0

예제 #2

파일 보기

def test_atom_extraction():
    '''Test link extraction from an Atom feed.

    Covers four cases of ``feeds.extract_links``: an empty HTML document
    (no links), the ``feed1.atom`` file under ``RESOURCES_DIR`` (at least
    one link), a ``rel="self"`` link element (no links) and a link element
    whose ``href`` is ``123://api.exe`` (no links).
    '''
    assert len(feeds.extract_links('<html></html>', 'example.org', 'https://example.org', '')) == 0
    filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
    # Explicit encoding: open() otherwise falls back to the locale-dependent
    # default, so the fixture could decode differently across platforms.
    # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(feeds.extract_links(teststring, 'example.org', 'https://example.org', '')) > 0
    assert len(feeds.extract_links(XMLDECL + '<link type="application/atom+xml" rel="self" href="https://www.dwds.de/api/feed/themenglossar/Corona"/>', 'dwds.de', 'https://www.dwds.de', '')) == 0
    assert len(feeds.extract_links(XMLDECL + '<link type="application/atom+xml" rel="self" href="123://api.exe"/>', 'example.org', 'https://example.org', '')) == 0

예제 #3

파일 보기

def test_rss_extraction():
    '''Test link extraction from a RSS feed.

    Exercises ``feeds.extract_links`` on small inline ``<link>`` snippets
    (prefixed with ``XMLDECL``) and on the ``feed2.rss`` file under
    ``RESOURCES_DIR``, asserting the number of links returned or, for the
    relative link, the exact resulting list.
    '''
    assert len(feeds.extract_links(XMLDECL + '<link>http://example.org/article1/</link>', 'example.org', 'http://example.org/', '')) == 1
    assert len(feeds.extract_links(XMLDECL + '<link>http://example.org/</link>', 'example.org', 'http://example.org', 'http://example.org')) == 0
    assert len(feeds.extract_links(XMLDECL + '<link rel="self">http://example.org/article1/</link>', 'example.org', 'http://example.org/', '')) == 0
    # A relative link is expected to come back as an absolute URL.
    assert feeds.extract_links(XMLDECL + '<link>/api/feed/themenglossar/Corona</link>', 'www.dwds.de', 'https://www.dwds.de', 'https://www.dwds.de') == ['https://www.dwds.de/api/feed/themenglossar/Corona']
    filepath = os.path.join(RESOURCES_DIR, 'feed2.rss')
    # Explicit encoding: open() otherwise falls back to the locale-dependent
    # default, so the fixture could decode differently across platforms.
    # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(feeds.extract_links(teststring, 'example.com', 'https://example.org', '')) > 0

예제 #4

파일 보기

파일: feeds_tests.py 프로젝트: adbar/trafilatura

def test_rss_extraction():
    '''Test link extraction from a RSS feed.

    Exercises ``feeds.extract_links`` on inline ``<link>`` snippets
    (prefixed with ``XMLDECL``) covering a plain URL, a CDATA-wrapped URL,
    a URL surrounded by whitespace, links expected to be rejected and a
    relative link, then on the ``feed2.rss`` file under ``RESOURCES_DIR``.
    '''
    # Trailing positional arguments shared by most of the example.org cases.
    example_args = ('example.org', 'http://example.org/', '')

    plain = XMLDECL + '<link>http://example.org/article1/</link>'
    assert len(feeds.extract_links(plain, *example_args)) == 1

    # CDATA
    cdata = XMLDECL + '<link><![CDATA[http://example.org/article1/]]></link>'
    assert feeds.extract_links(cdata, *example_args) == [
        'http://example.org/article1/'
    ]

    # spaces
    padded = (
        XMLDECL +
        '<link>\r\n    https://www.ak-kurier.de/akkurier/www/artikel/108815-sinfonisches-blasorchester-spielt-1500-euro-fuer-kinder-in-drk-krankenhaus-kirchen-ein    </link>'
    )
    assert len(
        feeds.extract_links(padded, 'ak-kurier.de',
                            'https://www.ak-kurier.de/', '')) == 1

    # Links for which no result is expected.
    assert len(
        feeds.extract_links(XMLDECL + '<link>http://example.org/</link>',
                            'example.org', 'http://example.org',
                            'http://example.org')) == 0
    assert len(
        feeds.extract_links(XMLDECL + '<link>https://example.org</link>',
                            *example_args)) == 0

    # A relative link is expected to come back as an absolute URL.
    relative = XMLDECL + '<link>/api/feed/themenglossar/Corona</link>'
    assert feeds.extract_links(
        relative, 'www.dwds.de', 'https://www.dwds.de',
        'https://www.dwds.de') == [
            'https://www.dwds.de/api/feed/themenglossar/Corona'
        ]

    filepath = os.path.join(RESOURCES_DIR, 'feed2.rss')
    # Explicit encoding: open() otherwise falls back to the locale-dependent
    # default, so the fixture could decode differently across platforms.
    # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(
        feeds.extract_links(teststring, 'example.com', 'https://example.org',
                            '')) > 0

예제 #5

파일 보기

파일: feeds_tests.py 프로젝트: adbar/trafilatura

def test_json_extraction():
    '''Test link extraction from a JSON feed.

    First checks that ``feeds.determine_feed`` returns exactly one entry
    for an HTML page declaring a JSON feed, then that
    ``feeds.extract_links`` returns 25 links for the ``feed.json`` file
    under ``RESOURCES_DIR`` and one link for an item carrying only an
    ``id`` field.
    '''
    # find link
    assert len(
        feeds.determine_feed(
            '<html><meta><link rel="alternate" type="application/json" title="JSON Feed" href="https://www.jsonfeed.org/feed.json" />></meta><body/></html>',
            'jsonfeed.org', 'https://www.jsonfeed.org')) == 1
    # extract data
    filepath = os.path.join(RESOURCES_DIR, 'feed.json')
    # Explicit encoding: open() otherwise falls back to the locale-dependent
    # default, so the fixture could decode differently across platforms.
    # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    links = feeds.extract_links(teststring, 'npr.org', 'https://npr.org', '')
    assert len(links) == 25
    # id as a backup
    links = feeds.extract_links(
        r'{"version":"https:\/\/jsonfeed.org\/version\/1","items":[{"id":"https://www.example.org/1","title":"Test"}]}',
        'example.org', 'https://example.org', '')
    assert len(links) == 1

예제 #6

파일 보기

파일: feeds_tests.py 프로젝트: phongtnit/trafilatura

def test_rss_extraction():
    '''Test link extraction from a RSS feed.

    Reads the ``feed2.rss`` file under ``RESOURCES_DIR`` and checks that
    ``feeds.extract_links`` returns at least one link for its content.
    '''
    filepath = os.path.join(RESOURCES_DIR, 'feed2.rss')
    # Explicit encoding: open() otherwise falls back to the locale-dependent
    # default, so the fixture could decode differently across platforms.
    # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(feeds.extract_links(teststring)) > 0

예제 #7

파일 보기

파일: feeds_tests.py 프로젝트: scroobiustrip/trafilatura

def test_atom_extraction():
    '''Test link extraction from an Atom feed.

    Reads the ``feed1.atom`` file under ``RESOURCES_DIR`` and checks that
    ``feeds.extract_links`` returns at least one link for its content.
    '''
    filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
    # Explicit encoding: open() otherwise falls back to the locale-dependent
    # default, so the fixture could decode differently across platforms.
    # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(feeds.extract_links(teststring)) > 0