def test_external():
    '''Check sanitize_tree() results and the language filter of extract()'''
    # markup shared by the two link/image scenarios further down
    rich_markup = '<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>'

    # a body holding only a footer is expected to report a length of zero
    footer_doc = html.fromstring(
        '<html><body><footer>Test text</footer></body></html>')
    _, _, footer_len = sanitize_tree(footer_doc)
    assert footer_len == 0

    # a table carrying header and cell text must report a non-zero length
    table_doc = html.fromstring(
        '<html><body><table><th>Test text</th><tr><td>Test</td></tr></table></body></html>'
    )
    _, _, table_len = sanitize_tree(table_doc)
    assert table_len > 0

    # links and images switched off: len() of the returned tree must be 1
    plain_doc = html.fromstring(rich_markup)
    plain_tree, _, _ = sanitize_tree(
        plain_doc, include_links=False, include_images=False)
    assert len(plain_tree) == 1

    # links and images switched on: both 'graphic' and 'ref' tags must occur
    rich_doc = html.fromstring(rich_markup)
    rich_tree, _, _ = sanitize_tree(
        rich_doc, include_links=True, include_images=True)
    found_tags = {node.tag for node in rich_tree.iter()}
    assert {'graphic', 'ref'} <= found_tags

    # the language check only runs when LANGID_FLAG is exactly True
    if LANGID_FLAG is not True:
        return
    paragraphs = '<p>Non è inglese.</p>' * 20
    foreign_doc = html.fromstring('<html><body>' + paragraphs + '</body></html>')
    outcome = extract(
        foreign_doc, no_fallback=False, target_language='en', deduplicate=False)
    # non-English text with target_language='en' is expected to yield None
    assert outcome is None
def test_external():
    '''Test external components: tree sanitization and an extract() smoke run'''
    # a body holding only a footer is expected to report a length of zero
    mydoc = html.fromstring('<html><body><footer>Test text</footer></body></html>')
    _, _, mylen = sanitize_tree(mydoc)
    assert mylen == 0

    # unknown <fancy> tag inside a paragraph: len() of the returned tree must be 1
    mydoc = html.fromstring('<html><body><p>Text here <fancy>Test text</fancy></p></body></html>')
    mytree, _, _ = sanitize_tree(mydoc)
    assert len(mytree) == 1

    # justext stoplist: runs unconditionally (no LANGID_FLAG guard is applied).
    # The call only has to complete without raising; its return value is not
    # bound because nothing checks it.
    # NOTE(review): consider asserting on the result once the expected output
    # for this input is settled.
    doc = html.fromstring('<html><body>' + '<p>abc</p>' * 10 + '</body></html>')
    extract(doc, no_fallback=False, target_language='en')
def test_external():
    '''Check sanitize_tree() results and the language filter of extract()'''
    # a body holding only a footer is expected to report a length of zero
    footer_doc = html.fromstring(
        '<html><body><footer>Test text</footer></body></html>')
    _, _, footer_len = sanitize_tree(footer_doc)
    assert footer_len == 0

    # a table carrying header and cell text must report a non-zero length
    table_doc = html.fromstring(
        '<html><body><table><th>Test text</th><tr><td>Test</td></tr></table></body></html>')
    _, _, table_len = sanitize_tree(table_doc)
    assert table_len > 0

    # unknown <fancy> tag inside a paragraph: len() of the returned tree must be 1
    fancy_doc = html.fromstring(
        '<html><body><p>Text here <fancy>Test text</fancy></p></body></html>')
    fancy_tree, _, _ = sanitize_tree(fancy_doc)
    assert len(fancy_tree) == 1

    # the language check only runs when LANGID_FLAG is exactly True
    if LANGID_FLAG is not True:
        return
    paragraphs = '<p>Non è inglese.</p>' * 20
    foreign_doc = html.fromstring('<html><body>' + paragraphs + '</body></html>')
    outcome = extract(
        foreign_doc, no_fallback=False, target_language='en', deduplicate=False)
    # non-English text with target_language='en' is expected to yield None
    assert outcome is None