예제 #1
0
def test_wa_loader_with_known_entities():
    """Check the loader output when ``known_entities`` is restricted to ORG.

    The ORG span must be replaced by START/END markers, while the CITY span
    (not listed in ``known_entities``) must come out as plain text.
    """
    markup = b"<html><body><p><span wa-subtypes='' wa-id='227' wa-type='ORG' class='WebAnnotator_org'>Scrapinghub</span> has an <b>office</b> in <span wa-subtypes='' wa-id='228' wa-type='CITY' class='WebAnnotator_org'>Montevideo</span></p></body></html>"
    expected = b'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__  has an <b>office</b> in Montevideo</p></body></html>'

    org_only_loader = WebAnnotatorLoader(known_entities={'ORG'})
    serialized = lxml.html.tostring(org_only_loader.loadbytes(markup))

    assert expected in serialized
예제 #2
0
def test_wa_loader():
    """Load the ``data/wa1.html`` fixture and check the serialized tree.

    Both annotated entities must be wrapped in START/END markers, and no
    ``wa-`` / ``WA-`` text may remain in the output.
    """
    fixture_dir = os.path.join(os.path.dirname(__file__), 'data')
    fixture_path = os.path.join(fixture_dir, 'wa1.html')

    serialized = lxml.html.tostring(WebAnnotatorLoader().load(fixture_path))

    expected = b"<p> __START_ORG__ Scrapinghub __END_ORG__  has an <b>office</b> in  __START_CITY__ Montevideo __END_CITY__ </p>"
    assert expected in serialized, serialized

    # Neither lower- nor upper-case annotation leftovers may survive.
    for leftover in (b"wa-", b"WA-"):
        assert leftover not in serialized, serialized
예제 #3
0
def test_wa_loader_with_known_entities():
    """Verify that only entity types listed in ``known_entities`` are marked.

    With ``known_entities={'ORG'}`` the ORG annotation becomes
    ``__START_ORG__ ... __END_ORG__`` and the CITY annotation is reduced to
    its bare text.
    """
    annotated_page = b"<html><body><p><span wa-subtypes='' wa-id='227' wa-type='ORG' class='WebAnnotator_org'>Scrapinghub</span> has an <b>office</b> in <span wa-subtypes='' wa-id='228' wa-type='CITY' class='WebAnnotator_org'>Montevideo</span></p></body></html>"

    parsed = WebAnnotatorLoader(known_entities={'ORG'}).loadbytes(annotated_page)
    output = lxml.html.tostring(parsed)

    wanted = b'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__  has an <b>office</b> in Montevideo</p></body></html>'
    assert wanted in output
예제 #4
0
def test_wa_loader():
    """Parse the module-level ``HTML`` sample and check the serialized tree.

    The ORG and CITY annotations must be replaced by START/END markers and
    no ``wa-`` / ``WA-`` text may remain in the output.
    """
    ld = WebAnnotatorLoader()
    tree = ld.loadbytes(HTML)
    res = lxml.html.tostring(tree)
    # lxml.html.tostring() returns bytes by default, so the expected values
    # must be bytes literals: on Python 3 a ``str in bytes`` test raises
    # TypeError instead of performing the intended substring check.
    assert b"<p> __START_ORG__ Scrapinghub __END_ORG__  has an <b>office</b> in  __START_CITY__ Montevideo __END_CITY__ </p>" in res, res
    assert b"wa-" not in res, res
    assert b"WA-" not in res, res
예제 #5
0
def test_wa_loader():
    """Load the ``data/wa1.html`` fixture and check the serialized tree.

    The ORG and CITY annotations must be replaced by START/END markers and
    no ``wa-`` / ``WA-`` text may remain in the output.
    """
    ld = WebAnnotatorLoader()
    tree = ld.load(os.path.join(os.path.dirname(__file__), 'data', 'wa1.html'))
    res = lxml.html.tostring(tree)
    # lxml.html.tostring() returns bytes by default, so the expected values
    # must be bytes literals: on Python 3 a ``str in bytes`` test raises
    # TypeError instead of performing the intended substring check.
    assert b"<p> __START_ORG__ Scrapinghub __END_ORG__  has an <b>office</b> in  __START_CITY__ Montevideo __END_CITY__ </p>" in res, res
    assert b"wa-" not in res, res
    assert b"WA-" not in res, res
예제 #6
0
def _assert_entities(fragment, known_entities, expected):
    """Assert that tokenizing ``fragment`` yields the ``expected`` tags.

    ``fragment`` is loaded with a ``WebAnnotatorLoader`` restricted to
    ``known_entities`` and tokenized with ``HtmlTokenizer``. ``expected`` is
    a dict mapping token -> tag that must equal the mapping of every token
    whose tag is not ``'O'``.
    """
    ld = WebAnnotatorLoader(known_entities=known_entities)
    tree = ld.loadbytes(fragment)
    tokenizer = HtmlTokenizer()

    html_tokens, tags = tokenizer.tokenize_single(tree)
    tokens = [html_token.token for html_token in html_tokens]
    # A dict comprehension avoids building a throwaway list of tuples.
    # As before, a token that occurs several times keeps its last tag.
    tagged = {token: tag for token, tag in zip(tokens, tags) if tag != 'O'}
    assert expected == tagged
예제 #7
0
def _assert_entities(fragment, known_entities, expected):
    """Check the non-``'O'`` token tags produced for an annotated fragment.

    The fragment is parsed by a ``WebAnnotatorLoader`` created with the
    given ``known_entities`` and then run through
    ``HtmlTokenizer.tokenize_single``. ``expected`` must equal the
    token -> tag dict built from all tokens whose tag differs from ``'O'``.
    """
    ld = WebAnnotatorLoader(known_entities=known_entities)
    tree = ld.loadbytes(fragment)
    tokenizer = HtmlTokenizer()

    html_tokens, tags = tokenizer.tokenize_single(tree)
    tokens = [html_token.token for html_token in html_tokens]
    # Build the mapping directly instead of passing a temporary list of
    # pairs to dict(); duplicate tokens still resolve to their last tag.
    entity_tags = {token: tag for token, tag in zip(tokens, tags) if tag != 'O'}
    assert expected == entity_tags
예제 #8
0
def test_wa_loader_None_bug():
    """Load ``data/wa2.html`` and check the END marker placement.

    The serialized tree must contain the ORG end marker directly after the
    ``<em>Inc.</em>`` element.
    """
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    fixture = os.path.join(data_dir, 'wa2.html')

    serialized = lxml.html.tostring(WebAnnotatorLoader().load(fixture))

    assert b'<em>Inc.</em> __END_ORG__ </p>' in serialized, serialized
예제 #9
0
def test_wa_loader_None_bug():
    """Check the ``data/wa2.html`` fixture's serialized output.

    The ``__END_ORG__`` marker has to follow the ``<em>Inc.</em>`` element
    inside the paragraph.
    """
    here = os.path.dirname(__file__)
    page_path = os.path.join(here, 'data', 'wa2.html')

    loader = WebAnnotatorLoader()
    output = lxml.html.tostring(loader.load(page_path))

    expected_tail = b'<em>Inc.</em> __END_ORG__ </p>'
    assert expected_tail in output, output