Example #1
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    # Tokenize a Greek .tess file that contains elisions, then unitize the
    # token stream into lines and phrases.
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    # The elision test file should collapse to a single line unit.
    assert len(lines) == 1
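
These tests appear to come from the Tesserae v5 suite; the arguments in the test signatures (unit_connection, tessfiles_greek_path, and the fixtures used in the next example) are pytest fixtures supplied by that suite's conftest and are not shown here. A minimal sketch of the imports the code above relies on follows; the module paths are assumptions about the tesserae package layout, not taken from the source.

# Assumed imports (module paths are guesses at the tesserae package layout):
from tesserae.db.entities import Text        # metadata entity for a source text
from tesserae.tokenizers import GreekTokenizer
from tesserae.unitizer import Unitizer       # splits token streams into lines and phrases
from tesserae.utils import TessFile          # reader for .tess-formatted files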
Example #2
import re


def test_tokenize(token_connection, greek_tessfiles, greek_tokens):
    grc = GreekTokenizer(token_connection)

    for i, tessfile in enumerate(greek_tessfiles):
        print(tessfile.metadata.title)
        tokens, tags, features = grc.tokenize(tessfile.read(),
                                              text=tessfile.metadata)
        # Keep only tokens whose display text contains at least one word character.
        tokens = [t for t in tokens if re.search(r'[\w]+', t.display)]

        for j, token in enumerate(tokens):
            # The rendered surface form should match the expected display text.
            assert token.display == greek_tokens[i][j]['display']
            # The normalized form feature should match the expected form.
            assert token.features['form'].token == greek_tokens[i][j]['form']
            # Every lemma candidate attached to the token should appear in
            # the expected lemmata for that token.
            assert all(
                lemma.token in greek_tokens[i][j]['lemmata']
                for lemma in token.features['lemmata']
            )
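
For reference, the assertions above imply the shape of the greek_tokens fixture: one list of dicts per test file, each dict carrying a token's display text, normalized form, and acceptable lemmata. The entry below is hypothetical and only illustrates that shape.

# Hypothetical illustration of the expected-token fixture shape; the Greek
# values are invented for illustration, not taken from the test data.
greek_tokens_example = [
    [  # tokens for the first .tess file
        {'display': 'μῆνιν', 'form': 'μηνιν', 'lemmata': ['μηνις']},
        {'display': 'ἄειδε', 'form': 'αειδε', 'lemmata': ['αειδω']},
    ],
]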