def test_parse_raw():
    """Raw module-level parsing of _SENT should produce the expected dependencies."""
    check_status(AlpinoParser())
    output = parse_raw(_SENT)
    # Compare as sets of non-empty lines: dependency order is not significant.
    actual = {line for line in output.split("\n") if line}
    expected = {line for line in _PARSE.split("\n") if line}
    assert_equal(actual, expected)
def test_parse():
    """Parsing _SENT through an AlpinoParser instance should match _PARSE."""
    parser = AlpinoParser()
    check_status(parser)
    output = parser.process(_SENT)
    # Order-insensitive comparison over the non-blank dependency lines.
    actual = {line for line in output.split("\n") if line}
    expected = {line for line in _PARSE.split("\n") if line}
    assert_equal(actual, expected)
def test_convert():
    """Converting Alpino output to CSV should yield one row per token.

    Fix: removed a leftover debug ``print(tokens)`` call.
    """
    p = AlpinoParser()
    check_status(p)
    # Parse _SENT and convert the raw output to CSV, tagged with doc id 123.
    s = p.convert(123, p.process(_SENT), "csv")
    tokens = list(csv.DictReader(StringIO(s)))
    assert_equal(len(tokens), 3)
    # The document id should be carried through to every row (as a string).
    assert_equal(tokens[0]['doc'], '123')
    assert_equal(tokens[0]['lemma'], 'Toob')
    # First token's parent should point at the second token's id.
    assert_equal(tokens[0]['parent'], tokens[1]['id'])
def test_process():
    """
    Test CoreNLP processing
    Make sure a corenlp server is listening at port 9000, e.g.:
    docker run -dp 9000:9000 chilland/corenlp-docker
    """
    lemmatizer = CoreNLPLemmatizer()
    check_status(lemmatizer)
    xml = lemmatizer.process("two words")
    # Raw output is CoreNLP XML containing the lemma of "words".
    assert_in("<lemma>word</lemma>", xml)
    # CSV conversion should produce one row per token.
    csv_text = lemmatizer.convert(1, xml, format="csv")
    rows = list(csv.DictReader(StringIO(csv_text)))
    assert_equal(len(rows), 2)
    assert_equal(rows[1]['lemma'], "word")
def test_process():
    """
    Test Frog lemmatizing
    Make sure a frog server is listening at port 9887, e.g.:
    sudo docker run -dp 9887:9887 proycon/lamachine frog -S 9887 --skip=pm

    Fixes: docstring said port 9000 while the docker command binds 9887;
    removed a leftover debug ``print(result)`` call.
    """
    c = FrogLemmatizer()
    check_status(c)
    result = c.process("Nederlandse woordjes")
    # Frog output is already CSV-shaped: one row per token.
    r = list(csv.DictReader(StringIO(result)))
    assert_equal(len(r), 2)
    assert_equal(r[0]["lemma"], "nederlands")
    # "Nederlandse" should be tagged as the start of a location entity.
    assert_equal(r[0]["ner"], "B-LOC")
def test_alpino_unicode():
    "Test what happens with non-ascii characters in input"
    check_status(AlpinoParser())
    sentence = "Bjarnfre\xf0arson leeft"
    # tokenize should convert to utf-8 and only append a final line break here
    assert_equal(tokenize(sentence), sentence + "\n")
def test_tokenize():
    """Tokenization should split punctuation, drop pipes, and break sentences."""
    check_status(AlpinoParser())
    sentence = u"D\xedt is een zin, met komma |nietwaar|? En nog 'n zin"
    # Expect: comma and '?' separated, '|' markers stripped, one line per sentence.
    expected = u"D\xedt is een zin , met komma nietwaar ?\nEn nog 'n zin\n"
    assert_equal(tokenize(sentence), expected)