def tag(fi_tok, zh_tok, zh_untok, align):
    """Tag a Finnish/Chinese sentence pair and link supports across them.

    Parameters
    ----------
    fi_tok : tokenised Finnish text, passed to the Finnish extractor
    zh_tok : tokenised Chinese text
    zh_untok : untokenised Chinese text
    align : alignment between the two sides, forwarded to ``add_supports``

    Returns
    -------
    tuple
        ``(fi_tagging, zh_tagging)`` with tag ids assigned consecutively
        across both taggings.
    """
    fi_tagging = get_extractor("FinExtractor").extract(fi_tok)
    zh_tagging = get_extractor("CmnExtractor").extract(zh_untok, zh_tok)
    # Number tags globally across both languages. Use `tag_id`, not `id`,
    # to avoid shadowing the builtin.
    for tag_id, (_token, tag) in enumerate(
            chain(fi_tagging.iter_tags(), zh_tagging.iter_tags())):
        tag.id = tag_id
    add_supports(fi_tagging, zh_tagging, align)
    return fi_tagging, zh_tagging
def test_heraa_hetkeksi():
    """A bare "hetkeksi" gives one token; in a sentence, the lemma
    "hetki" is still extracted exactly once."""
    extractor = get_extractor("FinExtractor")
    single = extractor.extract("hetkeksi")
    assert len(single.tokens) == 1
    sentence = extractor.extract("Herää hetkeksi")
    assert len(_filter_toks(sentence, "hetki")) == 1
def test_extract_zh_friend():
    """朋友 is extracted exactly once, with two anchors and four tags."""
    surface = "朋友"
    tagging = get_extractor("CmnExtractor").extract(surface, surface)
    matches = [tok for tok in tagging.tokens if tok.token == "朋友"]
    # Exactly one matching token is expected.
    assert len(matches) == 1
    (friend_tok,) = matches
    assert len(friend_tok.anchors) == 2
    assert len(friend_tok.tags) == 4
def test_hyvaa():
    """Per-wordnet lemma-link counts for the token "Hyvää" meet minimums."""
    tagging = get_extractor("FinExtractor").extract("Hyvää !")
    candidates = [tok for tok in tagging.tokens if tok.token == "Hyvää"]
    assert candidates
    # Original code kept the last matching token; preserve that.
    matching_token = candidates[-1]
    wordnet_counts = {"fin": 0, "qf2": 0, "qwf": 0}
    # Plain dict indexing: an unexpected wordnet key should fail loudly.
    for tag in matching_token.tags:
        for wn, _lemma in tag.lemma_objs:
            wordnet_counts[wn] += 1
    assert wordnet_counts["fin"] >= 27
    assert wordnet_counts["qf2"] >= 25
    assert wordnet_counts["qwf"] >= 7
def test_extract_zh_sincere_congrats_dave():
    """Smoke test: extracting this tokenised/untokenised Chinese pair
    completes without raising."""
    tokenised = "真诚地 , 大卫 。 恭喜 你 。"
    untokenised = "真诚地,大卫。 恭喜你。"
    extractor = get_extractor("CmnExtractor")
    extractor.extract(untokenised, tokenised)
def test_extract_zh_untok_sincere():
    """``extract_untok`` on 真诚地 passes the shared sincere assertions."""
    extractor = get_extractor("CmnExtractor")
    sincere_asserts(extractor.extract_untok("真诚地"))
def test_extract_fin_murhamies_has_murha_and_mies():
    """The compound "murhamies" yields each of its parts exactly once."""
    tagging = get_extractor("FinExtractor").extract("murhamies")
    # Same checks as before, in the same order: murha first, then mies.
    for part in ("murha", "mies"):
        assert len(_filter_toks(tagging, part)) == 1
def test_extract_fin_ei_koskaan():
    """The multiword "ei_koskaan" is found once or twice in the sentence."""
    tagging = get_extractor("FinExtractor").extract(
        "Älä koskaan sano mitään tuollaista hänestä !")
    hits = _filter_toks(tagging, "ei_koskaan")
    assert len(hits) in (1, 2)
def test_extract_fin_saada_aikaan():
    """The multiword "saada_aikaan" is found at least once."""
    tagging = get_extractor("FinExtractor").extract(
        "Katso , mitä olet saanut aikaan .")
    hits = _filter_toks(tagging, "saada_aikaan")
    assert len(hits) >= 1
def test_open_brace():
    """No tag anywhere in a bracket-containing sentence has an empty lemma."""
    tagging = get_extractor("FinExtractor").extract("[ sillä on tapansa !")
    assert all(
        tag.lemma != ""
        for token in tagging.tokens
        for tag in token.tags
    )
def test_gordon():
    """The first token's first anchor sits at character offset 7."""
    tagging = get_extractor("FinExtractor").extract(
        "Gordon on jossain täällä .")
    first_anchor = tagging.tokens[0].anchors[0]
    assert first_anchor.char == 7