# NOTE: import paths are assumed from the chirptext project layout
# (https://github.com/letuananh/chirptext); adjust them if the package differs.
import logging
from logging import getLogger

from chirptext import deko
from chirptext import ttl


def test_creating_sent(self):
    tokens = 'It rains .'.split()
    s = ttl.Sentence('It rains.', tokens=tokens)
    self.assertEqual([t.text for t in s.tokens], tokens)
    self.assertEqual(repr(s), "Sentence('It rains.')")
    self.assertEqual(str(s), s.text)
    self.assertEqual(s.surface(s[0]), 'It')
    self.assertEqual(s.surface(ttl.Tag(cfrom=-1, cto=-1)), '')
    self.assertEqual(s.surface(ttl.Tag(cfrom=-1, cto=3)), '')
    self.assertEqual(s.surface(ttl.Tag(cfrom=3, cto=-1)), '')
    self.assertEqual(s.surface(ttl.Tag(cfrom=None, cto=None)), '')
    s.ID = '1'
    self.assertEqual(repr(s), "Sentence(ID='1', text='It rains.')")
    self.assertEqual(str(s), s.text)
    # tag the sentence
    url = 'https://github.com/letuananh/chirptext'
    s.tag.URL = url
    self.assertEqual(s.tag['URL'].text, url)
    self.assertEqual(s.tag['URL'].value, url)
    self.assertEqual(len(s.tags.URL), 1)
    self.assertEqual(list(s.tags.values('URL')), [url])
    # test concepts
    c = s.concepts.new(value='02756558-v', clemma='rain')
    self.assertRaises(Exception, lambda: s.concepts.new(None))
    self.assertRaises(Exception, lambda: s.concepts.new(''))
    c2 = s.concepts.new(value='dummy', clemma='it')
    self.assertEqual(len(s.concepts), 2)
    # removing a non-existent concept must raise
    self.assertRaises(Exception, lambda: s.concepts.remove(3))
    self.assertEqual(s.concepts.remove(c2), c2)
    self.assertEqual(list(s.concepts), [c])

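# ---------------------------------------------------------------------------
# Shared fixtures referenced by the tests below. sent1, sent4, sent4_mecab,
# GDOG_SID and BARK_SID are used but never defined in this excerpt, so the
# values here are assumptions sketched for illustration: the sentence texts
# are inferred from the assertions, sent4_mecab follows MeCab's standard
# output format, and the two synset IDs are placeholders only.
# ---------------------------------------------------------------------------
sent1 = '三毛猫が好きです。'
sent4 = '猫が好きです 。'
sent4_mecab = '\n'.join([
    '猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
    'が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ',
    '好き\t名詞,形容動詞語幹,*,*,*,*,好き,スキ,スキ',
    'です\t助動詞,*,*,*,特殊・デス,基本形,です,デス,デス',
    '。\t記号,句点,*,*,*,*,。,。,。',
    'EOS',
])
GDOG_SID = '02103406-n'  # placeholder synset ID for the 'Dogs' tag below
BARK_SID = '01047745-v'  # placeholder synset ID for the 'bark' tag below
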
def test_tagged_sentences(self):
    print("test converting MeCabSent into TTL Sent manually")
    sent = ttl.Sentence('猫が好きです 。')
    mecab_sent = deko.mecab._mecab_output_to_sent(sent4, sent4_mecab)
    # import tokens, then copy readings, lemmas and POS over
    sent._import_tokens(mecab_sent.tokens.values())
    for mtoken, token in zip(mecab_sent, sent.tokens):
        if mtoken.reading_hira != token.text:
            if token.text in ('猫', '好き'):
                token.tag.reading = mtoken.reading_hira
            token.lemma = mtoken.reading_hira
            token.pos = mtoken.pos3
    self.assertEqual(mecab_sent.tokens.values(), [x.text for x in sent.tokens])
    self.assertEqual(sent[0].tag.reading.value, 'ねこ')
    self.assertEqual(sent[0].lemma, 'ねこ')
    self.assertEqual(sent[2].tag.reading.value, 'すき')  # accessing the gold value
    self.assertFalse(sent[3].lemma)  # if there is no lemma label => return ''
    self.assertEqual(sent[2].surface(), '好き')
    self.assertFalse(len(sent[1]))
    self.assertFalse(len(sent[3]))
    self.assertFalse(len(sent[4]))
    return sent

def test_import_tokens(self):
    sent = ttl.Sentence('It rains.')
    tokens = ['It', 'rains', '.']
    sent.tokens = tokens
    self.assertEqual([t.text for t in sent.tokens], tokens)

    def import_tokens(a_sent, a_tokens):
        # helper: import tokens via the tokens setter
        a_sent.tokens = a_tokens

    # cannot import twice
    self.assertRaises(Exception, lambda: import_tokens(sent, tokens))
    # or import half-way
    sent2 = ttl.Sentence("Cats don't like cats that meow.")
    sent2._import_tokens(('Cats',))
    tokens2 = "do n't like cats that meow .".split()
    self.assertRaises(Exception, lambda: import_tokens(sent2, tokens2))
    # but calling _import_tokens explicitly is fine
    sent2._import_tokens(tokens2)
    self.assertEqual([t.text for t in sent2.tokens],
                     ['Cats', 'do', "n't", 'like', 'cats', 'that', 'meow', '.'])

def test_sentids(self):
    doc = ttl.Document('boo')
    s = ttl.Sentence('odd', ID=3)
    self.assertEqual(s.ID, "3")
    doc.sents.append(s)   # add sent #3 first
    doc.sents.new('foo')  # 1
    doc.sents.new('boo')  # 2
    moo = doc.sents.new('moo')  # moo will be #4 because sent #3 exists
    self.assertEqual(moo.ID, "4")
    sids = [s.ID for s in doc]
    self.assertEqual(sids, ["3", "1", "2", "4"])

def test_tagging_erg_sent(self):
    """Test importing ERG tokens"""
    txt = """In this way I am no doubt indirectly responsible for Dr. Grimesby Roylott's death, and I cannot say that it is likely to weigh very heavily upon my conscience." """
    words = ['in', 'this', 'way', 'i', 'am', 'no', 'doubt', 'indirectly',
             'responsible', 'for', 'dr.', 'Grimesby', 'Roylott', "'s",
             'death', ',', 'and', 'i', 'can', 'not', 'say', 'that', 'it',
             'is', 'likely', 'to', 'weigh', 'very', 'heavily', 'upon', 'my',
             'conscience', '.', '"']
    s = ttl.Sentence(txt)
    s._import_tokens(words)
    self.assertEqual(words, [x.text for x in s.tokens])

def test_sentid(self):
    doc = ttl.Document('mydoc')
    sent = doc.sents.new('First sentence.')
    self.assertEqual(sent.ID, "1")
    sent2 = doc.sents.new('Second sentence.')
    self.assertEqual(sent2.ID, "2")
    # add some sentences manually
    sentm1 = ttl.Sentence('Another one', ID=3)
    sentm2 = ttl.Sentence('Another one 2', ID='5')
    doc.sents.append(sentm1)
    doc.sents.append(sentm2)
    doc.sents.new('Third sentence.')
    doc.sents.new('Fourth sentence.')
    sent5 = doc.sents.new('Fifth sentence.')
    self.assertEqual(sent5.ID, "7")
    # cannot add #3 again
    sent_foo = ttl.Sentence('Foo sentence.', ID=3)
    self.assertRaises(Exception, lambda: doc._add_sent_obj(sent_foo))
    # cannot add a None sentence
    self.assertRaises(Exception, lambda: doc._add_sent_obj(None))
    # document should have 5 created sentences + 2 imported sentences
    self.assertEqual(len(doc), 7)

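# Note on test_sentids and test_sentid above: the assertions imply that
# doc.sents.new() assigns the smallest unused positive integer (stored as a
# string) as the new sentence ID, skipping IDs already claimed by manually
# appended sentences. This is inferred from the expected values, not from
# documented chirptext behaviour.
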
def build_test_sent(self):
    sent = ttl.Sentence(sent1)
    sent.flag = '0'
    sent.comment = 'written in Japanese'
    sent.tags.new('I like calico cats.', 'eng')
    sent._import_tokens('三 毛 猫 が 好き です 。'.split())
    for tk, pos in zip(sent, '名詞 名詞 名詞 助詞 名詞 助動詞 記号'.split()):
        tk.pos = pos
    sent.concepts.new("三毛猫", "wiki", "wiki.ja:三毛猫", tokens=[0, 1, 2])
    sent[0].tags.new('mi', type='reading')
    sent[1].tags.new('ke', type='reading')
    sent[2].tag.reading = 'neko'
    getLogger().debug(sent.to_dict())
    return sent

def to_ttl(self):
    ttl_sent = ttl.Sentence(text=self.text)
    data = self.to_dict()
    for l in TTLIG.KNOWN_LABELS:
        if l not in ['text', 'orth', 'tokens'] and l in data and data[l]:
            ttl_sent.tags.new(data[l], type=l)
    if self.tokens:
        _tokens = parse_ruby(self.tokens)
        ttl_sent.tokens = (t.text() for t in _tokens)
        for ttl_token, furi_token in zip(ttl_sent, _tokens):
            if furi_token.surface != furi_token.text():
                ttl_token.tags.new(furi_token.surface, type='furi')
    sent_ident = self.ident or self.ID or self.Id or self.id or self.text

    def _tag_tokens(line, line_name, apply_func):
        """Tokenize a token-aligned annotation line and apply each value to
        the matching token; warn instead when the counts are mismatched."""
        _values = tokenize(line)
        if len(_values) != len(ttl_sent):
            logging.getLogger(__name__).warning(
                "{} line and tokens line are mismatched for sentence: {}".format(
                    line_name, sent_ident))
        else:
            for _token, _value in zip(ttl_sent, _values):
                apply_func(_token, _value)

    if self.morphtrans:
        _tag_tokens(self.morphtrans, "Morphophonemic transliteration",
                    lambda t, m: t.tags.new(m, type='mtrans'))
    if self.pos:
        _tag_tokens(self.pos, "Part-of-speech",
                    lambda t, m: setattr(t, 'pos', m))
    if self.lemma:
        _tag_tokens(self.lemma, "Lemma",
                    lambda t, m: setattr(t, 'lemma', m))
    if self.morphgloss:
        _tag_tokens(self.morphgloss, "Morpheme-by-morpheme gloss",
                    lambda t, m: t.tags.new(m, type='mgloss'))
    if self.wordgloss:
        _tag_tokens(self.wordgloss, "Word-by-word gloss",
                    lambda t, m: t.tags.new(m, type='wgloss'))
    return ttl_sent

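# Hypothetical usage sketch for TTLIG.to_ttl() above. The record construction
# below is an assumption for illustration (the real TTLIG parsing API is not
# shown in this excerpt); only to_ttl() itself is defined here:
#
#     rec = TTLIG()                      # an interlinear-gloss record
#     rec.text = '猫が好きです。'
#     rec.tokens = '猫 が 好き です 。'
#     rec.lemma = '猫 が 好き です 。'
#     ttl_sent = rec.to_ttl()            # ttl.Sentence with per-token lemmas
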
def test_recover_surface_string(self):
    s = ttl.Sentence(
        """a religious sect founded in the United States in 1966; based on Vedic scriptures; groups engage in joyful chanting of `Hare Krishna' and other mantras based on the name of the Hindu god Krishna; devotees usually wear saffron robes and practice vegetarianism and celibacy""")
    tokens = ['a', 'religious', 'sect', 'founded', 'in', 'the', 'United',
              'States', 'in', '1966', ';', 'based', 'on', 'Vedic',
              'scriptures', ';', 'groups', 'engage', 'in', 'joyful',
              'chanting', 'of', 'Hare', 'Krishna', 'and', 'other', 'mantras',
              'based', 'on', 'the', 'name', 'of', 'the', 'Hindu', 'god',
              'Krishna', ';', 'devotees', 'usually', 'wear', 'saffron',
              'robes', 'and', 'practice', 'vegetarianism', 'and', 'celibacy']
    s._import_tokens(tokens)
    cfrom = min(x.cfrom for x in s.tokens)
    cto = max(x.cto for x in s.tokens)
    self.assertEqual(s.text, s.text[cfrom:cto])

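# The test above suggests that _import_tokens() aligns each token to the raw
# text and records character offsets (cfrom/cto), so surface strings can be
# recovered by slicing even when some characters (here, the quote marks around
# `Hare Krishna') never appear as tokens. This reading is inferred from the
# assertions, not from documented behaviour.
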
def test_comment(self):
    sent = ttl.Sentence("Dogs bark.")
    sent._import_tokens("Dogs bark .".split())
    sent.comment = 'I am a test sentence.'
    sent[0].comment = "canine"
    sent.concepts.new("02084071-n", clemma="dog", tokens=(sent[0],))
    list(sent.concepts)[0].comment = 'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times'
    expected = {
        'text': 'Dogs bark.',
        'comment': 'I am a test sentence.',
        'tokens': [
            {'cfrom': 0, 'cto': 4, 'comment': 'canine', 'text': 'Dogs'},
            {'cfrom': 5, 'cto': 9, 'text': 'bark'},
            {'cfrom': 9, 'cto': 10, 'text': '.'}],
        'concepts': [
            {'value': '02084071-n', 'clemma': 'dog',
             'comment': 'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times',
             'tokens': [0]}]}
    getLogger().debug(sent.to_dict())
    getLogger().debug(expected)
    self.assertEqual(expected, sent.to_dict())
    self.assertFalse(sent.tags)
    sent.tags.new(GDOG_SID, 'wn30', 0, 4)
    sent.tag.wn30 = {"value": BARK_SID, "cfrom": 5, "cto": 9}
    for t in sent.tags:
        getLogger().debug("{}: label={} | type = {}".format(t, t.value, t.type))

def to_ttl(sent, with_nonsense=True, sk_map=None, wnctx=None):
    tokens = sent['tokens']
    text = detokenize(tokens)
    s = ttl.Sentence(text=text)
    s.new_tag(sent['sid'], tagtype='origid')
    s.import_tokens((t.text for t in tokens))
    for tinfo, tk in zip(tokens, s):
        for k, v in tinfo.data:
            if (k, v) == ('tag', 'wf') or k == 'sk':
                continue
            if k == 'lemma':
                tk.lemma = v
            elif k == 'pos':
                tk.pos = v
            else:
                tk.new_tag(label=v, tagtype=k)
        # if a sensekey exists, add it as a concept
        lemma = tinfo.lemma
        sk = fix_sensekey(tinfo.get('sk'))
        rdf = tinfo.get('rdf')
        comment = None
        if sk and (with_nonsense or not is_nonsense(lemma, sk, rdf)):
            sensetag = sk
            if sk_map is not None and sk in sk_map:
                sensetag = sk_map[sk]
            elif wnctx is not None:
                # try to determine the synset ID
                ss = wnctx.senses.select_single('sensekey=?', (sk,))
                if ss is not None:
                    sid = str(SynsetID.from_string(ss.synsetid))
                    if sk_map is not None:
                        sk_map[sk] = sid
                    sensetag = sid
                else:
                    # sensekey not found
                    getLogger().warning(
                        "There is no synsetID with sensekey={} | rdf={}".format(sk, rdf))
                    comment = 'sensekey'
            s.new_concept(clemma=lemma, tag=sensetag, tokens=(tk,), comment=comment)
    return s

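# Sketch of the input shape that to_ttl() above appears to expect, inferred
# from its attribute and key accesses rather than a documented contract:
# `sent` behaves like {'sid': ..., 'tokens': [...]} where each token exposes
# .text, .lemma, .data (an iterable of (key, value) pairs), and .get('sk') /
# .get('rdf') for the WordNet sensekey and RDF type.
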
def test_tag_type_and_searching(self):
    taggable_objects = [ttl.Token("text", 0, 4),
                        ttl.Sentence('I am a sentence.')]
    for obj in taggable_objects:
        obj.tags.new("06387980-n", type="synset")
        obj.tags.new("06414372-n", type="synset")
        obj.tags.new("manual", type="tagtype")
        # find all values by type
        synsets = list(obj.tags.values("synset"))
        self.assertEqual(synsets, ["06387980-n", "06414372-n"])
        # find a specific tag
        self.assertEqual(obj.tag.tagtype.text, "manual")
        self.assertEqual(obj.tag['tagtype'].text, "manual")
        # auto-create a tag when it is missing
        self.assertEqual(obj.tag.get_or_create('meaning', default='N/A').value, "N/A")
        self.assertEqual(obj.tag.meaning.text, "N/A")

def test_tagged_sent_to_json(self):
    sent = ttl.Sentence("女の子は猫が好きです。")
    sent._import_tokens("女 の 子 は 猫 が 好き です 。".split())
    sent[0].lemma = "おんな"
    sent[2].lemma = "こ"
    sent[4].lemma = "ねこ"
    sent[4].comment = "Say neh-koh"
    sent[4].pos = "名詞-一般"
    sent[6].lemma = "すき"
    sent[6].pos = "名詞-形容動詞語幹"
    c = sent.concepts.new("10084295-n", "wn", clemma="女の子",
                          tokens=(sent[0], sent[1], sent[2]))
    sent.concept.wn.comment = "若々しい女の人"  # set comment for gold wn concept, which is c
    self.assertEqual(c.comment, "若々しい女の人")
    sent.concepts.new("02121620-n", clemma="猫").tokens.append(sent[4])
    sent.concepts.new("01292683-a", clemma="好き").tokens.append(sent[6])
    expected = {
        'text': '女の子は猫が好きです。',
        'tokens': [
            {'cfrom': 0, 'cto': 1, 'lemma': 'おんな', 'text': '女'},
            {'cfrom': 1, 'cto': 2, 'text': 'の'},
            {'cfrom': 2, 'cto': 3, 'lemma': 'こ', 'text': '子'},
            {'cfrom': 3, 'cto': 4, 'text': 'は'},
            {'cfrom': 4, 'cto': 5, 'lemma': 'ねこ', 'pos': '名詞-一般',
             'text': '猫', 'comment': 'Say neh-koh'},
            {'cfrom': 5, 'cto': 6, 'text': 'が'},
            {'cfrom': 6, 'cto': 8, 'lemma': 'すき', 'pos': '名詞-形容動詞語幹',
             'text': '好き'},
            {'cfrom': 8, 'cto': 10, 'text': 'です'},
            {'cfrom': 10, 'cto': 11, 'text': '。'}],
        'concepts': [
            {'value': '10084295-n', 'type': 'wn', 'clemma': '女の子',
             'comment': '若々しい女の人', 'tokens': [0, 1, 2]},
            {'value': '02121620-n', 'clemma': '猫', 'tokens': [4]},
            {'value': '01292683-a', 'clemma': '好き', 'tokens': [6]}]}
    actual = sent.to_dict()
    self.assertEqual(expected['text'], actual['text'])
    self.assertEqual(expected['concepts'], actual['concepts'])
    self.assertEqual(expected['tokens'], actual['tokens'])
    self.assertEqual(expected, actual)
    getLogger().debug(actual)

def test_extra_fields(self):
    cmt = 'This sentence is in English'
    s = ttl.Sentence(text='I am a sentence.', docID=1, comment=cmt)
    self.assertEqual(s.docID, 1)
    self.assertEqual(s.comment, cmt)