Example #1
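Constructing a Sentence with tokens and checking its repr/str forms, surface-string lookups for tags with various offsets, sentence-level tags, and adding/removing concepts.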
 def test_creating_sent(self):
     tokens = 'It rains .'.split()
     s = ttl.Sentence('It rains.', tokens=tokens)
     self.assertEqual([t.text for t in s.tokens], tokens)
     self.assertEqual(repr(s), "Sentence('It rains.')")
     self.assertEqual(str(s), s.text)
     self.assertEqual(s.surface(s[0]), 'It')
     self.assertEqual(s.surface(ttl.Tag(cfrom=-1, cto=-1)), '')
     self.assertEqual(s.surface(ttl.Tag(cfrom=-1, cto=3)), '')
     self.assertEqual(s.surface(ttl.Tag(cfrom=3, cto=-1)), '')
     self.assertEqual(s.surface(ttl.Tag(cfrom=None, cto=None)), '')
     s.ID = '1'
     self.assertEqual(repr(s), "Sentence(ID='1', text='It rains.')")
     self.assertEqual(str(s), s.text)
     # tag sentence
     url = 'https://github.com/letuananh/chirptext'
     s.tag.URL = url
     self.assertEqual(s.tag['URL'].text, url)
     self.assertEqual(s.tag['URL'].value, url)
     self.assertEqual(len(s.tags.URL), 1)
     self.assertEqual(list(s.tags.values('URL')), [url])
     # test concepts
     c = s.concepts.new(value='02756558-v', clemma='rain')
     self.assertRaises(Exception, lambda: s.concepts.new(None))
     self.assertRaises(Exception, lambda: s.concepts.new(''))
     c2 = s.concepts.new(value='dummy', clemma='it')
     self.assertEqual(len(s.concepts), 2)
     self.assertRaises(Exception, lambda: s.concepts.remove(3))
     self.assertEqual(s.concepts.remove(c2), c2)
     self.assertEqual(list(s.concepts), [c])
Example #2
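Converting a MeCab parse into a TTL sentence manually: importing tokens, then copying readings, lemmas, and POS tags over.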
 def test_tagged_sentences(self):
     print("test converting MeCabSent into TTL Sent manually")
     sent = ttl.Sentence('猫が好きです。')
     # sent4 and sent4_mecab are module-level fixtures holding the same
     # raw text and its MeCab output string
     mecab_sent = deko.mecab._mecab_output_to_sent(sent4, sent4_mecab)
     # import tokens, then copy tags over in the loop below
     sent._import_tokens(mecab_sent.tokens.values())
     for mtoken, token in zip(mecab_sent, sent.tokens):
         if mtoken.reading_hira != token.text:
             if token.text in ('猫', '好き'):
                 token.tag.reading = mtoken.reading_hira
             token.lemma = mtoken.reading_hira
             token.pos = mtoken.pos3
     self.assertEqual(mecab_sent.tokens.values(),
                      [x.text for x in sent.tokens])
     self.assertEqual(sent[0].tag.reading.value, 'ねこ')
     self.assertEqual(sent[0].lemma, 'ねこ')
     self.assertEqual(sent[2].tag.reading.value, 'すき')  # accessing gold-value
     self.assertFalse(sent[3].lemma)  # if there is no lemma label => ''
     self.assertEqual(sent[2].surface(), '好き')
     self.assertFalse(len(sent[1]))
     self.assertFalse(len(sent[3]))
     self.assertFalse(len(sent[4]))
     return sent
Example #3
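Importing tokens into a sentence: assigning tokens twice or assigning a mismatched token list raises an exception, while calling _import_tokens() explicitly on the remainder works.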
 def test_import_tokens(self):
     sent = ttl.Sentence('It rains.')
     tokens = ['It', 'rains', '.']
     sent.tokens = tokens
     self.assertEqual([t.text for t in sent.tokens], tokens)
     # cannot import twice; import_tokens() is presumably a test-class
     # helper that assigns sent.tokens
     self.assertRaises(Exception, lambda: self.import_tokens(sent, tokens))
     # or import half-way
     sent2 = ttl.Sentence("Cats don't like cats that meow.")
     sent2._import_tokens(('Cats', ))
     tokens2 = "do n't like cats that meow .".split()
     self.assertRaises(Exception, lambda: self.import_tokens(sent2, tokens2))
     sent2._import_tokens(tokens2)  # but calling _import_tokens() explicitly is fine
     self.assertEqual(
         [t.text for t in sent2.tokens],
         ['Cats', 'do', "n't", 'like', 'cats', 'that', 'meow', '.'])
Example #4
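Sentence IDs in a document: a manually assigned ID is kept, and auto-numbering skips IDs that are already taken.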
 def test_sentids(self):
     doc = ttl.Document('boo')
     s = ttl.Sentence('odd', ID=3)
     self.assertEqual(s.ID, "3")
     doc.sents.append(s)  # add sent#3 first
     doc.sents.new('foo')  # 1
     doc.sents.new('boo')  # 2
     moo = doc.sents.new('moo')  # moo will be #4 because sent #3 exists
     self.assertEqual(moo.ID, "4")
     sids = [s.ID for s in doc]
     self.assertEqual(sids, ["3", "1", "2", "4"])
Example #5
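Importing a pre-tokenized word list into a sentence and verifying that the token texts round-trip.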
 def test_tagging_erg_sent(self):
     """ Test import tokens """
     txt = """In this way I am no doubt indirectly responsible for Dr. Grimesby Roylott's death, and I cannot say that it is likely to weigh very heavily upon my conscience." """
     words = [
         'in', 'this', 'way', 'i', 'am', 'no', 'doubt', 'indirectly',
         'responsible', 'for', 'dr.', 'Grimesby', 'Roylott', "'s", 'death',
         ',', 'and', 'i', 'can', 'not', 'say', 'that', 'it', 'is', 'likely',
         'to', 'weigh', 'very', 'heavily', 'upon', 'my', 'conscience', '.',
         '"'
     ]
     s = ttl.Sentence(txt)
     s._import_tokens(words)
     self.assertEqual(words, [x.text for x in s.tokens])
Example #6
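Auto-generated versus manually assigned sentence IDs; duplicate IDs and None sentences are rejected.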
 def test_sentid(self):
     doc = ttl.Document('mydoc')
     sent = doc.sents.new('First sentence.')
     self.assertEqual(sent.ID, "1")
     sent2 = doc.sents.new('Second sentence.')
     self.assertEqual(sent2.ID, "2")
     # add some sentences manually
     sentm1 = ttl.Sentence('Another one', ID=3)
     sentm2 = ttl.Sentence('Another one 2', ID='5')
     doc.sents.append(sentm1)
     doc.sents.append(sentm2)
     doc.sents.new('Third sentence.')
     doc.sents.new('Fourth sentence.')
     sent5 = doc.sents.new('Fifth sentence.')
     self.assertEqual(sent5.ID, "7")
     # cannot add 3 again
     sent_foo = ttl.Sentence('Foo sentence.', ID=3)
     self.assertRaises(Exception, lambda: doc._add_sent_obj(sent_foo))
     # cannot add a None sentence
     self.assertRaises(Exception, lambda: doc._add_sent_obj(None))
     # document should have 5 created sentences + 2 imported sentences
     self.assertEqual(len(doc), 7)
Example #7
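Building a fully annotated Japanese sample sentence: flag, comment, a translation tag, POS tags, a multi-token concept, and per-token readings.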
 def build_test_sent(self):
     sent = ttl.Sentence(sent1)  # sent1: module-level fixture, presumably '三毛猫が好きです。'
     sent.flag = '0'
     sent.comment = 'written in Japanese'
     sent.tags.new('I like calico cats.', 'eng')
     sent._import_tokens('三 毛 猫 が 好き です 。'.split())
     for tk, pos in zip(sent, '名詞 名詞 名詞 助詞 名詞 助動詞 記号'.split()):
         tk.pos = pos
     sent.concepts.new("三毛猫", "wiki", "wiki.ja:三毛猫", tokens=[0, 1, 2])
     sent[0].tags.new('mi', type='reading')
     sent[1].tags.new('ke', type='reading')
     sent[2].tag.reading = 'neko'
     getLogger().debug(sent.to_dict())
     return sent
Example #8
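Converting a TTLIG (interlinear gloss) record to a TTL sentence: known sentence-level labels become tags, and furigana, morphophonemic transliteration, POS, lemma, and gloss lines are attached token by token when their lengths match.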
 def to_ttl(self):
     ttl_sent = ttl.Sentence(text=self.text)
     data = self.to_dict()
     for label in TTLIG.KNOWN_LABELS:
         if (label not in ['text', 'orth', 'tokens']
                 and label in data and data[label]):
             ttl_sent.tags.new(data[label], type=label)
     if self.tokens:
         _tokens = parse_ruby(self.tokens)
         ttl_sent.tokens = (t.text() for t in _tokens)
         for ttl_token, furi_token in zip(ttl_sent, _tokens):
             if furi_token.surface != furi_token.text():
                 ttl_token.tags.new(furi_token.surface, type='furi')
         if self.morphtrans:
             _morphtokens = tokenize(self.morphtrans)
             if len(_morphtokens) != len(ttl_sent):
                 logging.getLogger(__name__).warning("Morphophonemic transliteration line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
             else:
                 for t, m in zip(ttl_sent, _morphtokens):
                     t.tags.new(m, type='mtrans')
         if self.pos:
             _postokens = tokenize(self.pos)
             if len(_postokens) != len(ttl_sent):
                 logging.getLogger(__name__).warning("Part-of-speech line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
             else:
                 for t, m in zip(ttl_sent, _postokens):
                     t.pos = m
         if self.lemma:
             _lemmas = tokenize(self.lemma)
             if len(_lemmas) != len(ttl_sent):
                 logging.getLogger(__name__).warning("Lemma line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
             else:
                 for t, m in zip(ttl_sent, _lemmas):
                     t.lemma = m
         if self.morphgloss:
             _glosstokens = tokenize(self.morphgloss)
             if len(_glosstokens) != len(ttl_sent):
                 logging.getLogger(__name__).warning("morpheme-by-morpheme gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
             else:
                 for t, m in zip(ttl_sent, _glosstokens):
                     t.tags.new(m, type='mgloss')
         if self.wordgloss:
             _glosstokens = tokenize(self.wordgloss)
             if len(_glosstokens) != len(ttl_sent):
                 logging.getLogger(__name__).warning("word-by-word gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
             else:
                 for t, m in zip(ttl_sent, _glosstokens):
                     t.tags.new(m, type='wgloss')
     return ttl_sent
Example #9
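Recovering the original surface string from token offsets (cfrom/cto) after importing tokens.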
 def test_recover_surface_string(self):
     s = ttl.Sentence(
         """a religious sect founded in the United States in 1966; based on Vedic scriptures; groups engage in joyful chanting of `Hare Krishna' and other mantras based on the name of the Hindu god Krishna; devotees usually wear saffron robes and practice vegetarianism and celibacy"""
     )
     tokens = [
         'a', 'religious', 'sect', 'founded', 'in', 'the', 'United',
         'States', 'in', '1966', ';', 'based', 'on', 'Vedic', 'scriptures',
         ';', 'groups', 'engage', 'in', 'joyful', 'chanting', 'of', 'Hare',
         'Krishna', 'and', 'other', 'mantras', 'based', 'on', 'the', 'name',
         'of', 'the', 'Hindu', 'god', 'Krishna', ';', 'devotees', 'usually',
         'wear', 'saffron', 'robes', 'and', 'practice', 'vegetarianism',
         'and', 'celibacy'
     ]
     s._import_tokens(tokens)
     cfrom = min(x.cfrom for x in s.tokens)
     cto = max(x.cto for x in s.tokens)
     self.assertEqual(s.text, s.text[cfrom:cto])
Example #10
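Comments on sentences, tokens, and concepts survive to_dict() serialization; sentence tags can also carry character offsets.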
 def test_comment(self):
     sent = ttl.Sentence("Dogs bark.")
     sent._import_tokens("Dogs bark .".split())
     sent.comment = 'I am a test sentence.'
     sent[0].comment = "canine"
     sent.concepts.new("02084071-n", clemma="dog", tokens=(sent[0],))
     list(sent.concepts)[0].comment = (
         'a member of the genus Canis (probably descended from the common '
         'wolf) that has been domesticated by man since prehistoric times')
     expected = {
         'text': 'Dogs bark.',
         'comment': 'I am a test sentence.',
         'tokens': [
             {'cto': 4, 'cfrom': 0, 'comment': 'canine', 'text': 'Dogs'},
             {'cto': 9, 'cfrom': 5, 'text': 'bark'},
             {'cto': 10, 'cfrom': 9, 'text': '.'}
         ],
         'concepts': [{
             'value': '02084071-n',
             'clemma': 'dog',
             'comment': 'a member of the genus Canis (probably descended '
                        'from the common wolf) that has been domesticated '
                        'by man since prehistoric times',
             'tokens': [0]
         }]
     }
     getLogger().debug(sent.to_dict())
     getLogger().debug(expected)
     self.assertEqual(expected, sent.to_dict())
     self.assertFalse(sent.tags)
     sent.tags.new(GDOG_SID, 'wn30', 0, 4)
     sent.tag.wn30 = {"value": BARK_SID, "cfrom": 5, "cto": 9}
     for t in sent.tags:
         getLogger().debug("{}: label={} | type = {}".format(t, t.value, t.type))
Example #11
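Converting a sensekey-annotated sentence into a TTL sentence: token-level tags are copied over, and sensekeys are mapped to WordNet synset IDs via an optional cache (sk_map) or WordNet context (wnctx).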
def to_ttl(sent, with_nonsense=True, sk_map=None, wnctx=None):
    tokens = sent['tokens']
    text = detokenize(tokens)
    s = ttl.Sentence(text=text)
    s.new_tag(sent['sid'], tagtype='origid')
    s.import_tokens((t.text for t in tokens))
    for tinfo, tk in zip(tokens, s):
        for k, v in tinfo.data:
            if (k, v) == ('tag', 'wf') or k == 'sk':
                continue
            if k == 'lemma':
                tk.lemma = v
            elif k == 'pos':
                tk.pos = v
            else:
                tk.new_tag(label=v, tagtype=k)
        # if sensekey exists, add it as a concept
        lemma = tinfo.lemma
        sk = fix_sensekey(tinfo.get('sk'))
        rdf = tinfo.get('rdf')
        comment = None
        if sk and (with_nonsense or not is_nonsense(lemma, sk, rdf)):
            sensetag = sk
            if sk_map is not None and sk in sk_map:
                sensetag = sk_map[sk]
            elif wnctx is not None:
                # try to determine synsetID
                ss = wnctx.senses.select_single('sensekey=?', (sk, ))
                if ss is not None:
                    sid = str(SynsetID.from_string(ss.synsetid))
                    if sk_map is not None:
                        sk_map[sk] = sid
                        sensetag = sid
                else:
                    # sensekey not found
                    getLogger().warning(
                        "There is no synsetID with sensekey={} | rdf={}".format(
                            sk, rdf))
                    comment = 'sensekey'
            s.new_concept(clemma=lemma,
                          tag=sensetag,
                          tokens=(tk, ),
                          comment=comment)
    return s
Example #12
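Typed tags on tokens and sentences: listing tag values by type, looking up a single tag, and auto-creating one with get_or_create().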
 def test_tag_type_and_searching(self):
     taggable_objects = [
         ttl.Token("text", 0, 4),
         ttl.Sentence('I am a sentence.')
     ]
     for obj in taggable_objects:
         obj.tags.new("06387980-n", type="synset")
         obj.tags.new("06414372-n", type="synset")
         obj.tags.new("manual", type="tagtype")
         # find all values by types
         synsets = list(obj.tags.values("synset"))
         self.assertEqual(synsets, ["06387980-n", "06414372-n"])
         # find a specific tag
         self.assertEqual(obj.tag.tagtype.text, "manual")
         self.assertEqual(obj.tag['tagtype'].text, "manual")
         # auto-create a tag with a default value if it does not exist yet
         self.assertEqual(
             obj.tag.get_or_create('meaning', default='N/A').value, "N/A")
         self.assertEqual(obj.tag.meaning.text, "N/A")
Example #13
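Serializing a fully annotated Japanese sentence (lemmas, POS tags, comments, concepts) to a JSON-ready dictionary.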
 def test_tagged_sent_to_json(self):
     sent = ttl.Sentence("女の子は猫が好きです。")
     sent._import_tokens("女 の 子 は 猫 が 好き です 。".split())
     sent[0].lemma = "おんな"
     sent[2].lemma = "こ"
     sent[4].lemma = "ねこ"
     sent[4].comment = "Say neh-koh"
     sent[4].pos = "名詞-一般"
     sent[6].lemma = "すき"
     sent[6].pos = "名詞-形容動詞語幹"
     c = sent.concepts.new("10084295-n",
                           "wn",
                           clemma="女の子",
                           tokens=(sent[0], sent[1], sent[2]))
     sent.concept.wn.comment = "若々しい女の人"  # set comment for gold wn concept, which is c
     self.assertEqual(c.comment, "若々しい女の人")
     sent.concepts.new("02121620-n", clemma="猫").tokens.append(sent[4])
     sent.concepts.new("01292683-a", clemma="好き").tokens.append(sent[6])
     expected = {
         'text': '女の子は猫が好きです。',
         'tokens': [
             {'cfrom': 0, 'cto': 1, 'lemma': 'おんな', 'text': '女'},
             {'cfrom': 1, 'cto': 2, 'text': 'の'},
             {'cfrom': 2, 'cto': 3, 'lemma': 'こ', 'text': '子'},
             {'cfrom': 3, 'cto': 4, 'text': 'は'},
             {'pos': '名詞-一般', 'cfrom': 4, 'cto': 5, 'lemma': 'ねこ',
              'text': '猫', 'comment': 'Say neh-koh'},
             {'cfrom': 5, 'cto': 6, 'text': 'が'},
             {'pos': '名詞-形容動詞語幹', 'cfrom': 6, 'cto': 8,
              'lemma': 'すき', 'text': '好き'},
             {'cfrom': 8, 'cto': 10, 'text': 'です'},
             {'cfrom': 10, 'cto': 11, 'text': '。'}
         ],
         'concepts': [
             {'value': '10084295-n', 'tokens': [0, 1, 2], 'type': 'wn',
              'clemma': '女の子', 'comment': '若々しい女の人'},
             {'value': '02121620-n', 'tokens': [4], 'clemma': '猫'},
             {'value': '01292683-a', 'tokens': [6], 'clemma': '好き'}
         ]
     }
     actual = sent.to_dict()
     self.assertEqual(expected['text'], actual['text'])
     self.assertEqual(expected['concepts'], actual['concepts'])
     self.assertEqual(expected['tokens'], actual['tokens'])
     self.assertEqual(expected, actual)
     getLogger().debug(actual)
Example #14
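Extra constructor fields such as docID and comment are stored on the sentence.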
 def test_extra_fields(self):
     cmt = 'This sentence is in English'
     s = ttl.Sentence(text='I am a sentence.', docID=1, comment=cmt)
     self.assertEqual(s.docID, 1)
     self.assertEqual(s.comment, cmt)