Example #1
    def test_create_triples(self):
        from amcat.models.token import TripleValues, TokenValues
        s = amcattest.create_test_analysis_sentence()
        tokens = [
            TokenValues(s.id,
                        0,
                        word="a",
                        lemma="l",
                        pos="p",
                        major="major",
                        minor="minor",
                        namedentity=None),
            TokenValues(s.id,
                        1,
                        word="b",
                        lemma="l",
                        pos="p",
                        major="major",
                        minor="minor",
                        namedentity=None)
        ]
        t = TripleValues(s.id, 0, 1, "su")
        result_tokens, result_triples, corefsets = store_analysis(
            s.analysed_article, tokens, [t])
        tr, = Triple.objects.filter(parent__sentence=s)
        self.assertEqual(tr.relation.label, t.relation)
        self.assertEqual(tr.child.word.word, "a")

        for tokenvalue, token in result_tokens.items():
            self.assertEqual(tokenvalue.position, token.position)
            self.assertEqual(tokenvalue.lemma, token.word.lemma.lemma)
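Note: across these examples, TokenValues and TripleValues are plain namedtuples; the TokenValues field list is quoted verbatim in Example #8 below, and the TripleValues field names can be inferred from the attribute access in Example #10. A minimal reconstruction:

from collections import namedtuple

# Field list quoted in Example #8; some older snippets (Examples #13, #15, #16)
# appear to target a 7-field variant without "namedentity".
TokenValues = namedtuple("TokenValues", [
    "analysis_sentence", "position", "word", "lemma",
    "pos", "major", "minor", "namedentity"])

# Field names inferred from t.child.position / t.relation.label in Example #10.
TripleValues = namedtuple("TripleValues", [
    "analysis_sentence", "child", "parent", "relation"])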
Example #2
 def test_create_lemmata(self):
     from amcat.models.token import TokenValues
     lang = amcattest.get_test_language()
     l1 = Lemma.objects.create(lemma="a", pos="b")
     tokens = [
         TokenValues(None,
                     None,
                     None,
                     lemma=l,
                     pos="b",
                     major=None,
                     minor=None,
                     namedentity=None) for l in "a" * 10
     ]
     tokens += [
         TokenValues(None,
                     None,
                     None,
                     lemma=l,
                     pos="c",
                     major=None,
                     minor=None,
                     namedentity=None) for l in "ab" * 5
     ]
     with self.checkMaxQueries(3):
         # 1 to cache, 2 to create with different poss
         lemmata = create_lemmata(tokens)
     # are existing lemmata 'recycled'?
     self.assertEqual(lemmata["a", "b"].id, l1.id)
     # did we get the correct lemmata?
     self.assertEqual(set(lemmata.keys()),
                      set([("a", "b"), ("a", "c"), ("b", "c")]))
     for (lemmastr, pos), lemma in lemmata.items():
         self.assertEqual(lemma.lemma, lemmastr)
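The query budget asserted above (one caching query, then one insert per distinct pos) suggests a cache-then-bulk-create pattern. A minimal sketch of that shape, not the actual amcat implementation; it assumes Lemma lives in amcat.models.word and a database backend where bulk_create returns objects with primary keys set:

from collections import defaultdict
from amcat.models.word import Lemma  # assumed module path

def create_lemmata_sketch(tokens):
    wanted = {(t.lemma, t.pos) for t in tokens}
    # 1 query: cache every existing (lemma, pos) pair we might need
    result = {(l.lemma, l.pos): l
              for l in Lemma.objects.filter(lemma__in={lemma for lemma, _ in wanted})
              if (l.lemma, l.pos) in wanted}
    # one bulk insert per distinct pos for the pairs that do not exist yet
    missing = defaultdict(list)
    for lemma, pos in wanted - set(result):
        missing[pos].append(lemma)
    for pos, lemmata in missing.items():
        created = Lemma.objects.bulk_create(
            [Lemma(lemma=lemma, pos=pos) for lemma in lemmata])
        result.update({(l.lemma, pos): l for l in created})
    return result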
Example #3
    def test_long_strings(self):
        """Test whether overly long lemmata, words, and pos are truncated"""
        from amcat.models.token import TokenValues, TripleValues

        s = amcattest.create_test_analysis_sentence()
        longpos = TokenValues(s.id, 0, word="a", lemma="l", pos="pp", major="m", minor="m", namedentity=None)

        self.assertRaises(Exception, list, create_tokens([longpos]))

        nonepos = TokenValues(s.id, 0, word="a", lemma="l", pos="p", major="m", minor="m", namedentity=None)

        longvals = TokenValues(s.id, 1, word="a"*9999, lemma="l"*9999, pos="p",
                               major="m"*9999, minor="m"*9999, namedentity=None)
        triple = TripleValues(s.id, 0, 1, "x"*9999)
        create_triples([nonepos, longvals], [triple])

        # django validation for length
        t, = Triple.objects.filter(parent__sentence=s)

        t.full_clean()
        t.relation.full_clean()
        for token in (t.parent, t.child):
            token.full_clean()
            token.word.full_clean()
            token.word.lemma.full_clean()
            token.pos.full_clean()
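The behaviour this test pins down: a pos value longer than its single-character column makes create_tokens raise, while the free-text fields (word, lemma, major, minor, relation label) are silently cut to fit so that Django's full_clean passes afterwards. A hypothetical helper illustrating the truncation side (truncate is not an amcat function):

def truncate(value, model, field_name):
    # cut a string to the max_length declared on the model field, if any
    max_length = model._meta.get_field(field_name).max_length
    if value is None or max_length is None:
        return value
    return value[:max_length]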
Example #4
    def test_create_words(self):
        from amcat.models.token import TokenValues
        lang = amcattest.get_test_language()
        tokens = []
        l1 = Lemma.objects.create(lemma="a", pos="b")
        w1 = Word.objects.create(lemma=l1, word="b")
        for lemma in "ab":
            for word in "bbcc":
                tokens.append(
                    TokenValues(None,
                                None,
                                word=word,
                                lemma=lemma,
                                pos="b",
                                major=None,
                                minor=None,
                                namedentity=None))
        with self.checkMaxQueries(8):
            # 2 to cache lemmata+words, 1 to create lemmata, 5 to create words
            words = create_words(tokens)

        self.assertEqual(
            set(words.keys()),
            set([("a", "b", "b"), ("a", "b", "c"), ("b", "b", "b"),
                 ("b", "b", "c")]))
        for (lemmastr, pos, wordstr), word in words.items():
            self.assertEqual(word.word, wordstr)
            self.assertEqual(word.lemma.lemma, lemmastr)

        self.assertEqual(words["a", "b", "b"].id, w1.id)
        self.assertEqual(words["a", "b", "c"].lemma_id, l1.id)
Example #5
def create_tokens(sid, words, tokens):
    for position, s in enumerate(tokens):
        lemma, pos = s.rsplit("/", 1)
        poscat = POSMAP[pos]

        yield TokenValues(sid, position, words[position], lemma, poscat, pos,
                          None)
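Here each token arrives as a "lemma/POS" string alongside a parallel list of surface words, and POSMAP collapses treebank tags into coarse categories. A hypothetical invocation, assuming the 7-field TokenValues variant this snippet targets; the POSMAP entries are illustrative, echoing the categories seen in Example #14:

POSMAP = {"NNP": "N", "VBD": "V", "PRP": "O"}  # illustrative subset

words = ["Mary", "met", "him"]
tokens = ["Mary/NNP", "meet/VBD", "he/PRP"]
for tv in create_tokens(1, words, tokens):
    print(tv)
# first value: TokenValues(analysis_sentence=1, position=0, word='Mary',
#                          lemma='Mary', pos='N', major='NNP', minor=None)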
Example #6
def create_tokenvalue(analysis_article=None, **kargs):
    if 'analysis_sentence' not in kargs:
        kargs['analysis_sentence'] = create_test_analysis_sentence(analysis_article).id
    for key, default in dict(position=_get_next_id(), word='test_word', lemma='test_lemma',
                             pos='T', major='test_major', minor='test_minor', namedentity=None).items():
        if key not in kargs:
            kargs[key] = default
    from amcat.models.token import TokenValues
    return TokenValues(**kargs)
Example #7
def get_tokenvalues(words, analysis_sentence):
    for i, info in enumerate(words):
        word = info['Text']
        pos = info['PartOfSpeech']
        poscat = POSMAP[pos]
        ner = info['NamedEntityTag']
        ner = NERMAP[ner] if ner != 'O' else None

        yield TokenValues(analysis_sentence.id, i, word, info['Lemma'], poscat, pos, None, ner)
Example #8
def get_token(analysis_sentence_id, token):
    #TokenValues = namedtuple("TokenValues", ["analysis_sentence", "position", "word", "lemma", "pos", "major", "minor", "namedentity"])
    pos_major = token.find("POS").text
    pos = POSMAP[pos_major]
    ner = token.find("NER").text
    ner = NERMAP[ner] if ner != 'O' else None
    return TokenValues(analysis_sentence_id, int(token.get("id")) - 1,
                       token.find("word").text, token.find("lemma").text,
                       pos, pos_major, None, ner)
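This parser, like Example #7, remaps Stanford CoreNLP's NER tags through NERMAP and treats 'O' (outside any entity) as no entity at all. The actual table is not shown in these snippets; an illustrative mapping consistent with the 'P' code for persons in Example #14:

NERMAP = {
    "PERSON": "P",        # matches the 'P' seen in Example #14
    "ORGANIZATION": "O",  # illustrative guesses from here on
    "LOCATION": "L",
}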
Example #9
def create_values(sid, words):
    tokens = []
    triples = []
    for word in words:
        tokens.append(
            TokenValues(sid, int(word["id"]), word["form"], word["lemma"],
                        map_pos(word["pos"]), word["pos"], None, None))
        head = int(word["head"])
        if head:
            triples.append(
                TripleValues(sid, int(word['id']), head, word['deprel']))
    return tokens, triples
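The word dicts follow CoNLL-style columns (id, form, lemma, pos, head, deprel), and a head of "0" marks the syntactic root, which is why a falsy head yields a token but no triple. A hypothetical call, assuming map_pos is defined alongside create_values:

words = [
    dict(id="1", form="John", lemma="John", pos="NNP", head="2", deprel="nsubj"),
    dict(id="2", form="sleeps", lemma="sleep", pos="VBZ", head="0", deprel="root"),
]
tokens, triples = create_values(42, words)
assert len(tokens) == 2 and len(triples) == 1  # the root contributes no triple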
Example #10
    def run(self, _input=None):

        articles = self.options["articleset"].articles.only("uuid")

        print "["  # manually output json so we don't need to keep all in memory

        def sent_tuple(article, analysissentence):
            return (analysissentence.sentence.parnr,
                    analysissentence.sentence.sentnr)

        for i, a in enumerate(articles):
            if i: print ","

            print >> sys.stderr, "{i} / {n}: {a.id} / {a.uuid}".format(
                n=len(articles), **locals())
            sentences = list(a.sentences.all())
            sentencevalues = [(s.parnr, s.sentnr, s.sentence)
                              for s in sentences]

            tokens = list(
                Token.objects.filter(
                    sentence__sentence__in=sentences).select_related(
                        "sentence__sentence", "word", "word__lemma", "pos"))

            sent_tuples = {t: sent_tuple(a, t.sentence) for t in tokens}

            tokenvalues = [
                TokenValues(sent_tuples[t], t.position, t.word.word,
                            t.word.lemma.lemma, t.pos.pos, t.pos.major,
                            t.pos.minor, None) for t in tokens
            ]

            triples = list(
                Triple.objects.filter(child__in=tokens).select_related(
                    "child", "parent", "relation"))

            triplevalues = [
                TripleValues(sent_tuples[t.child], t.child.position,
                             t.parent.position, t.relation.label)
                for t in triples
            ]
            data = dict(article=a.uuid,
                        sentences=sentencevalues,
                        tokens=tokenvalues,
                        triples=triplevalues)

            json.dump(data, sys.stdout)
            sys.stdout.flush()

        print "]"
Example #11
 def test_create_tokens(self):
     from amcat.models.token import TokenValues
     s = amcattest.create_test_analysis_sentence()
     tokens = [
         TokenValues(s.id,
                     2,
                     word="w",
                     lemma="l",
                     pos="p",
                     major="major",
                     minor="minor",
                     namedentity=None)
     ]
     token, = dict(create_tokens(tokens)).values()
     self.assertEqual(token.word.lemma.lemma, "l")
Example #12
def interpret_token(sid, lemma, word, begin, _end, dummypos, dummypos2, pos):
    if "(" in pos:
        major, minor = pos.split("(", 1)
        minor = minor[:-1]
    else:
        major, minor = pos, None
    if "_" in major:
        m2 = major.split("_")[-1]
    else:
        m2 = major
    cat = POSMAP.get(m2)
    if not cat:
        raise Exception("Unknown POS: %r (%s/%s/%s/%s)" %
                        (m2, major, begin, word, pos))
    return TokenValues(sid, int(begin), word, lemma, cat, major, minor, None)
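interpret_token handles Alpino-style tags such as verb(hebben,sg3): the part before the parenthesis becomes major, the part inside becomes minor, and underscored majors like mwu_verb fall back to their last component for the POSMAP lookup. A hypothetical call with an illustrative one-entry POSMAP:

POSMAP = {"verb": "V"}  # illustrative single entry
tv = interpret_token(1, "hebben", "heeft", "2", "3", None, None, "verb(hebben,sg3)")
# -> TokenValues(1, 2, 'heeft', 'hebben', 'V', 'verb', 'hebben,sg3', None)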
Example #13
    def test_process(self):
        from amcat.models.token import TokenValues

        class X(AnalysisScript):
            def __init__(self):
                super(X, self).__init__(analysis=None,
                                        tokens=True,
                                        triples=False)

            def get_tokens(self, analysis_sentence, memo=None):
                for i, x in enumerate(
                        analysis_sentence.sentence.sentence.split()):
                    yield TokenValues(analysis_sentence, i + 1, x, None, None,
                                      None, None)

        a = amcattest.create_test_analysis_sentence(
            sentence=amcattest.create_test_sentence(
                sentence="dit is een test"))
        tokens, triples = list(X().process_sentence(a))
        print(tokens)
        self.assertIsNone(triples)
        self.assertEqual(
            list(tokens)[0],
            (TokenValues(a, 1, "dit", None, None, None, None)))
Example #14
    def test_interpret_xml(self):
        # <!-- Mary met John. She likes him. -->
        analysis_sentences = range(10)
        tokens, triples, corefsets = interpret_xml(analysis_sentences, self._get_test_xml())
        self.assertEqual(set(tokens), {
                TokenValues(0, 0, 'Mary', 'Mary', 'N', "NNP", None, 'P'),
                TokenValues(0, 1, 'met', 'meet', 'V', "VBD", None, None),
                TokenValues(0, 2, 'John', 'John', 'N', "NNP", None, 'P'),
                TokenValues(1, 0, 'She', 'she', 'O', "PRP", None, None),
                TokenValues(1, 1, 'likes', 'like', 'V', "VBZ", None, None),
                TokenValues(1, 2, 'him', 'he', 'O', "PRP", None, None),
                })

        self.assertEqual(set(triples), {
                TripleValues(0, 0, 1, "nsubj"),
                TripleValues(0, 2, 1, "dobj"),
                TripleValues(1, 0, 1, "nsubj"),
                TripleValues(1, 2, 1, "dobj"),
                })


        self.assertEqual({frozenset(coref) for coref in corefsets}, {
                frozenset([(0,0), (1,0)]),
                frozenset([(0,2), (1,2)])})
Example #15
 def get_tokens(self, analysis_sentence, memo=None):
     for i, x in enumerate(
             analysis_sentence.sentence.sentence.split()):
         yield TokenValues(analysis_sentence, i + 1, x, None, None,
                           None, None)
Example #16
 def get_tokens(self, analysis_sentence, memo=None):
     if memo is None: memo = self.preprocess_sentence(analysis_sentence)
     for line in memo:
         position, word, lemma, pos = [line[i] for i in (0, 1, 2, 4)]
         yield TokenValues(analysis_sentence,
                           int(position) - 1, word, lemma, *read_pos(pos))
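read_pos itself is not shown in these snippets; the star-unpacking means it must return exactly the TokenValues fields that follow lemma. A hypothetical stand-in for the 7-field variant, splitting a hyphenated tag into category, major, and minor (the real implementation likely differs):

def read_pos(pos):
    # hypothetical: derive (pos category, major, minor) from a raw tag like "NN-sg"
    major, _, minor = pos.partition("-")
    return POSMAP[major], major, minor or None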
Example #17
 def clean_tokens(self):
     tokens = self.cleaned_data["tokens"]
     try:
         return [TokenValues(*fields) for fields in json.loads(tokens)]
     except ValueError as e:
         raise forms.ValidationError(e)
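This form field reverses the array serialization used in Example #10: each entry is a JSON array in TokenValues field order, rebuilt by star-unpacking. A hypothetical payload for the 8-field variant:

import json
from amcat.models.token import TokenValues

tokens_json = '[[1, 0, "Mary", "Mary", "N", "NNP", null, "P"]]'
[tv] = [TokenValues(*fields) for fields in json.loads(tokens_json)]
assert tv.word == "Mary" and tv.namedentity == "P"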