Пример #1
0
    def test(self):
        """Test the coverage of GeoPar's lexicon.

        For each NLU-MR pair in the training data, for each lexical term in
        the MR, check whether there is a word or multiword (up to length 3)
        in the NLU that the lexicon associates with that lexical term.

        Every uncovered lexical term is reported on stdout (preceded, once
        per example, by the example's words) and collected; the test fails
        if any term was collected.
        """
        missing_terms = []
        lex = lexicon.read_lexicon('lexicon.txt')
        for words, mr in data.geo880_train():
            printed = False
            # Every training MR is expected to look like answer(Variable, body).
            self.assertEqual(mr.functor_name, 'answer')
            self.assertEqual(len(mr.args), 2)
            self.assertIsInstance(mr.args[0], terms.Variable)
            for lexterm in lexicon.lexical_subterms(mr.args[1]):
                # Rebuilt for every lexical term because util.ngrams may
                # return one-shot iterators.
                candidates = itertools.chain(
                    util.ngrams(1, words),
                    util.ngrams(2, words),
                    util.ngrams(3, words))
                # any() stops at the first matching meaning; the previous
                # nested loops only broke out of the inner loop and kept
                # looking up every remaining n-gram.
                word_found = any(
                    term.equivalent(lexterm)
                    for word in candidates
                    for term in lex.meanings(word))
                if word_found:
                    continue
                if not printed:
                    # Print the sentence only once, before its first warning.
                    print(str(words))
                    printed = True
                print('WARNING: no word found that means ' +
                      lexterm.to_string())
                missing_terms.append(lexterm)
        self.assertEqual(missing_terms, [])
Пример #2
0
 def reject(self, item):
     """Return True if *item* should be discarded, False otherwise.

     A finished item is rejected unless the MR at the head of its stack is
     equivalent to ``self.target_mr``.  An unfinished item goes through two
     checks, in order:

     1. Predicate bag check: the lexical subterms on the stack must pass
        ``util.issubset`` against ``self.lsts``, and ``self.lsts`` must
        pass it against the stack subterms plus every meaning the lexicon
        offers for the n-grams of the remaining words.
     2. Fragment check: each stack element's MR must subsume the fragment
        ``find_fragment`` returns for it, with one bindings dict shared
        across all stack elements.
     """
     # TODO can only drop/lift/sdrop something that already has all variable bindings with its environment??
     if item.finished:
         return not item.stack.head.mr.equivalent(self.target_mr)

     # Predicate bag check.
     on_stack = collections.Counter(
         subterm.to_string()
         for element in item.stack
         for subterm in lexicon.lexical_subterms(element.mr))
     if not util.issubset(on_stack, self.lsts):
         return True
     remaining_words = tuple(item.words[item.offset:])
     in_queue = collections.Counter(
         meaning.to_string()
         for size in range(1, config.MAX_TOKEN_LENGTH + 1)
         for ngram in util.ngrams(size, remaining_words)
         for meaning in self.lex.meanings(ngram))
     if not util.issubset(self.lsts, on_stack + in_queue):
         return True

     # Fragment check (false negatives (and positives?) unless mr is
     # augmented!).  All fragments are looked up before any subsumption
     # test runs.
     matched_fragments = tuple(
         find_fragment(element.mr, self.fragments) for element in item.stack)
     shared_bindings = {}
     return not all(
         element.mr.subsumes(fragment, shared_bindings)
         for element, fragment in zip(item.stack, matched_fragments))
Пример #3
0
    def make(self, ngramstr, makeline):
        """Write the n-grams of ``self.path`` to numbered text files.

        The input file is fed to ``ngrams(file, self.N, word=self.wordtype)``.
        Each item of each group it yields is turned into a line with
        *makeline* and written to
        ``BASEPATH + ngramstr + '/' + self.wordtype + '<index>.txt'``,
        starting at index 0.  A new output file is started once the current
        one holds 50,000,000 lines.

        Args:
            ngramstr: Path component placed between ``BASEPATH`` and the
                output file name.
            makeline: Callable that receives one item and returns the text
                of its output line, without the trailing newline.
        """
        lines_per_file = 50000000
        flush_interval = 1000

        def open_part(index):
            # Single place that builds the name of a numbered output file.
            return open(BASEPATH + ngramstr + '/' + self.wordtype +
                        '%d.txt' % index,
                        'w',
                        encoding='utf-8')

        fileindex = 0
        filecount = 0
        # `with` and `finally` make sure both handles are closed even when
        # ngrams(), makeline() or a write raises part-way through.
        with open(self.path, 'r', encoding='utf-8') as source:
            fw = open_part(fileindex)
            try:
                for line in ngrams(source, self.N, word=self.wordtype):
                    for i in line:
                        fw.write(makeline(i) + '\n')
                        filecount += 1

                        if filecount % flush_interval == 0:
                            fw.flush()

                        if filecount >= lines_per_file:
                            # Current part is full: move on to the next one.
                            fileindex += 1
                            fw.close()
                            fw = open_part(fileindex)
                            filecount = 0
            finally:
                fw.close()
Пример #4
0
    def make_freq(self):
        """Accumulate first-element counts into ``self.unigram``.

        Reads all lines of ``self.path``, passes them to
        ``ngrams(lines, self.N)`` and, for every item of every group it
        yields, increments the counter stored under the item's first
        element.
        """
        with open(self.path, 'r', encoding='utf-8') as source:
            lines = source.readlines()

        for group in ngrams(lines, self.N):
            for item in group:
                # The key is the bare first element, not a 1-tuple.
                self.unigram[item[0]] += 1
Пример #5
0
    def make_freq(self):
        """Accumulate 4-gram and 5-gram counts from ``self.path``.

        Reads all lines of ``self.path``, passes them to ``ngrams(lines, 3)``
        and, for every item of every group it yields, increments
        ``self.fourgram`` under the item's first four elements and
        ``self.fivegram`` under its first five.
        """
        # NOTE(review): the order is hard-coded to 3 here; confirm this is
        # intended given that items are indexed up to position 4.
        with open(self.path, 'r', encoding='utf-8') as source:
            lines = source.readlines()

        for group in ngrams(lines, 3):
            for item in group:
                four_key = tuple(item[pos] for pos in range(4))
                self.fourgram[four_key] += 1
                five_key = tuple(item[pos] for pos in range(5))
                self.fivegram[five_key] += 1
Пример #6
0
    def make_freq(self):
        """Accumulate 6-gram and 7-gram counts from ``self.path``.

        Reads all lines of ``self.path``, passes them to
        ``ngrams(lines, self.N)`` and, for every item of every group it
        yields, increments ``self.sixgram`` under the item's first six
        elements and ``self.sevengram`` under its first seven.
        """
        with open(self.path, 'r', encoding='utf-8') as source:
            lines = source.readlines()

        for group in ngrams(lines, self.N):
            for item in group:
                six_key = tuple(item[pos] for pos in range(6))
                self.sixgram[six_key] += 1
                seven_key = tuple(item[pos] for pos in range(7))
                self.sevengram[seven_key] += 1
Пример #7
0
"""This program checks the coverage of GeoPar's lexicon.

For each NLU-MR pair in the training data, for each lexical term in the MR, it
checks whether there is a word or multiword (up to length 2) in the NLU that in
the lexicon is associated with that lexical term. Prints a warning if not.
"""

import data
import lexicon
import terms
import itertools
import util

# Walk over the training examples and warn about every lexical term of the
# MR that no unigram or bigram of the sentence maps to in the lexicon.
for words, reader, mr in data.geo880_train():
    print(str(words))
    # Every training MR is expected to look like answer(Variable, body).
    assert mr.functor_name == 'answer'
    assert len(mr.args) == 2
    assert isinstance(mr.args[0], terms.Variable)
    for lexterm in mr.args[1].lexterms():
        # Rebuilt for every lexical term because util.ngrams may return
        # one-shot iterators.
        candidates = itertools.chain(util.ngrams(1, words),
                                     util.ngrams(2, words))
        # any() stops at the first matching meaning; the previous nested
        # loops only broke out of the inner loop and kept looking up every
        # remaining n-gram.
        word_found = any(
            term.equivalent(lexterm)
            for word in candidates
            for term in lexicon.meanings(word))
        if not word_found:
            print('WARNING: no word found that means ' +
                  lexterm.to_string(reader))
Пример #8
0
	def __process_uppercased_2grams(self, text):
		"""Return the 2-grams of *text* whose two items both start uppercase.

		The 2-grams come from ``ngrams(text, 2)``.  A pair is kept only when
		the first element of each of its two items is uppercase.  Pairs that
		contain an empty item are dropped instead of raising IndexError.
		"""
		return [
			pair for pair in ngrams(text, 2)
			# The truthiness guard comes first so an empty item never
			# reaches the [0] index.
			if pair[0] and pair[0][0].isupper()
			and pair[1] and pair[1][0].isupper()
		]
Пример #9
0
	def __process_2grams(self, text):
		"""Return everything ``ngrams(text, 2)`` yields, as a list."""
		pairs = ngrams(text, 2)
		return list(pairs)