def test(self):
    """Tests the coverage of GeoPar's lexicon.

    For each NLU-MR pair in the training data, for each lexical term in the
    MR, it checks whether there is a word or multiword (up to length 3) in
    the NLU that in the lexicon is associated with that lexical term.

    Every lexical term without such a word is reported on stdout (preceded,
    once per sentence, by the sentence itself) and collected; the test fails
    at the end if any term was collected.
    """
    missing_terms = []
    lex = lexicon.read_lexicon('lexicon.txt')
    for words, mr in data.geo880_train():
        printed = False
        self.assertEqual(mr.functor_name, 'answer')
        self.assertEqual(len(mr.args), 2)
        self.assertIsInstance(mr.args[0], terms.Variable)
        for lexterm in lexicon.lexical_subterms(mr.args[1]):
            candidates = itertools.chain(
                util.ngrams(1, words),
                util.ngrams(2, words),
                util.ngrams(3, words),
            )
            # any() stops at the first matching meaning. The previous nested
            # loops only broke out of the inner loop and kept scanning the
            # remaining n-grams after a match had already been found.
            word_found = any(
                term.equivalent(lexterm)
                for word in candidates
                for term in lex.meanings(word)
            )
            if word_found:
                continue
            if not printed:
                # Print the sentence once, before its first warning.
                print(str(words))
                printed = True
            print('WARNING: no word found that means ' + lexterm.to_string())
            missing_terms.append(lexterm)
    self.assertEqual(missing_terms, [])
def reject(self, item):
    """Return True when ``item`` should be discarded, False otherwise.

    A finished item is rejected exactly when the MR at the head of its stack
    is not equivalent to ``self.target_mr``.  An unfinished item is rejected
    when any of these checks fails, tried in this order:

    * the bag of lexical subterms found on the stack passes
      ``util.issubset`` against ``self.lsts``;
    * ``self.lsts`` passes ``util.issubset`` against that stack bag plus the
      bag of lexicon meanings of all n-grams (length 1 up to
      ``config.MAX_TOKEN_LENGTH``) of the words from ``item.offset`` on;
    * every stack element's MR subsumes the fragment that
      ``find_fragment`` returns for it, under one shared bindings dict.
    """
    # TODO can only drop/lift/sdrop something that already has all variable bindings with its environment??
    if item.finished:
        return not item.stack.head.mr.equivalent(self.target_mr)

    # predicate bag check
    on_stack = collections.Counter()
    for entry in item.stack:
        for subterm in lexicon.lexical_subterms(entry.mr):
            on_stack[subterm.to_string()] += 1
    if not util.issubset(on_stack, self.lsts):
        return True

    in_queue = collections.Counter()
    for size in range(1, config.MAX_TOKEN_LENGTH + 1):
        for word in util.ngrams(size, tuple(item.words[item.offset:])):
            for meaning in self.lex.meanings(word):
                in_queue[meaning.to_string()] += 1
    if not util.issubset(self.lsts, on_stack + in_queue):
        return True

    # fragment check (false negatives (and positives?) unless mr is augmented!)
    # All fragments are looked up before the first subsumes() call.
    matched = [find_fragment(entry.mr, self.fragments) for entry in item.stack]
    bindings = {}
    return any(
        not entry.mr.subsumes(fragment, bindings)
        for entry, fragment in zip(item.stack, matched)
    )
def make(self, ngramstr, makeline):
    """Write formatted n-gram items from ``self.path`` into numbered files.

    The input file is passed to ``ngrams(file, self.N, word=self.wordtype)``.
    Every item of every group it yields is converted with ``makeline`` and
    written as one line to
    ``BASEPATH + ngramstr + '/' + self.wordtype + '<index>.txt'``.
    The index starts at 0 and a new file is started once 50,000,000 lines
    have been written to the current one.

    Args:
        ngramstr: Sub-directory name (below ``BASEPATH``) for the output.
        makeline: Callable turning one item into the string to write
            (without trailing newline).

    Both the input and the current output file are closed even when
    ``ngrams``, ``makeline`` or a write raises.
    """
    flush_every = 1000
    max_lines_per_file = 50000000

    def open_part(index):
        # Output path layout: <BASEPATH><ngramstr>/<wordtype><index>.txt
        return open(BASEPATH + ngramstr + '/' + self.wordtype + '%d.txt' % index,
                    'w', encoding='utf-8')

    fileindex = 0
    filecount = 0
    with open(self.path, 'r', encoding='utf-8') as source:
        fw = open_part(fileindex)
        try:
            for line in ngrams(source, self.N, word=self.wordtype):
                for i in line:
                    fw.write(makeline(i) + '\n')
                    filecount += 1
                    if filecount % flush_every == 0:
                        fw.flush()
                    if filecount >= max_lines_per_file:
                        # Roll over to the next numbered output file.
                        fileindex += 1
                        fw.close()
                        fw = open_part(fileindex)
                        filecount = 0
        finally:
            # close() is idempotent, so this is safe even if a rollover
            # closed the handle and the following open failed.
            fw.close()
def make_freq(self):
    """Tally first-element frequencies into ``self.unigram``.

    Reads all lines of ``self.path`` (UTF-8), hands them to
    ``ngrams(lines, self.N)`` and, for every entry of every group it yields,
    increments ``self.unigram[entry[0]]``.
    """
    with open(self.path, 'r', encoding='utf-8') as source:
        rows = list(source)
    entries = (entry for group in ngrams(rows, self.N) for entry in group)
    for entry in entries:
        # The key is the bare first element, not a 1-tuple.
        self.unigram[entry[0]] += 1
def make_freq(self):
    """Tally 4- and 5-element prefixes into ``self.fourgram``/``self.fivegram``.

    Reads all lines of ``self.path`` (UTF-8), hands them to
    ``ngrams(lines, 3)`` and, for every entry of every group it yields,
    increments the counter keyed by the tuple of the entry's first four
    elements and the counter keyed by the tuple of its first five elements.
    """
    with open(self.path, 'r', encoding='utf-8') as source:
        rows = list(source)
    # NOTE(review): the second argument to ngrams is the literal 3, not an
    # instance attribute -- confirm this is intended.
    entries = (entry for group in ngrams(rows, 3) for entry in group)
    for entry in entries:
        first_four = tuple(entry[k] for k in range(4))
        self.fourgram[first_four] += 1
        self.fivegram[first_four + (entry[4],)] += 1
def make_freq(self):
    """Tally 6- and 7-element prefixes into ``self.sixgram``/``self.sevengram``.

    Reads all lines of ``self.path`` (UTF-8), hands them to
    ``ngrams(lines, self.N)`` and, for every entry of every group it yields,
    increments the counter keyed by the tuple of the entry's first six
    elements and the counter keyed by the tuple of its first seven elements.
    """
    with open(self.path, 'r', encoding='utf-8') as source:
        rows = list(source)
    entries = (entry for group in ngrams(rows, self.N) for entry in group)
    for entry in entries:
        first_six = tuple(entry[k] for k in range(6))
        self.sixgram[first_six] += 1
        self.sevengram[first_six + (entry[6],)] += 1
"""This program checks the coverage of GeoPar's lexicon.

For each NLU-MR pair in the training data, for each lexical term in the MR, it
checks whether there is a word or multiword (up to length 2) in the NLU that
in the lexicon is associated with that lexical term. Prints a warning if not.
"""

import itertools

import data
import lexicon
import terms
import util

for words, reader, mr in data.geo880_train():
    print(str(words))
    # Every training MR is expected to have the form answer(<Variable>, <body>).
    assert mr.functor_name == 'answer'
    assert len(mr.args) == 2
    assert isinstance(mr.args[0], terms.Variable)
    for lexterm in mr.args[1].lexterms():
        candidates = itertools.chain(
            util.ngrams(1, words),
            util.ngrams(2, words),
        )
        # any() stops at the first matching meaning. The previous nested loops
        # only broke out of the inner loop and kept scanning the remaining
        # n-grams after a match had already been found.
        word_found = any(
            term.equivalent(lexterm)
            for word in candidates
            for term in lexicon.meanings(word)
        )
        if not word_found:
            print('WARNING: no word found that means ' + lexterm.to_string(reader))
def __process_uppercased_2grams(self, text):
    """Return the 2-grams of ``text`` whose two items both start uppercase.

    The 2-grams are whatever ``ngrams(text, 2)`` yields; one is kept when the
    first character of its first item and the first character of its second
    item both satisfy ``str.isupper``.  The result is a list in the order
    produced by ``ngrams``.
    """
    def starts_upper(token):
        return token[0].isupper()

    return [
        pair
        for pair in ngrams(text, 2)
        if starts_upper(pair[0]) and starts_upper(pair[1])
    ]
def __process_2grams(self, text):
    """Return everything ``ngrams(text, 2)`` yields, collected into a list."""
    pairs = ngrams(text, 2)
    return [*pairs]