def t_3b_5():
    """Run submission.segmentAndInsert over every query in QUERIES_INS.

    Costs and fills come from getRealCosts(); the unigram and bigram costs
    are blended with wordsegUtil.smoothUnigramAndBigram using weight 0.2.
    Each query is cleaned, split into words, and stripped of the letters
    'aeiou' before being handed to the submission.

    The predictions are only computed here; nothing in this function compares
    them against an expected answer.
    """
    unigrams, bigrams, fills = getRealCosts()
    smoothed = wordsegUtil.smoothUnigramAndBigram(unigrams, bigrams, 0.2)

    for rawQuery in QUERIES_INS:
        cleaned = wordsegUtil.cleanLine(rawQuery)

        # First strip the vowels from every word of the query ...
        devoweled = []
        for word in wordsegUtil.words(cleaned):
            devoweled.append(wordsegUtil.removeAll(word, 'aeiou'))

        # ... then reconstruct each vowel-free chunk independently.
        pred = [
            submission.segmentAndInsert(chunk, smoothed, fills)
            for chunk in devoweled
        ]
def test_4(self):
    """3b-4-hidden: hidden test case for all queries in QUERIES_BOTH with bigram costs and possible fills from the corpus"""
    # NOTE: the docstring above is kept verbatim; its leading "3b-4-hidden"
    # tag looks like an identifier, so the text is left untouched.
    smoothCost = wordsegUtil.smoothUnigramAndBigram(self.unigramCost, self.bigramCost, 0.2)

    # Exercise every query, as the docstring promises. An earlier revision
    # skipped all but index 1 (`if i != 1: continue`), which left the other
    # queries untested.
    for query in QUERIES_BOTH:
        query = wordsegUtil.cleanLine(query)
        # The submission receives each word with the letters 'aeiou' removed.
        parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(query)]
        self.compare_with_solution_or_wait(
            submission,
            'segmentAndInsert',
            lambda f: [f(part, smoothCost, self.possibleFills) for part in parts])
self.query = query self.bigramCost = bigramCost self.possibleFills = possibleFills def start_state(self): # position before which text is reconstructed & previous word return 0, wordsegUtil.SENTENCE_BEGIN def is_end(self, state): return state[0] == len(self.query) def succ_and_cost(self, state): raise NotImplementedError unigramCost, bigramCost = wordsegUtil.makeLanguageModels('leo-will.txt') smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2) possibleFills = wordsegUtil.makeInverseRemovalDictionary( 'leo-will.txt', 'aeiou') problem = JointSegmentationInsertionProblem('mgnllthppl', smoothCost, possibleFills) import dynamic_programming_search dps = dynamic_programming_search.DynamicProgrammingSearch(verbose=1) # dps = dynamic_programming_search.DynamicProgrammingSearch(memory_use=False, verbose=1) # print(dps.solve(problem)) import uniform_cost_search ucs = uniform_cost_search.UniformCostSearch(verbose=0) print(ucs.solve(problem))
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Prompts with '>> ', reads one line from standard input, runs the requested
    command and prints the result. The loop ends on an empty line or when
    standard input is exhausted.

    Args:
        unigramCost: callable used by 'seg' (passed to submission.segmentWords)
            and called directly with the cleaned input by 'ug'.
        bigramCost: callable used by 'ins' (passed to submission.insertVowels)
            and called directly with two words by 'bg'.
        possibleFills: callable used by 'ins' and 'both' (passed to the
            submission) and called directly with the cleaned input by 'fills'.
        command: when None, the first whitespace-separated token of each line
            selects the command and the rest is its argument; otherwise every
            line is treated entirely as the argument of this fixed command.
    '''
    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so flush explicitly; otherwise
        # it can sit in the output buffer until after the read below.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (ins):', ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (both):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and end the
                # session; report the problem and keep the loop alive.
                print('bg requires at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Prompts with '>> ', reads one line from standard input, runs the requested
    command and prints the result. The loop ends on an empty line or when
    standard input is exhausted.

    Every print call below takes exactly one argument, so the output is the
    same whether `print` is the Python 2 statement or the Python 3 function.

    Args:
        unigramCost: callable used by 'seg' (passed to submission.segmentWords)
            and called directly with the cleaned input by 'ug'.
        bigramCost: callable used by 'ins' (passed to submission.insertVowels)
            and called directly with two words by 'bg'.
        possibleFills: callable used by 'ins' and 'both' (passed to the
            submission) and called directly with the cleaned input by 'fills'.
        command: when None, the first whitespace-separated token of each line
            selects the command and the rest is its argument; otherwise every
            line is treated entirely as the argument of this fixed command.
    '''
    helpTable = [
        ('help', 'This'),
        ('seg', 'Segment character sequences'),
        ('ins', 'Insert vowels into words'),
        ('both', 'Joint segment-and-insert'),
        ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
        ('ug', 'Query unigram cost function'),
        ('bg', 'Query bigram cost function'),
    ]

    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so flush explicitly; otherwise
        # it can sit in the output buffer until after the read below.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in helpTable))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg): ' + ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (ins): ' + ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (both): ' + ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and end the
                # session; report the problem and keep the loop alive.
                print('bg requires at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command: ' + cmd)

        print('')