def test_0(self):
    """2b-0-basic: simple test case

    Runs submission.insertVowels on a handful of tiny queries using a toy
    bigram cost built from the sentence 'beam me up scotty' and a small
    hand-written table of vowel fills, and compares each result with the
    expected string.
    """
    sentence = [wordsegUtil.SENTENCE_BEGIN] + 'beam me up scotty'.split()
    # Adjacent word pairs of the toy sentence, including the begin marker.
    adjacentPairs = tuple(zip(sentence, sentence[1:]))

    def bigramCost(a, b):
        # Pairs that occur in the toy sentence are cheap; all others are not.
        return 1.0 if (a, b) in adjacentPairs else 1000.0

    fillTable = {
        'bm': ('beam', 'bam', 'boom'),
        'm': ('me', 'ma'),
        'p': ('up', 'oop', 'pa', 'epe'),
        'sctty': ('scotty',),
    }

    def possibleFills(x):
        # Hand back a fresh set on every call; unknown words have no fills.
        return set(fillTable.get(x, ()))

    # (expected output, query words), checked in this order.
    cases = [
        ('', []),
        ('zz$z$zz', ['zz$z$zz']),  # No fills
        ('beam', ['bm']),
        ('me up', ['m', 'p']),
        ('beam me up scotty', 'bm m p sctty'.split()),
    ]
    for expected, queryWords in cases:
        self.assertEqual(
            expected,
            submission.insertVowels(queryWords, bigramCost, possibleFills))
def test_1(self):
    """2b-1-hidden: Simple hidden test case

    Calls submission.insertVowels on several queries using the bigram cost
    and fill functions stored on the test instance. No expected values are
    checked in this block; the results are only computed.
    """
    cost = self.bigramCost
    fillsFor = self.possibleFills

    queries = [
        [],
        ['zz$z$zz'],  # No fills
        [''],
        'wld lk t hv mr lttrs'.split(),
        'ngh lrdy'.split(),
    ]
    # Evaluate the queries one by one, in the order listed above.
    solutions = []
    for queryWords in queries:
        solutions.append(submission.insertVowels(queryWords, cost, fillsFor))
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Reads lines from standard input until an empty line (or end of input)
    and runs one command per line, printing the result to standard output.

    Args:
        unigramCost: callable passed to submission.segmentWords, used for
            smoothing in 'both', and called directly by 'ug'.
        bigramCost: callable taking two words; passed to
            submission.insertVowels, used for smoothing in 'both', and
            called directly by 'bg'.
        possibleFills: callable taking a word and returning an iterable of
            strings; passed to the submission functions and called
            directly by 'fills'.
        command: when None, the first whitespace-separated token of each
            input line is the command and the remainder is its argument;
            otherwise every input line is treated as the argument of this
            fixed command.

    Type 'help' at the prompt for the list of commands.
    '''
    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so a line-buffered stdout
        # would keep it hidden while readline() blocks; flush explicitly.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # Strip the vowels from each word to form the query.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (ins):', ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (both):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and kill the
                # loop; report the problem and keep the session alive.
                print('bg needs at least two words, got %d' % len(grams))
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
def test_2(self):
    """2b-2-hidden: Simple hidden test case.

    Runs submission.insertVowels on three hand-built scenarios: one that
    depends on the sentence-begin marker, one where the locally cheapest
    choice is not the cheapest overall, and one with a long chain of
    dependent choices. No expected values are checked in this block.
    """
    SB = wordsegUtil.SENTENCE_BEGIN

    # Scenario 1: check for correct use of SENTENCE_BEGIN.
    def beginAwareCost(a, b):
        if (a, b) == (SB, 'cat'):
            return 5.0
        if a != SB and b == 'dog':
            return 1.0
        return 1000.0

    solution1 = submission.insertVowels(
        ['x'], beginAwareCost, lambda x: {'cat', 'dog'})

    # Scenario 2: check for non-greediness.
    # Dog over log -- a test poem by rf
    shortPoemCosts = {
        (SB, 'cat'): 1.0,        # Always start with cat
        ('cat', 'log'): 1.0,     # Locally prefer log
        ('cat', 'dog'): 2.0,     # rather than dog
        ('log', 'mouse'): 3.0,   # But dog would have been
        ('dog', 'mouse'): 1.0,   # better in retrospect
    }

    def shortPoemCost(a, b):
        return shortPoemCosts.get((a, b), 1000.0)

    shortPoemFills = {
        'x1': ('cat', 'dog'),
        'x2': ('log', 'dog', 'frog'),
        'x3': ('mouse', 'house', 'cat'),
    }

    def shortPoemFill(x):
        # Unknown keys raise KeyError; each call returns a fresh set.
        return set(shortPoemFills[x])

    solution2 = submission.insertVowels(
        'x1 x2 x3'.split(), shortPoemCost, shortPoemFill)

    # Scenario 3: check for non-trivial long-range dependencies.
    # Dogs over logs -- another test poem by rf
    longPoemCosts = {
        (SB, 'cat'): 1.0,           # Always start with cat
        ('cat', 'log1'): 1.0,       # Locally prefer log
        ('cat', 'dog1'): 2.0,       # Rather than dog
        ('log20', 'mouse'): 1.0,    # And this might even
        ('dog20', 'mouse'): 1.0,    # seem to be okay
    }
    for step in range(1, 20):       # But along the way
        here, there = str(step), str(step + 1)
        # Dog's cost will decay
        longPoemCosts[('log' + here, 'log' + there)] = 0.25
        longPoemCosts[('dog' + here, 'dog' + there)] = 1.0 / float(step)
    # Hooray

    def longPoemCost(a, b):
        return longPoemCosts.get((a, b), 1000.0)

    longPoemFills = {
        'x0': ('cat', 'dog'),
        'x21': ('mouse', 'house', 'cat'),
    }
    for step in range(1, 21):
        suffix = str(step)
        longPoemFills['x' + suffix] = ('log' + suffix, 'dog' + suffix, 'frog')

    def longPoemFill(x):
        # Unknown keys raise KeyError; each call returns a fresh set.
        return set(longPoemFills[x])

    solution3 = submission.insertVowels(
        ['x' + str(i) for i in range(22)], longPoemCost, longPoemFill)
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Reads lines from standard input until an empty line (or end of input)
    and runs one command per line, printing the result to standard output.

    Every print below is a call with exactly one argument, so the output
    is the same whether print is the Python 2 statement or the Python 3
    function.

    Args:
        unigramCost: callable passed to submission.segmentWords, used for
            smoothing in 'both', and called directly by 'ug'.
        bigramCost: callable taking two words; passed to
            submission.insertVowels, used for smoothing in 'both', and
            called directly by 'bg'.
        possibleFills: callable taking a word and returning an iterable of
            strings; passed to the submission functions and called
            directly by 'fills'.
        command: when None, the first whitespace-separated token of each
            input line is the command and the remainder is its argument;
            otherwise every input line is treated as the argument of this
            fixed command.

    Type 'help' at the prompt for the list of commands.
    '''
    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so a line-buffered stdout
        # would keep it hidden while readline() blocks; flush explicitly.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences'),
                ('ins', 'Insert vowels into words'),
                ('both', 'Joint segment-and-insert'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function'),
                ('bg', 'Query bigram cost function'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg): ' + ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # Strip the vowels from each word to form the query.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (ins): ' + ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (both): ' + ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and kill the
                # loop; report the problem and keep the session alive.
                print('bg needs at least two words, got %d' % len(grams))
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command: ' + cmd)

        print('')