Exemplo n.º 1
0
    def test_3(self):
        """3b-3-hidden:  hidden test case with hand-picked bigram costs and possible fills

        Only bigrams that appear consecutively in the sentence
        "<SENTENCE_BEGIN> beam me up scotty" cost 1.0; every other pair costs
        1000.0.  The fill table offers several candidate words per key.  The
        four results are computed here but not compared against anything in
        this method.
        """
        # Build the cheap-bigram lookup once.  The cost function may be called
        # many times by the search, and rebuilding the corpus plus scanning a
        # list on every call is wasted work.
        corpus = [wordsegUtil.SENTENCE_BEGIN] + 'beam me up scotty'.split()
        cheapBigrams = frozenset(zip(corpus, corpus[1:]))

        def bigramCost(a, b):
            """Return 1.0 for a bigram of the reference sentence, else 1000.0."""
            return 1.0 if (a, b) in cheapBigrams else 1000.0

        # Candidate words per vowel-free key, also built once.
        fillTable = {
            'bm': frozenset(['beam', 'bam', 'boom']),
            'm': frozenset(['me', 'ma']),
            'p': frozenset(['up', 'oop', 'pa', 'epe']),
            'sctty': frozenset(['scotty']),
            'z': frozenset(['ze']),
        }

        def possibleFills(x):
            """Return a fresh set of candidate words for x (empty if unknown)."""
            # A new set per call keeps callers from mutating the shared table.
            return set(fillTable.get(x, ()))

        # Ensure no non-word makes it through
        solution1 = submission.segmentAndInsert('zzzzz', bigramCost,
                                                possibleFills)
        solution2 = submission.segmentAndInsert('bm', bigramCost,
                                                possibleFills)
        solution3 = submission.segmentAndInsert('mp', bigramCost,
                                                possibleFills)
        solution4 = submission.segmentAndInsert('bmmpsctty', bigramCost,
                                                possibleFills)
Exemplo n.º 2
0
    def test_0(self):
        """3b-0-basic:  Simple test case with hand-picked bigram costs and possible fills.

        The bigram cost depends only on the second word: 1.0 when it is one of
        five known words, 1000.0 otherwise.  Each vowel-free key maps to
        exactly one candidate word, and unknown keys yield an empty set.
        """
        cheapWords = ['and', 'two', 'three', 'word', 'words']

        def bigramCost(a, b):
            """Return 1.0 if b is a known word, else 1000.0 (a is ignored)."""
            return 1.0 if b in cheapWords else 1000.0

        fillTable = {
            'nd': {'and'},
            'tw': {'two'},
            'thr': {'three'},
            'wrd': {'word'},
            'wrds': {'words'},
        }

        def fills(x):
            """Look up candidate words for x; unknown keys give an empty set."""
            return fillTable.get(x, set())

        # (query, expected result) pairs, checked in this order.
        expectations = [
            ('', ''),
            ('wrd', 'word'),
            ('twwrds', 'two words'),
            ('ndthrwrds', 'and three words'),
        ]
        for query, expected in expectations:
            self.assertEqual(
                expected,
                submission.segmentAndInsert(query, bigramCost, fills))
Exemplo n.º 3
0
    def test_2(self):
        """3b-2-hidden:  hidden test case with unigram costs as bigram costs and additional possible fills.

        The bigram cost ignores the previous word and delegates to
        self.unigramCost on the second word.  Besides the five basic fills the
        table carries two extra entries, which allow 'ndthrwrds' to also be
        split as nd|th|rwrds.  The three results are computed here but not
        compared against anything in this method.
        """
        def bigramCost(a, b):
            """Return the unigram cost of b; a is ignored."""
            return self.unigramCost(b)

        fillTable = {
            'nd': {'and'},
            'tw': {'two'},
            'thr': {'three'},
            'wrd': {'word'},
            'wrds': {'words'},
            # Two extra entries that compete with the basic fills above.
            'th': {'the'},
            'rwrds': {'rewards'},
        }

        def fills(x):
            """Look up candidate words for x; unknown keys give an empty set."""
            return fillTable.get(x, set())

        solution1 = submission.segmentAndInsert('wrd', bigramCost, fills)
        solution2 = submission.segmentAndInsert('twwrds', bigramCost, fills)
        # This query can be read with either the basic or the extra fills.
        solution3 = submission.segmentAndInsert('ndthrwrds', bigramCost, fills)
Exemplo n.º 4
0
    def test_1(self):
        """3b-1-basic:  simple test case with unigram costs as bigram costs

        The bigram cost ignores the previous word and delegates to
        self.unigramCost on the second word.  Each vowel-free key maps to
        exactly one candidate word, and unknown keys yield an empty set.
        """
        def bigramCost(a, b):
            """Return the unigram cost of b; a is ignored."""
            return self.unigramCost(b)

        fillTable = {
            'nd': {'and'},
            'tw': {'two'},
            'thr': {'three'},
            'wrd': {'word'},
            'wrds': {'words'},
        }

        def fills(x):
            """Look up candidate words for x; unknown keys give an empty set."""
            return fillTable.get(x, set())

        # (query, expected result) pairs, checked in this order.
        expectations = [
            ('wrd', 'word'),
            ('twwrds', 'two words'),
            ('ndthrwrds', 'and three words'),
        ]
        for query, expected in expectations:
            self.assertEqual(
                expected,
                submission.segmentAndInsert(query, bigramCost, fills))
Exemplo n.º 5
0
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop

    Prompts with '>> ' on stdout, reads one line from stdin and runs the
    requested command.  The loop ends when an empty line (or end of input)
    is read.

    Args:
        unigramCost: callable taking one word; used by 'seg' and 'ug', and
            combined with bigramCost for 'both'.
        bigramCost: callable taking (previous word, word); used by 'ins' and
            'bg', and combined with unigramCost for 'both'.
        possibleFills: callable taking a string and returning an iterable of
            candidate words; used by 'ins', 'both' and 'fills'.
        command: when None, the first whitespace-delimited token of each
            line is the command and the remainder is its argument.
            Otherwise every line is passed whole to this fixed command.
    '''

    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so flush it explicitly;
        # otherwise it can sit in the buffer while readline() blocks.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            # line is non-empty here, so split() yields at least one token.
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print('  Query (seg):', ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # Strip the vowels so insertVowels has something to restore.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (ins):', ' '.join(ws))
            print('')
            print('  ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (both):', ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # A bigram needs two words; report the problem instead of
                # raising IndexError and ending the whole session.
                print('  bg needs at least two words, got: %d' % len(grams))
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
Exemplo n.º 6
0
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop

    Prompts with '>> ' on stdout, reads one line from stdin and runs the
    requested command.  The loop ends when an empty line (or end of input)
    is read.

    Every print below is a call with a single argument, so the output is
    the same whether it is parsed as a Python 2 print statement or as the
    Python 3 print function.

    Args:
        unigramCost: callable taking one word; used by 'seg' and 'ug', and
            combined with bigramCost for 'both'.
        bigramCost: callable taking (previous word, word); used by 'ins' and
            'bg', and combined with unigramCost for 'both'.
        possibleFills: callable taking a string and returning an iterable of
            candidate words; used by 'ins', 'both' and 'fills'.
        command: when None, the first whitespace-delimited token of each
            line is the command and the remainder is its argument.
            Otherwise every line is passed whole to this fixed command.
    '''

    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so flush it explicitly to make
        # sure it is visible before readline() blocks.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            # line is non-empty here, so split() yields at least one token.
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences'),
                ('ins', 'Insert vowels into words'),
                ('both', 'Joint segment-and-insert'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function'),
                ('bg', 'Query bigram cost function'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print('  Query (seg): ' + ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # Strip the vowels so insertVowels has something to restore.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (ins): ' + ' '.join(ws))
            print('')
            print('  ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (both): ' + ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # A bigram needs two words; report the problem instead of
                # raising IndexError and ending the whole session.
                print('  bg needs at least two words, got: %d' % len(grams))
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command: ' + cmd)

        print('')