Пример #1
0
 def test_1(self):
     """1b-1-basic:  simple test case using unigram cost from the corpus

     Each input is segmented with ``self.unigramCost`` and the result is
     compared against the expected space-separated string.
     """
     expectations = (
         ('word', 'word'),
         ('two words', 'twowords'),
         ('and three words', 'andthreewords'),
     )
     for wanted, query in expectations:
         self.assertEqual(
             wanted, submission.segmentWords(query, self.unigramCost))
Пример #2
0
  def test_0(self):
    """1b-0-basic:  simple test case using hand-picked unigram costs.

    A local cost function makes five known words cheap and everything else
    expensive; each input is then segmented and checked against the
    expected space-separated string.
    """
    cheapWords = ['and', 'two', 'three', 'word', 'words']

    def handPickedCost(x):
      # Known words cost 1.0; any other string is heavily penalised.
      return 1.0 if x in cheapWords else 1000.0

    expectations = (
      ('', ''),
      ('word', 'word'),
      ('two words', 'twowords'),
      ('and three words', 'andthreewords'),
    )
    for wanted, query in expectations:
      self.assertEqual(wanted, submission.segmentWords(query, handPickedCost))
Пример #3
0
    def test_2(self):
        """1b-2-hidden:

        Runs ``submission.segmentWords`` with ``self.unigramCost`` on a
        series of inputs.  The results are not compared against anything
        inside this method.
        """
        queries = (
            # Word seen in corpus
            'pizza',
            # Even long unseen words are preferred to their arbitrary
            # segmentations
            'qqqqq',
            'z' * 100,
            # But 'a' is a word
            'aa',
            # With an apparent crossing point at length 6->7
            'aaaaaa',
            'aaaaaaa',
        )
        for query in queries:
            submission.segmentWords(query, self.unigramCost)
Пример #4
0
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop

    Reads lines from standard input and dispatches each one to a command
    until an empty line (or end of input) is read.

    unigramCost:   called directly by ``ug`` and passed on by ``seg`` and
                   ``both``.
    bigramCost:    called directly by ``bg`` and passed on by ``ins`` and
                   ``both``.
    possibleFills: called directly by ``fills`` and passed on by ``ins`` and
                   ``both``.
    command:       when None, the first whitespace-separated token of each
                   line is the command and the remainder is its argument;
                   otherwise this value is used as the command for every
                   line and the whole line is the argument.
    '''

    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so flush it explicitly;
        # otherwise it may sit in the buffer while we block on the read.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            # `line` is non-empty after strip(), so there is always a token.
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print('  Query (seg):', ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # Strip the vowels so insertVowels has something to restore.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (ins):', ' '.join(ws))
            print('')
            print('  ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (both):', ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and end the
                # session; report the problem and keep the loop running.
                print('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
Пример #5
0
import grader
import submission

#_realUnigramCost, _realBigramCost = wordsegUtil.makeLanguageModels('toy-will.txt')


def unigramCost(x):
    """Return a toy unigram cost: 1.0 for a listed word, 1000.0 otherwise."""
    known = ('and', 'two', 'three', 'word', 'words', 'there', 'the', 're')
    return 1.0 if x in known else 1000.0


def equal(a, b):
    """Check that ``a`` and ``b`` compare equal.

    Returns None when they are equal.

    Raises:
        AssertionError: if ``a != b``; the message shows both values.
    """
    if a != b:
        # Raising a plain string is itself a TypeError, which hid the real
        # mismatch.  Raise a proper exception, and format with repr so that
        # non-string values can be reported as well.
        raise AssertionError("bad %r %r" % (a, b))


# Each pair is (expected segmentation, unsegmented input); the checks run in
# order and stop at the first mismatch reported by equal().
for _expected, _text in (
    ('', ''),
    ('word', 'word'),
    ('two words', 'twowords'),
    ('and three words', 'andthreewords'),
    ('there', 'there'),
    ('there word', 'thereword'),
    ('garbage', 'garbage'),
):
    equal(_expected, submission.segmentWords(_text, unigramCost))
Пример #6
0
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop

    Reads lines from standard input and dispatches each one to a command
    until an empty line (or end of input) is read.

    unigramCost:   called directly by ``ug`` and passed on by ``seg`` and
                   ``both``.
    bigramCost:    called directly by ``bg`` and passed on by ``ins`` and
                   ``both``.
    possibleFills: called directly by ``fills`` and passed on by ``ins`` and
                   ``both``.
    command:       when None, the first whitespace-separated token of each
                   line is the command and the remainder is its argument;
                   otherwise this value is used as the command for every
                   line and the whole line is the argument.

    Every print below takes exactly one parenthesised argument, so the same
    text is produced whether ``print`` is a statement or a function.
    '''

    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so flush it explicitly;
        # otherwise it may sit in the buffer while we block on the read.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            # `line` is non-empty after strip(), so there is always a token.
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences'),
                ('ins', 'Insert vowels into words'),
                ('both', 'Joint segment-and-insert'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function'),
                ('bg', 'Query bigram cost function'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print('  Query (seg): ' + ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # Strip the vowels so insertVowels has something to restore.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (ins): ' + ' '.join(ws))
            print('')
            print('  ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (both): ' + ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and end the
                # session; report the problem and keep the loop running.
                print('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command: ' + cmd)

        print('')