예제 #1
0
    def test_get_posterior(self):
        """Check the likelihood and log-posteriors of a split vs. joined phrase.

        The query is the joined phrase ('... motorcycles'); the candidate
        suggestion is the same phrase with its last word split in two
        ('... motor cycles').
        """
        # Priors that are added to the likelihoods to form the log-posteriors.
        prior_joined = -19.400790000000001
        prior_split = -20.64209

        phrase_split = 'picture of state trooper motor cycles'
        phrase_joined = 'picture of state trooper motorcycles'

        query = Suggestion(suggestion_str=phrase_joined)
        suggestion = Suggestion(suggestion_str=phrase_split)

        # Hard-coded expectation. A disabled earlier version derived it as
        # -(edit_distance / len(split phrase)), with edit_distance being
        # phrase.get_edits(...)[0] on the space-stripped phrases plus
        # 2 * phrase.space_edit_cost.
        expected_likelihood = -0.0033333333333333335

        observed_likelihood = phrase.get_likelihood(query, suggestion)
        self.assertAlmostEqual(observed_likelihood, expected_likelihood)

        # Both posteriors are computed before either one is asserted.
        split_likelihood = phrase.get_likelihood(query, suggestion)
        posterior_split = prior_split + split_likelihood

        # Scoring the query against itself gives the joined reading.
        joined_likelihood = phrase.get_likelihood(query, query)
        posterior_joined = prior_joined + joined_likelihood

        self.assertAlmostEqual(posterior_split, -20.645423333333333)
        self.assertAlmostEqual(posterior_joined, -19.400790000000001)
예제 #2
0
    def test_get_posterior(self):
        """Compare log-posteriors of the joined query and its split variant.

        The joined phrase acts as the query and is scored both against the
        split phrase and against itself; each likelihood is added to the
        matching prior and compared with a hard-coded value.
        """
        joined_prior = -19.400790000000001
        split_prior = -20.64209

        split_text = 'picture of state trooper motor cycles'
        joined_text = 'picture of state trooper motorcycles'
        query = Suggestion(suggestion_str=joined_text)
        suggestion = Suggestion(suggestion_str=split_text)

        # The expected likelihood is pinned as a literal. The disabled
        # derivation it replaced was -(edit_distance / len(split_text)), with
        # edit_distance = phrase.get_edits(joined, split)[0] on the phrases
        # without spaces, plus 2 * phrase.space_edit_cost.
        self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                               -0.0033333333333333335)

        # (actual, expected) log-posteriors: split reading first, then joined.
        posterior_checks = [
            (split_prior + phrase.get_likelihood(query, suggestion),
             -20.645423333333333),
            (joined_prior + phrase.get_likelihood(query, query),
             -19.400790000000001),
        ]
        for actual, wanted in posterior_checks:
            self.assertAlmostEqual(actual, wanted)
예제 #3
0
    def test_get_likelihood(self):
        """Compare phrase.get_likelihood with values rebuilt from edit costs.

        Four scenarios are covered: a one-term query against a different
        one-term suggestion, split words joined up, joined words split up,
        and splits combined with character errors.
        """
        # One-term query against a different one-term suggestion; the edit
        # cost is divided by the length of the query's string form.
        query = Suggestion(['foo'])
        suggestion = Suggestion(['bar'])
        term_edits = phrase.get_edits(suggestion[0], query[0])[0]
        expected = phrase.error_penalization * -(term_edits / len(str(query)))
        self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                               expected, 2)

        split_phrase = 'fo o bar roc ks'
        joined_phrase = 'foo bar rocks'
        # Each pair is (query text, suggestion text): first the split words
        # are joined up, then the joined words are split up. The two phrases
        # differ by two spaces, hence 2 * space_edit_cost.
        for query_text, suggestion_text in ((split_phrase, joined_phrase),
                                            (joined_phrase, split_phrase)):
            query = Suggestion(suggestion_str=query_text)
            suggestion = Suggestion(suggestion_str=suggestion_text)
            expected = phrase.error_penalization * -(
                2 * phrase.space_edit_cost / len(query_text))
            self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                                   expected)

        # Splits and character errors together: character edits are counted
        # on the space-free phrases, then the cost of two spaces is added.
        garbled_phrase = 'fo o bcr rxc ks'
        query = Suggestion(suggestion_str=garbled_phrase)
        suggestion = Suggestion(suggestion_str=joined_phrase)
        char_edits = phrase.get_edits(''.join(joined_phrase.split()),
                                      ''.join(garbled_phrase.split()))[0]
        edit_distance = char_edits + 2 * phrase.space_edit_cost
        expected = phrase.error_penalization * -(edit_distance /
                                                 len(garbled_phrase))
        self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                               expected)
예제 #4
0
    def test_get_likelihood(self):
        """Verify phrase.get_likelihood on four query/suggestion pairings.

        Every expected value is phrase.error_penalization multiplied by the
        negated edit cost, where the edit cost is first divided by the
        length of the query text.
        """
        # Case 1: one-term query scored against a different one-term
        # suggestion. Only this case is compared to two decimal places.
        query = Suggestion(['foo'])
        suggestion = Suggestion(['bar'])
        edits = phrase.get_edits(suggestion[0], query[0])[0]
        per_char_cost = edits / len(str(query))
        expected = phrase.error_penalization * -per_char_cost
        actual = phrase.get_likelihood(query, suggestion)
        self.assertAlmostEqual(actual, expected, 2)

        fragmented = 'fo o bar roc ks'
        intact = 'foo bar rocks'

        # Case 2: the query has extra splits and the suggestion joins them up.
        # The phrases differ by two spaces, hence the factor of two.
        query = Suggestion(suggestion_str=fragmented)
        suggestion = Suggestion(suggestion_str=intact)
        per_char_cost = 2 * phrase.space_edit_cost / len(fragmented)
        expected = phrase.error_penalization * -per_char_cost
        actual = phrase.get_likelihood(query, suggestion)
        self.assertAlmostEqual(actual, expected)

        # Case 3: the mirror image, where the suggestion splits up an intact
        # query.
        query = Suggestion(suggestion_str=intact)
        suggestion = Suggestion(suggestion_str=fragmented)
        per_char_cost = 2 * phrase.space_edit_cost / len(intact)
        expected = phrase.error_penalization * -per_char_cost
        actual = phrase.get_likelihood(query, suggestion)
        self.assertAlmostEqual(actual, expected)

        # Case 4: splits plus character errors. Character edits are measured
        # on the phrases with their spaces removed; the cost of the two
        # spaces is then added on top.
        corrupted = 'fo o bcr rxc ks'
        query = Suggestion(suggestion_str=corrupted)
        suggestion = Suggestion(suggestion_str=intact)
        char_edits = phrase.get_edits(intact.replace(' ', ''),
                                      corrupted.replace(' ', ''))[0]
        edit_distance = char_edits + 2 * phrase.space_edit_cost
        per_char_cost = edit_distance / len(corrupted)
        expected = phrase.error_penalization * -per_char_cost
        actual = phrase.get_likelihood(query, suggestion)
        self.assertAlmostEqual(actual, expected)
예제 #5
0
 def test_get_likelihood_splits_special(self):
     """Check likelihoods for one sentence under three tokenisations.

     Every query contains the token 'corses'; the paired suggestion is the
     same token list with 'corses' replaced by a candidate word. The
     likelihood of each pair is compared with a stored value to two
     decimal places.
     """
     # TODO (carried over from the original; no detail was recorded)
     head = ['the', 'departments', 'of', 'the']
     tail = ['conducted', 'by', 'highly', 'qualified', 'staff']

     # The tokens between head and the candidate word: unsplit, with
     # 'institute' split in three, and with 'offer' split in two.
     intact = ['institute', 'offer']
     institute_split = ['ins', 'tit', 'ute', 'offer']
     offer_split = ['institute', 'of', 'fer']

     # (middle tokens, word replacing 'corses', expected likelihood), listed
     # in the order in which the pairs are built and asserted.
     cases = [
         (intact, 'horses', 0.0),
         (intact, 'courses', -0.0092284029735965143),
         (institute_split, 'horses', 0.0),
         (offer_split, 'horses', 0.0),
         (institute_split, 'cores', -0.0015698587127158554),
         (intact, 'cores', -0.0015887726731100226),
         (offer_split, 'cores', -0.0015887726731100226),
         (institute_split, 'cortes', -0.007872379721119217),
         (intact, 'cortes', -0.0079672276695664374),
         (offer_split, 'cortes', -0.0079672276695664374),
         (institute_split, 'courses', -0.0091185410334346517),
         (offer_split, 'courses', -0.0092284029735965143),
         (institute_split, 'torses', -0.010714285714285714),
         (intact, 'torses', -0.010843373493975905),
         (offer_split, 'torses', -0.010843373493975905),
         (institute_split, 'curses', -0.010989010989010988),
         (intact, 'curses', -0.011121408711770158),
         (offer_split, 'curses', -0.011121408711770158),
         (institute_split, 'corsets', -0.012605042016806723),
         (intact, 'corsets', -0.012756909992912829),
         (offer_split, 'corsets', -0.012756909992912829),
         (institute_split, 'corset', -0.013632791245994799),
         (intact, 'corset', -0.013797041742934496),
         (offer_split, 'corset', -0.013797041742934496),
         (institute_split, 'copses', -0.030612244897959183),
         (intact, 'copses', -0.030981067125645436),
         (offer_split, 'copses', -0.030981067125645436),
         (institute_split, 'corpses', -0.034985422740524783),
         (intact, 'corpses', -0.035406933857880504),
         (offer_split, 'corpses', -0.035406933857880504),
     ]

     # Build every (query, suggestion, likelihood) triple up front; each
     # Suggestion receives its own freshly concatenated token list.
     suggestion_likelihood_list = [
         (Suggestion(head + middle + ['corses'] + tail),
          Suggestion(head + middle + [candidate] + tail),
          expected)
         for middle, candidate, expected in cases
     ]

     for query, suggestion, likelihood in suggestion_likelihood_list:
         self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                                likelihood, 2)
예제 #6
0
 def test_get_likelihood_splits_special(self):
     """Compare stored likelihoods for candidate replacements of 'corses'.

     The same sentence is tokenised in three ways (unsplit, 'institute'
     split into 'ins tit ute', 'offer' split into 'of fer'). For each
     listed case the query holds 'corses' and the suggestion holds the
     candidate word in its place; results are checked to two decimal
     places.
     """
     # TODO (carried over from the original; no detail was recorded)
     def sentence(middle, word):
         # Assemble the token list for one tokenisation and one word.
         tokens = 'the departments of the'.split() + middle.split()
         tokens.append(word)
         tokens.extend('conducted by highly qualified staff'.split())
         return Suggestion(tokens)

     whole = 'institute offer'
     inst = 'ins tit ute offer'
     off = 'institute of fer'

     # (tokenisation, candidate word, expected likelihood) in assertion
     # order.
     expectations = (
         (whole, 'horses', 0.0),
         (whole, 'courses', -0.0092284029735965143),
         (inst, 'horses', 0.0),
         (off, 'horses', 0.0),
         (inst, 'cores', -0.0015698587127158554),
         (whole, 'cores', -0.0015887726731100226),
         (off, 'cores', -0.0015887726731100226),
         (inst, 'cortes', -0.007872379721119217),
         (whole, 'cortes', -0.0079672276695664374),
         (off, 'cortes', -0.0079672276695664374),
         (inst, 'courses', -0.0091185410334346517),
         (off, 'courses', -0.0092284029735965143),
         (inst, 'torses', -0.010714285714285714),
         (whole, 'torses', -0.010843373493975905),
         (off, 'torses', -0.010843373493975905),
         (inst, 'curses', -0.010989010989010988),
         (whole, 'curses', -0.011121408711770158),
         (off, 'curses', -0.011121408711770158),
         (inst, 'corsets', -0.012605042016806723),
         (whole, 'corsets', -0.012756909992912829),
         (off, 'corsets', -0.012756909992912829),
         (inst, 'corset', -0.013632791245994799),
         (whole, 'corset', -0.013797041742934496),
         (off, 'corset', -0.013797041742934496),
         (inst, 'copses', -0.030612244897959183),
         (whole, 'copses', -0.030981067125645436),
         (off, 'copses', -0.030981067125645436),
         (inst, 'corpses', -0.034985422740524783),
         (whole, 'corpses', -0.035406933857880504),
         (off, 'corpses', -0.035406933857880504),
     )

     # All Suggestion objects are created before the first assertion runs.
     suggestion_likelihood_list = []
     for middle, word, value in expectations:
         suggestion_likelihood_list.append(
             (sentence(middle, 'corses'), sentence(middle, word), value))

     for query, suggestion, likelihood in suggestion_likelihood_list:
         self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                                likelihood, 2)
    def generate_suggestions_and_posteriors(self, query,
                                            get_posterior_fn=None):
        """Return (suggestion, posterior) pairs for query.

        Candidate suggestions are generated for the query and for each of
        its corrected split variants, ranked by likelihood, de-duplicated,
        limited to the best self.MAX_NUM_SUGGESTIONS and finally scored
        with get_posterior_fn.

        Arguments:
        - `query`: Suggestion object.
        - `get_posterior_fn`: callable invoked as
          get_posterior_fn(suggestion, query); self.get_posterior_fn is
          used when this is None.

        Returns:
        A list of (suggestion, normalized posterior) tuples sorted by
        decreasing posterior; an empty list when no candidate suggestion
        was generated.
        """
        if get_posterior_fn is None:
            get_posterior_fn = self.get_posterior_fn

        # Only split corrections are considered at the moment; run-on
        # corrections (utils.get_corrected_run_on_queries) are disabled.
        all_queries = [query] + utils.get_corrected_split_queries(
            query, self.lexicon)

        # Collect every (source query, candidate suggestion) pair. A
        # distinct loop name keeps the `query` argument from being rebound
        # (Python 2 list comprehensions leak their loop variable).
        candidate_pairs = []
        for source_query in all_queries:
            term_candidates = [self.generate_candidate_terms(term)
                               for term in source_query]
            for suggestion in self.generate_candidate_suggestions(
                    term_candidates, source_query.suggestion_type):
                candidate_pairs.append((source_query, suggestion))

        # Most likely pairs first; the sort is stable, so ties keep their
        # generation order.
        ranked_pairs = sorted(
            candidate_pairs,
            key=lambda pair: phrase.get_likelihood(*pair),
            reverse=True)

        def iter_unique(pairs):
            # Yield each pair the first time it is seen. Equality is used
            # instead of hashing because nothing here shows that Suggestion
            # is hashable. Unlike itertools.groupby, this also removes
            # duplicates that are not adjacent after the likelihood sort.
            seen = []
            for pair in pairs:
                if pair not in seen:
                    seen.append(pair)
                    yield pair

        # Keep only the top few unique pairs; islice stops the
        # de-duplication as soon as enough pairs have been collected.
        top_pairs = list(itertools.islice(iter_unique(ranked_pairs),
                                          self.MAX_NUM_SUGGESTIONS))
        if not top_pairs:
            # Nothing to score; the previous zip(...)[1] lookup raised
            # IndexError in this situation.
            return []

        all_posteriors = [get_posterior_fn(suggestion, source_query)
                          for source_query, suggestion in top_pairs]
        all_suggestions = [suggestion for _, suggestion in top_pairs]

        # TODO: optionally append the original query (with
        # get_posterior_fn(query, query)) when its posterior exceeds
        # self.ORIGINAL_POSTERIOR_THRESHOLD.

        normalized_posteriors = utils.get_normalized_probabilities(
            all_posteriors)
        return sorted(zip(all_suggestions, normalized_posteriors),
                      key=lambda pair: pair[1],
                      reverse=True)
예제 #8
0
from datetime import datetime
import edit_distance_calculator
import itertools
import lexicon
import math
from memoize import save_memoize_table
from suggestion import Suggestion
import phrase
import spell_checker
import sys
import utils

# Fixed placeholder probabilities for runs without a real language model.
dummy_posterior = 1 / 3.0
dummy_prior = 1 / 3.0


def dummy_posterior_fn(suggestion, query):
    """Return a posterior built from the constant dummy_prior.

    The value is exp(log(dummy_prior) + phrase.get_likelihood(query,
    suggestion)). Note the argument order: this function takes
    (suggestion, query) while phrase.get_likelihood is called with
    (query, suggestion).
    """
    return math.exp(math.log(dummy_prior)
                    + phrase.get_likelihood(query, suggestion))


# NOTE: this rebinds the name `lexicon` from the imported module to a
# Lexicon instance; the module is no longer reachable under that name
# after this line.
lexicon = lexicon.Lexicon()

def get_inputs(test_label, filename = '../data/words.input'):
    """Return list of input queries read from filename.

    Lowercase all the words.
    If a query is a sentence, remove the period at the end.
    """
    f = open(filename, 'r')
    query_list = [Suggestion(suggestion_str = line.strip(), 
                             suggestion_type = test_label[:-1]) 
                             for line in f]
    f.close()
    
    print 'query_list', query_list