def test_zero_weight(self):
    """A feature carrying no weight entry must not affect the score: with a
    single label the log probability is exactly log(1) == 0."""
    label_weights = CounterMap()
    label_weights['dog'] = Counter({'warm': 2.0})
    known_labels = set(label_weights.iterkeys())
    log_probs = maxent.get_log_probabilities(self.features, label_weights, known_labels)
    self.assertEqual(log_probs['dog'], 0.0)
def test_zero_weight(self):
    """With only one label, a feature missing from the weights leaves the
    (trivial) distribution untouched, so log P(dog) must be 0.0."""
    weight_map = CounterMap()
    weight_map['dog'] = Counter({'warm': 2.0})
    all_labels = set(weight_map.iterkeys())
    result = maxent.get_log_probabilities(self.features, weight_map, all_labels)
    self.assertEqual(result['dog'], 0.0)
def test_extraneous_label(self):
    """Asking for a label that never appeared in the weights must come back
    with probability zero, i.e. a log probability of -inf."""
    label_weights = CounterMap()
    label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
    known_labels = set(label_weights.iterkeys())
    log_probs = maxent.get_log_probabilities(self.features, label_weights, known_labels)
    self.assertEqual(log_probs['cat'], float('-inf'))
def test_extraneous_label(self):
    """A label outside the trained weight set gets zero probability mass;
    its log probability is therefore -inf."""
    weight_map = CounterMap()
    weight_map['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
    all_labels = set(weight_map.iterkeys())
    result = maxent.get_log_probabilities(self.features, weight_map, all_labels)
    self.assertEqual(result['cat'], float('-inf'))
def test_uneven_weights(self):
    """The returned log probs must equal a hand-built, log-normalized
    score distribution computed as dot(weights[label], features)."""
    label_weights = CounterMap()
    label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
    label_weights['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
    known_labels = set(label_weights.iterkeys())
    log_probs = maxent.get_log_probabilities(self.features, label_weights, known_labels)

    # Build the expected distribution explicitly.
    expected = Counter()
    expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
    expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
    expected.log_normalize()

    # Sanity-check the hand-built scores before comparing.
    self.assertAlmostEqual(expected['dog'], log(0.731), 3)
    self.assertAlmostEqual(expected['cat'], log(0.269), 3)

    # The implementation under test must match exactly.
    self.assertEqual(log_probs['dog'], expected['dog'])
    self.assertEqual(log_probs['cat'], expected['cat'])
def test_uneven_weights(self):
    """Unequal weights must produce the same distribution as manually
    computed, log-normalized per-label scores."""
    weight_map = CounterMap()
    weight_map['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
    weight_map['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
    all_labels = set(weight_map.iterkeys())
    result = maxent.get_log_probabilities(self.features, weight_map, all_labels)

    # Reference scores: dot product of weights with the unit features.
    reference = Counter()
    reference['dog'] = 2.0 * 1.0 + 1.0 * 1.0
    reference['cat'] = 1.0 * 1.0 + 1.0 * 1.0
    reference.log_normalize()

    # Confirm the reference distribution itself is what we expect.
    self.assertAlmostEqual(reference['dog'], log(0.731), 3)
    self.assertAlmostEqual(reference['cat'], log(0.269), 3)

    # And that the fast implementation matches it exactly.
    self.assertEqual(result['dog'], reference['dog'])
    self.assertEqual(result['cat'], reference['cat'])
class NaiveBayesClassifier:
    """Naive Bayes classifier over character trigram features.

    train() estimates a smoothed distribution P(label | feature) for every
    trigram seen in the data and stores it in log space; labeling then sums
    the per-feature log distributions over a datum's trigrams.
    """

    def train(self, labeled_data):
        """Estimate smoothed per-feature label distributions.

        labeled_data -- iterable of (label, datum) pairs; each datum is
        decomposed into trigrams by ngrams(datum, 3).
        """
        self.feature_distribution = CounterMap()

        for label, datum in labeled_data:
            # BUG FIX: the original was missing the ':' after this 'for'
            # statement, which is a syntax error.
            for feature in ngrams(datum, 3):
                self.feature_distribution[feature][label] += 1
        # NOTE(review): the original also built a 'labels' set here that was
        # never read; it has been dropped.

        # Smooth unseen (feature, label) pairs with a small default count,
        # then normalize and move everything into log space.
        for feature in self.feature_distribution.iterkeys():
            self.feature_distribution[feature].default = 0.01

        self.feature_distribution.normalize()
        self.feature_distribution.log()

    def _summed_log_distribution(self, datum):
        # Sum the per-feature log distributions over the datum's trigrams.
        # Returns None when the datum yields no trigrams (matches original
        # behavior, where callers would then fail on the None).
        distribution = None
        for feature in ngrams(datum, 3):
            if distribution:
                distribution += self.feature_distribution[feature]
            else:
                # Copy so the accumulation never mutates the stored model.
                distribution = copy(self.feature_distribution[feature])
        return distribution

    def label_distribution(self, datum):
        """Return the log-normalized distribution over labels for datum."""
        distribution = self._summed_log_distribution(datum)
        distribution.log_normalize()
        return distribution

    def label(self, datum):
        """Return the single most likely label for datum."""
        return self._summed_log_distribution(datum).arg_max()
class MaximumEntropyLogProbsTest(unittest.TestCase):
    """Exercise maxent.get_log_probabilities (the fast C implementation)
    against hand-computed values and the slow pure-python reference."""

    def setUp(self):
        # Two unit-valued features and symmetric dog/cat weights, so the
        # resulting distribution is exactly 50/50.
        self.features = Counter((key, 1.0) for key in ['warm', 'fuzzy'])
        self.weights = CounterMap()
        self.weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        self.weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})
        self.labels = set(self.weights.iterkeys())
        self.logp = maxent.get_log_probabilities(self.features, self.weights, self.labels)

    def test_fast_slow_equal(self):
        """The C implementation must agree with the python reference."""
        reference = maximumentropy.slow_log_probs(self.features, self.weights, self.labels)
        self.assertEqual(self.logp, reference)

    def test_logp_is_probability_distribution(self):
        """Every log prob is <= 0 and the probabilities sum to 1.0."""
        self.assertTrue(max(self.logp.itervalues()) <= 0.0)
        total_mass = sum(exp(val) for val in self.logp.itervalues())
        self.assertAlmostEqual(total_mass, 1.0)

    def test_basic_values(self):
        """Symmetric weights yield a 50/50 split between the two labels."""
        self.assertAlmostEqual(exp(self.logp['cat']), 0.5)
        self.assertAlmostEqual(exp(self.logp['dog']), 0.5)

    def test_single_label(self):
        """A lone label takes all the probability mass: log P == 0.0."""
        label_weights = CounterMap()
        label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        known_labels = set(label_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, label_weights, known_labels)
        self.assertEqual(log_probs['dog'], 0.0)

    def test_extraneous_label(self):
        """Labels absent from the weights get zero probability (-inf log)."""
        label_weights = CounterMap()
        label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        known_labels = set(label_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, label_weights, known_labels)
        self.assertEqual(log_probs['cat'], float('-inf'))

    def test_zero_weight(self):
        """A feature with no weight entry contributes nothing to the score."""
        label_weights = CounterMap()
        label_weights['dog'] = Counter({'warm': 2.0})
        known_labels = set(label_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, label_weights, known_labels)
        self.assertEqual(log_probs['dog'], 0.0)

    def test_uneven_weights(self):
        """Result must equal an explicitly constructed score distribution."""
        label_weights = CounterMap()
        label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
        label_weights['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
        known_labels = set(label_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, label_weights, known_labels)

        # Hand-built scores: dot(weights[label], features), log-normalized.
        expected = Counter()
        expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
        expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
        expected.log_normalize()

        # Sanity-check the hand-built distribution first.
        self.assertAlmostEqual(expected['dog'], log(0.731), 3)
        self.assertAlmostEqual(expected['cat'], log(0.269), 3)

        self.assertEqual(log_probs['dog'], expected['dog'])
        self.assertEqual(log_probs['cat'], expected['cat'])

    def test_performance(self):
        """The C API should beat the python API (potentially flaky under
        varying system load)."""
        start = time.time()
        for i in xrange(100000):
            test = maximumentropy.slow_log_probs(self.features, self.weights, self.labels)
        slow_time = time.time() - start

        start = time.time()
        for i in xrange(100000):
            test = maxent.get_log_probabilities(self.features, self.weights, self.labels)
        fast_time = time.time() - start

        self.assertTrue(fast_time < slow_time)
class MaximumEntropyLogProbsTest(unittest.TestCase):
    """Unit tests for the fast maxent log-probability routine: checks the
    distribution properties, known values, edge cases, and that it both
    matches and outperforms the slow pure-python implementation."""

    def setUp(self):
        # Unit features plus mirror-image weights => an even distribution.
        self.features = Counter((key, 1.0) for key in ['warm', 'fuzzy'])
        self.weights = CounterMap()
        self.weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        self.weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})
        self.labels = set(self.weights.iterkeys())
        self.logp = maxent.get_log_probabilities(self.features, self.weights, self.labels)

    def test_fast_slow_equal(self):
        """Fast and slow implementations produce identical output."""
        slow_result = maximumentropy.slow_log_probs(self.features, self.weights, self.labels)
        self.assertEqual(self.logp, slow_result)

    def test_logp_is_probability_distribution(self):
        """All log probs are non-positive and exponentiate to total 1.0."""
        self.assertTrue(max(self.logp.itervalues()) <= 0.0)
        probability_sum = sum(exp(val) for val in self.logp.itervalues())
        self.assertAlmostEqual(probability_sum, 1.0)

    def test_basic_values(self):
        """The symmetric setup must split probability evenly."""
        self.assertAlmostEqual(exp(self.logp['cat']), 0.5)
        self.assertAlmostEqual(exp(self.logp['dog']), 0.5)

    def test_single_label(self):
        """One label means certainty: its log probability is 0.0."""
        weight_map = CounterMap()
        weight_map['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        all_labels = set(weight_map.iterkeys())
        result = maxent.get_log_probabilities(self.features, weight_map, all_labels)
        self.assertEqual(result['dog'], 0.0)

    def test_extraneous_label(self):
        """An unknown label must report log probability -inf."""
        weight_map = CounterMap()
        weight_map['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        all_labels = set(weight_map.iterkeys())
        result = maxent.get_log_probabilities(self.features, weight_map, all_labels)
        self.assertEqual(result['cat'], float('-inf'))

    def test_zero_weight(self):
        """Features without weight entries leave the score unchanged."""
        weight_map = CounterMap()
        weight_map['dog'] = Counter({'warm': 2.0})
        all_labels = set(weight_map.iterkeys())
        result = maxent.get_log_probabilities(self.features, weight_map, all_labels)
        self.assertEqual(result['dog'], 0.0)

    def test_uneven_weights(self):
        """Asymmetric weights reproduce the manually computed distribution."""
        weight_map = CounterMap()
        weight_map['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
        weight_map['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
        all_labels = set(weight_map.iterkeys())
        result = maxent.get_log_probabilities(self.features, weight_map, all_labels)

        # Reference scores computed by hand and log-normalized.
        reference = Counter()
        reference['dog'] = 2.0 * 1.0 + 1.0 * 1.0
        reference['cat'] = 1.0 * 1.0 + 1.0 * 1.0
        reference.log_normalize()

        # Verify the reference distribution itself.
        self.assertAlmostEqual(reference['dog'], log(0.731), 3)
        self.assertAlmostEqual(reference['cat'], log(0.269), 3)

        # The implementation under test must match exactly.
        self.assertEqual(result['dog'], reference['dog'])
        self.assertEqual(result['cat'], reference['cat'])

    def test_performance(self):
        """The C implementation should be faster than the python one
        (potentially flaky, depending on system load patterns)."""
        start = time.time()
        for i in xrange(100000):
            test = maximumentropy.slow_log_probs(self.features, self.weights, self.labels)
        slow_time = time.time() - start

        start = time.time()
        for i in xrange(100000):
            test = maxent.get_log_probabilities(self.features, self.weights, self.labels)
        fast_time = time.time() - start

        self.assertTrue(fast_time < slow_time)