예제 #1
0
    def test_zero_weight(self):
        """A feature with no weight contributes nothing; a lone label
        must still receive probability 1 (log prob 0)."""
        w = CounterMap()
        w['dog'] = Counter({'warm': 2.0})
        log_probs = maxent.get_log_probabilities(
            self.features, w, set(w.iterkeys()))

        self.assertEqual(log_probs['dog'], 0.0)
예제 #2
0
	def test_zero_weight(self):
		"""A feature with no weight contributes nothing; a lone label
		must still receive probability 1 (log prob 0)."""
		w = CounterMap()
		w['dog'] = Counter({'warm' : 2.0})
		log_probs = maxent.get_log_probabilities(self.features, w, set(w.iterkeys()))

		self.assertEqual(log_probs['dog'], 0.0)
예제 #3
0
    def test_extraneous_label(self):
        """A label absent from the weight map must get zero probability
        (log prob of -inf)."""
        w = CounterMap()
        w['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        log_probs = maxent.get_log_probabilities(
            self.features, w, set(w.iterkeys()))

        self.assertEqual(log_probs['cat'], float('-inf'))
예제 #4
0
	def test_extraneous_label(self):
		"""A label absent from the weight map must get zero probability
		(log prob of -inf)."""
		w = CounterMap()
		w['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		log_probs = maxent.get_log_probabilities(self.features, w, set(w.iterkeys()))

		self.assertEqual(log_probs['cat'], float('-inf'))
예제 #5
0
    def test_uneven_weights(self):
        """Asymmetric weights: compare against a hand-normalized score table."""
        w = CounterMap()
        w['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
        w['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
        log_probs = maxent.get_log_probabilities(
            self.features, w, set(w.iterkeys()))

        # Hand-build the expected unnormalized scores (weight . feature dot
        # products), then log-normalize them.
        expected = Counter()
        expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
        expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
        expected.log_normalize()

        # Sanity-check the hand computation itself first.
        self.assertAlmostEqual(expected['dog'], log(0.731), 3)
        self.assertAlmostEqual(expected['cat'], log(0.269), 3)

        # ...then compare the implementation against it.
        self.assertEqual(log_probs['dog'], expected['dog'])
        self.assertEqual(log_probs['cat'], expected['cat'])
예제 #6
0
	def test_uneven_weights(self):
		"""Asymmetric weights: compare against a hand-normalized score table."""
		w = CounterMap()
		w['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 1.0})
		w['cat'] = Counter({'warm' : 1.0, 'fuzzy' : 1.0})
		log_probs = maxent.get_log_probabilities(self.features, w, set(w.iterkeys()))

		# Hand-build the expected unnormalized scores (weight . feature dot
		# products), then log-normalize them.
		expected = Counter()
		expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
		expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
		expected.log_normalize()

		# Sanity-check the hand computation itself first.
		self.assertAlmostEqual(expected['dog'], log(0.731), 3)
		self.assertAlmostEqual(expected['cat'], log(0.269), 3)

		# ...then compare the implementation against it.
		self.assertEqual(log_probs['dog'], expected['dog'])
		self.assertEqual(log_probs['cat'], expected['cat'])
예제 #7
0
class NaiveBayesClassifier:
	"""Character-trigram naive Bayes classifier.

	train() estimates per-feature label distributions from labeled data;
	label() / label_distribution() score a new datum by summing the
	per-trigram log distributions.
	"""

	def train(self, labeled_data):
		"""Estimate smoothed, normalized log feature distributions.

		labeled_data -- iterable of (label, datum) pairs; each datum is
		decomposed into character trigrams via ngrams(datum, 3).
		"""
		self.feature_distribution = CounterMap()
		labels = set()

		for label, datum in labeled_data:
			labels.add(label)
			# BUG FIX: the original was missing the colon here (SyntaxError).
			for feature in ngrams(datum, 3):
				self.feature_distribution[feature][label] += 1

		# Smooth: unseen (feature, label) pairs fall back to a small count
		# instead of zero, so a single unseen pair can't veto a label.
		for feature in self.feature_distribution.iterkeys():
			self.feature_distribution[feature].default = 0.01

		self.feature_distribution.normalize()
		self.feature_distribution.log()

	def _accumulate(self, datum):
		# Sum the per-trigram log label distributions for datum; returns
		# None when datum yields no trigrams.
		distribution = None

		for feature in ngrams(datum, 3):
			# BUG FIX: the original tested `if distribution:` — a falsy
			# (e.g. all-zero) counter would silently discard everything
			# accumulated so far. Only None means "not started yet".
			if distribution is None:
				distribution = copy(self.feature_distribution[feature])
			else:
				distribution += self.feature_distribution[feature]

		return distribution

	def label_distribution(self, datum):
		"""Return the log-normalized label distribution for datum."""
		distribution = self._accumulate(datum)
		distribution.log_normalize()

		return distribution

	def label(self, datum):
		"""Return the single most likely label for datum."""
		return self._accumulate(datum).arg_max()
예제 #8
0
class MaximumEntropyLogProbsTest(unittest.TestCase):
    """Exercises maxent.get_log_probabilities against the pure-python
    reference implementation and against hand-computed values."""

    def setUp(self):
        # Two always-on features shared by every test case.
        self.features = Counter((key, 1.0) for key in ['warm', 'fuzzy'])

        self.weights = CounterMap()
        self.weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        self.weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})

        self.labels = set(self.weights.iterkeys())
        self.logp = maxent.get_log_probabilities(
            self.features, self.weights, self.labels)

    def test_fast_slow_equal(self):
        """The C implementation must agree exactly with the reference."""
        reference = maximumentropy.slow_log_probs(
            self.features, self.weights, self.labels)

        self.assertEqual(self.logp, reference)

    def test_logp_is_probability_distribution(self):
        """No log prob may exceed 0, and probabilities must sum to 1."""
        self.assertTrue(max(self.logp.itervalues()) <= 0.0)
        total = sum(exp(val) for val in self.logp.itervalues())
        self.assertAlmostEqual(total, 1.0)

    def test_basic_values(self):
        """Mirror-image weights should split probability evenly."""
        self.assertAlmostEqual(exp(self.logp['cat']), 0.5)
        self.assertAlmostEqual(exp(self.logp['dog']), 0.5)

    def test_single_label(self):
        """With one label, it must take all the probability mass."""
        w = CounterMap()
        w['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        log_probs = maxent.get_log_probabilities(
            self.features, w, set(w.iterkeys()))

        self.assertEqual(log_probs['dog'], 0.0)

    def test_extraneous_label(self):
        """A label missing from the weights gets log prob of -inf."""
        w = CounterMap()
        w['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        log_probs = maxent.get_log_probabilities(
            self.features, w, set(w.iterkeys()))

        self.assertEqual(log_probs['cat'], float('-inf'))

    def test_zero_weight(self):
        """A feature with no weight contributes nothing to the score."""
        w = CounterMap()
        w['dog'] = Counter({'warm': 2.0})
        log_probs = maxent.get_log_probabilities(
            self.features, w, set(w.iterkeys()))

        self.assertEqual(log_probs['dog'], 0.0)

    def test_uneven_weights(self):
        """Asymmetric weights: compare against hand-normalized scores."""
        w = CounterMap()
        w['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
        w['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
        log_probs = maxent.get_log_probabilities(
            self.features, w, set(w.iterkeys()))

        # Hand-build the expected unnormalized scores, then normalize.
        expected = Counter()
        expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
        expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
        expected.log_normalize()

        # Sanity-check the hand computation itself first.
        self.assertAlmostEqual(expected['dog'], log(0.731), 3)
        self.assertAlmostEqual(expected['cat'], log(0.269), 3)

        # ...then compare the implementation against it.
        self.assertEqual(log_probs['dog'], expected['dog'])
        self.assertEqual(log_probs['cat'], expected['cat'])

    def test_performance(self):
        """The C api should beat the python reference (potentially flaky,
        depending on system load patterns)."""
        slow_start = time.time()
        for _ in xrange(100000):
            maximumentropy.slow_log_probs(
                self.features, self.weights, self.labels)
        slow_elapsed = time.time() - slow_start

        fast_start = time.time()
        for _ in xrange(100000):
            maxent.get_log_probabilities(
                self.features, self.weights, self.labels)
        fast_elapsed = time.time() - fast_start

        self.assertTrue(fast_elapsed < slow_elapsed)
예제 #9
0
class MaximumEntropyLogProbsTest(unittest.TestCase):
	"""Exercises maxent.get_log_probabilities against the pure-python
	reference implementation and against hand-computed values."""

	def setUp(self):
		# Two always-on features shared by every test case.
		self.features = Counter((key, 1.0) for key in ['warm', 'fuzzy'])

		self.weights = CounterMap()
		self.weights['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		self.weights['cat'] = Counter({'warm' : 0.5, 'fuzzy' : 2.0})

		self.labels = set(self.weights.iterkeys())
		self.logp = maxent.get_log_probabilities(self.features, self.weights, self.labels)

	def test_fast_slow_equal(self):
		"""The C implementation must agree exactly with the reference."""
		reference = maximumentropy.slow_log_probs(self.features, self.weights, self.labels)

		self.assertEqual(self.logp, reference)

	def test_logp_is_probability_distribution(self):
		"""No log prob may exceed 0, and probabilities must sum to 1."""
		self.assertTrue(max(self.logp.itervalues()) <= 0.0)
		total = sum(exp(val) for val in self.logp.itervalues())
		self.assertAlmostEqual(total, 1.0)

	def test_basic_values(self):
		"""Mirror-image weights should split probability evenly."""
		self.assertAlmostEqual(exp(self.logp['cat']), 0.5)
		self.assertAlmostEqual(exp(self.logp['dog']), 0.5)

	def test_single_label(self):
		"""With one label, it must take all the probability mass."""
		w = CounterMap()
		w['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		log_probs = maxent.get_log_probabilities(self.features, w, set(w.iterkeys()))

		self.assertEqual(log_probs['dog'], 0.0)

	def test_extraneous_label(self):
		"""A label missing from the weights gets log prob of -inf."""
		w = CounterMap()
		w['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		log_probs = maxent.get_log_probabilities(self.features, w, set(w.iterkeys()))

		self.assertEqual(log_probs['cat'], float('-inf'))

	def test_zero_weight(self):
		"""A feature with no weight contributes nothing to the score."""
		w = CounterMap()
		w['dog'] = Counter({'warm' : 2.0})
		log_probs = maxent.get_log_probabilities(self.features, w, set(w.iterkeys()))

		self.assertEqual(log_probs['dog'], 0.0)

	def test_uneven_weights(self):
		"""Asymmetric weights: compare against hand-normalized scores."""
		w = CounterMap()
		w['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 1.0})
		w['cat'] = Counter({'warm' : 1.0, 'fuzzy' : 1.0})
		log_probs = maxent.get_log_probabilities(self.features, w, set(w.iterkeys()))

		# Hand-build the expected unnormalized scores, then normalize.
		expected = Counter()
		expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
		expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
		expected.log_normalize()

		# Sanity-check the hand computation itself first.
		self.assertAlmostEqual(expected['dog'], log(0.731), 3)
		self.assertAlmostEqual(expected['cat'], log(0.269), 3)

		# ...then compare the implementation against it.
		self.assertEqual(log_probs['dog'], expected['dog'])
		self.assertEqual(log_probs['cat'], expected['cat'])

	def test_performance(self):
		"""The C api should beat the python reference (potentially flaky,
		depending on system load patterns)."""
		slow_start = time.time()
		for _ in xrange(100000):
			maximumentropy.slow_log_probs(self.features, self.weights, self.labels)
		slow_elapsed = time.time() - slow_start

		fast_start = time.time()
		for _ in xrange(100000):
			maxent.get_log_probabilities(self.features, self.weights, self.labels)
		fast_elapsed = time.time() - fast_start

		self.assertTrue(fast_elapsed < slow_elapsed)