def test_zero_weight(self):
    """With 'dog' as the only label (weighted on 'warm' alone), its log probability is 0.0."""
    dog_weights = CounterMap()
    dog_weights['dog'] = Counter({'warm': 2.0})
    known_labels = set(dog_weights.iterkeys())

    log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                             known_labels)

    self.assertEqual(log_probs['dog'], 0.0)
def _linear_smooth(cls, labels, fallback_transition, label_history_size):
    """Blend the fallback transition tables into one normalized table.

    The full history is weighted ``1.0 - 0.1 * (label_history_size - 1)`` and
    each of the remaining, shorter suffixes is weighted 0.1.  For every
    history, the weighted scores looked up in
    ``fallback_transition[len(suffix)]`` are summed into a fresh Counter keyed
    by the '::'-joined full history.  The result is normalized before it is
    returned.
    """
    shorter_history_count = label_history_size - 1
    linear_smoothing_weights = [1.0 - 0.1 * shorter_history_count]
    linear_smoothing_weights.extend([0.1] * shorter_history_count)

    # This is super inefficient - it should be caching smoothings involving the less-specific counters
    # e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
    transition = CounterMap()
    for label_history in set(permutations(labels, shorter_history_count)):
        # Successively shorter suffixes of the history, e.g.
        # ('WDT', 'RBR') -> [('WDT', 'RBR'), ('RBR',), ()]
        suffixes = [label_history[start:]
                    for start in xrange(label_history_size)]
        suffix_keys = ['::'.join(suffix) for suffix in suffixes]
        suffix_scores = [fallback_transition[len(suffix)][suffix_key]
                         for suffix, suffix_key in izip(suffixes, suffix_keys)]

        full_key = suffix_keys[0]
        transition[full_key] = Counter()
        for weight, score in izip(linear_smoothing_weights, suffix_scores):
            transition[full_key] += score * weight

    transition.normalize()
    return transition
def test_extraneous_label(self):
    """A label missing from the label set ('cat') must come back as -inf."""
    dog_weights = CounterMap()
    dog_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
    known_labels = set(dog_weights.iterkeys())

    log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                             known_labels)

    negative_infinity = float('-inf')
    self.assertEqual(log_probs['cat'], negative_infinity)
def test_zero_weight(self):
    """With 'dog' as the only label (weighted on 'warm' alone), its log probability is 0.0."""
    dog_weights = CounterMap()
    dog_weights['dog'] = Counter({'warm': 2.0})
    known_labels = set(dog_weights.iterkeys())

    log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                             known_labels)

    self.assertEqual(log_probs['dog'], 0.0)
def test_extraneous_label(self):
    """A label missing from the label set ('cat') must come back as -inf."""
    dog_weights = CounterMap()
    dog_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
    known_labels = set(dog_weights.iterkeys())

    log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                             known_labels)

    negative_infinity = float('-inf')
    self.assertEqual(log_probs['cat'], negative_infinity)
def setUp(self):
    """Build the shared fixture: two unit-valued features and mirrored dog/cat weights."""
    feature_names = ['warm', 'fuzzy']
    self.features = Counter((name, 1.0) for name in feature_names)

    label_weights = CounterMap()
    label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
    label_weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})
    self.weights = label_weights

    self.labels = set(self.weights.iterkeys())
    self.logp = maxent.get_log_probabilities(self.features, self.weights,
                                             self.labels)
def value_and_gradient(self, weights, verbose=False):
    """Return ``(objective, gradient)`` for ``weights``.

    The objective is the negated sum of the log probability each datum
    assigns to its own label; the gradient is expected minus empirical
    counts.  When ``self.sigma`` is truthy, ``weight ** 2 / (2 * sigma ** 2)``
    is added to the objective and ``weight / sigma ** 2`` to each gradient
    entry.  The latest result is cached and handed back when the method is
    called again with weights equal to the previous ones.
    """
    if weights == self.last_vg_weights:
        return self.last_vg

    if verbose:
        print("Calculating log probabilities and objective...")

    # One log-probability table per datum, each checked to sum to 1.
    log_probs = []
    for _, datum_features in self.labeled_extracted_features:
        datum_log_probs = get_log_probs(datum_features, weights, self.labels)
        log_probs.append(datum_log_probs)
        total_mass = sum(exp(datum_log_probs[candidate])
                         for candidate in self.labels)
        assert abs(total_mass - 1.0) < 0.0001, (
            "Not a distribution: P[any | features] = %f" % total_mass)

    gold_log_probs = [
        datum_log_probs[gold_label]
        for datum_log_probs, (gold_label, _)
        in zip(log_probs, self.labeled_extracted_features)
    ]
    objective = -sum(gold_log_probs)
    if verbose:
        print("Raw objective: %f" % objective)

    if verbose:
        print("Calculating expected counts...")
    expected_counts = get_expected_counts(self.labeled_extracted_features,
                                          self.labels, log_probs,
                                          CounterMap())

    if verbose:
        print("Calculating gradient...")
    gradient = expected_counts - self.empirical_counts

    if verbose:
        print("Applying penalty")

    # Apply a penalty (e.g. smooth the results)
    if self.sigma:
        sigma_squared = self.sigma ** 2
        penalty = 0.0
        for label, feature_weights in gradient.iteritems():
            for feature in feature_weights:
                weight = weights[label][feature]
                penalty += weight ** 2
                gradient[label][feature] += weight / sigma_squared
        penalty /= 2 * sigma_squared
        objective += penalty
        if verbose:
            print("Penalized objective: %f" % objective)

    # Remember this evaluation for the equality check at the top.
    self.last_vg_weights = weights
    self.last_vg = (objective, gradient)
    return self.last_vg
def value_and_gradient(self, weights, verbose=False):
    """Return ``(objective, gradient)`` for ``weights``.

    The objective is the negated sum of the log probability each datum
    assigns to its own label; the gradient is expected minus empirical
    counts.  When ``self.sigma`` is truthy, ``weight ** 2 / (2 * sigma ** 2)``
    is added to the objective and ``weight / sigma ** 2`` to each gradient
    entry.  The latest result is cached and handed back when the method is
    called again with weights equal to the previous ones.
    """
    if weights == self.last_vg_weights:
        return self.last_vg

    if verbose:
        print("Calculating log probabilities and objective...")

    # One log-probability table per datum, each checked to sum to 1.
    log_probs = []
    for _, datum_features in self.labeled_extracted_features:
        datum_log_probs = get_log_probs(datum_features, weights, self.labels)
        log_probs.append(datum_log_probs)
        total_mass = sum(exp(datum_log_probs[candidate])
                         for candidate in self.labels)
        assert abs(total_mass - 1.0) < 0.0001, (
            "Not a distribution: P[any | features] = %f" % total_mass)

    gold_log_probs = [
        datum_log_probs[gold_label]
        for datum_log_probs, (gold_label, _)
        in zip(log_probs, self.labeled_extracted_features)
    ]
    objective = -sum(gold_log_probs)
    if verbose:
        print("Raw objective: %f" % objective)

    if verbose:
        print("Calculating expected counts...")
    expected_counts = get_expected_counts(self.labeled_extracted_features,
                                          self.labels, log_probs,
                                          CounterMap())

    if verbose:
        print("Calculating gradient...")
    gradient = expected_counts - self.empirical_counts

    if verbose:
        print("Applying penalty")

    # Apply a penalty (e.g. smooth the results)
    if self.sigma:
        sigma_squared = self.sigma ** 2
        penalty = 0.0
        for label, feature_weights in gradient.iteritems():
            for feature in feature_weights:
                weight = weights[label][feature]
                penalty += weight ** 2
                gradient[label][feature] += weight / sigma_squared
        penalty /= 2 * sigma_squared
        objective += penalty
        if verbose:
            print("Penalized objective: %f" % objective)

    # Remember this evaluation for the equality check at the top.
    self.last_vg_weights = weights
    self.last_vg = (objective, gradient)
    return self.last_vg
def train(self, labeled_data):
    """Build ``self.feature_distribution`` from ``(label, datum)`` pairs.

    Each feature produced by ``ngrams(datum, 3)`` has its count for the
    datum's label incremented.  Every per-feature counter then gets a
    default of 0.01 before the whole table is normalized and converted with
    ``log()``.
    """
    self.feature_distribution = CounterMap()

    for label, datum in labeled_data:
        for feature in ngrams(datum, 3):
            self.feature_distribution[feature][label] += 1

    # Give labels never seen with a feature a small default instead of zero.
    for feature in self.feature_distribution.iterkeys():
        self.feature_distribution[feature].default = 0.01

    self.feature_distribution.normalize()
    self.feature_distribution.log()
def __init__(self, label_history_size=2):
    """Create empty tables; ``label_history_size`` is stored for later use."""
    self.label_history_size = label_history_size
    self.labels = list()

    # Distribution over next state given current state
    self.transition = CounterMap()
    # same as transitions but indexed in reverse (useful for decoding)
    self.reverse_transition = CounterMap()

    # Fallback pieces stay unset until they are assigned elsewhere.
    self.fallback_emissions_model = None
    self.fallback_transition = None
    self.fallback_reverse_transition = None

    # Multinomial distribution over emissions given label
    self.emission = CounterMap()
    # p(label | emission)
    self.label_emissions = CounterMap()
def setUp(self):
    """Build the shared fixture: two unit-valued features and mirrored dog/cat weights."""
    feature_names = ['warm', 'fuzzy']
    self.features = Counter((name, 1.0) for name in feature_names)

    label_weights = CounterMap()
    label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
    label_weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})
    self.weights = label_weights

    self.labels = set(self.weights.iterkeys())
    self.logp = maxent.get_log_probabilities(self.features, self.weights,
                                             self.labels)
def slow_expected_counts(labeled_extracted_features, labels, log_probs):
    """Accumulate expected feature counts per label in plain Python.

    For the datum at position ``i``, every feature count is added to each
    label's total, scaled by ``exp(log_probs[i][label])``.  The datum's own
    label is ignored.
    """
    totals = CounterMap()
    for position, labeled_datum in enumerate(labeled_extracted_features):
        _, datum_features = labeled_datum
        for feature, count in datum_features.iteritems():
            for label in labels:
                probability = exp(log_probs[position][label])
                totals[label][feature] += probability * count
    return totals
def test_uneven_weights(self):
    """Log probs for unequal dog/cat weights match hand-built, log-normalized scores."""
    weights = CounterMap()
    weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
    weights['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
    labels = set(weights.iterkeys())
    logp = maxent.get_log_probabilities(self.features, weights, labels)

    # construct scores
    expected = Counter()
    expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
    expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
    expected.log_normalize()

    # check scores explicitly
    for label, probability in (('dog', 0.731), ('cat', 0.269)):
        self.assertAlmostEqual(expected[label], log(probability), 3)

    # check that log probs is correct
    for label in ('dog', 'cat'):
        self.assertEqual(logp[label], expected[label])
def test_uneven_weights(self):
    """Log probs for unequal dog/cat weights match hand-built, log-normalized scores."""
    weights = CounterMap()
    weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
    weights['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
    labels = set(weights.iterkeys())
    logp = maxent.get_log_probabilities(self.features, weights, labels)

    # construct scores
    expected = Counter()
    expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
    expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
    expected.log_normalize()

    # check scores explicitly
    for label, probability in (('dog', 0.731), ('cat', 0.269)):
        self.assertAlmostEqual(expected[label], log(probability), 3)

    # check that log probs is correct
    for label in ('dog', 'cat'):
        self.assertEqual(logp[label], expected[label])
def _gather_colocation_counts(self, files):
    """Count the contexts each word appears in across ``files``.

    ``files`` is an iterable of paths.  Each path is opened and passed to
    ``self._file_triples``, which is expected to yield ``(pre, word, post)``
    triples whose ``pre`` and ``post`` are sequences of strings.

    Returns ``(pre_counts, post_counts, full_counts)``: three CounterMaps
    keyed by word, counting the '::'-joined preceding context, following
    context, and the two concatenated.
    """
    pre_counts = CounterMap()
    post_counts = CounterMap()
    full_counts = CounterMap()

    for path in files:
        # One file at a time inside ``with``: the handle is closed even if
        # parsing raises, and only one descriptor is open at any moment.
        with open(path) as handle:
            for pre, word, post in self._file_triples(handle):
                pre_counts[word]['::'.join(pre)] += 1
                post_counts[word]['::'.join(post)] += 1
                full_counts[word]['::'.join(pre + post)] += 1

    return pre_counts, post_counts, full_counts
def test_fast_slow_equal(self):
    """The slow Python and fast expected-count implementations agree for two weight settings."""

    def assert_implementations_agree(current_weights):
        # Score every datum, then compare both expected-count routines.
        log_probs = [
            maxent.get_log_probabilities(datum[1], current_weights,
                                         self.labels)
            for datum in self.labeled_extracted_features
        ]
        slow_expectation = maximumentropy.slow_expected_counts(
            self.labeled_extracted_features, self.labels, log_probs)
        fast_expectation = maxent.get_expected_counts(
            self.labeled_extracted_features, self.labels, log_probs,
            CounterMap())
        self.assertEqual(slow_expectation, fast_expectation)

    every_feature = ('fuzzy', 'claws', 'small', 'medium', 'large')
    weights = CounterMap()
    weights['cat'] = Counter((key, 1.0) for key in every_feature)
    weights['bear'] = Counter((key, 1.0) for key in every_feature)
    assert_implementations_agree(weights)

    # And try again with different weights
    weights['cat'] = Counter(
        (key, 1.0) for key in ('fuzzy', 'claws', 'small', 'medium'))
    weights['bear'] = Counter(
        (key, 1.0) for key in ('fuzzy', 'claws', 'big'))
    assert_implementations_agree(weights)
def __init__(self, labeled_extracted_features, labels, features):
    """Store the training data and tally empirical feature counts per label.

    ``labeled_extracted_features`` is iterated as ``(label, features)``
    pairs, where ``features`` offers ``iteritems()`` yielding
    ``(feature, count)``.
    """
    self.labeled_extracted_features = labeled_extracted_features
    self.labels = labels
    self.features = features
    self.empirical_counts = CounterMap()

    print("Calculating empirical counts...")
    for datum_label, datum_features in self.labeled_extracted_features:
        for feature, count in datum_features.iteritems():
            self.empirical_counts[datum_label][feature] += count
class NaiveBayesClassifier:
    """Classifier that scores labels by summing per-feature log scores.

    Features are whatever ``ngrams(datum, 3)`` produces.
    """

    def train(self, labeled_data):
        """Build ``self.feature_distribution`` from ``(label, datum)`` pairs.

        Each feature of a datum has its count for the datum's label
        incremented.  Every per-feature counter then gets a default of 0.01
        before the whole table is normalized and converted with ``log()``.
        """
        self.feature_distribution = CounterMap()

        for label, datum in labeled_data:
            for feature in ngrams(datum, 3):
                self.feature_distribution[feature][label] += 1

        # Give labels never seen with a feature a small default instead of zero.
        for feature in self.feature_distribution.iterkeys():
            self.feature_distribution[feature].default = 0.01

        self.feature_distribution.normalize()
        self.feature_distribution.log()

    def _summed_scores(self, datum):
        """Return the sum of the per-feature counters for ``datum``.

        Raises ValueError when ``datum`` produces no features, because there
        is then nothing to score.
        """
        distribution = None
        for feature in ngrams(datum, 3):
            # ``is None`` rather than truthiness: an empty first counter must
            # still be accumulated into, not silently replaced.
            if distribution is None:
                distribution = copy(self.feature_distribution[feature])
            else:
                distribution += self.feature_distribution[feature]

        if distribution is None:
            raise ValueError("datum yields no features to classify: %r"
                             % (datum,))
        return distribution

    def label_distribution(self, datum):
        """Return the log-normalized label scores for ``datum``."""
        distribution = self._summed_scores(datum)
        distribution.log_normalize()
        return distribution

    def label(self, datum):
        """Return the highest-scoring label for ``datum``."""
        return self._summed_scores(datum).arg_max()
def _linear_smooth(cls, labels, fallback_transition, label_history_size):
    """Blend the fallback transition tables into one normalized table.

    The full history is weighted ``1.0 - 0.1 * (label_history_size - 1)`` and
    each of the remaining, shorter suffixes is weighted 0.1.  For every
    history, the weighted scores looked up in
    ``fallback_transition[len(suffix)]`` are summed into a fresh Counter keyed
    by the '::'-joined full history.  The result is normalized before it is
    returned.
    """
    shorter_history_count = label_history_size - 1
    linear_smoothing_weights = [1.0 - 0.1 * shorter_history_count]
    linear_smoothing_weights.extend([0.1] * shorter_history_count)

    # This is super inefficient - it should be caching smoothings involving the less-specific counters
    # e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
    transition = CounterMap()
    for label_history in set(permutations(labels, shorter_history_count)):
        # Successively shorter suffixes of the history, e.g.
        # ('WDT', 'RBR') -> [('WDT', 'RBR'), ('RBR',), ()]
        suffixes = [label_history[start:]
                    for start in xrange(label_history_size)]
        suffix_keys = ['::'.join(suffix) for suffix in suffixes]
        suffix_scores = [fallback_transition[len(suffix)][suffix_key]
                         for suffix, suffix_key in izip(suffixes, suffix_keys)]

        full_key = suffix_keys[0]
        transition[full_key] = Counter()
        for weight, score in izip(linear_smoothing_weights, suffix_scores):
            transition[full_key] += score * weight

    transition.normalize()
    return transition
def train_with_features(self, labeled_features, sigma=None, quiet=False):
    """Fit ``self.weights`` by minimizing a MaxEntWeightFunction.

    ``sigma`` is assigned to the weight function's ``sigma`` attribute and
    ``quiet`` is forwarded to ``Minimizer.minimize``.  The progress messages
    below are printed regardless of ``quiet``.
    """
    print("Optimizing weights...")
    objective_function = MaxEntWeightFunction(labeled_features, self.labels,
                                              self.features)
    objective_function.sigma = sigma

    print("Building initial dictionary...")
    starting_weights = CounterMap()

    print("Training on %d labelled features" % (len(labeled_features)))

    print("Minimizing...")
    self.weights = Minimizer.minimize(objective_function, starting_weights,
                                      quiet=quiet)
def __init__(self, label_history_size=2):
    """Create empty tables; ``label_history_size`` is stored for later use."""
    self.label_history_size = label_history_size
    self.labels = list()

    # Distribution over next state given current state
    self.transition = CounterMap()
    # same as transitions but indexed in reverse (useful for decoding)
    self.reverse_transition = CounterMap()

    # Fallback pieces stay unset until they are assigned elsewhere.
    self.fallback_emissions_model = None
    self.fallback_transition = None
    self.fallback_reverse_transition = None

    # Multinomial distribution over emissions given label
    self.emission = CounterMap()
    # p(label | emission)
    self.label_emissions = CounterMap()
class HiddenMarkovModel:
    """Hidden Markov model whose states are '::'-joined label histories.

    ``train`` fills the emission and transition tables from a labeled
    sequence and can optionally train (or unpickle) a fallback emissions
    model.
    """

    def __init__(self, label_history_size=2):
        """Create empty tables; ``label_history_size`` is stored for later use."""
        # Distribution over next state given current state
        self.labels = list()
        self.label_history_size = label_history_size
        self.transition = CounterMap()
        # same as transitions but indexed in reverse (useful for decoding)
        self.reverse_transition = CounterMap()

        self.fallback_emissions_model = None
        self.fallback_transition = None
        self.fallback_reverse_transition = None

        # Multinomial distribution over emissions given label
        self.emission = CounterMap()

        # p(label | emission)
        self.label_emissions = CounterMap()

    def _pad_sequence(self, sequence, pairs=False):
        """Yield ``sequence`` with one START marker before it and
        ``label_history_size`` STOP markers after it.

        With ``pairs`` true the markers are ``(marker, marker)`` tuples,
        otherwise bare markers.
        """
        if pairs:
            yield (START_LABEL, START_LABEL)
        else:
            yield START_LABEL

        for item in sequence:
            yield item

        # Pad the end so we'll decode the whole thing
        for _ in xrange(self.label_history_size):
            if pairs:
                yield (STOP_LABEL, STOP_LABEL)
            else:
                yield STOP_LABEL

    @classmethod
    def _extend_labels(cls, sequence, label_history_size):
        '''
        Yield ``(label, histories, emission)`` for each ``(label, emission)``
        pair, where ``histories`` is a tuple of '::'-joined preceding labels
        of increasing length.  This is a generator.

        >>> list(HiddenMarkovModel._extend_labels((('A', 3), ('B', 4), ('C', 5)), 1))
        [('A', (), 3), ('B', (), 4), ('C', (), 5)]
        >>> list(HiddenMarkovModel._extend_labels((('A', 3), ('B', 4), ('C', 5)), 2))
        [('A', ('<START>',), 3), ('B', ('A',), 4), ('C', ('B',), 5)]
        '''
        last_labels = [START_LABEL for _ in xrange(label_history_size)]

        for label, emission in sequence:
            last_labels.append(label)
            last_labels.pop(0)

            # A START marker resets the history window.
            if label == START_LABEL:
                last_labels = [START_LABEL for _ in xrange(label_history_size)]

            all_labels = ('::'.join(last_labels[label_history_size - length - 2:-1])
                          for length in xrange(label_history_size - 1))
            yield (label, tuple(all_labels), emission)

    @property
    def start_label(self):
        """START marker repeated ``label_history_size`` times, '::'-joined."""
        return '::'.join(repeat(START_LABEL, self.label_history_size))

    @property
    def stop_label(self):
        """STOP marker repeated ``label_history_size`` times, '::'-joined."""
        return '::'.join(repeat(STOP_LABEL, self.label_history_size))

    def push_label(self, history, label):
        """Drop the oldest label from the '::'-joined ``history`` and append ``label``."""
        return '::'.join(history.split('::')[1:] + [label])

    @classmethod
    def _linear_smooth(cls, labels, fallback_transition, label_history_size):
        """Blend the fallback transition tables into one normalized table.

        The full history is weighted ``1.0 - 0.1 * (label_history_size - 1)``
        and each shorter suffix is weighted 0.1.
        """
        transition = CounterMap()
        linear_smoothing_weights = [1.0 - 0.1 * (label_history_size - 1)]
        linear_smoothing_weights.extend(
            0.1 for _ in xrange(label_history_size - 1))

        # This is super inefficient - it should be caching smoothings involving the less-specific counters
        # e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
        all_label_histories = set(permutations(labels, label_history_size - 1))
        for label_history in all_label_histories:
            # Successively shorter suffixes, e.g.
            # ('WDT', 'RBR') -> [('WDT', 'RBR'), ('RBR',), ()]
            histories = [label_history[i:] for i in xrange(label_history_size)]
            history_strings = ['::'.join(history) for history in histories]
            history_scores = [
                fallback_transition[len(history)][history_string]
                for history, history_string in izip(histories, history_strings)
            ]

            full_history = history_strings[0]
            transition[full_history] = Counter()
            for smoothing, history_score in izip(linear_smoothing_weights,
                                                 history_scores):
                transition[full_history] += history_score * smoothing

        transition.normalize()
        return transition

    def train(self, labeled_sequence, fallback_model=None,
              fallback_training_limit=None, use_linear_smoothing=True):
        """Fit the model on ``labeled_sequence`` of ``(label, emission)`` pairs.

        ``fallback_model``, when given, is called with no arguments to build
        a fallback emissions model, unless a matching one can be loaded from
        ``fallback_model.pickle`` in the working directory.
        ``fallback_training_limit`` caps how many pairs that model is
        trained on.  ``use_linear_smoothing`` selects smoothed transitions
        when ``label_history_size`` is greater than 1.
        """
        self.fallback_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]
        self.fallback_reverse_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]

        labeled_sequence = self._pad_sequence(labeled_sequence, pairs=True)
        labeled_sequence = list(
            HiddenMarkovModel._extend_labels(labeled_sequence,
                                             self.label_history_size + 1))

        # Load emission and transition counters from the raw data
        for label, label_histories, emission in labeled_sequence:
            full_label = self.push_label(label_histories[-1], label)

            self.emission[full_label][emission] += 1.0
            self.label_emissions[emission][full_label] += 1.0

            for history_size, label_history in enumerate(label_histories):
                self.fallback_transition[history_size][label_history][
                    full_label] += 1.0

        # Make the counters distributions
        for transition in self.fallback_transition:
            transition.normalize()
        self.label_emissions.normalize()
        self.emission.normalize()
        self.labels = self.emission.keys()

        # Smooth transitions using fallback data
        # Doesn't work with label history size 1!
        if use_linear_smoothing and self.label_history_size > 1:
            self.transition = HiddenMarkovModel._linear_smooth(
                self.labels, self.fallback_transition, self.label_history_size)
        else:
            self.transition = self.fallback_transition[-1]

        # Convert to log score counters
        self.transition.log()
        self.label_emissions.log()
        self.emission.log()

        self.reverse_transition = self.transition.inverted()

        # Train the fallback model on the label-emission pairs
        if fallback_model:
            # Length recorded with (and expected from) a cached model.  The
            # same value is used on load and on save so that a cache written
            # with a training limit can be reused by a later run with that
            # limit.
            expected_length = fallback_training_limit or len(labeled_sequence)

            try:
                start = time()

                # NOTE: unpickling executes arbitrary code; only safe while
                # fallback_model.pickle is written by this program alone.
                # Binary mode is required for pickle protocols above 0.
                with open("fallback_model.pickle", "rb") as pickle_file:
                    self.fallback_emissions_model, training_pairs_length = \
                        pickle.load(pickle_file)

                # A stale cache is handled exactly like a missing one.
                if training_pairs_length != expected_length:
                    raise IOError()

                print("Unpickling fallback model: %f" % (time() - start))
            except (IOError, EOFError):
                print("Training fallback model")
                self.fallback_emissions_model = fallback_model()

                emissions_training_pairs = [
                    (emission_history[-1] + '::' + label, emission)
                    for label, emission_history, emission in labeled_sequence
                    if label != START_LABEL and label != STOP_LABEL
                ]

                if fallback_training_limit:
                    emissions_training_pairs = islice(emissions_training_pairs,
                                                      fallback_training_limit)

                self.fallback_emissions_model.train(emissions_training_pairs)

                serialized = (self.fallback_emissions_model, expected_length)
                with open("fallback_model.pickle", "wb") as pickle_file:
                    pickle.dump(serialized, pickle_file,
                                protocol=pickle.HIGHEST_PROTOCOL)

        self._post_training()
class MaximumEntropyLogProbsTest(unittest.TestCase):
    """Checks maxent.get_log_probabilities against hand-built values and the slow Python version."""

    def setUp(self):
        """Build the shared fixture: two unit-valued features and mirrored dog/cat weights."""
        feature_names = ['warm', 'fuzzy']
        self.features = Counter((name, 1.0) for name in feature_names)

        label_weights = CounterMap()
        label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        label_weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})
        self.weights = label_weights

        self.labels = set(self.weights.iterkeys())
        self.logp = maxent.get_log_probabilities(self.features, self.weights,
                                                 self.labels)

    def test_fast_slow_equal(self):
        """The fixture's log probs equal those from maximumentropy.slow_log_probs."""
        reference = maximumentropy.slow_log_probs(self.features, self.weights,
                                                  self.labels)
        self.assertEqual(self.logp, reference)

    def test_logp_is_probability_distribution(self):
        """ Verify that all log probs are <= 0 and total probability is 1.0 """
        largest_log_prob = max(self.logp.itervalues())
        self.assertTrue(largest_log_prob <= 0.0)

        total_probability = sum(exp(val) for val in self.logp.itervalues())
        self.assertAlmostEqual(total_probability, 1.0)

    def test_basic_values(self):
        """ Are the log probs as expected? """
        for label in ('cat', 'dog'):
            self.assertAlmostEqual(exp(self.logp[label]), 0.5)

    def test_single_label(self):
        """With 'dog' as the only label, its log probability is 0.0."""
        dog_weights = CounterMap()
        dog_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        known_labels = set(dog_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                                 known_labels)
        self.assertEqual(log_probs['dog'], 0.0)

    def test_extraneous_label(self):
        """A label missing from the label set ('cat') must come back as -inf."""
        dog_weights = CounterMap()
        dog_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        known_labels = set(dog_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                                 known_labels)
        self.assertEqual(log_probs['cat'], float('-inf'))

    def test_zero_weight(self):
        """A lone label weighted on 'warm' only still gets log probability 0.0."""
        dog_weights = CounterMap()
        dog_weights['dog'] = Counter({'warm': 2.0})
        known_labels = set(dog_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                                 known_labels)
        self.assertEqual(log_probs['dog'], 0.0)

    def test_uneven_weights(self):
        """Log probs for unequal dog/cat weights match hand-built, log-normalized scores."""
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
        weights['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        # construct scores
        expected = Counter()
        expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
        expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
        expected.log_normalize()

        # check scores explicitly
        for label, probability in (('dog', 0.731), ('cat', 0.269)):
            self.assertAlmostEqual(expected[label], log(probability), 3)

        # check that log probs is correct
        for label in ('dog', 'cat'):
            self.assertEqual(logp[label], expected[label])

    def test_performance(self):
        """ C api should be faster than python API (this is potentially flaky, depending on system load patterns) """
        slow_started = time.time()
        for _ in xrange(100000):
            maximumentropy.slow_log_probs(self.features, self.weights,
                                          self.labels)
        slow_time = time.time() - slow_started

        fast_started = time.time()
        for _ in xrange(100000):
            maxent.get_log_probabilities(self.features, self.weights,
                                         self.labels)
        fast_time = time.time() - fast_started

        self.assertTrue(fast_time < slow_time)
def countermap_init(iter_src):
    """Return a new CounterMap after applying ``+= 1`` to the entry for every item of ``iter_src``."""
    tally = CounterMap()
    for key in iter_src:
        tally[key] += 1
    return tally
class MaximumEntropyLogProbsTest(unittest.TestCase):
    """Checks maxent.get_log_probabilities against hand-built values and the slow Python version."""

    def setUp(self):
        """Build the shared fixture: two unit-valued features and mirrored dog/cat weights."""
        feature_names = ['warm', 'fuzzy']
        self.features = Counter((name, 1.0) for name in feature_names)

        label_weights = CounterMap()
        label_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        label_weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})
        self.weights = label_weights

        self.labels = set(self.weights.iterkeys())
        self.logp = maxent.get_log_probabilities(self.features, self.weights,
                                                 self.labels)

    def test_fast_slow_equal(self):
        """The fixture's log probs equal those from maximumentropy.slow_log_probs."""
        reference = maximumentropy.slow_log_probs(self.features, self.weights,
                                                  self.labels)
        self.assertEqual(self.logp, reference)

    def test_logp_is_probability_distribution(self):
        """ Verify that all log probs are <= 0 and total probability is 1.0 """
        largest_log_prob = max(self.logp.itervalues())
        self.assertTrue(largest_log_prob <= 0.0)

        total_probability = sum(exp(val) for val in self.logp.itervalues())
        self.assertAlmostEqual(total_probability, 1.0)

    def test_basic_values(self):
        """ Are the log probs as expected? """
        for label in ('cat', 'dog'):
            self.assertAlmostEqual(exp(self.logp[label]), 0.5)

    def test_single_label(self):
        """With 'dog' as the only label, its log probability is 0.0."""
        dog_weights = CounterMap()
        dog_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        known_labels = set(dog_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                                 known_labels)
        self.assertEqual(log_probs['dog'], 0.0)

    def test_extraneous_label(self):
        """A label missing from the label set ('cat') must come back as -inf."""
        dog_weights = CounterMap()
        dog_weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        known_labels = set(dog_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                                 known_labels)
        self.assertEqual(log_probs['cat'], float('-inf'))

    def test_zero_weight(self):
        """A lone label weighted on 'warm' only still gets log probability 0.0."""
        dog_weights = CounterMap()
        dog_weights['dog'] = Counter({'warm': 2.0})
        known_labels = set(dog_weights.iterkeys())
        log_probs = maxent.get_log_probabilities(self.features, dog_weights,
                                                 known_labels)
        self.assertEqual(log_probs['dog'], 0.0)

    def test_uneven_weights(self):
        """Log probs for unequal dog/cat weights match hand-built, log-normalized scores."""
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
        weights['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        # construct scores
        expected = Counter()
        expected['dog'] = 2.0 * 1.0 + 1.0 * 1.0
        expected['cat'] = 1.0 * 1.0 + 1.0 * 1.0
        expected.log_normalize()

        # check scores explicitly
        for label, probability in (('dog', 0.731), ('cat', 0.269)):
            self.assertAlmostEqual(expected[label], log(probability), 3)

        # check that log probs is correct
        for label in ('dog', 'cat'):
            self.assertEqual(logp[label], expected[label])

    def test_performance(self):
        """ C api should be faster than python API (this is potentially flaky, depending on system load patterns) """
        slow_started = time.time()
        for _ in xrange(100000):
            maximumentropy.slow_log_probs(self.features, self.weights,
                                          self.labels)
        slow_time = time.time() - slow_started

        fast_started = time.time()
        for _ in xrange(100000):
            maxent.get_log_probabilities(self.features, self.weights,
                                         self.labels)
        fast_time = time.time() - fast_started

        self.assertTrue(fast_time < slow_time)
from itertools import izip, repeat, chain
from maxent import get_log_probabilities, get_expected_counts
from countermap import CounterMap
from counter import Counter


def cnter(l):
    """Return a Counter giving every item of ``l`` a count of 1.0."""
    return Counter(izip(l, repeat(1.0, len(l))))


# Three hand-written (label, feature counter) training examples.
training_data = (
    ('cat', cnter(('fuzzy', 'claws', 'small'))),
    ('bear', cnter(('fuzzy', 'claws', 'big'))),
    ('cat', cnter(('claws', 'medium'))),
)

labels = set([label for label, _ in training_data])

# Union of every feature name seen in the training data.
features = set()
for _, counter in training_data:
    features.update(set(counter.keys()))

# Empty weights: no label/feature pair has been assigned a weight yet.
weights = CounterMap()

# The loop variable is deliberately NOT called ``features``, so the feature
# set built above is not overwritten by the last datum's Counter.
log_probs = list()
for _, datum_features in training_data:
    log_probs.append(get_log_probabilities(datum_features, weights, labels))

test = get_expected_counts(training_data, labels, log_probs, CounterMap())

print(test)
class HiddenMarkovModel:
    """Hidden Markov model whose states are '::'-joined label histories.

    ``train`` fills the emission and transition tables from a labeled
    sequence and can optionally train (or unpickle) a fallback emissions
    model.
    """

    def __init__(self, label_history_size=2):
        """Create empty tables; ``label_history_size`` is stored for later use."""
        # Distribution over next state given current state
        self.labels = list()
        self.label_history_size = label_history_size
        self.transition = CounterMap()
        # same as transitions but indexed in reverse (useful for decoding)
        self.reverse_transition = CounterMap()

        self.fallback_emissions_model = None
        self.fallback_transition = None
        self.fallback_reverse_transition = None

        # Multinomial distribution over emissions given label
        self.emission = CounterMap()

        # p(label | emission)
        self.label_emissions = CounterMap()

    def _pad_sequence(self, sequence, pairs=False):
        """Yield ``sequence`` with one START marker before it and
        ``label_history_size`` STOP markers after it.

        With ``pairs`` true the markers are ``(marker, marker)`` tuples,
        otherwise bare markers.
        """
        if pairs:
            yield (START_LABEL, START_LABEL)
        else:
            yield START_LABEL

        for item in sequence:
            yield item

        # Pad the end so we'll decode the whole thing
        for _ in xrange(self.label_history_size):
            if pairs:
                yield (STOP_LABEL, STOP_LABEL)
            else:
                yield STOP_LABEL

    @classmethod
    def _extend_labels(cls, sequence, label_history_size):
        '''
        Yield ``(label, histories, emission)`` for each ``(label, emission)``
        pair, where ``histories`` is a tuple of '::'-joined preceding labels
        of increasing length.  This is a generator.

        >>> list(HiddenMarkovModel._extend_labels((('A', 3), ('B', 4), ('C', 5)), 1))
        [('A', (), 3), ('B', (), 4), ('C', (), 5)]
        >>> list(HiddenMarkovModel._extend_labels((('A', 3), ('B', 4), ('C', 5)), 2))
        [('A', ('<START>',), 3), ('B', ('A',), 4), ('C', ('B',), 5)]
        '''
        last_labels = [START_LABEL for _ in xrange(label_history_size)]

        for label, emission in sequence:
            last_labels.append(label)
            last_labels.pop(0)

            # A START marker resets the history window.
            if label == START_LABEL:
                last_labels = [START_LABEL for _ in xrange(label_history_size)]

            all_labels = ('::'.join(last_labels[label_history_size - length - 2:-1])
                          for length in xrange(label_history_size - 1))
            yield (label, tuple(all_labels), emission)

    @property
    def start_label(self):
        """START marker repeated ``label_history_size`` times, '::'-joined."""
        return '::'.join(repeat(START_LABEL, self.label_history_size))

    @property
    def stop_label(self):
        """STOP marker repeated ``label_history_size`` times, '::'-joined."""
        return '::'.join(repeat(STOP_LABEL, self.label_history_size))

    def push_label(self, history, label):
        """Drop the oldest label from the '::'-joined ``history`` and append ``label``."""
        return '::'.join(history.split('::')[1:] + [label])

    @classmethod
    def _linear_smooth(cls, labels, fallback_transition, label_history_size):
        """Blend the fallback transition tables into one normalized table.

        The full history is weighted ``1.0 - 0.1 * (label_history_size - 1)``
        and each shorter suffix is weighted 0.1.
        """
        transition = CounterMap()
        linear_smoothing_weights = [1.0 - 0.1 * (label_history_size - 1)]
        linear_smoothing_weights.extend(
            0.1 for _ in xrange(label_history_size - 1))

        # This is super inefficient - it should be caching smoothings involving the less-specific counters
        # e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
        all_label_histories = set(permutations(labels, label_history_size - 1))
        for label_history in all_label_histories:
            # Successively shorter suffixes, e.g.
            # ('WDT', 'RBR') -> [('WDT', 'RBR'), ('RBR',), ()]
            histories = [label_history[i:] for i in xrange(label_history_size)]
            history_strings = ['::'.join(history) for history in histories]
            history_scores = [
                fallback_transition[len(history)][history_string]
                for history, history_string in izip(histories, history_strings)
            ]

            full_history = history_strings[0]
            transition[full_history] = Counter()
            for smoothing, history_score in izip(linear_smoothing_weights,
                                                 history_scores):
                transition[full_history] += history_score * smoothing

        transition.normalize()
        return transition

    def train(self, labeled_sequence, fallback_model=None,
              fallback_training_limit=None, use_linear_smoothing=True):
        """Fit the model on ``labeled_sequence`` of ``(label, emission)`` pairs.

        ``fallback_model``, when given, is called with no arguments to build
        a fallback emissions model, unless a matching one can be loaded from
        ``fallback_model.pickle`` in the working directory.
        ``fallback_training_limit`` caps how many pairs that model is
        trained on.  ``use_linear_smoothing`` selects smoothed transitions
        when ``label_history_size`` is greater than 1.
        """
        self.fallback_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]
        self.fallback_reverse_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]

        labeled_sequence = self._pad_sequence(labeled_sequence, pairs=True)
        labeled_sequence = list(
            HiddenMarkovModel._extend_labels(labeled_sequence,
                                             self.label_history_size + 1))

        # Load emission and transition counters from the raw data
        for label, label_histories, emission in labeled_sequence:
            full_label = self.push_label(label_histories[-1], label)

            self.emission[full_label][emission] += 1.0
            self.label_emissions[emission][full_label] += 1.0

            for history_size, label_history in enumerate(label_histories):
                self.fallback_transition[history_size][label_history][
                    full_label] += 1.0

        # Make the counters distributions
        for transition in self.fallback_transition:
            transition.normalize()
        self.label_emissions.normalize()
        self.emission.normalize()
        self.labels = self.emission.keys()

        # Smooth transitions using fallback data
        # Doesn't work with label history size 1!
        if use_linear_smoothing and self.label_history_size > 1:
            self.transition = HiddenMarkovModel._linear_smooth(
                self.labels, self.fallback_transition, self.label_history_size)
        else:
            self.transition = self.fallback_transition[-1]

        # Convert to log score counters
        self.transition.log()
        self.label_emissions.log()
        self.emission.log()

        self.reverse_transition = self.transition.inverted()

        # Train the fallback model on the label-emission pairs
        if fallback_model:
            # Length recorded with (and expected from) a cached model.  The
            # same value is used on load and on save so that a cache written
            # with a training limit can be reused by a later run with that
            # limit.
            expected_length = fallback_training_limit or len(labeled_sequence)

            try:
                start = time()

                # NOTE: unpickling executes arbitrary code; only safe while
                # fallback_model.pickle is written by this program alone.
                # Binary mode is required for pickle protocols above 0.
                with open("fallback_model.pickle", "rb") as pickle_file:
                    self.fallback_emissions_model, training_pairs_length = \
                        pickle.load(pickle_file)

                # A stale cache is handled exactly like a missing one.
                if training_pairs_length != expected_length:
                    raise IOError()

                print("Unpickling fallback model: %f" % (time() - start))
            except (IOError, EOFError):
                print("Training fallback model")
                self.fallback_emissions_model = fallback_model()

                emissions_training_pairs = [
                    (emission_history[-1] + '::' + label, emission)
                    for label, emission_history, emission in labeled_sequence
                    if label != START_LABEL and label != STOP_LABEL
                ]

                if fallback_training_limit:
                    emissions_training_pairs = islice(emissions_training_pairs,
                                                      fallback_training_limit)

                self.fallback_emissions_model.train(emissions_training_pairs)

                serialized = (self.fallback_emissions_model, expected_length)
                with open("fallback_model.pickle", "wb") as pickle_file:
                    pickle.dump(serialized, pickle_file,
                                protocol=pickle.HIGHEST_PROTOCOL)

        self._post_training()
def _sample_datum(self, datum):
    """Sample a cluster assignment for ``datum``.

    Every non-empty cluster in ``self._cluster_to_datum`` is scored with
    ``self._cluster_log_probs``, which returns a
    ``(posterior, prior, likelihood)`` triple of log scores.  One extra
    candidate -- a "new" cluster weighted by ``self._concentration`` --
    is scored as well.  The scores are combined as
    ``exp(likelihood + prior - posterior) * size``, NaNs are dropped, the
    result is normalized and a sample is drawn from it.

    Args:
        datum: the point to assign.  It is iterated to obtain its axes
            and is passed through to ``_cluster_log_probs``.

    Returns:
        Whatever ``probs.sample()`` returns -- presumably one of the
        cluster indices used as keys (TODO confirm against Counter).

    Raises:
        Exception: if any prior, likelihood or posterior is not a valid
            log probability (i.e. is greater than 0.0 or is NaN).
    """
    likelihoods = Counter(float("-inf"))
    priors = Counter(float("-inf"))
    posteriors = Counter(float("-inf"))
    sizes = Counter()

    # Regenerate all the cluster params (should be caching this,
    # not doing it inline)
    for c_idx, cluster in self._cluster_to_datum.iteritems():
        if not cluster:
            continue

        sizes[c_idx] = len(cluster)
        cluster_mean = sum(cluster) / float(sizes[c_idx])
        cluster_covariance = 1.0 / float(len(cluster) + 1) * sum(
            outer_product((pt - cluster_mean), (pt - cluster_mean))
            for pt in cluster)

        posteriors[c_idx], priors[c_idx], likelihoods[c_idx] = \
            self._cluster_log_probs(cluster, sizes[c_idx], cluster_mean,
                                    cluster_covariance, datum)

        # A cluster that is impossible on every count carries no
        # information; drop it so it cannot be sampled.
        if all(prob == float("-inf")
               for prob in (priors[c_idx], likelihoods[c_idx],
                            posteriors[c_idx])):
            del priors[c_idx]
            del likelihoods[c_idx]
            del posteriors[c_idx]
            del sizes[c_idx]
            continue

    # Now generate probs for the new cluster.  Prefer to reuse an old
    # (now empty) cluster index if possible, otherwise take the next one.
    # The candidates must live in ONE list: min(a_list, an_int) compares
    # the list object with the int instead of looking inside the list.
    empty_clusters = [c for c, d in self._cluster_to_datum.iteritems()
                      if not d]
    new_cluster = min(empty_clusters + [len(self._cluster_to_datum)])
    sizes[new_cluster] = self._concentration

    # build a really lame covariance matrix for single points
    covariance = CounterMap()
    for axis in datum:
        covariance[axis] = 1.0

    posteriors[new_cluster], priors[new_cluster], likelihoods[new_cluster] = \
        self._cluster_log_probs([], sizes[new_cluster], datum, covariance,
                                datum)

    # Sanity check: log probabilities can never be positive.
    for dist in priors, likelihoods, posteriors:
        if not all(v <= 0.0 for v in dist.itervalues()):
            print("Not a log distribution: %s" % dist)
            print("(new cluster %d)" % new_cluster)
            print(datum)

            # The values are scalar scores, so test them directly.
            for k, score in dist.iteritems():
                if score <= 0.0:
                    continue

                print("error on cluster %d" % k)
                print("posteriors: %r" % posteriors[k])
                print("priors: %r" % priors[k])
                print("likelihoods: %r" % likelihoods[k])
                print("sizes: %r" % sizes[k])

            raise Exception("Not a log distribution")

    probs = likelihoods + priors - posteriors
    probs.exp()
    probs *= sizes

    # filter out nan (NaN is the only value not equal to itself); items()
    # gives a snapshot list here, so deleting while looping is safe
    for k, v in probs.items():
        if v != v:
            del probs[k]

    probs.normalize()

    assert all(0.0 <= p <= 1.0
               for p in probs.itervalues()), "Not a distribution: %s" % probs

    return probs.sample()
def train(self, labeled_sequence, fallback_model=None,
          fallback_training_limit=None, use_linear_smoothing=True):
    """Estimate emission and transition log scores from labeled data.

    Args:
        labeled_sequence: the training data.  It is handed to
            ``self._pad_sequence(..., pairs=True)`` and then to
            ``HiddenMarkovModel._extend_labels``, which yields
            ``(label, label_histories, emission)`` triples.
        fallback_model: optional zero-argument callable.  When given, it
            is called to build a model whose ``train`` method receives
            ``(last_history + '::' + label, emission)`` pairs; the result
            is stored in ``self.fallback_emissions_model`` and cached in
            ``fallback_model.pickle`` in the current directory.
        fallback_training_limit: optional cap on the number of pairs
            given to the fallback model.
        use_linear_smoothing: when true and ``self.label_history_size``
            is greater than 1, transitions are built with
            ``HiddenMarkovModel._linear_smooth``; otherwise the last
            entry of ``self.fallback_transition`` is used unchanged.
    """
    self.fallback_transition = [
        CounterMap() for _ in xrange(self.label_history_size)
    ]
    self.fallback_reverse_transition = [
        CounterMap() for _ in xrange(self.label_history_size)
    ]

    labeled_sequence = self._pad_sequence(labeled_sequence, pairs=True)
    labeled_sequence = list(
        HiddenMarkovModel._extend_labels(labeled_sequence,
                                         self.label_history_size + 1))

    # Load emission and transition counters from the raw data
    for label, label_histories, emission in labeled_sequence:
        full_label = self.push_label(label_histories[-1], label)

        self.emission[full_label][emission] += 1.0
        self.label_emissions[emission][full_label] += 1.0

        for history_size, label_history in enumerate(label_histories):
            self.fallback_transition[history_size][label_history][
                full_label] += 1.0

    # Make the counters distributions
    for transition in self.fallback_transition:
        transition.normalize()
    self.label_emissions.normalize()
    self.emission.normalize()
    self.labels = self.emission.keys()

    # Smooth transitions using fallback data
    # Doesn't work with label history size 1!
    if use_linear_smoothing and self.label_history_size > 1:
        self.transition = HiddenMarkovModel._linear_smooth(
            self.labels, self.fallback_transition, self.label_history_size)
    else:
        self.transition = self.fallback_transition[-1]

    # Convert to log score counters
    self.transition.log()
    self.label_emissions.log()
    self.emission.log()

    self.reverse_transition = self.transition.inverted()

    # Train the fallback model on the label-emission pairs
    if fallback_model:
        # The cache is keyed on how much data the model was trained on:
        # the limit when one is set, the full sequence length otherwise.
        # The same value is written when saving so a later run can hit.
        expected_length = fallback_training_limit or len(labeled_sequence)

        try:
            start = time()

            # NOTE(security): unpickling runs arbitrary code from the
            # file; only use this with a cache file you wrote yourself.
            # Binary mode is required for the binary pickle protocol.
            with open("fallback_model.pickle", "rb") as pickle_file:
                self.fallback_emissions_model, training_pairs_length = \
                    pickle.load(pickle_file)

            # A cache built from a different amount of data is stale;
            # IOError routes it to the retraining branch below.
            if training_pairs_length != expected_length:
                raise IOError()

            print("Unpickling fallback model: %f" % (time() - start))
        except (IOError, EOFError):
            print("Training fallback model")
            self.fallback_emissions_model = fallback_model()

            emissions_training_pairs = [
                (emission_history[-1] + '::' + label, emission)
                for label, emission_history, emission in labeled_sequence
                if label != START_LABEL and label != STOP_LABEL
            ]

            if fallback_training_limit:
                emissions_training_pairs = islice(emissions_training_pairs,
                                                  fallback_training_limit)

            self.fallback_emissions_model.train(emissions_training_pairs)

            serialized = (self.fallback_emissions_model, expected_length)

            with open("fallback_model.pickle", "wb") as pickle_file:
                pickle.dump(serialized, pickle_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
def toy_problem(args): #pragma: no cover
    """Train an HMM on a synthetic 3-state chain and report its accuracy.

    Labels are sampled from a fixed transition matrix and emissions from
    fixed per-state distributions.  A ``HiddenMarkovModel`` with
    ``label_history_size=1`` is trained on 3 x 1000 labeled samples, then
    asked to label 500 fresh emissions; the number of correctly recovered
    labels is printed.

    Args:
        args: unused.
    """
    # Simulate a 3 state markov chain with transition matrix
    # (rows: current state, columns: destination state):
    #          1     2     3
    #    1    0.7   0.3   0
    #    2    0.05  0.4   0.55
    #    3    0.25  0.25  0.5
    transitions = CounterMap()

    transitions['1']['1'] = 0.7
    transitions['1']['2'] = 0.3
    transitions['1']['3'] = 0.0

    transitions['2']['1'] = 0.05
    transitions['2']['2'] = 0.4
    transitions['2']['3'] = 0.55

    transitions['3']['1'] = 0.25
    transitions['3']['2'] = 0.25
    transitions['3']['3'] = 0.5

    def sample_transition(label):
        # Inverse-CDF sampling: walk the outgoing probabilities until the
        # uniform draw is used up.
        sample = random.random()

        for next_state, prob in transitions[label].iteritems():
            sample -= prob
            if sample <= 0.0:
                return next_state

        assert False, "Should have returned a next state"

    # And emissions (state, (counter distribution)):
    # {1 : (yes : 0.5, sure : 0.5), 2 : (maybe : 0.75, who_knows : 0.25),
    #  3 : (no : 1)}
    emissions = {
        '1': {'yes': 0.5, 'sure': 0.5},
        '2': {'maybe': 0.75, 'who_knows': 0.25},
        '3': {'no': 1.0},
    }

    def sample_emission(label):
        # Boundary markers emit themselves.
        if label in [START_LABEL, STOP_LABEL]:
            return label

        choice = random.random()

        for emission, prob in emissions[label].iteritems():
            choice -= prob
            if choice <= 0.0:
                return emission

        assert False, "Should have returned an emission"

    def label_generator(start_label):
        # Endless walk over the chain, beginning at start_label.
        current = start_label
        while True:
            yield current
            current = sample_transition(current)

    # Create the training data: one 1000-step walk from each state.
    # NOTE(review): the walks are separated by (START_LABEL, STOP_LABEL)
    # in that order -- confirm this is what train()'s padding expects.
    states = ['1', '2', '3']

    training_labels = [
        val for _, val in izip(xrange(1000), label_generator('1'))
    ]
    training_labels.extend((START_LABEL, STOP_LABEL))
    training_labels.extend(
        [val for _, val in izip(xrange(1000), label_generator('2'))])
    training_labels.extend((START_LABEL, STOP_LABEL))
    training_labels.extend(
        [val for _, val in izip(xrange(1000), label_generator('3'))])

    training_emissions = [sample_emission(label) for label in training_labels]

    training_signal = zip(training_labels, training_emissions)

    # Training phase
    signal_decoder = HiddenMarkovModel(label_history_size=1)
    signal_decoder.train(training_signal)

    # Labeling phase: given a set of emissions, guess the correct states.
    # Burn-in (easier than hand-calculating stationary distribution &
    # sampling)
    start = random.choice(states)
    for i in xrange(10000):
        start = sample_transition(start)

    test_labels = [val for _, val in izip(xrange(500), label_generator(start))]
    # Emissions must come from the test labels, otherwise the guesses are
    # scored against labels that never produced them.
    test_emissions = [sample_emission(label) for label in test_labels]

    guessed_labels = signal_decoder.label(test_emissions)
    num_correct = sum(1 for guessed, actual in izip(guessed_labels, test_labels)
                      if guessed == actual)

    print("%d labels recovered correctly (%.2f%% correct out of %d)" % (
        num_correct, 100.0 * float(num_correct) / float(len(test_labels)),
        len(test_labels)))