class NaiveBayesClassifier:
    def train(self, labeled_data):
        self.feature_distribution = CounterMap()
        labels = set()

        # Count how often each character-trigram feature occurs with each label
        for label, datum in labeled_data:
            labels.add(label)
            for feature in ngrams(datum, 3):
                self.feature_distribution[feature][label] += 1

        # Give unseen (feature, label) pairs a small default count before normalizing
        for feature in self.feature_distribution.iterkeys():
            self.feature_distribution[feature].default = 0.01

        self.feature_distribution.normalize()
        self.feature_distribution.log()

    def label_distribution(self, datum):
        # Sum the per-feature log distributions, then renormalize in log space
        distribution = None

        for feature in ngrams(datum, 3):
            if distribution:
                distribution += self.feature_distribution[feature]
            else:
                distribution = copy(self.feature_distribution[feature])

        distribution.log_normalize()
        return distribution

    def label(self, datum):
        distribution = None

        for feature in ngrams(datum, 3):
            if distribution:
                distribution += self.feature_distribution[feature]
            else:
                distribution = copy(self.feature_distribution[feature])

        return distribution.arg_max()
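# A minimal usage sketch, not part of the original source: it assumes the
# surrounding project's CounterMap and ngrams helpers are in scope, and that
# training data is an iterable of (label, string) pairs as train() expects.
# The names and labels below are purely hypothetical illustration data.
def _naive_bayes_example():
    training_data = [
        ('english', 'smith'), ('english', 'jones'),
        ('french', 'dubois'), ('french', 'lefebvre'),
    ]
    classifier = NaiveBayesClassifier()
    classifier.train(training_data)
    # label() sums the per-trigram log scores and returns the highest-scoring label
    print classifier.label('duval')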
class HiddenMarkovModel:
    def __init__(self, label_history_size=2):
        # Distribution over next state given current state
        self.labels = list()
        self.label_history_size = label_history_size
        self.transition = CounterMap()
        # Same as transitions but indexed in reverse (useful for decoding)
        self.reverse_transition = CounterMap()

        self.fallback_emissions_model = None
        self.fallback_transition = None
        self.fallback_reverse_transition = None

        # Multinomial distribution over emissions given label
        self.emission = CounterMap()
        # p(label | emission)
        self.label_emissions = CounterMap()

    def _pad_sequence(self, sequence, pairs=False):
        if pairs:
            yield (START_LABEL, START_LABEL)
        else:
            yield START_LABEL

        for item in sequence:
            yield item

        # Pad the end so we'll decode the whole thing
        for _ in xrange(self.label_history_size):
            if pairs:
                yield (STOP_LABEL, STOP_LABEL)
            else:
                yield STOP_LABEL

    @classmethod
    def _extend_labels(cls, sequence, label_history_size):
        '''
        >>> foo = HiddenMarkovModel()
        >>> foo._extend_labels((('A', 3), ('B', 4), ('C', 5)), 1)
        [('A', (), 3), ('B', (), 4), ('C', (), 5)]
        >>> foo._extend_labels((('A', 3), ('B', 4), ('C', 5)), 2)
        [('A', ('<START>',), 3), ('B', ('A',), 4), ('C', ('B',), 5)]
        '''
        last_labels = [START_LABEL for _ in xrange(label_history_size)]

        for label, emission in sequence:
            last_labels.append(label)
            last_labels.pop(0)

            if label == START_LABEL:
                last_labels = [START_LABEL for _ in xrange(label_history_size)]

            all_labels = ('::'.join(last_labels[label_history_size - length - 2:-1])
                          for length in xrange(label_history_size - 1))
            yield (label, tuple(all_labels), emission)

    @property
    def start_label(self):
        return '::'.join(repeat(START_LABEL, self.label_history_size))

    @property
    def stop_label(self):
        return '::'.join(repeat(STOP_LABEL, self.label_history_size))

    def push_label(self, history, label):
        return '::'.join(history.split('::')[1:] + [label, ])

    @classmethod
    def _linear_smooth(cls, labels, fallback_transition, label_history_size):
        transition = CounterMap()
        linear_smoothing_weights = [1.0 - 0.1 * (label_history_size - 1)]
        linear_smoothing_weights.extend(0.1 for _ in xrange(label_history_size - 1))

        # This is super inefficient - it should be caching smoothings involving
        # the less-specific counters, e.g.
        # smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
        all_label_histories = set(permutations(labels, label_history_size - 1))
        for label_history in all_label_histories:
            histories = [history for history in
                         (label_history[i:] for i in xrange(label_history_size))]
            # >>> label_history = ('WDT', 'RBR')
            # histories = [('WDT', 'RBR'), ('RBR')]
            history_strings = ['::'.join(history) for history in histories]
            history_scores = [fallback_transition[len(history)][history_string]
                              for history, history_string in izip(histories, history_strings)]

            transition[history_strings[0]] = Counter()
            for smoothing, history_score in izip(linear_smoothing_weights, history_scores):
                transition[history_strings[0]] += history_score * smoothing

        transition.normalize()

        return transition

    def train(self, labeled_sequence, fallback_model=None,
              fallback_training_limit=None, use_linear_smoothing=True):
        label_counts = [Counter() for _ in xrange(self.label_history_size)]
        self.fallback_transition = [CounterMap() for _ in xrange(self.label_history_size)]
        self.fallback_reverse_transition = [CounterMap() for _ in xrange(self.label_history_size)]

        labeled_sequence = self._pad_sequence(labeled_sequence, pairs=True)
        labeled_sequence = list(HiddenMarkovModel._extend_labels(labeled_sequence,
                                                                 self.label_history_size + 1))

        # Load emission and transition counters from the raw data
        for label, label_histories, emission in labeled_sequence:
            full_label = self.push_label(label_histories[-1], label)
            self.emission[full_label][emission] += 1.0
            self.label_emissions[emission][full_label] += 1.0

            for history_size, label_history in enumerate(label_histories):
                label_counts[history_size][label_history] += 1.0
                self.fallback_transition[history_size][label_history][full_label] += 1.0

        # Make the counters distributions
        for transition in self.fallback_transition:
            transition.normalize()
        self.label_emissions.normalize()
        self.emission.normalize()

        self.labels = self.emission.keys()

        # Smooth transitions using fallback data
        # Doesn't work with label history size 1!
        if use_linear_smoothing and self.label_history_size > 1:
            self.transition = HiddenMarkovModel._linear_smooth(self.labels,
                                                               self.fallback_transition,
                                                               self.label_history_size)
        else:
            self.transition = self.fallback_transition[-1]

        # Convert to log score counters
        self.transition.log()
        self.label_emissions.log()
        self.emission.log()

        self.reverse_transition = self.transition.inverted()

        # Train the fallback model on the label-emission pairs
        if fallback_model:
            try:
                start = time()
                pickle_file = open("fallback_model.pickle")
                self.fallback_emissions_model, training_pairs_length = pickle.load(pickle_file)
                pickle_file.close()

                if fallback_training_limit and fallback_training_limit != training_pairs_length:
                    raise IOError()
                elif not fallback_training_limit and len(labeled_sequence) != training_pairs_length:
                    raise IOError()

                print "Unpickling fallback model: %f" % (time() - start)
            except (IOError, EOFError), e:
                print "Training fallback model"
                self.fallback_emissions_model = fallback_model()
                emissions_training_pairs = [(emission_history[-1] + '::' + label, emission)
                                            for label, emission_history, emission in labeled_sequence
                                            if label != START_LABEL and label != STOP_LABEL]

                if fallback_training_limit:
                    emissions_training_pairs = islice(emissions_training_pairs,
                                                      fallback_training_limit)

                self.fallback_emissions_model.train(emissions_training_pairs)

                serialized = (self.fallback_emissions_model, len(labeled_sequence))
                pickle_file = open("fallback_model.pickle", "w")
                pickle.dump(serialized, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
                pickle_file.close()

        self._post_training()
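# A minimal training sketch, not part of the original source: it assumes the
# project's START_LABEL constant and the Counter/CounterMap helpers are in
# scope, and that _post_training() (called at the end of train() above) is
# defined elsewhere in the class. The tags and words are hypothetical; train()
# expects one flat sequence of (label, emission) pairs, and _extend_labels()
# suggests sentence boundaries are marked with START_LABEL pairs.
def _hmm_example():
    tagged_words = [
        ('DT', 'the'), ('NN', 'dog'), ('VBZ', 'barks'),
        (START_LABEL, START_LABEL),
        ('DT', 'the'), ('NN', 'cat'), ('VBZ', 'sleeps'),
    ]
    hmm = HiddenMarkovModel(label_history_size=2)
    # Without a fallback model, unseen emissions get no smoothed scores
    hmm.train(tagged_words, fallback_model=None, use_linear_smoothing=True)
    print hmm.start_label  # e.g. '<START>::<START>' given the doctest's START_LABEL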