Example #1
class NaiveBayesClassifier:
	def train(self, labeled_data):
		self.feature_distribution = CounterMap()
		labels = set()

		for label, datum in labeled_data:
			labels.add(label)
			for feature in ngrams(datum, 3):
				self.feature_distribution[feature][label] += 1

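		# Give every (feature, label) pair a small default count so combinations unseen in training don't zero out a label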
		for feature in self.feature_distribution.iterkeys():
			self.feature_distribution[feature].default = 0.01

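		# Turn each feature's counts into a distribution over labels, then move to log space so per-feature scores can be summed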
		self.feature_distribution.normalize()
		self.feature_distribution.log()

	def label_distribution(self, datum):
		distribution = None

		for feature in ngrams(datum, 3):
			if distribution:
				distribution += self.feature_distribution[feature]
			else:
				distribution = copy(self.feature_distribution[feature])

		distribution.log_normalize()

		return distribution

	def label(self, datum):
		distribution = None

		for feature in ngrams(datum, 3):
			if distribution:
				distribution += self.feature_distribution[feature]
			else:
				distribution = copy(self.feature_distribution[feature])

		return distribution.arg_max()
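
The classifier above leans on python-nlp's own helpers (CounterMap, ngrams, copy), which are imported elsewhere in the file. As a rough, self-contained point of comparison, a standard multinomial naive Bayes over character trigrams can be sketched with only the standard library. The names char_ngrams, train_nb and label_nb below are illustrative rather than part of the project, and the 0.01 smoothing constant mirrors the default used above.

from collections import defaultdict
from math import log

def char_ngrams(text, n=3):
	# Overlapping character n-grams, e.g. "spam" -> ["spa", "pam"]
	return [text[i:i + n] for i in range(len(text) - n + 1)]

def train_nb(labeled_data, smoothing=0.01):
	# counts[label][feature]: how often each feature co-occurred with each label
	counts = defaultdict(lambda: defaultdict(float))
	for label, datum in labeled_data:
		for feature in char_ngrams(datum):
			counts[label][feature] += 1.0

	# Per-label log scores, with a small constant standing in for features
	# never seen with that label
	model = {}
	for label, feature_counts in counts.items():
		total = sum(feature_counts.values())
		model[label] = {
			'scores': dict((f, log(c / total)) for f, c in feature_counts.items()),
			'unseen': log(smoothing / total),
		}
	return model

def label_nb(model, datum):
	# Sum log scores over the datum's features and return the best label
	def score(label):
		entry = model[label]
		return sum(entry['scores'].get(f, entry['unseen'])
				   for f in char_ngrams(datum))
	return max(model, key=score)

if __name__ == '__main__':
	model = train_nb([('ham', 'meeting at noon'), ('spam', 'cheap pills now')])
	print label_nb(model, 'cheap pills')  # -> 'spam'

The decision rule has the same shape as label() above (sum log scores over a datum's n-grams, take the arg max), though this sketch conditions features on the label, whereas the class accumulates a per-feature distribution over labels.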
Example #2
class HiddenMarkovModel:
    def __init__(self, label_history_size=2):
        # Distribution over next state given current state
        self.labels = list()
        self.label_history_size = label_history_size
        self.transition = CounterMap()
        # same as transitions but indexed in reverse (useful for decoding)
        self.reverse_transition = CounterMap()

        self.fallback_emissions_model = None
        self.fallback_transition = None
        self.fallback_reverse_transition = None

        # Multinomial distribution over emissions given label
        self.emission = CounterMap()
        # p(label | emission)
        self.label_emissions = CounterMap()

    def _pad_sequence(self, sequence, pairs=False):
        if pairs: yield (START_LABEL, START_LABEL)
        else: yield START_LABEL

        for item in sequence:
            yield item

        # Pad the end so we'll decode the whole thing
        for _ in xrange(self.label_history_size):
            if pairs: yield (STOP_LABEL, STOP_LABEL)
            else: yield STOP_LABEL

    @classmethod
    def _extend_labels(cls, sequence, label_history_size):
        '''
        >>> foo = HiddenMarkovModel()
        >>> foo._extend_labels((('A', 3), ('B', 4), ('C', 5)), 1)
        [('A', (), 3), ('B', (), 4), ('C', (), 5)]
        >>> foo._extend_labels((('A', 3), ('B', 4), ('C', 5)), 2)
        [('A', ('<START>',), 3),
         ('B', ('A',), 4),
         ('C', ('B',), 5)]
        '''
        last_labels = [START_LABEL for _ in xrange(label_history_size)]

        for label, emission in sequence:
            last_labels.append(label)
            last_labels.pop(0)

            if label == START_LABEL:
                last_labels = [START_LABEL for _ in xrange(label_history_size)]

            all_labels = ('::'.join(last_labels[label_history_size - length - 2:-1])
                          for length in xrange(label_history_size - 1))
            yield (label, tuple(all_labels), emission)

    @property
    def start_label(self):
        return '::'.join(repeat(START_LABEL, self.label_history_size))

    @property
    def stop_label(self):
        return '::'.join(repeat(STOP_LABEL, self.label_history_size))

    def push_label(self, history, label):
        return '::'.join(history.split('::')[1:] + [label])

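    # Linear interpolation smoothing: the most specific history gets weight
    # 1 - 0.1 * (label_history_size - 1), and each shorter fallback history gets 0.1.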
    @classmethod
    def _linear_smooth(cls, labels, fallback_transition, label_history_size):
        transition = CounterMap()
        linear_smoothing_weights = [1.0 - 0.1 * (label_history_size - 1)]
        linear_smoothing_weights.extend(
            0.1 for _ in xrange(label_history_size - 1))

        # This is super inefficient - it should be caching smoothings involving the less-specific counters
        # e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
        all_label_histories = set(permutations(labels, label_history_size - 1))
        for label_history in all_label_histories:
            histories = [label_history[i:] for i in xrange(label_history_size)]
            # e.g. with label_history_size == 3 and label_history == ('WDT', 'RBR'):
            #      histories == [('WDT', 'RBR'), ('RBR',), ()]

            history_strings = ['::'.join(history) for history in histories]
            history_scores = [
                fallback_transition[len(history)][history_string]
                for history, history_string in izip(histories, history_strings)
            ]

            transition[history_strings[0]] = Counter()
            for smoothing, history_score in izip(linear_smoothing_weights,
                                                 history_scores):
                transition[history_strings[0]] += history_score * smoothing

        transition.normalize()

        return transition

    def train(self,
              labeled_sequence,
              fallback_model=None,
              fallback_training_limit=None,
              use_linear_smoothing=True):
        label_counts = [Counter() for _ in xrange(self.label_history_size)]
        self.fallback_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]
        self.fallback_reverse_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]

        labeled_sequence = self._pad_sequence(labeled_sequence, pairs=True)
        labeled_sequence = list(
            HiddenMarkovModel._extend_labels(labeled_sequence,
                                             self.label_history_size + 1))

        # Load emission and transition counters from the raw data
        for label, label_histories, emission in labeled_sequence:
            full_label = self.push_label(label_histories[-1], label)

            self.emission[full_label][emission] += 1.0
            self.label_emissions[emission][full_label] += 1.0

            for history_size, label_history in enumerate(label_histories):
                label_counts[history_size][label_history] += 1.0
                self.fallback_transition[history_size][label_history][
                    full_label] += 1.0

        # Make the counters distributions
        for transition in self.fallback_transition:
            transition.normalize()
        self.label_emissions.normalize()
        self.emission.normalize()
        self.labels = self.emission.keys()

        # Smooth transitions using fallback data
        # Doesn't work with label history size 1!
        if use_linear_smoothing and self.label_history_size > 1:
            self.transition = HiddenMarkovModel._linear_smooth(
                self.labels, self.fallback_transition, self.label_history_size)
        else:
            self.transition = self.fallback_transition[-1]

        # Convert to log score counters
        self.transition.log()
        self.label_emissions.log()
        self.emission.log()

        self.reverse_transition = self.transition.inverted()

        # Train the fallback model on the label-emission pairs
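        # A previously trained model is unpickled from "fallback_model.pickle" when its
        # recorded training size matches; otherwise it is retrained and re-pickled below.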
        if fallback_model:
            try:
                start = time()
                pickle_file = open("fallback_model.pickle", "rb")
                self.fallback_emissions_model, training_pairs_length = pickle.load(
                    pickle_file)
                pickle_file.close()

                if fallback_training_limit and fallback_training_limit != training_pairs_length:
                    raise IOError()
                elif not fallback_training_limit and len(
                        labeled_sequence) != training_pairs_length:
                    raise IOError()

                print "Unpickling fallback model: %f" % (time() - start)
            except (IOError, EOFError), e:
                print "Training fallback model"
                self.fallback_emissions_model = fallback_model()

                emissions_training_pairs = [
                    (emission_history[-1] + '::' + label, emission)
                    for label, emission_history, emission in labeled_sequence
                    if label != START_LABEL and label != STOP_LABEL
                ]

                if fallback_training_limit:
                    emissions_training_pairs = islice(emissions_training_pairs,
                                                      fallback_training_limit)

                self.fallback_emissions_model.train(emissions_training_pairs)

                serialized = (self.fallback_emissions_model,
                              len(labeled_sequence))
                pickle_file = open("fallback_model.pickle", "wb")
                pickle.dump(serialized,
                            pickle_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
                pickle_file.close()

        self._post_training()
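
_linear_smooth above interpolates the most specific transition distribution with lower-order ones using fixed weights, and label histories are keyed as '::'-joined strings (see push_label and start_label). The sketch below shows that interpolation over plain dicts; interpolate and the toy fallback table are illustrative names over assumed inputs, not the project's API.

from collections import defaultdict

def interpolate(fallback, history, weights):
    # fallback[k] maps a '::'-joined history of length k to a dict of
    # next-label probabilities; weights[0] applies to the full history,
    # later weights to progressively shorter suffixes of it.
    smoothed = defaultdict(float)
    for weight, start in zip(weights, range(len(history) + 1)):
        suffix = history[start:]  # drop the oldest label each step
        distribution = fallback[len(suffix)].get('::'.join(suffix), {})
        for label, probability in distribution.items():
            smoothed[label] += weight * probability
    return dict(smoothed)

if __name__ == '__main__':
    # Toy tables: a distribution conditioned on the history ('DT',) plus the
    # unconditioned distribution (history length 0, key '')
    fallback = {
        1: {'DT': {'NN': 0.7, 'JJ': 0.3}},
        0: {'': {'NN': 0.4, 'JJ': 0.2, 'VB': 0.4}},
    }
    weights = [0.9, 0.1]  # 1 - 0.1 * (n - 1) and 0.1, for a history size n of 2
    print interpolate(fallback, ('DT',), weights)  # NN ~0.67, JJ ~0.29, VB ~0.04

In the class above, the weights come from linear_smoothing_weights and the per-order tables from self.fallback_transition; the result is then normalized, converted to log scores, and inverted into reverse_transition for decoding.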
Example #3
File: hmm.py Project: beckgom/python-nlp (the same HiddenMarkovModel class shown in Example #2 above)