Exemplo n.º 1
0
	def __init__(self, training, ngram_degree, alphabet):
		'''
		Build the global and per-class ngram tables from labelled examples.

		training
			[(Iterable, str), (Iterable, str), ...]
			Pairs of (examples, class name). Every example is handed to
			__ling__.ngram_frequency together with ngram_degree. Entries
			that share a class name are accumulated into the same class.
		ngram_degree
			Forwarded to __ling__.ngram_frequency and stored as self.degree
			(presumably the ngram length -- confirm against __ling__).
		alphabet
			Iterable of ngrams used for add-one (Laplace) smoothing: each
			listed ngram has its count incremented by one in the global
			table and in the table of every class.

		After construction self.ngrams and every self.classes[name] hold
		whatever __normalise__ returns for the corresponding count table.
		'''

		self.ngrams = {}
		self.classes = {}
		self.degree = ngram_degree

		# get frequency count
		for entry in training:
			examples, name = entry[0], entry[1]
			# setdefault rather than "= {}": a class name that shows up in
			# several training entries must keep the counts gathered so far,
			# otherwise the class table and self.ngrams would disagree
			class_counts = self.classes.setdefault(name, {})
			for example in examples:
				freq = __ling__.ngram_frequency(example, ngram_degree)
				# NOTE(review): every ngram yielded by iterating freq adds 1
				# per example; the value stored in freq is not used here --
				# confirm that per-example presence counting is intended
				for ngram in freq:
					class_counts[ngram] = class_counts.get(ngram, 0) + 1
					self.ngrams[ngram] = self.ngrams.get(ngram, 0) + 1

		# laplace smoothing: one extra count per alphabet entry, both in the
		# global table and in each class table
		for ngram in alphabet:
			self.ngrams[ngram] = self.ngrams.get(ngram, 0) + 1
			for class_counts in self.classes.values():
				class_counts[ngram] = class_counts.get(ngram, 0) + 1

		# turn everything into probabilities (delegated to __normalise__)
		self.ngrams = __normalise__(self.ngrams)
		for clazz in self.classes:
			self.classes[clazz] = __normalise__(self.classes[clazz])
Exemplo n.º 2
0
	def classify(self, string):
		'''
		Return the best-scoring class for string, or None.

		string
			Sequence to classify; it is passed to __ling__.ngram_frequency
			together with self.degree.

		Returns None when string is shorter than self.degree (such strings
		cannot be classified) or when self.classes is empty; otherwise one
		of the keys of self.classes. On a tie the class that comes first
		when iterating self.classes wins.

		Each ngram occurrence contributes prob_class(clazz) *
		prob_ngram_class(ngram, clazz) to the class score. The contributions
		are combined as a sum of logarithms instead of a running product, so
		long strings no longer underflow to 0.0 and make every class tie.
		'''
		# local import keeps this method independent of the module's imports
		from math import log

		# these strings cannot be classified!
		if len(string) < self.degree:
			return None

		# get frequency count for the ngrams in the string
		freq = __ling__.ngram_frequency(string, self.degree)

		# log-space stand-in for a probability of zero
		impossible = float('-inf')

		best_category = None
		best_score = None
		for clazz in self.classes:
			# the class prior does not depend on the ngram: look it up once
			prob_c = self.prob_class(clazz)
			log_c = log(prob_c) if prob_c > 0 else impossible

			score = 0.0
			occurrences = 0
			for ngram in freq:
				count = freq[ngram]
				if count <= 0:
					# contributes no factor (and avoids 0 * -inf == nan)
					continue
				occurrences += count
				prob_xc = self.prob_ngram_class(ngram, clazz)
				log_xc = log(prob_xc) if prob_xc > 0 else impossible
				# NOTE(review): the prior is applied once per ngram
				# occurrence, exactly as the product of prob_c * prob_xc
				# terms did before; textbook naive Bayes applies the prior
				# once per class -- confirm which is intended
				score += count * (log_c + log_xc)

			# with no ngram occurrences at all the class gets the lowest
			# possible score (the product form used 0 for this case)
			if occurrences == 0:
				score = impossible

			# strict ">" keeps the first class on ties
			if best_score is None or score > best_score:
				best_score = score
				best_category = clazz

		return best_category