class Trainer(object):
    """Trains a NaiveBayes spam/ham classifier from two directories of email files."""

    # Hoisted out of extract_features so they are built once, not per call.
    # URL matcher (pattern preserved byte-for-byte from the original).
    _LINK_RE = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # Cross-version replacement for the Py2-only str.translate(None, punctuation).
    _PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))

    def __init__(self, directory='data/corpus2', spam='spam', ham='ham', limit=1000):
        """Initialize corpus paths and a fresh classifier.

        @param directory The directory that contains the training folders
        @param spam The sub-directory for the spam class
        @param ham The sub-directory for the ham class
        @param limit The maximum number of emails scanned per class
        """
        self.spam_path = os.path.join(os.getcwd(), directory, spam)
        self.ham_path = os.path.join(os.getcwd(), directory, ham)
        self.limit = limit
        self.classifier = NaiveBayes()

    def train(self, verbose=False):
        """Train on both classes and return the trained classifier.

        @param verbose Controls how much progress information is printed
        @return The NaiveBayes classifier object
        """
        self.train_classifier(self.spam_path, 'spam', verbose)
        self.train_classifier(self.ham_path, 'ham', verbose)
        return self.classifier

    def extract_features(self, text):
        """Tokenize *text* and extract the features described in README.md.

        Tokens shorter than 3 chars (after punctuation stripping) and English
        stopwords are dropped; the rest map to ATTACHMENT / NUMBER / ALL_CAPS /
        LINK markers and/or their lowercased Porter stem.

        @param text The text to be scanned
        @return A list of feature strings
        """
        porter = stem.porter.PorterStemmer()
        # Build the stopword set once: O(1) membership instead of scanning
        # the nltk stopword *list* for every token.
        stops = set(stopwords.words('english'))
        features = []
        for token in text.split():
            if token in stops:
                continue
            # Strip punctuation once and reuse (original stripped up to twice).
            bare = self._PUNCT_RE.sub('', token)
            if len(bare) < 3:
                continue
            if "." + token in mimetypes.types_map:
                features.append('ATTACHMENT')
            elif token.isdigit():
                features.append('NUMBER')
            elif token.upper() == token:
                # NOTE: unlike LINK/ATTACHMENT/NUMBER, an all-caps token
                # contributes both the marker and its stem (original behavior).
                features.append('ALL_CAPS')
                features.append(porter.stem(bare).lower())
            elif self._LINK_RE.match(token):
                features.append('LINK')
            else:
                features.append(porter.stem(bare).lower())
        return features

    def train_classifier(self, path, label, verbose):
        """Train the classifier on every email file under *path*.

        Unlike the original, this no longer calls os.chdir(): file names are
        joined with *path*, so the process working directory is left alone.

        @param path The path of the data to be trained
        @param label The label under which the data is classified
        @param verbose The verbosity of the statistics printed
        """
        emails = os.listdir(path)
        limit = min(len(emails), self.limit)
        if verbose:
            print(colored("Training %d emails in %s class" % (limit, label), 'green'))
        for email in emails[:limit]:
            if verbose and verbose > 1:
                print(colored("Working on file %s" % (email), 'green'))
            # 'with' guarantees the handle is closed even when decoding below
            # fails; the original leaked the file on the 'continue' path.
            with open(os.path.join(path, email), 'r') as email_file:
                email_text = email_file.read()
            try:
                email_text = bs4.UnicodeDammit.detwingle(email_text).decode('utf-8')
            except Exception:
                # Deliberate best-effort: skip undecodable files, keep training.
                print(colored("Skipping file %s because of bad coding" % (email), 'red'))
                continue
            email_text = email_text.encode('ascii', 'ignore')
            features = self.extract_features(email_text)
            self.classifier.train(features, label)
        print(colored(self.classifier, 'green'))