class Trainer(object):
    """Trains a NaiveBayes spam/ham classifier from two directories of email files."""

    # Hoisted out of extract_features so they are built once, not per call.
    # URL matcher (pattern preserved byte-for-byte from the original).
    _LINK_RE = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # Cross-version replacement for the Py2-only str.translate(None, punctuation).
    _PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))

    def __init__(self, directory='data/corpus2', spam='spam', ham='ham', limit=1000):
        """Initialize corpus paths and a fresh classifier.

        @param directory The directory that contains the training folders
        @param spam The sub-directory for the spam class
        @param ham The sub-directory for the ham class
        @param limit The maximum number of emails scanned per class
        """
        self.spam_path = os.path.join(os.getcwd(), directory, spam)
        self.ham_path = os.path.join(os.getcwd(), directory, ham)
        self.limit = limit
        self.classifier = NaiveBayes()

    def train(self, verbose=False):
        """Train on both classes and return the trained classifier.

        @param verbose Controls how much progress information is printed
        @return The NaiveBayes classifier object
        """
        self.train_classifier(self.spam_path, 'spam', verbose)
        self.train_classifier(self.ham_path, 'ham', verbose)
        return self.classifier

    def extract_features(self, text):
        """Tokenize *text* and extract the features described in README.md.

        Tokens shorter than 3 chars (after punctuation stripping) and English
        stopwords are dropped; the rest map to ATTACHMENT / NUMBER / ALL_CAPS /
        LINK markers and/or their lowercased Porter stem.

        @param text The text to be scanned
        @return A list of feature strings
        """
        porter = stem.porter.PorterStemmer()
        # Build the stopword set once: O(1) membership instead of scanning
        # the nltk stopword *list* for every token.
        stops = set(stopwords.words('english'))
        features = []
        for token in text.split():
            if token in stops:
                continue
            # Strip punctuation once and reuse (original stripped up to twice).
            bare = self._PUNCT_RE.sub('', token)
            if len(bare) < 3:
                continue
            if "." + token in mimetypes.types_map:
                features.append('ATTACHMENT')
            elif token.isdigit():
                features.append('NUMBER')
            elif token.upper() == token:
                # NOTE: unlike LINK/ATTACHMENT/NUMBER, an all-caps token
                # contributes both the marker and its stem (original behavior).
                features.append('ALL_CAPS')
                features.append(porter.stem(bare).lower())
            elif self._LINK_RE.match(token):
                features.append('LINK')
            else:
                features.append(porter.stem(bare).lower())
        return features

    def train_classifier(self, path, label, verbose):
        """Train the classifier on every email file under *path*.

        Unlike the original, this no longer calls os.chdir(): file names are
        joined with *path*, so the process working directory is left alone.

        @param path The path of the data to be trained
        @param label The label under which the data is classified
        @param verbose The verbosity of the statistics printed
        """
        emails = os.listdir(path)
        limit = min(len(emails), self.limit)
        if verbose:
            print(colored("Training %d emails in %s class" % (limit, label), 'green'))
        for email in emails[:limit]:
            if verbose and verbose > 1:
                print(colored("Working on file %s" % (email), 'green'))
            # 'with' guarantees the handle is closed even when decoding below
            # fails; the original leaked the file on the 'continue' path.
            with open(os.path.join(path, email), 'r') as email_file:
                email_text = email_file.read()
            try:
                email_text = bs4.UnicodeDammit.detwingle(email_text).decode('utf-8')
            except Exception:
                # Deliberate best-effort: skip undecodable files, keep training.
                print(colored("Skipping file %s because of bad coding" % (email), 'red'))
                continue
            email_text = email_text.encode('ascii', 'ignore')
            features = self.extract_features(email_text)
            self.classifier.train(features, label)
        print(colored(self.classifier, 'green'))