import codecs
import logging
import math
import sys

import nltk

# fs and words are helper modules defined elsewhere in this project
# (introduced in the earlier demos in this series).


def train_classifier(args):
    logging.debug("Training classifier")

    # Use the same corpora that we have used in previous demos
    training_set_names = [
        "abc", "genesis", "gutenberg", "inaugural", "stateUnion", "webtext", "custom"
    ]

    # Open a CSV file with 3 columns. The first column is the name of the corpus
    # (which in this example is also the name of the class). The second is a single
    # term from the corpus. The third (named "probability" in the header) holds the
    # raw count with which the term occurs in the corpus.
    training = fs.open_csv_file("bayes_training.csv", ["class", "term", "probability"])

    # Ignore stopwords
    stopwords = nltk.corpus.stopwords.words('english')

    # Iterate through each of the training sets
    for training_set_name in training_set_names:
        # Load the words and corpus name from the requested corpus.
        terms_array, corpus_name = words.load_text_corpus(
            {training_set_name: args[training_set_name]})

        # Stem the terms if stemming is enabled
        if args["stemming"]:
            terms_array = words.stem_words_array(terms_array)

        # Count up the unique terms in the words array
        term_counts = words.collect_term_counts(terms_array)

        # Write the count of each term occurring in the given class out to the CSV
        for term, count in term_counts.items():
            # We ignore stop words and punctuation
            if term not in stopwords and term.isalnum():
                training.writerow([corpus_name, term.lower(), count])
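The fs and words helpers are not shown in this section. As a rough sketch of what collect_term_counts is doing, assuming it simply maps each term to its occurrence count over the raw token list (the real helper may differ):

from collections import Counter

def collect_term_counts(terms_array):
    # Hypothetical stand-in for words.collect_term_counts, assuming it
    # returns a {term: count} mapping over the list of tokens.
    return dict(Counter(terms_array))

# e.g. collect_term_counts(["the", "cat", "the"]) -> {"the": 2, "cat": 1}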
def classify(args):
    file_name = args["classify"]
    logging.debug("Classify " + file_name)

    # Load the training data and class names.
    training_data, class_names = load_training_data()

    # Read in the document to classify
    to_classify = codecs.open(file_name, "r", "utf-8").read()

    # Tokenize the document to classify.
    to_classify_terms = nltk.word_tokenize(to_classify)

    # If we have enabled stemming then stem these words
    if args["stemming"]:
        to_classify_terms = words.stem_words_array(to_classify_terms)

    # We are now ready to actually classify the document. We need to determine
    # the probability that our document (D) is a member of each of our classes (C).
    # We calculate this probability by taking the product of the probability that
    # each word in the document belongs to the class C (this is the "naive" part of
    # the classifier - we assume that each word probability is independent of all
    # other word probabilities). This gives the probability of the words in this
    # document given a class C -> P(w|c)
    class_probabilities = {}

    # In this example, each class consists of just one document, so the prior
    # probability that a document falls in a class is 1 / the number of classes.
    # We use log probabilities to counteract the effect of the product of many
    # near-0 probabilities. In our example we are treating each corpus as a single
    # document, so the probability of a given document is 1 / the number of
    # corpora. If this weren't the case we'd track the number of documents per
    # category; categories with many documents would be more likely to be picked
    # by the classifier because this prior would be relatively high compared to
    # other categories.
    log_probability_of_class = math.log(1.0 / len(class_names))

    stopwords = nltk.corpus.stopwords.words('english')

    # We need the total vocabulary size in order to do Laplace smoothing
    vocabulary_size = calculate_vocabulary_size(training_data)
    logging.debug("Total vocabulary size " + str(vocabulary_size) + " terms")

    # Calculate the word probability product for each class P(w|c)
    for class_name in class_names:
        logging.debug("Calculating log probability for class " + class_name)

        # Keeping everything in log probabilities - log(1) = 0
        log_probability_of_words_in_class = 0.0

        # We need the number of terms in the class (note - NOT unique terms)
        number_of_terms_in_class = calculate_number_of_terms_in_class(
            training_data[class_name])
        logging.debug("Class contains " + str(number_of_terms_in_class) + " terms")

        # Take the product of all the probabilities of a term appearing in the
        # class as calculated during training
        for term in to_classify_terms:
            # Treat capitalized and lowercase as a single term
            term = term.lower()

            # We ignore stop words and punctuation
            if term not in stopwords and term.isalnum():
                # We have to smooth the probabilities of unknown words so that a
                # term we don't recognize is treated as having a very small
                # probability. If we left its probability as 1 it wouldn't affect
                # the product, but in truth unrecognized terms should be treated
                # as rare rather than common. Here we use Laplace (add-one)
                # smoothing.
                if term in training_data[class_name]:
                    term_frequency = float(training_data[class_name][term])
                else:
                    term_frequency = 0.0  # Yields a probability very near 0 after smoothing

                term_probability_in_trained_class = (term_frequency + 1) / (
                    number_of_terms_in_class + vocabulary_size)

                if args["printProbabilities"]:
                    logging.warning("The word <" + term + "> occurs with frequency " +
                                    str(term_frequency) + " and probability " +
                                    str(term_probability_in_trained_class))

                # Log probabilities are summed instead of multiplying raw
                # probabilities, to avoid underflowing toward 0 as we multiply
                # many small numbers
                log_probability_of_words_in_class += math.log(
                    term_probability_in_trained_class)

        # We now know P(c) and P(w|c). We are planning to use Bayes' theorem:
        # P(A|B) = P(B|A) * P(A) / P(B) to learn P(c|w) - the probability of
        # a class given the words in a document. Plugging into Bayes' theorem:
        # P(c|w) = P(w|c) * P(c) / P(w). P(w) is only a function of the words
        # in the document we are classifying, so it is constant across classes
        # and we can drop it. That leaves P(c|w) proportional to P(w|c) * P(c),
        # or, in log space, log P(w|c) + log P(c).
        class_probabilities[class_name] = (
            log_probability_of_words_in_class + log_probability_of_class)
        logging.debug("")

    # We now have one probability per class. We simply take the class with the
    # highest probability and label the document as belonging to that class.
    max_class = None
    max_probability = -sys.float_info.max
    for class_name, probability in class_probabilities.items():
        logging.debug("Probability of " + class_name + " is " + str(probability))
        if probability > max_probability:
            max_probability = probability
            max_class = class_name
    return max_class, max_probability
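To see the add-one smoothing in isolation, here is a toy calculation using the same formula as classify but with made-up counts (not taken from the demo corpora). It shows why an unseen term still contributes a small, finite log probability instead of log(0):

import math

# Hypothetical class statistics: 1,000 total term occurrences in the class,
# 10,000 unique terms across the whole training vocabulary.
number_of_terms_in_class = 1000.0
vocabulary_size = 10000.0

seen = (50.0 + 1) / (number_of_terms_in_class + vocabulary_size)   # term seen 50 times
unseen = (0.0 + 1) / (number_of_terms_in_class + vocabulary_size)  # term never seen

print(math.log(seen))    # ~ -5.4
print(math.log(unseen))  # ~ -9.3, small but finite; math.log(0) would raise an error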
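classify also leans on three helpers that are not shown in this section: load_training_data, calculate_vocabulary_size, and calculate_number_of_terms_in_class. A minimal sketch of how they could be implemented against the CSV written by train_classifier; the path, the assumption that the file carries a header row, and the signatures are all guesses rather than the project's actual code:

import csv

def load_training_data(path="bayes_training.csv"):
    # Rebuild {class_name: {term: count}} from the training CSV. The column
    # is named "probability" in the header, but train_classifier writes raw
    # counts into it.
    training_data = {}
    with open(path, newline="", encoding="utf-8") as csv_file:
        for row in csv.DictReader(csv_file):
            training_data.setdefault(row["class"], {})[row["term"]] = float(row["probability"])
    return training_data, list(training_data.keys())

def calculate_vocabulary_size(training_data):
    # Number of unique terms across all classes.
    vocabulary = set()
    for term_counts in training_data.values():
        vocabulary.update(term_counts)
    return len(vocabulary)

def calculate_number_of_terms_in_class(term_counts):
    # Total term occurrences in the class (NOT unique terms).
    return sum(term_counts.values())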