# Build Targets
print("Build target vector and data vector from documents")


def party_fn(speech):
    """Map a speech to its numeric class label based on the speaker's party.

    Args:
        speech: object exposing ``speaker_party`` and ``speech_id``.

    Returns:
        1 when ``speech.speaker_party`` is ``'D'``, 0 when it is ``'R'``.

    Raises:
        ValueError: if the party is neither ``'D'`` nor ``'R'``.
    """
    party = speech.speaker_party
    if party == 'D':
        return 1
    if party == 'R':
        return 0
    # ValueError is still caught by any existing ``except Exception`` handler.
    raise ValueError("Speech must have party 'D' or 'R': " + str(speech.speech_id))


# ``bunch`` is read below through its ``data`` and ``target`` attributes.
bunch = Classifier.bunch_with_targets(speeches=speeches, target_function=party_fn)
data = vectorizer.fit_transform(bunch.data)  # .tocsr()#.toarray()

# Print Stuff
learned_vocabulary = vectorizer.get_feature_names()
print("Learned %d words in vocabulary" % len(learned_vocabulary))
print(learned_vocabulary)
print("")
print("Sparse Matrix of TfIdf Values pf each term for each document")
# Parenthesised single-argument form behaves the same as the Python 2 print
# statement and is also valid on Python 3, matching the other print calls here.
print(data)
target = array(bunch.target)
print("")

# Run Cross Validation Checks
# Every print below uses the single-argument parenthesised form, which produces
# the same output as the Python 2 print statement and is also valid on Python 3.
print("%d democratic speeches" % len(dem_speeches))

# NOTE: a commented-out, hand-rolled variant used to sit here (plus an ipdb
# breakpoint). It derived priors from len(rep_speeches) / len(speeches) and
# len(dem_speeches) / len(speeches) and multiplied them by per-class sums of
# the frame words' log probabilities. The Classifier-based path below is what
# actually runs.

print("Recompute Naieve Bayes Output For Classifying Frame (%s) Within Window (%s) for phrase %s"
      % (frame.seed_word, speech_window_key, analysis.phrase))

# Train a classifier whose vocabulary is restricted to the frame's own words.
naive_bayes = Classifier(vocab=frame.word_string.split())
training_set = Classifier.bunch_with_targets(speeches, analysis.target_function2)
naive_bayes.train_classifier(training_set.data, training_set.target)

# Index 0 is reported as class A (Rep) and index 1 as class B (Dem) below.
probabilities = naive_bayes.classify_document(frame.word_string)
tfidf_frames_vector = naive_bayes.vectorizer.transform([frame.word_string])

# "%s %s" reproduces the separator the old ``print label, value`` statement
# inserted between its two items, so the emitted text is unchanged.
print("%s %s" % ("Predicted Class: ", naive_bayes.classifier.predict(tfidf_frames_vector)[0]))
print("%s %s" % ("Predict Proba: ", naive_bayes.classifier.predict_proba(tfidf_frames_vector)[0]))
print("%s %s" % ("Probability A (Rep): ", probabilities[0]))
print("%s %s" % ("Probability B (Dem): ", probabilities[1]))

# Strict comparison: a tie falls through to the Democratic branch.
# ``t`` is presumably a terminal-colour helper defined elsewhere -- TODO confirm.
if probabilities[0] > probabilities[1]:
    print(t.red("A (Rep) NB Proba > B (Dem) NB Proba: Classify Republican"))
else:
    print(t.cyan("B (Dem) NB Proba > A (Rep) NB Proba: Classify Democratic"))