import random

import nb  # project helper module with the data-loading and feature utilities (may be imported under an alias in the full project)

# Grab the blacklist word data
blacklist = nb.get_blacklist_data('data/blacklist_words')

# Get negative comment data as a hash, assign val = 1 in the hash
bad_training_raw = nb.grab_comment_data_dir_walk('data/', 'bad', 1)
# Get positive comment data as a hash, assign val = -1 in the hash
good_training_raw = nb.grab_comment_data_dir_walk('data/', 'good', -1)

# Get held-out data
bad_heldout_raw = nb.grab_comment_data_dir_walk('heldout/', 'heldout_bad', 1)
good_heldout_raw = nb.grab_comment_data_dir_walk('heldout/', 'heldout_good', -1)

print "\nBeginning feature extraction"

# Get the features for the negative comment data
bad_training_features = [(nb.get_features(comment, blacklist), val)
                         for (comment, val) in bad_training_raw.items()]
# Get the features for the positive comment data
good_training_features = [(nb.get_features(comment, blacklist), val)
                          for (comment, val) in good_training_raw.items()]

# Get the features for the held-out data
bad_heldout_features = [(nb.get_features(comment, blacklist), val)
                        for (comment, val) in bad_heldout_raw.items()]
good_heldout_features = [(nb.get_features(comment, blacklist), val)
                         for (comment, val) in good_heldout_raw.items()]

print "feature extraction complete\n"

# Combine the feature sets, then shuffle/randomize the training set
combined_feature_sets = bad_training_features + good_training_features
combined_heldout_sets = bad_heldout_features + good_heldout_features
random.shuffle(combined_feature_sets)
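# The classifier used by classify_with_NB below is not built in this excerpt.
# A minimal sketch of the missing training step, assuming the backing model is
# NLTK's NaiveBayesClassifier (the (feature-dict, label) tuples above match its
# expected input format); substitute whichever classifier the project actually trains.
import nltk

classifier = nltk.NaiveBayesClassifier.train(combined_feature_sets)

# Sanity-check generalization on the held-out comments.
heldout_accuracy = nltk.classify.accuracy(classifier, combined_heldout_sets)
print "held-out accuracy: %.3f" % heldout_accuracy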
def classify_with_NB(comment):
    # Given a comment string,
    # @return 1 if the comment needs to be flagged
    # @return -1 if the comment is fine
    comment_feature = nb.get_features(comment, blacklist)
    return classifier.classify(comment_feature)
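# Illustrative usage only; the example comments below are made up, not taken
# from the project's data.
if classify_with_NB("you are a worthless idiot") == 1:
    print "comment flagged for review"
if classify_with_NB("thanks, this explanation was really helpful") == -1:
    print "comment looks fine"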
def main():
    df_all_train, df_test = dutil.join_data(path_to_data)
    features = feature_util.get_features(df_all_train)
    result = getattr(model_util, model_param)(df_all_train, df_test)
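# getattr(model_util, model_param) above looks the model function up by name at
# run time, so the model can be chosen from a config value or a CLI flag. A
# self-contained sketch of that dispatch pattern follows; the module and
# function names here are hypothetical, not the project's actual API.
import types

model_util_demo = types.ModuleType("model_util_demo")

def run_baseline(df_train, df_test):
    # Hypothetical model entry point: fit on df_train, predict on df_test.
    return {"model": "baseline", "n_train": len(df_train), "n_test": len(df_test)}

model_util_demo.run_baseline = run_baseline

model_param_demo = "run_baseline"  # e.g. read from a config file or argparse
result_demo = getattr(model_util_demo, model_param_demo)([1, 2, 3], [4, 5])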