Пример #1
0
def _extract_labeled_features(raw_comments, blacklist_words):
    """Turn a {comment: label} mapping into a list of (features, label) pairs.

    Each comment is passed to nb.get_features together with the blacklist
    words; the label stored against the comment is carried over unchanged.
    """
    return [(nb.get_features(comment, blacklist_words), label)
            for comment, label in raw_comments.items()]


# Grab the blacklist word data.
blacklist = nb.get_blacklist_data('data/blacklist_words')

# Negative comment data as a hash; every comment is assigned val = 1.
bad_training_raw = nb.grab_comment_data_dir_walk('data/', 'bad', 1)

# Positive comment data as a hash; every comment is assigned val = -1.
good_training_raw = nb.grab_comment_data_dir_walk('data/', 'good', -1)

# Heldout data, labelled the same way (1 = bad, -1 = good).
bad_heldout_raw = nb.grab_comment_data_dir_walk('heldout/', 'heldout_bad', 1)
good_heldout_raw = nb.grab_comment_data_dir_walk('heldout/', 'heldout_good', -1)

# Single-argument print(...) behaves identically as a Python 2 statement and
# as a Python 3 function call, so this script parses on both.
print("\nBeginning feature extraction")

# Features for the negative and positive training comments.
bad_training_features = _extract_labeled_features(bad_training_raw, blacklist)
good_training_features = _extract_labeled_features(good_training_raw, blacklist)

# Features for the heldout comments.
bad_heldout_features = _extract_labeled_features(bad_heldout_raw, blacklist)
good_heldout_features = _extract_labeled_features(good_heldout_raw, blacklist)

print("feature extraction complete\n")

# Combine the bad and good feature sets.
combined_feature_sets = bad_training_features + good_training_features
combined_heldout_sets = bad_heldout_features + good_heldout_features

# Only the training set is shuffled (in place); the heldout set keeps its
# bad-then-good order, exactly as before.
random.shuffle(combined_feature_sets)
Пример #2
0
def classify_with_NB(comment):
    """Classify a single comment string.

    Features are extracted from the comment with nb.get_features using the
    module-level blacklist, then handed to the module-level classifier.

    Returns 1 if the comment needs to be flagged, -1 if the comment is fine.
    """
    features = nb.get_features(comment, blacklist)
    label = classifier.classify(features)
    return label
def main():
  # Script entry point. NOTE(review): only these first statements are visible
  # here; `features` and `result` are not used yet, so the body presumably
  # continues beyond this point -- confirm against the full file.

  # Build the training and test data sets from path_to_data
  # (presumably DataFrames, judging by the df_ prefix -- TODO confirm).
  df_all_train,df_test = dutil.join_data(path_to_data)
  # Derive features from the training data.
  features = feature_util.get_features(df_all_train)
  # Look up the attribute of model_util named by model_param and call it with
  # the training and test data.
  result = getattr(model_util, model_param)(df_all_train,df_test)