def naive_bayes(cache_model):
    """Train the gender Naive-Bayes classifier and print 8-fold accuracy.

    Reads the module-level ``data`` (an iterable of ``(gender, comment)``
    pairs) and, as a side effect, appends to the module-level
    ``list_of_gender``, ``list_of_words`` and ``train_data_gender`` lists.

    Args:
        cache_model: When truthy, the classifier trained on the whole of
            ``train_data_gender`` is passed to ``cache.cache_model`` with the
            path ``model/gender_classifier_<total>.p``.

    Returns:
        The classifier trained on every entry of ``train_data_gender`` (not
        one of the per-fold classifiers).
    """
    folds = 8

    print("Running Naive-Bayes Classifier Training")
    total = len(data)
    start_progress("Pre-processing {} of data".format(total))

    # Mirror of list_of_words used only for O(1) membership tests; the list
    # itself is still what gets extended, so its content and order are the
    # same as with a linear `in` scan.
    known_words = set(list_of_words)

    for idx, (gender, comment) in enumerate(data, start=1):
        word_exist = {}
        word_not_exist = {}
        if gender not in list_of_gender:
            list_of_gender.append(gender)

        for word in word_tokenize(comment):
            word_exist[word] = True
            word_not_exist[word] = False
            if word not in known_words:
                known_words.add(word)
                list_of_words.append(word)

        # One sample per gender seen so far: the comment's own gender gets
        # the "word present" features, every other gender gets the
        # "word absent" features.
        for gen in list_of_gender:
            features = word_exist if gen == gender else word_not_exist
            train_data_gender.append((features, gen))

        progress(idx / total * 100)
    end_progress()

    print("\nFinished pre-processing ({} data)".format(total))
    print("Training {} gender data".format(total))
    main_gender_classifier = NaiveBayesClassifier.train(train_data_gender)
    if cache_model:
        cache.cache_model(
            main_gender_classifier,
            "model/gender_classifier_{}.p".format(total))

    print("Cross validation")
    size = len(train_data_gender)
    accuracy_sum = 0
    for i in range(1, folds + 1):
        # Contiguous fold i is held out; everything before and after it is
        # used for training.
        lower = round((i - 1) * size / folds)
        upper = round(i * size / folds)
        test_set = train_data_gender[lower:upper]
        training_set = train_data_gender[:lower] + train_data_gender[upper:]

        gender_classifier = NaiveBayesClassifier.train(training_set)
        # Evaluate once and reuse the value for both the report and the mean.
        fold_accuracy = classify.accuracy(gender_classifier, test_set)
        print("Test-{0}: {1:.2%}".format(i, fold_accuracy))
        accuracy_sum += fold_accuracy

    average_accuracy = accuracy_sum / folds
    print("Average accuracy: " + "{0:.2%}\n".format(average_accuracy))
    return main_gender_classifier
def main(args):
    """Build a bag-of-words matrix from the gender comments and run the tests.

    Loads the male/female comment JSON files, lower-cases the comments, drops
    every comment containing a blacklisted word, optionally limits the number
    of comments per gender, builds a sparse word-count matrix with one row per
    comment, and passes it to ``run_tests``.

    Args:
        args: Parsed options. The attributes read here are
            ``male_female_ratio``, ``limit`` (``-1`` disables limiting),
            ``cache``, ``split``, ``algorithm`` and ``n_estimator``.
    """
    start_time = time.time()
    print("Running AdaBoost Classifier")

    print("Reading blacklist words file")
    # Presumably fills the module-level BLACKLIST_WORDS used below - confirm.
    load_blacklist_words("../data/blacklist.txt")

    print("Reading raw gender-comment data")
    with open("../data/male-comments.json", "r") as f:
        male_comment = json.load(f)
    with open("../data/female-comments.json", "r") as f:
        female_comment = json.load(f)

    # Lower case all comments
    male_comment = [[x[0], x[1].lower()] for x in male_comment]
    female_comment = [[x[0], x[1].lower()] for x in female_comment]

    # Drop every comment that contains at least one blacklisted word
    male_comment = [
        [x[0], x[1]] for x in male_comment
        if all(c not in BLACKLIST_WORDS for c in x[1].split(" "))
    ]
    female_comment = [
        [x[0], x[1]] for x in female_comment
        if all(c not in BLACKLIST_WORDS for c in x[1].split(" "))
    ]

    random.shuffle(male_comment)
    random.shuffle(female_comment)
    print("Loaded {} male and {} female comments".format(
        len(male_comment), len(female_comment)))

    female_ratio = 1.0 - args.male_female_ratio
    if args.limit != -1:
        male_limit = int(args.limit * args.male_female_ratio)
        female_limit = int(args.limit * female_ratio)
        print(
            "Limiting male and female comments to {} male and {} female ({} total)"
            .format(male_limit, female_limit, args.limit))
        # Deleting a slice that starts past the end of a list is a no-op and
        # never raises, so a shortage has to be detected explicitly. The run
        # continues with whatever is available, as it always did.
        if male_limit > len(male_comment) or female_limit > len(female_comment):
            print("Not enough male/female comments data")
        del male_comment[male_limit:]
        del female_comment[female_limit:]

    # Comments are already lower-cased above, so they are merged as-is.
    gender_comment = male_comment + female_comment
    random.shuffle(gender_comment)

    # NOTE(review): the vocabulary is split on single spaces while the counts
    # below use split() on any whitespace; tokens that differ between the two
    # simply get no column / a zero count. Kept as in the original.
    vocabulary = set()
    for _, comment in gender_comment:
        vocabulary.update(comment.split(" "))
    list_of_words = list(vocabulary)
    word_count = len(list_of_words)
    if args.cache:
        cache.cache_list_of_words(list_of_words)
    print("Total of {} words found\n".format(word_count))

    # Column index of every vocabulary word, so each comment only touches the
    # words it actually contains instead of scanning the whole vocabulary.
    word_index = {word: col for col, word in enumerate(list_of_words)}

    label = []
    rows = []
    cols = []
    values = []
    total = len(gender_comment)
    start_progress("Processing {} raw gender-comment data".format(total))
    for row, (gender, comment) in enumerate(gender_comment):
        # Label for female = 0, and male = 1
        label.append(0 if gender == "female" else 1)

        counts = {}
        for word in comment.split():
            counts[word] = counts.get(word, 0) + 1

        for word, count in counts.items():
            col = word_index.get(word)
            if col is not None:
                rows.append(row)
                cols.append(col)
                values.append(count)

        progress((row + 1) / total * 100)
    end_progress()

    if total:
        # Built in one go from (row, col, count) triplets rather than stacking
        # one row at a time, which re-copies the matrix for every comment.
        # dtype=int is meant to match the integer rows built previously.
        data = coo_matrix(
            (values, (rows, cols)), shape=(total, word_count), dtype=int)
    else:
        # No comments at all: keep the original empty placeholder matrix.
        data = coo_matrix((1, 1))

    if args.cache:
        cache.cache_data_and_label(data, label, word_count)

    run_tests(data, label, total, args.split, args.algorithm, args.n_estimator)
    print("Elapsed time: {0:.2f}s".format(time.time() - start_time))
def _read_comment_files(filenames, who):
    """Read every file in *filenames* via ``read_file`` and return the comments.

    *who* is the word shown in the progress caption ("male" or "female").
    """
    comments = []
    user_count = len(filenames)
    start_progress("Reading {} {} user(s) data".format(user_count, who))
    for index, filename in enumerate(filenames):
        progress((index + 1) / user_count * 100)
        read_file(filename, comments)
    end_progress()
    return comments


def main(args):
    """Load or train the Naive-Bayes gender classifier, then run ``nb_classify``.

    When ``args.model`` is non-empty the classifier is loaded from that file.
    Otherwise the raw per-user comment files are read, optionally limited,
    shuffled into the module-level ``data`` list, and ``naive_bayes`` is
    called to train on it.

    Args:
        args: Parsed options. The attributes read here are ``model``,
            ``male_female_ratio``, ``limit`` (``-1`` disables limiting) and
            ``cache_model``.
    """
    global data

    print("Running Naive-Bayes Classifier\n")
    print("Reading blacklist words file\n")
    load_blacklist_words("../data/blacklist.txt")

    if args.model != "":
        print("Loading model file: {}\n".format(args.model))
        # NOTE(review): presumably this unpickles the file; unpickling can
        # execute arbitrary code, so only load model files you trust.
        classifier = cache.load_pickle(args.model)
    else:
        filenames_male = glob.glob("../data/raw_comments/male/*.json")
        filenames_female = glob.glob("../data/raw_comments/female/*.json")
        shuffle(filenames_male)
        shuffle(filenames_female)

        male_user = len(filenames_male)
        male_data = _read_comment_files(filenames_male, "male")
        female_user = len(filenames_female)
        female_data = _read_comment_files(filenames_female, "female")

        female_ratio = 1.0 - args.male_female_ratio
        female_count = len(female_data)
        male_count = len(male_data)
        total_data = male_count + female_count
        print(
            "Loaded {} male(s) and {} female(s) comment data, total of {} comment(s)"
            .format(male_count, female_count, total_data))

        if args.limit != -1:
            female_count = int(args.limit * female_ratio)
            male_count = int(args.limit * args.male_female_ratio)
            # Only truncate when more comments were read than requested;
            # otherwise everything that was read is kept.
            if male_count < len(male_data):
                del male_data[male_count:]
            if female_count < len(female_data):
                del female_data[female_count:]
            print(
                "Limiting number of comments: {}, {} male(s) and {} female(s)".
                format(args.limit, len(male_data), len(female_data)))

        # `data` is the module-level list that naive_bayes() reads.
        data = male_data
        data.extend(female_data)
        shuffle(data)

        print("\nFinished reading data")
        # NOTE(review): this statement was garbled in the source and did not
        # parse; the user total is reconstructed as the number of male plus
        # female user files - confirm against the original.
        print("Total number of user: " + str(male_user + female_user))
        print("Total number of comments: " + str(len(data)) + "\n")

        classifier = naive_bayes(args.cache_model)

    nb_classify(classifier)