示例#1
0
def naive_bayes(cache_model, folds=8):
    """Train a Naive-Bayes gender classifier and report cross-validation accuracy.

    Reads ``data`` and appends to ``list_of_gender``, ``list_of_words`` and
    ``train_data_gender``; all four are names defined outside this function.
    ``word_tokenize``, ``NaiveBayesClassifier`` and ``classify`` are
    presumably the NLTK objects of the same name -- confirm against the
    file's imports.

    Args:
        cache_model: when truthy, the classifier trained on the full data set
            is handed to ``cache.cache_model`` under
            ``model/gender_classifier_<total>.p``.
        folds: number of contiguous cross-validation folds (default 8, the
            value that used to be hard-coded).

    Returns:
        The classifier trained on all of ``train_data_gender``.

    Raises:
        ValueError: if ``folds`` is smaller than 1.
    """
    if folds < 1:
        raise ValueError("folds must be at least 1")

    print("Running Naive-Bayes Classifier Training")

    total = len(data)
    # list_of_words is a shared list; mirror it in a set so the per-token
    # "already known?" check is O(1) instead of a scan of the whole list.
    known_words = set(list_of_words)

    start_progress("Pre-processing {} of data".format(total))
    for idx, (gender, comment) in enumerate(data, start=1):
        word_exist = {}
        word_not_exist = {}

        if gender not in list_of_gender:
            list_of_gender.append(gender)

        for word in word_tokenize(comment):
            word_exist[word] = True
            word_not_exist[word] = False

            if word not in known_words:
                known_words.add(word)
                list_of_words.append(word)

        # One sample per gender seen so far: the comment's own gender gets
        # the "word present" features, every other gender the negated ones.
        for gen in list_of_gender:
            if gen == gender:
                train_data_gender.append((word_exist, gen))
            else:
                train_data_gender.append((word_not_exist, gen))
        progress(idx / total * 100)
    end_progress()
    print("\nFinished pre-processing ({} data)".format(total))

    print("Training {} gender data".format(total))
    main_gender_classifier = NaiveBayesClassifier.train(train_data_gender)

    if cache_model:
        cache.cache_model(main_gender_classifier,
                          "model/gender_classifier_{}.p".format(total))

    print("Cross validation")
    average_accuracy = 0
    size = len(train_data_gender)

    for i in range(1, folds + 1):
        # Fold i is the contiguous slice [lower, upper) of the samples.
        lower = round((i - 1) * size / folds)
        upper = round(i * size / folds)
        test_set = train_data_gender[lower:upper]
        training_set = train_data_gender[:lower] + train_data_gender[upper:]

        gender_classifier = NaiveBayesClassifier.train(training_set)

        # Score the fold once and reuse the value for both the report and
        # the running average.
        fold_accuracy = classify.accuracy(gender_classifier, test_set)
        print("Test-{0}: {1:.2%}".format(i, fold_accuracy))
        average_accuracy += fold_accuracy
    average_accuracy /= folds

    print("Average accuracy: " + "{0:.2%}\n".format(average_accuracy))

    return main_gender_classifier
示例#2
0
def main(args):
    """Build a bag-of-words matrix from gender-labelled comments and run the tests.

    Loads the male and female comment JSON files, lower-cases the comments,
    drops every comment containing a blacklisted word, optionally limits the
    number of comments per gender, converts the remainder into a sparse
    word-count matrix plus a label list, and passes both to ``run_tests``.

    Args:
        args: options object; the attributes read here are
            ``male_female_ratio``, ``limit`` (``-1`` disables limiting),
            ``cache``, ``split``, ``algorithm`` and ``n_estimator``.
    """
    start_time = time.time()
    print("Running AdaBoost Classifier")

    print("Reading blacklist words file")
    load_blacklist_words("../data/blacklist.txt")

    print("Reading raw gender-comment data")
    with open("../data/male-comments.json", "r", encoding="utf-8") as f:
        male_comment = json.load(f)
    with open("../data/female-comments.json", "r", encoding="utf-8") as f:
        female_comment = json.load(f)

    def clean(comments):
        """Lower-case each comment and drop those with a blacklisted word."""
        lowered = ([x[0], x[1].lower()] for x in comments)
        return [x for x in lowered
                if all(w not in BLACKLIST_WORDS for w in x[1].split(" "))]

    male_comment = clean(male_comment)
    female_comment = clean(female_comment)

    random.shuffle(male_comment)
    random.shuffle(female_comment)
    print("Loaded {} male and {} female comments".format(
        len(male_comment), len(female_comment)))

    female_ratio = 1.0 - args.male_female_ratio
    if args.limit != -1:
        male_limit = int(args.limit * args.male_female_ratio)
        female_limit = int(args.limit * female_ratio)
        print(
            "Limiting male and female comments to {} male and {} female ({} total)"
            .format(male_limit, female_limit, args.limit))
        # Deleting a slice past the end of a list never raises, so a shortage
        # of data has to be detected explicitly. Processing continues with
        # whatever is available.
        if len(male_comment) < male_limit or len(female_comment) < female_limit:
            print("Warning: not enough male/female comments data, "
                  "using {} male and {} female".format(
                      min(len(male_comment), male_limit),
                      min(len(female_comment), female_limit)))
        del male_comment[male_limit:]
        del female_comment[female_limit:]

    # Comments were already lower-cased above; just merge and shuffle.
    gender_comment = male_comment + female_comment
    random.shuffle(gender_comment)

    # NOTE(review): the vocabulary is split on single spaces while the
    # counting below splits on any whitespace, so a vocabulary entry that
    # contains a tab/newline can never be counted -- confirm this is intended.
    list_of_words = set()
    for entry in gender_comment:
        list_of_words.update(entry[1].split(" "))
    list_of_words = list(list_of_words)
    word_count = len(list_of_words)

    if args.cache:
        cache.cache_list_of_words(list_of_words)

    print("Total of {} words found\n".format(word_count))

    # Column index of every vocabulary word, for O(1) lookup per token.
    word_index = {word: col for col, word in enumerate(list_of_words)}

    label = []
    rows, cols, counts = [], [], []
    total = len(gender_comment)
    start_progress("Processing {} raw gender-comment data".format(total))
    for i, (gender, comment) in enumerate(gender_comment):
        # Label for female = 0, and male = 1
        label.append(0 if gender == "female" else 1)

        # Count only tokens that are part of the vocabulary.
        wc = {}
        for word in comment.split():
            col = word_index.get(word)
            if col is not None:
                wc[col] = wc.get(col, 0) + 1

        for col, count in wc.items():
            rows.append(i)
            cols.append(col)
            counts.append(count)

        progress((i + 1) / total * 100)
    end_progress()

    if total:
        # Build the whole matrix in one go instead of stacking a dense row
        # per comment, which was quadratic in the number of comments.
        data = coo_matrix((counts, (rows, cols)),
                          shape=(total, word_count), dtype=int)
    else:
        # Same placeholder the original produced when there were no comments.
        data = coo_matrix((1, 1))

    if args.cache:
        cache.cache_data_and_label(data, label, word_count)

    run_tests(data, label, total, args.split, args.algorithm, args.n_estimator)

    print("Elapsed time: {0:.2f}s".format(time.time() - start_time))
示例#3
0
def main(args):
    """Load or train the Naive-Bayes gender classifier, then run ``nb_classify``.

    When ``args.model`` is a non-empty path the classifier is loaded from
    that pickle. Otherwise the per-user JSON files under
    ``../data/raw_comments/{male,female}/`` are read, optionally limited,
    merged into the module-level ``data`` list, shuffled, and used to train a
    new classifier via ``naive_bayes``.

    Args:
        args: options object; the attributes read here are ``model``,
            ``male_female_ratio``, ``limit`` (``-1`` disables limiting) and
            ``cache_model``.
    """
    global data

    print("Running Naive-Bayes Classifier\n")

    print("Reading blacklist words file\n")
    load_blacklist_words("../data/blacklist.txt")

    def read_all(filenames, gender):
        """Read every file into one list; return (records, file count)."""
        records = []
        user_count = len(filenames)
        start_progress("Reading {} {} user(s) data".format(user_count, gender))
        for index, filename in enumerate(filenames):
            progress((index + 1) / user_count * 100)
            read_file(filename, records)
        end_progress()
        return records, user_count

    if args.model != "":
        print("Loading model file: {}\n".format(args.model))
        # NOTE(review): unpickling executes arbitrary code; only load model
        # files from a trusted source.
        classifier = cache.load_pickle(args.model)
    else:
        filenames_male = glob.glob("../data/raw_comments/male/*.json")
        filenames_female = glob.glob("../data/raw_comments/female/*.json")
        shuffle(filenames_male)
        shuffle(filenames_female)

        male_data, male_user = read_all(filenames_male, "male")
        female_data, female_user = read_all(filenames_female, "female")

        female_ratio = 1.0 - args.male_female_ratio
        female_count = len(female_data)
        male_count = len(male_data)
        total_data = male_count + female_count
        print(
            "Loaded {} male(s) and {} female(s) comment data, total of {} comment(s)"
            .format(male_count, female_count, total_data))
        if args.limit != -1:
            female_count = int(args.limit * female_ratio)
            male_count = int(args.limit * args.male_female_ratio)
            # Truncate only when there is more data than requested; a
            # shortage simply keeps everything that was read.
            if male_count < len(male_data):
                del male_data[male_count:]
            if female_count < len(female_data):
                del female_data[female_count:]
            print(
                "Limiting number of comments: {}, {} male(s) and {} female(s)".
                format(args.limit, len(male_data), len(female_data)))

        data = male_data + female_data
        shuffle(data)

        print("\nFinished reading data")
        # One JSON file was read per user, so the user total is the number
        # of files read for both genders.
        print("Total number of user: " + str(male_user + female_user))
        print("Total number of comments: " + str(len(data)) + "\n")

        classifier = naive_bayes(args.cache_model)

    nb_classify(classifier)