Пример #1
0
            print("Load training first")
        else:
            path = command.split(" ")
            if (len(path) < 2):
                print("Please enter filepath")
            else:
                path = path[1]
                path = Path('.').joinpath(path)
                text = ""
                try:
                    text = path.open('r', encoding='utf-8').read()
                except OSError as e:
                    print("File doesn't exist/Invalid path")
                print(text)
                text = pp.clean_text(text)
                pos, neg = classifier.test(text)
                print("CLASS: ", end='')
                if pos == classifier.pos_prior or neg == classifier.neg_prior:
                    print("SOMETHING WENT WRONG")
                elif pos >= neg:
                    print("POSITIVE")
                else:
                    print("NEGATIVE")

    #Display stats clause
    elif command.startswith('d'):
        scores = pp.load_stats()
        pp.print_stats(scores)
    #Refresh menu clause
    elif command.startswith('m'):
        print_menu()
Пример #2
0
import pandas
from matplotlib import pyplot

from bayes import Bayes
from utils import generate_datasets

# Load the leaf data set and keep the "species" column as the class labels.
data = pandas.read_csv('./datasets/leaf.csv')
labels = data["species"]
# The last column is removed from the feature table in place.
last_column = data.columns[-1]
data.drop(last_column, axis=1, inplace=True)

print(data.index)

# Training-set shares (in percent) that are evaluated for every data set.
training_percents = range(60, 91, 5)

for dataset in generate_datasets(data, labels):
    print('\n' + dataset.name)
    for training_percent in training_percents:
        # A fresh classifier is built, trained and tested for each share.
        classifier = Bayes(dataset.data, labels, training_percent)
        classifier.train()
        classifier.test()
        dataset.result.append(classifier.get_accuracy())
        report = ''.join([
            'Training percent: ', str(training_percent),
            '%, accuracy: ', str(classifier.get_accuracy()),
        ])
        print(report)
    # One accuracy curve per data set, drawn on the shared figure.
    pyplot.plot(training_percents, dataset.result, label=dataset.name)

# Label the shared figure and write it to disk.
pyplot.xlabel('Training percent')
pyplot.ylabel('Accuracy')
pyplot.legend()
pyplot.savefig('plot', dpi=200, bbox_inches='tight')
Пример #3
0
# Split the standardized features and the labels (last column of `data`) into
# training and test parts; test_size=0.2 requests a 20% test share.
# `DataFrame.as_matrix()` was removed in pandas 1.0, so the frame is converted
# with `.values`, which produces the same ndarray on old and new pandas.
# The split is seeded from the current time, so it changes from run to run.
train_data, test_data, train_labels, test_labels = train_test_split(
    data_standard,
    data.values[:, -1],
    test_size=0.2,
    random_state=int(time.time()))
# Bayes classifier recognition: train on the training split, classify every
# test sample, then report the error rate and the elapsed time.
print("---------------------------贝叶斯----------------------------------")
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# documented replacement for measuring elapsed time.
start = time.perf_counter()
by = Bayes()
by.train(list(train_data), list(train_labels))
test_data_size = test_data.shape[0]
error_count = 0
# Iterate the test samples directly; copying them into a list first is not
# needed just to enumerate them.
for index, td in enumerate(test_data):
    this_label = by.test(td)
    print("预测类别:{0},真实类别:{1}".format(this_label, test_labels[index]))
    if this_label != test_labels[index]:
        error_count += 1
end = time.perf_counter()
# Share of misclassified test samples, expressed in percent.
error_rate = (error_count / test_data_size) * 100
time_consuming = end - start
print("错误率为:{0:.2f}%".format(error_rate))
print("耗时:{0:.4f}s".format(time_consuming))
'''
k-近邻算法识别
'''
# The string above reads "k-nearest-neighbour algorithm recognition"; it only
# serves as a section heading for the kNN evaluation that starts here.
print("---------------------------knn----------------------------------")
# NOTE(review): time.clock() was removed in Python 3.8. When this section is
# updated, switch both the start and the matching end measurement to
# time.perf_counter() together so the difference stays meaningful.
start = time.clock()
knn = Knn()
# Number of test samples (first dimension of the test array).
test_data_size = test_data.shape[0]
Пример #4
0
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    if not denominator:
        return 0.0
    return numerator / denominator


def _precision_recall_f1(n_correct, n_wrong, n_actual):
    """Return (precision, recall, F1) for one sentiment class.

    Args:
        n_correct: Reviews of this class that were predicted as this class.
        n_wrong: Reviews of the other class that were predicted as this class.
        n_actual: Number of reviews that really belong to this class.

    Every ratio falls back to 0.0 instead of raising ZeroDivisionError when
    its denominator is zero (empty test set, or class never predicted).
    """
    precision = _safe_ratio(n_correct, n_correct + n_wrong)
    recall = _safe_ratio(n_correct, n_actual)
    f1 = 2 * _safe_ratio(precision * recall, precision + recall)
    return precision, recall, f1


def _count_predictions(classifier, reviews):
    """Classify each review and return (n_positive, n_negative) predictions.

    classifier.test(review) is unpacked as a (pos, neg) score pair; a review
    counts as positive when pos >= neg, so ties go to the positive class.
    """
    n_pos = 0
    for review in reviews:
        pos, neg = classifier.test(review)
        if pos >= neg:
            n_pos += 1
    return n_pos, len(reviews) - n_pos


def main():
    """Train a Bayes sentiment classifier and evaluate it on the test sets.

    Loads the positive/negative review files below ../Data/train and
    ../Data/test, cleans them, builds word-frequency counters, trains the
    classifier, prints the prediction counts for both test sets, then saves
    and prints precision, recall and F1 for each class.

    Returns:
        The trained classifier.
    """
    # Making list of .txt-files (per sentiment)
    print("\tLOADING FILES")

    data_dir = Path('..').joinpath('Data')
    test_dir = data_dir.joinpath('test')
    train_dir = data_dir.joinpath('train')

    tp_reviews = txtToList(test_dir.joinpath('pos'))
    tn_reviews = txtToList(test_dir.joinpath("neg"))
    pos_reviews = txtToList(train_dir.joinpath("pos"))
    neg_reviews = txtToList(train_dir.joinpath("neg"))
    print("\tFILES LOADED")

    # Cleaning reviews (each list is updated in place)
    print("\tCLEANING REVIEWS")
    for review_list in (pos_reviews, neg_reviews, tp_reviews, tn_reviews):
        review_list[:] = [clean_text(review) for review in review_list]

    # Joining the reviews into one word list (per sentiment). The reviews are
    # joined with a space: joining with "" would glue the last word of one
    # review to the first word of the next and distort the word counts.
    pos_words = " ".join(pos_reviews).split()
    neg_words = " ".join(neg_reviews).split()

    # Counting the frequency of words (per sentiment and total). The total
    # vocabulary is built before pruning so it still contains every word.
    posCounter = Counter(pos_words)
    negCounter = Counter(neg_words)
    vocabCounter = posCounter + negCounter

    # Drop words that occur only once within a sentiment.
    for counter in (posCounter, negCounter):
        singletons = [term for term, count in counter.items() if count == 1]
        for term in singletons:
            del counter[term]

    classifier = Bayes(vocab_counts=vocabCounter)
    classifier.train(posCounter, negCounter)

    # Evaluate the positive test set first, then the negative one.
    labelled_test_sets = (
        ("Positive Testset: ", tp_reviews),
        ("Negative Testset: ", tn_reviews),
    )
    prediction_counts = []
    for title, test_set in labelled_test_sets:
        print("_" * 15 + "RESULTS" + "_" * 15)
        n_pos, n_neg = _count_predictions(classifier, test_set)
        print(title)
        print("Positive reviews: {}".format(n_pos))
        print("Negative reviews: {}".format(n_neg))
        prediction_counts.append((n_pos, n_neg))

    (n_pos_tp, n_neg_tp), (n_pos_tn, n_neg_tn) = prediction_counts

    # n_pos_tn are negative reviews wrongly predicted positive, and n_neg_tp
    # are positive reviews wrongly predicted negative.
    pos_scores = _precision_recall_f1(n_pos_tp, n_pos_tn, len(tp_reviews))
    neg_scores = _precision_recall_f1(n_neg_tn, n_neg_tp, len(tn_reviews))

    # Order: pos precision, pos recall, pos F1, neg precision, neg recall,
    # neg F1.
    scores = list(pos_scores) + list(neg_scores)

    save_stats(scores)
    print_stats(scores)

    return classifier