Exemplo n.º 1
0
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    """Average LabelSpreading (knn kernel) scores over 10 runs on BOW features.

    Each run reloads and re-splits the dataset, regenerates the runtime
    vocabulary from the labeled split, fits on the train split and scores on
    the test split. Macro-F1 and accuracy are averaged over the 10 runs,
    appended to ``str_list`` and printed.

    Args:
        nbr: labeled-split size passed to ``dataset.split_train_true``.
        str_list: list that the two formatted result strings are appended to.
        neighbors: ``n_neighbors`` for the knn kernel of ``LabelSpreading``.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        # Regenerate the vocabulary files from the current labeled split
        # before loading the matching test documents.
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        # LabelSpreading requires a dense matrix.
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_f1 /= runs
    avg_accuracy /= runs
    # Fixed label typo: "vod" -> "voc", matching the f1 entry above.
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Exemplo n.º 2
0
def run_lp_tfidf_runtime_vocabulary(nbr, str_list, gamma):
    """Average LabelPropagation (rbf kernel) scores over 10 runs on TF-IDF.

    Each run reloads and re-splits the dataset, regenerates the runtime
    vocabulary from the labeled split, fits on the train split and scores on
    the test split. Macro-F1 and accuracy are averaged over the 10 runs,
    appended to ``str_list`` and printed.

    Args:
        nbr: labeled-split size passed to ``dataset.split_train_true``.
        str_list: list that the two formatted result strings are appended to.
        gamma: ``gamma`` for the rbf kernel of ``LabelPropagation``.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        # Regenerate the vocabulary files from the current labeled split
        # before loading the matching test documents.
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = TfidfVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        # LabelPropagation requires a dense matrix.
        clf = LabelPropagation(kernel='rbf', gamma=gamma).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_f1 /= runs
    avg_accuracy /= runs
    # Added missing "voc" to the acc label for consistency with the f1 label
    # and with the other run_* functions in this file.
    str_list.extend([
        "RBF TF-IDF runtime voc Avg f1: " + str(avg_f1),
        "RBF TF-IDF runtime voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Exemplo n.º 3
0
def run_naive_bayes_bow_runtime_vocabulary(nbr, str_list):
    """Average MultinomialNB scores over 10 runs on BOW features.

    Each run reloads and re-splits the dataset (Bayes split), regenerates the
    runtime vocabulary from the labeled split, fits on the train split and
    scores on the test split. Macro-F1 and accuracy are averaged over the 10
    runs, appended to ``str_list`` and printed.

    Args:
        nbr: labeled-split size passed to ``dataset.split_train_bayers``.
        str_list: list that the two formatted result strings are appended to.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_bayers(nbr)
        # Regenerate the vocabulary files from the current labeled split
        # before loading the matching test documents.
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_f1 /= runs
    avg_accuracy /= runs
    str_list.extend([
        "NB BOW runtime voc Avg f1: " + str(avg_f1),
        "NB BOW runtime voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Exemplo n.º 4
0
              'sci.med',
              'sci.space',
              'soc.religion.christian',
              'talk.politics.guns',
              'talk.politics.mideast',
              'talk.politics.misc',
              'talk.religion.misc']
"""

# Category subset used by this experiment: the four rec.* newsgroups.
categories = [
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'
]

# initialize dataset
# Base dataset: load preprocessed documents, keep a labeled split of 10,
# regenerate the labeled vocabulary files, then load the matching test docs.
dataset = Dataset(categories)
dataset.load_preprocessed(categories)
dataset.split_train_true(10)
print_v2_test_docs_vocabulary_labeled(categories)
dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

# Second dataset for the knn experiment; loaded from the in-use vocabulary
# variant instead of the plain preprocessed corpus, otherwise prepared the
# same way as `dataset` above.
dataset_knn = Dataset(categories)
dataset_knn.load_preprocessed_vocabulary_in_use(categories)
dataset_knn.split_train_true(10)
print_v2_test_docs_vocabulary_labeled(categories)
dataset_knn.load_preprocessed_test_vocabulary_labeled_in_use(categories)

# feature extraction
# Two separate TF-IDF vectorizers (one per classifier) restricted to the
# vocabulary built from labeled documents only.
# NOTE(review): `voc.get_vocabulary_only_labeled` is defined elsewhere;
# presumably it returns a term list/dict for the given categories — confirm.
vectorizer_rbf = TfidfVectorizer(
    vocabulary=voc.get_vocabulary_only_labeled(categories))
vectorizer_knn = TfidfVectorizer(
    vocabulary=voc.get_vocabulary_only_labeled(categories))