def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    """Evaluate LabelSpreading (knn kernel) on bag-of-words features built
    from a runtime vocabulary, averaged over 10 independent runs.

    Each run reloads and re-splits the dataset, fits the classifier on the
    training split, and scores macro-F1 and accuracy on the test split.

    Args:
        nbr: Size argument forwarded to ``Dataset.split_train_true`` —
            presumably the number of labeled documents; TODO confirm.
        str_list: Result-summary list, mutated in place (two strings appended).
        neighbors: ``n_neighbors`` for the knn kernel.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        # LabelSpreading requires a dense matrix for the knn kernel fit.
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_f1 /= runs
    avg_accuracy /= runs
    # Bug fix: the accuracy label previously read "vod" instead of "voc",
    # making it inconsistent with the matching f1 label.
    str_list.extend([
        "KNN BOW runtime voc Avg f1: " + str(avg_f1),
        "KNN BOW runtime voc Avg acc: " + str(avg_accuracy),
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def run_lp_bow_vocabulary(nbr, str_list, gamma):
    """Evaluate LabelPropagation (rbf kernel) on bag-of-words features
    restricted to the category vocabulary, averaged over 10 runs.

    Args:
        nbr: Size argument forwarded to ``Dataset.split_train_true``.
        str_list: Result-summary list, mutated in place.
        gamma: ``gamma`` for the rbf kernel.
    """
    total_f1 = 0.0
    total_acc = 0.0
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        # Dense matrices are required by the semi-supervised estimators.
        train_matrix = vectorizer.fit_transform(dataset.train['data']).todense()
        clf = LabelPropagation(kernel='rbf', gamma=gamma).fit(
            train_matrix, dataset.train['target'])
        test_matrix = vectorizer.transform(dataset.test['data']).todense()
        predictions = clf.predict(test_matrix)
        total_f1 += metrics.f1_score(dataset.test['target'], predictions,
                                     average='macro')
        total_acc += clf.score(test_matrix, dataset.test['target'])
    mean_acc = total_acc / 10
    mean_f1 = total_f1 / 10
    str_list.extend([
        "RBF BOW voc Avg f1: " + str(mean_f1),
        "RBF BOW voc Avg acc: " + str(mean_acc),
    ])
    print("Avg f1: " + str(mean_f1))
    print("Avg acc: " + str(mean_acc))
def run_lp_tfidf(nbr, str_list, neighbors):
    """Evaluate LabelSpreading (knn kernel) on plain TF-IDF features,
    averaged over 10 independent runs.

    Args:
        nbr: Size argument forwarded to ``Dataset.split_train_true``.
        str_list: Result-summary list, mutated in place.
        neighbors: ``n_neighbors`` for the knn kernel.
    """
    total_f1 = 0.0
    total_acc = 0.0
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = TfidfVectorizer()
        # The knn kernel needs dense input, hence the todense() calls.
        train_matrix = vectorizer.fit_transform(dataset.train['data']).todense()
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            train_matrix, dataset.train['target'])
        test_matrix = vectorizer.transform(dataset.test['data']).todense()
        predictions = clf.predict(test_matrix)
        total_f1 += metrics.f1_score(dataset.test['target'], predictions,
                                     average='macro')
        total_acc += clf.score(test_matrix, dataset.test['target'])
    mean_acc = total_acc / 10
    mean_f1 = total_f1 / 10
    str_list.extend([
        "KNN TF-IDF Avg f1: " + str(mean_f1),
        "KNN TF-IDF Avg acc: " + str(mean_acc),
    ])
    print("Avg f1: " + str(mean_f1))
    print("Avg acc: " + str(mean_acc))
'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] """
# NOTE(review): the line above is the tail of a triple-quoted string opened
# before this chunk — presumably a commented-out full 20-newsgroups category
# list; confirm against the complete file.
# Active experiment is limited to the four "rec.*" categories.
categories = [
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'
]
# initialize dataset
# Two independent Dataset instances are prepared: one (presumably for the
# rbf classifier) and one for knn. Both keep 10 labeled documents via
# split_train_true(10) — TODO confirm that argument's exact semantics.
dataset = Dataset(categories)
dataset.load_preprocessed(categories)
dataset.split_train_true(10)
print_v2_test_docs_vocabulary_labeled(categories)
dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
dataset_knn = Dataset(categories)
dataset_knn.load_preprocessed_vocabulary_in_use(categories)
dataset_knn.split_train_true(10)
print_v2_test_docs_vocabulary_labeled(categories)
dataset_knn.load_preprocessed_test_vocabulary_labeled_in_use(categories)
# feature extraction
# Both vectorizers share the vocabulary built from labeled documents only.
vectorizer_rbf = TfidfVectorizer(
    vocabulary=voc.get_vocabulary_only_labeled(categories))
vectorizer_knn = TfidfVectorizer(
    vocabulary=voc.get_vocabulary_only_labeled(categories))
vectors = vectorizer_rbf.fit_transform(dataset.train['data'])
'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] """
# NOTE(review): the line above closes a triple-quoted string that starts
# before this chunk — presumably a commented-out full 20-newsgroups category
# list; confirm against the complete file.
# Active experiment is limited to the four "rec.*" categories.
categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']
# initialize dataset
# Separate Dataset instances for the rbf and knn classifiers, each split
# with 100 labeled documents — TODO confirm split_train_true semantics.
dataset_rbf = Dataset(categories)
dataset_rbf.split_train_true(100)
dataset_knn = Dataset(categories)
dataset_knn.split_train_true(100)
# feature extraction
# Plain TF-IDF with the default (corpus-derived) vocabulary for both.
vectorizer_rbf = TfidfVectorizer()
vectorizer_knn = TfidfVectorizer()
vectors_rbf = vectorizer_rbf.fit_transform(dataset_rbf.train['data'])
vectors_knn = vectorizer_knn.fit_transform(dataset_knn.train['data'])
# classification
# use max_iter=10 when 20 categories
# Both semi-supervised estimators are fit on dense matrices.
clf_rbf = LabelPropagation(kernel='rbf', gamma=5).fit(
    vectors_rbf.todense(), dataset_rbf.train['target'])
clf_knn = LabelSpreading(kernel='knn', n_neighbors=10).fit(
    vectors_knn.todense(), dataset_knn.train['target'])
# Transform (not fit_transform) the test data with the already-fitted
# vectorizers; evaluation presumably follows past the end of this chunk.
test_vec_rbf = vectorizer_rbf.transform(dataset_rbf.test['data'])
test_vec_knn = vectorizer_knn.transform(dataset_knn.test['data'])