Example #1
def run_tc3_0():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.arxiv_metadata
    feature_extraction_method = FeatureExtractionMethod.BOW
    classifiers = [ClassificationMethod.Gradient_Boosting_Machines]

    experiment_controller = ExperimentController('tc#3.0', '3')
    experiment_controller.set_variables(dataset,
                                        feature_extraction_method,
                                        classifiers,
                                        should_load_embedding_model=False)
    experiment_controller.run_experiment()
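
These runner functions are presumably invoked one at a time from an entry point; a minimal sketch, assuming the functions live in this module (the __main__ guard is an assumption, not part of the original script):

# Hypothetical entry point: pick one test case and run it.
if __name__ == '__main__':
    run_tc3_0()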
Example #2
def run_tc0_0():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.ds20newsgroups
    feature_extraction_method = FeatureExtractionMethod.BOW
    classifiers = [
        ClassificationMethod.Naive_Bayes_Classifier,
        # ClassificationMethod.Logistic_Regression,
        ClassificationMethod.Support_Vector_Machines,
        ClassificationMethod.SVM_with_SGD
    ]

    experiment_controller = ExperimentController('tc#0.0', '1')
    experiment_controller.set_variables(dataset, feature_extraction_method,
                                        classifiers)
    experiment_controller.run_experiment()
Example #3
def run_tc3_4():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.arxiv_metadata
    feature_extraction_method = FeatureExtractionMethod.FASTTEXT
    classifiers = [
        ClassificationMethod.Logistic_Regression,
        ClassificationMethod.Support_Vector_Machines,
        ClassificationMethod.SVM_with_SGD
    ]

    experiment_controller = ExperimentController('tc#3.4', '2')
    experiment_controller.set_variables(dataset,
                                        feature_extraction_method,
                                        classifiers,
                                        should_load_embedding_model=False)
    experiment_controller.run_experiment()
Example #4
def run_tc0_2():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.ds20newsgroups
    feature_extraction_method = FeatureExtractionMethod.WORD2VEC
    classifiers = [
        ClassificationMethod.Naive_Bayes_Classifier,
        ClassificationMethod.Logistic_Regression,
        ClassificationMethod.Support_Vector_Machines,
        ClassificationMethod.SVM_with_SGD
    ]

    experiment_controller = ExperimentController('tc#0.2', '1')
    experiment_controller.set_variables(dataset,
                                        feature_extraction_method,
                                        classifiers,
                                        should_load_embedding_model=True)
    experiment_controller.run_experiment()
Example #5
def run_tc3_1():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.arxiv_metadata
    feature_extraction_method = FeatureExtractionMethod.TF_IDF
    classifiers = [
        ClassificationMethod.Naive_Bayes_Classifier,
        ClassificationMethod.Logistic_Regression,
        ClassificationMethod.Support_Vector_Machines,
        ClassificationMethod.SVM_with_SGD,
        ClassificationMethod.Gradient_Boosting_Machines
    ]

    experiment_controller = ExperimentController('tc#3.1', '2')
    experiment_controller.set_variables(dataset,
                                        feature_extraction_method,
                                        classifiers,
                                        should_load_embedding_model=False)
    experiment_controller.run_experiment()
Example #6
import json
import pandas as pd

from topic_classification.ExperimentController import ExperimentController
from topic_classification.constants import Dataset
from topic_classification.dataset_utils import \
    fetch_and_preprocess_arxiv_metadata_dataset


# I ran this on Google Colab because it was faster for some reason (even though
# my CPU is nominally faster), and some errors that occurred locally did not
# occur there.

experiment_controller = ExperimentController()

dataset = Dataset.arxiv_metadata

data_df = fetch_and_preprocess_arxiv_metadata_dataset()

# documents = []
# for line in open(experiment_controller.TOPIC_CLASSIFICATION_DATA_PATH +
#                  dataset.name + '.json', 'r'):
#     documents.append(json.loads(line))
#
#
# def strip_category_from_subcategories(category_string: str):
#     return category_string.split(' ')[0].split('.')[0]
#
#
# data = {'Article': [document['abstract'] for document in documents],
#         'Target Name': [strip_category_from_subcategories(document['categories'])
#                         for document in documents]}
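
The commented-out loader above shows how the raw arXiv metadata was originally parsed before it was folded into fetch_and_preprocess_arxiv_metadata_dataset; a minimal standalone sketch reconstructed only from that block (the load_arxiv_metadata name is an assumption, the 'Article'/'Target Name' layout comes from the commented code):

import json

import pandas as pd


def strip_category_from_subcategories(category_string: str) -> str:
    # 'cs.LG stat.ML' -> 'cs': keep the first category, drop its subcategory suffix.
    return category_string.split(' ')[0].split('.')[0]


def load_arxiv_metadata(path: str) -> pd.DataFrame:
    # The metadata file stores one JSON document per line.
    documents = [json.loads(line) for line in open(path, 'r')]
    return pd.DataFrame({
        'Article': [doc['abstract'] for doc in documents],
        'Target Name': [strip_category_from_subcategories(doc['categories'])
                        for doc in documents]
    })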
Example #7
import warnings

# %matplotlib inline
from topic_classification.ExperimentController import ExperimentController
from topic_classification.constants import *

warnings.filterwarnings('ignore')
# Script for different kinds of experiments

dataset = Dataset.bbc_news_summary
feature_extraction_method = FeatureExtractionMethod.FASTTEXT
classifiers = [
    ClassificationMethod.Logistic_Regression,
    ClassificationMethod.Support_Vector_Machines,
    ClassificationMethod.SVM_with_SGD
]

experiment_controller = ExperimentController('tc#2.4', '1')
experiment_controller.run_experiment(dataset,
                                     feature_extraction_method,
                                     classifiers,
                                     should_load_embedding_model=True)
Example #8
import os

import gensim
import nltk
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from topic_classification.ExperimentController import ExperimentController
from topic_classification.constants import Dataset, FeatureExtractionMethod
from topic_classification.dataset_utils import load_preprocessed_bbc_news_summary, \
    load_preprocessed_arxiv_metadata_dataset

os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'

# dataset = Dataset.bbc_news_summary
# feature_extraction_method = FeatureExtractionMethod.FASTTEXT
experiment_controller = ExperimentController('tm#2', '1')

TOTAL_TOPICS = 6
NUM_OF_TOP_TERMS = 20

topics = ['business', 'entertainment', 'politics', 'sport', 'tech']
topics_in_order = ['sport', 'tech', 'politics', 'business', 'entertainment']

lsi_model = None
lda_model = None
nmf_model = None
document_topics = None

# Load the preprocessed dataset
data_df = load_preprocessed_arxiv_metadata_dataset(
    experiment_controller.TOPIC_CLASSIFICATION_DATA_PATH)
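
The lsi_model, lda_model, and nmf_model placeholders above are presumably filled later in the script; a minimal sketch of one way to fit them with gensim, assuming data_df uses the same 'Article' column as the earlier loader (whitespace tokenization is an assumption; the original preprocessing may differ):

# Tokenize and build a bag-of-words corpus.
tokenized_docs = [str(doc).split() for doc in data_df['Article']]
dictionary = gensim.corpora.Dictionary(tokenized_docs)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

# Fit the three topic models with the same number of topics.
lsi_model = gensim.models.LsiModel(bow_corpus, id2word=dictionary,
                                   num_topics=TOTAL_TOPICS)
lda_model = gensim.models.LdaModel(bow_corpus, id2word=dictionary,
                                   num_topics=TOTAL_TOPICS)
nmf_model = gensim.models.nmf.Nmf(bow_corpus, id2word=dictionary,
                                  num_topics=TOTAL_TOPICS)

# Inspect the top terms per LDA topic.
for topic_id in range(TOTAL_TOPICS):
    print(topic_id, lda_model.show_topic(topic_id, topn=NUM_OF_TOP_TERMS))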