def run_tc3_0():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.arxiv_metadata
    feature_extraction_method = FeatureExtractionMethod.BOW
    classifiers = [ClassificationMethod.Gradient_Boosting_Machines]
    experiment_controller = ExperimentController('tc#3.0', '3')
    experiment_controller.set_variables(dataset, feature_extraction_method,
                                        classifiers,
                                        should_load_embedding_model=False)
    experiment_controller.run_experiment()


def run_tc0_0():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.ds20newsgroups
    feature_extraction_method = FeatureExtractionMethod.BOW
    classifiers = [
        ClassificationMethod.Naive_Bayes_Classifier,
        # ClassificationMethod.Logistic_Regression,
        ClassificationMethod.Support_Vector_Machines,
        ClassificationMethod.SVM_with_SGD
    ]
    experiment_controller = ExperimentController('tc#0.0', '1')
    experiment_controller.set_variables(dataset, feature_extraction_method,
                                        classifiers)
    experiment_controller.run_experiment()


def run_tc3_4():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.arxiv_metadata
    feature_extraction_method = FeatureExtractionMethod.FASTTEXT
    classifiers = [
        ClassificationMethod.Logistic_Regression,
        ClassificationMethod.Support_Vector_Machines,
        ClassificationMethod.SVM_with_SGD
    ]
    experiment_controller = ExperimentController('tc#3.4', '2')
    experiment_controller.set_variables(dataset, feature_extraction_method,
                                        classifiers,
                                        should_load_embedding_model=False)
    experiment_controller.run_experiment()


def run_tc0_2():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.ds20newsgroups
    feature_extraction_method = FeatureExtractionMethod.WORD2VEC
    classifiers = [
        ClassificationMethod.Naive_Bayes_Classifier,
        ClassificationMethod.Logistic_Regression,
        ClassificationMethod.Support_Vector_Machines,
        ClassificationMethod.SVM_with_SGD
    ]
    experiment_controller = ExperimentController('tc#0.2', '1')
    experiment_controller.set_variables(dataset, feature_extraction_method,
                                        classifiers,
                                        should_load_embedding_model=True)
    experiment_controller.run_experiment()


def run_tc3_1():
    global dataset, feature_extraction_method, classifiers, experiment_controller
    dataset = Dataset.arxiv_metadata
    feature_extraction_method = FeatureExtractionMethod.TF_IDF
    classifiers = [
        ClassificationMethod.Naive_Bayes_Classifier,
        ClassificationMethod.Logistic_Regression,
        ClassificationMethod.Support_Vector_Machines,
        ClassificationMethod.SVM_with_SGD,
        ClassificationMethod.Gradient_Boosting_Machines
    ]
    experiment_controller = ExperimentController('tc#3.1', '2')
    experiment_controller.set_variables(dataset, feature_extraction_method,
                                        classifiers,
                                        should_load_embedding_model=False)
    experiment_controller.run_experiment()
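

# A minimal sketch of an entry point for the runners above. This excerpt has
# no main guard, so the guard and the choice of which test cases to invoke
# are illustrative assumptions, not part of the original script.
if __name__ == '__main__':
    run_tc0_0()
    run_tc0_2()
    run_tc3_0()
    run_tc3_1()
    run_tc3_4()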


import json

import pandas as pd

from topic_classification.ExperimentController import ExperimentController
from topic_classification.constants import Dataset
from topic_classification.dataset_utils import \
    fetch_and_preprocess_arxiv_metadata_dataset

# Ran this on Google Colab because it was faster for some reason (even though
# my CPU is nominally faster), and some errors that occurred locally did not
# show up there.
experiment_controller = ExperimentController()
dataset = Dataset.arxiv_metadata
data_df = fetch_and_preprocess_arxiv_metadata_dataset()

# documents = []
# for line in open(experiment_controller.TOPIC_CLASSIFICATION_DATA_PATH +
#                  dataset.name + '.json', 'r'):
#     documents.append(json.loads(line))
#
#
# def strip_category_from_subcategories(category_string: str):
#     return category_string.split(' ')[0].split('.')[0]
#
#
# data = {'Article': [document['abstract'] for document in documents],
#         'Target Name': [strip_category_from_subcategories(document['categories'])
#                         for document in documents]}
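# If the commented-out manual path above were used instead of
# fetch_and_preprocess_arxiv_metadata_dataset(), the `data` dict would be
# wrapped into the same kind of frame; a hypothetical completion:
# data_df = pd.DataFrame(data)
#
# Note that strip_category_from_subcategories keeps only the top-level
# category of the first listed entry, e.g. 'cs.AI math.CO' -> 'cs'.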


import warnings

# %matplotlib inline
from topic_classification.ExperimentController import ExperimentController
from topic_classification.constants import *

warnings.filterwarnings('ignore')

# Script for different kinds of experiments
dataset = Dataset.bbc_news_summary
feature_extraction_method = FeatureExtractionMethod.FASTTEXT
classifiers = [
    ClassificationMethod.Logistic_Regression,
    ClassificationMethod.Support_Vector_Machines,
    ClassificationMethod.SVM_with_SGD
]

experiment_controller = ExperimentController('tc#2.4', '1')
experiment_controller.set_variables(dataset, feature_extraction_method,
                                    classifiers,
                                    should_load_embedding_model=True)
experiment_controller.run_experiment()


import os

import gensim
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from topic_classification.ExperimentController import ExperimentController
from topic_classification.constants import Dataset, FeatureExtractionMethod
from topic_classification.dataset_utils import \
    load_preprocessed_bbc_news_summary, \
    load_preprocessed_arxiv_metadata_dataset

os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'

# dataset = Dataset.bbc_news_summary
# feature_extraction_method = FeatureExtractionMethod.FASTTEXT
experiment_controller = ExperimentController('tm#2', '1')

TOTAL_TOPICS = 6
NUM_OF_TOP_TERMS = 20
topics = ['business', 'entertainment', 'politics', 'sport', 'tech']
topics_in_order = ['sport', 'tech', 'politics', 'business', 'entertainment']

lsi_model = None
lda_model = None
nmf_model = None
document_topics = None

# Load preprocessed dataset
data_df = load_preprocessed_arxiv_metadata_dataset(
    experiment_controller.TOPIC_CLASSIFICATION_DATA_PATH)
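
# A minimal sketch of how the lsi_model / lda_model / nmf_model placeholders
# above could be fitted with gensim (already imported), assuming data_df has
# an 'Article' column as in the loading script and that plain whitespace
# tokenization is acceptable; the project's actual fitting code may differ.
tokenized_docs = [doc.lower().split() for doc in data_df['Article']]
dictionary = gensim.corpora.Dictionary(tokenized_docs)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

lsi_model = gensim.models.LsiModel(bow_corpus, id2word=dictionary,
                                   num_topics=TOTAL_TOPICS)
lda_model = gensim.models.LdaModel(bow_corpus, id2word=dictionary,
                                   num_topics=TOTAL_TOPICS, passes=1)
nmf_model = gensim.models.Nmf(bow_corpus, id2word=dictionary,
                              num_topics=TOTAL_TOPICS)

# Per-document topic distributions and the top NUM_OF_TOP_TERMS words per topic
document_topics = [lda_model.get_document_topics(bow) for bow in bow_corpus]
for topic_id, terms in lda_model.show_topics(num_topics=TOTAL_TOPICS,
                                             num_words=NUM_OF_TOP_TERMS):
    print(topic_id, terms)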