def test_data_split_on_offenseval():
    """Verify instance counts for Offenseval, with and without a dev split."""
    task = of.Offenseval()
    task.load(offenseval_data_dir)

    # No dev split: full train/test partition with known dataset sizes.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=False)
    assert len(train_X) == 13240
    assert len(test_X) == 320
    assert isinstance(train_X[0], str)

    # With a dev split: default 90/10 partition of the training data.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True)
    assert len(train_X) == 13240 * 0.9
    assert len(test_X) == 13240 * 0.1
def run(task_name, data_dir, pipeline_name, print_predictions):
    """Run a classification experiment end to end.

    Loads the task data, trains (or, for CNN pipelines, encodes and builds)
    the requested pipeline, predicts on the test set, and logs evaluation
    results.

    :param task_name: name of the task, passed to ``task()``
    :param data_dir: directory containing the task's data files
    :param pipeline_name: pipeline identifier; names starting with 'cnn'
        select the CNN pipeline, anything else a standard pipeline
    :param print_predictions: if True, log every prediction after evaluating
    """
    logger.info('>> Running {} experiment'.format(task_name))
    tsk = task(task_name)
    logger.info('>> Loading data...')
    tsk.load(data_dir)
    # FIX: message previously said 'train/data'; these are train/test
    # instances (consistent with the sibling run() implementation).
    logger.info('>> retrieving train/test instances...')
    train_X, train_y, test_X, test_y = utils.get_instances(
        tsk, split_train_dev=False)
    # Keep a reference to the raw test texts: the CNN branch re-encodes
    # test_X, but predictions must be printed against the original text.
    test_X_ref = test_X
    if pipeline_name.startswith('cnn'):
        pipe = cnn(pipeline_name)
        train_X, train_y, test_X, test_y = pipe.encode(train_X, train_y,
                                                       test_X, test_y)
        logger.info('>> testing...')
    else:
        pipe = pipeline(pipeline_name)
        logger.info('>> training pipeline ' + pipeline_name)
        pipe.fit(train_X, train_y)
        if pipeline_name == 'naive_bayes_counts_lex':
            logger.info(" -- Found {} tokens in lexicon".format(
                pipe.tokens_from_lexicon))
        logger.info('>> testing...')
    sys_y = pipe.predict(test_X)
    logger.info('>> evaluation...')
    logger.info(utils.eval(test_y, sys_y))
    if print_predictions:
        logger.info('>> predictions')
        utils.print_all_predictions(test_X_ref, test_y, sys_y, logger)
def run(task_name, data_dir, pipeline_name):
    """Load a task, train the named pipeline, and log predictions/evaluation.

    :param task_name: name of the task, passed to ``task()``
    :param data_dir: directory containing the task's data files
    :param pipeline_name: pipeline identifier; 'cnn*' selects the CNN path
    """
    logger.info('>> Running {} experiment'.format(task_name))
    tsk = task(task_name)
    logger.info('>> Loading data...')
    tsk.load(data_dir)
    logger.info('>> retrieving train/test instances...')
    train_X, train_y, test_X, test_y = utils.get_instances(
        tsk, split_train_dev=False)
    is_cnn = pipeline_name.startswith('cnn')
    if is_cnn:
        # CNN pipelines need the raw text encoded before prediction.
        pipe = cnn(pipeline_name)
        train_X, train_y, test_X, test_y = pipe.encode(
            train_X, train_y, test_X, test_y)
        logger.info('>> testing...')
    else:
        pipe = pipeline(pipeline_name)
        logger.info('>> training pipeline ' + pipeline_name)
        pipe.fit(train_X, train_y)
        logger.info('>> testing...')
    sys_y = pipe.predict(test_X)
    logger.info(utils.print_prediction(test_X, test_y, sys_y))
    logger.info('>> evaluation...')
    logger.info(utils.eval(test_y, sys_y))
def test_data_load():
    """Check that the VUA-format test data yields the expected split sizes."""
    task = vf.VuaFormat()
    task.load(test_data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=False)
    # Known fixture sizes for the test data directory.
    assert len(train_X) == 199
    assert len(test_X) == 99
def test_grid_search():
    """Grid search over SVM C values returns one prediction per dev instance."""
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    # Small proportions keep the test fast.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True,
        proportion_train=0.1, proportion_dev=0.01)
    param_grid = {'clf__C': (0.1, 1)}
    best_sys_y = utils.grid_search(
        pipelines.svm_libsvc_counts(), param_grid, train_X, train_y, test_X)
    assert len(best_sys_y) == len(test_y)
def test_naive_bayes_pipeline():
    """Naive Bayes pipeline trains and predicts one label per dev instance."""
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    # Small proportions keep the test fast.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True,
        proportion_train=0.1, proportion_dev=0.01)
    clf = pipelines.naive_bayes()
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    assert len(predictions) == len(test_y)
def test_hate_speech():
    """Hate-speech data loads and a Naive Bayes pipeline predicts on it."""
    task = vf.VuaFormat()
    task.load(hate_speech_data_dir, ['testData.csv'])
    # Small proportions keep the test fast.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True,
        proportion_train=0.1, proportion_dev=0.01)
    clf = pipelines.naive_bayes()
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    assert len(predictions) == len(test_y)
def test_trac2018():
    """TRAC-2018 data loads and a counts-based NB pipeline predicts on it."""
    task = vf.VuaFormat()
    task.load(trac_data_dir, 'devData.csv')
    # Small proportions keep the test fast.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True,
        proportion_train=0.1, proportion_dev=0.01)
    clf = pipelines.naive_bayes_counts()
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    assert len(predictions) == len(test_y)
def encode_data(data_dir):
    """Load the Offenseval data and encode it for the CNN.

    :param data_dir: directory containing the Offenseval data files
    :return: tuple ``(train_X, train_y, test_X, test_y)`` of encoded instances
    """
    print('Loading data...')
    task = of.Offenseval()
    task.load(data_dir=data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=False)
    print(len(train_X), 'train sequences')
    # FIX: this previously printed 'data sequences' although it reports the
    # test split (copy-paste slip alongside the 'train sequences' line).
    print(len(test_X), 'test sequences')
    train_X, train_y, test_X, test_y = encode(train_X, train_y,
                                              test_X, test_y)
    return train_X, train_y, test_X, test_y
def run(task_name, data_dir, pipeline_name, print_predictions, error_analysis,
        remove_stopwords):
    """Run a full experiment: load data, train, predict, evaluate, analyze.

    :param task_name: name of the task, passed to ``task()``
    :param data_dir: directory containing the task's data files
    :param pipeline_name: pipeline identifier; names starting with 'cnn'
        select the CNN pipeline, anything else a standard pipeline
    :param print_predictions: if True, log all predictions
    :param error_analysis: if True, log an error analysis of the predictions
    :param remove_stopwords: if True, switch to the '_stopwords' variant of
        the requested pipeline
    """
    logger.info('>> Running {} experiment'.format(task_name))
    tsk = task(task_name)
    logger.info('>> Loading data...')
    tsk.load(data_dir)
    # FIX: message previously said 'train/data'; these are train/test
    # instances (consistent with the sibling run() implementations).
    logger.info('>> retrieving train/test instances...')
    train_X, train_y, test_X, test_y = utils.get_instances(
        tsk, split_train_dev=False)
    logger.info('>> Descriptive statistics dataset:')
    utils.descriptive_statistics(train_X, train_y, test_X, test_y)
    # Keep a reference to the raw test texts: the CNN branch re-encodes
    # test_X, but predictions must be printed against the original text.
    test_X_ref = test_X
    if remove_stopwords:
        # CNN pipeline names may carry variant suffixes; reduce to the base
        # name before appending the stopword marker.
        if pipeline_name.startswith('cnn'):
            pipeline_name = pipeline_name.split('_')[0]
        pipeline_name = pipeline_name + '_stopwords'
    if pipeline_name.startswith('cnn'):
        pipe = cnn(pipeline_name)
        train_X, train_y, test_X, test_y = pipe.encode(train_X, train_y,
                                                       test_X, test_y)
        logger.info('>> testing CNN...')
    else:
        pipe = pipeline(pipeline_name)
        logger.info('>> training pipeline ' + pipeline_name)
        pipe.fit(train_X, train_y)
        if pipeline_name == 'naive_bayes_counts_lex':
            logger.info(" -- Found {} tokens in lexicon".format(
                pipe.tokens_from_lexicon))
        logger.info('>> testing...')
    sys_y = pipe.predict(test_X)
    if print_predictions:
        # FIX: label previously said '>> predictions1' (typo; the sibling
        # run() logs '>> predictions').
        logger.info('>> predictions')
        utils.print_all_predictions(test_X_ref, test_y, sys_y, logger)
    if error_analysis:
        # Used for error evaluation.
        logger.info(utils.print_error_analysis(test_X, test_y, sys_y))
    utils.eval(test_y, sys_y, pipeline_name, data_dir)
    if pipeline_name.startswith('naive_bayes'):
        # Report the most informative features per class for NB pipelines.
        utils.important_features_per_class(pipe.named_steps.frm,
                                           pipe.named_steps.clf, n=10)
def test_representation():
    """Count-vectorizing preprocessed text yields non-string feature rows."""
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    # Small proportions keep the test fast.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True,
        proportion_train=0.1, proportion_dev=0.01)
    prep = preprocessing.Preprocessor(tokenize=True,
                                      normalize_tweet=False,
                                      lowercase=False,
                                      lemmatize=False)
    train_X = prep.transform(train_X)
    vectorizer = representation.count_vectorizer()
    train_X = vectorizer.fit_transform(train_X, train_y)
    # Vectorized output is a feature representation, no longer raw text.
    assert not isinstance(train_X[0], str)
def test_preprocessors():
    """Preprocessing keeps instance count and string type at both extremes."""
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    # Small proportions keep the test fast.
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True,
        proportion_train=0.1, proportion_dev=0.01)

    # All options off: output should still be one string per instance.
    prep = preprocessing.Preprocessor(tokenize=False,
                                      normalize_tweet=False,
                                      lowercase=False,
                                      lemmatize=False)
    processed = prep.transform(train_X)
    assert len(processed) == len(train_X)
    assert isinstance(processed[0], str)

    # All options on: same invariants must hold.
    prep = preprocessing.Preprocessor(tokenize=True,
                                      normalize_tweet=True,
                                      lowercase=True,
                                      lemmatize=True)
    processed = prep.transform(train_X)
    assert len(processed) == len(train_X)
    assert isinstance(processed[0], str)