def task(name): if name == 'offenseval': return of.Offenseval() elif name == 'vua-format': return vf.VuaFormat() else: raise ValueError("task name is unknown. You can add a custom task in 'tasks'")
def test_grid_search(): task = of.Offenseval() task.load(offenseval_data_dir) train_X, train_y, test_X, test_y = utils.get_instances( task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01) params = {'clf__C': (0.1, 1)} best_sys_y = utils.grid_search(pipelines.svm_libsvc_counts(), params, train_X, train_y, test_X) assert len(best_sys_y) == len(test_y)
def test_naive_bayes_pipeline(): task = of.Offenseval() task.load(offenseval_data_dir) train_X, train_y, test_X, test_y = utils.get_instances( task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01) pipe = pipelines.naive_bayes() pipe.fit(train_X, train_y) sys_y = pipe.predict(test_X) assert len(sys_y) == len(test_y)
def encode_data(data_dir): print('Loading data...') task = of.Offenseval() task.load(data_dir=data_dir) train_X, train_y, test_X, test_y = utils.get_instances(task, split_train_dev=False) print(len(train_X), 'train sequences') print(len(test_X), 'test sequences') train_X, train_y, test_X, test_y = encode(train_X, train_y, test_X, test_y) return train_X, train_y, test_X, test_y
def test_offenseval_data_extraction_task_a(): task = of.Offenseval() task.load(offenseval_data_dir) train_X, train_y = task.train_instances() test_X, test_y = task.test_instances() assert len(train_X) == 13240 assert len(test_y) == 320 assert isinstance(train_X[0], str) labels = set(test_y) assert len(labels) == 2
def test_data_split_on_offenseval(): task = of.Offenseval() task.load(offenseval_data_dir) train_X, train_y, test_X, test_y = utils.get_instances( task, split_train_dev=False) assert len(train_X) == 13240 assert len(test_X) == 320 assert isinstance(train_X[0], str) train_X, train_y, test_X, test_y = utils.get_instances( task, split_train_dev=True) assert len(train_X) == 13240 * 0.9 assert len(test_X) == 13240 * 0.1
def test_representation(): task = of.Offenseval() task.load(offenseval_data_dir) train_X, train_y, test_X, test_y = utils.get_instances( task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01) prep = preprocessing.Preprocessor(tokenize=True, normalize_tweet=False, lowercase=False, lemmatize=False) train_X = prep.transform(train_X) frmt = representation.count_vectorizer() train_X = frmt.fit_transform(train_X, train_y) assert not isinstance(train_X[0], str)
def test_preprocessors(): task = of.Offenseval() task.load(offenseval_data_dir) train_X, train_y, test_X, test_y = utils.get_instances( task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01) prep = preprocessing.Preprocessor(tokenize=False, normalize_tweet=False, lowercase=False, lemmatize=False) train_X_prep = prep.transform(train_X) assert len(train_X_prep) == len(train_X) assert isinstance(train_X_prep[0], str) prep = preprocessing.Preprocessor(tokenize=True, normalize_tweet=True, lowercase=True, lemmatize=True) train_X_prep = prep.transform(train_X) assert len(train_X_prep) == len(train_X) assert isinstance(train_X_prep[0], str)