def test_dataset():
    """Test Dataset class."""
    x_train, y_train = Dataset.load_from_files_multilabel(
        path.join(DATASET_MULTILABEL_PATH, "train_files"),
        path.join(DATASET_MULTILABEL_PATH, "file_labels.tsv")
    )

    assert x_train == ['this is the first document!!\n\n:)', 'and this is the\n\nSECOND!!']
    assert y_train == [['catA', 'catB', 'catC'], ['catA']]

    x_train, y_train = Dataset.load_from_files_multilabel(
        path.join(DATASET_MULTILABEL_PATH, "train/docs.txt"),
        path.join(DATASET_MULTILABEL_PATH, "train/labels.txt"),
        sep_label=",", sep_doc="\n>>>>>\n"
    )

    assert len(x_train) == len(y_train) and len(y_train) == 20
    assert y_train[:8] == [[], ['toxic', 'severe_toxic', 'obscene', 'insult'],
                           [], [], [], [], [], ['toxic']]
def test_util():
    """Test utility module."""
    x_train, y_train = Dataset.load_from_files(dataset_path, folder_label=True)

    rd = RecursiveDefaultDict()
    rd["a"]["new"]["element"] = "assigned"

    Print.set_verbosity(VERBOSITY.VERBOSE)
    Print.verbosity_region_begin(VERBOSITY.VERBOSE)
    print(Print.style.header("this is a header!"))
    Print.warn("This is a warning!")
    Print.info("This is an informative message!")
    Print.show("This is a message!")
    with pytest.raises(Exception):
        Print.warn("This is a warning!", raises=Exception)
    Print.set_decorator_info(">", "<")
    Print.set_decorator_warn("|", "|")
    Print.set_decorator_error("*", "*")
    Print.verbosity_region_end()
def test_multilabel():
    """Test multilabel support."""
    x_train, y_train = Dataset.load_from_files_multilabel(
        path.join(dataset_multilabel_path, "train/docs.txt"),
        path.join(dataset_multilabel_path, "train/labels.txt"),
        sep_label=",", sep_doc="\n>>>>>\n")

    clf = SS3()

    with pytest.raises(ValueError):
        membership_matrix(clf, [])

    clf.fit(x_train, y_train)

    assert sorted(clf.get_categories()) == [
        'insult', 'obscene', 'severe_toxic', 'toxic'
    ]
    assert clf.classify_multilabel("this is a unknown document!") == []

    y_pred = [[], ['toxic'], ['severe_toxic'], ['obscene'], ['insult'], ['toxic', 'insult']]
    y_pred_memmatrix = membership_matrix(clf, y_pred).todense().tolist()

    assert y_pred_memmatrix == [
        [0, 0, 0, 0],  # []
        [1, 0, 0, 0],  # ['toxic']
        [0, 1, 0, 0],  # ['severe_toxic']
        [0, 0, 1, 0],  # ['obscene']
        [0, 0, 0, 1],  # ['insult']
        [1, 0, 0, 1]   # ['toxic', 'insult']
    ]

    y_pred_memmatrix = membership_matrix(clf, y_pred + [["xxx"]]).todense().tolist()

    assert y_pred_memmatrix[-1] == [0, 0, 0, 0]
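# A minimal usage sketch, not part of the test above: the binary membership
# matrices produced by ``membership_matrix`` can be fed into scikit-learn's
# multilabel metrics. Using ``hamming_loss`` here is an assumption about a
# typical downstream use (the tests only check the matrix contents), and the
# import of ``membership_matrix`` from ``pyss3.util`` is assumed.
from sklearn.metrics import hamming_loss

from pyss3 import SS3
from pyss3.util import Dataset, membership_matrix


def multilabel_hamming_loss_sketch():
    """Sketch: score multilabel predictions via membership matrices."""
    x_train, y_train = Dataset.load_from_files_multilabel(
        path.join(dataset_multilabel_path, "train/docs.txt"),
        path.join(dataset_multilabel_path, "train/labels.txt"),
        sep_label=",", sep_doc="\n>>>>>\n")

    clf = SS3()
    clf.fit(x_train, y_train)

    # Predict label lists for each training document (illustrative only).
    y_pred = [clf.classify_multilabel(doc) for doc in x_train]

    # Convert both label lists into (documents x categories) binary matrices
    # and compare them with a standard multilabel metric.
    y_true_matrix = membership_matrix(clf, y_train).toarray()
    y_pred_matrix = membership_matrix(clf, y_pred).toarray()
    print("Hamming loss:", hamming_loss(y_true_matrix, y_pred_matrix))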
def test_evaluation(mocker):
    """Test Evaluation class."""
    mocker.patch("webbrowser.open")
    mocker.patch("matplotlib.pyplot.show")

    kfold_validation = Evaluation.kfold_cross_validation

    Evaluation.__cache__ = None
    Evaluation.__cache_file__ = None
    Evaluation.__clf__ = None
    Evaluation.__last_eval_tag__ = None
    Evaluation.__last_eval_method__ = None
    Evaluation.__last_eval_def_cat__ = None

    ss = [0, 0.5]
    ll = [0, 1.5]
    pp = [0, 2]

    x_data, y_data = Dataset.load_from_files(DATASET_PATH)

    clf = SS3()
    clf.set_model_path("tests")

    # no classifier assigned case
    Evaluation.clear_cache()
    with pytest.raises(ValueError):
        Evaluation.get_best_hyperparameters()
    with pytest.raises(ValueError):
        Evaluation.remove()
    with pytest.raises(ValueError):
        Evaluation.show_best()
    with pytest.raises(ValueError):
        Evaluation.plot(TMP_FOLDER)

    # Not-yet-trained model case
    Evaluation.set_classifier(clf)
    Evaluation.clear_cache()
    Evaluation.remove()
    Evaluation.show_best()
    assert Evaluation.plot(TMP_FOLDER) is False
    with pytest.raises(pyss3.EmptyModelError):
        Evaluation.test(clf, x_data, y_data)
    with pytest.raises(pyss3.EmptyModelError):
        kfold_validation(clf, x_data, y_data)
    with pytest.raises(pyss3.EmptyModelError):
        Evaluation.grid_search(clf, x_data, y_data)
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters()

    # default argument values
    clf.train(x_data, y_data)
    assert Evaluation.test(clf, x_data, y_data, plot=PY3) == 1
    assert Evaluation.test(clf, ['bla bla bla'], ['pos'], plot=PY3) == 0
    assert Evaluation.test(clf, ['bla bla bla', "I love this love movie!"],
                           ['pos', 'pos'], plot=PY3) == 0.5
    assert kfold_validation(clf, x_data, y_data, plot=PY3) > 0
    s, l, p, a = clf.get_hyperparameters()
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data)
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters()
    s2, l2, p2, a2 = Evaluation.get_best_hyperparameters("recall")
    assert s0 == s and l0 == l and p0 == p and a0 == a
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    assert s0 == s2 and l0 == l2 and p0 == p2 and a0 == a2
    assert Evaluation.plot(TMP_FOLDER) is True
    Evaluation.remove()
    Evaluation.show_best()
    assert Evaluation.plot(TMP_FOLDER) is False

    # test
    # OK
    assert Evaluation.test(clf, x_data, y_data, def_cat='unknown', plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, def_cat='neg', plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, metric="f1-score", plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, plot=PY3,
                           metric="recall", metric_target="weighted avg") == 1
    assert Evaluation.test(clf, x_data, y_data, plot=PY3,
                           metric="recall", metric_target="neg") == 1
    # Not OK
    with pytest.raises(InvalidCategoryError):
        Evaluation.test(clf, x_data, y_data, def_cat='xxx', plot=PY3)
    with pytest.raises(KeyError):
        Evaluation.test(clf, x_data, y_data, metric="xxx", plot=PY3)
    with pytest.raises(KeyError):
        Evaluation.test(clf, x_data, y_data,
                        metric="recall", metric_target="xxx", plot=PY3)

    # k-fold
    # OK
    assert kfold_validation(clf, x_data, y_data, n_grams=3, plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, def_cat='unknown', plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, def_cat='neg', plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, metric="f1-score", plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, plot=PY3,
                            metric="recall", metric_target="weighted avg") > 0
    assert kfold_validation(clf, x_data, y_data, plot=PY3,
                            metric="recall", metric_target="neg") > 0
    # Not OK
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, n_grams=-1, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, n_grams=clf, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=-1, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=clf, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=None, plot=PY3)
    with pytest.raises(InvalidCategoryError):
        kfold_validation(clf, x_data, y_data, def_cat='xxx', plot=PY3)
    with pytest.raises(KeyError):
        kfold_validation(clf, x_data, y_data, metric="xxx", plot=PY3)
    with pytest.raises(KeyError):
        kfold_validation(clf, x_data, y_data,
                         metric="recall", metric_target="xxx", plot=PY3)

    # grid_search
    # OK
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, s=ss)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, s=ss, l=ll, p=pp)
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, k_fold=4)
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, def_cat='unknown', p=pp)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, def_cat='neg', p=pp)
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, metric="f1-score", p=pp)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, p=pp,
                                            metric="recall", metric_target="weighted avg")
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, p=pp,
                                            metric="recall", metric_target="neg")
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    # Not OK
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, s='asd')
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, s=clf)
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, k_fold=clf)
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, k_fold="xxx")
    with pytest.raises(InvalidCategoryError):
        Evaluation.grid_search(clf, x_data, y_data, def_cat='xxx')
    with pytest.raises(KeyError):
        Evaluation.grid_search(clf, x_data, y_data, metric="xxx")
    with pytest.raises(KeyError):
        Evaluation.grid_search(clf, x_data, y_data,
                               metric="recall", metric_target="xxx")

    # get_best_hyperparameters
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters()
    s2, l2, p2, a2 = Evaluation.get_best_hyperparameters("recall")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters("recall", "weighted avg")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters("recall", "pos")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold", def_cat="neg")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold", def_cat="unknown")
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    assert s0 == s2 and l0 == l2 and p0 == p2 and a0 == a2
    # Not OK
    with pytest.raises(KeyError):
        Evaluation.get_best_hyperparameters("xxx")
    with pytest.raises(KeyError):
        Evaluation.get_best_hyperparameters("recall", "xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(method="xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(def_cat="xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(method="4-fold", def_cat="unknown")

    # plot OK
    assert Evaluation.plot(TMP_FOLDER) is True

    # remove
    # OK
    assert Evaluation.remove(s, l, p, a)[0] == 10
    assert Evaluation.remove(def_cat="neg")[0] == 2
    assert Evaluation.remove(method="test")[0] == 12
    assert Evaluation.remove(s=-10)[0] == 0
    assert Evaluation.remove(def_cat="xxx")[0] == 0
    assert Evaluation.remove(method="xxx")[0] == 0
    assert Evaluation.remove()[0] == 1
    assert Evaluation.plot(TMP_FOLDER) is False  # plot not OK (no evaluations)
    # not OK
    with pytest.raises(TypeError):
        Evaluation.remove("xxx")
    with pytest.raises(TypeError):
        Evaluation.remove(clf)

    Evaluation.show_best()
    Evaluation.show_best(method="test")
    Evaluation.show_best(def_cat="unknown")
    Evaluation.show_best(metric="f1-score")
    Evaluation.show_best(metric="f1-score", avg="weighted avg")  # different tag

    rmtree("./tests/ss3_models", ignore_errors=True)
"""Tests for pytest.""" from os import path from pyss3 import SS3, STR_NORM_GV_XAI, STR_XAI from pyss3 import STR_UNKNOWN, STR_MOST_PROBABLE, STR_UNKNOWN_CATEGORY from pyss3.util import Dataset import pyss3 import pytest DATASET_FOLDER = "dataset" dataset_path = path.join(path.abspath(path.dirname(__file__)), DATASET_FOLDER) x_train, y_train = Dataset.load_from_files(dataset_path, folder_label=False) x_test = [ "sports nfl nba superbowl soccer football team. learns jersey air bowl hockey.\n" "baseball helmet mccutchen jordan curry poker", "travel pictures images moment glamour canvas photoshoot lens dslr portrait " "beautiful seasons lines colours snap usm af eos painter gallery museum " "flower kinkade", "hairstyles boutique handbag dress trends womens menswear luxury claudiepierlot " "rustic wedding bride collection signed patrick ista streetstyle cocksox purse " "trending status brush cosmetic stretchy gucci leather cream trendy " "bargains victoria", "finance business development fiverr hiring job social debt logos stationary " "read bad media mlm uganda entrepreneurship strategy mistake 1st employee " "financial inbound habits coupon", "cooking cook chef food drink rice kitchen cold organic yummy yum bread " "strawberry bbq pepper beverages grocery cupcakes easter gurpreet sushi " "dining meal chicken lime mushrooms restaurant whiskey", "vitamins calcium minerals workout weightloss fit skin spa motivation care "
from sklearn.metrics import accuracy_score

from pyss3 import SS3
from pyss3.util import Dataset

# Ok, now we are ready to begin. Let's create a new SS3 instance
clf = SS3()

# What are the default hyperparameter values? Let's see:
s, l, p, _ = clf.get_hyperparameters()

print("Smoothness(s):", s)
print("Significance(l):", l)
print("Sanction(p):", p)

# Ok, now let's load the training and the test set using the `load_from_files`
# function from `pyss3.util`. Since, in this dataset, there's a single file for each
# category, we will use the argument ``folder_label=False`` to tell PySS3 to use each
# file as a different category and each line inside of it as a different document:
x_train, y_train = Dataset.load_from_files("datasets/topic/train", folder_label=False)
x_test, y_test = Dataset.load_from_files("datasets/topic/test", folder_label=False)

# Let's train our model...
clf.fit(x_train, y_train)

# Note that we don't have to create any document-term matrix! We are using just the
# plain `x_train` documents :D cool, huh? (SS3 creates a language model for each
# category and therefore it doesn't need to create any document-term matrices)
#
# Now that the model has been trained, let's test it using the documents in `x_test`
y_pred = clf.predict(x_test)

# Let's see how well our model performed
print("Accuracy:", accuracy_score(y_pred, y_test))
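# As an alternative to computing the accuracy by hand with scikit-learn, the
# same score can be obtained through pyss3's `Evaluation` class (the one
# exercised in `test_evaluation` above). This is a minimal sketch reusing the
# `clf`, `x_test` and `y_test` defined above, not part of the original
# walkthrough; the default metric appears to be accuracy, judging from the
# assertions in `test_evaluation`.
from pyss3.util import Evaluation

Evaluation.set_classifier(clf)
accuracy = Evaluation.test(clf, x_test, y_test, plot=False)  # plot=False: skip the interactive plot
print("Accuracy (via Evaluation.test):", accuracy)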
# ... and unzip the "movie_review.zip" dataset inside the `datasets` folder.
system('unzip -u datasets/movie_review.zip -d datasets/')

# Ok, now we are ready to begin. Let's create a new SS3 instance.
clf = SS3()

# What are the default hyperparameter values? Let's see:
s, l, p, _ = clf.get_hyperparameters()

print("Smoothness(s):", s)
print("Significance(l):", l)
print("Sanction(p):", p)

# Ok, now let's load the training and the test set using the `load_from_files`
# function from `pyss3.util` as follows:
x_train, y_train = Dataset.load_from_files("datasets/movie_review/train")
x_test, y_test = Dataset.load_from_files("datasets/movie_review/test")

# Let's train our model...
clf.fit(x_train, y_train)

# Note that we don't have to create any document-term matrix! We are using just
# the plain `x_train` documents :D cool, huh?
# (SS3 creates a language model for each category and therefore it doesn't need
# to create any document-term matrices)

# Now that the model has been trained, let's test it using the documents in `x_test`
y_pred = clf.predict(x_test)

# Let's see how well our model performed
print("Accuracy:", accuracy_score(y_pred, y_test))
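# The default hyperparameters are rarely the best ones. A natural next step,
# sketched here (it is not part of the original walkthrough), is to let pyss3's
# `Evaluation.grid_search` look for better values of s, l and p, using the same
# API exercised in `test_evaluation` above. The candidate value grids below are
# purely illustrative.
from pyss3.util import Evaluation

Evaluation.set_classifier(clf)
best_s, best_l, best_p, best_a = Evaluation.grid_search(
    clf, x_test, y_test,
    s=[0.2, 0.32, 0.44],   # candidate "smoothness" values (illustrative)
    l=[0.1, 0.48, 1.0],    # candidate "significance" values (illustrative)
    p=[0.5, 1.0, 2.0]      # candidate "sanction" values (illustrative)
)
print("Best hyperparameters found:", best_s, best_l, best_p, best_a)

# Apply them (set_hyperparameters mirrors the get_hyperparameters call above)
# and see whether the accuracy improves:
clf.set_hyperparameters(best_s, best_l, best_p, best_a)
y_pred = clf.predict(x_test)
print("Accuracy (tuned):", accuracy_score(y_pred, y_test))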