def test_dataset():
    """Test Dataset class."""
    x_train, y_train = Dataset.load_from_files_multilabel(
        path.join(DATASET_MULTILABEL_PATH, "train_files"),
        path.join(DATASET_MULTILABEL_PATH, "file_labels.tsv")
    )

    assert x_train == ['this is the first document!!\n\n:)', 'and this is the\n\nSECOND!!']
    assert y_train == [['catA', 'catB', 'catC'], ['catA']]

    x_train, y_train = Dataset.load_from_files_multilabel(
        path.join(DATASET_MULTILABEL_PATH, "train/docs.txt"),
        path.join(DATASET_MULTILABEL_PATH, "train/labels.txt"),
        sep_label=",", sep_doc="\n>>>>>\n"
    )

    assert len(x_train) == len(y_train) and len(y_train) == 20
    assert y_train[:8] == [[], ['toxic', 'severe_toxic', 'obscene', 'insult'],
                           [], [], [], [], [], ['toxic']]
def test_util():
    """Test utility module."""
    x_train, y_train = Dataset.load_from_files(dataset_path, folder_label=True)

    rd = RecursiveDefaultDict()
    rd["a"]["new"]["element"] = "assigned"

    Print.set_verbosity(VERBOSITY.VERBOSE)
    Print.verbosity_region_begin(VERBOSITY.VERBOSE)
    print(Print.style.header("this is a header!"))
    Print.warn("This is a warning!")
    Print.info("This is an informative message!")
    Print.show("This is a message!")
    with pytest.raises(Exception):
        Print.warn("This is a warning!", raises=Exception)
    Print.set_decorator_info(">", "<")
    Print.set_decorator_warn("|", "|")
    Print.set_decorator_error("*", "*")
    Print.verbosity_region_end()
def test_multilabel():
    """Test multilabel support."""
    x_train, y_train = Dataset.load_from_files_multilabel(
        path.join(dataset_multilabel_path, "train/docs.txt"),
        path.join(dataset_multilabel_path, "train/labels.txt"),
        sep_label=",", sep_doc="\n>>>>>\n")

    clf = SS3()

    with pytest.raises(ValueError):
        membership_matrix(clf, [])

    clf.fit(x_train, y_train)

    assert sorted(clf.get_categories()) == [
        'insult', 'obscene', 'severe_toxic', 'toxic'
    ]
    assert clf.classify_multilabel("this is a unknown document!") == []

    y_pred = [[], ['toxic'], ['severe_toxic'], ['obscene'], ['insult'], ['toxic', 'insult']]
    y_pred_memmatrix = membership_matrix(clf, y_pred).todense().tolist()

    assert y_pred_memmatrix == [
        [0, 0, 0, 0],  # []
        [1, 0, 0, 0],  # ['toxic']
        [0, 1, 0, 0],  # ['severe_toxic']
        [0, 0, 1, 0],  # ['obscene']
        [0, 0, 0, 1],  # ['insult']
        [1, 0, 0, 1]   # ['toxic', 'insult']
    ]

    y_pred_memmatrix = membership_matrix(clf, y_pred + [["xxx"]]).todense().tolist()

    assert y_pred_memmatrix[-1] == [0, 0, 0, 0]
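# A minimal usage sketch, not part of the test above: the binary membership
# matrices produced by ``membership_matrix`` can be fed into scikit-learn's
# multilabel metrics. Using ``hamming_loss`` here is an assumption about a
# typical downstream use (the tests only check the matrix contents), and the
# import of ``membership_matrix`` from ``pyss3.util`` is assumed.
from sklearn.metrics import hamming_loss

from pyss3 import SS3
from pyss3.util import Dataset, membership_matrix


def multilabel_hamming_loss_sketch():
    """Sketch: score multilabel predictions via membership matrices."""
    x_train, y_train = Dataset.load_from_files_multilabel(
        path.join(dataset_multilabel_path, "train/docs.txt"),
        path.join(dataset_multilabel_path, "train/labels.txt"),
        sep_label=",", sep_doc="\n>>>>>\n")

    clf = SS3()
    clf.fit(x_train, y_train)

    # Predict label lists for each training document (illustrative only).
    y_pred = [clf.classify_multilabel(doc) for doc in x_train]

    # Convert both label lists into (documents x categories) binary matrices
    # and compare them with a standard multilabel metric.
    y_true_matrix = membership_matrix(clf, y_train).toarray()
    y_pred_matrix = membership_matrix(clf, y_pred).toarray()
    print("Hamming loss:", hamming_loss(y_true_matrix, y_pred_matrix))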
def test_evaluation(mocker):
    """Test Evaluation class."""
    mocker.patch("webbrowser.open")
    mocker.patch("matplotlib.pyplot.show")

    kfold_validation = Evaluation.kfold_cross_validation

    Evaluation.__cache__ = None
    Evaluation.__cache_file__ = None
    Evaluation.__clf__ = None
    Evaluation.__last_eval_tag__ = None
    Evaluation.__last_eval_method__ = None
    Evaluation.__last_eval_def_cat__ = None

    ss = [0, 0.5]
    ll = [0, 1.5]
    pp = [0, 2]

    x_data, y_data = Dataset.load_from_files(DATASET_PATH)

    clf = SS3()
    clf.set_model_path("tests")

    # no classifier assigned case
    Evaluation.clear_cache()
    with pytest.raises(ValueError):
        Evaluation.get_best_hyperparameters()
    with pytest.raises(ValueError):
        Evaluation.remove()
    with pytest.raises(ValueError):
        Evaluation.show_best()
    with pytest.raises(ValueError):
        Evaluation.plot(TMP_FOLDER)

    # Not-yet-trained model case
    Evaluation.set_classifier(clf)
    Evaluation.clear_cache()
    Evaluation.remove()
    Evaluation.show_best()
    assert Evaluation.plot(TMP_FOLDER) is False
    with pytest.raises(pyss3.EmptyModelError):
        Evaluation.test(clf, x_data, y_data)
    with pytest.raises(pyss3.EmptyModelError):
        kfold_validation(clf, x_data, y_data)
    with pytest.raises(pyss3.EmptyModelError):
        Evaluation.grid_search(clf, x_data, y_data)
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters()

    # default argument values
    clf.train(x_data, y_data)
    assert Evaluation.test(clf, x_data, y_data, plot=PY3) == 1
    assert Evaluation.test(clf, ['bla bla bla'], ['pos'], plot=PY3) == 0
    assert Evaluation.test(clf, ['bla bla bla', "I love this love movie!"],
                           ['pos', 'pos'], plot=PY3) == 0.5
    assert kfold_validation(clf, x_data, y_data, plot=PY3) > 0
    s, l, p, a = clf.get_hyperparameters()
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data)
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters()
    s2, l2, p2, a2 = Evaluation.get_best_hyperparameters("recall")
    assert s0 == s and l0 == l and p0 == p and a0 == a
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    assert s0 == s2 and l0 == l2 and p0 == p2 and a0 == a2
    assert Evaluation.plot(TMP_FOLDER) is True
    Evaluation.remove()
    Evaluation.show_best()
    assert Evaluation.plot(TMP_FOLDER) is False

    # test
    # OK
    assert Evaluation.test(clf, x_data, y_data, def_cat='unknown', plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, def_cat='neg', plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, metric="f1-score", plot=PY3) == 1
    assert Evaluation.test(clf, x_data, y_data, plot=PY3,
                           metric="recall", metric_target="weighted avg") == 1
    assert Evaluation.test(clf, x_data, y_data, plot=PY3,
                           metric="recall", metric_target="neg") == 1
    # Not OK
    with pytest.raises(InvalidCategoryError):
        Evaluation.test(clf, x_data, y_data, def_cat='xxx', plot=PY3)
    with pytest.raises(KeyError):
        Evaluation.test(clf, x_data, y_data, metric="xxx", plot=PY3)
    with pytest.raises(KeyError):
        Evaluation.test(clf, x_data, y_data,
                        metric="recall", metric_target="xxx", plot=PY3)

    # k-fold
    # OK
    assert kfold_validation(clf, x_data, y_data, n_grams=3, plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, def_cat='unknown', plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, k=10, def_cat='neg', plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, metric="f1-score", plot=PY3) > 0
    assert kfold_validation(clf, x_data, y_data, plot=PY3,
                            metric="recall", metric_target="weighted avg") > 0
    assert kfold_validation(clf, x_data, y_data, plot=PY3,
                            metric="recall", metric_target="neg") > 0
    # Not OK
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, n_grams=-1, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, n_grams=clf, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=-1, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=clf, plot=PY3)
    with pytest.raises(ValueError):
        kfold_validation(clf, x_data, y_data, k=None, plot=PY3)
    with pytest.raises(InvalidCategoryError):
        kfold_validation(clf, x_data, y_data, def_cat='xxx', plot=PY3)
    with pytest.raises(KeyError):
        kfold_validation(clf, x_data, y_data, metric="xxx", plot=PY3)
    with pytest.raises(KeyError):
        kfold_validation(clf, x_data, y_data,
                         metric="recall", metric_target="xxx", plot=PY3)

    # grid_search
    # OK
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, s=ss)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, s=ss, l=ll, p=pp)
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, k_fold=4)
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, def_cat='unknown', p=pp)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, def_cat='neg', p=pp)
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    s0, l0, p0, a0 = Evaluation.grid_search(clf, x_data, y_data, metric="f1-score", p=pp)
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, p=pp,
                                            metric="recall", metric_target="weighted avg")
    s1, l1, p1, a1 = Evaluation.grid_search(clf, x_data, y_data, p=pp,
                                            metric="recall", metric_target="neg")
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    # Not OK
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, s='asd')
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, s=clf)
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, k_fold=clf)
    with pytest.raises(TypeError):
        Evaluation.grid_search(clf, x_data, y_data, k_fold="xxx")
    with pytest.raises(InvalidCategoryError):
        Evaluation.grid_search(clf, x_data, y_data, def_cat='xxx')
    with pytest.raises(KeyError):
        Evaluation.grid_search(clf, x_data, y_data, metric="xxx")
    with pytest.raises(KeyError):
        Evaluation.grid_search(clf, x_data, y_data,
                               metric="recall", metric_target="xxx")

    # get_best_hyperparameters
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters()
    s2, l2, p2, a2 = Evaluation.get_best_hyperparameters("recall")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters("recall", "weighted avg")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters("recall", "pos")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold", def_cat="neg")
    s1, l1, p1, a1 = Evaluation.get_best_hyperparameters(method="10-fold", def_cat="unknown")
    assert s0 == s1 and l0 == l1 and p0 == p1 and a0 == a1
    assert s0 == s2 and l0 == l2 and p0 == p2 and a0 == a2
    # Not OK
    with pytest.raises(KeyError):
        Evaluation.get_best_hyperparameters("xxx")
    with pytest.raises(KeyError):
        Evaluation.get_best_hyperparameters("recall", "xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(method="xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(def_cat="xxx")
    with pytest.raises(LookupError):
        Evaluation.get_best_hyperparameters(method="4-fold", def_cat="unknown")

    # plot OK
    assert Evaluation.plot(TMP_FOLDER) is True

    # remove
    # OK
    assert Evaluation.remove(s, l, p, a)[0] == 10
    assert Evaluation.remove(def_cat="neg")[0] == 2
    assert Evaluation.remove(method="test")[0] == 12
    assert Evaluation.remove(s=-10)[0] == 0
    assert Evaluation.remove(def_cat="xxx")[0] == 0
    assert Evaluation.remove(method="xxx")[0] == 0
    assert Evaluation.remove()[0] == 1
    assert Evaluation.plot(TMP_FOLDER) is False  # plot not OK (no evaluations)
    # not OK
    with pytest.raises(TypeError):
        Evaluation.remove("xxx")
    with pytest.raises(TypeError):
        Evaluation.remove(clf)

    Evaluation.show_best()
    Evaluation.show_best(method="test")
    Evaluation.show_best(def_cat="unknown")
    Evaluation.show_best(metric="f1-score")
    Evaluation.show_best(metric="f1-score", avg="weighted avg")  # different tag

    rmtree("./tests/ss3_models", ignore_errors=True)
"""Tests for pytest.""" from os import path from pyss3 import SS3, STR_NORM_GV_XAI, STR_XAI from pyss3 import STR_UNKNOWN, STR_MOST_PROBABLE, STR_UNKNOWN_CATEGORY from pyss3.util import Dataset import pyss3 import pytest DATASET_FOLDER = "dataset" dataset_path = path.join(path.abspath(path.dirname(__file__)), DATASET_FOLDER) x_train, y_train = Dataset.load_from_files(dataset_path, folder_label=False) x_test = [ "sports nfl nba superbowl soccer football team. learns jersey air bowl hockey.\n" "baseball helmet mccutchen jordan curry poker", "travel pictures images moment glamour canvas photoshoot lens dslr portrait " "beautiful seasons lines colours snap usm af eos painter gallery museum " "flower kinkade", "hairstyles boutique handbag dress trends womens menswear luxury claudiepierlot " "rustic wedding bride collection signed patrick ista streetstyle cocksox purse " "trending status brush cosmetic stretchy gucci leather cream trendy " "bargains victoria", "finance business development fiverr hiring job social debt logos stationary " "read bad media mlm uganda entrepreneurship strategy mistake 1st employee " "financial inbound habits coupon", "cooking cook chef food drink rice kitchen cold organic yummy yum bread " "strawberry bbq pepper beverages grocery cupcakes easter gurpreet sushi " "dining meal chicken lime mushrooms restaurant whiskey", "vitamins calcium minerals workout weightloss fit skin spa motivation care "
from sklearn.metrics import accuracy_score

from pyss3 import SS3
from pyss3.util import Dataset

# Ok, now we are ready to begin. Let's create a new SS3 instance
clf = SS3()

# What are the default hyperparameter values? Let's see:
s, l, p, _ = clf.get_hyperparameters()

print("Smoothness(s):", s)
print("Significance(l):", l)
print("Sanction(p):", p)

# Ok, now let's load the training and the test set using the `load_from_files`
# function from `pyss3.util`. Since, in this dataset, there's a single file for each
# category, we will use the argument ``folder_label=False`` to tell PySS3 to use each
# file as a different category and each line inside of it as a different document:
x_train, y_train = Dataset.load_from_files("datasets/topic/train", folder_label=False)
x_test, y_test = Dataset.load_from_files("datasets/topic/test", folder_label=False)

# Let's train our model...
clf.fit(x_train, y_train)

# Note that we don't have to create any document-term matrix! We are using just the
# plain `x_train` documents :D cool, huh? (SS3 creates a language model for each
# category and therefore it doesn't need to create any document-term matrices)
#
# Now that the model has been trained, let's test it using the documents in `x_test`
y_pred = clf.predict(x_test)

# Let's see how well our model performed
print("Accuracy:", accuracy_score(y_pred, y_test))
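# As an alternative to computing the accuracy by hand with scikit-learn, the
# same score can be obtained through pyss3's `Evaluation` class (the one
# exercised in `test_evaluation` above). This is a minimal sketch reusing the
# `clf`, `x_test` and `y_test` defined above, not part of the original
# walkthrough; the default metric appears to be accuracy, judging from the
# assertions in `test_evaluation`.
from pyss3.util import Evaluation

Evaluation.set_classifier(clf)
accuracy = Evaluation.test(clf, x_test, y_test, plot=False)  # plot=False: skip the interactive plot
print("Accuracy (via Evaluation.test):", accuracy)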
# ... and unzip the "movie_review.zip" dataset inside the `datasets` folder.
system('unzip -u datasets/movie_review.zip -d datasets/')

# Ok, now we are ready to begin. Let's create a new SS3 instance.
clf = SS3()

# What are the default hyperparameter values? Let's see:
s, l, p, _ = clf.get_hyperparameters()

print("Smoothness(s):", s)
print("Significance(l):", l)
print("Sanction(p):", p)

# Ok, now let's load the training and the test set using the `load_from_files`
# function from `pyss3.util` as follows:
x_train, y_train = Dataset.load_from_files("datasets/movie_review/train")
x_test, y_test = Dataset.load_from_files("datasets/movie_review/test")

# Let's train our model...
clf.fit(x_train, y_train)

# Note that we don't have to create any document-term matrix! We are using just
# the plain `x_train` documents :D cool, huh?
# (SS3 creates a language model for each category and therefore it doesn't need
# to create any document-term matrices)

# Now that the model has been trained, let's test it using the documents in `x_test`
y_pred = clf.predict(x_test)

# Let's see how well our model performed
print("Accuracy:", accuracy_score(y_pred, y_test))
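# The default hyperparameters are rarely the best ones. A natural next step,
# sketched here (it is not part of the original walkthrough), is to let pyss3's
# `Evaluation.grid_search` look for better values of s, l and p, using the same
# API exercised in `test_evaluation` above. The candidate value grids below are
# purely illustrative.
from pyss3.util import Evaluation

Evaluation.set_classifier(clf)
best_s, best_l, best_p, best_a = Evaluation.grid_search(
    clf, x_test, y_test,
    s=[0.2, 0.32, 0.44],   # candidate "smoothness" values (illustrative)
    l=[0.1, 0.48, 1.0],    # candidate "significance" values (illustrative)
    p=[0.5, 1.0, 2.0]      # candidate "sanction" values (illustrative)
)
print("Best hyperparameters found:", best_s, best_l, best_p, best_a)

# Apply them (set_hyperparameters mirrors the get_hyperparameters call above)
# and see whether the accuracy improves:
clf.set_hyperparameters(best_s, best_l, best_p, best_a)
y_pred = clf.predict(x_test)
print("Accuracy (tuned):", accuracy_score(y_pred, y_test))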