示例#1
0
def test_wos():
    """Download the Web of Science corpus into the test project area."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    get_wos("wos")
示例#2
0
def test_directories():
    """Verify ADSODIR/PROJDIR follow set_adso_dir and set_project_name."""
    # Default directory before any configuration in this process.
    assert adso.common.ADSODIR == Path(".adso_test")
    adso.set_adso_dir(".test")
    assert adso.common.ADSODIR == Path(".test")
    adso.set_project_name("test")
    assert adso.common.PROJDIR == Path(".test/test")
示例#3
0
def test_HDPVB():
    """Fit an HDP (variational Bayes) model on two 20newsgroups categories."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    corpus = get_20newsgroups(
        "HDPVB_20news", categories=["sci.space", "rec.autos"]
    )

    model = HDPVB()
    topic_model, (n,) = model.fit_transform(corpus, "test_HDPVB")
示例#4
0
def test_hSBM():
    """Fit an hSBM topic model and check NMI for the fixed seed."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    corpus = get_20newsgroups(
        "hSBM_20news", categories=["sci.space", "rec.autos"]
    )

    model = hSBM()
    topic_model, (n_layer,) = model.fit_transform(corpus, "test_hSBM")

    # Regression value pinned by seed 8686.
    assert round(NMI(corpus, topic_model), 5) == 0.1668
示例#5
0
def test_LDAVB():
    """Fit LDA (variational Bayes) and hand back the dataset/model pair."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    corpus = get_20newsgroups(
        "LDAVB_20news", categories=["sci.space", "rec.autos"]
    )

    model = LDAVB(2)
    topics = model.fit_transform(corpus, "test_LDAVB")

    return corpus, topics
示例#6
0
def test_simple_HDPVB():
    """Fit HDP (variational Bayes) on a tiny hand-built labeled corpus."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    documents = ["A A B C D", "B B B A C", "E F E"]
    tags = ["1", "1", "2"]

    corpus = data.LabeledDataset.from_iterator(
        "HDPVB_simple_data", zip(tags, documents)
    )
    # Whitespace tokenizer: documents are plain space-separated symbols.
    corpus.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    model = HDPVB()
    topic_model, (n,) = model.fit_transform(corpus, "test_simple_HDPVB")
示例#7
0
def test_simple_LDAVB():
    """Fit LDA (variational Bayes) on a toy labeled corpus; return its output."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    documents = ["A A B C D", "B B B A C", "E F E"]
    tags = ["1", "1", "2"]

    corpus = data.LabeledDataset.from_iterator(
        "LDAVB_simple_data", zip(tags, documents)
    )
    # Whitespace tokenizer: documents are plain space-separated symbols.
    corpus.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    model = LDAVB(2)
    return model.fit_transform(corpus, "test_simple_LDAVB")
示例#8
0
def test_vectorizer():
    """Vectorize a toy corpus and check the count matrix and vocabulary."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")

    documents = ["A A B C D", "B B B A C", "E F E"]
    corpus = data.Dataset.from_iterator("test_vectorizer", documents)
    corpus.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    counts = corpus.get_count_matrix()
    vocabulary = corpus.get_vocab()

    expected = np.array(
        [[2, 1, 1, 1, 0, 0],
         [1, 3, 1, 0, 0, 0],
         [0, 0, 0, 0, 2, 1]]
    )
    assert (counts[...] == expected).all()
    # Vocabulary is lowercased by the vectorizer.
    assert set(vocabulary) == {"a", "b", "c", "d", "e", "f"}
示例#9
0
def test_NMF():
    """Fit NMF on 20newsgroups and check NMI and the confusion matrix."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    corpus = get_20newsgroups(
        "NMF_20news", categories=["sci.space", "rec.autos"]
    )

    model = NMF(2)
    topics, (n_iter, error) = model.fit_transform(corpus, "test_NMF")

    # Regression values pinned by seed 8686.
    assert round(NMI(corpus, topics), 5) == 0.00119
    expected = np.array([[617, 373], [653, 334]])
    assert (confusion_matrix(corpus, topics).todense() == expected).all()
示例#10
0
def test_TM():
    """Fit TopicMapping on 20newsgroups and check NMI and confusion matrix."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    corpus = get_20newsgroups(
        "TM_20news", categories=["sci.space", "rec.autos"]
    )

    model = TopicMapping()
    topics, (n,) = model.fit_transform(corpus, "test_TM")

    # Regression values pinned by seed 8686.
    assert round(NMI(corpus, topics), 5) == 0.16266
    expected = np.array([
        [18, 12, 45, 7, 294, 21, 71, 26, 6, 28, 92, 115, 86, 81, 4, 84],
        [162, 105, 58, 73, 2, 39, 79, 52, 44, 81, 36, 53, 54, 6, 130, 13],
    ])
    assert (confusion_matrix(corpus, topics).todense() == expected).all()
示例#11
0
def test_LDAGS():
    """Run Mallet LDA (Gibbs sampling) on two 20newsgroups categories."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    corpus = get_20newsgroups(
        "LDAGS_20news", categories=["sci.space", "rec.autos"]
    )

    model = LDAGS(2, mallet_args={"optimize-interval": 20})
    model.fit_transform(corpus, "test_LDAGS")
示例#12
0
def test_PLSA():
    """Fit PLSA for a few iterations; check NMI and the confusion matrix."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    corpus = get_20newsgroups(
        "PLSA_20news", categories=["sci.space", "rec.autos"]
    )

    model = PLSA(2, max_iter=5)
    topics = model.fit_transform(corpus, "test_PLSA")

    # Regression values pinned by seed 8686.
    assert round(NMI(corpus, topics), 5) == 0.00032
    expected = np.array([[521, 469], [540, 447]])
    assert (confusion_matrix(corpus, topics).todense() == expected).all()

    return corpus, topics
示例#13
0
def test_simple_NMF():
    """Fit NMF on a toy labeled corpus; clustering should be perfect."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")

    documents = ["A A B C D", "B B B A C", "E F E"]
    tags = ["1", "1", "2"]

    corpus = data.LabeledDataset.from_iterator(
        "NMF_simple_data", zip(tags, documents)
    )
    # Whitespace tokenizer: documents are plain space-separated symbols.
    corpus.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    model = NMF(2)
    topics, (n_iter, error) = model.fit_transform(corpus, "test_simple_NMF")

    # Perfect recovery of the two hand-built classes.
    assert round(NMI(corpus, topics), 5) == 1
    expected = np.array([[2, 0], [0, 1]])
    assert (confusion_matrix(corpus, topics).todense() == expected).all()
示例#14
0
def test_simple_TM():
    """Fit TopicMapping on a toy labeled corpus; clustering should be perfect."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    documents = ["A A B C D", "B B B A C", "E F E"]
    tags = ["1", "1", "2"]

    corpus = data.LabeledDataset.from_iterator(
        "TM_simple_data", zip(tags, documents)
    )
    # Whitespace tokenizer: documents are plain space-separated symbols.
    corpus.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    model = TopicMapping(p=1)
    topics, (n,) = model.fit_transform(corpus, "test_simple_TM")

    # Perfect recovery of the two hand-built classes.
    assert round(NMI(corpus, topics), 5) == 1.0
    expected = np.array([[2, 0], [0, 1]])
    assert (confusion_matrix(corpus, topics).todense() == expected).all()
示例#15
0
def test_UMAP_HDBSCAN():
    """Cluster 20newsgroups documents with UMAP + HDBSCAN and check NMI."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    corpus = get_20newsgroups(
        "UH_20news", categories=["sci.space", "rec.autos"]
    )

    umap_params = {
        "n_components": 2,
        "n_neighbors": 15,
        "min_dist": 0.1,
        "metric": "hellinger",
    }
    pipeline = UMAP_HDBSCAN(u_args=umap_params)
    topics = pipeline.fit_transform(corpus, "test_simple_UMAP_HDBSCAN")

    # Regression value pinned by seed 8686.
    assert round(NMI(corpus, topics), 5) == 0.24392

    return corpus, topics
示例#16
0
def test_simple_LDAGS():
    """Run Mallet LDA (Gibbs sampling) on a toy labeled corpus."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    documents = ["A A B C D", "B B B A C", "E F E"]
    tags = ["1", "1", "2"]

    corpus = data.LabeledDataset.from_iterator(
        "LDAGS_simple_data", zip(tags, documents)
    )
    # Whitespace tokenizer: documents are plain space-separated symbols.
    corpus.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    model = LDAGS(
        2,
        memory="512M",
        mallet_args={"num-iterations": 1000, "optimize-interval": 20},
    )
    model.fit_transform(corpus, "test_simple_LDAGS")
示例#17
0
def test_from_iterator():
    """Round-trip Dataset and LabeledDataset through from_iterator/load."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")

    labels = ["Animals", "Animals", "Maths", "Maths"]
    docs = [
        "Dinosaurs are reptiles. Birds descend from dinosaurs. Even if most of the dinosaurs don't fly, probably the majority of them was covered by feathers.",
        "Birds lay eggs, like reptiles and fishes. Most of the birds fly, even if penguins, ostriches and some others are unable to fly. Birds have two wings, like some dinosaurs and others ancient reptiles.",
        "Geometry studies shapes and entities in the space. A geometrician proves theorem about the relation among two or more geometrical entities. Continuity is a geometric concept widely used in calculus.",
        "Linear algebra studies matrices, vectors and vectorial spaces. Sometimes linear algebra is considered a subfield of geometry. Many theorem regarding matrices exists but one of the most important is the spectral one.",
    ]

    data.Dataset.from_iterator("test_from_iterator", docs)
    # Loading by directory and by the serialized JSON must resolve to the
    # same dataset path.
    by_dir = data.Dataset.load(".test/test/test_from_iterator")
    by_json = data.Dataset.load(
        ".test/test/test_from_iterator/test_from_iterator.json")
    assert by_dir.path == by_json.path

    reloaded = data.Dataset.load(".test/test/test_from_iterator")
    decoded = [x.item().decode("utf-8") for x in list(reloaded.get_corpus())]
    assert decoded == docs

    data.LabeledDataset.from_iterator("labeled_test_from_iterator",
                                      zip(labels, docs))
    labeled_by_dir = data.LabeledDataset.load(
        ".test/test/labeled_test_from_iterator")
    labeled_by_json = data.LabeledDataset.load(
        ".test/test/labeled_test_from_iterator/labeled_test_from_iterator.json"
    )
    assert labeled_by_dir.path == labeled_by_json.path

    relabeled = data.LabeledDataset.load(
        ".test/test/labeled_test_from_iterator")
    assert [x.item() for x in list(relabeled.get_labels())] == labels
示例#18
0
import gc

import dask
from dask.distributed import Client

import adso
from adso.corpora import get_20newsgroups

if __name__ == "__main__":

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dask.config.set({"temporary_directory": str(adso.common.ADSODIR / "dask")})
    client = Client()

    gc.set_threshold(50, 10, 10)

    adso.data.common.nltk_download("punkt")

    def my_tokenizer(doc):
        return list(
            filter(
                lambda s: s.isalpha() and len(s) >= 3,
                adso.data.common.tokenize_and_stem(doc),
            )
        )

    try:
        dataset = adso.data.LabeledDataset.load(".test/test/20news")