Example #1
import numpy as np
from spacy.vocab import Vocab
from spacy.language import Language
from whatlies.language import SpacyLanguage


def lang():
    # Random 2-d vectors; "cat" and "dog" are shifted so they cluster apart from the colors.
    vector_data = {k: np.random.normal(0, 1, (2,)) for k in ["red", "blue", "cat", "dog", "green", "purple"]}
    vector_data["cat"] += 10
    vector_data["dog"] += 10
    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
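
# A short usage sketch: as in the later examples, indexing a whatlies
# language with a list of tokens returns an EmbeddingSet.
emb = lang()[["cat", "dog", "red"]]
assert len(emb) == 3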
Example #2
import numpy as np
from spacy.vocab import Vocab
from spacy.language import Language
from whatlies.language import SpacyLanguage


def color_lang():
    vector_data = {
        "red": np.array([1.0, 0.0]),
        "green": np.array([0.5, 0.5]),
        "blue": np.array([0.0, 1.0]),
        "purple": np.array([0.0, 1.0]),
    }

    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
Example #3
from whatlies.language import SpacyLanguage


def embset():
    # Requires spaCy's medium English model (python -m spacy download en_core_web_md).
    lang = SpacyLanguage("en_core_web_md")
    names = [
        "red",
        "blue",
        "green",
        "yellow",
        "cat",
        "dog",
        "mouse",
        "rat",
        "bike",
        "car",
    ]
    return lang[names]
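
# A follow-up sketch (assuming whatlies' EmbeddingSet API used elsewhere in
# these examples): reduce the 300-d en_core_web_md vectors with PCA and draw
# an interactive plot.
from whatlies.transformers import Pca

embset().transform(Pca(2)).plot_interactive("red", "blue")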
Example #4
import pytest
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from whatlies.language import (
    FasttextLanguage,
    SpacyLanguage,
    GensimLanguage,
    BytePairLanguage,
    TFHubLanguage,
    HFTransformersLanguage,
)

backends = [
    SpacyLanguage("en_core_web_sm"),
    FasttextLanguage("tests/custom_fasttext_model.bin"),
    BytePairLanguage("en", vs=1000, dim=25, cache_dir="tests/cache"),
    GensimLanguage("tests/cache/custom_gensim_vectors.kv"),
    HFTransformersLanguage("sshleifer/tiny-gpt2", framework="tf"),
    TFHubLanguage("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"),
]


@pytest.mark.parametrize("lang", backends)
def test_sklearn_pipeline_works(lang):
    pipe = Pipeline([("embed", lang), ("model", LogisticRegression())])

    X = [
        "i really like this post",
        "thanks for that comment",
        "i dislike this article",
        "this is a bad post",
    ]
    # Truncated in the source from this point on; the labels, fit and shape
    # check below are an assumed minimal completion.
    y = np.array([1, 1, 0, 0])
    pipe.fit(X, y)
    assert pipe.predict(X).shape[0] == len(X)
Example #5
import pytest
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from whatlies.language import (
    FasttextLanguage,
    CountVectorLanguage,
    SpacyLanguage,
    GensimLanguage,
    BytePairLanguage,
    TFHubLanguage,
    ConveRTLanguage,
    HFTransformersLanguage,
)


backends = [
    SpacyLanguage("tests/custom_test_lang/"),
    FasttextLanguage("tests/custom_fasttext_model.bin"),
    CountVectorLanguage(n_components=10),
    BytePairLanguage("en"),
    GensimLanguage("tests/cache/custom_gensim_vectors.kv"),
    ConveRTLanguage(),
    HFTransformersLanguage("sshleifer/tiny-gpt2", framework="tf"),
    TFHubLanguage("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"),
]


@pytest.mark.parametrize("lang", backends)
def test_sklearn_pipeline_works(lang):
    pipe = Pipeline([("embed", lang), ("model", LogisticRegression())])

    X = [
        "i really like this post",
        "thanks for that comment",
        "i dislike this article",
        "this is a bad post",
    ]
    # Truncated in the source from this point on; an assumed minimal completion:
    y = np.array([1, 1, 0, 0])
    pipe.fit(X, y)
    assert pipe.predict(X).shape[0] == len(X)
Example #6
from whatlies.language import SpacyLanguage


# `test_fn` is supplied by a pytest parametrization, typically wrapping
# scikit-learn's estimator checks.
def test_estimator_checks(test_fn):
    test_fn("spacy_lang", SpacyLanguage("tests/custom_test_lang/"))
Example #7
from whatlies.language import SpacyLanguage


def test_get_params():
    assert "nlp" in SpacyLanguage("tests/custom_test_lang/").get_params().keys()
Example #8
import pytest
from spacy.vocab import Vocab
from spacy.language import Language
from whatlies.language import SpacyLanguage
from whatlies.transformers import Umap, Pca, Noise, AddRandom, Tsne, OpenTsne


vocab = Vocab().from_disk("tests/custom_test_vocab/")
words = list(vocab.strings)
lang = SpacyLanguage(nlp=Language(vocab=vocab, meta={"lang": "en"}))
emb = lang[words]

transformers = [
    Umap(2),
    Umap(3),
    Pca(2),
    Pca(3),
    Noise(0.1),
    Noise(0.01),
    AddRandom(n=4),
    AddRandom(n=1),
    lambda d: d | (d["man"] - d["woman"]),
    Tsne(2, n_iter=250),
    Tsne(3, n_iter=250),
    OpenTsne(2, n_iter=100),
]
extra_sizes = [2, 3, 2, 3, 0, 0, 4, 1, 0, 2, 3, 2]
tfm_ids = [_.__class__.__name__ for _ in transformers]


@pytest.mark.parametrize(
    "transformer,extra_size", zip(transformers, extra_sizes), ids=tfm_ids
)
def test_transformations_new_size(transformer, extra_size):
    # Truncated in the source after the decorator; the parameters and body
    # here mirror the analogous test in Example #10.
    emb_new = emb.transform(transformer)
Example #9
Synopsis : This tutorial pursues the following learning goals:

           1. charting the semantic relations involving sets of lexical items
           2. becoming familiar with 'whatlies', a dedicated library for the
              visualization of word embeddings

To do    : None

"""

# %% load libraries
from whatlies import EmbeddingSet
from whatlies.language import SpacyLanguage

# %% load a model of the language 'via' whatlies
lang = SpacyLanguage("en_core_web_lg")

# %% create a list of lexical items
"""
let's see how animals (actors) map onto qualities (attributes)
"""
# sample animals
animals = ["cat", "dog", "mouse"]
# sample qualities
qualities = ["responsive", "loyal"]
# set of lexical items
items = animals + qualities

# %% browse the loaded model of the language, retrieve the vectors,
#    and create and initialize an embedding set (a class specific
#    to the whatlies library)
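
# %% a minimal completion of this step (the snippet is truncated here in
#    the source): indexing the language with the list of items returns an
#    EmbeddingSet
emb = lang[items]

# %% chart the animals against the two qualities with an interactive
#    scatter plot
emb.plot_interactive(x_axis="responsive", y_axis="loyal")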
Example #10
import pytest
from spacy.vocab import Vocab
from spacy.language import Language
from whatlies.language import SpacyLanguage
from whatlies.transformers import Umap, Pca, Noise, AddRandom

vocab = Vocab().from_disk("tests/custom_test_vocab/")
words = [v.text for v in vocab]
lang = SpacyLanguage(model=Language(vocab=vocab, meta={"lang": "en"}))
emb = lang[words]


@pytest.mark.parametrize(
    "transformer,extra_size",
    zip(
        [
            Umap(2),
            Umap(3),
            Pca(2),
            Pca(3),
            Noise(0.1),
            Noise(0.01),
            AddRandom(n=4),
            AddRandom(n=1),
            lambda d: d | (d["man"] - d["woman"]),
        ],
        [2, 3, 2, 3, 0, 0, 4, 1, 0],
    ),
)
def test_transformations_new_size(transformer, extra_size):
    emb_new = emb.transform(transformer)
Example #11
    "dog",
    "cat",
    "mouse",
    "red",
    "bluee",
    "green",
    "yellow",
    "water",
    "person",
    "family",
    "brother",
    "sister",
]

# I'm loading in the spaCy model globally because it is much faster this way.
lang = SpacyLanguage("en_core_web_md")


@pytest.fixture
def embset():
    return lang[words]


def test_set_title_works(embset):
    ax = embset.plot_3d(annot=True, title="foobar")
    assert ax.title._text == "foobar"


def test_correct_points_plotted(embset):
    embset_plt = embset.transform(Pca(3))
    ax = embset_plt.plot_3d(annot=True)