def lang():
    vector_data = {
        k: np.random.normal(0, 1, (2,))
        for k in ["red", "blue", "cat", "dog", "green", "purple"]
    }
    # Shift the animal vectors away from the colour vectors.
    vector_data["cat"] += 10
    vector_data["dog"] += 10
    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
def color_lang():
    vector_data = {
        "red": np.array([1.0, 0.0]),
        "green": np.array([0.5, 0.5]),
        "blue": np.array([0.0, 1.0]),
        "purple": np.array([0.0, 1.0]),
    }
    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
def embset():
    lang = SpacyLanguage("en_core_web_md")
    names = [
        "red",
        "blue",
        "green",
        "yellow",
        "cat",
        "dog",
        "mouse",
        "rat",
        "bike",
        "car",
    ]
    return lang[names]
import pytest

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from whatlies.language import (
    FasttextLanguage,
    SpacyLanguage,
    GensimLanguage,
    BytePairLanguage,
    TFHubLanguage,
    HFTransformersLanguage,
)

backends = [
    SpacyLanguage("en_core_web_sm"),
    FasttextLanguage("tests/custom_fasttext_model.bin"),
    BytePairLanguage("en", vs=1000, dim=25, cache_dir="tests/cache"),
    GensimLanguage("tests/cache/custom_gensim_vectors.kv"),
    HFTransformersLanguage("sshleifer/tiny-gpt2", framework="tf"),
    TFHubLanguage("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"),
]


@pytest.mark.parametrize("lang", backends)
def test_sklearn_pipeline_works(lang):
    pipe = Pipeline([("embed", lang), ("model", LogisticRegression())])
    X = [
        "i really like this post",
        "thanks for that comment",
import pytest

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from whatlies.language import (
    FasttextLanguage,
    CountVectorLanguage,
    SpacyLanguage,
    GensimLanguage,
    BytePairLanguage,
    TFHubLanguage,
    ConveRTLanguage,
    HFTransformersLanguage,
)

backends = [
    SpacyLanguage("tests/custom_test_lang/"),
    FasttextLanguage("tests/custom_fasttext_model.bin"),
    CountVectorLanguage(n_components=10),
    BytePairLanguage("en"),
    GensimLanguage("tests/cache/custom_gensim_vectors.kv"),
    ConveRTLanguage(),
    HFTransformersLanguage("sshleifer/tiny-gpt2", framework="tf"),
    TFHubLanguage("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"),
]


@pytest.mark.parametrize("lang", backends)
def test_sklearn_pipeline_works(lang):
    pipe = Pipeline([("embed", lang), ("model", LogisticRegression())])
    X = [
def test_estimator_checks(test_fn):
    test_fn("spacy_lang", SpacyLanguage("tests/custom_test_lang/"))
def test_get_params():
    assert "nlp" in SpacyLanguage("tests/custom_test_lang/").get_params().keys()
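# Hedged sketch, not part of the test files above or below: the pipeline,
# estimator-check, and get_params tests all rely on whatlies language
# backends implementing the scikit-learn estimator interface. That same
# interface lets a backend be combined with a classic sparse vectorizer
# through FeatureUnion (imported in the pipeline tests above). The exact
# backend choice and the `union_pipe` name here are illustrative
# assumptions, not taken from the source.
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from whatlies.language import BytePairLanguage

union_pipe = Pipeline(
    [
        (
            "features",
            FeatureUnion(
                [
                    ("embed", BytePairLanguage("en")),  # dense subword embeddings
                    ("counts", CountVectorizer()),  # sparse token counts
                ]
            ),
        ),
        ("model", LogisticRegression()),
    ]
)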
import pytest
from spacy.vocab import Vocab
from spacy.language import Language

from whatlies.language import SpacyLanguage
from whatlies.transformers import Umap, Pca, Noise, AddRandom, Tsne, OpenTsne

vocab = Vocab().from_disk("tests/custom_test_vocab/")
words = list(vocab.strings)
lang = SpacyLanguage(nlp=Language(vocab=vocab, meta={"lang": "en"}))
emb = lang[words]

transformers = [
    Umap(2),
    Umap(3),
    Pca(2),
    Pca(3),
    Noise(0.1),
    Noise(0.01),
    AddRandom(n=4),
    AddRandom(n=1),
    lambda d: d | (d["man"] - d["woman"]),
    Tsne(2, n_iter=250),
    Tsne(3, n_iter=250),
    OpenTsne(2, n_iter=100),
]
extra_sizes = [2, 3, 2, 3, 0, 0, 4, 1, 0, 2, 3, 2]
tfm_ids = [_.__class__.__name__ for _ in transformers]


@pytest.mark.parametrize(
Synopsis : This tutorial pursues the following learning goals:

1. charting the semantic relations involving sets of lexical items
2. familiarizing with 'whatlies,' a dedicated library for the
   visualization of word embeddings

To do : None
"""

# %% load libraries
from whatlies import EmbeddingSet
from whatlies.language import SpacyLanguage

# %% load a model of the language 'via' whatlies
lang = SpacyLanguage("en_core_web_lg")

# %% create a list of lexical items
"""
let's see how animals (actors) map onto qualities (attributes)
"""

# sample animals
animals = ["cat", "dog", "mouse"]

# sample qualities
qualities = ["responsive", "loyal"]

# set of lexical items
items = animals + qualities

# %% browse the loaded model of the language, retrieve the vectors,
# and create and initialize an embedding set (a class specific
# to the library whatlies)
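# A minimal sketch of the step the comment above describes, assuming the
# indexing API that whatlies language objects expose elsewhere in this
# repo (lang[items] returns an EmbeddingSet); the variable name `emb`
# is illustrative, not taken from the source.
emb = lang[items]

# EmbeddingSet supports interactive plotting; here the two sample
# qualities serve as the axes, so the animals can be compared on them.
emb.plot_interactive(x_axis="responsive", y_axis="loyal")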
import pytest
from spacy.vocab import Vocab
from spacy.language import Language

from whatlies.language import SpacyLanguage
from whatlies.transformers import Umap, Pca, Noise, AddRandom

vocab = Vocab().from_disk("tests/custom_test_vocab/")
words = [v.text for v in vocab]
lang = SpacyLanguage(model=Language(vocab=vocab, meta={"lang": "en"}))
emb = lang[words]


@pytest.mark.parametrize(
    "transformer,extra_size",
    zip(
        [
            Umap(2),
            Umap(3),
            Pca(2),
            Pca(3),
            Noise(0.1),
            Noise(0.01),
            AddRandom(n=4),
            AddRandom(n=1),
            lambda d: d | (d["man"] - d["woman"]),
        ],
        [2, 3, 2, 3, 0, 0, 4, 1, 0],
    ),
)
def test_transformations_new_size(transformer, extra_size):
    emb_new = emb.transform(transformer)
"dog", "cat", "mouse", "red", "bluee", "green", "yellow", "water", "person", "family", "brother", "sister", ] # I'm loading in the spaCy model globally because it is much faster this way. lang = SpacyLanguage("en_core_web_md") @pytest.fixture def embset(): return lang[words] def test_set_title_works(embset): ax = embset.plot_3d(annot=True, title="foobar") assert ax.title._text == "foobar" def test_correct_points_plotted(embset): embset_plt = embset.transform(Pca(3)) ax = embset_plt.plot_3d(annot=True)