def test_estimator_checks(test_fn):
    """Run a scikit-learn estimator check against the fasttext backend.

    `test_fn` is a check callable (name, estimator) supplied by the
    sklearn estimator-checks machinery.
    """
    estimator = FasttextLanguage("tests/custom_fasttext_model.bin")
    test_fn("spacy_lang", estimator)
from sklearn.pipeline import FeatureUnion from sklearn.linear_model import LogisticRegression from sklearn.feature_extraction.text import CountVectorizer from whatlies.language import ( FasttextLanguage, SpacyLanguage, GensimLanguage, BytePairLanguage, TFHubLanguage, HFTransformersLanguage, ) backends = [ SpacyLanguage("en_core_web_sm"), FasttextLanguage("tests/custom_fasttext_model.bin"), BytePairLanguage("en", vs=1000, dim=25, cache_dir="tests/cache"), GensimLanguage("tests/cache/custom_gensim_vectors.kv"), HFTransformersLanguage("sshleifer/tiny-gpt2", framework="tf"), TFHubLanguage("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"), ] @pytest.mark.parametrize("lang", backends) def test_sklearn_pipeline_works(lang): pipe = Pipeline([("embed", lang), ("model", LogisticRegression())]) X = [ "i really like this post", "thanks for that comment", "i enjoy this friendly forum",
def lang():
    """Build a FasttextLanguage from the repo's test fixture model.

    NOTE(review): this looks like a pytest fixture; the decorator may have
    been lost in formatting — confirm against the original file.
    """
    model_path = "tests/custom_fasttext_model.bin"
    return FasttextLanguage(model_path)
def test_raise_warning():
    """Asking for many more neighbours than exist should emit a UserWarning."""
    with pytest.warns(UserWarning):
        language = FasttextLanguage(model1)
        language.score_similar("cat", 1000)
def test_retreive_similar_len():
    """score_similar yields n items, capped at the vocabulary size (91)."""
    cases = [
        (model1, 20, 20),
        (model2, 10, 10),
        (model1, 1000, 91),
        (model2, 1000, 91),
    ]
    for model_file, n_requested, n_expected in cases:
        scored = FasttextLanguage(model_file).score_similar("cat", n_requested)
        assert len(scored) == n_expected
def test_load_in_model2():
    """Vectors from the model2 embedding should be 10-dimensional."""
    vector = FasttextLanguage(model2)["dog"].vector
    assert vector.shape[0] == 10
def test_load_in_model1():
    """Vectors from the model1 embedding should be 20-dimensional."""
    vector = FasttextLanguage(model1)["dog"].vector
    assert vector.shape[0] == 20