# These snippets assume spaCy test-suite-style imports in scope, e.g.:
#   import numpy
#   import pytest
#   from numpy.testing import assert_equal
#   from spacy.vectors import Vectors
#   from spacy.vocab import Vocab
#   from spacy.strings import StringStore, hash_string
#   from thinc.api import get_current_ops
#   OPS = get_current_ops()

def test_issue3412():
    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
    vectors = Vectors(data=data, keys=["A", "B", "C"])
    keys, best_rows, scores = vectors.most_similar(
        numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
    )
    assert best_rows[0] == 2

def __init__(
    self,
    shape: tuple = (1000, 128),
    strings: StringStore = None,
    senses: List[str] = [],
    vectors_name: str = "sense2vec",
    overrides: Dict[str, str] = SimpleFrozenDict(),
):
    """Initialize the Sense2Vec object.

    shape (tuple): The vector shape.
    strings (StringStore): Optional string store. Will be created if it
        doesn't exist.
    senses (list): Optional list of all available senses. Used in methods
        that generate the best sense or other senses.
    vectors_name (unicode): Optional name to assign to the Vectors object.
    overrides (dict): Optional custom functions to use, mapped to names
        registered via the registry, e.g. {"make_key": "custom_make_key"}.
    RETURNS (Sense2Vec): The newly constructed object.
    """
    self.vectors = Vectors(shape=shape, name=vectors_name)
    self._row2key = None
    self.strings = StringStore() if strings is None else strings
    self.freqs: Dict[int, int] = {}
    self.cache = None
    self.cfg: Dict[str, Any] = {
        "senses": senses,
        "make_key": "default",
        "split_key": "default",
    }
    self.cfg.update(overrides)

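# A minimal construction sketch for the initializer above. It assumes the
# enclosing Sense2Vec class from the sense2vec package is importable; the
# sense labels are illustrative placeholders, not a fixed inventory.
s2v = Sense2Vec(shape=(1000, 128), senses=["NOUN", "VERB"])
assert s2v.vectors.shape == (1000, 128)
assert s2v.cfg["senses"] == ["NOUN", "VERB"]
assert s2v.cfg["make_key"] == "default"
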
def test_vectors_most_similar(most_similar_vectors_data, most_similar_vectors_keys):
    v = Vectors(data=most_similar_vectors_data, keys=most_similar_vectors_keys)
    _, best_rows, _ = v.most_similar(v.data, batch_size=2, n=2, sort=True)
    assert all(row[0] == i for i, row in enumerate(best_rows))

    with pytest.raises(ValueError):
        v.most_similar(v.data, batch_size=2, n=10, sort=True)

def add_vectors(nlp, vectors_loc, name=None):
    vectors_loc = Path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
        for lex in nlp.vocab:
            if lex.rank:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
            vectors_data, vector_keys = read_vectors(vectors_loc)
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
            for word in vector_keys:
                if word not in nlp.vocab:
                    lexeme = nlp.vocab[word]
                    lexeme.is_oov = False
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
    else:
        nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    return nlp

def get_fasttext(bin_file, txt_file, fasttext_file, size):
    print("Reading data...")
    fst = MyVec(fasttext_file)
    fst_short_k = fst.keys(size)
    fst_short_v = fst.vectors(size)
    # save bin vectors
    print("Saving bin version...")
    fst_spacy = Vectors(data=fst_short_v, keys=fst_short_k)
    fst_spacy.to_disk(bin_file)
    s = get_file_size(os.path.join(bin_file, 'vectors'))
    print("Chosen fasttexts in binary format weigh {} MB".format(round(s)))
    # save txt vectors
    print("Saving txt version...")
    chosen_lines = fst.get_first_n(size)
    with open(txt_file, 'wb') as f:
        f.write(bytes("{} {}\n".format(size, fst.nr_dim), 'utf-8'))
        f.writelines(chosen_lines)
    s = get_file_size(txt_file)
    print("Chosen fasttexts in txt format weigh {} MB".format(round(s)))

def test_get_vector(strings, data):
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0])
    assert list(v[strings[0]]) != list(data[1])
    assert list(v[strings[1]]) != list(data[0])

def test_vectors_get_batch():
    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    # check with mixed int/str keys
    words = ["C", "B", "A", v.strings["B"]]
    rows = v.find(keys=words)
    vecs = OPS.as_contig(v.data[rows])
    assert_equal(OPS.to_numpy(vecs), OPS.to_numpy(v.get_batch(words)))

def test_init_vectors_unset():
    v = Vectors(shape=(10, 10))
    assert v.is_full is False
    assert v.shape == (10, 10)

    with pytest.raises(ValueError):
        v = Vectors(shape=(10, 10), mode="floret")

    v = Vectors(data=OPS.xp.zeros((10, 10)), mode="floret", hash_count=1)
    assert v.is_full is True

def test_set_vector(strings, data):
    orig = data.copy()
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(orig[0])
    assert list(v[strings[0]]) != list(orig[1])
    v[strings[0]] = data[1]
    assert list(v[strings[0]]) == list(orig[1])
    assert list(v[strings[0]]) != list(orig[0])

def test_vectors_clear():
    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    assert v.is_full is True
    assert hash_string("A") in v
    v.clear()
    # no keys
    assert v.key2row == {}
    assert list(v) == []
    assert v.is_full is False
    assert "A" not in v
    with pytest.raises(KeyError):
        v["A"]

def create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                 expand_vectors, prune_vectors, vectors_name):
    print("Creating model...")
    lang_class = get_lang_class(lang)
    nlp = lang_class()
    for lexeme in nlp.vocab:
        lexeme.rank = 0
    lex_added = 0
    for i, (word, prob) in enumerate(
        tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))
    ):
        lexeme = nlp.vocab[word]
        lexeme.rank = i
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = 0
        lex_added += 1
    nlp.vocab.cfg.update({'oov_prob': oov_prob})
    if vector_keys is not None:
        new_keys = []
        new_indices = []
        for i, word in enumerate(vector_keys):
            if word not in nlp.vocab and expand_vectors:
                lexeme = nlp.vocab[word]
                lexeme.is_oov = False
                lex_added += 1
            elif word in nlp.vocab and not expand_vectors:
                new_keys.append(word)
                new_indices.append(i)
        if len(vectors_data):
            if expand_vectors:
                nlp.vocab.vectors = Vectors(
                    data=vectors_data, keys=vector_keys, name=vectors_name
                )
            else:
                nlp.vocab.vectors = Vectors(
                    data=vectors_data[new_indices], keys=new_keys, name=vectors_name
                )
            nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune_vectors >= 1:
        nlp.vocab.prune_vectors(prune_vectors)
    vec_added = len(nlp.vocab.vectors)
    msg.good(
        "Successfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    return nlp

def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
    """Load a Sense2Vec object from a bytestring.

    bytes_data (bytes): The data to load.
    exclude (list): Names of serialization fields to exclude.
    RETURNS (Sense2Vec): The loaded object.
    """
    data = srsly.msgpack_loads(bytes_data)
    self.vectors = Vectors().from_bytes(data["vectors"])
    self.freqs = dict(data.get("freqs", []))
    self.cfg.update(data.get("cfg", {}))
    if "strings" not in exclude and "strings" in data:
        self.strings = StringStore().from_bytes(data["strings"])
    return self

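# Hedged round-trip sketch for from_bytes(). It assumes Sense2Vec also exposes
# a matching to_bytes() serializer, and reuses the s2v object constructed in
# the sketch near the top of this section.
data = s2v.to_bytes()
s2v_restored = Sense2Vec().from_bytes(data)
assert s2v_restored.cfg["senses"] == s2v.cfg["senses"]
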
def test_vectors_deduplicate():
    data = OPS.asarray([[1, 1], [2, 2], [3, 4], [1, 1], [3, 4]], dtype="f")
    v = Vectors(data=data, keys=["a1", "b1", "c1", "a2", "c2"])
    vocab = Vocab()
    vocab.vectors = v
    # duplicate vectors do not use the same keys
    assert vocab.vectors.key2row[v.strings["a1"]] != vocab.vectors.key2row[v.strings["a2"]]
    assert vocab.vectors.key2row[v.strings["c1"]] != vocab.vectors.key2row[v.strings["c2"]]
    vocab.deduplicate_vectors()
    # there are three unique vectors
    assert vocab.vectors.shape[0] == 3
    # the uniqued data is the same as the deduplicated data
    assert_equal(
        numpy.unique(OPS.to_numpy(vocab.vectors.data), axis=0),
        OPS.to_numpy(vocab.vectors.data),
    )
    # duplicate vectors use the same keys now
    assert vocab.vectors.key2row[v.strings["a1"]] == vocab.vectors.key2row[v.strings["a2"]]
    assert vocab.vectors.key2row[v.strings["c1"]] == vocab.vectors.key2row[v.strings["c2"]]
    # deduplicating again makes no changes
    vocab_b = vocab.to_bytes()
    vocab.deduplicate_vectors()
    assert vocab_b == vocab.to_bytes()

class VectorDictionary:
    tokens = None

    def __init__(self, wordlist):
        tokens = nlp(" ".join(wordlist))
        self.reference = {}
        self.tokens = tokens
        self.vectors = Vectors(shape=(len(wordlist), 300))
        for token in tokens:
            if token.has_vector:
                idx = nlp.vocab.strings[token.text]
                self.reference[idx] = token.text
                self.vectors.add(idx, vector=token.vector)

    def print(self):
        for key in self.vectors.keys():
            print(self.reference[key], self.vectors[key])

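# Usage sketch for VectorDictionary. It assumes `nlp` is a loaded spaCy
# pipeline with word vectors (e.g. en_core_web_md); which words actually
# receive vectors depends entirely on that model.
vd = VectorDictionary(["apple", "banana", "cherry"])
vd.print()  # prints each stored word alongside its vector
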
def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
    """Load a Sense2Vec object from a directory.

    path (unicode / Path): The path to load from.
    exclude (list): Names of serialization fields to exclude.
    RETURNS (Sense2Vec): The loaded object.
    """
    path = Path(path)
    strings_path = path / "strings.json"
    freqs_path = path / "freqs.json"
    self.vectors = Vectors().from_disk(path)
    self.cfg.update(srsly.read_json(path / "cfg"))
    if freqs_path.exists():
        self.freqs = dict(srsly.read_json(freqs_path))
    if "strings" not in exclude and strings_path.exists():
        self.strings = StringStore().from_disk(strings_path)
    return self

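# Disk round-trip sketch for from_disk(), again assuming a matching to_disk()
# counterpart exists; "/tmp/s2v" is a placeholder directory.
s2v.to_disk("/tmp/s2v")
s2v_from_disk = Sense2Vec().from_disk("/tmp/s2v")
assert len(s2v_from_disk.vectors) == len(s2v.vectors)
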
def test_pickle_vocab(strings, lex_attr):
    vocab = Vocab(strings=strings)
    ops = get_current_ops()
    vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
    vocab.vectors = vectors
    vocab[strings[0]].norm_ = lex_attr
    vocab_pickled = pickle.dumps(vocab)
    vocab_unpickled = pickle.loads(vocab_pickled)
    assert vocab.to_bytes() == vocab_unpickled.to_bytes()
    assert vocab_unpickled.vectors.mode == "floret"

def get_nlp(model="en", embeddings_path=None):
    import spacy

    if embeddings_path not in nlp_objects:
        if embeddings_path is None:
            nlp_ = spacy.load(model)
        else:
            if embeddings_path.endswith(".bin"):
                nlp_ = spacy.load(model, vectors=False)
                nlp_.vocab.load_vectors_from_bin_loc(embeddings_path)
            elif os.path.isdir(embeddings_path):
                from spacy.vectors import Vectors

                vectors = Vectors()
                vectors = vectors.from_disk(embeddings_path)
                nlp_ = spacy.load(model, vectors=False)
                nlp_.vocab.vectors = vectors
            else:
                nlp_ = spacy.load(model, vectors=embeddings_path)
        nlp_objects[embeddings_path] = nlp_
    return nlp_objects[embeddings_path]

class VocabBuilder(object):
    def __init__(self, rootDir='.cache', vectorPath='vectors', tokenizerPath='tokenizer'):
        self.vectorPath = Path.cwd() / rootDir / vectorPath
        self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
        self.tokenizer = Tokenizer(Vocab())
        self.vectors = Vectors(shape=(41299, 300))

    def _countWords(self, sequences, tokenizer):
        self.tokenCounts = Counter()
        for seq in sequences:
            tokens = tokenizer(seq)
            for t in tokens:
                self.tokenCounts[t.text] += 1

    def fromDisk(self):
        self.tokenizer.from_disk(self.tokenizerPath)
        self.vectors.from_disk(self.vectorPath)

    def learnVocab(self, sequences, tokenizer, vectors, padToken='<pad>'):
        nlp = English()
        self._countWords(sequences, tokenizer=tokenizer)
        nlp.vocab = Vocab()
        nlp.vocab.set_vector(padToken, np.zeros(vectors.data.shape[1]))
        for word in self.tokenCounts:
            idx = tokenizer(word)[0].lex_id
            nlp.vocab.set_vector(word, vectors.data[idx])
        self.tokenizer = Tokenizer(
            nlp.vocab,
            rules={padToken: [{ORTH: padToken}]},
            prefix_search=nlp.tokenizer.prefix_search,
            suffix_search=nlp.tokenizer.suffix_search,
            token_match=nlp.tokenizer.token_match,
            infix_finditer=nlp.tokenizer.infix_finditer,
        )
        self.vectors = nlp.vocab.vectors

    def toDisk(self, tokenizerPath=None, vectorPath=None):
        self.tokenizer.to_disk(tokenizerPath or self.tokenizerPath)
        self.vectors.to_disk(vectorPath or self.vectorPath)

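# Usage sketch for VocabBuilder. It assumes `texts` is an iterable of raw
# strings and that an `nlp` pipeline with pretrained vectors is in scope,
# whose lexeme ids (lex_id) line up with rows of nlp.vocab.vectors.data.
builder = VocabBuilder(rootDir=".cache")
builder.learnVocab(texts, tokenizer=nlp.tokenizer, vectors=nlp.vocab.vectors)
builder.toDisk()  # writes the tokenizer and vectors under .cache/
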
def test_issue1727():
    """Test that models with no pretrained vectors can be deserialized
    correctly after vectors are added."""
    data = numpy.ones((3, 300), dtype="f")
    vectors = Vectors(data=data, keys=["I", "am", "Matt"])
    tagger = Tagger(Vocab())
    tagger.add_label("PRP")
    tagger.begin_training()
    assert tagger.cfg.get("pretrained_dims", 0) == 0
    tagger.vocab.vectors = vectors
    with make_tempdir() as path:
        tagger.to_disk(path)
        tagger = Tagger(Vocab()).from_disk(path)
        assert tagger.cfg.get("pretrained_dims", 0) == 0

def test_vectors_most_similar_identical():
    """Test that most similar identical vectors are assigned a score of 1.0."""
    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    keys, _, scores = v.most_similar(numpy.asarray([[4, 2, 2, 2]], dtype="f"))
    assert scores[0][0] == 1.0  # not 1.0000002
    data = numpy.asarray([[1, 2, 3], [1, 2, 3], [1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    keys, _, scores = v.most_similar(numpy.asarray([[1, 2, 3]], dtype="f"))
    assert scores[0][0] == 1.0  # not 0.9999999

def test_get_vector_resize(strings, data, resize_data):
    v = Vectors(data=data)
    v.resize(shape=resize_data.shape)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(resize_data[0])
    assert list(v[strings[0]]) != list(resize_data[1])
    assert list(v[strings[1]]) != list(resize_data[0])
    assert list(v[strings[1]]) == list(resize_data[1])

def add_vectors(nlp, vectors_loc):
    with open(vectors_loc, "r") as f:
        shape = tuple(int(size) for size in next(f).split())
        vectors_data = numpy.zeros(shape=shape, dtype="f")
        vectors_keys = []
        for i, line in enumerate(tqdm(f)):
            line = line.rstrip()
            pieces = line.rsplit(" ", vectors_data.shape[1])
            word = pieces.pop(0)
            if len(pieces) != vectors_data.shape[1]:
                raise ValueError("invalid vectors format")
            vectors_data[i] = numpy.asarray(pieces, dtype="f")
            vectors_keys.append(word)
    for word in vectors_keys:
        if word not in nlp.vocab:
            lexeme = nlp.vocab[word]
            lexeme.is_oov = False
    nlp.vocab.vectors = Vectors(data=vectors_data, keys=vectors_keys)

def test_issue1539():
    """Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
    v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
    v.resize((100, 100))

def test_issue1518():
    """Test vectors.resize() works."""
    vectors = Vectors(shape=(10, 10))
    vectors.add("hello", row=2)
    vectors.resize((5, 9))

def test_init_vectors_with_resize_shape(strings, resize_data):
    v = Vectors(shape=(len(strings), 3))
    v.resize(shape=resize_data.shape)
    assert v.shape == resize_data.shape
    assert v.shape != (len(strings), 3)

def test_vectors_serialize():
    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    b = v.to_bytes()
    v_r = Vectors()
    v_r.from_bytes(b)
    assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
    assert v.key2row == v_r.key2row
    v.resize((5, 4))
    v_r.resize((5, 4))
    row = v.add("D", vector=OPS.asarray([1, 2, 3, 4], dtype="f"))
    row_r = v_r.add("D", vector=OPS.asarray([1, 2, 3, 4], dtype="f"))
    assert row == row_r
    assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
    assert v.is_full == v_r.is_full
    with make_tempdir() as d:
        v.to_disk(d)
        v_r.from_disk(d)
        assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
        assert v.key2row == v_r.key2row
        v.resize((5, 4))
        v_r.resize((5, 4))
        row = v.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
        row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
        assert row == row_r
        assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))

def test_init_vectors_with_resize_data(data, resize_data):
    v = Vectors(data=data)
    v.resize(shape=resize_data.shape)
    assert v.shape == resize_data.shape
    assert v.shape != data.shape

def test_init_vectors_with_shape(strings):
    v = Vectors(shape=(len(strings), 3))
    assert v.shape == (len(strings), 3)
    assert v.is_full is False

def test_init_vectors_with_data(strings, data):
    v = Vectors(data=data)
    assert v.shape == data.shape

def test_get_vector_resize(strings, data):
    strings = [hash_string(s) for s in strings]

    # decrease vector dimension (truncate)
    v = Vectors(data=data)
    resized_dim = v.shape[1] - 1
    v.resize(shape=(v.shape[0], resized_dim))
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0, :resized_dim])
    assert list(v[strings[1]]) == list(data[1, :resized_dim])

    # increase vector dimension (pad with zeros)
    v = Vectors(data=data)
    resized_dim = v.shape[1] + 1
    v.resize(shape=(v.shape[0], resized_dim))
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0]) + [0]
    assert list(v[strings[1]]) == list(data[1]) + [0]