Example #1
def test_issue3412():
    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
    vectors = Vectors(data=data, keys=["A", "B", "C"])
    keys, best_rows, scores = vectors.most_similar(
        numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
    )
    assert best_rows[0] == 2
Example #2
    def __init__(
            self,
            shape: tuple = (1000, 128),
            strings: StringStore = None,
            senses: List[str] = [],
            vectors_name: str = "sense2vec",
            overrides: Dict[str, str] = SimpleFrozenDict(),
    ):
        """Initialize the Sense2Vec object.

        shape (tuple): The vector shape.
        strings (StringStore): Optional string store. Will be created if it
            doesn't exist.
        senses (list): Optional list of all available senses. Used in methods
            that generate the best sense or other senses.
        vectors_name (unicode): Optional name to assign to the Vectors object.
        overrides (dict): Optional custom functions to use, mapped to names
            registered via the registry, e.g. {"make_key": "custom_make_key"}.
        RETURNS (Sense2Vec): The newly constructed object.
        """
        self.vectors = Vectors(shape=shape, name=vectors_name)
        self._row2key = None
        self.strings = StringStore() if strings is None else strings
        self.freqs: Dict[int, int] = {}
        self.cache = None
        self.cfg: Dict[str, Any] = {
            "senses": senses,
            "make_key": "default",
            "split_key": "default",
        }
        self.cfg.update(overrides)
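A minimal construction sketch for the initializer above (values are illustrative; it assumes the sense2vec package and its default registry functions are installed):

from sense2vec import Sense2Vec

# Create an empty table of 1000 vectors with 128 dimensions and declare the
# senses that the lookup helpers may consider.
s2v = Sense2Vec(shape=(1000, 128), senses=["NOUN", "VERB"])
assert s2v.cfg["senses"] == ["NOUN", "VERB"]
assert s2v.cfg["make_key"] == "default"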
Example #3
def test_vectors_most_similar(most_similar_vectors_data, most_similar_vectors_keys):
    v = Vectors(data=most_similar_vectors_data, keys=most_similar_vectors_keys)
    _, best_rows, _ = v.most_similar(v.data, batch_size=2, n=2, sort=True)
    assert all(row[0] == i for i, row in enumerate(best_rows))

    with pytest.raises(ValueError):
        v.most_similar(v.data, batch_size=2, n=10, sort=True)
Example #4
def add_vectors(nlp, vectors_loc, name=None):
    vectors_loc = Path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
        for lex in nlp.vocab:
            if lex.rank:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
            vectors_data, vector_keys = read_vectors(vectors_loc)
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
            for word in vector_keys:
                if word not in nlp.vocab:
                    lexeme = nlp.vocab[word]
                    lexeme.is_oov = False
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
    else:
        nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    return nlp
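A rough call sketch for the helper above (the .npz filename and vector name are placeholders; the function targets the spaCy v2-era API, e.g. it assigns to Lexeme.is_oov in the text branch):

import spacy

nlp = spacy.blank("en")
# Attach pre-computed vectors stored as a NumPy archive and give them an explicit name.
nlp = add_vectors(nlp, "my_vectors.npz", name="custom.vectors")
print(nlp.vocab.vectors.name)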
Example #5
def get_fasttext(
    bin_file,
    txt_file,
    fasttext_file,
    size,
):
    print("Reading data...")
    fst = MyVec(fasttext_file)
    fst_short_k = fst.keys(size)
    fst_short_v = fst.vectors(size)

    # save bin vectors
    print("Saving bin version...")
    fst_spacy = Vectors(data=fst_short_v, keys=fst_short_k)
    fst_spacy.to_disk(bin_file)
    s = get_file_size(os.path.join(bin_file, 'vectors'))
    print("Chosen fasttexts in binary format weigh {} MB".format(round(s)))

    # save txt vectors
    print("Saving txt version...")
    chosen_lines = fst.get_first_n(size)
    with open(txt_file, 'wb') as f:
        f.write(bytes("{} {}\n".format(size, fst.nr_dim), 'utf-8'))
        f.writelines(chosen_lines)
    s = get_file_size(txt_file)
    print("Chosen fasttexts in txt format weigh {} MB".format(round(s)))
Example #6
def test_get_vector(strings, data):
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0])
    assert list(v[strings[0]]) != list(data[1])
    assert list(v[strings[1]]) != list(data[0])
Example #7
    def __init__(self,
                 rootDir='.cache',
                 vectorPath='vectors',
                 tokenizerPath='tokenizer'):
        self.vectorPath = Path.cwd() / rootDir / vectorPath
        self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
        self.tokenizer = Tokenizer(Vocab())
        self.vectors = Vectors(shape=(41299, 300))
Example #8
def test_vectors_get_batch():
    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    # check with mixed int/str keys
    words = ["C", "B", "A", v.strings["B"]]
    rows = v.find(keys=words)
    vecs = OPS.as_contig(v.data[rows])
    assert_equal(OPS.to_numpy(vecs), OPS.to_numpy(v.get_batch(words)))
Example #9
def test_init_vectors_unset():
    v = Vectors(shape=(10, 10))
    assert v.is_full is False
    assert v.shape == (10, 10)

    with pytest.raises(ValueError):
        v = Vectors(shape=(10, 10), mode="floret")

    v = Vectors(data=OPS.xp.zeros((10, 10)), mode="floret", hash_count=1)
    assert v.is_full is True
Example #10
def test_set_vector(strings, data):
    orig = data.copy()
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(orig[0])
    assert list(v[strings[0]]) != list(orig[1])
    v[strings[0]] = data[1]
    assert list(v[strings[0]]) == list(orig[1])
    assert list(v[strings[0]]) != list(orig[0])
Example #11
	def __init__(self, wordlist):
		
		tokens = nlp(" ".join(wordlist))
		self.reference = {}
		self.tokens = tokens
		self.vectors = Vectors(shape=(len(wordlist), 300))
		for token in tokens:
			if token.has_vector:
				idx = nlp.vocab.strings[token.text]
				self.reference[idx] = token.text
				self.vectors.add(idx, vector=token.vector)
Example #12
def test_vectors_clear():
    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    assert v.is_full is True
    assert hash_string("A") in v
    v.clear()
    # no keys
    assert v.key2row == {}
    assert list(v) == []
    assert v.is_full is False
    assert "A" not in v
    with pytest.raises(KeyError):
        v["A"]
Example #13
def create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                 expand_vectors, prune_vectors, vectors_name):
    print("Creating model...")
    lang_class = get_lang_class(lang)
    nlp = lang_class()
    for lexeme in nlp.vocab:
        lexeme.rank = 0
    lex_added = 0
    for i, (word, prob) in enumerate(
            tqdm(sorted(probs.items(), key=lambda item: item[1],
                        reverse=True))):
        lexeme = nlp.vocab[word]
        lexeme.rank = i
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = 0
        lex_added += 1
    nlp.vocab.cfg.update({'oov_prob': oov_prob})
    if vector_keys is not None:
        new_keys = []
        new_indices = []
        for i, word in enumerate(vector_keys):
            if word not in nlp.vocab and expand_vectors:
                lexeme = nlp.vocab[word]
                lexeme.is_oov = False
                lex_added += 1
            elif word in nlp.vocab and not expand_vectors:
                new_keys.append(word)
                new_indices.append(i)

        if len(vectors_data):
            if expand_vectors:
                nlp.vocab.vectors = Vectors(data=vectors_data,
                                            keys=vector_keys,
                                            name=vectors_name)
            else:
                nlp.vocab.vectors = Vectors(data=vectors_data[new_indices],
                                            keys=new_keys,
                                            name=vectors_name)
            nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name

        if prune_vectors >= 1:
            nlp.vocab.prune_vectors(prune_vectors)
    vec_added = len(nlp.vocab.vectors)
    msg.good(
        "Successfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    return nlp
Example #14
    def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a bytestring.

        bytes_data (bytes): The data to load.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        data = srsly.msgpack_loads(bytes_data)
        self.vectors = Vectors().from_bytes(data["vectors"])
        self.freqs = dict(data.get("freqs", []))
        self.cfg.update(data.get("cfg", {}))
        if "strings" not in exclude and "strings" in data:
            self.strings = StringStore().from_bytes(data["strings"])
        return self
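A hedged round-trip sketch (it assumes the object also provides the matching to_bytes() serializer, as the sense2vec package does):

from sense2vec import Sense2Vec

s2v = Sense2Vec(shape=(10, 4), senses=["NOUN"])
blob = s2v.to_bytes()                    # vectors, cfg, freqs and strings as a msgpack bytestring
restored = Sense2Vec().from_bytes(blob)  # rebuild an equivalent object
assert restored.cfg["senses"] == ["NOUN"]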
Example #15
def test_vectors_deduplicate():
    data = OPS.asarray([[1, 1], [2, 2], [3, 4], [1, 1], [3, 4]], dtype="f")
    v = Vectors(data=data, keys=["a1", "b1", "c1", "a2", "c2"])
    vocab = Vocab()
    vocab.vectors = v
    # duplicate vectors do not use the same keys
    assert (vocab.vectors.key2row[v.strings["a1"]] !=
            vocab.vectors.key2row[v.strings["a2"]])
    assert (vocab.vectors.key2row[v.strings["c1"]] !=
            vocab.vectors.key2row[v.strings["c2"]])
    vocab.deduplicate_vectors()
    # there are three unique vectors
    assert vocab.vectors.shape[0] == 3
    # the uniqued data is the same as the deduplicated data
    assert_equal(
        numpy.unique(OPS.to_numpy(vocab.vectors.data), axis=0),
        OPS.to_numpy(vocab.vectors.data),
    )
    # duplicate vectors use the same keys now
    assert (vocab.vectors.key2row[v.strings["a1"]] == vocab.vectors.key2row[
        v.strings["a2"]])
    assert (vocab.vectors.key2row[v.strings["c1"]] == vocab.vectors.key2row[
        v.strings["c2"]])
    # deduplicating again makes no changes
    vocab_b = vocab.to_bytes()
    vocab.deduplicate_vectors()
    assert vocab_b == vocab.to_bytes()
Example #16
class VectorDictionary:
	tokens = None

	def __init__(self, wordlist):
		
		tokens = nlp(" ".join(wordlist))
		self.reference = {}
		self.tokens = tokens
		self.vectors = Vectors(shape=(len(wordlist), 300))
		for token in tokens:
			if token.has_vector:
				idx = nlp.vocab.strings[token.text]
				self.reference[idx] = token.text
				self.vectors.add(idx, vector=token.vector)
	def print(self):
		for key in self.vectors.keys():
		    print(self.reference[key], self.vectors[key])
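A rough usage sketch for the class above (it assumes a module-level spaCy pipeline with word vectors bound to the global name nlp, e.g. en_core_web_md, since __init__ refers to it):

import spacy

nlp = spacy.load("en_core_web_md")
vd = VectorDictionary(["cat", "dog", "banana"])
vd.print()  # prints each stored word next to its vector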
Example #17
    def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a directory.

        path (unicode / Path): The path to load from.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        path = Path(path)
        strings_path = path / "strings.json"
        freqs_path = path / "freqs.json"
        self.vectors = Vectors().from_disk(path)
        self.cfg.update(srsly.read_json(path / "cfg"))
        if freqs_path.exists():
            self.freqs = dict(srsly.read_json(freqs_path))
        if "strings" not in exclude and strings_path.exists():
            self.strings = StringStore().from_disk(strings_path)
        return self
Example #18
def test_pickle_vocab(strings, lex_attr):
    vocab = Vocab(strings=strings)
    ops = get_current_ops()
    vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
    vocab.vectors = vectors
    vocab[strings[0]].norm_ = lex_attr
    vocab_pickled = pickle.dumps(vocab)
    vocab_unpickled = pickle.loads(vocab_pickled)
    assert vocab.to_bytes() == vocab_unpickled.to_bytes()
    assert vocab_unpickled.vectors.mode == "floret"
Example #19
def get_nlp(model="en", embeddings_path=None):
    import spacy
    if embeddings_path not in nlp_objects:
        if embeddings_path is None:
            nlp_ = spacy.load(model)
        else:
            if embeddings_path.endswith(".bin"):
                nlp_ = spacy.load(model, vectors=False)
                nlp_.vocab.load_vectors_from_bin_loc(embeddings_path)
            elif os.path.isdir(embeddings_path):
                from spacy.vectors import Vectors
                vectors = Vectors()
                vectors = vectors.from_disk(embeddings_path)
                nlp_ = spacy.load(model, vectors=False)
                nlp_.vocab.vectors = vectors
            else:
                nlp_ = spacy.load(model, vectors=embeddings_path)
        nlp_objects[embeddings_path] = nlp_
    return nlp_objects[embeddings_path]
Example #20
class VocabBuilder(object):
    def __init__(self,
                 rootDir='.cache',
                 vectorPath='vectors',
                 tokenizerPath='tokenizer'):
        self.vectorPath = Path.cwd() / rootDir / vectorPath
        self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
        self.tokenizer = Tokenizer(Vocab())
        self.vectors = Vectors(shape=(41299, 300))

    def _countWords(self, sequences, tokenizer):
        self.tokenCounts = Counter()
        for seq in sequences:
            tokens = tokenizer(seq)
            for t in tokens:
                self.tokenCounts[t.text] += 1

    def fromDisk(self):
        self.tokenizer.from_disk(self.tokenizerPath)
        self.vectors.from_disk(self.vectorPath)

    def learnVocab(self, sequences, tokenizer, vectors, padToken='<pad>'):
        nlp = English()
        self._countWords(sequences, tokenizer=tokenizer)
        nlp.vocab = Vocab()
        nlp.vocab.set_vector(padToken, np.zeros(vectors.data.shape[1]))
        for word in self.tokenCounts:
            idx = tokenizer(word)[0].lex_id
            nlp.vocab.set_vector(word, vectors.data[idx])

        self.tokenizer = Tokenizer(nlp.vocab,
                                   rules={padToken: [{
                                       ORTH: padToken
                                   }]},
                                   prefix_search=nlp.tokenizer.prefix_search,
                                   suffix_search=nlp.tokenizer.suffix_search,
                                   token_match=nlp.tokenizer.token_match,
                                   infix_finditer=nlp.tokenizer.infix_finditer)
        self.vectors = nlp.vocab.vectors

    def toDisk(self, tokenizerPath=None, vectorPath=None):
        self.tokenizer.to_disk(tokenizerPath or self.tokenizerPath)
        self.vectors.to_disk(vectorPath or self.vectorPath)
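A rough usage sketch (paths and the model name are illustrative; learnVocab expects a spaCy tokenizer plus a Vectors-like object exposing a .data array):

import spacy
from pathlib import Path

Path(".cache").mkdir(exist_ok=True)  # default rootDir used by the class above
nlp_md = spacy.load("en_core_web_md")
builder = VocabBuilder(rootDir=".cache")
builder.learnVocab(["a short example sentence"],
                   tokenizer=nlp_md.tokenizer,
                   vectors=nlp_md.vocab.vectors)
builder.toDisk()  # persists the tokenizer and vectors for a later fromDisk()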
Example #21
def test_issue1727():
    """Test that models with no pretrained vectors can be deserialized
    correctly after vectors are added."""
    data = numpy.ones((3, 300), dtype="f")
    vectors = Vectors(data=data, keys=["I", "am", "Matt"])
    tagger = Tagger(Vocab())
    tagger.add_label("PRP")
    tagger.begin_training()
    assert tagger.cfg.get("pretrained_dims", 0) == 0
    tagger.vocab.vectors = vectors
    with make_tempdir() as path:
        tagger.to_disk(path)
        tagger = Tagger(Vocab()).from_disk(path)
        assert tagger.cfg.get("pretrained_dims", 0) == 0
Example #22
def test_vectors_most_similar_identical():
    """Test that most similar identical vectors are assigned a score of 1.0."""
    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    keys, _, scores = v.most_similar(numpy.asarray([[4, 2, 2, 2]], dtype="f"))
    assert scores[0][0] == 1.0  # not 1.0000002
    data = numpy.asarray([[1, 2, 3], [1, 2, 3], [1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    keys, _, scores = v.most_similar(numpy.asarray([[1, 2, 3]], dtype="f"))
    assert scores[0][0] == 1.0  # not 0.9999999
Example #23
def test_get_vector_resize(strings, data, resize_data):
    v = Vectors(data=data)
    v.resize(shape=resize_data.shape)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)

    assert list(v[strings[0]]) == list(resize_data[0])
    assert list(v[strings[0]]) != list(resize_data[1])
    assert list(v[strings[1]]) != list(resize_data[0])
    assert list(v[strings[1]]) == list(resize_data[1])
Example #24
def add_vectors(nlp, vectors_loc):
    with open(vectors_loc, "r") as f:
        shape = tuple(int(size) for size in next(f).split())

        vectors_data = numpy.zeros(shape=shape, dtype="f")
        vectors_keys = []

        for i, line in enumerate(tqdm(f)):
            line = line.rstrip()
            pieces = line.rsplit(" ", vectors_data.shape[1])
            word = pieces.pop(0)
            if len(pieces) != vectors_data.shape[1]:
                raise ValueError("invalid vectors format")
            vectors_data[i] = numpy.asarray(pieces, dtype="f")
            vectors_keys.append(word)

        for word in vectors_keys:
            if word not in nlp.vocab:
                lexeme = nlp.vocab[word]
                lexeme.is_oov = False

        nlp.vocab.vectors = Vectors(data=vectors_data, keys=vectors_keys)
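A hedged usage sketch: the toy file written here follows the word2vec-style text layout the parser above expects (a "rows dims" header line, then one "word v1 v2 ..." row per line). It assumes the spaCy v2-era API this helper targets, since Lexeme.is_oov is read-only in v3:

import spacy

with open("toy.vec", "w") as f:
    f.write("2 3\n")
    f.write("cat 0.1 0.2 0.3\n")
    f.write("dog 0.4 0.5 0.6\n")

nlp = spacy.blank("en")
add_vectors(nlp, "toy.vec")
assert nlp.vocab.vectors.shape == (2, 3)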
Example #25
def test_issue1539():
    """Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
    v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
    v.resize((100, 100))
Example #26
def test_issue1518():
    """Test vectors.resize() works."""
    vectors = Vectors(shape=(10, 10))
    vectors.add("hello", row=2)
    vectors.resize((5, 9))
Example #27
def test_init_vectors_with_resize_shape(strings, resize_data):
    v = Vectors(shape=(len(strings), 3))
    v.resize(shape=resize_data.shape)
    assert v.shape == resize_data.shape
    assert v.shape != (len(strings), 3)
Example #28
def test_vectors_serialize():
    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    b = v.to_bytes()
    v_r = Vectors()
    v_r.from_bytes(b)
    assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
    assert v.key2row == v_r.key2row
    v.resize((5, 4))
    v_r.resize((5, 4))
    row = v.add("D", vector=OPS.asarray([1, 2, 3, 4], dtype="f"))
    row_r = v_r.add("D", vector=OPS.asarray([1, 2, 3, 4], dtype="f"))
    assert row == row_r
    assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
    assert v.is_full == v_r.is_full
    with make_tempdir() as d:
        v.to_disk(d)
        v_r.from_disk(d)
        assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
        assert v.key2row == v_r.key2row
        v.resize((5, 4))
        v_r.resize((5, 4))
        row = v.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
        row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
        assert row == row_r
        assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
Example #29
def test_init_vectors_with_resize_data(data, resize_data):
    v = Vectors(data=data)
    v.resize(shape=resize_data.shape)
    assert v.shape == resize_data.shape
    assert v.shape != data.shape
Example #30
def test_init_vectors_with_shape(strings):
    v = Vectors(shape=(len(strings), 3))
    assert v.shape == (len(strings), 3)
    assert v.is_full is False
Example #31
def test_init_vectors_with_data(strings, data):
    v = Vectors(data=data)
    assert v.shape == data.shape
Example #32
def test_get_vector_resize(strings, data):
    strings = [hash_string(s) for s in strings]

    # decrease vector dimension (truncate)
    v = Vectors(data=data)
    resized_dim = v.shape[1] - 1
    v.resize(shape=(v.shape[0], resized_dim))
    for i, string in enumerate(strings):
        v.add(string, row=i)

    assert list(v[strings[0]]) == list(data[0, :resized_dim])
    assert list(v[strings[1]]) == list(data[1, :resized_dim])

    # increase vector dimension (pad with zeros)
    v = Vectors(data=data)
    resized_dim = v.shape[1] + 1
    v.resize(shape=(v.shape[0], resized_dim))
    for i, string in enumerate(strings):
        v.add(string, row=i)

    assert list(v[strings[0]]) == list(data[0]) + [0]
    assert list(v[strings[1]]) == list(data[1]) + [0]