Example #1
# Imports assumed by this test: assert_equal comes from numpy.testing, and make_tempdir is a
# spaCy test helper (exposed as spacy.util.make_tempdir in recent releases).
import numpy
from numpy.testing import assert_equal

from spacy.util import make_tempdir
from spacy.vectors import Vectors


def test_vectors_serialize():
    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    b = v.to_bytes()
    v_r = Vectors()
    v_r.from_bytes(b)
    assert_equal(v.data, v_r.data)
    assert v.key2row == v_r.key2row
    v.resize((5, 4))
    v_r.resize((5, 4))
    row = v.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
    row_r = v_r.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
    assert row == row_r
    assert_equal(v.data, v_r.data)
    assert v.is_full == v_r.is_full
    with make_tempdir() as d:
        v.to_disk(d)
        v_r.from_disk(d)
        assert_equal(v.data, v_r.data)
        assert v.key2row == v_r.key2row
        v.resize((5, 4))
        v_r.resize((5, 4))
        row = v.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f"))
        row_r = v_r.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f"))
        assert row == row_r
        assert_equal(v.data, v_r.data)
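The same round trip works outside of a test. A minimal sketch, assuming spaCy 2.x or later; the "my_vectors" output directory is a hypothetical name:

import numpy
from spacy.vectors import Vectors

vectors = Vectors(data=numpy.zeros((3, 4), dtype="f"), keys=["cat", "dog", "fish"])
blob = vectors.to_bytes()                        # serialize the whole table to a bytes blob
restored = Vectors().from_bytes(blob)            # from_bytes returns the Vectors object itself
assert restored.shape == vectors.shape

vectors.to_disk("my_vectors")                    # hypothetical output directory
restored_from_disk = Vectors().from_disk("my_vectors")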
Example #2
# Imports assumed by this class:
from collections import Counter
from pathlib import Path

import numpy as np
from spacy.attrs import ORTH
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.vectors import Vectors
from spacy.vocab import Vocab


class VocabBuilder(object):
    def __init__(self,
                 rootDir='.cache',
                 vectorPath='vectors',
                 tokenizerPath='tokenizer'):
        self.vectorPath = Path.cwd() / rootDir / vectorPath
        self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
        self.tokenizer = Tokenizer(Vocab())
        self.vectors = Vectors(shape=(41299, 300))

    def _countWords(self, sequences, tokenizer):
        self.tokenCounts = Counter()
        for seq in sequences:
            tokens = tokenizer(seq)
            for t in tokens:
                self.tokenCounts[t.text] += 1

    def fromDisk(self):
        self.tokenizer.from_disk(self.tokenizerPath)
        self.vectors.from_disk(self.vectorPath)

    def learnVocab(self, sequences, tokenizer, vectors, padToken='<pad>'):
        nlp = English()
        self._countWords(sequences, tokenizer=tokenizer)
        nlp.vocab = Vocab()
        # Give the padding token an all-zero vector of the same width as the pretrained table.
        nlp.vocab.set_vector(padToken, np.zeros(vectors.data.shape[1]))
        for word in self.tokenCounts:
            # lex_id (the lexeme's rank) indexes the word's row in the pretrained vector table.
            idx = tokenizer(word)[0].lex_id
            nlp.vocab.set_vector(word, vectors.data[idx])

        self.tokenizer = Tokenizer(nlp.vocab,
                                   rules={padToken: [{
                                       ORTH: padToken
                                   }]},
                                   prefix_search=nlp.tokenizer.prefix_search,
                                   suffix_search=nlp.tokenizer.suffix_search,
                                   token_match=nlp.tokenizer.token_match,
                                   infix_finditer=nlp.tokenizer.infix_finditer)
        self.vectors = nlp.vocab.vectors

    def toDisk(self, tokenizerPath=None, vectorPath=None):
        self.tokenizer.to_disk(tokenizerPath or self.tokenizerPath)
        self.vectors.to_disk(vectorPath or self.vectorPath)
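A hypothetical usage sketch for VocabBuilder: the model name and sample texts are assumptions, and the default .cache parent directory is created up front so the to_disk calls have somewhere to write:

from pathlib import Path

import spacy

nlp = spacy.load("en_core_web_md")               # a model that ships with pretrained vectors
texts = ["A tiny corpus.", "Just two sentences."]

Path(".cache").mkdir(exist_ok=True)              # parent of the default vectorPath/tokenizerPath
builder = VocabBuilder()
builder.learnVocab(texts, tokenizer=nlp.tokenizer, vectors=nlp.vocab.vectors)
builder.toDisk()

restored = VocabBuilder()                        # a fresh instance can restore both artifacts
restored.fromDisk()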
Example #3
import os

# Module-level cache so each embeddings_path is only loaded once.
nlp_objects = {}


def get_nlp(model="en", embeddings_path=None):
    import spacy
    if embeddings_path not in nlp_objects:
        if embeddings_path is None:
            nlp_ = spacy.load(model)
        else:
            if embeddings_path.endswith(".bin"):
                # Legacy branch: load_vectors_from_bin_loc is a spaCy 1.x Vocab method.
                nlp_ = spacy.load(model, vectors=False)
                nlp_.vocab.load_vectors_from_bin_loc(embeddings_path)
            elif os.path.isdir(embeddings_path):
                from spacy.vectors import Vectors
                vectors = Vectors()
                vectors = vectors.from_disk(embeddings_path)
                nlp_ = spacy.load(model, vectors=False)
                nlp_.vocab.vectors = vectors
            else:
                nlp_ = spacy.load(model, vectors=embeddings_path)
        nlp_objects[embeddings_path] = nlp_
    return nlp_objects[embeddings_path]
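A hypothetical usage of get_nlp; the model name and the "my_vectors" directory (for example, one written by Vectors.to_disk) are assumptions, and repeated calls return the cached pipeline:

nlp = get_nlp(model="en_core_web_sm")                                  # packaged model and vectors
nlp_custom = get_nlp(model="en_core_web_sm", embeddings_path="my_vectors")
assert get_nlp(model="en_core_web_sm") is nlp                          # served from the module cache
doc = nlp_custom("Custom vectors have been swapped into the vocab.")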
Example #4
File: hn.py  Project: icorecool/CHN
import en_core_web_md
import numpy as np
from spacy.strings import hash_string
from spacy.vectors import Vectors

import config
import utils
from train import spacy_tokenizer

# for bundled
# set_data_path('./')

logger = utils.get_logger()

print('**************** Loading model... ****************')
# too slow
# _nlp = spacy.load('en_core_web_md')
_vectors = Vectors()
# _vectors.from_disk('/home/han/.local/lib/python3.6/site-packages/en_core_web_md/en_core_web_md-2.1.0/vocab/')
# __path__ is a list, so use its first entry to build the installed model's vocab directory.
_vectors.from_disk('%s/%s-%s/vocab/' %
                   (en_core_web_md.__path__[0], en_core_web_md.__name__,
                    en_core_web_md.__version__))
_vector_size = _vectors.shape[1]


def get_sent_vector(sent):
    # use regexp tokenizer to speed up
    vs = np.array([get_word_vector(w) for w in spacy_tokenizer(sent)])
    if len(vs) > 0:
        return vs.sum(axis=0) / vs.shape[0]
    return np.zeros(_vector_size)


def get_word_vector(w):
    h = hash_string(w.lower())
    i = _vectors.key2row.get(h, 0)