import codecs

from numpy import zeros, float32 as REAL
from gensim.models import keyedvectors


def load_vectors_from_csv(self, fname, vocab_size=973265, vector_size=100):
    print("Loading vectors from file: {0}".format(fname))
    result = keyedvectors.KeyedVectors(fname)
    result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
    result.vector_size = vector_size
    counts = None

    def add_word(word, weights):
        word_id = len(result.vocab)
        if word in result.vocab:
            print("duplicate word '%s' in %s, ignoring all but first" % (word, fname))
            return
        if counts is None:
            # most common scenario: no vocab file given. just make up some
            # bogus counts, in descending order
            result.vocab[word] = keyedvectors.Vocab(index=word_id, count=vocab_size - word_id)
        elif word in counts:
            # use count from the vocab file
            result.vocab[word] = keyedvectors.Vocab(index=word_id, count=counts[word])
        else:
            # vocab file given, but word is missing -- set count to None (TODO: or raise?)
            print("vocabulary file is incomplete: '%s' is missing" % word)
            result.vocab[word] = keyedvectors.Vocab(index=word_id, count=None)
        result.syn0[word_id] = weights
        result.index2word.append(word)

    file = codecs.open(fname, "r", "utf-8")
    i = 0
    for line in file:
        i += 1
        if i == 1:  # skip the header row
            continue
        parts = line.strip().split(",")
        word, weights = parts[1], [REAL(x) for x in parts[2:]]
        add_word(word, weights)
        if i % 100000 == 0:
            print(i, "word vectors loaded so far ...")
    file.close()
    print(i - 1, "word vectors loaded!")
    return result
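# A minimal usage sketch for load_vectors_from_csv. The enclosing class is not
# shown in this snippet, so VectorLoader below is a hypothetical name for it,
# and "vectors.csv" is a placeholder path. The CSV is assumed to have a header
# row followed by rows of the form: index,word,dim_1,...,dim_n.
def demo_load_csv():
    loader = VectorLoader()  # hypothetical class exposing load_vectors_from_csv
    kv = loader.load_vectors_from_csv("vectors.csv", vocab_size=50000, vector_size=100)
    print(kv.syn0.shape)       # (50000, 100)
    print(kv.index2word[:5])   # first five words read from the file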
import logging

from numpy import zeros, dtype, fromstring, ascontiguousarray, float32 as REAL
from six.moves import xrange
from gensim import utils

logger = logging.getLogger(__name__)


def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
                          unicode_errors='strict', limit=None, datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is
    missing), so while you can query for word similarity etc., you cannot continue
    training with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab`, if set
        (this is the file generated by the `-save-vocab` flag of the original C tool).
    binary : bool
        If True, the data is in the binary word2vec format.
    encoding : str
        If you trained the C model using a non-utf8 encoding for words, specify
        that encoding in `encoding`.
    unicode_errors : str
        Default 'strict'; a string suitable to be passed as the `errors` argument
        to the unicode() (Python 2.x) or str() (Python 3.x) function. If your
        source file may include word tokens truncated in the middle of a multibyte
        unicode character (as is common from the original word2vec.c tool),
        'ignore' or 'replace' may help.
    limit : int
        Maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : :class:`numpy.float*`
        (Experimental) Can coerce dimensions to a non-default float type (such as
        np.float16) to save memory. (Such types may result in much slower bulk
        operations or incompatibility with optimized routines.)

    Returns
    -------
    :obj:`cls`
        The loaded model as an instance of :class:`cls`.

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some
                # bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning("vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab)
        )
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
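# A usage sketch for the loader above: in gensim 3.x the public entry point
# KeyedVectors.load_word2vec_format delegates to _load_word2vec_format, so a
# caller would typically write the following ("vectors.bin" is a placeholder
# path to a file produced by the original C tool).
def demo_load_word2vec():
    from gensim.models import KeyedVectors
    kv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True, limit=100000)
    print(kv.vector_size, kv.vectors.shape)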
from numpy import zeros, dtype, fromstring, ascontiguousarray, float32 as REAL
from six.moves import xrange
from sklearn.neighbors import KDTree
from gensim import utils
from gensim.models.keyedvectors import Vocab


def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
                         unicode_errors='strict', limit=None, datatype=REAL):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is
    missing), so while you can query for word similarity etc., you cannot continue
    training with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by the `-save-vocab` flag of the original C tool).

    If you trained the C model using a non-utf8 encoding for words, specify that
    encoding in `encoding`.

    `unicode_errors`, default 'strict', is a string suitable to be passed as the
    `errors` argument to the unicode() (Python 2.x) or str() (Python 3.x) function.
    If your source file may include word tokens truncated in the middle of a
    multibyte unicode character (as is common from the original word2vec.c tool),
    'ignore' or 'replace' may help.

    `limit` sets a maximum number of word-vectors to read from the file.
    The default, None, means read all.

    `datatype` (experimental) can coerce dimensions to a non-default float type
    (such as np.float16) to save memory. (Such types may result in much slower
    bulk operations or incompatibility with optimized routines.)
    """
    counts = None
    if fvocab is not None:
        print("loading word counts from %s" % fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    print("loading projection weights from %s" % fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls()
        result.vector_size = vector_size
        result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            # print("word id: %d, word: %s, weights: %s" % (word_id, word, weights))
            if word in result.vocab:
                print("duplicate word '%s' in %s, ignoring all but first" % (word, fname))
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some
                # bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                print("vocabulary file is incomplete: '%s' is missing" % word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.syn0[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if result.syn0.shape[0] != len(result.vocab):
        print("duplicate words detected, shrinking matrix size from %i to %i"
              % (result.syn0.shape[0], len(result.vocab)))
        result.syn0 = ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    # Build a KDTree over the vectors for nearest-neighbour queries:
    # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
    result.kdt = KDTree(result.syn0, leaf_size=10, metric="euclidean")

    print("loaded %s matrix from %s" % (result.syn0.shape, fname))
    return result
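# A usage sketch for the KDTree-backed variant above. W2VModel is a
# hypothetical name for the class exposing this load_word2vec_format, and
# "vectors.txt" is a placeholder path. sklearn's KDTree.query returns
# (distances, indices); indices are mapped back to words via index2word.
def demo_kdtree_neighbours(word="king"):
    model = W2VModel.load_word2vec_format("vectors.txt", binary=False, limit=50000)
    query = model.syn0[model.vocab[word].index].reshape(1, -1)
    dist, idx = model.kdt.query(query, k=5)  # 5 nearest neighbours, euclidean metric
    return [model.index2word[i] for i in idx[0]], dist[0]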
import numpy as np
from time import time
from numpy import float32 as REAL
from gensim import utils


def load_word2emb(self, file_name, batch_size=5000, limit=np.inf, reset=False):
    self.seen = set()
    if reset:
        self.clear()

    batch = []
    start = time()

    # Loop over the file.
    with utils.open(file_name, "rb") as fin:
        # Read the header to determine vocabulary and vector size.
        header = utils.to_unicode(fin.readline(), encoding="utf-8")
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit < vocab_size:
            vocab_size = limit

        for line_no in range(vocab_size):
            line = fin.readline()
            if line == b"":
                raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
            parts = utils.to_unicode(line.rstrip(), encoding="utf-8", errors="strict").split(" ")
            if len(parts) != vector_size + 1:
                raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
            word, vec = parts[0], np.array([REAL(x) for x in parts[1:]])
            if word in self.seen:
                continue
            self.seen.add(word)
            batch.append((word, vec))

            # Accumulate running sums for the entity/word UNK fallback vectors.
            if "ENTITY/" in word:
                self.avg_cnt["entity"]["cnt"] += 1
                self.avg_cnt["entity"]["sum"] += vec
            else:
                self.avg_cnt["word"]["cnt"] += 1
                self.avg_cnt["word"]["sum"] += vec

            if len(batch) == batch_size:
                print("Another {}".format(batch_size), line_no, time() - start)
                start = time()
                self.insert_batch_emb(batch)
                batch.clear()

    # Append averaged fallback vectors for unknown entities and words.
    for x in ['entity', 'word']:
        if self.avg_cnt[x]["cnt"] > 0:
            batch.append((
                "#{}/UNK#".format(x.upper()),
                self.avg_cnt[x]["sum"] / self.avg_cnt[x]["cnt"],
            ))
            print("Added #{}/UNK#".format(x.upper()))

    if batch:
        self.insert_batch_emb(batch)
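# A usage sketch for load_word2emb. The enclosing class is not shown here;
# EmbeddingStore is a hypothetical name for a class implementing clear(),
# insert_batch_emb(batch), and the avg_cnt accumulator, and the path is a
# placeholder. The input is expected in text word2vec format, with a
# "<vocab_size> <vector_size>" header line.
def demo_load_word2emb():
    store = EmbeddingStore()  # hypothetical embedding-store class
    store.load_word2emb("enwiki-vectors.txt", batch_size=5000, limit=100000, reset=True)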
import numpy as np
from numpy import dtype, fromstring, ascontiguousarray, float32 as REAL


def load_vector_bin(fname, binary=True):
    encoding = 'utf-8'
    unicode_errors = 'strict'

    with open(fname, 'rb') as fin:
        header = str(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        vectors = np.zeros((vocab_size, vector_size))
        vocab = {}

        def add_word(word, weights):
            word_id = len(vocab)
            if word in vocab:
                print("duplicate word '%s' in %s, ignoring all but first" % (word, fname))
                return
            vocab[word] = word_id
            vectors[word_id] = weights

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in range(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = str(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in range(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = str(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if vectors.shape[0] != len(vocab):
        print("duplicate words detected, shrinking matrix size from %i to %i" % (vectors.shape[0], len(vocab)))
        vectors = ascontiguousarray(vectors[:len(vocab)])
    assert (len(vocab), vector_size) == vectors.shape

    print("loaded %s matrix from %s" % (vectors.shape, fname))
    # Note: only the matrix is returned; the word -> row-id mapping in `vocab`
    # is discarded here.
    return vectors
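# A usage sketch for load_vector_bin: since only the matrix is returned, rows
# are addressed by their position in the file. "vectors.bin" is a placeholder
# path; cosine similarity between two rows is computed with plain numpy.
def demo_cosine(i, j):
    vectors = load_vector_bin("vectors.bin", binary=True)
    a, b = vectors[i], vectors[j]
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))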