def analogy(self, pos, neg, n=10):
    """
    Rank the vocabulary against a combination of positive and negative words.

    Every word in `pos` contributes its unit vector with weight +1.0 and
    every word in `neg` with weight -1.0; the weighted vectors are averaged
    and the average is scored against each row of `self.l2norm` with a dot
    product.

    Parameters
    ----------
    pos : list
        Words added to the query.
    neg : list
        Words subtracted from the query.
    n : int
        Together with the number of input words, sets how many ranked
        indices are handed to `self.generate_response` (10, by default).

    Returns
    -------
    Whatever `self.generate_response(best, similarities)` returns; the
    original docstring describes it as a list of tuples, each tuple being
    (word, similarity).

    Example
    -------
    `king - man + woman = queen` will be:
    `pos=['king', 'woman'], neg=['man']`
    """
    words = pos + neg
    signed_words = [(w, 1.0) for w in pos] + [(w, -1.0) for w in neg]

    weighted = [
        sign * unitvec(self.get_vector(w))
        for w, sign in signed_words
    ]
    mean = np.array(weighted).mean(axis=0)

    similarities = np.dot(self.l2norm, mean)
    ranked = similarities.argsort()[::-1]
    # NOTE(review): the slice drops the single top-ranked index and keeps
    # n + len(words) - 2 candidates; presumably this is meant to hide an
    # input word -- confirm against generate_response before changing it.
    best = ranked[1:n + len(words) - 1]
    return self.generate_response(best, similarities)
def from_text(cls, fname, vocabUnicodeSize=78, desired_vocab=None):
    """
    Create a WordVectors class based on a word2vec text file

    The first line of the file holds the vocabulary size and the vector
    size; every following line holds a word followed by its space-separated
    vector components. Lines are decoded as ISO-8859-1.

    Parameters
    ----------
    fname : path to file
    vocabUnicodeSize: the maximum string length (78, by default)
    desired_vocab: if set, this will ignore any word and vector that
        doesn't fall inside desired_vocab.

    Returns
    -------
    WordVectors instance (the result of `cls(vocab=..., vectors=...)`)
    """
    with open(fname, 'rb') as fin:
        header = fin.readline()
        vocab_size, vector_size = list(map(int, header.split()))

        vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
        # `np.float` was removed from NumPy; the builtin `float` is the
        # same float64 dtype.
        vectors = np.empty((vocab_size, vector_size), dtype=float)
        # np.empty leaves skipped rows uninitialised, so comparing the
        # vocabulary against '' is not a reliable way to find them; track
        # the rows that were really written instead.
        filled = np.zeros(vocab_size, dtype=bool)

        for i, line in enumerate(fin):
            line = line.decode('ISO-8859-1').strip()
            if not line:
                # Blank (typically trailing) lines carry no word.
                continue
            parts = line.split(' ')
            word = parts[0]
            if desired_vocab is not None and word not in desired_vocab:
                continue
            vector = np.array(parts[1:], dtype=float)
            vocab[i] = word
            vectors[i] = unitvec(vector)
            filled[i] = True

    # Only copy the arrays when some row was actually left out.
    if not filled.all():
        vectors = vectors[filled, :]
        vocab = vocab[filled]
    return cls(vocab=vocab, vectors=vectors)
def __init__(self, vocab, vectors=None, l2norm=None, save_memory=True):
    """
    Initialize a WordVectors class based on vocabulary and vectors

    This initializer precomputes the l2norm of the vectors unless an
    already computed `l2norm` is supplied.

    Parameters
    ----------
    vocab : np.array
        1d array with the vocabulary
    vectors : np.array
        2d array with the vectors calculated by word2vec
    l2norm : np.array
        2d array with the calculated l2norm of the vectors
    save_memory : boolean
        when False (and `l2norm` is not supplied) the original vectors are
        also kept in `self.vectors`

    Raises
    ------
    ValueError
        If neither `vectors` nor `l2norm` is given.
    """
    if vectors is None and l2norm is None:
        raise ValueError('Need vectors OR l2norm arguments')

    self.vocab = vocab

    if l2norm is not None:
        self.l2norm = l2norm
        return

    if not save_memory:
        self.vectors = vectors
    # np.vstack needs a real sequence; passing a generator is rejected by
    # current NumPy releases.
    self.l2norm = np.vstack([unitvec(vec) for vec in vectors])
def analogy(self, pos, neg, n=10):
    """
    Analogy similarity.

    Averages the unit vectors of the `pos` words (weight +1.0) and the
    `neg` words (weight -1.0), scores every row of `self.l2norm` against
    that average with a dot product, and returns the best matches that are
    not themselves input words.

    Parameters
    ----------
    pos : list
        Words added to the query.
    neg : list
        Words subtracted from the query.
    n : int
        Maximum number of results to return (10, by default).

    Returns
    -------
    List of at most `n` tuples (word, similarity), best match first.

    Example
    -------
    king - man + woman = queen | will be:
    pos=['king', 'woman'], neg=['man']
    """
    words = pos + neg
    signed_words = [(word, 1.0) for word in pos] + [(word, -1.0) for word in neg]

    mean = np.array([
        direction * unitvec(self.get_vector(word))
        for word, direction in signed_words
    ]).mean(axis=0)

    similarities = np.dot(self.l2norm, mean)
    # Over-fetch by the number of input words: even if every one of them
    # ranks among the top candidates, n results remain after they are
    # filtered out. (Fetching n + len(words) - 1 and not truncating made the
    # result length depend on how many inputs happened to rank highly.)
    best = np.argsort(similarities)[::-1][:n + len(words)]
    matches = [
        (_word, sim)
        for _word, sim in zip(self.vocab[best], similarities[best])
        if _word not in words
    ]
    return matches[:n]
def from_binary(
    cls,
    fname,
    vocab_unicode_size=78,
    desired_vocab=None,
    encoding="utf-8",
    new_lines=True,
):
    """
    Create a WordVectors class based on a word2vec binary file

    The first line of the file is text holding the vocabulary size and the
    vector size. Each entry that follows is the word's bytes, a single
    space, `vector_size` float32 values and, when `new_lines` is true, one
    separator byte.

    Parameters
    ----------
    fname : path to file
    vocab_unicode_size: the maximum string length (78, by default)
    desired_vocab: if set any words that don't fall into this vocab will be
        dropped. Entries may be text or raw bytes.
    encoding: encoding used to decode the words ("utf-8", by default)
    new_lines: whether one separator byte follows every vector
        (True, by default)

    Returns
    -------
    WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

    Raises
    ------
    EOFError
        If the file ends before all announced entries were read.
    """
    with open(fname, "rb") as fin:
        # The first line has the vocab_size and the vector_size as text
        header = fin.readline()
        vocab_size, vector_size = list(map(int, header.split()))

        vocab = np.empty(vocab_size, dtype="<U%s" % vocab_unicode_size)
        # `np.float` was removed from NumPy; builtin float is float64.
        vectors = np.empty((vocab_size, vector_size), dtype=float)
        # np.empty leaves skipped rows uninitialised, so track the rows
        # that were written instead of comparing the vocabulary with "".
        kept = np.zeros(vocab_size, dtype=bool)
        binary_len = np.dtype(np.float32).itemsize * vector_size

        for i in range(vocab_size):
            # read word: everything up to the next space
            raw_word = bytearray()
            while True:
                ch = fin.read(1)
                if not ch:
                    # Without this check a truncated file loops forever.
                    raise EOFError(
                        "unexpected end of file while reading word %d" % i
                    )
                if ch == b" ":
                    break
                raw_word += ch
            raw_word = bytes(raw_word)

            if desired_vocab is None:
                include = True
            else:
                # Compare the decoded text so that str entries match; the
                # raw bytes are still accepted for callers that pass bytes.
                include = (
                    raw_word.decode(encoding, "replace") in desired_vocab
                    or raw_word in desired_vocab
                )

            # read vector
            payload = fin.read(binary_len)
            if len(payload) != binary_len:
                raise EOFError(
                    "unexpected end of file while reading vector %d" % i
                )
            if include:
                vocab[i] = raw_word.decode(encoding)
                # frombuffer replaces the binary mode of np.fromstring;
                # copy() keeps the array writable like fromstring did.
                vector = np.frombuffer(payload, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                kept[i] = True

            if new_lines:
                fin.read(1)  # newline char

    # Only copy the arrays when some row was actually left out.
    if not kept.all():
        vectors = vectors[kept, :]
        vocab = vocab[kept]
    return cls(vocab=vocab, vectors=vectors)
def from_binary(cls, fname, vocabUnicodeSize=78, desired_vocab=None, encoding="utf-8"):
    """
    Create a WordVectors class based on a word2vec binary file

    The first line of the file is text holding the vocabulary size and the
    vector size. Each entry that follows is the word's bytes, a single
    space, `vector_size` float32 values and one separator byte.

    Parameters
    ----------
    fname : path to file
    vocabUnicodeSize: the maximum string length (78, by default)
    desired_vocab: if set, this will ignore any word and vector that
        doesn't fall inside desired_vocab. Entries may be text or raw bytes.
    encoding: encoding used to decode the words ("utf-8", by default)

    Returns
    -------
    WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

    Raises
    ------
    EOFError
        If the file ends before all announced entries were read.
    """
    with open(fname, 'rb') as fin:
        header = fin.readline()
        vocab_size, vector_size = list(map(int, header.split()))

        vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
        # `np.float` was removed from NumPy; builtin float is float64.
        vectors = np.empty((vocab_size, vector_size), dtype=float)
        # np.empty leaves skipped rows uninitialised, so track the rows
        # that were written instead of comparing the vocabulary with ''.
        kept = np.zeros(vocab_size, dtype=bool)
        binary_len = np.dtype(np.float32).itemsize * vector_size

        for i in range(vocab_size):
            # read word: everything up to the next space
            raw_word = bytearray()
            while True:
                ch = fin.read(1)
                if not ch:
                    # Without this check a truncated file loops forever.
                    raise EOFError(
                        'unexpected end of file while reading word %d' % i
                    )
                if ch == b' ':
                    break
                raw_word += ch
            raw_word = bytes(raw_word)

            if desired_vocab is None:
                include = True
            else:
                # Compare the decoded text so that str entries match; the
                # raw bytes are still accepted for callers that pass bytes.
                include = (
                    raw_word.decode(encoding, 'replace') in desired_vocab
                    or raw_word in desired_vocab
                )

            # read vector
            payload = fin.read(binary_len)
            if len(payload) != binary_len:
                raise EOFError(
                    'unexpected end of file while reading vector %d' % i
                )
            if include:
                vocab[i] = raw_word.decode(encoding)
                # frombuffer replaces the binary mode of np.fromstring;
                # copy() keeps the array writable like fromstring did.
                vector = np.frombuffer(payload, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                kept[i] = True

            fin.read(1)  # newline

    # Only copy the arrays when some row was actually left out.
    if not kept.all():
        vectors = vectors[kept, :]
        vocab = vocab[kept]
    return cls(vocab=vocab, vectors=vectors)
def from_binary(cls, fname, vocabUnicodeSize=78, desired_vocab=None):
    """
    Create a WordVectors class based on a word2vec binary file

    A version that can fit for utf8 text: each word is collected as raw
    bytes and decoded as UTF-8 as a whole, because one character may span
    several bytes.

    The first line of the file is text holding the vocabulary size and the
    vector size. Each entry that follows is the word's bytes, a single
    space, `vector_size` float32 values and one separator byte.

    Parameters
    ----------
    fname : path to file
    vocabUnicodeSize: the maximum string length (78, by default)
    desired_vocab: if set, this will ignore any word and vector that
        doesn't fall inside desired_vocab. Entries may be text or raw bytes.

    Returns
    -------
    WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

    Raises
    ------
    EOFError
        If the file ends before all announced entries were read.
    """
    # The payload is binary, so the file must not be opened in text mode.
    with open(fname, 'rb') as fin:
        header = fin.readline()
        vocab_size, vector_size = map(int, header.split())

        vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
        # `np.float` was removed from NumPy; builtin float is float64.
        vectors = np.empty((vocab_size, vector_size), dtype=float)
        # np.empty leaves skipped rows uninitialised, so track the rows
        # that were written instead of comparing the vocabulary with u''.
        kept = np.zeros(vocab_size, dtype=bool)
        binary_len = np.dtype(np.float32).itemsize * vector_size

        # `range` rather than the Python-2-only `xrange`.
        for i in range(vocab_size):
            # read word: everything up to the next space
            raw_word = bytearray()
            while True:
                ch = fin.read(1)
                if not ch:
                    # Without this check a truncated file loops forever.
                    raise EOFError(
                        'unexpected end of file while reading word %d' % i
                    )
                if ch == b' ':
                    break
                raw_word += ch
            raw_word = bytes(raw_word)

            if desired_vocab is None:
                include = True
            else:
                # Compare the decoded text so that unicode entries match;
                # the raw bytes are still accepted as before.
                include = (
                    raw_word.decode('utf-8', 'replace') in desired_vocab
                    or raw_word in desired_vocab
                )

            # read vector
            payload = fin.read(binary_len)
            if len(payload) != binary_len:
                raise EOFError(
                    'unexpected end of file while reading vector %d' % i
                )
            if include:
                vocab[i] = raw_word.decode('utf-8')  # decode the whole word
                # frombuffer replaces the binary mode of np.fromstring;
                # copy() keeps the array writable like fromstring did.
                vector = np.frombuffer(payload, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                kept[i] = True

            fin.read(1)  # newline

    # Only copy the arrays when some row was actually left out.
    if not kept.all():
        vectors = vectors[kept, :]
        vocab = vocab[kept]
    return cls(vocab=vocab, vectors=vectors)
def from_binary(cls, fname, vocabUnicodeSize=78, desired_vocab=None):
    """
    Create a WordVectors class based on a word2vec binary file

    A version that can fit for utf8 text: each word is collected as raw
    bytes and decoded as UTF-8 as a whole, because one character may span
    several bytes.

    The first line of the file is text holding the vocabulary size and the
    vector size. Each entry that follows is the word's bytes, a single
    space, `vector_size` float32 values and one separator byte.

    Parameters
    ----------
    fname : path to file
    vocabUnicodeSize: the maximum string length (78, by default)
    desired_vocab: if set, this will ignore any word and vector that
        doesn't fall inside desired_vocab. Entries may be text or raw bytes.

    Returns
    -------
    WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

    Raises
    ------
    EOFError
        If the file ends before all announced entries were read.
    """
    # The payload is binary, so the file must not be opened in text mode.
    with open(fname, 'rb') as fin:
        header = fin.readline()
        vocab_size, vector_size = map(int, header.split())

        vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
        # `np.float` was removed from NumPy; builtin float is float64.
        vectors = np.empty((vocab_size, vector_size), dtype=float)
        # np.empty leaves skipped rows uninitialised, so a boolean mask of
        # written rows replaces the unreliable comparison with u''.
        written = np.zeros(vocab_size, dtype=bool)
        binary_len = np.dtype(np.float32).itemsize * vector_size

        # `range` rather than the Python-2-only `xrange`.
        for i in range(vocab_size):
            # read word: collect bytes until the separating space
            word_bytes = bytearray()
            while True:
                ch = fin.read(1)
                if not ch:
                    # Without this check a truncated file loops forever.
                    raise EOFError(
                        'unexpected end of file while reading word %d' % i
                    )
                if ch == b' ':
                    break
                word_bytes += ch
            word_bytes = bytes(word_bytes)

            if desired_vocab is None:
                include = True
            else:
                # Compare the decoded text so that unicode entries match;
                # the raw bytes are still accepted as before.
                include = (
                    word_bytes.decode('utf-8', 'replace') in desired_vocab
                    or word_bytes in desired_vocab
                )

            # read vector
            chunk = fin.read(binary_len)
            if len(chunk) != binary_len:
                raise EOFError(
                    'unexpected end of file while reading vector %d' % i
                )
            if include:
                vocab[i] = word_bytes.decode('utf-8')  # decode the whole word
                # frombuffer replaces the binary mode of np.fromstring;
                # copy() keeps the array writable like fromstring did.
                vector = np.frombuffer(chunk, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                written[i] = True

            fin.read(1)  # newline

    # Only copy the arrays when some row was actually left out.
    if not written.all():
        vectors = vectors[written, :]
        vocab = vocab[written]
    return cls(vocab=vocab, vectors=vectors)
def from_binary(cls, fname, vocabUnicodeSize=78, desired_vocab=None, encoding="utf-8", newLines=True):
    """
    Create a WordVectors class based on a word2vec binary file

    The first line of the file is text holding the vocabulary size and the
    vector size. Each entry that follows is the word's bytes, a single
    space, `vector_size` float32 values and, when `newLines` is true, one
    separator byte.

    Parameters
    ----------
    fname : path to file
    vocabUnicodeSize: the maximum string length (78, by default)
    desired_vocab: if set, this will ignore any word and vector that
        doesn't fall inside desired_vocab. Entries may be text or raw bytes.
    encoding: encoding used to decode the words ("utf-8", by default)
    newLines: whether one separator byte follows every vector
        (True, by default)

    Returns
    -------
    WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

    Raises
    ------
    EOFError
        If the file ends before all announced entries were read.
    """
    with open(fname, 'rb') as fin:
        header = fin.readline()
        vocab_size, vector_size = list(map(int, header.split()))

        vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
        # `np.float` was removed from NumPy; builtin float is float64.
        vectors = np.empty((vocab_size, vector_size), dtype=float)
        # np.empty leaves skipped rows uninitialised, so track the rows
        # that were written instead of comparing the vocabulary with ''.
        kept = np.zeros(vocab_size, dtype=bool)
        binary_len = np.dtype(np.float32).itemsize * vector_size

        for i in range(vocab_size):
            # read word: everything up to the next space
            raw_word = bytearray()
            while True:
                ch = fin.read(1)
                if not ch:
                    # Without this check a truncated file loops forever.
                    raise EOFError(
                        'unexpected end of file while reading word %d' % i
                    )
                if ch == b' ':
                    break
                raw_word += ch
            raw_word = bytes(raw_word)

            if desired_vocab is None:
                include = True
            else:
                # Compare the decoded text so that str entries match; the
                # raw bytes are still accepted for callers that pass bytes.
                include = (
                    raw_word.decode(encoding, 'replace') in desired_vocab
                    or raw_word in desired_vocab
                )

            # read vector
            payload = fin.read(binary_len)
            if len(payload) != binary_len:
                raise EOFError(
                    'unexpected end of file while reading vector %d' % i
                )
            if include:
                vocab[i] = raw_word.decode(encoding)
                # frombuffer replaces the binary mode of np.fromstring;
                # copy() keeps the array writable like fromstring did.
                vector = np.frombuffer(payload, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                kept[i] = True

            if newLines:
                fin.read(1)  # newline

    # Only copy the arrays when some row was actually left out.
    if not kept.all():
        vectors = vectors[kept, :]
        vocab = vocab[kept]
    return cls(vocab=vocab, vectors=vectors)
def from_text(cls, fname, vocabUnicodeSize=78, desired_vocab=None, encoding="utf-8"): """ Create a WordVectors class based on a word2vec text file Parameters ---------- fname : path to file vocabUnicodeSize: the maximum string length (78, by default) desired_vocab: if set, this will ignore any word and vector that doesn't fall inside desired_vocab. Returns ------- WordVectors instance """ with open(fname, 'rb') as fin: header = fin.readline() vocab_size, vector_size = list(map(int, header.split())) vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize) vectors = np.empty((vocab_size, vector_size), dtype=np.float) for i, line in enumerate(fin): try: line = line.decode(encoding).strip() parts = line.split(' ') word = parts[0] include = desired_vocab is None or word in desired_vocab if include: vector = np.array(parts[1:], dtype=np.float) vocab[i] = word vectors[i] = unitvec(vector) except: pass if desired_vocab is not None: vectors = vectors[vocab != '', :] vocab = vocab[vocab != ''] return cls(vocab=vocab, vectors=vectors)
def __init__(self, vocab=None, vectors=None, saveMemory=True):
    """
    Initialize the instance from a vocabulary and its vectors

    Stores `vocab`, optionally keeps the original `vectors`, and
    precomputes `self.l2norm` by applying `unitvec` to every vector and
    stacking the results row by row.

    Parameters
    ----------
    vocab : the vocabulary, stored as-is in `self.vocab`
    vectors : iterable of vectors, one per vocabulary entry; required even
        though the signature defaults it to None
    saveMemory : when False the original vectors are also kept in
        `self.vectors` (True, by default)

    Raises
    ------
    TypeError
        If `vectors` is None.
    """
    if vectors is None:
        # Iterating None raised a TypeError before as well; the message now
        # says what is actually missing.
        raise TypeError('vectors is required to compute the l2norm')

    self.vocab = vocab
    if not saveMemory:
        self.vectors = vectors
    # np.vstack needs a real sequence; passing a generator is rejected by
    # current NumPy releases.
    self.l2norm = np.vstack([unitvec(vec) for vec in vectors])