def load_embeddings(self, words, embedding_file):
    """Load pretrained embeddings for a given list of words, if they exist.

    Copies (or averages, for duplicates) vectors from a text embedding file
    directly into the network's embedding weight matrix, in place.

    Args:
        words: iterable of tokens. Only those that are indexed in the
            dictionary are kept.
        embedding_file: path to text file of embeddings, space separated.

    Raises:
        RuntimeError: if a line in the embedding file does not have the
            expected number of columns for this network's embedding size.
    """
    # Restrict to words the dictionary actually indexes.
    words = {w for w in words if w in vocab}
    logger.info('Loading pre-trained embeddings for %d words from %s' %
                (len(words), embedding_file))
    embedding = self.network.embedding.weight.data

    # When normalized, some words are duplicated. (Average the embeddings.)
    vec_counts = {}
    with open(embedding_file, encoding='utf-8') as f:
        for line in f:
            parsed = line.rstrip().split(' ')
            # Validate with a real exception: `assert` is stripped under -O,
            # and this checks external file contents, not internal invariants.
            if len(parsed) != embedding.size(1) + 1:
                raise RuntimeError(
                    'Embedding file line has %d columns, expected %d' %
                    (len(parsed), embedding.size(1) + 1))
            w = vocab.normalize(parsed[0])
            if w in words:
                vec = torch.Tensor([float(i) for i in parsed[1:]])
                if w not in vec_counts:
                    vec_counts[w] = 1
                    embedding[vocab[w]].copy_(vec)
                else:
                    # Sum now; average once all occurrences are seen.
                    logger.warning('WARN: Duplicate embedding found for %s' % w)
                    vec_counts[w] = vec_counts[w] + 1
                    embedding[vocab[w]].add_(vec)

    # Average only the rows that accumulated more than one vector.
    for w, c in vec_counts.items():
        if c > 1:
            embedding[vocab[w]].div_(c)

    # Guard against division by zero when no requested words were indexed.
    coverage = 100 * len(vec_counts) / len(words) if words else 0.0
    logger.info('Loaded %d embeddings (%.2f%%)' %
                (len(vec_counts), coverage))
def load_embeddings(self, words, embedding_file, dim=300):
    """Load pretrained embeddings for a given list of words, if they exist.

    Builds and returns a fresh ``(len(vocab), dim)`` numpy matrix; rows for
    words not found in the file are left as zeros. Duplicate entries (which
    can appear after normalization) are averaged.

    Args:
        words: iterable of tokens. Only those that are indexed in the
            dictionary are kept.
        embedding_file: path to text file of embeddings, space separated.
        dim: dimensionality of the embedding vectors (default: 300).

    Returns:
        numpy array of shape ``(len(vocab), dim)``.

    Raises:
        RuntimeError: if a line in the embedding file does not have
            ``dim + 1`` columns (token followed by ``dim`` floats).
    """
    # Restrict to words the dictionary actually indexes.
    words = {w for w in words if w in vocab}
    logger.info('Loading pre-trained embeddings for %d words from %s' %
                (len(words), embedding_file))
    embedding = np.zeros((len(vocab), dim))

    # When normalized, some words are duplicated. (Average the embeddings.)
    vec_counts = {}
    with open(embedding_file, encoding='utf-8') as f:
        for line in f:
            parsed = line.rstrip().split(' ')
            # Fail loudly with a clear message instead of an opaque
            # broadcast error on the assignment below.
            if len(parsed) != dim + 1:
                raise RuntimeError(
                    'Embedding file line has %d columns, expected %d' %
                    (len(parsed), dim + 1))
            w = vocab.normalize(parsed[0])
            if w in words:
                vec = np.array([float(i) for i in parsed[1:]],
                               dtype='float32')
                if w not in vec_counts:
                    vec_counts[w] = 1
                    embedding[vocab[w]] = vec
                else:
                    # Sum now; average once all occurrences are seen.
                    logger.warning('WARN: Duplicate embedding found for %s' % w)
                    vec_counts[w] = vec_counts[w] + 1
                    embedding[vocab[w]] += vec

    # Average only the rows that accumulated more than one vector.
    for w, c in vec_counts.items():
        if c > 1:
            embedding[vocab[w]] /= c

    # Guard against division by zero when no requested words were indexed.
    coverage = 100 * len(vec_counts) / len(words) if words else 0.0
    logger.info('Loaded %d embeddings (%.2f%%)' %
                (len(vec_counts), coverage))
    return embedding