import codecs
import logging

import numpy as np

# Project-internal dependencies. The Lexicon path follows the docstring of
# fromFile below; the module paths for Embedding and generateVector are
# assumptions and may need adjusting to the actual package layout:
# from data.Lexicon import Lexicon
# from data.Embedding import Embedding, generateVector


def load_embedding(opts, paddingSym):
    if opts["lexicon"]:
        # A plain-text lexicon (one word per line) plus a NumPy matrix of
        # vectors whose rows align with that lexicon.
        emb = np.load(opts["word_embedding"])
        lexicon = Lexicon(unknownSymbol=None)

        with codecs.open(opts["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)
    elif opts["word_embedding"]:
        # TODO: allow embeddings to be used together with other representations
        lexicon, embedding = Embedding.fromFile(opts['word_embedding'], 'UUUKNNN',
                                                hasHeader=False, paddingSym=paddingSym)
    else:
        # Fail loudly instead of returning unbound names when neither
        # source is configured.
        raise ValueError("opts must set either 'lexicon' or 'word_embedding'")

    return lexicon, embedding
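
# Usage sketch (illustrative; the file names and padding symbol are
# hypothetical, not part of the original module):
#
#     # Separate lexicon file plus a matrix saved with np.save:
#     opts = {"lexicon": "lexicon.txt", "word_embedding": "vectors.npy"}
#     lexicon, embedding = load_embedding(opts, paddingSym="</s>")
#
#     # word2vec-style text file only, no separate lexicon:
#     opts = {"lexicon": None, "word_embedding": "vectors.txt"}
#     lexicon, embedding = load_embedding(opts, paddingSym="</s>")
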
# Referenced above as Embedding.fromFile, so in the original module this is
# presumably a @staticmethod of the Embedding class.
def fromFile(file, unknownSymbol, lexiconName=None, hasHeader=True, paddingSym=None):
    """
    Creates a lexicon and an embedding from a word2vec-style text file.

    :param file: path of the file
    :param unknownSymbol: the string that represents unknown words
    :param lexiconName: optional name of the created lexicon
    :param hasHeader: whether the first line contains the number of words
        and the embedding size
    :param paddingSym: optional padding symbol; added with a zero vector
        if it does not appear in the file
    :return: (data.Lexicon.Lexicon, Embedding)
    """
    log = logging.getLogger(__name__)
    fVec = codecs.open(file, 'r', 'utf-8')

    # Read the number of words in the dictionary and the embedding size
    if hasHeader:
        nmWords, embeddingSizeStr = fVec.readline().strip().split(" ")
        embeddingSize = int(embeddingSizeStr)
    else:
        embeddingSize = None

    lexicon = Lexicon(unknownSymbol, lexiconName)

    # The empty list at index 0 is a placeholder for the unknown symbol's
    # vector. It is replaced later, either by a vector found in the file or
    # by a randomly generated one.
    vectors = [[]]
    nmEmptyWords = 0

    for line in fVec:
        splitLine = line.rstrip().split(u' ')
        word = splitLine[0]

        if len(word) == 0:
            log.warning(
                "Found an empty string in the embedding file. "
                "This vector will be thrown out.")
            nmEmptyWords += 1
            continue

        vec = [float(num) for num in splitLine[1:]]

        if word == unknownSymbol:
            if len(vectors[0]) != 0:
                raise Exception("The unknown symbol was already inserted.")
            vectors[0] = vec
        else:
            lexicon.put(word)
            vectors.append(vec)

    expected_size = lexicon.getLen() - 1 + nmEmptyWords

    if len(vectors[0]) == 0:
        # The file did not provide a vector for the unknown symbol:
        # generate a random one.
        if embeddingSize is None:
            embeddingSize = len(vectors[-1])
        vectors[0] = generateVector(embeddingSize)
        expected_size += 1

    # Sanity check: with a header, the declared word count must match the
    # number of vectors read (counting discarded empty words).
    if hasHeader:
        if int(nmWords) != expected_size:
            raise Exception(
                "The size of the lexicon is different from the number of vectors")

    if paddingSym is None:
        paddingIdx = None
    else:
        if not lexicon.exist(paddingSym):
            paddingIdx = lexicon.put(paddingSym)
            # len(vectors[0]) is used instead of embeddingSize so this also
            # works when there was no header and the unknown vector came
            # from the file itself (embeddingSize would still be None then).
            vectors.append([0.0] * len(vectors[0]))
        else:
            paddingIdx = lexicon.getLexiconIndex(paddingSym)

    fVec.close()

    return lexicon, Embedding(lexicon, vectors, paddingIdx=paddingIdx)
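
# Usage sketch (illustrative). A headerless word2vec-style text file read
# by fromFile looks like this, one word followed by its vector components
# per line (the contents are made up):
#
#     the 0.1 0.2 0.3 0.4
#     UUUKNNN 0.0 0.1 0.0 0.1
#     cat 0.5 0.4 0.3 0.2
#
# The file name and padding symbol below are hypothetical; in the original
# module this would be invoked as Embedding.fromFile, mirroring
# load_embedding above.
if __name__ == "__main__":
    lexicon, embedding = fromFile("vectors.txt", "UUUKNNN",
                                  hasHeader=False, paddingSym="</s>")
    print(lexicon.getLen(), "entries in the lexicon")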