import os
import sys
import json
import shutil
import codecs

import numpy as np
import lmdb
import tensorflow as tf
from tqdm import tqdm

# BERT helpers (extracted features only, see make_BERT below)
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

# The ELMo utilities (Batcher, BidirectionalLanguageModel, weight_layers) are
# assumed to be imported from the bilm package bundled with the project. The
# following names are also used below but defined elsewhere in the package:
# - fastText, fasttext_support: optional fastText .bin support
# - map_size: maximum size of the LMDB databases
# - ELMo_embed_size, BERT_embed_size, BERT_sentence_size: embedding/model dimensions
# - _serialize_pickle, _deserialize_pickle: vector (de)serialization helpers
# - _fetch_header_if_available, open_embedding_file, list_digest, download_file
class Embeddings(object):

    def __init__(self, name, path='./embedding-registry.json', lang='en',
                 extension='vec', use_ELMo=False, use_BERT=False,
                 use_cache=True, load=True):
        self.name = name
        self.embed_size = 0
        self.static_embed_size = 0
        self.vocab_size = 0
        self.model = {}
        self.registry = self._load_embedding_registry(path)
        self.lang = lang
        self.extension = extension
        self.embedding_lmdb_path = None
        if self.registry is not None:
            self.embedding_lmdb_path = self.registry["embedding-lmdb-path"]
        self.env = None
        if load:
            self.make_embeddings_simple(name)
        self.static_embed_size = self.embed_size
        self.bilm = None
        self.use_cache = use_cache

        # below, init for using ELMo embeddings
        self.use_ELMo = use_ELMo
        if use_ELMo:
            self.make_ELMo()
            self.embed_size = ELMo_embed_size + self.embed_size
            description = self.get_description('elmo-' + self.lang)
            self.env_ELMo = None
            if description and description["cache-training"] and self.use_cache:
                self.embedding_ELMo_cache = os.path.join(description["path-cache"], "cache")
                # clean possible remaining cache
                self.clean_ELMo_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, map_size=map_size)

        # below, init for using BERT embeddings (extracted features only, not fine-tuning),
        # similar to ELMo for this usage
        self.use_BERT = use_BERT
        if use_BERT:
            # to avoid issues with the tf graph and threads, the class maintains
            # its own graph and session
            #self.session = tf.Session()
            self.graph = tf.get_default_graph()
            #self.session.run(tf.global_variables_initializer())
            self.make_BERT()
            self.embed_size = BERT_embed_size + self.embed_size
            description = self.get_description('bert-base-' + self.lang)
            self.env_BERT = None
            if description and description["cache-training"] and self.use_cache:
                self.embedding_BERT_cache = os.path.join(description["path-cache"], "cache")
                # clean possible remaining cache
                self.clean_BERT_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_BERT = lmdb.open(self.embedding_BERT_cache, map_size=map_size)

    def __getattr__(self, name):
        return getattr(self.model, name)

    def _load_embedding_registry(self, path='./embedding-registry.json'):
        """
        Load the description of available embeddings. Each description provides a name,
        a file path (used only if necessary) and an embeddings type (to take into
        account small variations of format).
        """
        registry_json = open(path).read()
        return json.loads(registry_json)

    def make_embeddings_simple_in_memory(self, name="fasttext-crawl"):
        nbWords = 0
        print('loading embeddings...')
        begin = True
        description = self.get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            self.lang = description["lang"]
            print("path:", embeddings_path)
            if self.extension == 'bin':
                self.model = fastText.load_model(embeddings_path)
                nbWords = len(self.model.get_words())
                self.embed_size = self.model.get_dimension()
            else:
                with open(embeddings_path, encoding='utf8') as f:
                    for line in f:
                        line = line.strip()
                        line = line.split(' ')
                        if begin:
                            begin = False
                            # we parse the header, if any
                            nb_words, embed_size = _fetch_header_if_available(line)
                            if nb_words > 0 and embed_size > 0:
                                nbWords = nb_words
                                self.embed_size = embed_size
                                continue
                        word = line[0]
                        vector = np.array([float(val) for val in line[1:len(line)]], dtype='float32')
                        if self.embed_size == 0:
                            self.embed_size = len(vector)
                        self.model[word] = vector
                if nbWords == 0:
                    nbWords = len(self.model)
            print('embeddings loaded for', nbWords, "words and", self.embed_size, "dimensions")

    def make_embeddings_lmdb(self, name="fasttext-crawl"):
        print('\nCompiling embeddings... (this is done only one time per embeddings at first usage)')
        description = self.get_description(name)
        if description is None:
            print('\nNo description found in embeddings registry for embeddings', name)
            return

        # the following method will possibly download the embedding file if not available locally
        embeddings_path = self.get_embedding_path(description)
        if embeddings_path is None:
            print('\nCould not locate a usable resource for embeddings', name)
            return

        self.load_embeddings_from_file(embeddings_path)

        # cleaning possible downloaded embeddings
        self.clean_downloads()
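    # For reference, the registry file read above is expected to look roughly like
    # the sketch below. This is an illustrative assumption reconstructed from the
    # fields accessed in this class, not an authoritative schema; all names and
    # paths are placeholders:
    #
    # {
    #     "embedding-lmdb-path": "data/db",
    #     "embedding-download-path": "data/download",
    #     "embeddings": [
    #         {"name": "glove-840B", "format": "vec", "lang": "en",
    #          "path": "/data/glove.840B.300d.txt", "url": "..."}
    #     ],
    #     "embeddings-contextualized": [
    #         {"name": "elmo-en", "lang": "en", "path-vocab": "...",
    #          "path-config": "...", "path_weights": "...",
    #          "path-cache": "data/cache", "cache-training": true}
    #     ],
    #     "transformers": [ ... ]
    # }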
    def load_embeddings_from_file(self, embeddings_path):
        begin = True
        nbWords = 0
        txn = self.env.begin(write=True)
        i = 0
        nb_lines = 0

        # read the number of lines first
        embedding_file = open_embedding_file(embeddings_path)
        if embedding_file is None:
            print("Error: could not open embeddings file", embeddings_path)
            return
        for line in embedding_file:
            nb_lines += 1
        embedding_file.close()

        embedding_file = open_embedding_file(embeddings_path)
        for line in tqdm(embedding_file, total=nb_lines):
            line = line.decode()
            line = line.split(' ')
            if begin:
                begin = False
                nb_words, embed_size = _fetch_header_if_available(line)
                if nb_words > 0 and embed_size > 0:
                    nbWords = nb_words
                    self.embed_size = embed_size
                    continue
            word = line[0]
            try:
                if line[len(line)-1] == '\n':
                    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                else:
                    vector = np.array([float(val) for val in line[1:len(line)]], dtype='float32')
            except ValueError:
                # malformed line: report it and skip it
                print(len(line))
                print(line[1:len(line)])
                continue
            if self.embed_size == 0:
                self.embed_size = len(vector)
            if len(word.encode(encoding='UTF-8')) < self.env.max_key_size():
                txn.put(word.encode(encoding='UTF-8'), _serialize_pickle(vector))
            i += 1
        embedding_file.close()
        txn.commit()

        if nbWords == 0:
            nbWords = i
        self.vocab_size = nbWords
        print('embeddings loaded for', nbWords, "words and", self.embed_size, "dimensions")

    def clean_downloads(self):
        # cleaning possible downloaded embeddings
        for filename in os.listdir(self.registry['embedding-download-path']):
            file_path = os.path.join(self.registry['embedding-download-path'], filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

    def make_embeddings_simple(self, name="fasttext-crawl"):
        description = self.get_description(name)
        if description is not None:
            self.extension = description["format"]

        if self.extension == "bin":
            if fasttext_support == True:
                print("embeddings are in .bin format, so they will be loaded in memory...")
                self.make_embeddings_simple_in_memory(name)
            elif not (sys.platform == 'linux' or sys.platform == 'darwin'):
                raise ValueError('FastText .bin format not supported on your platform')
            else:
                raise ValueError('See the documentation for more information on how to install FastText .bin support')
        elif self.embedding_lmdb_path is None or self.embedding_lmdb_path == "None":
            print("embedding_lmdb_path is not specified in the embeddings registry, so the embeddings will be loaded in memory...")
            self.make_embeddings_simple_in_memory(name)
        else:
            # if the path to the lmdb database files does not exist, create it
            if not os.path.isdir(self.embedding_lmdb_path):
                # conservative check (likely unnecessary)
                if not os.path.exists(self.embedding_lmdb_path):
                    os.makedirs(self.embedding_lmdb_path)

            # check if the lmdb database exists
            envFilePath = os.path.join(self.embedding_lmdb_path, name)
            load_db = True
            if os.path.isdir(envFilePath):
                description = self.get_description(name)
                if description is not None:
                    self.lang = description["lang"]

                # open the database in read mode
                self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=4)
                if self.env:
                    # we need to set self.embed_size and self.vocab_size
                    with self.env.begin() as txn:
                        stats = txn.stat()
                        size = stats['entries']
                        self.vocab_size = size

                    with self.env.begin() as txn:
                        cursor = txn.cursor()
                        for key, value in cursor:
                            vector = _deserialize_pickle(value)
                            self.embed_size = vector.shape[0]
                            break
                        cursor.close()

                    if self.vocab_size > 100 and self.embed_size > 10:
                        # the lmdb database exists and looks valid
                        load_db = False

                        # no idea why, but we need to close and reopen the environment to avoid
                        # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
                        # when opening a new transaction!
                        self.env.close()
                        self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=2)

            if load_db:
                # create and load the database in write mode
                self.env = lmdb.open(envFilePath, map_size=map_size)
                self.make_embeddings_lmdb(name)
    def make_ELMo(self):
        # Location of the pretrained BiLM for the specified language
        # TBD: check if the ELMo language resources are present
        description = self.get_description('elmo-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            vocab_file = description["path-vocab"]
            options_file = description["path-config"]
            weight_file = description["path_weights"]

            print('init ELMo')

            # create a Batcher to map text to character ids
            self.batcher = Batcher(vocab_file, 50)

            # build the biLM graph
            self.bilm = BidirectionalLanguageModel(self.lang, options_file, weight_file)

            # input placeholders to the biLM
            self.character_ids = tf.placeholder('int32', shape=(None, None, 50))

            with tf.variable_scope(self.lang, reuse=tf.AUTO_REUSE):
                # the reuse=True scope reuses weights from the whole context
                self.embeddings_op = self.bilm(self.character_ids)
                self.elmo_input = weight_layers('input', self.embeddings_op, l2_coef=0.0)

    def make_BERT(self):
        # location of the BERT model
        description = self.get_description('bert-base-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            config_file = description["path-config"]
            weight_file = description["path-weights"]
            vocab_file = description["path-vocab"]

            print('init BERT')

            # load the pretrained model
            with self.graph.as_default():
                # there are different typical pooling strategies for getting BERT features:
                # - concatenation of the 4 last layers (the one from the original BERT paper;
                #   BERT_embed_size is then 3072)
                # - last layer (BERT_embed_size is 768)
                # - average of the 4 last layers (BERT_embed_size is 768)
                # - sum of the 4 last layers (BERT_embed_size is 768)
                self.bert_model = load_trained_model_from_checkpoint(config_file, weight_file, output_layer_num=4)
                self.bert_model.summary(line_length=120)
                self.bert_model._make_predict_function()

            # init the tokenizer
            token_dict = {}
            with codecs.open(vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    token_dict[token] = len(token_dict)
            print('token_dict size:', len(token_dict))
            self.bert_tokenizer = Tokenizer(token_dict, cased=True)

    def get_sentence_vector_only_ELMo(self, token_list):
        """
        Return the ELMo embeddings only for a full sentence
        """
        if not self.use_ELMo:
            print("Warning: ELMo embeddings requested but the embeddings object was not initialised for ELMo")
            return

        # create batches of data
        local_token_ids = self.batcher.batch_sentences(token_list)
        max_size_sentence = local_token_ids[0].shape[0]

        # check the lmdb cache
        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is not None:
            return elmo_result

        with tf.Session() as sess:
            # oddly, for this cpu is faster than gpu (1080Ti!)
            with tf.device("/cpu:0"):
                # it is necessary to initialize variables once before running inference
                sess.run(tf.global_variables_initializer())

                # compute the ELMo representations (twice, as a heavy warm-up)
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})

                # cache the computation
                self.cache_ELMo_lmdb_vector(token_list, elmo_result)
        return elmo_result
    def get_sentence_vector_with_ELMo(self, token_list):
        """
        Return a concatenation of the standard embeddings (e.g. GloVe) and the
        ELMo embeddings for a full sentence
        """
        if not self.use_ELMo:
            print("Warning: ELMo embeddings requested but the embeddings object was not initialised for ELMo")
            return

        local_token_ids = self.batcher.batch_sentences(token_list)
        max_size_sentence = local_token_ids[0].shape[0]

        # check the lmdb cache
        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is None:
            with tf.Session() as sess:
                # oddly, for this cpu is faster than gpu (1080Ti!)
                with tf.device("/cpu:0"):
                    # it is necessary to initialize variables once before running inference
                    sess.run(tf.global_variables_initializer())

                    # compute the ELMo representations (twice, as a heavy warm-up)
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})

                    # cache the computation
                    self.cache_ELMo_lmdb_vector(token_list, elmo_result)

        concatenated_result = np.zeros((len(token_list), max_size_sentence-2, self.embed_size), dtype=np.float32)
        for i in range(0, len(token_list)):
            for j in range(0, len(token_list[i])):
                concatenated_result[i][j] = np.concatenate(
                    (elmo_result[i][j], self.get_word_vector(token_list[i][j]).astype('float32')), )
        return concatenated_result

    def get_sentence_vector_only_BERT(self, token_list):
        """
        Return the BERT extracted embeddings only for a full sentence
        """
        if not self.use_BERT:
            print("Warning: BERT embeddings requested but the embeddings object was not initialised for BERT")
            return

        max_size_token_list = 0
        for i, sentence in enumerate(token_list):
            if len(sentence) > max_size_token_list:
                max_size_token_list = len(sentence)

        # retokenize with the BERT tokenizer
        max_size = BERT_sentence_size
        bert_results = np.zeros((len(token_list), max_size, BERT_embed_size), dtype=np.float32)
        for i, sentence in enumerate(token_list):
            local_text = " ".join(sentence)
            local_tokens = self.bert_tokenizer.tokenize(local_text)

            bert_result = self.get_BERT_lmdb_vector(sentence)
            if bert_result is None:
                indices, segments = self.bert_tokenizer.encode(local_text, max_len=max_size)
                with self.graph.as_default():
                    bert_result = self.bert_model.predict([np.array([indices]), np.array([segments])])[0]
                # cache the computation
                if bert_result is not None:
                    self.cache_BERT_lmdb_vector(sentence, bert_result)

            # Realign the BERT tokenization with the provided tokenization. Normally the
            # BERT segmenter always over-segments compared to the DeLFT segmenter.
            # There are two obvious possibilities to combine subtoken embeddings into
            # token embeddings: either take the embeddings of the last subtoken, or use
            # the average vector of the subtokens.
            new_bert_result = np.zeros((max_size, BERT_embed_size), dtype=np.float32)
            token_tensor = []
            tid = 0
            buffer = ''
            for j, t in enumerate(local_tokens):
                if j >= max_size:
                    break
                if t == '[CLS]' or t == '[SEP]':
                    continue
                if t.startswith('##'):
                    t = t[2:]
                buffer += t
                token_tensor.append(bert_result[j])
                if buffer == sentence[tid]:
                    # average vector of the subtokens
                    new_bert_result[tid] = np.stack(token_tensor).mean(axis=0)
                    # or, alternatively, the last subtoken vector:
                    #new_bert_result[tid] = token_tensor[-1]
                    token_tensor = []
                    buffer = ''
                    tid += 1
            bert_result = new_bert_result
            if bert_result is not None:
                bert_results[i] = bert_result

        # we need to squeeze the vectors to max_size_token_list
        squeezed_bert_results = np.zeros((len(token_list), max_size_token_list, BERT_embed_size), dtype=np.float32)
        for i, sentence in enumerate(token_list):
            squeezed_bert_results[i] = bert_results[i][:max_size_token_list]
        return squeezed_bert_results
    def get_sentence_vector_with_BERT(self, token_list):
        """
        Return a concatenation of the standard embeddings (e.g. GloVe) and the
        BERT extracted embeddings for a full sentence
        """
        if not self.use_BERT:
            print("Warning: BERT embeddings requested but the embeddings object was not initialised for BERT")
            return

        max_size_token_list = 0
        for i, sentence in enumerate(token_list):
            if len(sentence) > max_size_token_list:
                max_size_token_list = len(sentence)

        squeezed_bert_results = self.get_sentence_vector_only_BERT(token_list)

        concatenated_squeezed_result = np.zeros((len(token_list), max_size_token_list, self.embed_size), dtype=np.float32)
        for i, sentence in enumerate(token_list):
            for j in range(0, len(token_list[i])):
                concatenated_squeezed_result[i][j] = np.concatenate(
                    (squeezed_bert_results[i][j], self.get_word_vector(token_list[i][j]).astype('float32')), )
        return concatenated_squeezed_result

    def get_description(self, name):
        for emb in self.registry["embeddings"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["embeddings-contextualized"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["transformers"]:
            if emb["name"] == name:
                return emb
        return None

    def get_word_vector(self, word):
        """
        Get the static embeddings (e.g. GloVe) for a given token
        """
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.env is None or self.extension == 'bin':
            # db not available or embeddings in bin format, the embeddings
            # should be available in memory (normally!)
            return self.get_word_vector_in_memory(word)
        try:
            with self.env.begin() as txn:
                vector = txn.get(word.encode(encoding='UTF-8'))
                if vector:
                    word_vector = _deserialize_pickle(vector)
                    vector = None
                else:
                    word_vector = np.zeros((self.static_embed_size,), dtype=np.float32)
                    # alternatively, initialize with random negative values
                    #word_vector = np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
                    # alternatively, use the fastText OOV ngram possibilities (if ngrams are available)
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening a new transaction!
            self.env.close()
            envFilePath = os.path.join(self.embedding_lmdb_path, self.name)
            self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=2, lock=False)
            return self.get_word_vector(word)
        return word_vector

    def get_ELMo_lmdb_vector(self, token_list, max_size_sentence):
        """
        Try to get the ELMo embeddings for a sequence cached in LMDB
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        try:
            ELMo_vector = np.zeros((len(token_list), max_size_sentence-2, ELMo_embed_size), dtype='float32')
            with self.env_ELMo.begin() as txn:
                for i in range(0, len(token_list)):
                    # get a hash for the token_list
                    the_hash = list_digest(token_list[i])
                    vector = txn.get(the_hash.encode(encoding='UTF-8'))
                    if vector:
                        # adapt the expected shape/padding
                        local_embeddings = _deserialize_pickle(vector)
                        if local_embeddings.shape[0] > max_size_sentence-2:
                            # squeeze the extra padding space
                            ELMo_vector[i] = local_embeddings[:max_size_sentence-2, ]
                        elif local_embeddings.shape[0] == max_size_sentence-2:
                            # exact fit
                            ELMo_vector[i] = local_embeddings
                        else:
                            # fill the missing space with padding
                            filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]+2), ELMo_embed_size), dtype='float32')
                            ELMo_vector[i] = np.concatenate((local_embeddings, filler))
                        vector = None
                    else:
                        return None
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening a new transaction!
            self.env_ELMo.close()
            self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False)
            return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        return ELMo_vector

    def get_BERT_lmdb_vector(self, sentence):
        """
        Try to get the BERT extracted embeddings for a sequence cached in LMDB
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        try:
            BERT_vector = np.zeros((BERT_sentence_size, BERT_embed_size), dtype='float32')
            with self.env_BERT.begin() as txn:
                # get a hash for the sentence
                the_hash = list_digest(sentence)
                vector = txn.get(the_hash.encode(encoding='UTF-8'))
                if vector:
                    BERT_vector = _deserialize_pickle(vector)
                    vector = None
                else:
                    return None
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening a new transaction!
            self.env_BERT.close()
            self.env_BERT = lmdb.open(self.embedding_BERT_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False)
            return self.get_BERT_lmdb_vector(sentence)
        return BERT_vector
    def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
        """
        Cache in LMDB the ELMo embeddings for a given sequence
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        txn = self.env_ELMo.begin(write=True)
        for i in range(0, len(token_list)):
            # get a hash for the token_list
            the_hash = list_digest(token_list[i])
            txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(ELMo_vector[i]))
        txn.commit()

    def cache_BERT_lmdb_vector(self, sentence, BERT_vector):
        """
        Cache in LMDB the BERT embeddings for a given sequence
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        txn = self.env_BERT.begin(write=True)
        # get a hash for the sentence
        the_hash = list_digest(sentence)
        txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(BERT_vector))
        txn.commit()

    def clean_ELMo_cache(self):
        """
        Delete the ELMo embeddings cache, this normally takes place after the
        completion of a training
        """
        if self.env_ELMo is None:
            # db cache not available, nothing to clean
            return
        self.env_ELMo.close()
        self.env_ELMo = None
        for file in os.listdir(self.embedding_ELMo_cache):
            file_path = os.path.join(self.embedding_ELMo_cache, file)
            if os.path.isfile(file_path):
                os.remove(file_path)
        os.rmdir(self.embedding_ELMo_cache)

    def clean_BERT_cache(self):
        """
        Delete the BERT embeddings cache, this normally takes place after the
        completion of a training
        """
        # if the cache subdirectory does not exist, we create it
        if not os.path.exists(self.embedding_BERT_cache):
            os.makedirs(self.embedding_BERT_cache)
            return
        if self.env_BERT is None:
            # db cache not available, nothing to clean
            return
        self.env_BERT.close()
        self.env_BERT = None
        for file in os.listdir(self.embedding_BERT_cache):
            file_path = os.path.join(self.embedding_BERT_cache, file)
            if os.path.isfile(file_path):
                os.remove(file_path)
        os.rmdir(self.embedding_BERT_cache)

    def get_word_vector_in_memory(self, word):
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.extension == 'bin':
            return self.model.get_word_vector(word)
        if word in self.model:
            return self.model[word]
        # for an unknown word, we use a vector filled with 0.0
        return np.zeros((self.static_embed_size,), dtype=np.float32)
        # alternatively, initialize with random negative values
        #return np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
        # alternatively, use the fastText OOV ngram possibilities (if ngrams are available)

    def get_embedding_path(self, description):
        embeddings_path = None
        if "path" in description:
            embeddings_path = description["path"]
        self.lang = description["lang"]
        if embeddings_path is None or not os.path.isfile(embeddings_path):
            print("error: embedding path for", description['name'], "is not valid", embeddings_path)
            if "url" in description and len(description["url"]) > 0:
                url = description["url"]
                download_path = self.registry['embedding-download-path']
                # if the download path does not exist, we create it
                if not os.path.isdir(download_path):
                    try:
                        os.mkdir(download_path)
                    except OSError:
                        print("Creation of the download directory", download_path, "failed")

                print("Downloading resource file for", description['name'], "...")
                embeddings_path = download_file(url, download_path)
                if embeddings_path is not None and os.path.isfile(embeddings_path):
                    print("Download successful:", embeddings_path)
            else:
                print("no download url available for this embeddings resource, please review the embedding registry for",
                      description['name'])
        return embeddings_path
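
# A minimal usage sketch (not part of the original module). It assumes the
# embedding registry declares an entry named "glove-840B" and, for the ELMo
# case, an "elmo-en" entry with valid paths; both names are placeholders.
# get_word_vector() returns one static vector per token, while
# get_sentence_vector_with_ELMo() returns, per token, a vector of size
# embed_size (static + ELMo dimensions), as built by the methods above.
if __name__ == '__main__':
    embeddings = Embeddings("glove-840B")
    print("static embedding size:", embeddings.embed_size)

    # static lookup for a single token
    vector = embeddings.get_word_vector("university")
    print("'university' ->", vector.shape)

    # with use_ELMo=True, sentence-level lookups concatenate static and ELMo vectors:
    #embeddings = Embeddings("glove-840B", use_ELMo=True)
    #batch = [["We", "test", "ELMo", "embeddings", "."]]
    #print(embeddings.get_sentence_vector_with_ELMo(batch).shape)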