def dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         outfile):
    '''
    Dump the full biLM activations for every sentence in dataset_file to an
    HDF5 file, one dataset per sentence keyed by its 0-based line index.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(
                    ops['lm_embeddings'],
                    feed_dict={ids_placeholder: char_ids}
                )
                ds = fout.create_dataset(
                    '{}'.format(sentence_id),
                    embeddings.shape[1:], dtype='float32',
                    data=embeddings[0, :, :, :]
                )
                sentence_id += 1
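# A minimal usage sketch, not part of the original code: all file names below
# are hypothetical placeholders for a trained biLM's vocabulary, options,
# weights, and a plain-text dataset with one sentence per line.
def _demo_dump_bilm_embeddings():
    dump_bilm_embeddings(
        'vocab.txt', 'dataset.txt', 'options.json', 'lm_weights.hdf5',
        'elmo_embeddings.hdf5')
    # Each line of dataset.txt is stored under its 0-based line index as a
    # (n_lm_layers, n_tokens, lm_dim) float32 array.
    with h5py.File('elmo_embeddings.hdf5', 'r') as fin:
        for sentence_id in sorted(fin.keys(), key=int):
            print(sentence_id, fin[sentence_id].shape)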
def dump_embeddings_from_dynamic_bilm(option_file, weight_file, word_file,
                                      char_file, data_file, output_file,
                                      sent_vec=False, sent_vec_type='last',
                                      cell_reset=False):
    """
    Compute ELMo embeddings with a DynamicLanguageModel and dump them to an
    HDF5 file, one dataset per input sentence.
    """
    with open(option_file, 'r') as fin:
        options = json.load(fin)

    # add one so that 0 is the mask value
    options['char_cnn']['n_characters'] += 1
    max_word_length = options['char_cnn']['max_characters_per_token']
    batcher = Batcher(word_file, char_file, max_word_length)

    # 1D: batch_size, 2D: time_steps, 3D: max_characters_per_token
    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = DynamicLanguageModel(options, weight_file, cell_reset=cell_reset)
    ops = model(ids_placeholder)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        print('Computing ELMo...')

        sentence_id = 0
        with open(data_file, 'r') as fin, h5py.File(output_file, 'w') as fout:
            for line in fin:
                if (sentence_id + 1) % 100 == 0:
                    print("%d" % (sentence_id + 1), flush=True, end=" ")

                sentence = line.rstrip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})

                # 1D: 3(ELMo layers), 2D: n_words, 3D: vector dim
                embeddings = embeddings[0, :, :, :]

                if sent_vec:
                    # mean-pool over tokens, then take the last layer or
                    # average the layers to get one sentence vector
                    embeddings = np.mean(embeddings, axis=1)
                    if sent_vec_type == 'last':
                        embeddings = embeddings[-1]
                    else:
                        embeddings = np.mean(embeddings, axis=0)
                else:
                    # 1D: n_words, 2D: 3(ELMo layers), 3D: vector dim
                    embeddings = np.transpose(embeddings, (1, 0, 2))

                fout.create_dataset(name=str(sentence_id), data=embeddings)
                sentence_id += 1

    print('Finished')
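# A minimal usage sketch, not part of the original code: a hypothetical helper
# showing how one sentence vector per input line might be dumped. All file
# paths are placeholders; per the function above, any sent_vec_type other than
# 'last' averages the three ELMo layers after mean-pooling over tokens.
def _demo_dump_sentence_vectors():
    dump_embeddings_from_dynamic_bilm(
        option_file='options.json',
        weight_file='lm_weights.hdf5',
        word_file='word_vocab.txt',
        char_file='char_vocab.txt',
        data_file='sentences.txt',
        output_file='sentence_vectors.hdf5',
        sent_vec=True,
        sent_vec_type='avg')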
def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    '''
    Given an input vocabulary file, dump all the token embeddings to the
    outfile.  The result can be used as the embedding_weight_file when
    constructing a BidirectionalLanguageModel.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']

    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])

    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for k in range(n_tokens):
            # look up the actual token string for vocabulary id k
            token = vocab.id_to_word(k)
            char_ids = batcher.batch_sentences([[token]])[0, 1, :].reshape(
                1, 1, -1)
            embeddings[k, :] = sess.run(
                embedding_op, feed_dict={ids_placeholder: char_ids}
            )

    with h5py.File(outfile, 'w') as fout:
        ds = fout.create_dataset(
            'embedding', embeddings.shape, dtype='float32',
            data=embeddings
        )
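# A minimal usage sketch, not part of the original code: the file names are
# hypothetical placeholders. The dump stores a single (n_tokens, embed_dim)
# matrix under the key 'embedding'; per the docstring above, the resulting
# file can then be passed as embedding_weight_file when constructing a
# token-level BidirectionalLanguageModel.
def _demo_dump_token_embeddings():
    dump_token_embeddings(
        'vocab.txt', 'options.json', 'lm_weights.hdf5',
        'token_embeddings.hdf5')
    with h5py.File('token_embeddings.hdf5', 'r') as fin:
        return fin['embedding'][...]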
def test_batch_sentences(self):
    batcher = Batcher(os.path.join(DATA_FIXTURES, 'vocab_test.txt'), 50)
    sentences = [['The', 'first', 'sentence'], ['Second', '.']]
    x_char_ids = batcher.batch_sentences(sentences)
    self.assertTrue((x_char_ids == self._expected_char_ids).all())
def _check_weighted_layer(self, l2_coef, do_layer_norm, use_top_only):
    # create the Batcher
    vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
    batcher = Batcher(vocab_file, 50)

    # load the model
    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    character_ids = tf.placeholder('int32', (None, None, 50))
    model = BidirectionalLanguageModel(
        options_file, weight_file, max_batch_size=4)
    bilm_ops = model(character_ids)

    weighted_ops = []
    for k in range(2):
        ops = weight_layers(str(k), bilm_ops, l2_coef=l2_coef,
                            do_layer_norm=do_layer_norm,
                            use_top_only=use_top_only)
        weighted_ops.append(ops)

    # initialize
    self.sess.run(tf.global_variables_initializer())

    n_expected_trainable_weights = 2 * (1 + int(not use_top_only))
    self.assertEqual(len(tf.trainable_variables()),
                     n_expected_trainable_weights)

    # and one regularizer per weighted layer
    n_expected_reg_losses = 2 * int(not use_top_only)
    self.assertEqual(
        len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
        n_expected_reg_losses,
    )

    # Set the variables.
    weights = [[np.array([0.1, 0.3, 0.5]), np.array([1.1])],
               [np.array([0.2, 0.4, 0.6]), np.array([0.88])]]
    for k in range(2):
        with tf.variable_scope('', reuse=True):
            if not use_top_only:
                W = tf.get_variable('{}_ELMo_W'.format(k))
                _ = self.sess.run([W.assign(weights[k][0])])
            gamma = tf.get_variable('{}_ELMo_gamma'.format(k))
            _ = self.sess.run([gamma.assign(weights[k][1])])

    # make some data
    sentences = [
        ['The', 'first', 'sentence', '.'],
        ['The', 'second'],
        ['Third']
    ]
    X_chars = batcher.batch_sentences(sentences)

    ops = model(character_ids)
    lm_embeddings, mask, weighted0, weighted1 = self.sess.run(
        [ops['lm_embeddings'], ops['mask'],
         weighted_ops[0]['weighted_op'], weighted_ops[1]['weighted_op']],
        feed_dict={character_ids: X_chars}
    )
    actual_elmo = [weighted0, weighted1]

    # check the mask first
    expected_mask = [[True, True, True, True],
                     [True, True, False, False],
                     [True, False, False, False]]
    self.assertTrue((expected_mask == mask).all())

    # Now compute the actual weighted layers
    for k in range(2):
        normed_weights = np.exp(weights[k][0] + 1.0 / 3) / np.sum(
            np.exp(weights[k][0] + 1.0 / 3))
        # masked layer normalization
        expected_elmo = np.zeros((3, 4, lm_embeddings.shape[-1]))

        if not use_top_only:
            for j in range(3):  # number of LM layers
                if do_layer_norm:
                    mean = np.mean(lm_embeddings[:, j, :, :][mask])
                    std = np.std(lm_embeddings[:, j, :, :][mask])
                    normed_lm_embed = (lm_embeddings[:, j, :, :] - mean) / (
                        std + 1E-12)
                    expected_elmo += normed_weights[j] * normed_lm_embed
                else:
                    expected_elmo += normed_weights[j] * lm_embeddings[
                        :, j, :, :]
        else:
            expected_elmo += lm_embeddings[:, -1, :, :]

        # the scale parameter
        expected_elmo *= weights[k][1]
        self.assertTrue(
            np.allclose(expected_elmo, actual_elmo[k], atol=1e-6)
        )
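# A minimal sketch, not part of the original tests, of the pattern the test
# above exercises: run the biLM once, then collapse its layers with
# weight_layers to get a single ELMo tensor for a downstream task. The fixture
# arguments are placeholders supplied by the caller.
def _demo_weight_layers(options_file, weight_file, vocab_file, sentences):
    batcher = Batcher(vocab_file, 50)
    character_ids = tf.placeholder('int32', (None, None, 50))
    model = BidirectionalLanguageModel(options_file, weight_file)
    bilm_ops = model(character_ids)
    elmo_input = weight_layers('input', bilm_ops, l2_coef=0.001)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        char_ids = batcher.batch_sentences(sentences)
        # (batch_size, max_tokens, lm_dim) task-ready ELMo representations
        return sess.run(elmo_input['weighted_op'],
                        feed_dict={character_ids: char_ids})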
def test_bilm(self):
    sentences, expected_lm_embeddings = _load_sentences_embeddings()

    # create the Batcher
    vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
    batcher = Batcher(vocab_file, 50)

    # load the model
    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    character_ids = tf.placeholder('int32', (None, None, 50))
    model = BidirectionalLanguageModel(options_file, weight_file,
                                       max_batch_size=4)

    # get the ops to compute embeddings
    ops = model(character_ids)

    # initialize
    self.sess.run(tf.global_variables_initializer())

    # We shouldn't have any trainable variables
    self.assertEqual(len(tf.trainable_variables()), 0)

    # will run 10 batches of 3 sentences
    for i in range(10):
        # make a batch of sentences
        batch_sentences = []
        for k in range(3):
            sentence = sentences[k][i].strip().split()
            batch_sentences.append(sentence)

        X = batcher.batch_sentences(batch_sentences)

        lm_embeddings, lengths = self.sess.run(
            [ops['lm_embeddings'], ops['lengths']],
            feed_dict={character_ids: X})

        # the actual (unpadded) lengths of the sentences
        actual_lengths = [len(sent) for sent in batch_sentences]
        self.assertEqual(actual_lengths, list(lengths))

        # get the expected embeddings and compare!
        expected_y = [expected_lm_embeddings[k][i] for k in range(3)]

        for k in range(3):
            self.assertTrue(
                np.allclose(lm_embeddings[k, 2, :lengths[k], :],
                            expected_y[k], atol=1.0e-6))

    # Finally, check that the states are being updated properly.
    # All batches were size=3, so last element of states should always
    # be zero.
    third_states = []
    for direction in ['forward', 'backward']:
        states = self.sess.run(
            model._graphs[character_ids].lstm_init_states[direction])
        for i in range(2):
            for state in states[i]:
                self.assertTrue(np.sum(np.abs(state[-1, :])) < 1e-7)
                third_states.append(state[2, :])

    # Run a batch with size=2, the third state should not have been updated
    _ = self.sess.run(
        ops['lm_embeddings'],
        feed_dict={character_ids: np.ones((2, 5, 50), dtype=np.int32)})

    k = 0
    for direction in ['forward', 'backward']:
        states = self.sess.run(
            model._graphs[character_ids].lstm_init_states[direction])
        for i in range(2):
            for state in states[i]:
                self.assertTrue(
                    np.allclose(third_states[k], state[2, :], atol=1e-6))
                k += 1