Example #1
File: model.py Project: sjyttkl/ELMO
def dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         outfile):
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})
                ds = fout.create_dataset('{}'.format(sentence_id),
                                         embeddings.shape[1:],
                                         dtype='float32',
                                         data=embeddings[0, :, :, :])

                sentence_id += 1
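A minimal usage sketch, not part of the original project: the file paths below are placeholders, and the shape comment follows from the slice embeddings[0, :, :, :] written by the function above.

import h5py

# Placeholder paths; dataset_file holds one whitespace-tokenized sentence per line.
dump_bilm_embeddings(
    vocab_file='vocab.txt',
    dataset_file='sentences.txt',
    options_file='options.json',
    weight_file='lm_weights.hdf5',
    outfile='elmo_embeddings.hdf5',
)

# Datasets are keyed by the sentence id as a string; each holds
# (n_lm_layers, n_tokens, embedding_dim) activations for one sentence.
with h5py.File('elmo_embeddings.hdf5', 'r') as fin:
    first_sentence = fin['0'][...]
    print(first_sentence.shape)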
Example #2
def dump_embeddings_from_dynamic_bilm(option_file,
                                      weight_file,
                                      word_file,
                                      char_file,
                                      data_file,
                                      output_file,
                                      sent_vec=False,
                                      sent_vec_type='last',
                                      cell_reset=False):
    """
    Get elmo embeddings
    """

    with open(option_file, 'r') as fin:
        options = json.load(fin)

    # add one so that 0 is the mask value
    options['char_cnn']['n_characters'] += 1

    max_word_length = options['char_cnn']['max_characters_per_token']
    batcher = Batcher(word_file, char_file, max_word_length)

    # 1D: batch_size, 2D: time_steps, 3D: max_characters_per_token
    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = DynamicLanguageModel(options, weight_file, cell_reset=cell_reset)
    ops = model(ids_placeholder)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        print('Computing ELMo...')
        sentence_id = 0
        with open(data_file, 'r') as fin, h5py.File(output_file, 'w') as fout:
            for line in fin:
                if (sentence_id + 1) % 100 == 0:
                    print("%d" % (sentence_id + 1), flush=True, end=" ")

                sentence = line.rstrip().split()
                char_ids = batcher.batch_sentences([sentence])

                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})

                # 1D: 3(ELMo layers), 2D: n_words, 3D: vector dim
                embeddings = embeddings[0, :, :, :]
                if sent_vec:
                    embeddings = np.mean(embeddings, axis=1)
                    if sent_vec_type == 'last':
                        embeddings = embeddings[-1]
                    else:
                        embeddings = np.mean(embeddings, axis=0)
                else:
                    # 1D: n_words, 2D: 3(ELMo layers), 3D: vector dim
                    embeddings = np.transpose(embeddings, (1, 0, 2))

                fout.create_dataset(name=str(sentence_id), data=embeddings)
                sentence_id += 1
        print('Finished')
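A hypothetical invocation (file paths are placeholders; word_file and char_file are the project's word and character dictionaries). With sent_vec=True and sent_vec_type='last', each stored dataset is the top ELMo layer averaged over words, i.e. a single sentence vector.

import h5py

dump_embeddings_from_dynamic_bilm(
    'options.json', 'lm_weights.hdf5',
    'word_dict.txt', 'char_dict.txt',
    'sentences.txt', 'elmo_sentences.hdf5',
    sent_vec=True, sent_vec_type='last')

with h5py.File('elmo_sentences.hdf5', 'r') as fin:
    sent_vec = fin['0'][...]  # one vector of shape (embedding_dim,) per sentence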
Example #3
File: model.py Project: sjyttkl/ELMO
def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    '''
    Given an input vocabulary file, dump all the token embeddings to the
    outfile.  The result can be used as the embedding_weight_file when
    constructing a BidirectionalLanguageModel.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']

    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])

    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for k in range(n_tokens):
            token = vocab.id_to_word(k)  # look up the actual token string for this id
            char_ids = batcher.batch_sentences(
                [[token]])[0, 1, :].reshape(1, 1, -1)
            embeddings[k, :] = sess.run(embedding_op,
                                        feed_dict={ids_placeholder: char_ids})

    with h5py.File(outfile, 'w') as fout:
        ds = fout.create_dataset('embedding',
                                 embeddings.shape,
                                 dtype='float32',
                                 data=embeddings)
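As the docstring notes, the dumped file can then be passed as embedding_weight_file when constructing a token-input BidirectionalLanguageModel. A rough sketch under that assumption (placeholder paths; use_character_inputs=False selects token-level inputs in bilm-tf):

import tensorflow as tf

dump_token_embeddings(
    'vocab.txt', 'options.json', 'lm_weights.hdf5', 'token_embeddings.hdf5')

# Reuse the dumped embedding table instead of the character CNN.
token_ids = tf.placeholder('int32', shape=(None, None))
token_model = BidirectionalLanguageModel(
    'options.json',
    'lm_weights.hdf5',
    use_character_inputs=False,
    embedding_weight_file='token_embeddings.hdf5',
)
token_ops = token_model(token_ids)  # feed word ids, e.g. from a TokenBatcher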
Example #4
    def test_batch_sentences(self):
        batcher = Batcher(os.path.join(DATA_FIXTURES, 'vocab_test.txt'), 50)
        sentences = [['The', 'first', 'sentence'], ['Second', '.']]
        x_char_ids = batcher.batch_sentences(sentences)

        self.assertTrue((x_char_ids == self._expected_char_ids).all())
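For reference, a small sketch of what batch_sentences returns (the vocabulary path is a placeholder): the Batcher adds begin- and end-of-sentence markers, so the output is a zero-padded character-id array of shape (n_sentences, max_sentence_length + 2, max_word_length).

batcher = Batcher('vocab_test.txt', 50)
x_char_ids = batcher.batch_sentences([['The', 'first', 'sentence'],
                                      ['Second', '.']])
print(x_char_ids.shape)  # expected (2, 5, 50): 3 tokens plus the two markers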
Example #5
    def _check_weighted_layer(self, l2_coef, do_layer_norm, use_top_only):
        # create the Batcher
        vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
        batcher = Batcher(vocab_file, 50)

        # load the model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        character_ids = tf.placeholder('int32', (None, None, 50))
        model = BidirectionalLanguageModel(
            options_file, weight_file, max_batch_size=4)
        bilm_ops = model(character_ids)

        weighted_ops = []
        for k in range(2):
            ops = weight_layers(str(k), bilm_ops, l2_coef=l2_coef,
                                do_layer_norm=do_layer_norm,
                                use_top_only=use_top_only)
            weighted_ops.append(ops)

        # initialize
        self.sess.run(tf.global_variables_initializer())

        n_expected_trainable_weights = 2 * (1 + int(not use_top_only))
        self.assertEqual(len(tf.trainable_variables()),
                         n_expected_trainable_weights)
        # and one regularizer per weighted layer
        n_expected_reg_losses = 2 * int(not use_top_only)
        self.assertEqual(
            len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            n_expected_reg_losses,
        )

        # Set the variables.
        weights = [[np.array([0.1, 0.3, 0.5]), np.array([1.1])],
                   [np.array([0.2, 0.4, 0.6]), np.array([0.88])]]
        for k in range(2):
            with tf.variable_scope('', reuse=True):
                if not use_top_only:
                    W = tf.get_variable('{}_ELMo_W'.format(k))
                    _ = self.sess.run([W.assign(weights[k][0])])
                gamma = tf.get_variable('{}_ELMo_gamma'.format(k))
                _ = self.sess.run([gamma.assign(weights[k][1])])

        # make some data
        sentences = [
            ['The', 'first', 'sentence', '.'],
            ['The', 'second'],
            ['Third']
        ]
        X_chars = batcher.batch_sentences(sentences)

        ops = model(character_ids)
        lm_embeddings, mask, weighted0, weighted1 = self.sess.run(
            [ops['lm_embeddings'], ops['mask'],
             weighted_ops[0]['weighted_op'], weighted_ops[1]['weighted_op']],
            feed_dict={character_ids: X_chars}
        )
        actual_elmo = [weighted0, weighted1]

        # check the mask first
        expected_mask = [[True, True, True, True],
                         [True, True, False, False],
                         [True, False, False, False]]
        self.assertTrue((expected_mask == mask).all())

        # Now compute the actual weighted layers
        for k in range(2):
            normed_weights = np.exp(weights[k][0] + 1.0 / 3) / np.sum(
                                  np.exp(weights[k][0] + 1.0 / 3))
            # masked layer normalization
            expected_elmo = np.zeros((3, 4, lm_embeddings.shape[-1]))
            if not use_top_only:
                for j in range(3):  # number of LM layers
                    if do_layer_norm:
                        mean = np.mean(lm_embeddings[:, j, :, :][mask])
                        std = np.std(lm_embeddings[:, j, :, :][mask])
                        normed_lm_embed = (lm_embeddings[:, j, :, :] - mean) / (
                            std + 1E-12)
                        expected_elmo += normed_weights[j] * normed_lm_embed
                    else:
                        expected_elmo += normed_weights[j] * lm_embeddings[
                                                                    :, j, :, :]
            else:
                expected_elmo += lm_embeddings[:, -1, :, :]

            # the scale parameter
            expected_elmo *= weights[k][1]
            self.assertTrue(
                np.allclose(expected_elmo, actual_elmo[k], atol=1e-6)
            )
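In a downstream model, weight_layers is typically called once per place the biLM output is consumed; a minimal sketch follows (placeholder paths; 'weighted_op' appears in the test above, while the 'regularization_op' key is an assumption about bilm-tf's return dictionary).

import tensorflow as tf

character_ids = tf.placeholder('int32', (None, None, 50))
model = BidirectionalLanguageModel('options.json', 'lm_weights.hdf5')
bilm_ops = model(character_ids)

# 'input' is an arbitrary scope name; l2_coef regularizes the layer-mixing weights.
elmo_input = weight_layers('input', bilm_ops, l2_coef=0.001)
elmo_vectors = elmo_input['weighted_op']    # (batch, time, lm_dim)
reg_loss = elmo_input['regularization_op']  # add this to the task loss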
Example #6
    def test_bilm(self):
        sentences, expected_lm_embeddings = _load_sentences_embeddings()

        # create the Batcher
        vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
        batcher = Batcher(vocab_file, 50)
        # load the model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        character_ids = tf.placeholder('int32', (None, None, 50))
        model = BidirectionalLanguageModel(options_file,
                                           weight_file,
                                           max_batch_size=4)

        # get the ops to compute embeddings
        ops = model(character_ids)

        # initialize
        self.sess.run(tf.global_variables_initializer())

        # We shouldn't have any trainable variables
        self.assertEqual(len(tf.trainable_variables()), 0)

        # will run 10 batches of 3 sentences
        for i in range(10):
            # make a batch of sentences
            batch_sentences = []
            for k in range(3):
                sentence = sentences[k][i].strip().split()
                batch_sentences.append(sentence)

            X = batcher.batch_sentences(batch_sentences)
            lm_embeddings, lengths = self.sess.run(
                [ops['lm_embeddings'], ops['lengths']],
                feed_dict={character_ids: X})
            # true (unpadded) lengths of the input sentences
            actual_lengths = [len(sent) for sent in batch_sentences]

            self.assertEqual(actual_lengths, list(lengths))

            # get the expected embeddings and compare!
            expected_y = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                    np.allclose(lm_embeddings[k, 2, :lengths[k], :],
                                expected_y[k],
                                atol=1.0e-6))

        # Finally, check that the states are being updated properly.
        # All batches were size=3, so last element of states should always
        # be zero.
        third_states = []
        for direction in ['forward', 'backward']:
            states = self.sess.run(
                model._graphs[character_ids].lstm_init_states[direction])
            for i in range(2):
                for state in states[i]:
                    self.assertTrue(np.sum(np.abs(state[-1, :])) < 1e-7)
                    third_states.append(state[2, :])

        # Run a batch with size=2, the third state should not have been updated
        _ = self.sess.run(
            ops['lm_embeddings'],
            feed_dict={character_ids: np.ones((2, 5, 50), dtype=np.int32)})
        k = 0
        for direction in ['forward', 'backward']:
            states = self.sess.run(
                model._graphs[character_ids].lstm_init_states[direction])
            for i in range(2):
                for state in states[i]:
                    self.assertTrue(
                        np.allclose(third_states[k], state[2, :], atol=1e-6))
                    k += 1