Example #1
File: model.py  Project: sjyttkl/ELMO
def dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         outfile):
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})
                # one HDF5 dataset per sentence, keyed by its line index;
                # drop the batch axis: shape (n_lm_layers, n_tokens, dim)
                fout.create_dataset(str(sentence_id),
                                    embeddings.shape[1:],
                                    dtype='float32',
                                    data=embeddings[0, :, :, :])

                sentence_id += 1
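The dump can be read back directly with h5py; a minimal sketch, assuming the shape convention noted above (the file name and index are hypothetical):

import h5py

with h5py.File('elmo_embeddings.hdf5', 'r') as fin:
    sent0 = fin['0'][...]   # first sentence: shape (n_lm_layers, n_tokens, dim)
    print(sent0.shape)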
Example #2
def load_batcher(data_params, cuda):
    languages, Lang_name = [], []
    # Load the data into languages
    data_dir = data_params['data_dir']
    for w in data_params['languages']:
        lang = Language(
            name=w['name'],
            cuda=cuda,
            mode=data_params['mode'],
            mean_center=data_params['mean_center'],
            unit_norm=data_params['unit_norm']
        )
        Lang_name.append(w['name'])
        lang.load(w['filename'], data_dir, max_freq=data_params['max_freq'])
        languages.append(lang)
    batcher = Batcher(languages)
    if 'supervised' in data_params:
        filename = data_params['supervised']['fname']
        random = data_params['supervised']['random']
        max_count = data_params['supervised']['max_count']
        if data_params["data_dir"] == "./muse_data/":
            sup_dir_name = os.path.join(data_dir, "crosslingual", "dictionaries")
        elif data_params["data_dir"] == "./vecmap_data/":
            sup_dir_name = os.path.join(data_dir, "dictionaries")
        else:
            raise ValueError(
                "Unexpected data_dir %r: cannot locate the supervised dictionaries"
                % data_dir)
        batcher.load_from_supervised(
            filename, Lang_name[0], Lang_name[1],
            sup_dir_name, random=random, max_count=max_count)
    return batcher
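For reference, a sketch of the data_params dictionary that load_batcher reads, reconstructed only from the keys accessed above; all concrete values and file names are hypothetical:

data_params = {
    'data_dir': './muse_data/',
    'mode': 'all',                 # hypothetical value; only the key is read above
    'mean_center': True,
    'unit_norm': True,
    'max_freq': 200000,
    'languages': [
        {'name': 'en', 'filename': 'wiki.en.vec'},   # hypothetical embedding files
        {'name': 'es', 'filename': 'wiki.es.vec'},
    ],
    'supervised': {                # optional block; triggers load_from_supervised
        'fname': 'en-es.0-5000.txt',
        'random': False,
        'max_count': 5000,
    },
}
batcher = load_batcher(data_params, cuda=False)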
Example #3
def dump_embeddings_from_dynamic_bilm(option_file,
                                      weight_file,
                                      word_file,
                                      char_file,
                                      data_file,
                                      output_file,
                                      sent_vec=False,
                                      sent_vec_type='last',
                                      cell_reset=False):
    """
    Get elmo embeddings
    """

    with open(option_file, 'r') as fin:
        options = json.load(fin)

    # add one so that 0 is the mask value
    options['char_cnn']['n_characters'] += 1

    max_word_length = options['char_cnn']['max_characters_per_token']
    batcher = Batcher(word_file, char_file, max_word_length)

    # 1D: batch_size, 2D: time_steps, 3D: max_characters_per_token
    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = DynamicLanguageModel(options, weight_file, cell_reset=cell_reset)
    ops = model(ids_placeholder)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        print('Computing ELMo...')
        sentence_id = 0
        with open(data_file, 'r') as fin, h5py.File(output_file, 'w') as fout:
            for line in fin:
                if (sentence_id + 1) % 100 == 0:
                    print("%d" % (sentence_id + 1), flush=True, end=" ")

                sentence = line.rstrip().split()
                char_ids = batcher.batch_sentences([sentence])

                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})

                # 1D: 3(ELMo layers), 2D: n_words, 3D: vector dim
                embeddings = embeddings[0, :, :, :]
                if sent_vec:
                    embeddings = np.mean(embeddings, axis=1)
                    if sent_vec_type == 'last':
                        embeddings = embeddings[-1]
                    else:
                        embeddings = np.mean(embeddings, axis=0)
                else:
                    # 1D: n_words, 2D: 3(ELMo layers), 3D: vector dim
                    embeddings = np.transpose(embeddings, (1, 0, 2))

                fout.create_dataset(name=str(sentence_id), data=embeddings)
                sentence_id += 1
        print('Finished')
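The layout of output_file follows the shape comments above: with sent_vec=False each dataset is (n_words, 3 ELMo layers, dim); with sent_vec=True it collapses to a single sentence vector of shape (dim,). A brief read-back sketch (file name hypothetical):

import h5py

with h5py.File('elmo_out.hdf5', 'r') as fin:
    emb = fin['0'][...]
    print(emb.shape)   # (n_words, 3, dim) or (dim,), depending on sent_vec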
Example #4
def main(unused_argv):
  if len(unused_argv) != 1: # raise if any extra positional arguments were passed (i.e. flags entered incorrectly)
    raise Exception("Problem with flags: %s" % unused_argv)

  tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want
  tf.logging.info('Starting seq2seq_attention in %s mode...', (PARAMS.mode))

  # Change log_root to PARAMS.log_root/PARAMS.exp_name and create the dir if necessary
  PARAMS.log_root = os.path.join(PARAMS.log_root, PARAMS.exp_name)
  if not os.path.exists(PARAMS.log_root):
    if PARAMS.mode == "train":
      os.makedirs(PARAMS.log_root)
    else:
      raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (PARAMS.log_root))

  vocab = Vocab(PARAMS.vocab_path, PARAMS.vocab_size) # create a vocabulary

  # If in decode mode, set batch_size = beam_size
  # Reason: in decode mode, we decode one example at a time.
  # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
  if PARAMS.mode == 'decode':
    PARAMS.batch_size = PARAMS.beam_size

  # If single_pass=True, check we're in decode mode
  if PARAMS.single_pass and PARAMS.mode != 'decode':
    raise Exception("The single_pass flag should only be True in decode mode")

  # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
  hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen']
  hps_dict = {}
  for key,val in PARAMS.__flags.items(): # for each flag
    if key in hparam_list: # if it's in the list
      hps_dict[key] = val # add it to the dict
  hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

  # Create a batcher object that will create minibatches of data
  batcher = Batcher(PARAMS.data_path, vocab, hps, single_pass=PARAMS.single_pass)

  tf.set_random_seed(111) # a seed value for randomness

  if hps.mode == 'train':
    print("creating model...")
    model = AttHistCopyModel(hps, vocab)
    setup_training(model, batcher)
  elif hps.mode == 'eval':
    model = AttHistCopyModel(hps, vocab)
    run_eval(model, batcher, vocab)
  elif hps.mode == 'decode':
    # Hyperparameters for the decoder model. The model is configured with max_dec_steps=1
    # because we only ever run one step of the decoder at a time (to do beam search). Note
    # that the batcher is initialized with max_dec_steps equal to e.g. 100 because the
    # batches need to contain the full summaries.
    decode_model_hps = hps._replace(max_dec_steps=1)
    model = AttHistCopyModel(decode_model_hps, vocab)
    decoder = BeamSearchDecoder(model, batcher, vocab)
    decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
  else:
    raise ValueError("The 'mode' flag must be one of train/eval/decode")
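The decode branch relies on namedtuple._replace returning a modified copy while leaving the original hps untouched. A minimal, self-contained illustration of that pattern with hypothetical values:

from collections import namedtuple

hps_dict = {'mode': 'decode', 'batch_size': 4, 'max_dec_steps': 100}
hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

decode_model_hps = hps._replace(max_dec_steps=1)  # a new namedtuple; hps is unchanged
print(hps.max_dec_steps, decode_model_hps.max_dec_steps)  # 100 1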
Example #5
    def __init__(self, model_file_path):
        model_name = re.findall(r'train_\d+', model_file_path)[0] + '_' + \
                     re.findall(r'model_\d+_\d+\.\d+', model_file_path)[0]
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        self.model = Model(model_file_path, is_eval=True)
Example #6
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))

        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)

        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #7
File: model.py  Project: sjyttkl/ELMO
def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    '''
    Given an input vocabulary file, dump all the token embeddings to the
    outfile.  The result can be used as the embedding_weight_file when
    constructing a BidirectionalLanguageModel.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']

    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])

    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for k in range(n_tokens):
            token = vocab.id_to_word(k)  # TODO: look up the actual word for this id
            # index 1 skips the begin-of-sentence marker that batch_sentences prepends
            char_ids = batcher.batch_sentences([[token]
                                                ])[0, 1, :].reshape(1, 1, -1)
            embeddings[k, :] = sess.run(embedding_op,
                                        feed_dict={ids_placeholder: char_ids})

    with h5py.File(outfile, 'w') as fout:
        fout.create_dataset('embedding',
                            embeddings.shape,
                            dtype='float32',
                            data=embeddings)
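Per the docstring, the resulting file can back a token-input model. A hedged sketch, assuming the upstream bilm-tf keyword arguments use_character_inputs and embedding_weight_file (verify against the local model.py in this project, where the import path may differ):

from bilm import BidirectionalLanguageModel  # upstream package; assumed import path

token_model = BidirectionalLanguageModel(
    options_file,
    weight_file,
    use_character_inputs=False,
    embedding_weight_file=outfile,  # the HDF5 written above ('embedding' dataset)
)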
Example #8
    def test_batch_sentences(self):
        batcher = Batcher(os.path.join(DATA_FIXTURES, 'vocab_test.txt'), 50)
        sentences = [['The', 'first', 'sentence'], ['Second', '.']]
        x_char_ids = batcher.batch_sentences(sentences)

        self.assertTrue((x_char_ids == self._expected_char_ids).all())
Example #9
    print('Success rate: %d / %d' % (success_rate, len(train_sentences)))


if __name__ == "__main__":

    with open('config.json', 'r') as f:
        config = json.load(f)
    data_path = '/dev/shm/coco/'
    #data_path = 'coco/'
    train_dir = 'summaries/Caption_training' + datetime.datetime.strftime(
        datetime.datetime.today(), '%d%m%Y%H%M%S')

    vocab = Vocab('vocab')
    model = CaptioningNetwork(config, vocab)

    batcher = Batcher(data_path, config, vocab)

    tf.set_random_seed(111)

    # Setup training
    tf.logging.info('Building graph...')
    model.build_graph()

    # print(tf.GraphKeys.GLOBAL_VARIABLES)
    # print(tf.GraphKeys.TRAINABLE_VARIABLES)

    # Feed forward test
    # with sess:
    #     sess.run(...)
    #     output_shape = ...
    #     print('Feed forward OK! Output shape: %s' % str(output_shape))
Example #10
    def _check_weighted_layer(self, l2_coef, do_layer_norm, use_top_only):
        # create the Batcher
        vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
        batcher = Batcher(vocab_file, 50)

        # load the model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        character_ids = tf.placeholder('int32', (None, None, 50))
        model = BidirectionalLanguageModel(
            options_file, weight_file, max_batch_size=4)
        bilm_ops = model(character_ids)

        weighted_ops = []
        for k in range(2):
            ops = weight_layers(str(k), bilm_ops, l2_coef=l2_coef,
                                do_layer_norm=do_layer_norm,
                                use_top_only=use_top_only)
            weighted_ops.append(ops)

        # initialize
        self.sess.run(tf.global_variables_initializer())

        n_expected_trainable_weights = 2 * (1 + int(not use_top_only))
        self.assertEqual(len(tf.trainable_variables()),
                         n_expected_trainable_weights)
        # and one regularizer per weighted layer
        n_expected_reg_losses = 2 * int(not use_top_only)
        self.assertEqual(
            len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            n_expected_reg_losses,
        )

        # Set the variables.
        weights = [[np.array([0.1, 0.3, 0.5]), np.array([1.1])],
                   [np.array([0.2, 0.4, 0.6]), np.array([0.88])]]
        for k in range(2):
            with tf.variable_scope('', reuse=True):
                if not use_top_only:
                    W = tf.get_variable('{}_ELMo_W'.format(k))
                    _ = self.sess.run([W.assign(weights[k][0])])
                gamma = tf.get_variable('{}_ELMo_gamma'.format(k))
                _ = self.sess.run([gamma.assign(weights[k][1])])

        # make some data
        sentences = [
            ['The', 'first', 'sentence', '.'],
            ['The', 'second'],
            ['Third']
        ]
        X_chars = batcher.batch_sentences(sentences)

        ops = model(character_ids)
        lm_embeddings, mask, weighted0, weighted1 = self.sess.run(
            [ops['lm_embeddings'], ops['mask'],
             weighted_ops[0]['weighted_op'], weighted_ops[1]['weighted_op']],
            feed_dict={character_ids: X_chars}
        )
        actual_elmo = [weighted0, weighted1]

        # check the mask first
        expected_mask = [[True, True, True, True],
                         [True, True, False, False],
                         [True, False, False, False]]
        self.assertTrue((expected_mask == mask).all())

        # Now compute the expected weighted layers and compare
        for k in range(2):
            normed_weights = np.exp(weights[k][0] + 1.0 / 3) / np.sum(
                                  np.exp(weights[k][0] + 1.0 / 3))
            # masked layer normalization
            expected_elmo = np.zeros((3, 4, lm_embeddings.shape[-1]))
            if not use_top_only:
                for j in range(3):  # number of LM layers
                    if do_layer_norm:
                        mean = np.mean(lm_embeddings[:, j, :, :][mask])
                        std = np.std(lm_embeddings[:, j, :, :][mask])
                        normed_lm_embed = (lm_embeddings[:, j, :, :] - mean) / (
                            std + 1E-12)
                        expected_elmo += normed_weights[j] * normed_lm_embed
                    else:
                        expected_elmo += normed_weights[j] * lm_embeddings[
                                                                    :, j, :, :]
            else:
                expected_elmo += lm_embeddings[:, -1, :, :]

            # the scale parameter
            expected_elmo *= weights[k][1]
            self.assertTrue(
                np.allclose(expected_elmo, actual_elmo[k], atol=1e-6)
            )
Example #11
    def test_bilm(self):
        sentences, expected_lm_embeddings = _load_sentences_embeddings()

        # create the Batcher
        vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
        batcher = Batcher(vocab_file, 50)
        # load the model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        character_ids = tf.placeholder('int32', (None, None, 50))
        model = BidirectionalLanguageModel(options_file,
                                           weight_file,
                                           max_batch_size=4)

        # get the ops to compute embeddings
        ops = model(character_ids)

        # initialize
        self.sess.run(tf.global_variables_initializer())

        # We shouldn't have any trainable variables
        self.assertEqual(len(tf.trainable_variables()), 0)

        # will run 10 batches of 3 sentences
        for i in range(10):
            # make a batch of sentences
            batch_sentences = []
            for k in range(3):
                sentence = sentences[k][i].strip().split()
                batch_sentences.append(sentence)

            X = batcher.batch_sentences(batch_sentences)
            lm_embeddings, lengths = self.sess.run(
                [ops['lm_embeddings'], ops['lengths']],
                feed_dict={character_ids: X})
            # TODO: the true (unpadded) lengths of the sentences
            actual_lengths = [len(sent) for sent in batch_sentences]

            self.assertEqual(actual_lengths, list(lengths))

            # get the expected embeddings and compare!
            expected_y = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                    np.allclose(lm_embeddings[k, 2, :lengths[k], :],
                                expected_y[k],
                                atol=1.0e-6))

        # Finally, check that the states are being updated properly.
        # All batches were size=3, so last element of states should always
        # be zero.
        third_states = []
        for direction in ['forward', 'backward']:
            states = self.sess.run(
                model._graphs[character_ids].lstm_init_states[direction])
            for i in range(2):
                for state in states[i]:
                    self.assertTrue(np.sum(np.abs(state[-1, :])) < 1e-7)
                    third_states.append(state[2, :])

        # Run a batch with size=2, the third state should not have been updated
        _ = self.sess.run(
            ops['lm_embeddings'],
            feed_dict={character_ids: np.ones((2, 5, 50), dtype=np.int32)})
        k = 0
        for direction in ['forward', 'backward']:
            states = self.sess.run(
                model._graphs[character_ids].lstm_init_states[direction])
            for i in range(2):
                for state in states[i]:
                    self.assertTrue(
                        np.allclose(third_states[k], state[2, :], atol=1e-6))
                    k += 1