Example #1
def eval(args):
    paddle.set_device(args.device)

    if not args.init_from_ckpt:
        raise ValueError('init_from_ckpt should be set when evaluating.')
    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    elmo.eval()

    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
    elmo.set_state_dict(weight_state_dict)
    print("Loaded checkpoint from %s" % args.init_from_ckpt)

    dev_dataset = OneBillionWordDataset(args.dev_data_path,
                                        vocab,
                                        args.batch_size,
                                        args.unroll_steps,
                                        mode='test',
                                        shuffle=False,
                                        seed=args.seed)

    dev_dataloader = DataLoader(dev_dataset, return_list=True, batch_size=None)

    total_step = total_loss = 0
    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(dev_dataloader, start=1):
        ids, next_ids, ids_reverse, next_ids_reverse = inputs
        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        ppl = paddle.exp(loss)

        total_loss += loss.numpy()[0]
        total_step += 1

        total_time += (time.time() - batch_start_time)
        if step % args.log_freq == 0:
            print("Eval step %d - loss: %.4f - Perplexity: %.4f - %.3fs/step" %
                  (step, loss.numpy()[0] * args.unroll_steps, ppl.numpy()[0],
                   total_time / args.log_freq))
            total_time = 0.0
        batch_start_time = time.time()

    avg_loss = total_loss / total_step
    avg_ppl = math.exp(avg_loss)
    print("Eval - average loss: %.4f - average Perplexity: %.4f" %
          (avg_loss * args.unroll_steps, avg_ppl))
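For reference, eval(args) above expects an argparse namespace. A minimal sketch of a matching parser is given below; the flag names are taken from the snippet itself, but the defaults are illustrative assumptions rather than the repository's actual values.

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='ELMo evaluation (sketch)')
    parser.add_argument('--device', type=str, default='gpu')
    parser.add_argument('--init_from_ckpt', type=str, required=True)
    parser.add_argument('--vocab_file', type=str, required=True)
    parser.add_argument('--dev_data_path', type=str, required=True)
    parser.add_argument('--max_characters_per_token', type=int, default=50)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--unroll_steps', type=int, default=20)
    parser.add_argument('--char_embed_dim', type=int, default=16)
    parser.add_argument('--projection_dim', type=int, default=512)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--num_highways', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log_freq', type=int, default=10)
    return parser.parse_args()

# eval(parse_args())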
Example #2
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs=1):

        self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        # Load ELMo if the option is set
        if options.elmo is not None:
            from elmo import ELMo
            self.elmo = ELMo(options.elmo, options.elmo_gamma,
                             options.elmo_learn_gamma)
            self.elmo.init_weights(model)
        else:
            self.elmo = None

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        self.irels = rels
        self.rels = {rel: ind for ind, rel in enumerate(rels)}

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        # This part got ugly - TODO: refactor
        if not options.predict:
            self.external_embedding = defaultdict(lambda: {})

            if options.ext_word_emb_file and options.word_emb_size > 0:
                # Load pre-trained word embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=self.words.viewkeys())
                    self.external_embedding["words"].update(embeddings)

            if options.ext_char_emb_file and options.char_emb_size > 0:
                # Load pre-trained character embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)

            if options.ext_emb_dir:
                # For every language, load the data for the word and character
                # embeddings from a directory.
                for lang in langs:
                    if options.word_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.words.viewkeys())
                        self.external_embedding["words"].update(embeddings)

                    if options.char_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.chars,
                            chars=True)
                        self.external_embedding["chars"].update(embeddings)

            self.init_lookups(options)

        elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
        self.lstm_input_size = (
            options.word_emb_size + elmo_emb_size + options.pos_emb_size +
            options.tbank_emb_size + 2 *
            (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
        )
        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[
            1] if options.word_emb_size > 0 else None
        paddingElmoVec = dy.zeros(self.elmo.emb_dim) if self.elmo else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr(
        ) if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[
            0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *\
            dy.concatenate(filter(None,[paddingWordVec,
                                        paddingElmoVec,
                                        paddingPosVec,
                                        paddingCharVec,
                                        paddingTbankVec])) + self.word2lstmbias.expr())

        self.empty = self.paddingVec if self.nnvecs == 1 else\
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self,
                          sentence,
                          train,
                          options,
                          test_embeddings=defaultdict(lambda: {})):

        if self.elmo:
            # Get full text of sentence - excluding root, which is loaded differently
            # for transition and graph-based parsers.
            if options.graph_based:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[1:]])
            else:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[:-1]])

            elmo_sentence_representation = \
                self.elmo.get_sentence_representation(sentence_text)

        for i, root in enumerate(sentence):
            root.vecs = defaultdict(
                lambda: None
            )  # all vecs are None by default (possibly a little risky?)
            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 +
                                                               word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[
                            root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup
                if not treebank_id in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                    utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]
            if self.elmo:
                if i < len(sentence) - 1:
                    # Don't look up the 'root' word
                    root.vecs["elmo"] = elmo_sentence_representation[i]
                else:
                    # TODO
                    root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["elmo"], root.vecs["pos"],
                    root.vecs["char"], root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):

        if root.char_rep == "*root*":  # no point running a character analysis over this placeholder token
            return self.charPadding.expr(
            )  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(
                        dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):

        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"]
                       [word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"]
                       [char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
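The frequency-based word dropout used in getWordEmbeddings above keeps a training token seen c times with probability c / (0.25 + c). The helper below is a standalone Python sketch of that rule for illustration only; it is not part of the parser code.

import random

def drop_to_oov(norm, word_counts, alpha=0.25):
    # A word seen c times is replaced by the OOV index (0) with probability
    # 1 - c / (alpha + c): rare words are dropped more often, so the OOV
    # vector still receives training signal.
    c = float(word_counts.get(norm, 0))
    return random.random() > c / (alpha + c)

# With alpha = 0.25, a hapax (c = 1) is dropped about 20% of the time,
# while a word seen 100 times is almost never dropped.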
Example #3
def eval():
    paddle.disable_static()
    n_gpus = dist.get_world_size()
    rank = dist.get_rank()

    if n_gpus > 1:
        dist.init_parallel_env()

    args = parse_args()
    if not args.init_from_ckpt:
        raise ValueError('init_from_ckpt should be set when evaluating.')
    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    if n_gpus > 1:
        elmo = paddle.DataParallel(elmo)
    elmo.eval()

    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
    elmo.set_state_dict(weight_state_dict)
    print("Loaded checkpoint from %s" % args.init_from_ckpt)

    dev_dataset = OneBillionWordDataset(args.dev_data_path,
                                        vocab,
                                        args.batch_size,
                                        args.unroll_steps,
                                        n_gpus,
                                        rank,
                                        mode='test',
                                        shuffle=False,
                                        seed=args.random_seed)

    # FIXME(xiemoyuan): once DataLoader supports batch_size=None, pass
    #                   batch_size=None here instead of 1.
    dev_dataloader = DataLoader(dev_dataset, return_list=True, batch_size=1)

    total_step = total_loss = 0
    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(dev_dataloader, start=1):
        # FIXME(xiemoyuan): once DataLoader supports batch_size=None, this
        #                   squeeze can be removed.
        for j in range(len(inputs)):
            inputs[j] = paddle.squeeze(inputs[j], axis=0)

        ids, next_ids, ids_reverse, next_ids_reverse = inputs
        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        ppl = paddle.exp(loss)

        total_loss += loss.numpy()[0]
        total_step += 1

        total_time += (time.time() - batch_start_time)
        if rank == 0:
            if step % args.log_freq == 0:
                print(
                    "Eval step %d - loss: %.4f - Perplexity: %.4f - %.3fs/step"
                    % (step, loss.numpy()[0] * args.unroll_steps,
                       ppl.numpy()[0], total_time / args.log_freq))
                total_time = 0.0
        batch_start_time = time.time()

    avg_loss = total_loss / total_step
    avg_ppl = math.exp(avg_loss)
    if rank == 0:
        print("Eval - average loss: %.4f - average Perplexity: %.4f" %
              (avg_loss * args.unroll_steps, avg_ppl))
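The FIXME around the squeeze call is the key detail here: the dataset already yields full batches of shape [batch_size, unroll_steps, ...], so a DataLoader with batch_size=1 wraps each of them in an extra leading dimension. A self-contained illustration with dummy shapes (128 and 20 are placeholders, not the repository's defaults):

import paddle

batch = paddle.ones([128, 20], dtype='int64')   # what the dataset yields per step
wrapped = paddle.unsqueeze(batch, axis=0)       # what DataLoader(batch_size=1) produces
print(wrapped.shape)                            # [1, 128, 20]
print(paddle.squeeze(wrapped, axis=0).shape)    # [128, 20], the real batch again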
Example #4
import tensorflow as tf
from tensorflow.contrib import seq2seq
from elmo import ELMo
from data import NERData
import os

total_epoch = 5000
hidden_size = 200
vocab_size = 5000
max_length = 128
entity_class = 8

lr = 1e-4
batch_size = 256

ner = NERData(batch_size, max_length)
elmo = ELMo(batch_size, hidden_size, vocab_size)


def network(X):
    w = tf.get_variable("fcn_w", [1, hidden_size, entity_class])
    b = tf.get_variable("fcn_b", [entity_class])
    w_tile = tf.tile(w, [batch_size, 1, 1])

    logists = tf.nn.softmax(tf.nn.xw_plus_b(X, w_tile, b), name="logists")
    return logists


def train():
    X = tf.placeholder(shape=[batch_size, max_length],
                       dtype=tf.int32,
                       name="X")
Example #5
        for _ in range(len(SENT2VEC))
    ]

    for i in range(len(SENT2VEC)):
        s2vsingle[i].load_state(SENT2VEC[i])
        s2vsingle[i].set_w2v_path(PATH_TO_W2V)
        s2vsingle[i] = s2vsingle[i].cuda()

    sent2vec = Sent2Vec(s2vsingle, 'concat')

    params_model = {'bsize': 64, 'pool_type': 'mean',
                    'which_layer': 'all',
                    'optfile': ELMO_OPTIONS,
                    'wgtfile': ELMO_WEIGHT}

    elmo = ELMo(params_model)
    elmo = elmo.cuda()

    gensen_1 = GenSenSingle(
        model_folder=FOLDER_PATH,
        filename_prefix=PREFIX1,
        pretrained_emb=PRETRAIN_EMB,
        cuda=True
    )
    gensen_2 = GenSenSingle(
        model_folder=FOLDER_PATH,
        filename_prefix=PREFIX2,
        pretrained_emb=PRETRAIN_EMB,
        cuda=True
    )
    gensen = GenSen(gensen_1, gensen_2)
Example #6
# The list contains the train, valid and test datasets. Each element of the list is a tuple of (word-token dataset, character-token dataset).
datasets, field_word, field_char = gen_language_model_corpus(WikiText2)
train_data, valid_data, test_data = datasets

VOCAB_DIM = len(field_char.vocab)
OUTPUT_DIM = len(field_word.vocab)

# OTHER HYPER-PARAMETERS
BATCH_SIZE = 32
N_EPOCHS = 100
CLIP = 1
best_valid_loss = float('inf')
# PAD_IDX = field_word.vocab.stoi["<pad>"] # PAD token for word, NOT CHAR

model = ELMo(VOCAB_DIM, OUTPUT_DIM, CHAR_EMB_DIM, HID_DIM, PRJ_DIM, FILTERS,
             CHAR_LEN, N_LAYERS).to(DEVICE)

# Initialize
model.init_weights()

print(f'The model has {count_parameters(model):,} trainable parameters')

import time

# criterion = cal_loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
train_losses = []
test_losses = []

for epoch in range(1, N_EPOCHS + 1):
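    # NOTE (editor): the snippet is truncated at the loop header above. A minimal
    # sketch of a typical epoch body follows; train_epoch() and evaluate() are
    # hypothetical helpers (one pass over a split, returning its average loss)
    # and are not part of the original code; torch itself is assumed to be
    # imported in the full script, as nn and optim already are.
    start_time = time.time()
    train_loss = train_epoch(model, train_data, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_data, criterion)
    train_losses.append(train_loss)
    test_losses.append(valid_loss)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'elmo_best.pt')
    print(f'Epoch {epoch:03d} | train loss {train_loss:.3f} | '
          f'valid loss {valid_loss:.3f} | {time.time() - start_time:.1f}s')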
Example #7
def train(args):
    paddle.set_device(args.device)
    n_procs = dist.get_world_size()
    rank = dist.get_rank()

    if n_procs > 1:
        dist.init_parallel_env()

    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    if n_procs > 1:
        elmo = paddle.DataParallel(elmo)
    elmo.train()

    gloabl_norm_clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.Adagrad(learning_rate=args.lr,
                                         parameters=elmo.parameters(),
                                         initial_accumulator_value=1.0,
                                         grad_clip=gloabl_norm_clip)
    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    if args.init_from_ckpt:
        weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
        opt_state_dict = paddle.load(args.init_from_ckpt + '.pdopt')
        elmo.set_state_dict(weight_state_dict)
        optimizer.set_state_dict(opt_state_dict)
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    train_dataset = OneBillionWordDataset(args.train_data_path,
                                          vocab,
                                          args.batch_size,
                                          args.unroll_steps,
                                          n_procs=n_procs,
                                          rank=rank,
                                          mode='train',
                                          shuffle=True,
                                          seed=args.seed)

    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)

    n_tokens_per_batch = args.batch_size * args.unroll_steps * n_procs
    n_steps_per_epoch = int(train_dataset.number_of_tokens /
                            n_tokens_per_batch)
    n_steps_total = args.epochs * n_steps_per_epoch
    print("Training for %s epochs and %s steps" % (args.epochs, n_steps_total))

    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(train_dataloader, start=1):
        ids, next_ids, ids_reverse, next_ids_reverse = inputs
        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        ppl = paddle.exp(loss)
        loss *= args.unroll_steps
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        total_time += (time.time() - batch_start_time)
        if step % args.log_freq == 0:
            print("step %d/%d - loss: %.4f - Perplexity: %.4f - %.3fs/step" %
                  (step, n_steps_total, loss.numpy()[0], ppl.numpy()[0],
                   total_time / args.log_freq))
            total_time = 0.0
        if rank == 0 and step % args.save_freq == 0:
            save_params(elmo, optimizer, args.save_dir, step)
        if step == n_steps_total:
            # training done
            if rank == 0:
                save_params(elmo, optimizer, args.save_dir, 'final')
            break
        batch_start_time = time.time()
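To run train() on more than one GPU, the surrounding script can be driven by paddle.distributed.spawn; the sketch below is an assumption about how that might look (parse_args() is taken to be the script's own argument parser and nprocs=2 is arbitrary), not part of the original file.

import paddle.distributed as dist

if __name__ == "__main__":
    args = parse_args()                    # assumed helper from the same script
    # Spawn one worker per GPU; inside train(), dist.get_world_size() and
    # dist.get_rank() then reflect the spawned process group.
    dist.spawn(train, args=(args,), nprocs=2)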
Example #8
    'epoch_size': 4
}
# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # Load InferSent model
    params_model = {
        'bsize': 64,
        'pool_type': 'mean',
        'which_layer': 'all',
        'optfile': OPT_PATH,
        'wgtfile': MODEL_PATH
    }

    model = ELMo(params_model)
    params_senteval['elmo'] = model.cuda()

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion'
    ]
    results_transfer = se.eval(transfer_tasks)

    print('--------------------------------------------')
    print('MR                [Dev:%.1f/Test:%.1f]' %
          (results_transfer['MR']['devacc'], results_transfer['MR']['acc']))