예제 #1
0
def train_step(inp, tar):
    """Run one optimization step of the transformer on a single batch.

    Relies on module-level objects: `transformer`, `optimizer`,
    `loss_function`, `create_masks`, and the `train_loss` /
    `train_accuracy` metric accumulators.

    Args:
        inp: source-language token batch fed to the encoder.
        tar: target-language token batch; split internally for
            teacher forcing.
    """
    # Teacher forcing: the decoder input is every token but the last,
    # and the model is trained to predict the sequence shifted left by one.
    decoder_input = tar[:, :-1]
    expected = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, decoder_input)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, decoder_input, True,
                                     enc_padding_mask, combined_mask,
                                     dec_padding_mask)
        loss = loss_function(expected, predictions)

    grads = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(grads, transformer.trainable_variables))

    # Fold this batch into the running epoch metrics.
    train_loss(loss)
    train_accuracy(expected, predictions)
예제 #2
0
def train_model(model,
                optimizer,
                train_itr,
                EN_TEXT,
                FR_TEXT,
                epochs=10000,
                print_every=100):
    """Train an English->French seq2seq model with teacher forcing.

    Args:
        model: transformer taking (src, trg_input, src_mask, trg_mask).
        optimizer: optimizer over `model`'s parameters.
        train_itr: torchtext-style iterator yielding batches with
            `.English` and `.French` tensor attributes (seq_len x batch).
        EN_TEXT, FR_TEXT: source/target Field objects; FR_TEXT supplies
            the pad-token index used to mask the loss.
        epochs: number of passes over `train_itr`.
        print_every: iterations between progress printouts; the printed
            loss is averaged over this window.
    """
    # Enable training mode: dropout / batch norm behave differently
    # between train and eval.
    model.train()
    start = time.time()
    prev = start
    total_loss = 0

    # BUG FIX: the original passed an undefined name `t` as ignore_index.
    # The intended value is the target (French) vocabulary's pad index,
    # so padding positions contribute nothing to the loss.
    target_pad = FR_TEXT.vocab.stoi['<pad>']

    for ep in range(epochs):
        for i, batch in enumerate(train_itr):
            # Batches arrive seq_len-major; transpose to batch-major.
            src = batch.English.transpose(0, 1)
            tar = batch.French.transpose(0, 1)

            # Teacher forcing: the decoder sees all target words except
            # the last, and each position predicts the next word.
            tar_input = tar[:, :-1]
            targets = tar[:, 1:].contiguous().view(-1)

            src_mask, tar_mask = create_masks(batch, EN_TEXT, FR_TEXT)
            preds = model(src, tar_input, src_mask, tar_mask)

            optimizer.zero_grad()
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)),
                                   targets,
                                   ignore_index=target_pad)
            loss.backward()
            optimizer.step()

            # BUG FIX: `loss.data[0]` indexes a 0-dim tensor and was
            # removed in PyTorch 0.5; `.item()` is the supported way to
            # extract a Python scalar.
            total_loss += loss.item()

            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print(
                    "time = %dm, epoch %d, iter = %d, loss = %.3f,% ds per % d iters"
                    % ((time.time() - start) // 60, ep + 1, i + 1, loss_avg,
                       time.time() - prev, print_every))
                total_loss = 0
                prev = time.time()
예제 #3
0
def evaluate(inp_sentence):
    """Greedily decode an English translation of a Portuguese sentence.

    Args:
        inp_sentence: raw Portuguese string.

    Returns:
        A tuple `(output_ids, attention_weights)` where `output_ids` is a
        1-D tensor of predicted English token ids (starting with the
        English start token) and `attention_weights` comes from the final
        decoder call.
    """
    # Start/end tokens live just past the subword vocab range.
    # BUG FIX: the original referenced the misspelled name `tokeinzer_pt`
    # here, which raises NameError (the rest of the function uses
    # `tokenizer_pt`).
    start_token = [tokenizer_pt.vocab_size]
    end_token = [tokenizer_pt.vocab_size + 1]

    # The input sentence is Portuguese, hence adding the start and end token.
    inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # As the target is English, the first word fed to the transformer
    # should be the English start token.
    decoder_input = [tokenizer_en.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    # NOTE(review): `MAC_LENGTH` looks like a typo for `MAX_LENGTH`, but the
    # constant lives in the external `cst` module — confirm before renaming.
    for i in range(cst.MAC_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        # Select the last word from the seq_len dimension.
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # Stop early once the English end token is produced.
        if predicted_id == tokenizer_en.vocab_size + 1:
            return tf.squeeze(output, axis=0), attention_weights

        # Concatenate the predicted id to the output, which is fed back
        # to the decoder as its input on the next step.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights
예제 #4
0
def train_model(transformer, eventer, dataset, test_dataset, epochs, criterion,
                optimizer, SRC, TRG):
    """Train a hierarchical transformer + `eventer` model on multi-sentence inputs.

    Each batch carries several source sentences per example. Every sentence is
    encoded independently by `transformer.encoder`, the per-sentence encodings
    are fused by `eventer`, and the features of the "root" sentence (selected
    by the `root` index) condition the decoder. Checkpoints for both models
    are written to ``models/`` after every epoch.

    Args:
        transformer: model exposing `.encoder`, `.decoder`, and `.out` heads.
        eventer: module that mixes the concatenated token features under
            `mask_tok`.
        dataset: iterable of batches
            (srcs, srcs_len, trgs, trgs_len, mask_tok, root).
        test_dataset: held-out data passed to `eval` / `shuffle_eval`.
        epochs: number of training epochs.
        criterion: loss over flattened vocab-sized predictions
            (presumably with pad ignored — configured by the caller).
        optimizer: optimizer covering both models' parameters.
        SRC: source field/vocab (unused here; kept for interface parity).
        TRG: target field/vocab (unused here; kept for interface parity).
    """

    print("training model...")
    # eval_loss1 = eval(transformer, eventer, test_dataset)
    # eval_loss2 = shuffle_eval(transformer, eventer, test_dataset)
    for epoch in range(epochs):
        transformer.train()
        eventer.train()
        cur_lr = get_lr(optimizer)
        print("Current lr ", cur_lr)
        total_loss = []
        for index, (srcs, srcs_len, trgs, trgs_len, mask_tok,
                    root) in enumerate(tqdm(dataset)):
            # assumes srcs is (batch, sent_num, seq_len) — TODO confirm
            sent_num = srcs.size(1)
            srcs = srcs.cuda()
            mask_tok = mask_tok.cuda()
            root = root.cuda()
            # B * S
            trgs = trgs.cuda()
            # Teacher forcing: decoder input drops the last target token.
            trg_input = trgs[:, :-1]
            trg_input = trg_input.cuda()
            src_masks = [None] * sent_num
            trg_mask = None
            # Build a padding mask per sentence; trg_mask is identical on
            # every iteration (depends only on trg_input), so keeping the
            # last one is fine.
            for i in range(sent_num):
                src_masks[i], trg_mask = create_masks(srcs[:, i], trg_input)
            for i in range(sent_num):
                src_masks[i] = src_masks[i].squeeze().cuda()
            # Stack per-sentence masks: (B, sent_num, seq_len).
            src_masks = torch.stack([m for m in src_masks], dim=1)
            # Flatten to one mask over all tokens of all sentences.
            src_word_masks = src_masks.view(src_masks.size(0), 1, -1)
            # print(src_word_masks.size())
            # Expand to a square token-by-token mask for the eventer.
            src_word_tok_masks = src_word_masks.repeat(1,
                                                       src_word_masks.size(2),
                                                       1)
            # print("word_mask", src_word_tok_masks[0][0])
            # print("mask_tok", mask_tok[0][0])
            # Combine the dataset-provided structural mask with padding.
            mask_tok = mask_tok * src_word_tok_masks.long()
            # print("mask_tok", mask_tok[0][0])
            trg_mask = trg_mask.cuda()

            # Encode each sentence independently.
            events = [None] * sent_num
            for i in range(sent_num):
                events[i] = transformer.encoder(srcs[:, i],
                                                src_masks[:, i].unsqueeze(1))
                # print(events[i].size())
                # print(src_masks[0,i])
                # events[i] = pool(events[i], src_masks[:, i])

            # Concatenate sentence encodings along the token axis and let
            # the eventer mix information across sentences.
            eventers = torch.cat([e for e in events], dim=-2)
            eventers = eventer(eventers, mask_tok)
            # Reshape back to (B, sent_num, seq_len, dim).
            eventers = eventers.view(eventers.size(0), sent_num, -1,
                                     eventers.size(2))

            # Gather the features and mask of each example's root sentence.
            feat_root = root.view(-1, 1, 1, 1).expand(-1, 1, eventers.size(2),
                                                      eventers.size(3))
            root_feat = torch.gather(eventers, 1, feat_root).squeeze(1)
            mask_root = root.view(-1, 1, 1).expand(-1, 1, src_masks.size(2))
            root_masks = torch.gather(src_masks, 1, mask_root)
            # print(root_feat.size(), root_masks.size())
            # pred = transformer.out(transformer.decoder(trg_input, sent_feats, mask_sent.unsqueeze(1), trg_mask)[0])
            # Decode conditioned on the root sentence only.
            pred = transformer.out(
                transformer.decoder(trg_input, root_feat, root_masks,
                                    trg_mask)[0])
            # Gold labels: targets shifted left by one, flattened.
            ys = trgs[:, 1:].contiguous().view(-1).cuda()

            optimizer.zero_grad()
            loss = criterion(pred.view(-1, pred.size(-1)), ys)
            loss.backward()
            # Clip both models to stabilize training.
            torch.nn.utils.clip_grad_norm_(transformer.parameters(), 0.1)
            torch.nn.utils.clip_grad_norm_(eventer.parameters(), 0.1)
            optimizer.step()
            total_loss.append(loss.item())
        print(f"Epoch {epoch} training loss : ",
              sum(total_loss) / len(total_loss))
        # NOTE(review): `eval` here shadows the builtin — presumably a
        # project-level evaluation function; verify it is imported.
        eval_loss1 = eval(transformer, eventer, test_dataset)
        eval_loss2 = shuffle_eval(transformer, eventer, test_dataset)
        print(f"Epoch {epoch} evaluation loss : ", eval_loss1, eval_loss2)
        torch.save(transformer.state_dict(), f'models/transformer{epoch}.pth')
        torch.save(eventer.state_dict(), f'models/eventer{epoch}.pth')
예제 #5
0
def init_vars(src, root, mask_tok, transformer, eventer, SRC, TRG, beam_size,
              max_len):
    """Initialize beam search for the hierarchical transformer decoder.

    Encodes every source sentence, fuses the encodings through `eventer`,
    selects the root sentence's features, runs the decoder once on the
    `<sos>` token, and seeds `beam_size` hypotheses from the top-k first
    predictions.

    Args:
        src: list of tokenized source sentences (list of id lists) for a
            single example.
        root: index of the root sentence within `src`.
        mask_tok: sentence-level structural mask; tiled up to token level.
        transformer: model exposing `.encoder`, `.decoder`, `.out`.
        eventer: module mixing concatenated sentence encodings.
        SRC: source field/vocab (unused here; kept for interface parity).
        TRG: target field/vocab; called with '<sos>' to get the start id.
        beam_size: number of beam hypotheses to keep.
        max_len: maximum output length (width of the outputs buffer).

    Returns:
        Tuple (outputs, e_outputs, log_scores, root_masks): the seeded
        output id buffer (beam_size x max_len), the root features repeated
        per beam, the initial log probabilities, and the root sentence mask.
    """

    # assumes TRG is callable and maps a token string to its id — TODO confirm
    init_tok = TRG('<sos>')
    outputs = torch.LongTensor([[init_tok]])
    outputs = outputs.cuda()

    # Causal mask for a length-1 decoder input.
    trg_mask = nopeak_mask(1)
    trg_mask = trg_mask.cuda()

    ##################################################
    # Pack the variable-length sentences into one padded tensor:
    # (1, sent_num, max_sentence_len).
    src_lens_flat = [len(s) for s in src]
    src_toks = torch.zeros(1, len(src), max(src_lens_flat)).long()
    for i, s in enumerate(src):
        end = src_lens_flat[i]
        src_toks[0, i, :end] = torch.LongTensor(s[:end])
    src = src_toks
    src = src.cuda()
    sent_num = src.size(1)

    # Expand the sentence-level mask to token level on both axes
    # (presumably `tile` repeats each entry along a dim — verify helper).
    mask_tok = torch.LongTensor(mask_tok).unsqueeze(0)
    # print(mask_tok.size())
    mask_tok = tile(mask_tok, 1, max(src_lens_flat))
    mask_tok = tile(mask_tok, 2, max(src_lens_flat))
    mask_tok = mask_tok.cuda()
    # B * S
    # Padding mask per sentence (target side not needed here).
    src_masks = [None] * sent_num
    for i in range(sent_num):
        src_masks[i], _ = create_masks(src[:, i], None)
    for i in range(sent_num):
        src_masks[i] = src_masks[i].squeeze(1).cuda()
    src_masks = torch.stack([m for m in src_masks], dim=1)
    src_word_masks = src_masks.view(src_masks.size(0), 1, -1)
    #  print("src_word_masks", src_word_masks.size())

    # Encode each sentence independently.
    events = [None] * sent_num
    for i in range(sent_num):
        events[i] = transformer.encoder(src[:, i], src_masks[:,
                                                             i].unsqueeze(1))

    # Fuse sentence encodings, then reshape to (B, sent_num, seq_len, dim).
    eventers = torch.cat([e for e in events], dim=-2)
    eventers = eventer(eventers, mask_tok)
    eventers = eventers.view(eventers.size(0), sent_num, -1, eventers.size(2))
    # print(eventers.size())
    # Select the root sentence's features and mask.
    root = torch.LongTensor([root]).cuda()
    feat_root = root.view(-1, 1, 1, 1).expand(-1, 1, eventers.size(2),
                                              eventers.size(3))
    root_feat = torch.gather(eventers, 1, feat_root).squeeze(1)
    mask_root = root.view(-1, 1, 1).expand(-1, 1, src_masks.size(2))
    root_masks = torch.gather(src_masks, 1, mask_root)
    # print(root_feat.size(), root_masks.size())
    # pred = transformer.out(transformer.decoder(trg_input, sent_feats, mask_sent.unsqueeze(1), trg_mask)[0])
    # print(root_masks.data.cpu().numpy())
    # One decoder step on the <sos> token.
    pred = transformer.out(
        transformer.decoder(outputs, root_feat, root_masks, trg_mask)[0])
    #
    out = F.softmax(pred, dim=-1)

    # Seed the beam with the top-k first-step predictions and their
    # log probabilities.
    probs, ix = out[:, -1].data.topk(beam_size)
    log_scores = torch.Tensor([math.log(prob)
                               for prob in probs.data[0]]).unsqueeze(0)

    outputs = torch.zeros(beam_size, max_len).long()
    outputs = outputs.cuda()
    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]

    # Broadcast the root features to every beam hypothesis.
    e_outputs = torch.zeros(beam_size, eventers.size(-2), eventers.size(-1))
    e_outputs = e_outputs.cuda()
    e_outputs[:, :] = root_feat

    return outputs, e_outputs, log_scores, root_masks