Example No. 1
def download(self):
    utils.create_dirs(self.dirs)
    dataset = fetch_dataset(*osp.split(self.root))
    utils.create_masks(data=dataset.data)
    data = dataset.data
    edge_attr = torch.ones(data.edge_index.shape[1]) if data.edge_attr is None else data.edge_attr
    data.edge_attr = edge_attr
    torch.save((dataset.data, dataset.slices), self.processed_paths[1])
Example No. 2
def evaluate(inp_sentence):
    # convert the input text to token ids
    input_id_sentence = [pt_tokenizer.vocab_size] \
                        + pt_tokenizer.encode(inp_sentence) + [pt_tokenizer.vocab_size + 1]
    # encoder_input.shape: (1, input_sentence_length)
    # add a batch dimension
    encoder_input = tf.expand_dims(input_id_sentence, 0)

    # decoder_input.shape: (1, 1)
    decoder_input = tf.expand_dims([en_tokenizer.vocab_size], 0)

    for i in range(max_length):
        encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask \
            = create_masks(encoder_input, decoder_input)
        # predictions.shape: (batch_size, output_target_len, target_vocab_size)
        predictions, attention_weights = transformer(
            encoder_input, decoder_input, False, encoder_padding_mask,
            decoder_mask, encoder_decoder_padding_mask)
        # predictions.shape: (batch_size, target_vocab_size) - keep only the last position, so the seq_len dimension disappears
        predictions = predictions[:, -1, :]

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if tf.equal(predicted_id, en_tokenizer.vocab_size + 1):
            return tf.squeeze(decoder_input, axis=0), attention_weights

        decoder_input = tf.concat(
            [decoder_input, [predicted_id]],  # note the extra brackets around predicted_id
            axis=-1)
    return tf.squeeze(decoder_input, axis=0), attention_weights  # squeeze away the batch dimension
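Most of the TensorFlow examples on this page assume a create_masks helper in the spirit of the official TensorFlow Transformer tutorial: a padding mask for the encoder, a combined look-ahead/padding mask for the decoder's self-attention, and a padding mask for the encoder-decoder attention. A minimal sketch, assuming padding id 0 (individual examples may differ in detail):

import tensorflow as tf

def create_padding_mask(seq):
    # 1.0 wherever the token id is padding (0); broadcastable over the attention logits
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    # upper-triangular matrix that hides future positions
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  # (size, size)

def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask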
Example No. 3
    def train_step(inp, tar):
        """
        The target (tar) is divided into tar_inp and tar_real
        tar_inp is passed to the decoder as input. tar_real is the same input shifted by 1:
        at each position in tar_inp, tar_real contains the next token that should be predicted.
        """

        # sentence = "SOS A lion in the jungle is sleeping EOS"
        # tar_inp = "SOS A lion in the jungle is sleeping"
        # tar_real = "A lion in the jungle is sleeping EOS"
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

        with tf.GradientTape() as tape:
            predictions, _ = transformer(inp, tar_inp,
                                         True,
                                         enc_padding_mask,
                                         combined_mask,
                                         dec_padding_mask)
            loss = loss_function(tar_real, predictions)
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

        train_loss(loss)
        train_accuracy(tar_real, predictions)
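A minimal driver loop for a train_step like this might look as follows; this is a sketch, and EPOCHS, train_dataset, and the tf.keras metric objects (train_loss, train_accuracy) are assumptions not shown above:

EPOCHS = 20  # hypothetical

for epoch in range(EPOCHS):
    train_loss.reset_states()
    train_accuracy.reset_states()
    for batch, (inp, tar) in enumerate(train_dataset):
        train_step(inp, tar)
    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
        epoch + 1, train_loss.result(), train_accuracy.result()))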
Example No. 4
    def evaluate(self, inp_sentence):
        # normalize input sentence
        inp_sentence = self.encode_zh(inp_sentence)
        encoder_input = tf.expand_dims(inp_sentence, 0)

        # as the target is english, the first word to the transformer should be the
        # english start token.
        decoder_input = [self.tokenizer_en.vocab_size]
        output = tf.expand_dims(decoder_input, 0)

        for i in range(self.MAX_SEQ_LENGTH):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                encoder_input, output)

            # predictions.shape == (batch_size, seq_len, vocab_size)
            predictions, attention_weights = self.model(
                encoder_input, output, False, enc_padding_mask, combined_mask,
                dec_padding_mask)

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            # return the result if the predicted_id is equal to the end token
            if tf.equal(predicted_id, self.tokenizer_en.vocab_size + 1):
                return tf.squeeze(output, axis=0), attention_weights

            # concatenate the predicted_id to the output which is given to the decoder
            # as its input.
            output = tf.concat([output, predicted_id], axis=-1)

        return tf.squeeze(output, axis=0), attention_weights
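A hypothetical companion method for this class (a sketch; it assumes self.tokenizer_en is a subword tokenizer whose ids below vocab_size can be decoded with .decode()):

    def translate(self, sentence):
        result, attention_weights = self.evaluate(sentence)
        # drop the start/end ids, which lie outside the tokenizer's normal id range
        predicted_ids = [i for i in result.numpy()
                         if i < self.tokenizer_en.vocab_size]
        return self.tokenizer_en.decode(predicted_ids)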
Example No. 5
def predict(features, params, model):

    output = tf.tile([[2]], [params["batch_size"], 1])  # 2 = start_decoding

    for i in range(params["max_dec_len"]):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            features["enc_input"], output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = model(
            features["enc_input"],
            features["extended_enc_input"],
            features["max_oov_len"],
            output,
            training=params["training"],
            enc_padding_mask=enc_padding_mask,
            look_ahead_mask=combined_mask,
            dec_padding_mask=dec_padding_mask,
            stats=features["stats"])

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # concatenate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return output, attention_weights
Example No. 6
def evaluate(input_document, summary_tokenizer, document_tokenizer):
    input_document = document_tokenizer.texts_to_sequences([input_document])
    input_document = tf.keras.preprocessing.sequence.pad_sequences(
        input_document,
        maxlen=encoder_maxlen,
        padding='post',
        truncating='post')

    encoder_input = tf.expand_dims(input_document[0], 0)

    decoder_input = [summary_tokenizer.word_index["<start>"]]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(decoder_maxlen):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        predictions, attention_weights = transformer(encoder_input, output,
                                                     False, enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if predicted_id == summary_tokenizer.word_index["<end>"]:
            return tf.squeeze(output, axis=0), attention_weights

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights
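As a usage sketch (hypothetical; it assumes summary_tokenizer is a Keras Tokenizer whose index_word mapping converts ids back to words):

def summarize(input_document, summary_tokenizer, document_tokenizer):
    summarized, _ = evaluate(input_document, summary_tokenizer, document_tokenizer)
    summarized = summarized.numpy()[1:]  # drop the "<start>" id
    return ' '.join(summary_tokenizer.index_word[i] for i in summarized
                    if i in summary_tokenizer.index_word)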
Example No. 7
def download_pyg_data(config):
    """
    Downloads a dataset from the PyTorch Geometric library

    :param config: A dict containing info on the dataset to be downloaded
    :return: A tuple containing (root directory, dataset name, data directory)
    """
    leaf_dir = config["kwargs"]["root"].split("/")[-1].strip()
    data_dir = osp.join(config["kwargs"]["root"],
                        "" if config["name"] == leaf_dir else config["name"])
    dst_path = osp.join(data_dir, "raw", "data.pt")
    if not osp.exists(dst_path):
        DatasetClass = config["class"]
        dataset = DatasetClass(**config["kwargs"])
        utils.create_masks(data=dataset.data)
        torch.save((dataset.data, dataset.slices), dst_path)
    return config["kwargs"]["root"], config["name"], data_dir
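A hypothetical config for this helper (a sketch only; the keys follow the function above, while the Planetoid class and paths are merely illustrative):

from torch_geometric.datasets import Planetoid

config = {
    "name": "Cora",
    "class": Planetoid,
    "kwargs": {"root": "data/Cora", "name": "Cora"},
}
root, name, data_dir = download_pyg_data(config)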
Example No. 8
def train_step(inp, tar_inp, tar_real):
    enc_padding_mask, combined_mask, dec_padding_mask = utils.create_masks(
        inp, tar_inp)
    # shape(inp) = (batch_size, pad_size)
    # shape(predictions) = (batch_size, pad_size, tar_vocab_size)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask,
                                     combined_mask, dec_padding_mask)
        loss = metrics.loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)
Example No. 9
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask = create_masks(
        inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, encoder_padding_mask,
                                     decoder_mask,
                                     encoder_decoder_padding_mask)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
Example No. 10
def train_step(features, labels, params, model, optimizer, loss_object, train_loss_metric):

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        features["enc_input"], labels["dec_input"])

    with tf.GradientTape() as tape:
        output, attn_weights = model(features["enc_input"],
                                     features["extended_enc_input"],
                                     features["max_oov_len"],
                                     labels["dec_input"],
                                     training=params["training"],
                                     enc_padding_mask=enc_padding_mask,
                                     look_ahead_mask=combined_mask,
                                     dec_padding_mask=dec_padding_mask)
        loss = loss_function(loss_object, labels["dec_target"], output)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss_metric(loss)
Example No. 11
def encode(self, X, W, training=False):
    X_imputed = self.imputation_layer(X, W)
    enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(
        X_imputed)
    enc_output = self.encoder(X_imputed,
                              training=training,
                              mask=enc_padding_mask)
    # use (self.D + 1) because of the SOS added at the beginning
    hidden = tf.reshape(enc_output, [-1, (self.D + 1) * self.d_model])
    if len(self.n_hidden) > 1:
        for layer in self.ae_encode_layers:
            hidden = layer(hidden)
    mu_tilde = self.mu_layer(hidden)
    log_sigma2_tilde = self.log_sigma2_layer(hidden)
    z = self.z_layer((mu_tilde, log_sigma2_tilde))
    return z, mu_tilde, log_sigma2_tilde
Example No. 12
def evaluate(transformer, inp_sequence, max_length=160):
    """
    Given an input sequence for the encoder, autoregressively predict the target sequence with the decoder

    Parameters:
        inp_sequence (string): input sequence for encoder

    Returns:
        predicted token ids: shape = (seq_len,), together with the attention weights

    """
    start_token = token_encode_dic["^"]
    end_token = token_encode_dic["$"]
    # inp_sequence is the product, shape = (len(inp_sequence)+2,)
    inp_sequence = tokenizer.lookup(tf.strings.bytes_split("^"+inp_sequence+"$"))
    encoder_input = tf.expand_dims(inp_sequence, 0) # (1, len(inp_sentence)+2)

    # the target is the reactant; the first char is "^"
    decoder_input = [start_token]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        # take only the last position along the seq_len dimension, since the sequence is shifted
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # if predicted_id equals the end token, return the result
        if predicted_id == end_token:
            return tf.squeeze(output, axis=0), attention_weights

        # append predicted_id to output; eventually this builds up the whole target sequence
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights
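A hypothetical decoding step for this example (a sketch; token_decode_dic is assumed to be the inverse of token_encode_dic, and the product string is purely illustrative):

token_decode_dic = {v: k for k, v in token_encode_dic.items()}

ids, _ = evaluate(transformer, "CC(=O)Oc1ccccc1C(=O)O")  # hypothetical product SMILES
reactants = ''.join(token_decode_dic[int(i)] for i in ids.numpy()[1:])  # skip the leading "^"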
Example No. 13
def decode(self, z, X, training=False):
    hidden = z
    for layer in self.ae_decode_layers:
        hidden = layer(hidden)
    hidden = self.connect_layer(hidden)
    # use (self.D + 1) because of the SOS added at the beginning
    hidden = tf.reshape(hidden, [-1, (self.D + 1), self.d_model])
    enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(
        hidden)
    dec_output, attention_weights = self.decoder(
        X[:, :-1, :],
        hidden,
        training=training,
        look_ahead_mask=look_ahead_mask,
        padding_mask=dec_padding_mask)
    x_raw = self.final_layer(dec_output)
    x = self.output_activation(x_raw, name="x_output")
    return x, x_raw, attention_weights
Example No. 14
def train_step(inp, tar):
    tar = tar.get('output_ids')
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp['input_ids'], tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask,
                                     combined_mask, dec_padding_mask)

        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    opt.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)
Example No. 15
    def _translate_sentence(self, src_seq, max_seq_len):
        # Only accept batch size equals to 1 in this function.
        # TODO: expand to batch operation.
        #assert src_seq.size(0) == 1

        src_pad_idx, trg_eos_idx = self.src_pad_idx, self.trg_eos_idx
        #max_seq_len, beam_size, alpha = self.max_seq_len, self.beam_size, self.alpha
        beam_size, alpha = self.beam_size, self.alpha

        with torch.no_grad():
            #src_mask = get_pad_mask(src_seq, src_pad_idx)
            src_mask, _ = create_masks(src_seq, None, src_seq.device,
                                       src_pad_idx)
            enc_output, gen_seq, scores = self._get_init_state(
                src_seq, src_mask)

            ans_idx = 0  # default
            for step in range(2, max_seq_len):  # decode up to max length
                dec_output = self._model_decode(gen_seq[:, :step], enc_output,
                                                src_mask)
                gen_seq, scores = self._get_the_best_score_and_idx(
                    gen_seq, dec_output, scores, step)

                # Check whether all paths have finished
                # -- locate the eos in the generated sequences
                eos_locs = gen_seq == trg_eos_idx
                # -- replace the eos with its position for the length penalty use
                seq_lens, _ = self.len_map.masked_fill(~eos_locs,
                                                       max_seq_len).min(1)
                # -- check if all beams contain eos
                if (eos_locs.sum(1) > 0).sum(0).item() == beam_size:
                    # TODO: Try different terminate conditions.
                    _, ans_idx = scores.div(
                        seq_lens.to(enc_output.device)**alpha).max(0)
                    ans_idx = ans_idx.item()
                    break
        #return gen_seq[ans_idx][:seq_lens[ans_idx]].tolist()
        return gen_seq[ans_idx][:seq_lens[ans_idx]]
Example No. 16
def main(args):

    #create a writer
    writer = SummaryWriter('loss_plot_' + args.mode, comment='test')
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    val_length = len(os.listdir(args.image_dir_val))

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    data_loader_val = get_loader(args.image_dir_val,
                                 args.caption_path_val,
                                 vocab,
                                 transform,
                                 args.batch_size,
                                 shuffle=True,
                                 num_workers=args.num_workers)

    # Build the model
    # if no-attention model is chosen:
    if args.model_type == 'no_attention':
        encoder = Encoder(args.embed_size).to(device)
        decoder = Decoder(args.embed_size, args.hidden_size, len(vocab),
                          args.num_layers).to(device)
        criterion = nn.CrossEntropyLoss()

    # if attention model is chosen:
    elif args.model_type == 'attention':
        encoder = EncoderAtt(encoded_image_size=9).to(device)
        decoder = DecoderAtt(vocab, args.encoder_dim, args.hidden_size,
                             args.attention_dim, args.embed_size,
                             args.dropout_ratio, args.alpha_c).to(device)

    # if transformer model is chosen:
    elif args.model_type == 'transformer':
        model = Transformer(len(vocab), args.embed_size,
                            args.transformer_layers, 8,
                            args.dropout_ratio).to(device)

        encoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, model.encoder.parameters()),
                                             lr=args.learning_rate_enc)
        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, model.decoder.parameters()),
                                             lr=args.learning_rate_dec)
        criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx['<pad>'])

    else:
        print('Select model_type: attention, no_attention or transformer')

    # if the model is not a transformer, an additional step is needed for the encoder: freeze the lower layers of the ResNet if args.fine_tune == True
    if args.model_type != 'transformer':
        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, decoder.parameters()),
                                             lr=args.learning_rate_dec)
        encoder.fine_tune(args.fine_tune)
        encoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, encoder.parameters()),
                                             lr=args.learning_rate_enc)

    # initialize lists to store results:
    loss_train = []
    loss_val = []
    loss_val_epoch = []
    loss_train_epoch = []

    bleu_res_list = []
    cider_res_list = []
    rouge_res_list = []

    results = {}

    # calculate total steps for train and validation
    total_step = len(data_loader)
    total_step_val = len(data_loader_val)

    #For each epoch
    for epoch in tqdm(range(args.num_epochs)):

        loss_val_iter = []
        loss_train_iter = []

        # set model to train mode
        if args.model_type != 'transformer':
            encoder.train()
            decoder.train()
        else:
            model.train()

        # for each entry in data_loader
        for i, (images, captions, lengths) in tqdm(enumerate(data_loader)):
            # load images and captions to device
            images = images.to(device)
            captions = captions.to(device)
            # Forward, backward and optimize

            # the forward and backward pass differs depending on the model type:
            if args.model_type == 'no_attention':
                # get features from encoder
                features = encoder(images)
                # pack the padded target captions into a flat tensor
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                # get output from decoder
                outputs = decoder(features, captions, lengths)
                # calculate loss
                loss = criterion(outputs, targets)

                # optimizer and backward step
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()

            elif args.model_type == 'attention':

                # get features from encoder
                features = encoder(images)

                # get targets - starting from the 2nd word in the captions
                # (the model is not sequential, so targets are predicted in parallel - no need to predict the first word of the captions)

                targets = captions[:, 1:]
                # decode length = length-1 for each caption
                decode_lengths = [length - 1 for length in lengths]
                #flatten targets
                targets = targets.reshape(targets.shape[0] * targets.shape[1])

                sampled_caption = []

                # get scores and alphas from decoder
                scores, alphas = decoder(features, captions, decode_lengths)

                scores = scores.view(-1, scores.shape[-1])

                #predicted = prediction with maximum score
                _, predicted = torch.max(scores, dim=1)

                # calculate loss
                loss = decoder.loss(scores, targets, alphas)

                # optimizer and backward step
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()

            elif args.model_type == 'transformer':

                # input is captions without last word
                trg_input = captions[:, :-1]
                # create mask
                trg_mask = create_masks(trg_input)

                # get scores from model
                scores = model(images, trg_input, trg_mask)
                scores = scores.view(-1, scores.shape[-1])

                # get targets - starting from the 2nd word in the captions
                targets = captions[:, 1:]

                #predicted = prediction with maximum score
                _, predicted = torch.max(scores, dim=1)

                # calculate loss
                loss = criterion(
                    scores,
                    targets.reshape(targets.shape[0] * targets.shape[1]))

                #forward and backward path
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()

            else:
                print('Select model_type: attention, no_attention or transformer')

            # append results to loss lists and writer
            loss_train_iter.append(loss.item())
            loss_train.append(loss.item())
            writer.add_scalar('Loss/train/iterations', loss.item(), i + 1)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'.
              format(epoch, args.num_epochs, i, total_step, loss.item(),
                     np.exp(loss.item())))

        #append mean of last 10 batches as approximate epoch loss
        loss_train_epoch.append(np.mean(loss_train_iter[-10:]))

        writer.add_scalar('Loss/train/epoch', np.mean(loss_train_iter[-10:]),
                          epoch + 1)

        #save model
        if args.model_type != 'transformer':
            torch.save(
                decoder.state_dict(),
                os.path.join(
                    args.model_path,
                    'decoder_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
            torch.save(
                encoder.state_dict(),
                os.path.join(
                    args.model_path,
                    'encoder_' + args.mode + '_{}.ckpt'.format(epoch + 1)))

        else:
            torch.save(
                model.state_dict(),
                os.path.join(
                    args.model_path,
                    'model_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
        np.save(
            os.path.join(args.predict_json,
                         'loss_train_temp_' + args.mode + '.npy'), loss_train)

        #validate model:
        # set model to eval mode:
        if args.model_type != 'transformer':
            encoder.eval()
            decoder.eval()
        else:
            model.eval()
        total_step = len(data_loader_val)

        # set no_grad mode:
        with torch.no_grad():
            # for each entry in data_loader
            for i, (images, captions,
                    lengths) in tqdm(enumerate(data_loader_val)):
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                images = images.to(device)
                captions = captions.to(device)

                # the forward and backward pass differs depending on the model type:
                if args.model_type == 'no_attention':
                    features = encoder(images)
                    outputs = decoder(features, captions, lengths)
                    loss = criterion(outputs, targets)

                elif args.model_type == 'attention':

                    features = encoder(images)
                    sampled_caption = []
                    targets = captions[:, 1:]
                    decode_lengths = [length - 1 for length in lengths]
                    targets = targets.reshape(targets.shape[0] *
                                              targets.shape[1])

                    scores, alphas = decoder(features, captions,
                                             decode_lengths)

                    _, predicted = torch.max(scores, dim=1)

                    scores = scores.view(-1, scores.shape[-1])

                    sampled_caption = []

                    loss = decoder.loss(scores, targets, alphas)

                elif args.model_type == 'transformer':

                    trg_input = captions[:, :-1]
                    trg_mask = create_masks(trg_input)
                    scores = model(images, trg_input, trg_mask)
                    scores = scores.view(-1, scores.shape[-1])
                    targets = captions[:, 1:]

                    _, predicted = torch.max(scores, dim=1)

                    loss = criterion(
                        scores,
                        targets.reshape(targets.shape[0] * targets.shape[1]))

                #display results
                if i % args.log_step == 0:
                    print(
                        'Epoch [{}/{}], Step [{}/{}], Validation Loss: {:.4f}, Validation Perplexity: {:5.4f}'
                        .format(epoch, args.num_epochs, i, total_step_val,
                                loss.item(), np.exp(loss.item())))

                # append results to loss lists and writer
                loss_val.append(loss.item())
                loss_val_iter.append(loss.item())

                writer.add_scalar('Loss/validation/iterations', loss.item(),
                                  i + 1)

        np.save(
            os.path.join(args.predict_json, 'loss_val_' + args.mode + '.npy'),
            loss_val)

        print(
            'Epoch [{}/{}], Step [{}/{}], Validation Loss: {:.4f}, Validation Perplexity: {:5.4f}'
            .format(epoch, args.num_epochs, i, total_step_val, loss.item(),
                    np.exp(loss.item())))

        # results: epoch validation loss

        loss_val_epoch.append(np.mean(loss_val_iter))
        writer.add_scalar('Loss/validation/epoch', np.mean(loss_val_iter),
                          epoch + 1)

        #predict captions:
        filenames = os.listdir(args.image_dir_val)

        predicted = {}

        for file in tqdm(filenames):
            if file == '.DS_Store':
                continue
            # Prepare an image
            image = load_image(os.path.join(args.image_dir_val, file),
                               transform)
            image_tensor = image.to(device)

            # Generate caption starting with <start> word

            # procedure is different for each model type
            if args.model_type == 'attention':

                features = encoder(image_tensor)
                sampled_ids, _ = decoder.sample(features)
                sampled_ids = sampled_ids[0].cpu().numpy()
                #start sampled_caption with <start>
                sampled_caption = ['<start>']

            elif args.model_type == 'no_attention':
                features = encoder(image_tensor)
                sampled_ids = decoder.sample(features)
                sampled_ids = sampled_ids[0].cpu().numpy()
                sampled_caption = ['<start>']

            elif args.model_type == 'transformer':

                e_outputs = model.encoder(image_tensor)
                max_seq_length = 20
                sampled_ids = torch.zeros(max_seq_length, dtype=torch.long)
                sampled_ids[0] = torch.LongTensor([[vocab.word2idx['<start>']]
                                                   ]).to(device)

                for i in range(1, max_seq_length):

                    trg_mask = np.triu(np.ones((1, i, i)), k=1).astype('uint8')
                    trg_mask = Variable(
                        torch.from_numpy(trg_mask) == 0).to(device)

                    out = model.decoder(sampled_ids[:i].unsqueeze(0),
                                        e_outputs, trg_mask)

                    out = model.out(out)
                    out = F.softmax(out, dim=-1)
                    val, ix = out[:, -1].data.topk(1)
                    sampled_ids[i] = ix[0][0]

                sampled_ids = sampled_ids.cpu().numpy()
                sampled_caption = []

            # Convert word_ids to words
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                # break at <end> of the sentence
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            predicted[file] = sentence

        # save predictions to json file:
        json.dump(
            predicted,
            open(
                os.path.join(
                    args.predict_json,
                    'predicted_' + args.mode + '_' + str(epoch) + '.json'),
                'w'))

        #validate model
        with open(args.caption_path_val, 'r') as file:
            captions = json.load(file)

        res = {}
        for r in predicted:
            res[r] = [predicted[r].strip('<start> ').strip(' <end>')]

        images = captions['images']
        caps = captions['annotations']
        gts = {}
        for image in images:
            image_id = image['id']
            file_name = image['file_name']
            list_cap = []
            for cap in caps:
                if cap['image_id'] == image_id:
                    list_cap.append(cap['caption'])
            gts[file_name] = list_cap

        # calculate BLEU, CIDEr and ROUGE metrics from the reference and generated captions
        bleu_res = bleu(gts, res)
        cider_res = cider(gts, res)
        rouge_res = rouge(gts, res)

        # append results to the result lists
        bleu_res_list.append(bleu_res)
        cider_res_list.append(cider_res)
        rouge_res_list.append(rouge_res)

        # write results to writer
        writer.add_scalar('BLEU1/validation/epoch', bleu_res[0], epoch + 1)
        writer.add_scalar('BLEU2/validation/epoch', bleu_res[1], epoch + 1)
        writer.add_scalar('BLEU3/validation/epoch', bleu_res[2], epoch + 1)
        writer.add_scalar('BLEU4/validation/epoch', bleu_res[3], epoch + 1)
        writer.add_scalar('CIDEr/validation/epoch', cider_res, epoch + 1)
        writer.add_scalar('ROUGE/validation/epoch', rouge_res, epoch + 1)

    results['bleu'] = bleu_res_list
    results['cider'] = cider_res_list
    results['rouge'] = rouge_res_list

    json.dump(
        results,
        open(os.path.join(args.predict_json, 'results_' + args.mode + '.json'),
             'w'))
    np.save(
        os.path.join(args.predict_json, 'loss_train_' + args.mode + '.npy'),
        loss_train)
    np.save(os.path.join(args.predict_json, 'loss_val_' + args.mode + '.npy'),
            loss_val)
Example No. 17
def evaluate(sentence,
             tokenizer,
             model,
             max_length=MAX_SEQ_LENGTH,
             pad_on_left=False,
             pad_token=0,
             pad_token_segment_id=0,
             mask_padding_with_zero=True):

    sentence = preprocess_sentence(sentence)

    inputs = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_length,
    )

    input_ids, input_token_type_ids = inputs["input_ids"], inputs[
        "token_type_ids"]
    input_attention_mask = [1 if mask_padding_with_zero else 0
                            ] * len(input_ids)
    input_padding_length = max_length - len(input_ids)

    if pad_on_left:
        input_ids = ([pad_token] * input_padding_length) + input_ids
        input_attention_mask = ([0 if mask_padding_with_zero else 1] *
                                input_padding_length) + input_attention_mask
        input_token_type_ids = ([pad_token_segment_id] *
                                input_padding_length) + input_token_type_ids

    else:
        input_ids = input_ids + ([pad_token] * input_padding_length)
        input_attention_mask = input_attention_mask + (
            [0 if mask_padding_with_zero else 1] * input_padding_length)
        input_token_type_ids = input_token_type_ids + ([pad_token_segment_id] *
                                                       input_padding_length)

    assert len(
        input_ids) == max_length, "Error with input length {} vs {}".format(
            len(input_ids), max_length)
    assert len(input_attention_mask
               ) == max_length, "Error with input length {} vs {}".format(
                   len(input_attention_mask), max_length)
    assert len(input_token_type_ids
               ) == max_length, "Error with input length {} vs {}".format(
                   len(input_token_type_ids), max_length)

    input_ids = tf.expand_dims(input_ids, 0)
    input_attention_mask = tf.expand_dims(input_attention_mask, 0)
    input_token_type_ids = tf.expand_dims(input_token_type_ids, 0)

    bert_input = {
        'input_ids': input_ids,
        'attention_mask': input_attention_mask,
        'token_type_ids': input_token_type_ids
    }

    decoder_input = [START_TOKEN]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            bert_input['input_ids'], output)

        predictions, attention_weights = model(bert_input, output, False,
                                               enc_padding_mask, combined_mask,
                                               dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # concatenate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, STOP_TOKEN):
            return tf.squeeze(output, axis=0), attention_weights

    return tf.squeeze(output, axis=0), attention_weights