Example #1
def main(args):
    train_losses = []
    train_acc = []

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        losses = []
        accuracy = 0.0

        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            # record accuracy and loss
            losses.append(loss.item())
            _, topi = outputs.topk(1, dim=1)
            accuracy += float((topi.squeeze(-1) == targets).sum()) / targets.shape[0]
            # update params
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}, Accuracy: {:.4f}'
                    .format(epoch + 1, args.num_epochs, i, total_step,
                            loss.item(), np.exp(loss.item()),
                            accuracy / float(i + 1)))
                with open('my_train_loss_t4_resnext.txt', 'a') as fi:
                    fi.write('\n' +
                             'epoch = {}, i = {}, tr_loss = {}, acc = {}'.
                             format(epoch + 1, i + 1, loss.item(), accuracy /
                                    float(i + 1)))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'my-decoder-{}-{}-t4-resnext.ckpt'.format(
                            epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'my-encoder-{}-{}-t4-resnext.ckpt'.format(
                            epoch + 1, i + 1)))

        train_losses.append(sum(losses) / total_step)
        train_acc.append(accuracy / total_step)

        # save losses over epochs
        with open("train_loss.txt", "a") as f:
            f.write(str(train_losses))

        # save accuracies over epochs
        with open("train_acc.txt", "a") as f:
            f.write(str(train_acc))
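
All of these examples build their loss targets the same way: the padded caption batch is flattened with pack_padded_sequence so that padding tokens never contribute to the cross-entropy. A minimal, self-contained sketch of what that call returns, using toy tensors that are not taken from the example above:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[1, 2, 3, 4],   # caption of length 4
                         [5, 6, 7, 0]])  # caption of length 3, zero-padded
lengths = [4, 3]                         # must be sorted in decreasing order

# Packing interleaves the batch time-step by time-step and drops the padding,
# which is exactly the layout of the decoder's packed outputs.
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)  # tensor([1, 5, 2, 6, 3, 7, 4]) -- 7 real tokens, no pads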
Example #2
def main(args):

    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        # transforms.RandomCrop(args.crop_size),
        # transforms.RandomHorizontalFlip(),
        transforms.Resize(args.crop_size),  # Scale was renamed to Resize
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, args.coco_detection_result,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    # the layout encoder hidden state size must be the same with decoder input size
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size, 100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        layout_encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(layout_encoder.parameters()) + list(decoder.parameters()) + \
             list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, label_seqs, location_seqs, layout_lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            layout_encoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            layout_encoding = layout_encoder(label_seqs, location_seqs, layout_lengths)
            comb_features = features + layout_encoding
            outputs = decoder(comb_features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
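
Example #2 fuses the layout encoding into the image features by plain elementwise addition (comb_features = features + layout_encoding), which only works because the layout encoder's hidden size is set to embed_size. A short standalone sketch of that fusion, with made-up sizes:

import torch

batch_size, embed_size = 4, 256
features = torch.randn(batch_size, embed_size)         # stand-in for EncoderCNN output
layout_encoding = torch.randn(batch_size, embed_size)  # stand-in for LayoutEncoder output
comb_features = features + layout_encoding             # shapes must match exactly
assert comb_features.shape == (batch_size, embed_size)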
Example #3
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    # transform = transforms.Compose([
    #     transforms.RandomCrop(args.crop_size),
    #     transforms.RandomHorizontalFlip(),
    #     transforms.ToTensor(),
    #     transforms.Normalize((0.485, 0.456, 0.406),
    #                          (0.229, 0.224, 0.225))])

    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # data_loader = get_loader(args.image_dir, args.caption_path, vocab,
    #                          transform, args.batch_size,
    #                          shuffle=True, num_workers=args.num_workers)
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform, shuffle=True, num_workers=args.num_workers)
    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(frogger_data_loader):
            images = to_var(images, volatile=True)
            if images.size(0) != 1:  # skip singleton batches
                captions = to_var(captions)
                # print(list(images.size())[0])
                # print(captions)
                # exit(0)

                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                decoder.zero_grad()
                encoder.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print(
                        'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                        % (epoch, args.num_epochs, i, total_step, loss.data[0],
                           np.exp(loss.data[0])))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(args.model_path,
                                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(args.model_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
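
The images.size(0) != 1 guard in Example #3 is likely there because the encoder ends in a BatchNorm layer, and BatchNorm cannot compute batch statistics from a single sample in training mode. A quick standalone demonstration:

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(256).train()
ok = bn(torch.randn(4, 256))   # fine: statistics computed over 4 samples
try:
    bn(torch.randn(1, 256))    # raises: needs >1 value per channel to train
except ValueError as e:
    print(e)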
Example #4
def main(args):
    # Build the vocabulary from the training captions
    threshold = 20  # minimum word count for a word to be kept
    captions_dict = load_captions(train_dir)
    vocab = Vocabulary(captions_dict, threshold)
    vocab_size = vocab.index

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper (unused here: the vocab is built above)
    # with open(args.vocab_path, 'rb') as f:
    #     vocab = pickle.load(f)
    dataloader = DataLoader(train_dir, vocab, transform)
    imagenumbers, captiontotal, imagetotal = dataloader.gen_data()

    # Build data loader
    data_loader = get_loader(imagenumbers,
                             captiontotal,
                             imagetotal,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_size,
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
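
Example #4 is the only one that builds its vocabulary on the fly instead of unpickling a wrapper, keeping only words that occur at least threshold times. A minimal sketch of that idea with a hypothetical build_word2idx helper (the real Vocabulary class lives in the example's repository):

from collections import Counter

def build_word2idx(captions, threshold):
    counter = Counter(w for c in captions for w in c.lower().split())
    word2idx = {'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3}
    for word, count in counter.items():
        if count >= threshold:
            word2idx[word] = len(word2idx)
    return word2idx

vocab = build_word2idx(['a dog runs', 'a cat sleeps', 'a dog barks'], 2)
print(vocab)  # only 'a' and 'dog' appear often enough to get an index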
Example #5
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
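
Note that Example #5 fine-tunes encoder.resnet.fc rather than the encoder.linear and encoder.bn layers the other examples use; in every case the pattern is the same: hand the optimizer only the parameter subset you want trained. A standalone sketch of that pattern, with a torchvision backbone and an arbitrary head size chosen for illustration:

import torch
import torch.nn as nn
import torchvision.models as models

resnet = models.resnet152(pretrained=True)
for p in resnet.parameters():
    p.requires_grad = False                        # freeze the backbone
resnet.fc = nn.Linear(resnet.fc.in_features, 256)  # fresh trainable head

# Only the new head's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(resnet.fc.parameters(), lr=1e-3)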
Example #6
class ImageDescriptor():
    def __init__(self, args, encoder):
        assert args.mode in ('train', 'val', 'test')
        self.__args = args
        self.__mode = args.mode
        self.__attention_mechanism = args.attention
        self.__stats_manager = ImageDescriptorStatsManager()
        self.__validate_when_training = args.validate_when_training
        self.__history = []

        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)

        self.__config_path = os.path.join(
            args.model_dir, f'config-{args.encoder}{args.encoder_ver}.txt')

        # Device configuration
        self.__device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        # training set vocab
        with open(args.vocab_path, 'rb') as f:
            self.__vocab = pickle.load(f)

        # validation set vocab
        with open(args.vocab_path.replace('train', 'val'), 'rb') as f:
            self.__vocab_val = pickle.load(f)

        # coco dataset
        self.__coco_train = CocoDataset(
            args.image_dir, args.caption_path, self.__vocab, args.crop_size)
        self.__coco_val = CocoDataset(
            args.image_dir, args.caption_path.replace('train', 'val'), self.__vocab_val, args.crop_size)

        # data loader
        self.__train_loader = torch.utils.data.DataLoader(dataset=self.__coco_train,
                                                          batch_size=args.batch_size,
                                                          shuffle=True,
                                                          num_workers=args.num_workers,
                                                          collate_fn=collate_fn)
        self.__val_loader = torch.utils.data.DataLoader(dataset=self.__coco_val,
                                                        batch_size=args.batch_size,
                                                        shuffle=False,
                                                        num_workers=args.num_workers,
                                                        collate_fn=collate_fn)
        # Build the models
        self.__encoder = encoder.to(self.__device)
        self.__decoder = DecoderRNN(args.embed_size, args.hidden_size,
                                    len(self.__vocab), args.num_layers, attention_mechanism=self.__attention_mechanism).to(self.__device)

        # Loss and optimizer
        self.__criterion = nn.CrossEntropyLoss()
        self.__params = list(self.__decoder.parameters(
        )) + list(self.__encoder.linear.parameters()) + list(self.__encoder.bn.parameters())
        self.__optimizer = torch.optim.Adam(
            self.__params, lr=args.learning_rate)

        # Load checkpoint and check compatibility
        if os.path.isfile(self.__config_path):
            with open(self.__config_path, 'r') as f:
                content = f.read()[:-1]
            if content != repr(self):
                # save the error info
                with open('config.err', 'w') as f:
                    print(f'f.read():\n{content}', file=f)
                    print(f'repr(self):\n{repr(self)}', file=f)
                raise ValueError(
                    "Cannot create this experiment: "
                    "I found a checkpoint conflicting with the current setting.")
            self.load(file_name=args.checkpoint)
        else:
            self.save()

    def setting(self):
        '''
        Return the setting of the experiment.
        '''
        return {'Net': (self.__encoder, self.__decoder),
                'Optimizer': self.__optimizer,
                'BatchSize': self.__args.batch_size}

    @property
    def epoch(self):
        return len(self.__history)

    @property
    def history(self):
        return self.__history

    # @property
    # def mode(self):
    #     return self.__args.mode

    # @mode.setter
    # def mode(self, m):
    #     self.__args.mode = m

    def __repr__(self):
        '''
        Pretty printer showing the setting of the experiment. This is what
        is displayed when doing `print(experiment)`. This is also what is
        saved in the `config.txt` file.
        '''
        string = ''
        for key, val in self.setting().items():
            string += '{}({})\n'.format(key, val)
        return string

    def state_dict(self):
        '''
        Returns the current state of the model.
        '''
        return {'Net': (self.__encoder.state_dict(), self.__decoder.state_dict()),
                'Optimizer': self.__optimizer.state_dict(),
                'History': self.__history}

    def save(self):
        '''
        Saves the model on disk, i.e., creates/updates the last checkpoint.
        '''
        file_name = os.path.join(
            self.__args.model_dir, '{}{}-epoch-{}.ckpt'.format(self.__args.encoder, self.__args.encoder_ver, self.epoch))
        torch.save(self.state_dict(), file_name)
        with open(self.__config_path, 'w') as f:
            print(self, file=f)

        print(f'Save to {file_name}.')

    def load(self, file_name=None):
        '''
        Loads the model from the last checkpoint saved on disk.

        Args:
            file_name (str): path to the checkpoint file
        '''
        if not file_name:
            # find the latest .ckpt file
            try:
                file_name = max(
                    glob.iglob(os.path.join(self.__args.model_dir, '*.ckpt')), key=os.path.getctime)
                print(f'Load from {file_name}.')
            except ValueError:  # max() on an empty iterator
                raise FileNotFoundError(
                    'No checkpoint file in the model directory.')
        else:
            file_name = os.path.join(self.__args.model_dir, file_name)
            print(f'Load from {file_name}.')

        try:
            checkpoint = torch.load(file_name, map_location=self.__device)
        except FileNotFoundError:
            raise FileNotFoundError(
                'Please check --checkpoint, the name of the file')

        self.load_state_dict(checkpoint)
        del checkpoint

    def load_state_dict(self, checkpoint):
        '''
        Loads the model from the input checkpoint.

        Args:
            checkpoint: an object saved with torch.save() from a file.
        '''
        self.__encoder.load_state_dict(checkpoint['Net'][0])
        self.__decoder.load_state_dict(checkpoint['Net'][1])
        self.__optimizer.load_state_dict(checkpoint['Optimizer'])
        self.__history = checkpoint['History']

        # The following loops are used to fix a bug that was
        # discussed here: https://github.com/pytorch/pytorch/issues/2830
        # (it is supposed to be fixed in recent PyTorch version)
        for state in self.__optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(self.__device)

    def train(self, plot_loss=None):
        '''
        Train the network using backpropagation based
        on the optimizer and the training set.

        Args:
            plot_loss (func, optional): if not None, should be a function taking a
                single argument being an experiment (meant to be `self`).
                Similar to a visitor pattern, this function is meant to inspect
                the current state of the experiment and display/plot/save
                statistics. For example, if the experiment is run from a
                Jupyter notebook, `plot` can be used to display the evolution
                of the loss with `matplotlib`. If the experiment is run on a
                server without display, `plot` can be used to show statistics
                on `stdout` or save statistics in a log file. (default: None)
        '''
        self.__encoder.train()
        self.__decoder.train()
        self.__stats_manager.init()
        total_step = len(self.__train_loader)
        start_epoch = self.epoch
        print("Start/Continue training from epoch {}".format(start_epoch))

        if plot_loss is not None:
            plot_loss(self)

        for epoch in range(start_epoch, self.__args.num_epochs):
            t_start = time.time()
            self.__stats_manager.init()
            for i, (images, captions, lengths) in enumerate(self.__train_loader):
                # Set mini-batch dataset
                if not self.__attention_mechanism:
                    images = images.to(self.__device)
                    captions = captions.to(self.__device)
                else:
                    with torch.no_grad():
                        images = images.to(self.__device)
                    captions = captions.to(self.__device)

                targets = pack_padded_sequence(
                    captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                if not self.__attention_mechanism:
                    features = self.__encoder(images)
                    outputs = self.__decoder(features, captions, lengths)
                    self.__decoder.zero_grad()
                    self.__encoder.zero_grad()
                else:
                    self.__encoder.zero_grad()
                    self.__decoder.zero_grad()
                    features, cnn_features = self.__encoder(images)
                    outputs = self.__decoder(
                        features, captions, lengths, cnn_features=cnn_features)
                loss = self.__criterion(outputs, targets)

                loss.backward()
                self.__optimizer.step()
                with torch.no_grad():
                    self.__stats_manager.accumulate(
                        loss=loss.item(), perplexity=np.exp(loss.item()))

                # Print log info each iteration
                if i % self.__args.log_step == 0:
                    print('[Training] Epoch: {}/{} | Step: {}/{} | Loss: {:.4f} | Perplexity: {:5.4f}'
                          .format(epoch+1, self.__args.num_epochs, i, total_step, loss.item(), np.exp(loss.item())))

            if not self.__validate_when_training:
                self.__history.append(self.__stats_manager.summarize())
                print("Epoch {} | Time: {:.2f}s\nTraining Loss: {:.6f} | Training Perplexity: {:.6f}".format(
                    self.epoch, time.time() - t_start, self.__history[-1]['loss'], self.__history[-1]['perplexity']))
            else:
                self.__history.append(
                    (self.__stats_manager.summarize(), self.evaluate()))
                print("Epoch {} | Time: {:.2f}s\nTraining Loss: {:.6f} | Training Perplexity: {:.6f}\nEvaluation Loss: {:.6f} | Evaluation Perplexity: {:.6f}".format(
                    self.epoch, time.time() - t_start,
                    self.__history[-1][0]['loss'], self.__history[-1][0]['perplexity'],
                    self.__history[-1][1]['loss'], self.__history[-1][1]['perplexity']))

            # Save the model checkpoints
            self.save()

            if plot_loss is not None:
                plot_loss(self)

        print("Finish training for {} epochs".format(self.__args.num_epochs))

    def evaluate(self, print_info=False):
        '''
        Evaluates the experiment, i.e., forward propagates the validation set
        through the network and returns the statistics computed by the stats
        manager.

        Args:
            print_info (bool): print the results of loss and perplexity
        '''
        self.__stats_manager.init()
        self.__encoder.eval()
        self.__decoder.eval()
        total_step = len(self.__val_loader)
        with torch.no_grad():
            for i, (images, captions, lengths) in enumerate(self.__val_loader):
                images = images.to(self.__device)
                captions = captions.to(self.__device)
                targets = pack_padded_sequence(
                    captions, lengths, batch_first=True)[0]

                # Forward
                if not self.__attention_mechanism:
                    features = self.__encoder(images)
                    outputs = self.__decoder(features, captions, lengths)
                else:
                    features, cnn_features = self.__encoder(images)
                    outputs = self.__decoder(
                        features, captions, lengths, cnn_features=cnn_features)
                loss = self.__criterion(outputs, targets)
                self.__stats_manager.accumulate(
                    loss=loss.item(), perplexity=np.exp(loss.item()))
                if i % self.__args.log_step == 0:
                    print('[Validation] Step: {}/{} | Loss: {:.4f} | Perplexity: {:5.4f}'
                          .format(i, total_step, loss.item(), np.exp(loss.item())))

        summarize = self.__stats_manager.summarize()
        if print_info:
            print(
                f'[Validation] Average loss for this epoch is {summarize["loss"]:.6f}')
            print(
                f'[Validation] Average perplexity for this epoch is {summarize["perplexity"]:.6f}\n')
        self.__encoder.train()
        self.__decoder.train()
        return summarize

    def mode(self, mode=None):
        '''
        Get the current mode or change mode.

        Args:
            mode (str): 'train' or 'eval' mode
        '''
        if not mode:
            return self.__mode
        self.__mode = mode

    def __load_image(self, image):
        '''
        Resize and normalize a PIL image for evaluation.

        Args:
            image (PIL Image): image
        '''
        image = image.resize([224, 224], Image.LANCZOS)

        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))])
        image = transform(image).unsqueeze(0)

        return image

    def test(self, image_path=None, plot=False):
        '''
        Evaluate the model by generating the caption for the
        corresponding image at `image_path`.

        Note: This function does not provide a BLEU score.

        Args:
            image_path (str): file path of the evaluation image
            plot (bool): plot or not
        '''
        self.__encoder.eval()
        self.__decoder.eval()

        with torch.no_grad():
            if not image_path:
                image_path = self.__args.image_path

            image = Image.open(image_path)

            # only process with RGB image
            if np.array(image).ndim == 3:
                img = self.__load_image(image).to(self.__device)

                # generate a caption
                if not self.__attention_mechanism:
                    feature = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature)
                    sampled_ids = sampled_ids[0].cpu().numpy()
                else:
                    feature, cnn_features = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature, cnn_features)
                    sampled_ids = sampled_ids.cpu().data.numpy()

                # Convert word_ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = self.__vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption[1:-1])

                # Print out the image and the generated caption
                print(sentence)

                if plot:
                    image = Image.open(image_path)
                    plt.imshow(np.asarray(image))
            else:
                print('Non-RGB images are not supported.')
        self.__encoder.train()
        self.__decoder.train()

    def coco_image(self, idx, ds='val'):
        '''
        Access the image_id (which is part of the file name)
        and the corresponding image caption at index `idx` in the COCO dataset.

        Note: For jupyter notebook

        Args:
            idx (int): index of COCO dataset

        Returns:
            (dict)
        '''
        assert ds in ('train', 'val')

        if ds == 'train':
            ann_id = self.__coco_train.ids[idx]
            return self.__coco_train.coco.anns[ann_id]
        else:
            ann_id = self.__coco_val.ids[idx]
            return self.__coco_val.coco.anns[ann_id]

    @property
    def len_of_train_set(self):
        '''
        Number of training examples.
        '''
        return len(self.__coco_train)

    @property
    def len_of_val_set(self):
        return len(self.__coco_val)

    def bleu_score(self, idx, ds='val', plot=False, show_caption=False):
        '''
        Evaluate the BLEU score for index `idx` in COCO dataset.

        Note: For jupyter notebook

        Args:
            idx (int): index
            ds (str): training or validation dataset
            plot (bool): plot the image or not

        Returns:
            score (float): bleu score
        '''
        assert ds in ('train', 'val')
        self.__encoder.eval()
        self.__decoder.eval()

        with torch.no_grad():
            try:
                if ds == 'train':
                    ann_id = self.__coco_train.ids[idx]
                    coco_ann = self.__coco_train.coco.anns[ann_id]
                else:
                    ann_id = self.__coco_val.ids[idx]
                    coco_ann = self.__coco_val.coco.anns[ann_id]
            except (IndexError, KeyError):
                raise IndexError('Invalid index')

            image_id = coco_ann['image_id']

            image_id = str(image_id).zfill(6)  # zero-pad to six digits

            image_path = f'{self.__args.image_dir}/COCO_train2014_000000{image_id}.jpg'
            if ds == 'val':
                image_path = image_path.replace('train', 'val')

            coco_list = coco_ann['caption'].split()

            image = Image.open(image_path)

            if np.array(image).ndim == 3:
                img = self.__load_image(image).to(self.__device)

                # generate a caption
                if not self.__attention_mechanism:
                    feature = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature)
                    sampled_ids = sampled_ids[0].cpu().numpy()
                else:
                    feature, cnn_features = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature, cnn_features)
                    sampled_ids = sampled_ids.cpu().data.numpy()

                # Convert word_ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = self.__vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break

                # strip punctuations and spacing
                sampled_list = [c for c in sampled_caption[1:-1]
                                if c not in punctuation]

                # sentence_bleu expects a list of reference token lists
                score = sentence_bleu([coco_list], sampled_list,
                                      smoothing_function=SmoothingFunction().method4)

                if plot:
                    plt.figure()
                    image = Image.open(image_path)
                    plt.imshow(np.asarray(image))
                    plt.title(f'score: {score}')
                    plt.xlabel(f'file: {image_path}')

                # Print out the generated caption
                if show_caption:
                    print(f'Sampled caption:\n{sampled_list}')
                    print(f'COCO caption:\n{coco_list}')

            else:
                print('Non-RGB images are not supported.')
                return

        return score
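
The bleu_score() method above relies on NLTK's sentence-level BLEU with method4 smoothing. A standalone sketch of just that computation, with made-up token lists:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = ['a', 'man', 'rides', 'a', 'horse']         # ground-truth tokens
candidate = ['a', 'man', 'is', 'riding', 'a', 'horse']  # sampled tokens

# The references argument is a list of token lists (one per reference).
score = sentence_bleu([reference], candidate,
                      smoothing_function=SmoothingFunction().method4)
print(f'BLEU: {score:.4f}')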
Example #7
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([ 
        transforms.Resize((224,224)), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    #Load vocab_list for uniskip
    vocab_list = pd.read_csv("./data/vocab_list.csv", header=None)
    vocab_list = vocab_list.values.tolist()[0]
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.img_embeddings_dir, args.data_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    #im_encoder = preprocess_get_model.model()
    attention = T_Att()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers, args.dropout)
   
    uniskip = UniSkip('./data/skip-thoughts', vocab_list)
    if torch.cuda.is_available():
        #im_encoder.cuda()
        attention.cuda()
        decoder.cuda()
        uniskip.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(attention.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, cap_lengths, qa, qa_lengths, vocab_words) in enumerate(tqdm(data_loader)):
            
            #Re-initialize decoder hidden state
            decoder.hidden = decoder.init_hidden()

            # Set mini-batch dataset
            img_embeddings = to_var(images.data, volatile=True)
            captions = to_var(captions)
            qa = to_var(qa)
            targets = pack_padded_sequence(qa, qa_lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            attention.zero_grad()
            #features = encoder(images)
            #img_embeddings = im_encoder(images)   
            cap_embeddings = uniskip(captions, cap_lengths)
            cap_embeddings = cap_embeddings.data
            img_embeddings = img_embeddings.data
           # print(img_embeddings.size())
           # print(type(img_embeddings))
           # print(cap_embeddings.size())
            #print(type(cap_embeddings)) 
            ctx_vec = attention(img_embeddings,cap_embeddings)
            outputs = decoder(ctx_vec, qa, qa_lengths)
            predicted = outputs.max(1)[1]
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            #pred_ids = []
            #print(predicted.size())
            # pred_ids.append(predicted)
            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, args.num_epochs, i, total_step, 
                        loss.data[0], np.exp(loss.data[0]))) 
                #output_ids = predicted.cpu().data.numpy()
                #sample = []                
                #for word_id in output_ids:
                #    word = vocab.idx2word[word_id]
                #    sample.append(word)
                #sample = ' '.join(sample)
                #print("predicted qa : " + sample)

            # Save the models
            if (i+1)%args.save_step == 0:
                torch.save(decoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(attention.state_dict(), 
                           os.path.join(args.model_path, 
                                        'attention-%d-%d.pkl' %(epoch+1, i+1)))
Example #8
def main(args):

    # create the model folder that keeps the checkpoint files
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # image preprocessing and normalization
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)  # load vocabulary wrapper file

    # get data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    if torch.cuda.is_available():  # load GPU
        encoder.cuda()
        decoder.cuda()

    criterion = nn.CrossEntropyLoss()  # loss function
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)  # optimizer

    # train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # set mini batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # forward and backward
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()

            optimizer.step()  # optimization

            # Print loss and perplexity
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # save the models pickle file settings
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
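
Every example logs "Perplexity" next to the loss; it is nothing more than the exponential of the mean per-token cross-entropy. A quick standalone check with random logits:

import numpy as np
import torch
import torch.nn as nn

logits = torch.randn(10, 50)             # 10 tokens, vocabulary of 50
targets = torch.randint(0, 50, (10,))
loss = nn.CrossEntropyLoss()(logits, targets)
print(np.exp(loss.item()))               # the perplexity of this batch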
Example #9
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models (Gen)
    # TODO: put these in generator
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Build the models (Disc)
    discriminator = Discriminator(args.embed_size, args.hidden_size,
                                  len(vocab), args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        discriminator.cuda()

    # Loss and Optimizer (Gen)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Loss and Optimizer (Disc)
    params_disc = list(discriminator.parameters())
    optimizer_disc = torch.optim.Adam(params_disc)

    # Train the Models
    total_step = len(data_loader)
    disc_losses = []
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, wrong_captions,
                wrong_lengths) in enumerate(data_loader):

            # pdb.set_trace()
            # TODO: train disc before gen

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            wrong_captions = to_var(wrong_captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)

            sampled_captions = decoder.sample(features)
            # sampled_captions = torch.zeros_like(sampled_ids)
            sampled_lengths = []

            for row in range(sampled_captions.size(0)):
                for index, word_id in enumerate(sampled_captions[row, :]):
                    # pdb.set_trace()
                    word = vocab.idx2word[word_id.cpu().data.numpy()[0]]
                    # sampled_captions[row, index].data = word
                    if word == '<end>':
                        sampled_lengths.append(index + 1)
                        break
                    elif index == sampled_captions.size(1) - 1:
                        sampled_lengths.append(sampled_captions.size(1))
                        break
            # sort lengths in decreasing order, as pack_padded_sequence expects
            sampled_lengths = sorted(sampled_lengths, reverse=True)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Train discriminator
            discriminator.zero_grad()
            rewards_real = discriminator(images, captions, lengths)
            rewards_fake = discriminator(images, sampled_captions,
                                         sampled_lengths)
            rewards_wrong = discriminator(images, wrong_captions,
                                          wrong_lengths)
            real_loss = -torch.mean(torch.log(rewards_real))
            fake_loss = -torch.mean(
                torch.clamp(torch.log(1 - rewards_fake), min=-1000))
            wrong_loss = -torch.mean(
                torch.clamp(torch.log(1 - rewards_wrong), min=-1000))
            loss_disc = real_loss + fake_loss + wrong_loss

            disc_losses.append(loss_disc.cpu().data.numpy()[0])
            loss_disc.backward()
            optimizer_disc.step()

            # print('iteration %i' % i)
            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            # if (i+1) % args.save_step == 0:
            if (
                    i + 1
            ) % total_step == 0:  # jm: saving at the last iteration instead
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    discriminator.state_dict(),
                    os.path.join(
                        args.model_path,
                        'discriminator-%d-%d.pkl' % (epoch + 1, i + 1)))

                # plot at the end of every epoch
                plt.plot(disc_losses, label='disc loss')
                plt.savefig('disc_losses.png')
                plt.clf()
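
Example #9's discriminator is trained with three binary-cross-entropy-style terms: rewards for real image-caption pairs are pushed toward 1, while rewards for sampled and mismatched captions are pushed toward 0, with log(1 - r) clamped for numerical stability. A standalone sketch with stand-in reward scores:

import torch

rewards_real = torch.rand(8).clamp(1e-6, 1 - 1e-6)   # stand-ins for the
rewards_fake = torch.rand(8).clamp(1e-6, 1 - 1e-6)   # discriminator outputs
rewards_wrong = torch.rand(8).clamp(1e-6, 1 - 1e-6)

real_loss = -torch.mean(torch.log(rewards_real))
fake_loss = -torch.mean(torch.clamp(torch.log(1 - rewards_fake), min=-1000))
wrong_loss = -torch.mean(torch.clamp(torch.log(1 - rewards_wrong), min=-1000))
loss_disc = real_loss + fake_loss + wrong_loss
print(loss_disc.item())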
Example #10
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    transform = transforms.Compose([
        # transforms.ColorJitter(contrast=0.3, saturation=0.3),
        # transforms.RandomChoice([transforms.RandomHorizontalFlip(),
        #                          transforms.RandomVerticalFlip()]),
        transforms.RandomAffine(0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize((0.8, 0.7, 0.8),
                             (1, 1, 1))
    ])
    
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
    #                          transform, args.batch_size,
    #                          shuffle=True, num_workers=args.num_workers) 
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform,
        shuffle=True, num_workers=args.num_workers)
    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    # unlike the other examples, this one also fine-tunes the full ResNet
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + \
             list(encoder.bn.parameters()) + list(encoder.resnet.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    stransform = transforms.ToPILImage()

    img2vec = Img2Vec()
    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(frogger_data_loader):
            # image1 = images[0].squeeze()
            # # print(image1.size())
            # # c = stransform(image1)
            # # vec = img2vec.get_vec(c,True)
            # # # print(vec)
            # # c.save('save_image1.png')
            # # image2 = images[1].squeeze()
            # # print(image2.size())
            # # c = stransform(image2)
            # # # vec = img2vec.get_vec(c)
            # # # print(vec)
            # # c.save('save_image2.png')
            images = to_var(images, volatile=True)
            # images = images.to(device)
            if images.size(0) != 1:  # skip singleton batches
                captions = to_var(captions)
                
                # print(images[0])
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
                decoder.zero_grad()
                encoder.zero_grad()
                # print(images) 
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          %(epoch, args.num_epochs, i, total_step, 
                            loss.data[0], np.exp(loss.data[0]))) 
                    
                # Save the models
                if (i+1) % args.save_step == 0:
                    torch.save(decoder.state_dict(), 
                               os.path.join(args.model_path, 
                                            'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                    torch.save(encoder.state_dict(), 
                               os.path.join(args.model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
Example #11
def train(
        num_epochs: int,
        lr: float,
        batch_size: int,
        vocab_threshold: int,
        vocab_from_file: bool,
        embed_size: int,
        hidden_size: int,
        save_every: int,
        print_every: int,
        log_file: str
) -> None:
    """
    Train the captioning network with the required parameters.
    The training logs are saved in log_file.

    num_epochs:         Number of epochs to train the model.
    batch_size:         Mini-batch size for training.
    vocab_threshold:    Minimum word count threshold for vocabulary initialisation. A word that appears
                        fewer than vocab_threshold times in the dataset is discarded and does not
                        appear in the vocabulary dictionary; the smaller the threshold, the larger
                        the vocabulary.
    vocab_from_file:    Whether to load the vocabulary from a pre-initialized file.
    embed_size:         Dimensionality of image and word embeddings.
    hidden_size:        Number of features in hidden state of the RNN decoder.
    save_every:         Number of epochs between each checkpoint saving.
    print_every:        Number of batches for printing average loss.
    log_file:           Name of the training log file. Saves loss and perplexity.

    """

    transform_train = transforms.Compose([
        transforms.Resize(256),                          # smaller edge of image resized to 256
        transforms.RandomCrop(224),                      # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
        transforms.ToTensor(),                           # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                             (0.229, 0.224, 0.225))])

    # Build data loader.
    data_loader = get_loader(transform=transform_train,
                             mode='train',
                             batch_size=batch_size,
                             vocab_threshold=vocab_threshold,
                             vocab_from_file=vocab_from_file)

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder.
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move models to GPU if CUDA is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)

    # Define the loss function.
    criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

    # Parameters to update. We do not re-train the CNN here.
    params = list(encoder.embed.parameters()) + list(decoder.parameters())

    # TODO: add learning rate scheduler
    # Optimizer for minimum search.
    optimizer = optim.Adam(params, lr=lr)

    # Set the total number of training steps per epoch.
    total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

    # Open the training log file.
    f = open(log_file, 'w')

    for epoch in range(1, num_epochs + 1):
        for i_step in range(1, total_step + 1):

            # Randomly sample a caption length, and sample indices with that length.
            indices = data_loader.dataset.get_train_indices()
            # Create and assign a batch sampler to retrieve a batch with the sampled indices.
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch.
            images, captions = next(iter(data_loader))

            # Move batch of images and captions to GPU if CUDA is available.
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)

            # Calculate the batch loss.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (
                epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))

            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(decoder.state_dict(), os.path.join('./models', f"{device}_{hidden_size}_decoder-{epoch}.pkl"))
            torch.save(encoder.state_dict(), os.path.join('./models', f"{device}_{hidden_size}_encoder-{epoch}.pkl"))

    # Close the training log file.
    f.close()
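
For reference, a minimal invocation of the train function above; every hyperparameter value here is an illustrative assumption, not a setting from the original project:

if __name__ == '__main__':
    # Hypothetical hyperparameters for a quick run; adjust to your setup.
    train(num_epochs=3,
          lr=0.001,
          batch_size=64,
          vocab_threshold=5,
          vocab_from_file=False,
          embed_size=256,
          hidden_size=512,
          save_every=1,
          print_every=100,
          log_file='training_log.txt')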
Exemplo n.º 12
0
def main(cfg):
    # Create the model directory
    if not os.path.exists(hydra.utils.to_absolute_path(cfg.train.model_path)):
        os.makedirs(hydra.utils.to_absolute_path(cfg.train.model_path))

    # Preprocess and normalize the images
    transform = transforms.Compose([
        transforms.RandomCrop(cfg.image.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])

    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # Build the data loader
    data_loader = get_loader(hydra.utils.to_absolute_path(cfg.train.image_dir),
                             hydra.utils.to_absolute_path(cfg.train.caption_path), vocab, transform,
                             cfg.train.batch_size, shuffle=True,
                             num_workers=cfg.train.num_workers)

    # Build the models
    encoder = EncoderCNN(cfg.train.embed_size).to(device)
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size, len(vocab), cfg.train.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=cfg.train.learning_rate)

    # train
    total_step = len(data_loader)

    for epoch in range(cfg.train.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            if i % cfg.train.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, cfg.train.num_epochs, i, total_step, loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % cfg.train.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    hydra.utils.to_absolute_path(cfg.train.model_path), 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

                torch.save(encoder.state_dict(), os.path.join(
                    hydra.utils.to_absolute_path(cfg.train.model_path), 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
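
This entry point assumes a Hydra configuration exposing the train and image groups accessed above. A sketch of that structure, expressed with OmegaConf purely for illustration (the key names follow the accesses in the code; every value is an assumption):

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    'image': {'crop_size': 224,
              'mean': [0.485, 0.456, 0.406],
              'std': [0.229, 0.224, 0.225]},
    'train': {'model_path': 'models/',
              'vocab_path': 'data/vocab.pkl',
              'image_dir': 'data/resized2014',
              'caption_path': 'data/annotations/captions_train2014.json',
              'batch_size': 128, 'num_workers': 2,
              'embed_size': 256, 'hidden_size': 512, 'num_layers': 1,
              'learning_rate': 0.001, 'num_epochs': 5,
              'log_step': 10, 'save_step': 1000},
})
print(cfg.train.hidden_size)  # attribute-style access, as used in main(cfg)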
Exemplo n.º 13
0
def main(args):
    tensor_board_writer = Logger()
    # Create model directory
    # if not os.path.exists(args.model_path):
    #     os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    with open(args.test_vocab_path, 'rb') as f:
        test_vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    test_data_loader = get_loader(args.test_image_dir,
                                  args.test_caption_path,
                                  test_vocab,
                                  transform,
                                  args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in trange(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(tqdm(data_loader)):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()
Exemplo n.º 14
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             args.dictionary,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    #encoder = EncoderCNN(args.embed_size).to(device)
    dictionary = pd.read_csv(args.dictionary,
                             header=0,
                             encoding='unicode_escape',
                             error_bad_lines=False)
    dictionary = list(dictionary['keys'])

    decoder = DecoderRNN(len(dictionary), args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters())  # + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (array, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            array = array.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            #features = encoder(images)
            outputs = decoder(array, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            #encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
Exemplo n.º 15
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path + "_" + args.model_type):
        os.makedirs(args.model_path + "_" + args.model_type)
    
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([ 
        #transforms.RandomCrop(args.crop_size),
        #transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers, mode=args.mode) 
    
    # Build the models
    encoder = EncoderCNN(args.embed_size, args.model_type, args.mode).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    #params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    #params = list(encoder.parameters()) + list(decoder.parameters())

    # fine tune
    if(args.mode == "side_by_side"): params = list(decoder.parameters()) + list(encoder.linear_to_embed_size.parameters())
    elif(args.mode == "depthwise"): params = list(decoder.parameters()) + list(encoder.linear_to_embed_size_concat.parameters())

    # train from scratch
    #if(args.mode == "side_by_side"): params = list(decoder.parameters()) + list(encoder.linear_to_embed_size.parameters()) + list(encoder.bn.parameters()) + list(encoder.feature_extractor.parameters())
    #elif(args.mode == "depthwise"): params = list(decoder.parameters()) + list(encoder.linear_to_embed_size_concat.parameters()) + list(encoder.bn.parameters()) + list(encoder.feature_extractor.parameters())

    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)

            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item())))
                
    # Save the model checkpoints
    torch.save(decoder.state_dict(), os.path.join(
        args.model_path + "_" + args.model_type + "/", 'decoder.pt'))
    torch.save(encoder.state_dict(), os.path.join(
        args.model_path + "_" + args.model_type + "/", 'encoder.pt'))
Exemplo n.º 16
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    #read rationalization data
    rationalizations = []
    max_length = 0
    lengths = []
    bad_worker_ids = [
        'A2CNSIECB9UP05', 'A23782O23HSPLA', 'A2F9ZBSR6AXXND', 'A3GI86L18Z71XY',
        'AIXTI8PKSX1D2', 'A2QWHXMFQI18GQ', 'A3SB7QYI84HYJT', 'A2Q2A7AB6MMFLI',
        'A2P1KI42CJVNIA', 'A1IJXPKZTJV809', 'A2WZ0RZMKQ2WGJ', 'A3EKETMVGU2PM9',
        'A1OCEC1TBE3CWA', 'AE1RYK54MH11G', 'A2ADEPVGNNXNPA', 'A15QGLWS8CNJFU',
        'A18O3DEA5Z4MJD', 'AAAL4RENVAPML', 'A3TZBZ92CQKQLG', 'ABO9F0JD9NN54',
        'A8F6JFG0WSELT', 'ARN9ET3E608LJ', 'A2TCYNRAZWK8CC', 'A32BK0E1IPDUAF',
        'ANNV3E6CIVCW4'
    ]
    with open('./Log/Rationalizations.txt') as f:
        for line in f:
            line = line.lower()
            line = re.sub(r"[^a-z ']+", " ", line)
            words = line.split()
            length = len(words)
            lengths.append(length)
            if length > max_length:
                max_length = length
            for index, word in enumerate(words):
                words[index] = vocab.word2idx[word]
            rationalizations.append(words)
    rationalizations = [np.array(xi) for xi in rationalizations]

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    frogger_data_loader = get_images('./data/FroggerDataset/', args.batch_size,
                                     transform)

    # Train the Models
    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, x in enumerate(frogger_data_loader):
            # Keep autograd enabled; volatile=True would prevent loss.backward().
            images = to_var(x[0])
            captions = []
            max_length = max(lengths[i:i + 2])
            rats = rationalizations[i:i + 2]
            rats.sort(key=lambda s: len(s))
            rats.reverse()
            for index, r in enumerate(rats):
                # print(max_length)
                r = np.lib.pad(r, (0, max_length - len(r)), 'constant')
                captions.append(r)
            captions = to_var(torch.from_numpy(np.asarray(captions)))

            new_lengths = lengths[i:i + 2]
            new_lengths.sort(reverse=True)
            targets = pack_padded_sequence(captions,
                                           new_lengths,
                                           batch_first=True)[0]
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, new_lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.item(),
                       np.exp(loss.item())))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
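
The manual pad-and-sort logic above exists because pack_padded_sequence expects a padded batch ordered by decreasing length. A self-contained sketch of that contract, with toy values for illustration:

import numpy as np
import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Two variable-length index sequences, sorted longest-first.
seqs = [np.array([4, 7, 2, 9]), np.array([5, 1])]
lengths = [len(s) for s in seqs]                                  # [4, 2]
max_len = max(lengths)
padded = np.stack([np.pad(s, (0, max_len - len(s)), 'constant') for s in seqs])
batch = torch.from_numpy(padded)                                  # shape (2, 4)

packed = pack_padded_sequence(batch, lengths, batch_first=True)
print(packed.data)         # the 6 real tokens, time-major: tensor([4, 5, 7, 1, 2, 9])
print(packed.batch_sizes)  # tensor([2, 2, 1, 1])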
Exemplo n.º 17
0
def main(args):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    with open('data/multim_poem.json') as f, open('data/unim_poem.json') as unif:
        multim = json.load(f)
        unim = json.load(unif)

    multim = util.filter_multim(multim)
    # multim = multim[:128]
    with open('data/img_features.pkl', 'rb') as fi, open('data/poem_features.pkl', 'rb') as fp:
        img_features = pickle.load(fi)
        poem_features = pickle.load(fp)


    # make sure vocab exists
    word2idx, idx2word = util.read_vocab_pickle(args.vocab_path)

    # will be used in embedder

    if args.source == 'unim':
        data = unim
        features = poem_features
    elif args.source == 'multim':
        data = multim
        features = img_features
    else:
        print('Error: source must be unim or multim!')
        exit()

    # create data loader. the data will be in decreasing order of length
    data_loader = get_poem_poem_dataset(args.batch_size, shuffle=True,
                                        num_workers=args.num_workers, json_obj=data, features=features,
                                        max_seq_len=128, word2idx=word2idx, tokenizer=None)

    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(word2idx), device)
    decoder = DataParallel(decoder)
    if args.restore:
        decoder.load_state_dict(torch.load(args.ckpt))
    if args.load:
        decoder.load_state_dict(torch.load(args.load))
    decoder.to(device)

    discriminator = Discriminator(args.embed_size, args.hidden_size, len(word2idx), num_labels=2)
    discriminator.embed.weight = decoder.module.embed.weight
    discriminator = DataParallel(discriminator)
    if args.restore:
        discriminator.load_state_dict(torch.load(args.disc))
    discriminator.to(device)

    # optimization config
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3, 10], gamma=0.33)
    optimizerD = torch.optim.Adam(discriminator.parameters(), lr=args.learning_rate)

    sys.stderr.write('Start training...\n')
    total_step = len(data_loader)
    decoder.train()
    global_step = 0
    running_ls = 0
    for epoch in range(args.num_epochs):
        scheduler.step()
        acc_ls = 0
        start = time.time()

        for i, batch in enumerate(data_loader):
            poem_embed, ids, lengths = [t.to(device) for t in batch]
            targets = pack_padded_sequence(ids[:, 1:], lengths, batch_first=True)[0]
            # train discriminator

            # train with real
            discriminator.zero_grad()
            pred_real = discriminator(ids[:, 1:], lengths)
            real_label = torch.ones(ids.size(0), dtype=torch.long).to(device)
            loss_d_real = criterion(pred_real, real_label)
            loss_d_real.backward(torch.ones_like(loss_d_real), retain_graph=True)

            # train with fake

            logits = decoder(poem_embed, ids, lengths)
            weights = F.softmax(logits, dim=-1)
            m = Categorical(probs=weights)
            generated_ids = m.sample()

            # generated_ids = torch.argmax(logits, dim=-1)
            pred_fake = discriminator(generated_ids.detach(), lengths)
            fake_label = torch.zeros(ids.size(0)).long().to(device)
            loss_d_fake = criterion(pred_fake, fake_label)
            loss_d_fake.backward(torch.ones_like(loss_d_fake), retain_graph=True)

            loss_d = loss_d_real.mean().item() + loss_d_fake.mean().item()

            optimizerD.step()

            # train generator
            decoder.zero_grad()
            reward = F.softmax(pred_fake, dim=-1)[:, 1].unsqueeze(-1)
            loss_r = -m.log_prob(generated_ids) * reward
            loss_r.backward(torch.ones_like(loss_r), retain_graph=True)
            loss_r = loss_r.mean().item()

            loss = criterion(pack_padded_sequence(logits, lengths, batch_first=True)[0], targets)
            loss.backward(torch.ones_like(loss))
            loss = loss.mean().item()
            # loss = loss_r
            running_ls += loss
            acc_ls += loss

            for param in decoder.parameters():
                torch.nn.utils.clip_grad_norm_(param, 0.25)

            optimizer.step()
            global_step += 1

            if global_step % args.log_step == 0:
                elapsed_time = time.time() - start
                iters_per_sec = (i + 1) / elapsed_time
                remaining = (total_step - i - 1) / iters_per_sec
                remaining_fmt = time.strftime("%H:%M:%S", time.gmtime(remaining))
                elapsed_fmt = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

                print('[{}/{}, {}/{}], ls_d: {:.2f}, ls_r: {:.2f}, ls: {:.2f}, avg_ls: {:.2f}, Perp: {:5.2f}, {:.3}it/s {}<{}'
                      .format(epoch + 1, args.num_epochs, i + 1, total_step, loss_d, loss_r,
                              running_ls / args.log_step, acc_ls / (i + 1), np.exp(acc_ls / (i + 1)),
                              iters_per_sec, elapsed_fmt, remaining_fmt))
                running_ls = 0

            if global_step % args.save_step == 0:
                torch.save(decoder.state_dict(), args.ckpt)
                torch.save(discriminator.state_dict(), args.disc)
    torch.save(decoder.state_dict(), args.save)
    torch.save(discriminator.state_dict(), args.disc)
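
The generator update above mixes teacher-forced cross-entropy with a REINFORCE-style term: token sequences are sampled from the decoder's softmax, and their log-probabilities are weighted by the discriminator's "real" probability as a reward. A stripped-down sketch of that mechanic; the shapes and the random stand-ins for the decoder and discriminator outputs are illustrative assumptions:

import torch
import torch.nn.functional as F
from torch.distributions import Categorical

logits = torch.randn(2, 5, 100, requires_grad=True)   # (batch, seq_len, vocab) decoder output
m = Categorical(probs=F.softmax(logits, dim=-1))
generated_ids = m.sample()                            # (batch, seq_len) sampled token ids

reward = torch.rand(2, 1)                             # stand-in for the discriminator's P(real)

# Policy-gradient surrogate: push up the log-likelihood of well-rewarded samples.
loss_r = -(m.log_prob(generated_ids) * reward).mean()
loss_r.backward()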
Exemplo n.º 18
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    obj = data_loader.MsvdDataset()
    datas = obj.getAll()
    #print(len(datas))
    os.chdir(r'E:/jupyterNotebook/our_project/')
    # Train the models
    total_step = len(datas)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(datas):

            #print(epoch,i,images.shape)
            # Set mini-batch dataset
            images = images.to(device)

            # Forward, backward and optimize
            features = encoder(images)
            features = features.cpu().detach().numpy()
            features = features.mean(axis=0)
            features = torch.from_numpy(features).view(1, -1).to(device)
            #print(features.shape)

            for j in range(1):  # only the first caption; originally range(len(captions))
                captions[j] = captions[j].long()
                captions[j] = captions[j].view(1, -1).to(device)
                targets = pack_padded_sequence(captions[j],
                                               lengths[j],
                                               batch_first=True)[0]

                outputs = decoder(features, captions[j], lengths[j])
                #print(targets.shape)
                #print(outputs.shape)
                loss = criterion(outputs, targets)
                decoder.zero_grad()
                #encoder.zero_grad()
                loss.backward()
                optimizer.step()

            print(
                'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                .format(epoch + 1, args.num_epochs, i, total_step, loss.item(),
                        np.exp(loss.item())))

            #print(os.path)
            if (i + 1) % 25 == 0:  # hard-coded in place of args.save_step
                torch.save(
                    decoder.state_dict(),
                    os.path.join(r'E:\jupyterNotebook\our_project\models',
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(r'E:\jupyterNotebook\our_project\models',
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
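
One caveat in the example above: the numpy round-trip used to average the per-frame features detaches them from the autograd graph, so encoder.linear receives no gradients even though it sits in the optimizer's parameter list. The pooling can be done directly in torch, which stays on-device and keeps the graph intact; a minimal sketch:

import torch

frame_features = torch.randn(16, 256, requires_grad=True)   # (num_frames, embed_size)
video_feature = frame_features.mean(dim=0, keepdim=True)    # (1, embed_size), graph intact
video_feature.sum().backward()                              # gradients reach the frame features
print(frame_features.grad.shape)                            # torch.Size([16, 256])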
Exemplo n.º 19
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    transform_val = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    val_loader = get_loader(args.val_dir,
                            args.val_caption_path,
                            vocab,
                            transform_val,
                            args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers)
    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    encoder.freeze_bottom()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)
    #     decoder = BahdanauAttnDecoderRNN(args.hidden_size, args.embed_size, len(vocab)).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    accs, b1s, b2s, b3s, b4s = [], [], [], [], []
    for epoch in range(args.num_epochs):
        decoder.train()
        encoder.train()
        losses = []
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            losses.append(loss.item())
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch + 1, args.num_epochs, i, total_step,
                            loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))


        # Evaluate on the validation set after each epoch; these metrics feed
        # the log file below and the plot after training.
        acc, b1, b2, b3, b4 = evaluate(val_loader, encoder, decoder, vocab)
        accs.append(acc)
        b1s.append(b1)
        b2s.append(b2)
        b3s.append(b3)
        b4s.append(b4)
        avg_loss = sum(losses) / total_step

        print('Epoch {} Average Training Loss: {:.4f}'.format(
            epoch + 1, avg_loss))

        with open('stem_freeze_freq1000.txt', 'a') as file:
            file.write("Epoch {} \n".format(epoch + 1))
            file.write('Average Accuracy: {} \n'.format(acc))
            file.write('Average Loss: {} \n'.format(avg_loss))
            file.write('Average BLEU gram1: {} \n'.format(b1))
            file.write('Average BLEU gram2: {} \n'.format(b2))
            file.write('Average BLEU gram3: {} \n'.format(b3))
            file.write('Average BLEU gram4: {} \n'.format(b4))
            file.write('\n')

    plt.title("Accuracy vs BLEU score")
    plt.plot(np.arange(1, args.num_epochs + 1), accs, label='accuracy')
    plt.plot(np.arange(1, args.num_epochs + 1), b1s, label='BLEU 1')
    plt.plot(np.arange(1, args.num_epochs + 1), b2s, label='BLEU 2')
    plt.plot(np.arange(1, args.num_epochs + 1), b3s, label='BLEU 3')
    plt.plot(np.arange(1, args.num_epochs + 1), b4s, label='BLEU 4')
    plt.xlabel("epochs")
    plt.xticks(np.arange(1, args.num_epochs + 1))
    plt.legend(loc='upper left')
    plt.savefig('accuracy_BLEU.png')
    plt.clf()
Exemplo n.º 20
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = to_var(images)  # volatile=True would block the backward pass
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, args.num_epochs, i, total_step, 
                        loss.item(), np.exp(loss.item()))) 
                
            # Save the models
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
Exemplo n.º 21
0
def main(args):
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
Exemplo n.º 22
0
def main(args):

    configure(os.path.join(args['exp_dir'], 'log_dir'))

    transform = transforms.Compose([
        transforms.RandomCrop(args['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    data_loader = get_loader({
        'data_dir': args['data_dir'],
        'exp_dir': args['exp_dir'],
        'raw_data_dir': args['raw_data_dir'],
        'batch_size': args['batch_size'],
        'transform': transform,
        'num_workers': args['num_workers'],
        'shuffle': args['shuffle'],
        'mode': 'train'
    })

    #    valid_data_loader=get_loader({'data_dir' : args['data_dir'],
    #                             'raw_data_dir' : args['raw_data_dir'],
    #                             'batch_size' : int(args['batch_size']/4),
    #                             'transform' : transform,
    #                             'num_workers' : args['num_workers'],
    #                             'shuffle' : args['shuffle'],
    #                             'mode':'validate'})

    args['vocab_size'] = len(Vocabulary.load_vocab(args['exp_dir']))

    encoder = EncoderCNN(args).train()
    decoder = DecoderRNN(args).train()

    if args['pretrained']:
        checkpoint_path = Checkpoint.get_latest_checkpoint(args['exp_dir'])
        checkpoint = Checkpoint.load(checkpoint_path)
        encoder.load_state_dict(checkpoint.encoder)
        decoder.load_state_dict(checkpoint.decoder)
        step = checkpoint.step
        epoch = checkpoint.epoch
        omit = True

    else:
        step = 0
        epoch = 0
        omit = False

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    #    params=list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args['lr'])
    scheduler = StepLR(optimizer, step_size=40, gamma=0.1)
    #    optimizer=YFOptimizer(params)

    total_step = len(data_loader)
    min_valid_loss = float('inf')

    for epoch in range(epoch, args['num_epochs']):
        scheduler.step()
        for idx, (images, captions, leng) in enumerate(data_loader):

            if omit:
                if idx < (step - total_step * epoch):
                    logger.info(
                        'idx: {}, step: {}, epoch: {}, total_step: {}, resume at: {}'.
                        format(idx, step, epoch, total_step,
                               step - total_step * epoch))
                    continue
                else:
                    omit = False

            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, leng, batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, leng)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), 5)
            optimizer.step()

            log_value('loss', loss.item(), step)
            step += 1

            if step % args['log_step'] == 0:
                logger.info(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args['num_epochs'], idx, total_step,
                            loss.item(), np.exp(loss.item())))

            if step % args['valid_step'] == 0:
                #                valid_loss=validate(encoder.eval(),decoder,criterion,valid_data_loader)
                #                if valid_loss<min_valid_loss:
                #                    min_valid_loss=valid_loss
                Checkpoint(encoder, decoder, optimizer, epoch,
                           step).save(args['exp_dir'])
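
The Checkpoint class used above is project-specific and its API is not shown in this listing. As a clearly hypothetical stand-in, the same resume pattern can be built on a plain torch.save dictionary that records the global step the skip logic relies on:

import torch
import torch.nn as nn

# Hypothetical minimal checkpoint (not the project's Checkpoint API).
model = nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters())
torch.save({'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': 3,
            'step': 1250}, 'checkpoint.pt')

ckpt = torch.load('checkpoint.pt')
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optimizer'])
epoch, step = ckpt['epoch'], ckpt['step']
# On resume, batches with idx < step - total_step * epoch are skipped, as above.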
Exemplo n.º 23
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    val_loader = get_loader(args.val_image_dir,
                            args.val_caption_path,
                            vocab,
                            transform,
                            args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    train_loss_arr = []
    val_loss_arr = []
    train_bleu_arr = []
    val_bleu_arr = []
    for epoch in range(1, args.num_epochs + 1, 1):
        iteration_loss = []
        iteration_bleu = []
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            iteration_loss.append(loss.item())
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            #get BLEU score for corresponding batch
            sampled_ids = decoder.sample(features)
            sampled_ids = sampled_ids.cpu().numpy()
            bleu_score_batch = get_bleu(captions, sampled_ids, vocab)
            iteration_bleu.append(bleu_score_batch)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Bleu: '.format(
                        epoch, args.num_epochs, i, total_step, loss.item()) +
                    str(bleu_score_batch))
                f_log = open(os.path.join(args.model_path, "log.txt"), "a+")
                f_log.write("Epoch: " + str(epoch) + "/" +
                            str(args.num_epochs) + " Step: " + str(i) + "/" +
                            str(total_step) + " loss: " + str(loss.item()) +
                            " Bleu: " + str(bleu_score_batch) + "\n")
                f_log.close()

            # Save the model checkpoints (epoch is already 1-based in this loop)
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch, i + 1)))

        train_loss_arr.append(np.array(iteration_loss))
        train_bleu_arr.append(np.array(iteration_bleu))

        val_loss = 0
        val_steps = 0
        val_iteration_loss = []
        val_iteration_bleu = []
        for j, (images_val, captions_val,
                lengths_val) in enumerate(val_loader):

            # Set mini-batch dataset
            images_val = images_val.to(device)
            captions_val = captions_val.to(device)
            targets = pack_padded_sequence(captions_val,
                                           lengths_val,
                                           batch_first=True)[0]

            # Forward pass only (no parameter updates during validation)
            features = encoder(images_val)
            outputs = decoder(features, captions_val, lengths_val)
            loss = criterion(outputs, targets).item()
            val_loss += loss
            val_iteration_loss.append(loss)
            val_steps += 1

            #get BLEU score for corresponding batch
            sampled_ids = decoder.sample(features)
            sampled_ids = sampled_ids.cpu().numpy()
            bleu_score_batch = get_bleu(captions_val, sampled_ids, vocab)
            val_iteration_bleu.append(bleu_score_batch)

        val_loss /= val_steps
        print('Epoch [{}/{}], Val Loss: {:.4f}, Bleu: '.format(
            epoch, args.num_epochs, val_loss) + str(bleu_score_batch))
        f_log = open(os.path.join(args.model_path, "log.txt"), "a+")
        f_log.write("Epoch: " + str(epoch) + "/" + str(args.num_epochs) +
                    " val loss: " + str(val_loss) + " Bleu: " +
                    str(bleu_score_batch) + "\n\n")
        f_log.close()
        val_loss_arr.append(np.array(val_iteration_loss))
        val_bleu_arr.append(np.array(val_iteration_bleu))

    np.save(os.path.join(args.model_path, "train_loss.npy"),
            np.array(train_loss_arr))
    np.save(os.path.join(args.model_path, "val_loss.npy"),
            np.array(val_loss_arr))
    np.save(os.path.join(args.model_path, "train_bleu.npy"),
            np.array(train_bleu_arr))
    np.save(os.path.join(args.model_path, "val_bleu.npy"),
            np.array(val_bleu_arr))
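
get_bleu above is a project helper whose implementation is not shown. A common stand-in, assuming NLTK is available, scores a tokenized candidate caption against reference token lists:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = [['a', 'dog', 'runs', 'on', 'the', 'beach']]          # list of reference token lists
candidate = ['a', 'dog', 'is', 'running', 'on', 'the', 'beach']   # sampled caption tokens
score = sentence_bleu(reference, candidate,
                      smoothing_function=SmoothingFunction().method1)
print(round(score, 3))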
Exemplo n.º 24
0
def main(args):

    #setup tensorboard
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        #if args.name in cc.get_experiment_names():
        try:
            cc.remove_experiment(args.name)
        except Exception:
            print("experiment didn't exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write((json.dumps(vars(args))))

    # Image preprocessing

    transform = transforms.Compose([
        transforms.Resize(args.crop_size),  # Scale is the deprecated alias of Resize
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose(
        [transforms.ToPILImage(),
         transforms.Resize(20),
         transforms.ToTensor()])

    # Load vocabulary wrapper.
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open((full_model_path + "/vocab.pkl"), 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir,
                                      vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    #params = list(decoder.parameters()) #+ list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()
            # Set mini-batch dataset
            image_ts = to_var(images)  # volatile=True would block the backward pass
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            count = images.size()[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total = targets.size(0)
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity", np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.item(),
                       accuracy, np.exp(loss.item())))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
            if 1 == 2 and i % int(train_size / 10) == 0:  # disabled evaluation block (1 == 2 is never true)
                encoder.eval()
                #decoder.eval()
                correct = 0
                for ti, (timages, tcaptions,
                         tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions,
                                                    tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    #correct = (ttargets.eq(toutputs[0].long())).sum()

                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % (accuracy))
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)

    torch.save(
        decoder.state_dict(),
        os.path.join(full_model_path,
                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    torch.save(
        encoder.state_dict(),
        os.path.join(full_model_path,
                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    end_time = time.time()
    print("finished training, runtime: %d", [(end_time - start_time)])
Exemplo n.º 25
0
def main(args):
    #create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    #image preprocessing and normalization (define transforms)
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    #load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    #build data-loader
    data_loader = get_loader(args.image_dir,
                             args.caption,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    #build the model
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size,
                         args.num_layers).to(device)

    #define loss and optimizer function
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.batch_norm.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    #train the model
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            #forward and backprop
            features = encoder(images)
            outputs = decoder(features, captions, lengths)

            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()

            loss.backward()
            optimizer.step()

            #print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

            #save the model
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
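
Every example here builds its training targets the same way; a self-contained sketch of what the pack_padded_sequence(...)[0] idiom actually returns:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Two padded captions with true lengths 3 and 2 (lengths sorted descending).
captions = torch.tensor([[1, 2, 3, 0],
                         [4, 5, 0, 0]])
targets = pack_padded_sequence(captions, [3, 2], batch_first=True)[0]
print(targets)  # tensor([1, 4, 2, 5, 3]) -- time-major, padding dropped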
Exemplo n.º 26
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_caption_loader(args.caption_path,
                                     vocab,
                                     75,  # hard-coded caption length
                                     args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers)

    # Build the models
    encoder = EncoderRNN(len(vocab), args.embed_size,
                         args.hidden_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.embedding.parameters()) + list(encoder.rnn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (captions_src, captions_tgt, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            captions_src = captions_src.to(device)
            captions_tgt = captions_tgt.to(device)
            targets = pack_padded_sequence(captions_tgt,
                                           lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            enc_output, enc_hidden = encoder(captions_src)
            outputs = decoder(enc_hidden[:, -1:, :], captions_tgt, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
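
The enc_hidden[:, -1:, :] slice above implies this EncoderRNN returns its hidden state batch-first; a standalone sketch of the shapes involved (the layer count and sizes are illustrative, not taken from the example):

import torch
import torch.nn as nn

rnn = nn.GRU(input_size=8, hidden_size=16, num_layers=2, batch_first=True)
x = torch.randn(4, 10, 8)                  # (batch, seq_len, input_size)
out, hidden = rnn(x)                       # hidden: (num_layers, batch, hidden_size)
last = hidden.transpose(0, 1)[:, -1:, :]   # (batch, 1, hidden_size): top layer only
print(last.shape)                          # torch.Size([4, 1, 16])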
Exemplo n.º 27
0
def main():
    # Configuration for hyper-parameters
    config = Config()
    
    # Create model directory
    if not os.path.exists(config.model_path):
        os.makedirs(config.model_path)
    
    # Image preprocessing
    transform = config.train_transform
    
    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path, json_path, vocab, 
                                   transform, config.batch_size,
                                   shuffle=True, num_workers=config.num_threads) 
    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, 
                         len(vocab), config.num_layers)
    
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)
    
    # Train the Models
    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths) in enumerate(train_loader):
            
            # Set mini-batch dataset
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % config.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, config.num_epochs, i, total_step,
                         loss.item(), np.exp(loss.item())))
                
            # Save the Model
            if (i+1) % config.save_step == 0:
                torch.save(decoder.state_dict(), 
                           os.path.join(config.model_path, 
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(), 
                           os.path.join(config.model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
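
Note that only encoder.resnet.fc's parameters are handed to the optimizer above, so the pretrained backbone stays fixed. A hedged sketch of the same idea in isolation (the resnet18 choice and head size are illustrative):

import torch.nn as nn
from torchvision import models

resnet = models.resnet18(pretrained=True)
for p in resnet.parameters():        # freeze the pretrained backbone
    p.requires_grad = False
resnet.fc = nn.Linear(resnet.fc.in_features, 256)  # new, trainable head
trainable = [p for p in resnet.parameters() if p.requires_grad]
# only resnet.fc's weight and bias remain trainable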
Exemplo n.º 28
0
        # Randomly sample a caption length, and sample indices with that length.
        indices = data_loader.dataset.get_train_indices()
        # Create and assign a batch sampler to retrieve a batch with the sampled indices.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        data_loader.batch_sampler.sampler = new_sampler

        # Obtain the batch.
        images, captions = next(iter(data_loader))

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)

        # Zero the gradients.
        decoder.zero_grad()
        encoder.zero_grad()

        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features, captions)

        # Calculate the batch loss.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

        # Backward pass.
        loss.backward()

        # Update the parameters in the optimizer.
        optimizer.step()
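
This fragment swaps a fresh sampler into the loader at each step so that every caption in a batch shares one sampled length and needs no padding. A tiny sketch of what SubsetRandomSampler does on its own:

from torch.utils import data

# Draws only from the given indices, in a fresh random order per iteration.
sampler = data.sampler.SubsetRandomSampler(indices=[3, 7, 42])
print(list(sampler))  # e.g. [42, 3, 7]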
Exemplo n.º 29
0
class Worker:
    def __init__(self, args):
        # Initialize MPI/NCCL and set topology variables
        self.init_dist(args.gpu_only)
        self.rank = self.dist.get_rank()
        self.world_size = self.dist.get_world_size()
        self.local_rank = self.dist.get_local_rank()
        self.local_size = self.dist.get_local_size()
        self.n_gpus = self.dist.get_n_gpus()
        self.n_nodes = self.world_size // self.local_size
        self.node = self.rank // self.local_size
        self.n_cpu_workers = (self.local_size - self.n_gpus) * self.n_nodes
        self.n_gpu_workers = self.n_gpus * self.n_nodes

        # Set the RNG seed for reproducibility (safe to leave enabled)
        torch.manual_seed(1234)

        # CuDNN reproducibility
        if args.reproducible:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        # Set number of threads
        if self.dist.is_cpu_rank():
            #torch.set_num_threads(args.num_threads)
            print("[Rank {}] Setting number of OMP threads to {}".format(
                self.rank, args.num_threads),
                  flush=True)

        # Calculate batch sizes
        self.total_batch_size = args.batch_size
        self.cpu_batch_size = args.cpu_batch_size
        assert ((self.total_batch_size - self.cpu_batch_size * self.n_cpu_workers * self.n_nodes) \
            % (self.n_gpus * self.n_nodes) == 0), "GPU batch size is not an integer"
        self.gpu_batch_size = int((self.total_batch_size - self.cpu_batch_size * self.n_cpu_workers * self.n_nodes) \
            / (self.n_gpus * self.n_nodes))
        self.batch_size = (self.cpu_batch_size if self.dist.is_cpu_rank()
                           else self.gpu_batch_size)

        print("[Rank {}] Current CUDA device: {}".format(
            self.rank, torch.cuda.current_device()),
              flush=True)

    def init_dist(self, gpu_only):
        # C++ extension module with JIT compilation
        dist_module = load(
            name="dist",
            sources=["dist.cu"],
            verbose=True,
            with_cuda=True,
            extra_cuda_cflags=[
                '-ccbin',
                'g++',
                '-std=c++11',
                '-O3',
                #'-I/usr/mpi/gcc/openmpi-2.1.2-hfi/include',
                #'-I/usr/mpi/gcc/mvapich2-2.3b-hfi/include',
                '-I/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/include',
                #'-I/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/include64',
                '-I/pylon5/ac7k4vp/jchoi157/pytorch/build/nccl/include'
            ],
            extra_ldflags=[
                '-L/opt/packages/cuda/9.2/lib64',
                '-lcudart',
                '-lrt',
                #'-L/usr/mpi/gcc/openmpi-2.1.2-hfi/lib64', '-lmpi',
                #'-L/usr/mpi/gcc/mvapich2-2.3b-hfi/lib', '-lmpi',
                '-L/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/lib',
                '-lmpi',
                #'-L/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/lib64', '-lmpi',
                '-L/pylon5/ac7k4vp/jchoi157/pytorch/build/nccl/lib',
                '-lnccl'
            ],
            build_directory="/home/jchoi157/torch_extensions")
        self.dist = dist_module.DistManager(gpu_only, False)

    def average_gradients(self):
        # Only all-reduce decoder parameters since encoder is pre-trained
        for param in self.decoder.parameters():
            if self.dist.is_cpu_rank():
                param.grad.data = param.grad.data.cuda(0, non_blocking=True)
                param.grad.data *= (self.cpu_batch_size /
                                    self.total_batch_size)
            else:
                param.grad.data *= (self.gpu_batch_size /
                                    self.total_batch_size)

            self.dist.hetero_allreduce(param.grad.data)

            if self.dist.is_cpu_rank():
                param.grad.data = param.grad.data.cpu()

    def train(self, args):
        # Create model directory
        if not os.path.exists(args.model_path):
            os.makedirs(args.model_path)

        # Image preprocessing, normalization for the pretrained resnet
        transform = transforms.Compose([
            transforms.RandomCrop(args.crop_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Load vocabulary wrapper
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        # Build data loader
        data_loader = get_loader(
            args.image_dir,
            args.caption_path,
            vocab,
            transform,
            self.rank,
            self.world_size,
            self.local_size,
            self.n_gpus,
            self.total_batch_size,
            self.cpu_batch_size,
            self.gpu_batch_size,
            self.batch_size,
            shuffle=(False if args.reproducible else True),
            no_partition=args.no_partition)
        self.num_batches = len(data_loader)
        print("[Rank {}] batch size {}, num batches {}".format(
            self.rank,
            self.total_batch_size if args.no_partition else self.batch_size,
            self.num_batches),
              flush=True)

        # Build the models
        self.encoder = EncoderCNN(args.embed_size)
        self.decoder = DecoderRNN(args.embed_size, args.hidden_size,
                                  len(vocab), args.num_layers)
        if self.dist.is_gpu_rank():
            self.encoder = self.encoder.cuda(self.local_rank)
            self.decoder = self.decoder.cuda(self.local_rank)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()
        params = list(self.decoder.parameters()) + list(
            self.encoder.linear.parameters()) + list(
                self.encoder.bn.parameters())
        optimizer = torch.optim.Adam(params, lr=args.learning_rate)

        # Train the models
        total_step = len(data_loader)
        for epoch in range(args.num_epochs):
            epoch_start_time = time.time()
            batch_time_sum = 0
            batch_time_total = 0
            processed_batches = 0
            processed_batches_total = 0
            batch_start_time = time.time()
            for i, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                if self.dist.is_gpu_rank():
                    images = images.cuda(self.local_rank)
                    captions = captions.cuda(self.local_rank)
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]

                # Forward, backward, all-reduce and optimize
                features = self.encoder(images)
                outputs = self.decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                self.decoder.zero_grad()
                self.encoder.zero_grad()
                loss.backward()
                if not args.no_partition:
                    self.average_gradients()
                optimizer.step()

                batch_time = time.time() - batch_start_time
                batch_time_sum += batch_time
                batch_time_total += batch_time
                processed_batches += 1
                processed_batches_total += 1

                saved_loss = loss.item()
                # Print log info
                if i % args.log_step == 0 and i != 0:
                    print(
                        'Rank [{}], Epoch [{}/{}], Step [{}/{}], Average time: {:.6f}, Loss: {:.4f}, Perplexity: {:5.4f}'
                        .format(self.rank, epoch, args.num_epochs, i,
                                total_step, batch_time_sum / processed_batches,
                                saved_loss, np.exp(saved_loss)),
                        flush=True)
                    batch_time_sum = 0
                    processed_batches = 0

                # Save the model checkpoints
                if (i + 1) % args.save_step == 0:
                    torch.save(
                        self.decoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                    torch.save(
                        self.encoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

                batch_start_time = time.time()

            epoch_time = time.time() - epoch_start_time
            print(
                '!!! Rank [{}], Epoch [{}], Time: {:.6f}, Average batch time: {:.6f}, Loss: {:.4f}, Perplexity: {:5.4f}'
                .format(self.rank, epoch, epoch_time,
                        batch_time_total / processed_batches_total, saved_loss,
                        np.exp(saved_loss)),
                flush=True)
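
average_gradients above scales each worker's decoder gradients by its share of the global batch before the all-reduce, so the summed result is a batch-size-weighted average. A minimal sketch of the same arithmetic using stock torch.distributed (process-group setup omitted; the function name is illustrative):

import torch.distributed as dist

def weighted_allreduce(params, local_batch, total_batch):
    # Scale so that SUM across all workers yields the weighted-average gradient.
    for p in params:
        if p.grad is not None:
            p.grad.data.mul_(local_batch / total_batch)
            dist.all_reduce(p.grad.data, op=dist.ReduceOp.SUM)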
Exemplo n.º 30
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        # transforms.Resize(190, 190),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    print(type(data_loader))
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch + 1, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if epoch % args.save_step == 0:
                print('Save models\n')
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
Exemplo n.º 31
0
def main(args):
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Build the models: a feedforward/convolutional encoder and an RNN decoder
    encoder = EncoderCNN(args.embed_size).to(device)  # can be sequential or convolutional
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)  # vocab is assumed to be in scope
    # Loss and optimizer
    criterion1 = nn.CrossEntropyLoss()
    criterion2 = nn.NLLLoss()
    softmax = nn.LogSoftmax(dim=1)
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    total_training_steps = args.num_iters
    losses = []
    perplexity = []
    for epoch in range(args.num_epochs):
        for i in range(total_training_steps):
            prog_data = generate_training_data(args.batch_size)

            images = [im[0] for im in prog_data]
            transforms = [t[1] for t in prog_data]  # note: shadows any imported `transforms` module here

            for ele in transforms:  # start token for each sequence
                ele.insert(0, '<start>')
            for ele in transforms:  # end token for each sequence
                ele.append('<end>')

            lengths = [len(trans) for trans in transforms]

            maximum_len = max(lengths)
            for trans in transforms:
                if len(trans) != maximum_len:
                    trans.extend(['pad'] * (maximum_len - len(trans)))

            padded_lengths = [len(trans) for trans in transforms]
            transforms = [[word_to_int(word) for word in transform]
                          for transform in transforms]
            transforms = torch.tensor(transforms, device=device)
            images = torch.tensor(images, device=device)
            images = images.unsqueeze(1)  # add a channel dimension (needed with EncoderCNN)
            lengths = torch.tensor(lengths)                # keep lengths on CPU
            padded_lengths = torch.tensor(padded_lengths)  # pack_padded_sequence expects CPU lengths
            targets = pack_padded_sequence(transforms,
                                           padded_lengths,
                                           batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, transforms, padded_lengths)
            #print(outputs)

            loss = criterion1(outputs, targets)
            losses.append(loss.item())
            perplexity.append(np.exp(loss.item()))

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_training_steps,
                            loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))

    y = losses
    z = perplexity
    x = np.arange(len(losses))
    plt.plot(x, y, label='Cross Entropy Loss')
    plt.plot(x, z, label='Perplexity')
    plt.xlabel('Iterations')
    plt.ylabel('Cross Entropy Loss and Perplexity')
    plt.title("Cross Entropy Loss and Model Perplexity During Training")
    plt.legend()
    plt.savefig('plots/plots_cnn/cnn4_gpu', dpi=100)
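
The manual 'pad'-token bookkeeping above can also be expressed with pad_sequence; a sketch with illustrative token IDs:

import torch
from torch.nn.utils.rnn import pad_sequence

seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
padded = pad_sequence(seqs, batch_first=True, padding_value=0)
print(padded)  # tensor([[1, 2, 3], [4, 5, 0]])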
Exemplo n.º 32
0
 
# Randomly sample a caption length, and sample indices with that length.
indices = data_loader.dataset.get_train_indices()
# Create and assign a batch sampler to retrieve a batch with the sampled indices.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler

# Obtain the batch.
images, captions = next(iter(data_loader))
# print(images.shape)
# Move batch of images and captions to GPU if CUDA is available.
images = images.to(device)
captions = captions.to(device)

# Zero the gradients.
decoder.zero_grad()
encoder.zero_grad()

# Pass the inputs through the CNN-RNN model.
features = encoder(images)
outputs = decoder(features, captions)

# Calculate the batch loss.
loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

# Backward pass.
loss.backward()

# Update the parameters in the optimizer.
optimizer.step()
Exemplo n.º 33
0
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) 
                
            # Save the model checkpoints
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))