Example #1
    def train(self, dataset, dataset_eval=None):

        # Obtain needed information
        data_size = dataset.data_size
        token_size = dataset.token_size
        ans_size = dataset.ans_size
        pretrained_emb = dataset.pretrained_emb

        # Define the MCAN model
        net = Net(self.__C, pretrained_emb, token_size, ans_size)
        net.cuda()
        net.train()

        # Define the Question-only model
        qnet = QNet(self.__C, pretrained_emb, token_size, ans_size)
        qnet.cuda()
        qnet.train()

        # Watch net & qnet
        wandb.watch(net)
        wandb.watch(qnet)

        # Define the multi-gpu training if needed
        if self.__C.N_GPU > 1:
            net = nn.DataParallel(net, device_ids=self.__C.DEVICES)

        # Define the binary cross entropy loss
        # loss_fn = torch.nn.BCELoss(size_average=False).cuda()
        loss_qm = torch.nn.BCELoss(reduction='sum').cuda()
        loss_qo = torch.nn.BCELoss(reduction='sum').cuda()

        # Load checkpoint if resume training
        if self.__C.RESUME:  # default -> FALSE
            print(' ========== Resume training')

            if self.__C.CKPT_PATH is not None:
                print('Warning: you are now using CKPT_PATH args, '
                      'CKPT_VERSION and CKPT_EPOCH will not work')

                path = self.__C.CKPT_PATH
            else:
                path = self.__C.CKPTS_PATH + \
                       'ckpt_' + self.__C.CKPT_VERSION + \
                       '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

            # Load the network parameters
            print('Loading ckpt {}'.format(path))
            ckpt = torch.load(path)
            print('Finish!')
            net.load_state_dict(ckpt['state_dict'])

            # Load the optimizer parameters
            #params = list(net.parameters()) + list(qnet.parameters())
            optim = get_optim(self.__C, net, data_size, ckpt['lr_base'])
            optim._step = int(data_size / self.__C.BATCH_SIZE *
                              self.__C.CKPT_EPOCH)
            optim.optimizer.load_state_dict(ckpt['optimizer'])

            # The checkpoint stores no state for qnet, so its optimizer is simply
            # re-initialized here; without this, optim_q would be undefined below when resuming
            optim_q = get_optim(self.__C, qnet, data_size, ckpt['lr_base'])

            start_epoch = self.__C.CKPT_EPOCH

        else:
            if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
                shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

            os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

            #params = net.parameters() + qnet.parameters()
            optim = get_optim(self.__C, net, data_size)
            optim_q = get_optim(self.__C, qnet, data_size)
            start_epoch = 0

        loss_sum = 0
        L_qo_sum = 0
        L_qm_sum = 0
        named_params = list(net.named_parameters()) + list(
            qnet.named_parameters())
        grad_norm = np.zeros(len(named_params))

        # Define multi-thread dataloader
        if self.__C.SHUFFLE_MODE in ['external']:
            dataloader = Data.DataLoader(dataset,
                                         batch_size=self.__C.BATCH_SIZE,
                                         shuffle=False,
                                         num_workers=self.__C.NUM_WORKERS,
                                         pin_memory=self.__C.PIN_MEM,
                                         drop_last=True)
        else:
            dataloader = Data.DataLoader(dataset,
                                         batch_size=self.__C.BATCH_SIZE,
                                         shuffle=True,
                                         num_workers=self.__C.NUM_WORKERS,
                                         pin_memory=self.__C.PIN_MEM,
                                         drop_last=True)

        # Training script
        for epoch in range(start_epoch, self.__C.MAX_EPOCH):

            # Save log information
            logfile = open(
                self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
                'a+')
            logfile.write(
                'nowTime: ' +
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n')
            logfile.close()

            # Learning Rate Decay
            if epoch in self.__C.LR_DECAY_LIST:
                adjust_lr(optim, self.__C.LR_DECAY_R)
                adjust_lr(optim_q, self.__C.LR_DECAY_R)

            # Externally shuffle
            if self.__C.SHUFFLE_MODE == 'external':
                shuffle_list(dataset.ans_list)

            time_start = time.time()
            # Iteration
            for step, (img_feat_iter, ques_ix_iter,
                       ans_iter) in enumerate(dataloader):

                optim.zero_grad()
                optim_q.zero_grad()

                img_feat_iter = img_feat_iter.cuda()
                ques_ix_iter = ques_ix_iter.cuda()
                ans_iter = ans_iter.cuda()

                for accu_step in range(self.__C.GRAD_ACCU_STEPS):

                    sub_img_feat_iter = \
                        img_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                      (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ques_ix_iter = \
                        ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                     (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ans_iter = \
                        ans_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * self.__C.SUB_BATCH_SIZE]

                    out, q_emb, lang_feat_mask = net(sub_img_feat_iter,
                                                     sub_ques_ix_iter)
                    pred_qo, q_out = qnet(q_emb, lang_feat_mask)
                    #print(pred_qo.shape, sub_ans_iter.shape)
                    #print(torch.argmax(sub_ans_iter.long(), dim=1))
                    ans_idx = torch.argmax(sub_ans_iter.long(), dim=1)
                    pred_idx = torch.argmax(
                        pred_qo.long(),
                        dim=1)  # predicted answer index from QO
                    qo_scale = pred_qo.detach().clone()
                    for i in range(self.__C.SUB_BATCH_SIZE):
                        if (ans_idx[i] == pred_idx[i]):
                            qo_scale[i, :] = torch.ones(3129)

                    L_qo = loss_qo(q_out, sub_ans_iter)
                    L_qm = loss_qm(
                        torch.sigmoid(out * torch.sigmoid(qo_scale)),
                        sub_ans_iter)

                    #L_qo = loss_qo(q_out, sub_ans_iter)
                    #L_qm = loss_qm(torch.sigmoid(out*torch.sigmoid(pred_qo)), sub_ans_iter)

                    loss = L_qo + L_qm

                    # only a mean-reduced loss needs to be divided by grad_accu_steps;
                    # removing this line wouldn't change our results because of how the
                    # Adam optimizer rescales updates, but it would be necessary with SGD.
                    # loss /= self.__C.GRAD_ACCU_STEPS
                    loss.backward()
                    loss_sum += loss.cpu().data.numpy(
                    ) * self.__C.GRAD_ACCU_STEPS
                    L_qo_sum += L_qo.cpu().data.numpy(
                    ) * self.__C.GRAD_ACCU_STEPS
                    L_qm_sum += L_qm.cpu().data.numpy(
                    ) * self.__C.GRAD_ACCU_STEPS

                    wandb.log({
                        "Training loss":
                        loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                        "Question only loss":
                        L_qo.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                        "Fusion loss":
                        L_qm.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE
                    })  # Tracking training loss

                    if self.__C.VERBOSE:  # print loss every step -> TRUE
                        if dataset_eval is not None:
                            mode_str = self.__C.SPLIT[
                                'train'] + '->' + self.__C.SPLIT['val']
                        else:
                            mode_str = self.__C.SPLIT[
                                'train'] + '->' + self.__C.SPLIT['test']

                        print(
                            "\r[version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e"
                            % (self.__C.VERSION, epoch + 1, step,
                               int(data_size / self.__C.BATCH_SIZE), mode_str,
                               loss.cpu().data.numpy() /
                               self.__C.SUB_BATCH_SIZE, optim._rate),
                            end='          ')

                # Gradient norm clipping
                if self.__C.GRAD_NORM_CLIP > 0:
                    nn.utils.clip_grad_norm_(net.parameters(),
                                             self.__C.GRAD_NORM_CLIP)

                # Save the gradient information
                for name in range(len(named_params)):
                    norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                        if named_params[name][1].grad is not None else 0
                    grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS
                    # print('Param %-3s Name %-80s Grad_Norm %-20s'%
                    #       (str(grad_wt),
                    #        params[grad_wt][0],
                    #        str(norm_v)))

                optim.step()
                optim_q.step()

            time_end = time.time()
            print('Finished in {}s'.format(int(time_end - time_start)))

            # print('')
            epoch_finish = epoch + 1

            # Save checkpoint
            state = {
                'state_dict': net.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'lr_base': optim.lr_base
            }
            torch.save(
                state, self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION +
                '/epoch' + str(epoch_finish) + '.pkl')

            # Logging
            logfile = open(
                self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
                'a+')
            logfile.write('epoch = ' + str(epoch_finish) + '  Q loss = ' +
                          str(L_qo_sum / data_size) + '  fusion loss = ' +
                          str(L_qm_sum / data_size) + '  loss = ' +
                          str(loss_sum / data_size) + '\n' + 'lr = ' +
                          str(optim._rate) + '\n\n')
            logfile.close()

            # Eval after every epoch
            if dataset_eval is not None:
                self.eval(dataset_eval,
                          state_dict=net.state_dict(),
                          valid=True)

            # if self.__C.VERBOSE:
            #     logfile = open(
            #         self.__C.LOG_PATH +
            #         'log_run_' + self.__C.VERSION + '.txt',
            #         'a+'
            #     )
            #     for name in range(len(named_params)):
            #         logfile.write(
            #             'Param %-3s Name %-80s Grad_Norm %-25s\n' % (
            #                 str(name),
            #                 named_params[name][0],
            #                 str(grad_norm[name] / data_size * self.__C.BATCH_SIZE)
            #             )
            #         )
            #     logfile.write('\n')
            #     logfile.close()

            loss_sum = 0
            L_qo_sum = 0
            L_qm_sum = 0
            grad_norm = np.zeros(len(named_params))
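
Note: the per-sample loop that builds qo_scale above can be written without a Python loop or the hard-coded answer count of 3129. A minimal vectorized sketch, assuming pred_qo and sub_ans_iter are both (batch, num_answers) tensors; unlike the original it drops the .long() cast before argmax, which truncates the scores to integers and is likely unintended:

import torch

def question_only_scale(pred_qo: torch.Tensor, sub_ans_iter: torch.Tensor) -> torch.Tensor:
    # pred_qo:      (batch, num_answers) question-only branch scores
    # sub_ans_iter: (batch, num_answers) soft answer targets
    ans_idx = sub_ans_iter.argmax(dim=1)   # ground-truth answer index per sample
    pred_idx = pred_qo.argmax(dim=1)       # question-only prediction per sample
    qo_scale = pred_qo.detach().clone()
    hit = ans_idx.eq(pred_idx)             # samples the question-only branch already gets right
    qo_scale[hit] = 1.0                    # no down-weighting for those rows
    return qo_scale

Assigning the scalar 1.0 keeps the answer vocabulary size implicit instead of hard-coding 3129.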
Example #2
    def train(self, dataset, dataset_eval=None):

        # 1.3 Before training starts, obtain the needed information: dataset size,
        # question token vocabulary size, answer size, and the pretrained word embeddings
        data_size = dataset.data_size
        token_size = dataset.token_size  # 18405
        ans_size = dataset.ans_size
        pretrained_emb = dataset.pretrained_emb

        # 1.4 With this information, define the MCAN model; its output is the fused multimodal feature proj_feat
        net = Net(self.__C, pretrained_emb, token_size, ans_size)
        net.cuda()
        net.train()  # 1.5 Put the network in training mode

        # Define multi-GPU training if needed
        if self.__C.N_GPU > 1:
            net = nn.DataParallel(net, device_ids=self.__C.DEVICES)

        loss_fn = torch.nn.BCELoss(reduction='sum').cuda()

        # Load a checkpoint if resuming training
        if self.__C.RESUME:
            print(' ========== Resume training ==========')

            if self.__C.CKPT_PATH is not None:
                print('Warning: you are now using CKPT_PATH args, '
                      'CKPT_VERSION and CKPT_EPOCH will not work')

                path = self.__C.CKPT_PATH  # CKPT_PATH must point to the checkpoint file here; it cannot be None
            else:
                path = self.__C.CKPTS_PATH + \
                       'ckpt_' + self.__C.CKPT_VERSION + \
                       '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

            # Load the network parameters
            print('Loading ckpt {}'.format(path))
            ckpt = torch.load(path)
            print('Finished loading parameters!')
            # state_dict maps each parameter name to its learned tensor
            net.load_state_dict(ckpt['state_dict'])

            # Load the optimizer parameters
            optim = get_optim(self.__C, net, data_size, ckpt['lr_base'])
            optim._step = int(data_size / self.__C.BATCH_SIZE *
                              self.__C.CKPT_EPOCH)
            optim.optimizer.load_state_dict(ckpt['optimizer'])
            # epoch
            start_epoch = self.__C.CKPT_EPOCH

        # Otherwise, start training from scratch
        else:
            if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
                shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

            os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

            optim = get_optim(self.__C, net, data_size)
            start_epoch = 0

        loss_sum = 0
        named_params = list(net.named_parameters())  # (name, parameter) pairs
        grad_norm = np.zeros(len(named_params))  # accumulated gradient norm per parameter

        # Define the multi-worker dataloader
        if self.__C.SHUFFLE_MODE in ['external']:
            dataloader = Data.DataLoader(
                dataset,
                batch_size=self.__C.BATCH_SIZE,
                shuffle=False,
                num_workers=self.__C.NUM_WORKERS,  # number of worker processes
                pin_memory=self.__C.PIN_MEM,
                drop_last=True)
        else:
            dataloader = Data.DataLoader(dataset,
                                         batch_size=self.__C.BATCH_SIZE,
                                         shuffle=True,
                                         num_workers=self.__C.NUM_WORKERS,
                                         pin_memory=self.__C.PIN_MEM,
                                         drop_last=True)

        # Training loop (MAX_EPOCH is set to 1 here)
        for epoch in range(start_epoch, self.__C.MAX_EPOCH):

            # Save log information
            logfile = open(
                self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
                'a+')
            # Write the current timestamp to the log
            logfile.write(
                'nowTime: ' +
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n')
            logfile.close()

            # Learning rate decay
            if epoch in self.__C.LR_DECAY_LIST:
                adjust_lr(optim, self.__C.LR_DECAY_R)

            # Externally shuffle
            if self.__C.SHUFFLE_MODE == 'external':
                shuffle_list(dataset.ans_list)

            time_start = time.time()

            # Iterate over batches of image features, question token indices, and answers
            for step, (img_feat_iter, ques_ix_iter,
                       ans_iter) in enumerate(dataloader):
                optim.zero_grad()  # zero the gradients
                img_feat_iter = img_feat_iter.cuda()
                ques_ix_iter = ques_ix_iter.cuda()
                ans_iter = ans_iter.cuda()
                # GRAD_ACCU_STEPS: accumulate gradients to work around limited GPU memory;
                # it effectively enlarges the batch size. With batch_size=6, 24 samples in total,
                # and grad_accu_steps=2 there are 24/6=4 parameter updates; shrinking each forward
                # pass to 6/2=3 samples leaves the number of updates unchanged (see the sketch after this example).
                for accu_step in range(self.__C.GRAD_ACCU_STEPS):

                    sub_img_feat_iter = \
                        img_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                      (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ques_ix_iter = \
                        ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                     (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ans_iter = \
                        ans_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * self.__C.SUB_BATCH_SIZE]

                    pred = net(
                        sub_img_feat_iter,  #[5,100,2048]
                        sub_ques_ix_iter  # [5,14]
                    )

                    loss = loss_fn(pred, sub_ans_iter)
                    # only a mean-reduced loss needs to be divided by grad_accu_steps
                    loss.backward()  # backpropagate to compute the current gradients
                    loss_sum += loss.cpu().data.numpy(
                    ) * self.__C.GRAD_ACCU_STEPS
                    # print the loss at every step
                    if self.__C.VERBOSE:
                        if dataset_eval is not None:
                            mode_str = self.__C.SPLIT['train']
                        else:
                            mode_str = self.__C.SPLIT[
                                'train'] + '->' + self.__C.SPLIT['train']

                        print(
                            "\r[version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e"
                            % (self.__C.VERSION, epoch + 1, step,
                               int(data_size / self.__C.BATCH_SIZE), mode_str,
                               loss.cpu().data.numpy() /
                               self.__C.SUB_BATCH_SIZE, optim._rate),
                            end='          ')

                # Gradient norm clipping
                if self.__C.GRAD_NORM_CLIP > 0:
                    nn.utils.clip_grad_norm_(net.parameters(),
                                             self.__C.GRAD_NORM_CLIP)

                # Record the gradient norms
                for name in range(len(named_params)):
                    norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                        if named_params[name][1].grad is not None else 0
                    grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS
                optim.step()

                # with open('One_epoch_data.txt','w') as F:
                #     F.write(net.state_dict()+optim.optimizer.state_dict()+optim.lr_base)

            time_end = time.time()
            print('Finished in {}s'.format(int(time_end - time_start)))

            # print('')
            epoch_finish = epoch + 1

            # Save checkpoint
            state = {
                'state_dict': net.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'lr_base': optim.lr_base
            }

            print("===========训练模型的state=====")
            print(state)
            torch.save(
                state, self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION +
                '/epoch' + str(epoch_finish) + '.pkl')

            # Open the log file
            logfile = open(
                self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
                'a+')
            logfile.write('epoch = ' + str(epoch_finish) + '  loss = ' +
                          str(loss_sum / data_size) + '\n' + 'lr = ' +
                          str(optim._rate) + '\n\n')
            logfile.close()

            # Evaluate the model after every epoch by calling the eval routine
            if dataset_eval is not None:
                self.eval(dataset_eval,
                          state_dict=net.state_dict(),
                          valid=True)

            loss_sum = 0
            grad_norm = np.zeros(len(named_params))
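
Note: the GRAD_ACCU_STEPS comment in the loop above describes gradient accumulation as a way to emulate a larger batch on limited GPU memory. A minimal, self-contained sketch of the same pattern on a toy model (the function and variable names here are illustrative, not part of the repository):

import torch
import torch.nn as nn

def toy_accumulation_step(model, optimizer, batch_x, batch_y, grad_accu_steps):
    # One optimizer update built from grad_accu_steps smaller forward/backward passes
    loss_fn = nn.BCELoss(reduction='mean')
    sub_batch = batch_x.size(0) // grad_accu_steps
    optimizer.zero_grad()
    for accu_step in range(grad_accu_steps):
        sl = slice(accu_step * sub_batch, (accu_step + 1) * sub_batch)
        pred = torch.sigmoid(model(batch_x[sl]))
        # with mean reduction, divide by the number of accumulation steps so the
        # accumulated gradient matches a single pass over the full batch
        loss = loss_fn(pred, batch_y[sl]) / grad_accu_steps
        loss.backward()          # gradients add up across sub-batches
    optimizer.step()             # one parameter update per full batch

# Usage sketch: 24 samples, batch of 6 split into 2 sub-batches of 3 still gives
# 24 / 6 = 4 parameter updates per epoch, as the comment above works out.
model = nn.Linear(8, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
x, y = torch.randn(6, 8), torch.rand(6, 1)
toy_accumulation_step(model, opt, x, y, grad_accu_steps=2)

With sum reduction, as in most of these examples, the division by grad_accu_steps is unnecessary, which is what the "only a mean-reduced loss needs to be divided" comment refers to.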
Example #3
    def train(self, dataset, dataset_eval=None):

        # Obtain needed information
        data_size = dataset.data_size
        token_size = dataset.token_size
        ans_size = dataset.ans_size
        pretrained_emb = dataset.pretrained_emb

        # Define the MCAN model
        net = Net(
            self.__C,
            pretrained_emb,
            token_size,
            ans_size
        )
        net.cuda()
        net.train()

        # Define the multi-gpu training if needed
        if self.__C.N_GPU > 1:
            net = nn.DataParallel(net, device_ids=self.__C.DEVICES)

        # Define the binary cross entropy loss
        # loss_fn = torch.nn.BCELoss(size_average=False).cuda()
        loss_fn = torch.nn.BCELoss(reduction='sum').cuda()

        # Load checkpoint if resume training
        if self.__C.RESUME:
            print(' ========== Resume training')

            if self.__C.CKPT_PATH is not None:
                print('Warning: you are now using CKPT_PATH args, '
                      'CKPT_VERSION and CKPT_EPOCH will not work')

                path = self.__C.CKPT_PATH
            else:
                path = self.__C.CKPTS_PATH + \
                       'ckpt_' + self.__C.CKPT_VERSION + \
                       '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

            # Load the network parameters
            print('Loading ckpt {}'.format(path))
            ckpt = torch.load(path)
            print('Finish!')
            net.load_state_dict(ckpt['state_dict'])

            # Load the optimizer parameters
            optim = get_optim(self.__C, net, data_size, ckpt['lr_base'])
            optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
            optim.optimizer.load_state_dict(ckpt['optimizer'])

            start_epoch = self.__C.CKPT_EPOCH

        else:
            if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
                shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

            os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

            optim = get_optim(self.__C, net, data_size)
            start_epoch = 0

        loss_sum = 0
        named_params = list(net.named_parameters())
        grad_norm = np.zeros(len(named_params))

        # Define multi-thread dataloader
        if self.__C.SHUFFLE_MODE in ['external']:
            dataloader = Data.DataLoader(
                dataset,
                batch_size=self.__C.BATCH_SIZE,
                shuffle=False,
                num_workers=self.__C.NUM_WORKERS,
                pin_memory=self.__C.PIN_MEM,
                drop_last=True
            )
        else:
            dataloader = Data.DataLoader(
                dataset,
                batch_size=self.__C.BATCH_SIZE,
                shuffle=True,
                num_workers=self.__C.NUM_WORKERS,
                pin_memory=self.__C.PIN_MEM,
                drop_last=True
            )

        # Training script
        for epoch in range(start_epoch, self.__C.MAX_EPOCH):

            # Save log information
            logfile = open(
                self.__C.LOG_PATH +
                'log_run_' + self.__C.VERSION + '.txt',
                'a+'
            )
            logfile.write(
                'nowTime: ' +
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
                '\n'
            )
            logfile.close()

            # Learning Rate Decay
            if epoch in self.__C.LR_DECAY_LIST:
                adjust_lr(optim, self.__C.LR_DECAY_R)

            # Externally shuffle
            if self.__C.SHUFFLE_MODE == 'external':
                shuffle_list(dataset.ans_list)

            time_start = time.time()
            # Iteration
            for step, (
                    img_feat_iter,
                    ques_ix_iter,
                    ans_iter,
                    fact_idx_iter,
            ) in enumerate(dataloader):

                optim.zero_grad()

                img_feat_iter = img_feat_iter.cuda()
                ques_ix_iter = ques_ix_iter.cuda()
                ans_iter = ans_iter.cuda()
                fact_idx_iter = fact_idx_iter.cuda()

                for accu_step in range(self.__C.GRAD_ACCU_STEPS):

                    sub_img_feat_iter = \
                        img_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                      (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ques_ix_iter = \
                        ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                     (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ans_iter = \
                        ans_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_fact_idx_iter = \
                        fact_idx_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                      (accu_step + 1) * self.__C.SUB_BATCH_SIZE]

                    pred = net(
                        sub_img_feat_iter,
                        sub_ques_ix_iter,
                        sub_fact_idx_iter,
                    )

                    loss = loss_fn(pred, sub_ans_iter)
                    loss /= self.__C.GRAD_ACCU_STEPS
                    loss.backward()
                    loss_sum += loss.cpu().data.numpy() * self.__C.GRAD_ACCU_STEPS

                    if self.__C.VERBOSE:
                        if dataset_eval is not None:
                            mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['val']
                        else:
                            mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['test']

                        print("\r[version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e" % (
                            self.__C.VERSION,
                            epoch + 1,
                            step,
                            int(data_size / self.__C.BATCH_SIZE),
                            mode_str,
                            loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                            optim._rate
                        ), end='          ')

                # Gradient norm clipping
                if self.__C.GRAD_NORM_CLIP > 0:
                    nn.utils.clip_grad_norm_(
                        net.parameters(),
                        self.__C.GRAD_NORM_CLIP
                    )

                # Save the gradient information
                for name in range(len(named_params)):
                    norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                        if named_params[name][1].grad is not None else 0
                    grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS
                    # print('Param %-3s Name %-80s Grad_Norm %-20s'%
                    #       (str(grad_wt),
                    #        params[grad_wt][0],
                    #        str(norm_v)))

                optim.step()

            time_end = time.time()
            print('Finished in {}s'.format(int(time_end-time_start)))

            # print('')
            epoch_finish = epoch + 1

            # Save checkpoint
            state = {
                'state_dict': net.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'lr_base': optim.lr_base
            }
            torch.save(
                state,
                self.__C.CKPTS_PATH +
                'ckpt_' + self.__C.VERSION +
                '/epoch' + str(epoch_finish) +
                '.pkl'
            )

            # Logging
            logfile = open(
                self.__C.LOG_PATH +
                'log_run_' + self.__C.VERSION + '.txt',
                'a+'
            )
            logfile.write(
                'epoch = ' + str(epoch_finish) +
                '  loss = ' + str(loss_sum / data_size) +
                '\n' +
                'lr = ' + str(optim._rate) +
                '\n\n'
            )
            logfile.close()

            # Eval after every epoch
            if dataset_eval is not None:
                self.eval(
                    dataset_eval,
                    state_dict=net.state_dict(),
                    valid=True
                )

            # if self.__C.VERBOSE:
            #     logfile = open(
            #         self.__C.LOG_PATH +
            #         'log_run_' + self.__C.VERSION + '.txt',
            #         'a+'
            #     )
            #     for name in range(len(named_params)):
            #         logfile.write(
            #             'Param %-3s Name %-80s Grad_Norm %-25s\n' % (
            #                 str(name),
            #                 named_params[name][0],
            #                 str(grad_norm[name] / data_size * self.__C.BATCH_SIZE)
            #             )
            #         )
            #     logfile.write('\n')
            #     logfile.close()

            loss_sum = 0
            grad_norm = np.zeros(len(named_params))
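
Note: every example persists the same three fields per epoch and reads them back under RESUME: the model's state_dict, the wrapped optimizer's state_dict, and lr_base. A minimal sketch of that round trip, with a plain torch optimizer standing in for the object returned by get_optim (whose definition is not shown in this listing); when resuming, the examples additionally rewind optim._step to data_size / BATCH_SIZE * CKPT_EPOCH so the warmup schedule continues from the right point:

import torch
import torch.nn as nn

def save_checkpoint(net, optimizer, lr_base, path):
    # Persist the same fields the training loops above write every epoch
    state = {
        'state_dict': net.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_base': lr_base,
    }
    torch.save(state, path)

def load_checkpoint(net, optimizer, path):
    # Restore model and optimizer state; returns the stored base learning rate
    ckpt = torch.load(path, map_location='cpu')
    net.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['optimizer'])
    return ckpt['lr_base']

# Usage sketch with a toy model standing in for Net
net = nn.Linear(4, 2)
opt = torch.optim.Adam(net.parameters(), lr=1e-4)
save_checkpoint(net, opt, lr_base=1e-4, path='epoch1.pkl')
lr_base = load_checkpoint(net, opt, path='epoch1.pkl')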
Example #4
File: exec.py  Project: jojo23333/mcan-vqa
    def train(self, dataset, dataset_eval=None):

        # Obtain needed information
        data_size = dataset.data_size

        dataset.__getitem__(0)

        # Build the optimizer and the network
        optim, net = self.build(dataset)

        # Define the binary cross entropy loss
        loss_fn = torch.nn.BCELoss(reduction='mean').cuda()
        loss_sum = 0

        if self.__C.RESUME:
            start_epoch = self.__C.CKPT_EPOCH
        else:
            start_epoch = 0

        # Define multi-thread dataloader
        if self.__C.SHUFFLE_MODE in ['external']:
            dataloader = Data.DataLoader(dataset,
                                         batch_size=self.__C.BATCH_SIZE,
                                         shuffle=False,
                                         num_workers=self.__C.NUM_WORKERS,
                                         pin_memory=self.__C.PIN_MEM,
                                         drop_last=True)
        else:
            dataloader = Data.DataLoader(dataset,
                                         batch_size=self.__C.BATCH_SIZE,
                                         shuffle=True,
                                         num_workers=self.__C.NUM_WORKERS,
                                         pin_memory=self.__C.PIN_MEM,
                                         drop_last=True)

        # Training script
        logfile_iter = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '_iter.txt',
            'a+')
        print("begin training", flush=True)
        for epoch in range(start_epoch, self.__C.MAX_EPOCH):
            # TODO add meter here
            meter = TrainLossMeter()
            # Save log information
            logfile = open(
                self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
                'a+')
            logfile.write(
                'nowTime: ' +
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n')
            logfile.close()

            # Learning Rate Decay
            if epoch in self.__C.LR_DECAY_LIST:
                adjust_lr(optim, self.__C.LR_DECAY_R)

            # Externally shuffle
            if self.__C.SHUFFLE_MODE == 'external':
                shuffle_list(dataset.ans_list)

            time_start = time.time()
            # Iteration
            for step, batch in enumerate(dataloader):
                optim.zero_grad()
                img_feat = batch["img_feat"].cuda()
                ques_ix = batch["ques_ix"].cuda()
                ans = batch["ans_score"].cuda()
                ans_embedding = batch["ans_embedding_sampled"].cuda()
                ans_sampled = batch["ans_score_sampled"].cuda()
                # abs = abs.cuda()
                # mask_abs, mask_ans = [x.cuda() for x in loss_masks]

                # TODO MODIFY HERE
                for accu_step in range(self.__C.GRAD_ACCU_STEPS):

                    sub_img_feat = \
                        img_feat[accu_step * self.__C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ques_ix = \
                        ques_ix[accu_step * self.__C.SUB_BATCH_SIZE:
                                (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ans = \
                        ans[accu_step * self.__C.SUB_BATCH_SIZE:
                            (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ans_embedding = \
                        ans_embedding[accu_step * self.__C.SUB_BATCH_SIZE:
                                      (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                    sub_ans_sampled = \
                        ans_sampled[accu_step * self.__C.SUB_BATCH_SIZE:
                                    (accu_step + 1) * self.__C.SUB_BATCH_SIZE]

                    input_dict = {
                        "img_feat": sub_img_feat,
                        "ques_ix": sub_ques_ix,
                        "ans_ix": sub_ans_embedding
                    }

                    # TODO get pred and pred_parent
                    pred, pred_abs = net(input_dict)
                    # TODO loss of pred_parent and pred based on gt path
                    # loss_ans, loss_abs = self.h_classifier.get_loss(
                    #                               pred, pred_abs,
                    #                               sub_ans, sub_abs,
                    #                               sub_mask_ans, sub_mask_abs,
                    #                               loss_fn)
                    # loss = loss_ans + loss_abs * self.__C.ABS_ALPHA
                    if self.__C.MODEL.startswith('q_small'):
                        loss = loss_fn(pred, sub_ans_sampled)
                    else:
                        loss = loss_fn(pred, sub_ans)

                    # only a mean-reduced loss needs to be divided by grad_accu_steps;
                    # removing this line wouldn't change our results because of how the
                    # Adam optimizer rescales updates, but it would be necessary with SGD.
                    loss /= self.__C.GRAD_ACCU_STEPS
                    loss.backward()
                    loss_sum += loss.cpu().data.numpy(
                    ) * self.__C.GRAD_ACCU_STEPS
                    meter.update_iter({"loss": loss.cpu().item()
                                       })  # / self.__C.SUB_BATCH_SIZE})#,
                    #    "loss_ans":loss_ans.cpu().item(),
                    #    "loss_abs":loss_abs.cpu().item()})
                    global_step = step + int(
                        data_size / self.__C.BATCH_SIZE) * epoch
                    self.writer.add_scalar('train/loss_bce',
                                           loss.cpu().item(), global_step)

                    # TODO ADD PERIODIC PRINT
                    if step % self.__C.LOG_CYCLE == self.__C.LOG_CYCLE - 1:
                        if dataset_eval is not None:
                            mode_str = self.__C.SPLIT[
                                'train'] + '->' + self.__C.SPLIT['val']
                        else:
                            mode_str = self.__C.SPLIT[
                                'train'] + '->' + self.__C.SPLIT['test']

                        info_str = "[%s][version %s][epoch %2d][step %4d/%4d][%s] lr: %.2e %s " % (
                            datetime.datetime.now().strftime(
                                "%y/%m/%d, %H:%M:%S"), self.__C.VERSION, epoch
                            + 1, step, int(data_size / self.__C.BATCH_SIZE),
                            mode_str, optim._rate, meter.log_iter())
                        print(info_str, flush=True)
                        logfile_iter.write(info_str)

                # Gradient norm clipping
                if self.__C.GRAD_NORM_CLIP > 0:
                    nn.utils.clip_grad_norm_(net.parameters(),
                                             self.__C.GRAD_NORM_CLIP)

                optim.step()

            time_end = time.time()
            print('Finished in {}s'.format(int(time_end - time_start)))
            info_str = "[version %s][epoch %2d] lr: %.2e loss: %s\n" % (
                self.__C.VERSION, epoch + 1, optim._rate, meter.log_epoch())
            print(info_str, flush=True)
            logfile_iter.write(info_str)

            # print('')
            epoch_finish = epoch + 1

            # Save checkpoint
            if self.__C.N_GPU > 1:
                state_dict = net.module.state_dict()
            else:
                state_dict = net.state_dict()

            state = {
                'state_dict': state_dict,
                'optimizer': optim.optimizer.state_dict(),
                'lr_base': optim.lr_base
            }
            torch.save(
                state, self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION +
                '/epoch_latest' + '.pkl')
            if epoch % 3 == 2:
                torch.save(
                    state, self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION +
                    '/epoch' + str(epoch_finish) + '.pkl')

            # Logging
            logfile = open(
                self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
                'a+')
            logfile.write('epoch = ' + str(epoch_finish) + '  loss = ' +
                          str(loss_sum) + '\n' + 'lr = ' + str(optim._rate) +
                          '\n\n')
            logfile.close()

            # Eval after every epoch
            if epoch % 3 == 2 and dataset_eval is not None:
                self.eval(dataset_eval, state_dict=state_dict, valid=True)

            loss_sum = 0
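
Note: Example #4 depends on a TrainLossMeter whose definition is not part of this listing. A minimal sketch of a meter exposing the update_iter / log_iter / log_epoch interface used above; this is an assumption about its behavior, not the project's actual class:

from collections import defaultdict

class TrainLossMeter:
    # Running averages of named losses over one epoch (a guess at the interface used above)

    def __init__(self):
        self._sums = defaultdict(float)   # running total of each tracked value
        self._count = 0                   # number of update_iter calls so far

    def update_iter(self, losses):
        # Record one iteration's losses, e.g. {"loss": 0.42}
        for name, value in losses.items():
            self._sums[name] += float(value)
        self._count += 1

    def log_iter(self):
        # Averages so far in the epoch, formatted for the periodic print
        return ' '.join('%s: %.4f' % (k, v / max(self._count, 1))
                        for k, v in self._sums.items())

    def log_epoch(self):
        # Same averages, used once in the end-of-epoch summary line
        return self.log_iter()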
Example #5
File: exec.py  Project: mg9/mca-on-r2c
    def train(self, dataset):
       
        net = Net(
            self.__C,
        )

        net.cuda()
        net.train()

        # Create the checkpoint directory (removing any existing one for this VERSION)
        if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
            shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
        os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
        loader_params = {'batch_size': 16, 'num_gpus':1}
        dataloader = TheLoader.from_dataset(dataset, **loader_params)
        loss_sum = 0
        named_params = list(net.named_parameters())
        grad_norm = np.zeros(len(named_params))
        
        loss_fn = torch.nn.NLLLoss().cuda()
        
        # Load checkpoint if resume training
        if self.__C.RESUME:
            print(' ========== Resume training')
            path = self.__C.CKPTS_PATH + \
                       'ckpt_' + self.__C.CKPT_VERSION + \
                       '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

            # Load the network parameters
            print('Loading ckpt {}'.format(path))
            ckpt = torch.load(path)
            print('Finish!')
            net.load_state_dict(ckpt['state_dict'])

            # Load the optimizer parameters
            optim = get_optim(self.__C, net, len(dataloader), ckpt['lr_base'])
            optim._step = int(len(dataloader) / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
            optim.optimizer.load_state_dict(ckpt['optimizer'])

            start_epoch = self.__C.CKPT_EPOCH
        else:
            optim = get_optim(self.__C, net, len(dataloader))
            start_epoch = 0

       

        for epoch in range(start_epoch, self.__C.MAX_EPOCH):
            print("Training epoch...",  epoch)
            # Learning Rate Decay
            if epoch in self.__C.LR_DECAY_LIST:
                adjust_lr(optim, self.__C.LR_DECAY_R)
            
            time_start = time.time()
            print("time_start:" , time_start)
            pred_argmax = []
            
            for b, (time_per_batch, batch) in enumerate(time_batch(dataloader)):
                optim.zero_grad()
                x, goldsentence = net(**batch)
                goldsentence = goldsentence[:, 1:]
                x = x[:,:31,:]
                pred_argmax = np.argmax(x.cpu().data.numpy(), axis=2)


                loss = loss_fn(x.permute(0,2,1), goldsentence)
                loss /= self.__C.GRAD_ACCU_STEPS
                loss.backward()
                loss_sum += loss.cpu().data.numpy() * self.__C.GRAD_ACCU_STEPS
                mode_str = self.__C.SPLIT['train']


                print("\r[version %s][epoch %2d][%s] loss: %.4f, lr: %.2e" % (
                    self.__C.VERSION,
                    epoch + 1,
                    mode_str,
                    loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                    optim._rate
                ), end='          ')

                # Gradient norm clipping
                if self.__C.GRAD_NORM_CLIP > 0:
                    nn.utils.clip_grad_norm_(
                        net.parameters(),
                        self.__C.GRAD_NORM_CLIP
                    )

                # Save the gradient information
                for name in range(len(named_params)):
                    norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                        if named_params[name][1].grad is not None else 0
                    grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS

                optim.step()


            time_end = time.time()
            print('Finished in {}s'.format(int(time_end-time_start)))
            epoch_finish = epoch + 1
        
            loss_sum = 0
            grad_norm = np.zeros(len(named_params))


            # Save checkpoint
            state = {
                'state_dict': net.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'lr_base': optim.lr_base
            }
            torch.save(
                state,
                self.__C.CKPTS_PATH +
                'ckpt_' + self.__C.VERSION +
                '/epoch' + str(epoch_finish) +
                '.pkl'
            )

           
            print("Gold sentence: " , str(goldsentence.cpu().data))
            print("A sample prediction: ", pred_argmax )
            print("Checkpoint saved. " )
Example #6
    def train(self, dataset, dataset_eval=None):
        super_time_start = time.time()

        # Obtain needed information
        data_size = dataset.data_size
        token_size = dataset.token_size
        ans_size = dataset.ans_size
        pretrained_emb = dataset.pretrained_emb

        # Define the MCAN model
        net = Net(self.__C, pretrained_emb, token_size, ans_size)
        net.cuda()
        net.train()

        # Define the binary cross entropy loss
        loss_fn = torch.nn.BCELoss(reduction='sum').cuda()

        # Load checkpoint if resume training
        if self.__C.RESUME:
            print('========== Resume training')

            if self.__C.CKPT_PATH is not None:
                print(
                    'Warning: you are now using CKPT_PATH args, CKPT_VERSION and CKPT_EPOCH will not work'
                )
                path = self.__C.CKPT_PATH
            else:
                path = self.__C.CKPTS_PATH + 'ckpt_' + self.__C.CKPT_VERSION \
                       + '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

            # Load the network parameters
            print('========== Loading ckpt {}'.format(path))
            ckpt = torch.load(path)
            print('========== Finished!')
            net.load_state_dict(ckpt['state_dict'])

            # Load the optimizer parameters
            optim = get_optim(self.__C, net, data_size, ckpt['lr_base'])
            optim._step = int(data_size / self.__C.BATCH_SIZE *
                              self.__C.CKPT_EPOCH)
            optim.optimizer.load_state_dict(ckpt['optimizer'])

            start_epoch = self.__C.CKPT_EPOCH

        else:
            if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
                shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

            os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

            optim = get_optim(self.__C, net, data_size)
            start_epoch = 0

        loss_sum = 0
        named_params = list(net.named_parameters())
        grad_norm = np.zeros(len(named_params))

        # Define multi-thread dataloader
        dataloader = Data.DataLoader(dataset,
                                     batch_size=self.__C.BATCH_SIZE,
                                     shuffle=False,
                                     num_workers=self.__C.NUM_WORKERS,
                                     pin_memory=self.__C.PIN_MEM,
                                     drop_last=True)

        # Training script
        for epoch in range(start_epoch, self.__C.MAX_EPOCH):
            epoch_finish = epoch + 1
            # Save log information
            logfile = open(
                self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
                'a+')
            logfile.write(
                'nowTime: ' +
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n')
            logfile.close()

            # Learning Rate Decay
            if epoch in self.__C.LR_DECAY_LIST:
                adjust_lr(optim, self.__C.LR_DECAY_R)

            # Externally shuffle
            shuffle_list(dataset.ans_list)

            time_start = time.time()
            # Iteration
            for step, (img_feat_iter, ques_ix_iter,
                       ans_iter) in enumerate(dataloader):
                optim.zero_grad()

                img_feat_iter = img_feat_iter.cuda()
                ques_ix_iter = ques_ix_iter.cuda()
                ans_iter = ans_iter.cuda()

                pred = net(img_feat_iter, ques_ix_iter)

                loss = loss_fn(pred, ans_iter)
                loss.backward()
                loss_sum += loss.cpu().data.numpy()
                if self.__C.VERBOSE:  # print loss every step
                    if dataset_eval is not None:
                        mode_str = self.__C.SPLIT[
                            'train'] + '->' + self.__C.SPLIT['val']
                    else:
                        mode_str = self.__C.SPLIT[
                            'train'] + '->' + self.__C.SPLIT['test']
                    print(
                        "\r[version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e"
                        % (self.__C.VERSION, epoch_finish, step,
                           int(data_size / self.__C.BATCH_SIZE), mode_str,
                           loss.cpu().data.numpy() / self.__C.BATCH_SIZE,
                           optim._rate),
                        end=' ')

                # Save the gradient information
                for name in range(len(named_params)):
                    if named_params[name][1].grad is not None:
                        norm_v = torch.norm(
                            named_params[name][1].grad).cpu().data.numpy()
                    else:
                        norm_v = 0
                    grad_norm[name] += norm_v
                optim.step()

            time_end = time.time()
            print('========== Finished in {}s'.format(
                int(time_end - time_start)))

            # Save checkpoint
            state = {
                'state_dict': net.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'lr_base': optim.lr_base
            }
            torch.save(
                state, self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION +
                '/epoch' + str(epoch_finish) + '.pkl')

            # Logging
            logfile = open(
                self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
                'a+')
            logfile.write('epoch = ' + str(epoch_finish) + '  loss = ' +
                          str(loss_sum / data_size) + '\n' + 'lr = ' +
                          str(optim._rate) + '\n\n')
            logfile.close()

            # Eval after every epoch
            if dataset_eval is not None:
                self.eval(dataset_eval,
                          state_dict=net.state_dict(),
                          valid=True)

            loss_sum = 0
            grad_norm = np.zeros(len(named_params))
        print('========== Total Training time is {}s'.format(
            int(time.time() - super_time_start)))
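
Note: all of the examples drive training through the wrapper returned by get_optim, reading optim._rate, optim._step, optim.lr_base and optim.optimizer, and calling adjust_lr on the LR_DECAY_LIST epochs. The wrapper itself is not shown in this listing; below is a minimal sketch of an object with that surface (the linear warmup schedule is an assumption, and the names mirror the usage above rather than the project's actual implementation):

import torch

class WarmupOptim:
    # A guess at the optimizer wrapper used above: linear warmup, then a flat base rate

    def __init__(self, params, lr_base, data_size, batch_size, warmup_epochs=3):
        self.lr_base = lr_base
        self._step = 0
        self._rate = 0.0
        self._warmup_steps = warmup_epochs * int(data_size / batch_size)
        self.optimizer = torch.optim.Adam(params, lr=0)

    def rate(self):
        # ramp linearly to lr_base during warmup, then stay at lr_base
        if self._step < self._warmup_steps:
            return self.lr_base * self._step / max(self._warmup_steps, 1)
        return self.lr_base

    def step(self):
        self._step += 1
        self._rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = self._rate
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()

def adjust_lr(optim, decay_r):
    # multiply the base learning rate by decay_r, as the LR_DECAY_LIST epochs do above
    optim.lr_base *= decay_r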