    def train_network(self, args):

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          args.init_lr)
        self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer,
                                                                gamma=0.96,
                                                                last_epoch=-1)
        save_path = 'weights_' + args.dataset
        start_epoch = 1

        # Resume from a checkpoint so training can continue after a previous run was interrupted (added 10-16-2020)
        if args.resume_train:
            self.model, self.optimizer, start_epoch = self.load_model(
                self.model, self.optimizer, args.resume_train, strict=True)
        # end

        if not os.path.exists(save_path):
            os.mkdir(save_path)
        if args.ngpus > 1:
            if torch.cuda.device_count() > 1:
                print("Let's use", torch.cuda.device_count(), "GPUs!")
                # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
                self.model = nn.DataParallel(self.model)
        self.model.to(self.device)

        criterion = loss.LossAll()
        print('Setting up data...')

        dataset_module = self.dataset[args.dataset]

        dsets = {
            x: dataset_module(data_dir=args.data_dir,
                              phase=x,
                              input_h=args.input_h,
                              input_w=args.input_w,
                              down_ratio=self.down_ratio)
            for x in self.dataset_phase[args.dataset]
        }

        dsets_loader = {}
        dsets_loader['train'] = torch.utils.data.DataLoader(
            dsets['train'],
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
            pin_memory=True,
            drop_last=True,
            collate_fn=collater)

        print('Starting training...')
        train_loss = []
        ap_list = []
        for epoch in range(start_epoch, args.num_epoch + 1):
            print('-' * 10)
            print('Epoch: {}/{} '.format(epoch, args.num_epoch))
            epoch_loss = self.run_epoch(phase='train',
                                        data_loader=dsets_loader['train'],
                                        criterion=criterion)
            train_loss.append(epoch_loss)
            self.scheduler.step(epoch)

            np.savetxt(os.path.join(save_path, 'train_loss.txt'),
                       train_loss,
                       fmt='%.6f')

            if epoch % 5 == 0 or epoch > 20:
                self.save_model(
                    os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                    epoch, self.model, self.optimizer)

            if 'test' in self.dataset_phase[args.dataset] and epoch % 5 == 0:
                mAP = self.dec_eval(args, dsets['test'])
                ap_list.append(mAP)
                np.savetxt(os.path.join(save_path, 'ap_list.txt'),
                           ap_list,
                           fmt='%.6f')

            self.save_model(os.path.join(save_path, 'model_last.pth'), epoch,
                            self.model, self.optimizer)
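
Every variant on this page calls save_model/load_model helpers that are defined elsewhere in the project. Below is a minimal sketch of what they plausibly look like, inferred only from the call sites above; the checkpoint keys ('epoch', 'model_state_dict', 'optimizer_state_dict') are assumptions, not the project's actual format.

import torch
import torch.nn as nn


def save_model(path, epoch, model, optimizer):
    # unwrap nn.DataParallel so the checkpoint also loads on a single GPU
    if isinstance(model, nn.DataParallel):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    torch.save({'epoch': epoch,
                'model_state_dict': state_dict,
                'optimizer_state_dict': optimizer.state_dict()}, path)


def load_model(model, optimizer, resume, strict=True):
    checkpoint = torch.load(resume, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'], strict=strict)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1  # resume from the following epoch
    return model, optimizer, start_epoch
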
Example #2
    def train_network(self, args):
        save_path = 'weights_' + args.dataset
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        self.optimizer = torch.optim.Adam(self.model.parameters(), args.init_lr)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.96, last_epoch=-1)
        if args.ngpus > 0:
            if torch.cuda.device_count() > 1:
                print("Let's use", torch.cuda.device_count(), "GPUs!")
                self.model = nn.DataParallel(self.model)

        self.model.to(self.device)

        criterion = loss.LossAll()
        print('Setting up data...')

        dataset_module = self.dataset[args.dataset]

        dsets = {x: dataset_module(data_dir=args.data_dir,
                                   phase=x,
                                   input_h=args.input_h,
                                   input_w=args.input_w,
                                   down_ratio=args.down_ratio)
                 for x in ['train', 'val']}

        dsets_loader = {'train': torch.utils.data.DataLoader(dsets['train'],
                                                             batch_size=args.batch_size,
                                                             shuffle=True,
                                                             num_workers=args.num_workers,
                                                             pin_memory=True,
                                                             drop_last=True,
                                                             collate_fn=collater),

                        'val': torch.utils.data.DataLoader(dsets['val'],
                                                          batch_size=1,
                                                          shuffle=False,
                                                          num_workers=1,
                                                          pin_memory=True,
                                                          collate_fn=collater)}


        print('Starting training...')
        train_loss = []
        val_loss = []
        for epoch in range(1, args.num_epoch + 1):
            print('-' * 10)
            print('Epoch: {}/{} '.format(epoch, args.num_epoch))
            epoch_loss = self.run_epoch(phase='train',
                                        data_loader=dsets_loader['train'],
                                        criterion=criterion)
            train_loss.append(epoch_loss)
            scheduler.step(epoch)

            epoch_loss = self.run_epoch(phase='val',
                                        data_loader=dsets_loader['val'],
                                        criterion=criterion)
            val_loss.append(epoch_loss)

            np.savetxt(os.path.join(save_path, 'train_loss.txt'), train_loss, fmt='%.6f')
            np.savetxt(os.path.join(save_path, 'val_loss.txt'), val_loss, fmt='%.6f')

            # save a periodic checkpoint (and one after the first epoch)
            if epoch % 10 == 0 or epoch == 1:
                self.save_model(os.path.join(save_path, 'model_{}.pth'.format(epoch)), epoch, self.model)

            # keep the checkpoint with the lowest validation loss so far
            if len(val_loss) > 1:
                if val_loss[-1] < np.min(val_loss[:-1]):
                    self.save_model(os.path.join(save_path, 'model_last.pth'), epoch, self.model)
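
Both variants so far decay the learning rate with ExponentialLR(gamma=0.96), i.e. the rate after epoch e is init_lr * 0.96**e, which roughly halves it every 17 epochs. A small self-contained sketch of that schedule follows (the linear layer and initial rate are placeholders); note that recent PyTorch versions expect scheduler.step() to be called without the epoch argument used above.

import torch

model = torch.nn.Linear(2, 2)  # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=1.25e-4)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.96)

for epoch in range(1, 6):
    # ... run one training epoch here ...
    scheduler.step()                       # lr *= 0.96 once per epoch
    print(epoch, scheduler.get_last_lr())  # current learning rate
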
Example #3
    def train_network(self, args):
        optimizer = torch.optim.AdamW(self.model.parameters(), args.init_lr)
        self.optimizer = Lookahead(optimizer)
        milestones = [5 + x * 80 for x in range(5)]
        # print(f'milestones:{milestones}')
        # self.scheduler  = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.96, last_epoch=-1)
        scheduler_c = CyclicCosAnnealingLR(optimizer,
                                           milestones=milestones,
                                           eta_min=5e-5)
        self.scheduler = LearningRateWarmUP(optimizer=optimizer,
                                            target_iteration=5,
                                            target_lr=0.003,
                                            after_scheduler=scheduler_c)

        save_path = 'weights_' + args.dataset
        start_epoch = 1
        best_loss = 1000
        # try:
        #     self.model, _, _ = self.load_model(self.model, self.optimizer, args.resume)
        # except:
        #     print('load pretrained model failed')

        # self.model = self.load_model(self.model, self.optimizer, args.resume)

        if not os.path.exists(save_path):
            os.mkdir(save_path)
        if args.ngpus > 1:
            if torch.cuda.device_count() > 1:
                print("Let's use", torch.cuda.device_count(), "GPUs!")
                # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
                self.model = nn.DataParallel(self.model)
        self.model.to(self.device)

        criterion = loss.LossAll()
        print('Setting up data...')

        dataset_module = self.dataset[args.dataset]

        dsets = {
            x: dataset_module(data_dir=args.data_dir,
                              phase=x,
                              input_h=args.input_h,
                              input_w=args.input_w,
                              down_ratio=self.down_ratio)
            for x in self.dataset_phase[args.dataset]
        }

        dsets_loader = {}
        dsets_loader['train'] = torch.utils.data.DataLoader(
            dsets['train'],
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
            pin_memory=True,
            drop_last=True,
            collate_fn=collater)
        dsets_loader['valid'] = torch.utils.data.DataLoader(
            dsets['valid'],
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
            pin_memory=True,
            drop_last=True,
            collate_fn=collater)
        print('Starting training...')
        train_loss = []
        valid_loss = []
        ap_list = []
        for epoch in range(start_epoch, args.num_epoch + 1):
            print('-' * 10)
            print('Epoch: {}/{} '.format(epoch, args.num_epoch))
            epoch_loss = self.run_epoch(phase='train',
                                        data_loader=dsets_loader['train'],
                                        criterion=criterion)
            train_loss.append(epoch_loss)
            epoch_loss = self.run_epoch(phase='valid',
                                        data_loader=dsets_loader['valid'],
                                        criterion=criterion)
            valid_loss.append(epoch_loss)

            self.scheduler.step(epoch)

            np.savetxt(os.path.join(save_path, 'train_loss.txt'),
                       train_loss,
                       fmt='%.6f')
            np.savetxt(os.path.join(save_path, 'valid_loss.txt'),
                       valid_loss,
                       fmt='%.6f')
            # if epoch % 5 == 0 or epoch > 20:
            #     self.save_model(os.path.join(save_path, 'model_{}.pth'.format(epoch)),
            #                     epoch,
            #                     self.model,
            #                     self.optimizer)

            # epoch_loss here is the validation loss from the 'valid' phase above
            if epoch_loss < best_loss:
                self.save_model(
                    os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                    epoch, self.model, self.optimizer)
                print(f'found a better model, {best_loss} ==> {epoch_loss}')
                best_loss = epoch_loss

            self.save_model(os.path.join(save_path, 'model_last.pth'), epoch,
                            self.model, self.optimizer)
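
CyclicCosAnnealingLR and LearningRateWarmUP in the variant above are project-specific classes whose code is not shown here. The sketch below illustrates the same idea under stated assumptions: ramp the learning rate linearly to target_lr over the first few epochs, then hand control to a cosine schedule; torch's built-in CosineAnnealingLR is swapped in for illustration, so names and behaviour may differ from the project's classes.

import torch


class WarmUpThenScheduler:
    """Ramp the lr linearly to target_lr for target_iteration epochs, then delegate."""

    def __init__(self, optimizer, target_iteration, target_lr, after_scheduler):
        self.optimizer = optimizer
        self.target_iteration = target_iteration
        self.target_lr = target_lr
        self.after_scheduler = after_scheduler

    def step(self, epoch):
        if epoch <= self.target_iteration:
            warm_lr = self.target_lr * epoch / self.target_iteration
            for group in self.optimizer.param_groups:
                group['lr'] = warm_lr
        else:
            self.after_scheduler.step()


model = torch.nn.Linear(2, 2)  # placeholder model
# base lr equals the warm-up target so the cosine phase continues from 0.003
optimizer = torch.optim.AdamW(model.parameters(), lr=0.003)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80, eta_min=5e-5)
scheduler = WarmUpThenScheduler(optimizer, target_iteration=5,
                                target_lr=0.003, after_scheduler=cosine)

for epoch in range(1, 11):
    # ... run one training epoch here ...
    scheduler.step(epoch)
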
Example #4
    def train_network(self, args):
        # Multi-GPU is not handled here, so the block below is simply removed (prune unused code whenever possible to keep things simple)
        # if args.ngpus > 1:
        #     if torch.cuda.device_count() > 1:
        #         print("Let's use", torch.cuda.device_count(), "GPUs!")
        #         # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        #         self.model = nn.DataParallel(self.model)

        self.optimizer = torch.optim.Adam(self.model.parameters(), args.init_lr)
        # !!!! The LR scheduler call may not work on older PyTorch versions.
        self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.96, last_epoch=-1)
        save_path = 'weights_' + args.dataset
        start_epoch = 1

        # Pass resume_train when a pretrained model should be loaded
        if args.resume_train:
            self.model, self.optimizer, start_epoch = self.load_model(self.model,
                                                                      self.optimizer,
                                                                      args.resume_train,
                                                                      strict=True)
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        self.model.to(self.device)

        # Create a loss.LossAll instance to compute the loss on the detection outputs
        criterion = loss.LossAll()
        print('Setting up data...')

        # self.dataset is a dict {dataset name: dataset class (defined in datasets)}; with a single dataset this could be simplified
        # args.dataset selects one dataset here
        dataset_module = self.dataset[args.dataset]

        # Build one dataset instance per phase listed in self.dataset_phase (see the dataset class for the interfaces it implements)
        dsets = {x: dataset_module(data_dir=args.data_dir,
                                   phase=x,
                                   input_h=args.input_h,
                                   input_w=args.input_w,
                                   down_ratio=self.down_ratio)
                 for x in self.dataset_phase[args.dataset]}

        # The dict keeps this extensible, but only dsets_loader['train'] is actually used
        dsets_loader = {}
        # The DataLoader can be iterated directly inside run_epoch: for batch_data in dsets_loader['train']
        dsets_loader['train'] = torch.utils.data.DataLoader(dsets['train'],
                                                            batch_size=args.batch_size,
                                                            shuffle=True,
                                                            num_workers=args.num_workers,
                                                            pin_memory=True,
                                                            drop_last=True,
                                                            collate_fn=collater)

        print('Starting training...')
        train_loss = []
        ap_list = []
        for epoch in range(start_epoch, args.num_epoch + 1):
            print('-' * 10)
            print('Epoch: {}/{} '.format(epoch, args.num_epoch))
            # run_epoch wraps the forward pass, loss computation, and backward pass
            epoch_loss = self.run_epoch(phase='train',
                                        data_loader=dsets_loader['train'],
                                        criterion=criterion)
            train_loss.append(epoch_loss)

            # !!! The lr scheduler call below can cause problems on some PyTorch versions
            self.scheduler.step(epoch)

            np.savetxt(os.path.join(save_path, 'train_loss.txt'), train_loss, fmt='%.6f')

            # save a checkpoint every 5 epochs
            if epoch % 5 == 0: # or epoch > 20:
                self.save_model(os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                                epoch,
                                self.model,
                                self.optimizer)

            # evaluate every 5 epochs (does 'test' actually refer to the validation set?)
            if 'test' in self.dataset_phase[args.dataset] and epoch % 5 == 0:
                # compute the mAP metric with dec_eval
                mAP = self.dec_eval(args, dsets['test'])
                ap_list.append(mAP)
                np.savetxt(os.path.join(save_path, 'ap_list.txt'), ap_list, fmt='%.6f')

            self.save_model(os.path.join(save_path, 'model_last.pth'),
                            epoch,
                            self.model,
                            self.optimizer)
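
The comments above describe run_epoch as the method that wraps the forward pass, loss computation, and backward pass. Below is a minimal sketch of such a method; the data_dict keys ('input' for the image tensor, the rest for the targets consumed by criterion) are assumptions about the collater output, not the project's guaranteed layout.

    def run_epoch(self, phase, data_loader, criterion):
        if phase == 'train':
            self.model.train()
        else:
            self.model.eval()
        running_loss = 0.0
        for data_dict in data_loader:
            # move every tensor in the batch onto the training device
            for name in data_dict:
                data_dict[name] = data_dict[name].to(self.device, non_blocking=True)
            with torch.set_grad_enabled(phase == 'train'):
                pr_decs = self.model(data_dict['input'])    # forward pass
                loss_value = criterion(pr_decs, data_dict)  # loss computation
                if phase == 'train':
                    self.optimizer.zero_grad()
                    loss_value.backward()                   # backward pass
                    self.optimizer.step()
            running_loss += loss_value.item()
        return running_loss / len(data_loader)              # mean loss over the epoch
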
Example #5
    def train_network(self, args):
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          args.init_lr)
        start_epoch = 1
        path = 'weights_dota/model_last.pth'  # checkpoint to resume from
        if '.pth' in args.resume:
            self.model, _, start_epoch = self.load_model(self.model,
                                                         self.optimizer,
                                                         path,
                                                         strict=True)
        # the optimizer returned by load_model (and its learning rate) is discarded; keep self.optimizer

        # decay the learning rate exponentially (gamma is worth tuning!)
        self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer,
                                                                gamma=0.96,
                                                                last_epoch=-1)
        save_path = 'weights_' + args.dataset  # directory for saving checkpoints

        if not os.path.exists(save_path):
            os.mkdir(save_path)
        # multi-GPU data parallelism
        if args.ngpus > 1:
            if torch.cuda.device_count() > 1:
                print("Let's use", torch.cuda.device_count(), "GPUs!")
                self.model = nn.DataParallel(self.model)
        self.model.to(self.device)
        criterion = loss.LossAll()  # the combined loss function
        print('Setting up data...')
        dataset_module = self.dataset[args.dataset]  # e.g. the DOTA dataset class
        dsets = {
            x: dataset_module(data_dir=args.data_dir,
                              phase=x,
                              input_h=args.input_h,
                              input_w=args.input_w,
                              down_ratio=self.down_ratio)
            for x in self.dataset_phase[args.dataset]
        }  # instantiate the dataset class for each phase
        # build the data iterator with torch's built-in DataLoader
        dsets_loader = {}
        dsets_loader['train'] = torch.utils.data.DataLoader(
            dsets['train'],
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
            pin_memory=True,
            drop_last=True,
            collate_fn=collater)
        print('Starting training...')
        train_loss = []
        ap_list = []
        # training loop
        for epoch in range(start_epoch, args.num_epoch + 1):
            print('-' * 10)
            print('Epoch: {}/{} '.format(epoch, args.num_epoch))
            # loss for a single epoch
            epoch_loss = self.run_epoch(phase='train',
                                        data_loader=dsets_loader['train'],
                                        criterion=criterion)
            train_loss.append(epoch_loss)
            self.scheduler.step()  # adjust the learning rate once per epoch
            np.savetxt(os.path.join(save_path, 'train_loss.txt'),
                       train_loss,
                       fmt='%.6f')  # save the training-loss history
            # save a checkpoint every 10 epochs (a disabled variant also checked: or epoch > 40)
            if epoch % 10 == 0:
                self.save_model_yan(
                    os.path.join(save_path, 'model_{}.pth'.format(epoch)),
                    epoch, self.model)
            '''
            # if a 'test' phase exists, run evaluation and record mAP
            if 'test' in self.dataset_phase[args.dataset] and epoch % 5 == 0:
                mAP = self.dec_eval(args, dsets['test'])
                ap_list.append(mAP)
                np.savetxt(os.path.join(save_path, 'ap_list.txt'), ap_list, fmt='%.6f')
           '''
            # save the latest model every epoch
            self.save_model(os.path.join(save_path, 'model_last.pth'), epoch,
                            self.model, self.optimizer)