def __init__(self, config, num_classes, train_triplet=False):
    """Set up a baseline training run: model, loss, optimizer, scheduler, logging.

    :param config: parsed configuration object (argparse-style Namespace)
    :param num_classes: number of classes in the training set; int
    :param train_triplet: when True, resume from the existing checkpoint and
                          train only the triplet loss; bool
    """
    self.num_classes = num_classes
    self.model_name = config.model_name
    self.last_stride = config.last_stride
    self.num_gpus = torch.cuda.device_count()
    print('Using {} GPUS'.format(self.num_gpus))
    print('NUM_CLASS: {}'.format(self.num_classes))
    print('USE LOSS: {}'.format(config.selected_loss))
    # Build the model. Whenever a GPU is available, wrap it in DataParallel;
    # with more than one GPU, additionally convert BatchNorm to synchronized BN.
    self.model = get_model(self.model_name, self.num_classes, self.last_stride)
    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model)
        if self.num_gpus > 1:
            self.model = convert_model(self.model)
        self.model = self.model.cuda()
    # Hyper-parameters.
    self.epoch = config.epoch
    # Solver wraps forward/backward/checkpoint helpers.
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.solver = Solver(self.model, self.device)
    # Loss function.
    self.criterion = Loss(self.model_name, config.selected_loss, config.margin, self.num_classes)
    # Optimizer.
    self.optim = get_optimizer(config, self.model)
    # Learning-rate decay policy.
    self.scheduler = get_scheduler(config, self.optim)
    # Directory where checkpoints are written; created on first use.
    self.model_path = os.path.join(config.save_path, config.model_name)
    if not os.path.exists(self.model_path):
        os.makedirs(self.model_path)
    # When training only the triplet loss, resume from the previously saved weights.
    if train_triplet:
        self.solver.load_checkpoint(os.path.join(self.model_path, '{}.pth'.format(self.model_name)))
    # Dump the config as JSON and initialise tensorboard, both under a
    # timestamped run directory.
    TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.datetime.now())
    self.writer = SummaryWriter(log_dir=os.path.join(self.model_path, TIMESTAMP))
    with codecs.open(self.model_path + '/' + TIMESTAMP + '.json', 'w', "utf-8") as json_file:
        json.dump({k: v for k, v in config._get_kwargs()}, json_file, ensure_ascii=False)
    # Fix and persist the random seed so cross-validation splits of
    # train/valid data stay reproducible across runs.
    self.seed = int(time.time())
    seed_torch(self.seed)
    with open(self.model_path + '/' + TIMESTAMP + '.pkl', 'wb') as f:
        pickle.dump({'seed': self.seed}, f, -1)
def prepare(config, train_labels_number):
    """Build the training triple for one fold.

    Args:
        config: parsed configuration object
        train_labels_number: list, per-class sample counts for this fold,
            i.e. [number_class0, number_class1, ...]

    Returns:
        optimizer: the optimizer
        model: the (possibly DataParallel-wrapped, CUDA) model
        criterion: the loss function
    """
    # Model construction is delegated to the project's model factory.
    builder = PrepareModel()
    model = builder.create_model(
        model_type=config.model_type,
        classes_num=config.num_classes,
        drop_rate=config.drop_rate,
        pretrained=True,
        bn_to_gn=config.bn_to_gn,
    )
    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model).cuda()

    # Optimizer for the wrapped model.
    optimizer = builder.create_optimizer(config.model_type, model, config)

    # Loss function (class-balanced parameters come from the fold's counts).
    criterion = Loss(
        config.model_type,
        config.loss_name,
        config.num_classes,
        train_labels_number,
        config.beta_CB,
        config.gamma,
    )
    return optimizer, model, criterion
def __init__(self, config, fold):
    """Set up fine-tuning of a local-attention model from the latest checkpoint.

    Args:
        config: parsed configuration object
        fold: index of the current cross-validation fold
    """
    self.config = config
    self.fold = fold
    self.epoch = config.epoch
    self.num_classes = config.num_classes
    self.lr_scheduler = config.lr_scheduler
    print('USE LOSS: {}'.format(config.loss_name))
    # Build the local-attention model.
    prepare_model = PrepareModel()
    self.model = prepare_model.create_local_attention_model(
        model_type=config.model_type,
        classes_num=self.num_classes,
        last_stride=2,
        droprate=0)
    # Locate the most recently modified run directory under checkpoints/<model_type>.
    weight_path = os.path.join('checkpoints', config.model_type)
    lists = os.listdir(weight_path)  # every run directory/file in the folder
    lists.sort(
        key=lambda fn: os.path.getmtime(weight_path + '/' + fn))  # sort by mtime, oldest first
    weight_path = os.path.join(weight_path, lists[-1], 'model_best.pth')
    # Load the previously trained weights, keeping only the keys that also
    # exist in the new model's state dict (partial restore).
    pretrained_dict = torch.load(weight_path)['state_dict']
    model_dict = self.model.state_dict()
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k in model_dict
    }  # filter out unnecessary keys
    model_dict.update(pretrained_dict)
    self.model.load_state_dict(model_dict)
    print('Successfully Loaded from %s' % weight_path)
    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model)
        self.model = self.model.cuda()
    # Optimizer.
    self.optimizer = prepare_model.create_optimizer(
        config.model_type, self.model, config)
    # Learning-rate decay policy.
    self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
        self.lr_scheduler,
        self.optimizer,
        step_size=config.lr_step_size,
        restart_step=config.restart_step,
        multi_step=config.multi_step)
    # Loss function.
    self.criterion = Loss(config.model_type, config.loss_name, self.num_classes)
    # Solver wraps forward/backward/checkpoint helpers.
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.solver = Solver(self.model, self.device)
    # Logging (tensorboard writer + timestamp for this run).
    self.writer, self.time_stamp = self.init_log()
    self.model_path = os.path.join(self.config.save_path, self.config.model_type, self.time_stamp)
    # Classification metrics; class names come from the label-id mapping file.
    with open("online-service/model/label_id_name.json", 'r', encoding='utf-8') as json_file:
        self.class_names = list(json.load(json_file).values())
    self.classification_metric = ClassificationMetric(
        self.class_names, self.model_path)
    # Best validation overall accuracy seen so far.
    self.max_accuracy_valid = 0
class TrainVal:
    """Train/validate a local-attention model resumed from the latest checkpoint."""

    def __init__(self, config, fold):
        """Set up model, optimizer, scheduler, loss, solver, logging and metrics.

        Args:
            config: parsed configuration object
            fold: index of the current cross-validation fold
        """
        self.config = config
        self.fold = fold
        self.epoch = config.epoch
        self.num_classes = config.num_classes
        self.lr_scheduler = config.lr_scheduler
        print('USE LOSS: {}'.format(config.loss_name))
        # Build the local-attention model.
        prepare_model = PrepareModel()
        self.model = prepare_model.create_local_attention_model(
            model_type=config.model_type,
            classes_num=self.num_classes,
            last_stride=2,
            droprate=0)
        # Locate the most recently modified run under checkpoints/<model_type>.
        weight_path = os.path.join('checkpoints', config.model_type)
        lists = os.listdir(weight_path)  # every entry in the folder
        lists.sort(
            key=lambda fn: os.path.getmtime(weight_path + '/' + fn))  # sort by mtime
        weight_path = os.path.join(weight_path, lists[-1], 'model_best.pth')
        # Partial restore: keep only the keys that exist in the new model.
        pretrained_dict = torch.load(weight_path)['state_dict']
        model_dict = self.model.state_dict()
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }  # filter out unnecessary keys
        model_dict.update(pretrained_dict)
        self.model.load_state_dict(model_dict)
        print('Successfully Loaded from %s' % weight_path)
        if torch.cuda.is_available():
            self.model = torch.nn.DataParallel(self.model)
            self.model = self.model.cuda()
        # Optimizer.
        self.optimizer = prepare_model.create_optimizer(
            config.model_type, self.model, config)
        # Learning-rate decay policy.
        self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
            self.lr_scheduler,
            self.optimizer,
            step_size=config.lr_step_size,
            restart_step=config.restart_step,
            multi_step=config.multi_step)
        # Loss function.
        self.criterion = Loss(config.model_type, config.loss_name, self.num_classes)
        # Solver wraps forward/backward/checkpoint helpers.
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.solver = Solver(self.model, self.device)
        # Logging (tensorboard writer + timestamp for this run).
        self.writer, self.time_stamp = self.init_log()
        self.model_path = os.path.join(self.config.save_path, self.config.model_type, self.time_stamp)
        # Classification metrics; class names come from the label-id mapping file.
        with open("online-service/model/label_id_name.json", 'r', encoding='utf-8') as json_file:
            self.class_names = list(json.load(json_file).values())
        self.classification_metric = ClassificationMetric(
            self.class_names, self.model_path)
        # Best validation overall accuracy seen so far.
        self.max_accuracy_valid = 0

    def train(self, train_loader, valid_loader):
        """Run the full training loop, saving checkpoints and tensorboard logs.

        Args:
            train_loader: DataLoader of training data, yields (images, labels)
            valid_loader: DataLoader of validation data
        """
        global_step = 0
        # NOTE(review): this local is saved below as 'max_score' but never
        # updated; the running best lives in self.max_accuracy_valid — looks
        # like the checkpoint always records 0. Confirm and fix upstream.
        max_accuracy_valid = 0
        for epoch in range(self.epoch):
            self.model.train()
            epoch += 1  # report epochs 1-based
            images_number, epoch_corrects = 0, 0
            tbar = tqdm.tqdm(train_loader)
            for i, (images, labels) in enumerate(tbar):
                # Forward and backward pass through the solver.
                labels_predict = self.solver.forward(images)
                loss = self.solver.cal_loss(labels_predict, labels, self.criterion)
                self.solver.backword(self.optimizer, loss)
                images_number += images.size(0)
                epoch_corrects += self.model.module.get_classify_result(
                    labels_predict, labels, self.device).sum()
                train_acc_iteration = self.model.module.get_classify_result(
                    labels_predict, labels, self.device).mean()
                # Per-iteration tensorboard logging.
                descript = self.criterion.record_loss_iteration(
                    self.writer.add_scalar, global_step + i)
                self.writer.add_scalar('TrainAccIteration', train_acc_iteration, global_step + i)
                # Show the learning rate of every optimizer param group.
                params_groups_lr = str()
                for group_ind, param_group in enumerate(
                        self.optimizer.param_groups):
                    params_groups_lr = params_groups_lr + 'params_group_%d' % group_ind + ': %.12f, ' % param_group[
                        'lr']
                descript = '[Train Fold {}][epoch: {}/{}][Lr :{}][Acc: {:.4f}]'.format(
                    self.fold, epoch, self.epoch, params_groups_lr,
                    train_acc_iteration) + descript
                tbar.set_description(desc=descript)
            # Per-epoch tensorboard logging.
            epoch_acc = epoch_corrects / images_number
            self.writer.add_scalar('TrainAccEpoch', epoch_acc, epoch)
            self.writer.add_scalar('Lr', self.optimizer.param_groups[0]['lr'], epoch)
            descript = self.criterion.record_loss_epoch(
                len(train_loader), self.writer.add_scalar, epoch)
            # Print the log info
            print('[Finish epoch: {}/{}][Average Acc: {:.4}]'.format(
                epoch, self.epoch, epoch_acc) + descript)
            # Validate the model.
            val_accuracy, val_loss, is_best = self.validation(valid_loader)
            # Save parameters (is_best additionally writes model_best.pth).
            state = {
                'epoch': epoch,
                'state_dict': self.model.module.state_dict(),
                'max_score': max_accuracy_valid
            }
            self.solver.save_checkpoint(
                os.path.join(
                    self.model_path, '%s_fold%d.pth' %
                    (self.config.model_type, self.fold)), state, is_best)
            # Log validation results.
            self.writer.add_scalar('ValidLoss', val_loss, epoch)
            self.writer.add_scalar('ValidAccuracy', val_accuracy, epoch)
            # Step the LR schedule once per epoch; ReduceLR needs the metric.
            if self.lr_scheduler == 'ReduceLR':
                self.exp_lr_scheduler.step(val_loss)
            else:
                self.exp_lr_scheduler.step()
            global_step += len(train_loader)

    def validation(self, valid_loader):
        """Evaluate on valid_loader; returns (overall_accuracy, mean_loss, is_best)."""
        tbar = tqdm.tqdm(valid_loader)
        self.model.eval()
        labels_predict_all, labels_all = np.empty(shape=(0, )), np.empty(
            shape=(0, ))
        # NOTE(review): loss accumulates torch tensors, so the returned mean
        # is presumably a 0-d tensor, not a float — confirm callers accept that.
        epoch_loss = 0
        with torch.no_grad():
            for i, (_, images, labels) in enumerate(tbar):
                # Forward pass only.
                labels_predict = self.solver.forward(images)
                loss = self.solver.cal_loss(labels_predict, labels,
                                            self.criterion)
                epoch_loss += loss
                # softmax then argmax to get hard predictions.
                labels_predict = F.softmax(labels_predict, dim=1)
                labels_predict = torch.argmax(labels_predict,
                                              dim=1).detach().cpu().numpy()
                labels_predict_all = np.concatenate(
                    (labels_predict_all, labels_predict))
                labels_all = np.concatenate((labels_all, labels))
                descript = '[Valid][Loss: {:.4f}]'.format(loss)
                tbar.set_description(desc=descript)
        classify_report, my_confusion_matrix, acc_for_each_class, oa, average_accuracy, kappa = \
            self.classification_metric.get_metric(
                labels_all,
                labels_predict_all
            )
        # Track the best overall accuracy and persist its report/plots.
        if oa > self.max_accuracy_valid:
            is_best = True
            self.max_accuracy_valid = oa
            self.classification_metric.draw_cm_and_save_result(
                classify_report, my_confusion_matrix, acc_for_each_class, oa,
                average_accuracy, kappa)
        else:
            is_best = False
        print('OA:{}, AA:{}, Kappa:{}'.format(oa, average_accuracy, kappa))
        return oa, epoch_loss / len(tbar), is_best

    def init_log(self):
        """Create the run's log dir, dump config + seed, return (writer, timestamp)."""
        TIMESTAMP = "log-{0:%Y-%m-%dT%H-%M-%S}-localAtt".format(
            datetime.datetime.now())
        log_dir = os.path.join(self.config.save_path, self.config.model_type,
                               TIMESTAMP)
        # SummaryWriter creates log_dir as a side effect, so the writes below can succeed.
        writer = SummaryWriter(log_dir=log_dir)
        with codecs.open(os.path.join(log_dir, 'config.json'), 'w', "utf-8") as json_file:
            # NOTE(review): this reads the module-global `config`, not
            # self.config — works only when a global exists; verify intent.
            json.dump({k: v for k, v in config._get_kwargs()}, json_file, ensure_ascii=False)
        # Persist the seed used for this run so it can be reproduced.
        seed = int(time.time())
        seed_torch(seed)
        with open(os.path.join(log_dir, 'seed.pkl'), 'wb') as f:
            pickle.dump({'seed': seed}, f, -1)
        return writer, TIMESTAMP
class TrainBaseline(object):
    """Baseline training loop (no validation) for a re-ID style model."""

    def __init__(self, config, num_classes, train_triplet=False):
        """Set up a baseline training run: model, loss, optimizer, scheduler, logging.

        :param config: parsed configuration object (argparse-style Namespace)
        :param num_classes: number of classes in the training set; int
        :param train_triplet: when True, resume from the existing checkpoint
                              and train only the triplet loss; bool
        """
        self.num_classes = num_classes
        self.model_name = config.model_name
        self.last_stride = config.last_stride
        self.num_gpus = torch.cuda.device_count()
        print('Using {} GPUS'.format(self.num_gpus))
        print('NUM_CLASS: {}'.format(self.num_classes))
        print('USE LOSS: {}'.format(config.selected_loss))
        # Build the model; with any GPU wrap in DataParallel, and with more
        # than one GPU additionally convert BatchNorm to synchronized BN.
        self.model = get_model(self.model_name, self.num_classes, self.last_stride)
        if torch.cuda.is_available():
            self.model = torch.nn.DataParallel(self.model)
            if self.num_gpus > 1:
                self.model = convert_model(self.model)
            self.model = self.model.cuda()
        # Hyper-parameters.
        self.epoch = config.epoch
        # Solver wraps forward/backward/checkpoint helpers.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.solver = Solver(self.model, self.device)
        # Loss function.
        self.criterion = Loss(self.model_name, config.selected_loss, config.margin, self.num_classes)
        # Optimizer.
        self.optim = get_optimizer(config, self.model)
        # Learning-rate decay policy.
        self.scheduler = get_scheduler(config, self.optim)
        # Directory where checkpoints are written.
        self.model_path = os.path.join(config.save_path, config.model_name)
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        # When training only the triplet loss, resume from the saved weights.
        if train_triplet:
            self.solver.load_checkpoint(os.path.join(self.model_path, '{}.pth'.format(self.model_name)))
        # Dump config as JSON and initialise tensorboard under a timestamped dir.
        TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.datetime.now())
        self.writer = SummaryWriter(log_dir=os.path.join(self.model_path, TIMESTAMP))
        with codecs.open(self.model_path + '/' + TIMESTAMP + '.json', 'w', "utf-8") as json_file:
            json.dump({k: v for k, v in config._get_kwargs()}, json_file, ensure_ascii=False)
        # Fix and persist the random seed so cross-validation splits stay reproducible.
        self.seed = int(time.time())
        seed_torch(self.seed)
        with open(self.model_path + '/' + TIMESTAMP + '.pkl', 'wb') as f:
            pickle.dump({'seed': self.seed}, f, -1)

    def train(self, train_loader):
        """Run the full training loop, saving the model and logs every epoch.

        :param train_loader: DataLoader of training data, yields (images, labels)
        :return: None
        """
        global_step = 0
        for epoch in range(self.epoch):
            epoch += 1  # report epochs 1-based
            self.model.train()
            images_number, epoch_corrects, index = 0, 0, 0
            tbar = tqdm.tqdm(train_loader)
            for index, (images, labels) in enumerate(tbar):
                # Forward and backward pass; note the solver receives the
                # (images, labels) pair here, unlike the images-only TrainVal loops.
                outputs = self.solver.forward((images, labels))
                loss = self.solver.cal_loss(outputs, labels, self.criterion)
                self.solver.backword(self.optim, loss)
                images_number += images.size(0)
                epoch_corrects += self.model.module.get_classify_result(outputs, labels, self.device).sum()
                train_acc_iteration = self.model.module.get_classify_result(outputs, labels, self.device).mean() * 100
                # Per-iteration tensorboard logging.
                global_step += 1
                descript = self.criterion.record_loss_iteration(self.writer.add_scalar, global_step)
                self.writer.add_scalar('TrainAccIteration', train_acc_iteration, global_step)
                # NOTE(review): get_lr()[1] assumes the optimizer has at least
                # two param groups, and get_lr() is deprecated in newer torch
                # (get_last_lr) — confirm against the scheduler in use.
                descript = '[Train][epoch: {}/{}][Lr :{:.7f}][Acc: {:.2f}]'.format(epoch, self.epoch,
                                                                                   self.scheduler.get_lr()[1],
                                                                                   train_acc_iteration) + descript
                tbar.set_description(desc=descript)
            # Step the LR schedule once per epoch.
            self.scheduler.step()
            # Per-epoch tensorboard logging.
            epoch_acc = epoch_corrects / images_number * 100
            self.writer.add_scalar('TrainAccEpoch', epoch_acc, epoch)
            self.writer.add_scalar('Lr', self.scheduler.get_lr()[1], epoch)
            # NOTE(review): `index` here is the LAST batch index (len-1); the
            # sibling TrainVal class passes len(train_loader) — verify which
            # count record_loss_epoch expects.
            descript = self.criterion.record_loss_epoch(index, self.writer.add_scalar, epoch)
            # Print the log info
            print('[Finish epoch: {}/{}][Average Acc: {:.2}]'.format(epoch, self.epoch, epoch_acc) + descript)
            state = {
                'epoch': epoch,
                'state_dict': self.model.module.state_dict(),
            }
            self.solver.save_checkpoint(
                os.path.join(self.model_path, '{}.pth'.format(self.model_name)), state, False)
def __init__(self, config, fold):
    """Set up training with optional cutmix, multi-scale, sparsity and L1 regularisation.

    Args:
        config: parsed configuration object
        fold: index of the current cross-validation fold
    """
    self.config = config
    self.fold = fold
    self.epoch = config.epoch
    self.num_classes = config.num_classes
    self.lr_scheduler = config.lr_scheduler
    # Extra full checkpoint every N epochs.
    self.save_interval = 10
    self.cut_mix = config.cut_mix
    self.beta = config.beta
    self.cutmix_prob = config.cutmix_prob
    self.auto_aug = config.auto_aug
    # Multi-scale training options.
    self.image_size = config.image_size
    self.multi_scale = config.multi_scale
    self.val_multi_scale = config.val_multi_scale
    self.multi_scale_size = config.multi_scale_size
    self.multi_scale_interval = config.multi_scale_interval
    # Sparsity-training options.
    self.sparsity = config.sparsity
    self.sparsity_scale = config.sparsity_scale
    self.penalty_type = config.penalty_type
    self.selected_labels = config.selected_labels
    if self.auto_aug:
        print('@ Using AutoAugment.')
    if self.cut_mix:
        print('@ Using cut mix.')
    if self.multi_scale:
        print('@ Using multi scale training.')
    print('@ Using LOSS: {}'.format(config.loss_name))
    # Build the model.
    prepare_model = PrepareModel()
    self.model = prepare_model.create_model(model_type=config.model_type,
                                            classes_num=self.num_classes,
                                            drop_rate=config.drop_rate,
                                            pretrained=True,
                                            bn_to_gn=config.bn_to_gn)
    # Optionally warm-start from an explicit weight file.
    if config.weight_path:
        self.model = prepare_model.load_chekpoint(self.model, config.weight_path)
    # Sparsity training (BN-scale penalty applied during backward).
    self.sparsity_train = None
    if config.sparsity:
        print('@ Using sparsity training.')
        self.sparsity_train = Sparsity(self.model, sparsity_scale=self.sparsity_scale,
                                       penalty_type=self.penalty_type)
    # L1 regularisation of model weights.
    self.l1_regular = config.l1_regular
    self.l1_decay = config.l1_decay
    if self.l1_regular:
        print('@ Using l1_regular')
        self.l1_reg_loss = Regularization(self.model, weight_decay=self.l1_decay, p=1)
    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model)
        self.model = self.model.cuda()
    # Optimizer.
    self.optimizer = prepare_model.create_optimizer(
        config.model_type, self.model, config)
    # Learning-rate decay policy (with optional warmup/delay).
    self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
        self.lr_scheduler,
        self.optimizer,
        step_size=config.lr_step_size,
        restart_step=config.restart_step,
        multi_step=config.multi_step,
        warmup=config.warmup,
        multiplier=config.multiplier,
        warmup_epoch=config.warmup_epoch,
        delay_epoch=config.delay_epoch)
    # Loss function.
    self.criterion = Loss(config.model_type, config.loss_name, self.num_classes)
    # Solver wraps forward/backward/checkpoint helpers.
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.solver = Solver(self.model, self.device)
    # Logging (tensorboard writer + timestamp for this run).
    self.writer, self.time_stamp = self.init_log()
    self.model_path = os.path.join(self.config.save_path, self.config.model_type,
                                   self.time_stamp)
    # Classification metrics; class names come from the label-id mapping file.
    with open("online-service/model/label_id_name.json", 'r', encoding='utf-8') as json_file:
        self.class_names = list(json.load(json_file).values())
    self.classification_metric = ClassificationMetric(
        self.class_names, self.model_path)
    # Best validation overall accuracy seen so far.
    self.max_accuracy_valid = 0
class TrainVal:
    """Train/validate with optional cutmix, multi-scale, sparsity and L1 regularisation."""

    def __init__(self, config, fold):
        """Set up model, optimizer, scheduler, loss, solver, logging and metrics.

        Args:
            config: parsed configuration object
            fold: index of the current cross-validation fold
        """
        self.config = config
        self.fold = fold
        self.epoch = config.epoch
        self.num_classes = config.num_classes
        self.lr_scheduler = config.lr_scheduler
        # Extra full checkpoint every N epochs.
        self.save_interval = 10
        self.cut_mix = config.cut_mix
        self.beta = config.beta
        self.cutmix_prob = config.cutmix_prob
        self.auto_aug = config.auto_aug
        # Multi-scale training options.
        self.image_size = config.image_size
        self.multi_scale = config.multi_scale
        self.val_multi_scale = config.val_multi_scale
        self.multi_scale_size = config.multi_scale_size
        self.multi_scale_interval = config.multi_scale_interval
        # Sparsity-training options.
        self.sparsity = config.sparsity
        self.sparsity_scale = config.sparsity_scale
        self.penalty_type = config.penalty_type
        self.selected_labels = config.selected_labels
        if self.auto_aug:
            print('@ Using AutoAugment.')
        if self.cut_mix:
            print('@ Using cut mix.')
        if self.multi_scale:
            print('@ Using multi scale training.')
        print('@ Using LOSS: {}'.format(config.loss_name))
        # Build the model.
        prepare_model = PrepareModel()
        self.model = prepare_model.create_model(model_type=config.model_type,
                                                classes_num=self.num_classes,
                                                drop_rate=config.drop_rate,
                                                pretrained=True,
                                                bn_to_gn=config.bn_to_gn)
        # Optionally warm-start from an explicit weight file.
        if config.weight_path:
            self.model = prepare_model.load_chekpoint(self.model, config.weight_path)
        # Sparsity training (penalty applied during backward via the solver).
        self.sparsity_train = None
        if config.sparsity:
            print('@ Using sparsity training.')
            self.sparsity_train = Sparsity(self.model, sparsity_scale=self.sparsity_scale,
                                           penalty_type=self.penalty_type)
        # L1 regularisation of model weights.
        self.l1_regular = config.l1_regular
        self.l1_decay = config.l1_decay
        if self.l1_regular:
            print('@ Using l1_regular')
            self.l1_reg_loss = Regularization(self.model, weight_decay=self.l1_decay, p=1)
        if torch.cuda.is_available():
            self.model = torch.nn.DataParallel(self.model)
            self.model = self.model.cuda()
        # Optimizer.
        self.optimizer = prepare_model.create_optimizer(
            config.model_type, self.model, config)
        # Learning-rate decay policy (with optional warmup/delay).
        self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
            self.lr_scheduler,
            self.optimizer,
            step_size=config.lr_step_size,
            restart_step=config.restart_step,
            multi_step=config.multi_step,
            warmup=config.warmup,
            multiplier=config.multiplier,
            warmup_epoch=config.warmup_epoch,
            delay_epoch=config.delay_epoch)
        # Loss function.
        self.criterion = Loss(config.model_type, config.loss_name, self.num_classes)
        # Solver wraps forward/backward/checkpoint helpers.
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.solver = Solver(self.model, self.device)
        # Logging (tensorboard writer + timestamp for this run).
        self.writer, self.time_stamp = self.init_log()
        self.model_path = os.path.join(self.config.save_path, self.config.model_type,
                                       self.time_stamp)
        # Classification metrics; class names come from the label-id mapping file.
        with open("online-service/model/label_id_name.json", 'r', encoding='utf-8') as json_file:
            self.class_names = list(json.load(json_file).values())
        self.classification_metric = ClassificationMetric(
            self.class_names, self.model_path)
        # Best validation overall accuracy seen so far.
        self.max_accuracy_valid = 0

    def train(self, train_loader, valid_loader):
        """Run the full training loop, saving checkpoints and tensorboard logs.

        Args:
            train_loader: DataLoader of training data, yields (images, labels)
            valid_loader: DataLoader of validation data
        """
        global_step = 0
        for epoch in range(self.epoch):
            self.model.train()
            epoch += 1  # report epochs 1-based
            images_number, epoch_corrects = 0, 0
            tbar = tqdm.tqdm(train_loader)
            image_size = self.image_size
            l1_regular_loss = 0
            loss_with_l1_regular = 0
            for i, (images, labels) in enumerate(tbar):
                # Multi-scale: pick a new size every multi_scale_interval batches.
                if self.multi_scale:
                    if i % self.multi_scale_interval == 0:
                        image_size = random.choice(self.multi_scale_size)
                    images = multi_scale_transforms(image_size, images, auto_aug=self.auto_aug)
                if self.cut_mix:
                    # Apply cutmix with probability cutmix_prob (requires beta > 0).
                    r = np.random.rand(1)
                    if self.beta > 0 and r < self.cutmix_prob:
                        images, labels_a, labels_b, lam = generate_mixed_sample(
                            self.beta, images, labels)
                        labels_predict = self.solver.forward(images)
                        loss = self.solver.cal_loss_cutmix(
                            labels_predict, labels_a, labels_b, lam, self.criterion)
                    else:
                        # Plain forward pass.
                        labels_predict = self.solver.forward(images)
                        loss = self.solver.cal_loss(labels_predict, labels,
                                                    self.criterion)
                else:
                    # Plain forward pass.
                    labels_predict = self.solver.forward(images)
                    loss = self.solver.cal_loss(labels_predict, labels,
                                                self.criterion)
                # Optionally add the L1 penalty and track its running sums.
                if self.l1_regular:
                    current_l1_regular_loss = self.l1_reg_loss(self.model)
                    loss += current_l1_regular_loss
                    l1_regular_loss += current_l1_regular_loss.item()
                    loss_with_l1_regular += loss.item()
                self.solver.backword(self.optimizer, loss, sparsity=self.sparsity_train)
                images_number += images.size(0)
                epoch_corrects += self.model.module.get_classify_result(
                    labels_predict, labels, self.device).sum()
                train_acc_iteration = self.model.module.get_classify_result(
                    labels_predict, labels, self.device).mean()
                # Per-iteration tensorboard logging.
                descript = self.criterion.record_loss_iteration(
                    self.writer.add_scalar, global_step + i)
                self.writer.add_scalar('TrainAccIteration', train_acc_iteration,
                                       global_step + i)
                # Show the learning rate of every optimizer param group.
                params_groups_lr = str()
                for group_ind, param_group in enumerate(
                        self.optimizer.param_groups):
                    params_groups_lr = params_groups_lr + 'pg_%d' % group_ind + ': %.8f, ' % param_group[
                        'lr']
                descript = '[Train Fold {}][epoch: {}/{}][image_size: {}][Lr :{}][Acc: {:.4f}]'.format(
                    self.fold, epoch, self.epoch, image_size, params_groups_lr,
                    train_acc_iteration) + descript
                if self.l1_regular:
                    descript += '[L1RegularLoss: {:.4f}][Loss: {:.4f}]'.format(
                        current_l1_regular_loss.item(), loss.item())
                tbar.set_description(desc=descript)
            # Per-epoch tensorboard logging.
            epoch_acc = epoch_corrects / images_number
            self.writer.add_scalar('TrainAccEpoch', epoch_acc, epoch)
            self.writer.add_scalar('Lr', self.optimizer.param_groups[0]['lr'], epoch)
            if self.l1_regular:
                l1_regular_loss_epoch = l1_regular_loss / len(train_loader)
                loss_with_l1_regular_epoch = loss_with_l1_regular / len(
                    train_loader)
                self.writer.add_scalar('TrainL1RegularLoss', l1_regular_loss_epoch, epoch)
                self.writer.add_scalar('TrainLossWithL1Regular',
                                       loss_with_l1_regular_epoch, epoch)
            descript = self.criterion.record_loss_epoch(
                len(train_loader), self.writer.add_scalar, epoch)
            # Print the log info
            print('[Finish epoch: {}/{}][Average Acc: {:.4}]'.format(
                epoch, self.epoch, epoch_acc) + descript)
            # Validate the model (optionally averaging over scales).
            val_accuracy, val_loss, is_best = self.validation(
                valid_loader, self.val_multi_scale)
            # Save parameters (is_best additionally writes model_best.pth).
            state = {
                'epoch': epoch,
                'state_dict': self.model.module.state_dict(),
                'max_score': self.max_accuracy_valid
            }
            self.solver.save_checkpoint(
                os.path.join(
                    self.model_path, '%s_fold%d.pth' %
                    (self.config.model_type, self.fold)), state, is_best)
            # Periodic extra checkpoint every save_interval epochs.
            if epoch % self.save_interval == 0:
                self.solver.save_checkpoint(
                    os.path.join(
                        self.model_path, '%s_epoch%d_fold%d.pth' %
                        (self.config.model_type, epoch, self.fold)), state, False)
            # Log validation results.
            self.writer.add_scalar('ValidLoss', val_loss, epoch)
            self.writer.add_scalar('ValidAccuracy', val_accuracy, epoch)
            # Step the LR schedule once per epoch; ReduceLR needs the metric.
            if self.lr_scheduler == 'ReduceLR':
                self.exp_lr_scheduler.step(metrics=val_accuracy)
            else:
                self.exp_lr_scheduler.step()
            global_step += len(train_loader)
        print('BEST ACC:{}'.format(self.max_accuracy_valid))
        # Back up the best weights next to all runs of this model type.
        source_path = os.path.join(self.model_path, 'model_best.pth')
        # NOTE(review): shutil.copy requires the 'backup' directory (and
        # model_best.pth) to already exist — confirm both are guaranteed.
        target_path = os.path.join(self.config.save_path, self.config.model_type,
                                   'backup', 'model_best.pth')
        print('Copy %s to %s' % (source_path, target_path))
        shutil.copy(source_path, target_path)

    def validation(self, valid_loader, multi_scale=False):
        """Evaluate on valid_loader; returns (overall_accuracy, mean_loss, is_best).

        Args:
            valid_loader: DataLoader yielding (name, images, labels)
            multi_scale: when True, evaluate at every configured scale and
                report the mean overall accuracy across scales.
        """
        self.model.eval()
        labels_predict_all, labels_all = np.empty(shape=(0, )), np.empty(
            shape=(0, ))
        # NOTE(review): loss accumulates torch tensors, so the returned mean
        # is presumably a 0-d tensor, not a float — confirm callers accept that.
        epoch_loss = 0
        with torch.no_grad():
            if multi_scale:
                multi_oa = []
                for image_size in self.multi_scale_size:
                    tbar = tqdm.tqdm(valid_loader)
                    # Compute accuracy separately at each scale; predictions
                    # accumulate across scales into the shared arrays.
                    for i, (_, images, labels) in enumerate(tbar):
                        images = multi_scale_transforms(image_size, images, auto_aug=False)
                        # Forward pass only.
                        labels_predict = self.solver.forward(images)
                        loss = self.solver.cal_loss(labels_predict, labels,
                                                    self.criterion)
                        epoch_loss += loss
                        # softmax then argmax to get hard predictions.
                        labels_predict = F.softmax(labels_predict, dim=1)
                        labels_predict = torch.argmax(
                            labels_predict, dim=1).detach().cpu().numpy()
                        labels_predict_all = np.concatenate(
                            (labels_predict_all, labels_predict))
                        labels_all = np.concatenate((labels_all, labels))
                        descript = '[Valid][Loss: {:.4f}]'.format(loss)
                        tbar.set_description(desc=descript)
                    classify_report, my_confusion_matrix, acc_for_each_class, oa, average_accuracy, kappa = \
                        self.classification_metric.get_metric(
                            labels_all,
                            labels_predict_all
                        )
                    multi_oa.append(oa)
                # Report the mean overall accuracy over all scales.
                oa = np.asarray(multi_oa).mean()
            else:
                tbar = tqdm.tqdm(valid_loader)
                for i, (_, images, labels) in enumerate(tbar):
                    # Forward pass only.
                    labels_predict = self.solver.forward(images)
                    loss = self.solver.cal_loss(labels_predict, labels,
                                                self.criterion)
                    epoch_loss += loss
                    # softmax then argmax to get hard predictions.
                    labels_predict = F.softmax(labels_predict, dim=1)
                    labels_predict = torch.argmax(
                        labels_predict, dim=1).detach().cpu().numpy()
                    labels_predict_all = np.concatenate(
                        (labels_predict_all, labels_predict))
                    labels_all = np.concatenate((labels_all, labels))
                    descript = '[Valid][Loss: {:.4f}]'.format(loss)
                    tbar.set_description(desc=descript)
                classify_report, my_confusion_matrix, acc_for_each_class, oa, average_accuracy, kappa = \
                    self.classification_metric.get_metric(
                        labels_all,
                        labels_predict_all
                    )
        # Track the best overall accuracy and persist its report/plots.
        if oa > self.max_accuracy_valid:
            is_best = True
            self.max_accuracy_valid = oa
            if not self.selected_labels:
                # Draw the confusion matrix only when training on all classes;
                # with a label subset the plot would fail.
                self.classification_metric.draw_cm_and_save_result(
                    classify_report, my_confusion_matrix, acc_for_each_class,
                    oa, average_accuracy, kappa)
        else:
            is_best = False
        print('OA:{}, AA:{}, Kappa:{}'.format(oa, average_accuracy, kappa))
        return oa, epoch_loss / len(tbar), is_best

    def init_log(self):
        """Create the run's log dir, dump config + seed, return (writer, timestamp)."""
        TIMESTAMP = "log-{0:%Y-%m-%dT%H-%M-%S}".format(datetime.datetime.now())
        log_dir = os.path.join(self.config.save_path, self.config.model_type,
                               TIMESTAMP)
        # SummaryWriter creates log_dir as a side effect, so the writes below can succeed.
        writer = SummaryWriter(log_dir=log_dir)
        with codecs.open(os.path.join(log_dir, 'config.json'), 'w', "utf-8") as json_file:
            # NOTE(review): this reads the module-global `config`, not
            # self.config — works only when a global exists; verify intent.
            json.dump({k: v for k, v in config._get_kwargs()}, json_file, ensure_ascii=False)
        # Persist the seed used for this run so it can be reproduced.
        seed = int(time.time())
        seed_torch(seed)
        with open(os.path.join(log_dir, 'seed.pkl'), 'wb') as f:
            pickle.dump({'seed': seed}, f, -1)
        return writer, TIMESTAMP
def __init__(self, config, fold, train_labels_number):
    """Set up training with class-balanced loss and optional cutmix/multi-scale/restore.

    Args:
        config: parsed configuration object
        fold: int, index of the current cross-validation fold
        train_labels_number: list, per-class sample counts for this fold,
            i.e. [number_class0, number_class1, ...]
    """
    self.config = config
    self.fold = fold
    self.epoch = config.epoch
    self.num_classes = config.num_classes
    self.lr_scheduler = config.lr_scheduler
    # Extra full checkpoint every N epochs.
    self.save_interval = 100
    self.cut_mix = config.cut_mix
    self.beta = config.beta
    self.cutmix_prob = config.cutmix_prob
    # Multi-scale training options.
    self.image_size = config.image_size
    self.multi_scale = config.multi_scale
    self.multi_scale_size = config.multi_scale_size
    self.multi_scale_interval = config.multi_scale_interval
    if self.cut_mix:
        print('Using cut mix.')
    if self.multi_scale:
        print('Using multi scale training.')
    print('USE LOSS: {}'.format(config.loss_name))
    # Build the model.
    prepare_model = PrepareModel()
    self.model = prepare_model.create_model(model_type=config.model_type,
                                            classes_num=self.num_classes,
                                            drop_rate=config.drop_rate,
                                            pretrained=True,
                                            bn_to_gn=config.bn_to_gn)
    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model)
        self.model = self.model.cuda()
    # Optimizer.
    self.optimizer = prepare_model.create_optimizer(
        config.model_type, self.model, config)
    # Learning-rate decay policy.
    self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
        self.lr_scheduler,
        self.optimizer,
        step_size=config.lr_step_size,
        restart_step=config.restart_step,
        multi_step=config.multi_step)
    # Loss function (class-balanced parameters come from the fold's counts).
    self.criterion = Loss(config.model_type, config.loss_name, self.num_classes,
                          train_labels_number, config.beta_CB, config.gamma)
    # Solver wraps forward/backward/checkpoint helpers.
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.solver = Solver(self.model, self.device)
    # Optionally restore weights: 'last' picks the most recently modified run
    # under checkpoints/<model_type>; anything else names a run directory.
    if config.restore:
        weight_path = os.path.join('checkpoints', config.model_type)
        if config.restore == 'last':
            lists = os.listdir(weight_path)  # every entry in the folder
            lists.sort(key=lambda fn: os.path.getmtime(weight_path + '/' + fn))  # sort by mtime
            weight_path = os.path.join(weight_path, lists[-1], 'model_best.pth')
        else:
            weight_path = os.path.join(weight_path, config.restore, 'model_best.pth')
        self.solver.load_checkpoint(weight_path)
    # Logging (tensorboard writer + timestamp for this run).
    self.writer, self.time_stamp = self.init_log()
    self.model_path = os.path.join(self.config.train_url, self.config.model_type, self.time_stamp)
    # Classification metrics; class names come from the label-id mapping file.
    with open("online-service/model/label_id_name.json", 'r', encoding='utf-8') as json_file:
        self.class_names = list(json.load(json_file).values())
    self.classification_metric = ClassificationMetric(self.class_names, self.model_path, text_flag=0)
    # Best validation overall accuracy seen so far.
    self.max_accuracy_valid = 0
class TrainVal:
    """Train/validate one cross-validation fold of an image classifier.

    Supports cut-mix augmentation, multi-scale training, several LR
    schedules, tensorboard logging and best-checkpoint tracking.
    """

    def __init__(self, config, fold, train_labels_number):
        """
        Args:
            config: parsed configuration object (argparse-style namespace).
            fold: int, index of the current cross-validation fold.
            train_labels_number: list, per-class sample counts for this fold,
                i.e. [number_class0, number_class1, ...].
        """
        self.config = config
        self.fold = fold
        self.epoch = config.epoch
        self.num_classes = config.num_classes
        self.lr_scheduler = config.lr_scheduler
        # Checkpoint snapshot interval, in epochs.
        self.save_interval = 100
        self.cut_mix = config.cut_mix
        self.beta = config.beta
        self.cutmix_prob = config.cutmix_prob
        self.image_size = config.image_size
        self.multi_scale = config.multi_scale
        self.multi_scale_size = config.multi_scale_size
        self.multi_scale_interval = config.multi_scale_interval
        if self.cut_mix:
            print('Using cut mix.')
        if self.multi_scale:
            print('Using multi scale training.')
        print('USE LOSS: {}'.format(config.loss_name))

        # Load the model; wrap in DataParallel and move to GPU when available.
        prepare_model = PrepareModel()
        self.model = prepare_model.create_model(model_type=config.model_type,
                                                classes_num=self.num_classes,
                                                drop_rate=config.drop_rate,
                                                pretrained=True,
                                                bn_to_gn=config.bn_to_gn)
        if torch.cuda.is_available():
            self.model = torch.nn.DataParallel(self.model)
            self.model = self.model.cuda()

        # Optimizer and learning-rate decay strategy.
        self.optimizer = prepare_model.create_optimizer(
            config.model_type, self.model, config)
        self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
            self.lr_scheduler,
            self.optimizer,
            step_size=config.lr_step_size,
            restart_step=config.restart_step,
            multi_step=config.multi_step)

        # Loss function (per-class counts enable class-balanced weighting).
        self.criterion = Loss(config.model_type, config.loss_name,
                              self.num_classes, train_labels_number,
                              config.beta_CB, config.gamma)

        # Solver helper implementing forward/backward/checkpoint operations.
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.solver = Solver(self.model, self.device)

        # Optionally restore weights: 'last' picks the most recently
        # modified run directory; anything else is an explicit directory name.
        if config.restore:
            weight_path = os.path.join('checkpoints', config.model_type)
            if config.restore == 'last':
                lists = os.listdir(weight_path)
                lists.sort(key=lambda fn: os.path.getmtime(weight_path + '/' + fn))
                weight_path = os.path.join(weight_path, lists[-1], 'model_best.pth')
            else:
                weight_path = os.path.join(weight_path, config.restore, 'model_best.pth')
            self.solver.load_checkpoint(weight_path)

        # Logging: tensorboard writer + timestamped run folder.
        self.writer, self.time_stamp = self.init_log()
        self.model_path = os.path.join(self.config.train_url,
                                       self.config.model_type, self.time_stamp)

        # Classification metric helper, fed with human-readable class names.
        # NOTE(review): path is hard-coded relative to the working directory.
        with open("online-service/model/label_id_name.json", 'r', encoding='utf-8') as json_file:
            self.class_names = list(json.load(json_file).values())
        self.classification_metric = ClassificationMetric(self.class_names,
                                                          self.model_path,
                                                          text_flag=0)

        self.max_accuracy_valid = 0

    def train(self, train_loader, valid_loader):
        """Run the full training loop, saving checkpoints and logs.

        Args:
            train_loader: DataLoader over the training split.
            valid_loader: DataLoader over the validation split.
        """
        global_step = 0
        for epoch in range(self.epoch):
            self.model.train()
            epoch += 1  # report epochs 1-based
            images_number, epoch_corrects = 0, 0
            tbar = tqdm.tqdm(train_loader)
            image_size = self.image_size
            for i, (_, images, labels) in enumerate(tbar):
                if self.multi_scale:
                    # Re-draw the training resolution every
                    # multi_scale_interval iterations.
                    if i % self.multi_scale_interval == 0:
                        image_size = random.choice(self.multi_scale_size)
                    images = multi_scale_transforms(image_size, images)
                if self.cut_mix:
                    # Apply cut-mix with probability cutmix_prob.
                    r = np.random.rand(1)
                    if self.beta > 0 and r < self.cutmix_prob:
                        images, labels_a, labels_b, lam = generate_mixed_sample(
                            self.beta, images, labels)
                        labels_predict = self.solver.forward(images)
                        loss = self.solver.cal_loss_cutmix(
                            labels_predict, labels_a, labels_b, lam, self.criterion)
                    else:
                        # Plain forward pass.
                        labels_predict = self.solver.forward(images)
                        loss = self.solver.cal_loss(labels_predict, labels, self.criterion)
                else:
                    # Plain forward pass.
                    labels_predict = self.solver.forward(images)
                    loss = self.solver.cal_loss(labels_predict, labels, self.criterion)
                self.solver.backword(self.optimizer, loss)

                images_number += images.size(0)
                epoch_corrects += self.model.module.get_classify_result(
                    labels_predict, labels, self.device).sum()
                train_acc_iteration = self.model.module.get_classify_result(
                    labels_predict, labels, self.device).mean()

                # Per-iteration tensorboard logging.
                descript = self.criterion.record_loss_iteration(
                    self.writer.add_scalar, global_step + i)
                self.writer.add_scalar('TrainAccIteration', train_acc_iteration,
                                       global_step + i)

                params_groups_lr = str()
                for group_ind, param_group in enumerate(
                        self.optimizer.param_groups):
                    params_groups_lr = params_groups_lr + 'pg_%d' % group_ind + ': %.8f, ' % param_group[
                        'lr']

                descript = '[Train Fold {}][epoch: {}/{}][image_size: {}][Lr :{}][Acc: {:.4f}]'.format(
                    self.fold, epoch, self.epoch, image_size, params_groups_lr,
                    train_acc_iteration) + descript

                # CyclicLR steps once per iteration (other schedulers step
                # once per epoch, below).
                if self.lr_scheduler == 'CyclicLR':
                    self.exp_lr_scheduler.step()
                    # NOTE(review): indexes param_groups[1] — assumes the
                    # optimizer was built with at least two groups; confirm.
                    self.writer.add_scalar(
                        'Lr', self.optimizer.param_groups[1]['lr'], global_step + i)
                tbar.set_description(desc=descript)

            # Per-epoch tensorboard logging.
            epoch_acc = epoch_corrects / images_number
            self.writer.add_scalar('TrainAccEpoch', epoch_acc, epoch)
            if self.lr_scheduler != 'CyclicLR':
                self.writer.add_scalar('Lr',
                                       self.optimizer.param_groups[1]['lr'], epoch)
            descript = self.criterion.record_loss_epoch(
                len(train_loader), self.writer.add_scalar, epoch)

            # Print the log info
            print('[Finish epoch: {}/{}][Average Acc: {:.4}]'.format(
                epoch, self.epoch, epoch_acc) + descript)

            # Validate the model.
            val_accuracy, val_loss, is_best = self.validation(valid_loader)

            # Save checkpoint (and the best-so-far copy when is_best).
            state = {
                'epoch': epoch,
                'state_dict': self.model.module.state_dict(),
                'max_score': self.max_accuracy_valid
            }
            self.solver.save_checkpoint(
                os.path.join(
                    self.model_path,
                    '%s_fold%d.pth' % (self.config.model_type, self.fold)),
                state, is_best)
            if epoch % self.save_interval == 0:
                self.solver.save_checkpoint(
                    os.path.join(
                        self.model_path, '%s_epoch%d_fold%d.pth' %
                        (self.config.model_type, epoch, self.fold)), state, False)

            self.writer.add_scalar('ValidLoss', val_loss, epoch)
            self.writer.add_scalar('ValidAccuracy', val_accuracy, epoch)

            # Epoch-level LR decay (CyclicLR already stepped per iteration).
            if self.lr_scheduler == 'ReduceLR':
                self.exp_lr_scheduler.step(val_loss)
            elif self.lr_scheduler != 'CyclicLR':
                self.exp_lr_scheduler.step()
            global_step += len(train_loader)
        print('BEST ACC:{}'.format(self.max_accuracy_valid))

    def validation(self, valid_loader):
        """Evaluate on the validation split.

        Args:
            valid_loader: DataLoader over the validation split.

        Returns:
            (overall_accuracy, mean_loss, is_best) where is_best is True when
            this epoch improved on the best overall accuracy so far.
        """
        tbar = tqdm.tqdm(valid_loader)
        self.model.eval()
        labels_predict_all, labels_all = np.empty(shape=(0, )), np.empty(
            shape=(0, ))
        epoch_loss = 0
        with torch.no_grad():
            for i, (_, images, labels) in enumerate(tbar):
                labels_predict = self.solver.forward(images)
                loss = self.solver.cal_loss(labels_predict, labels,
                                            self.criterion)
                epoch_loss += loss

                # softmax then argmax to obtain the predicted class index.
                labels_predict = F.softmax(labels_predict, dim=1)
                labels_predict = torch.argmax(labels_predict,
                                              dim=1).detach().cpu().numpy()

                labels_predict_all = np.concatenate(
                    (labels_predict_all, labels_predict))
                labels_all = np.concatenate((labels_all, labels))

                descript = '[Valid][Loss: {:.4f}]'.format(loss)
                tbar.set_description(desc=descript)

        classify_report, my_confusion_matrix, acc_for_each_class, oa, average_accuracy, kappa = \
            self.classification_metric.get_metric(
                labels_all,
                labels_predict_all
            )

        if oa > self.max_accuracy_valid:
            is_best = True
            self.max_accuracy_valid = oa
            # Persist confusion matrix and per-class report for the new best.
            self.classification_metric.draw_cm_and_save_result(
                classify_report, my_confusion_matrix, acc_for_each_class, oa,
                average_accuracy, kappa)
        else:
            is_best = False

        print('OA:{}, AA:{}, Kappa:{}'.format(oa, average_accuracy, kappa))

        return oa, epoch_loss / len(tbar), is_best

    def init_log(self):
        """Create the tensorboard writer, dump the config and fix the seed.

        Returns:
            (writer, TIMESTAMP): the SummaryWriter and the timestamped run
            directory name.
        """
        TIMESTAMP = "log-{0:%Y-%m-%dT%H-%M-%S}".format(datetime.datetime.now())
        log_dir = os.path.join(self.config.train_url, self.config.model_type,
                               TIMESTAMP)
        writer = SummaryWriter(log_dir=log_dir)
        with codecs.open(os.path.join(log_dir, 'param.json'), 'w',
                         "utf-8") as json_file:
            # BUGFIX: was `config._get_kwargs()` — `config` is not a
            # parameter of this method, so it referenced a module-level
            # global (NameError if absent). Use the stored config instead.
            json.dump({k: v
                       for k, v in self.config._get_kwargs()},
                      json_file,
                      ensure_ascii=False)

        # Fix the random seed; keep it constant so the CV split is stable.
        seed = int(time.time())
        seed_torch(seed)
        with open(os.path.join(log_dir, 'seed.pkl'), 'wb') as f:
            pickle.dump({'seed': seed}, f, -1)
        return writer, TIMESTAMP
class TrainVal(object):
    """Train/validate a person re-identification model for one fold.

    Tracks rank-1 accuracy and mAP on a query/gallery split and keeps the
    checkpoint with the best average score.
    """

    def __init__(self, config, num_query, num_classes, num_valid_classes, fold, train_triplet=False):
        """
        :param config: configuration object (argparse-style namespace)
        :param num_query: number of query samples in this fold; int
        :param num_classes: number of training identities in this fold; int
        :param num_valid_classes: number of validation identities in this fold; int
        :param fold: which fold is being trained; int
        :param train_triplet: whether to fine-tune with triplet loss only
            (resumes from this fold's best checkpoint); bool
        """
        self.num_query = num_query
        self.num_classes = num_classes
        self.fold = fold
        self.model_name = config.model_name
        self.last_stride = config.last_stride
        self.dist = config.dist
        self.cython = config.cython
        self.num_gpus = torch.cuda.device_count()
        print('Using {} GPUS'.format(self.num_gpus))
        print('TRAIN_VALID_RATIO: {}'.format(self.num_classes/num_valid_classes))
        print('NUM_CLASS: {}'.format(self.num_classes))
        if self.cython:
            print('USE CYTHON TO EVAL!')
        print('USE LOSS: {}'.format(config.selected_loss))

        # Load the model; with any GPU use DataParallel, and with multiple
        # GPUs convert BatchNorm to synchronized BatchNorm.
        self.model = get_model(self.model_name, self.num_classes, self.last_stride)
        if torch.cuda.is_available():
            self.model = torch.nn.DataParallel(self.model)
            if self.num_gpus > 1:
                self.model = convert_model(self.model)
            self.model = self.model.cuda()

        # Hyper-parameters.
        self.epoch = config.epoch

        # Solver helper implementing forward/backward/checkpoint operations.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.solver = Solver(self.model, self.device)

        # Loss, optimizer and LR schedule.
        self.criterion = Loss(self.model_name, config.selected_loss, config.margin, self.num_classes)
        self.optim = get_optimizer(config, self.model)
        self.scheduler = get_scheduler(config, self.optim)

        # Directory for saved weights.
        self.model_path = os.path.join(config.save_path, config.model_name)
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)

        # When fine-tuning with triplet loss only, resume from this fold's
        # best checkpoint.
        if train_triplet:
            self.solver.load_checkpoint(
                os.path.join(self.model_path, '{}_fold{}_best.pth'.format(self.model_name, self.fold)))

        # Dump the config as JSON and initialize tensorboard.
        TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.datetime.now())
        self.writer = SummaryWriter(log_dir=os.path.join(self.model_path, TIMESTAMP))
        with codecs.open(self.model_path + '/' + TIMESTAMP + '.json', 'w', "utf-8") as json_file:
            json.dump({k: v for k, v in config._get_kwargs()}, json_file, ensure_ascii=False)

        # Fix the random seed; keep it constant so the CV split is stable.
        self.seed = int(time.time())
        seed_torch(self.seed)
        with open(self.model_path + '/' + TIMESTAMP + '.pkl', 'wb') as f:
            pickle.dump({'seed': self.seed}, f, -1)

        # Best 0.5*rank1 + 0.5*mAP seen so far.
        self.max_score = 0

    def train(self, train_loader, valid_loader):
        """Run the full training loop, saving checkpoints and logs.

        :param train_loader: DataLoader over the training split
        :param valid_loader: DataLoader over the validation split
        :return: None
        """
        global_step = 0
        for epoch in range(self.epoch):
            epoch += 1  # report epochs 1-based
            self.model.train()
            images_number, epoch_corrects, index = 0, 0, 0
            tbar = tqdm.tqdm(train_loader)
            for index, (images, labels) in enumerate(tbar):
                # Forward and backward pass.
                outputs = self.solver.forward((images, labels))
                loss = self.solver.cal_loss(outputs, labels, self.criterion)
                self.solver.backword(self.optim, loss)

                images_number += images.size(0)
                epoch_corrects += self.model.module.get_classify_result(outputs, labels, self.device).sum()
                train_acc_iteration = self.model.module.get_classify_result(outputs, labels, self.device).mean() * 100

                # Per-iteration tensorboard logging.
                global_step += 1
                descript = self.criterion.record_loss_iteration(self.writer.add_scalar, global_step)
                self.writer.add_scalar('TrainAccIteration', train_acc_iteration, global_step)
                # NOTE(review): get_lr()[1] assumes at least two parameter
                # groups in the optimizer; confirm against get_optimizer.
                descript = '[Train][epoch: {}/{}][Lr :{:.7f}][Acc: {:.2f}]'.format(
                    epoch, self.epoch, self.scheduler.get_lr()[1], train_acc_iteration) + descript
                tbar.set_description(desc=descript)

            # LR decay once per epoch.
            self.scheduler.step()

            # Per-epoch tensorboard logging.
            epoch_acc = epoch_corrects / images_number * 100
            self.writer.add_scalar('TrainAccEpoch', epoch_acc, epoch)
            self.writer.add_scalar('Lr', self.scheduler.get_lr()[1], epoch)
            descript = self.criterion.record_loss_epoch(index, self.writer.add_scalar, epoch)

            # Print the log info
            print('[Finish epoch: {}/{}][Average Acc: {:.2}]'.format(epoch, self.epoch, epoch_acc) + descript)

            # Validate and checkpoint (best kept separately by save_checkpoint).
            rank1, mAP, average_score = self.validation(valid_loader)
            if average_score > self.max_score:
                is_best = True
                self.max_score = average_score
            else:
                is_best = False
            state = {
                'epoch': epoch,
                'state_dict': self.model.module.state_dict(),
                'max_score': self.max_score
            }
            self.solver.save_checkpoint(
                os.path.join(self.model_path, '{}_fold{}.pth'.format(self.model_name, self.fold)), state, is_best)
            self.writer.add_scalar('Rank1', rank1, epoch)
            self.writer.add_scalar('MAP', mAP, epoch)
            self.writer.add_scalar('AverageScore', average_score, epoch)

    def validation(self, valid_loader):
        """Evaluate retrieval quality on the query/gallery validation split.

        The first ``self.num_query`` samples of the loader are the query set,
        the rest form the gallery.

        :param valid_loader: DataLoader over the validation split
        :return rank1: rank-1 precision; float
        :return mAP: mean average precision; float
        :return average_score: 0.5 * rank1 + 0.5 * mAP; float
        """
        self.model.eval()
        tbar = tqdm.tqdm(valid_loader)
        features_all, labels_all = [], []
        with torch.no_grad():
            for i, (images, labels, paths) in enumerate(tbar):
                # Extract features with test-time augmentation.
                # features = self.solver.forward((images, labels))[-1]
                features = self.solver.tta((images, labels))
                features_all.append(features.detach().cpu())
                labels_all.append(labels)

        features_all = torch.cat(features_all, dim=0)
        labels_all = torch.cat(labels_all, dim=0)

        query_features = features_all[:self.num_query]
        query_labels = labels_all[:self.num_query]

        gallery_features = features_all[self.num_query:]
        gallery_labels = labels_all[self.num_query:]

        if self.dist == 're_rank':
            distmat = re_rank(query_features, gallery_features)
        elif self.dist == 'cos_dist':
            distmat = cos_dist(query_features, gallery_features)
        elif self.dist == 'euclidean_dist':
            distmat = euclidean_dist(query_features, gallery_features)
        else:
            # BUGFIX: was `assert "Not implemented :{}".format(...)`, which
            # asserts a non-empty (always truthy) string and never fires,
            # leaving `distmat` unbound. Raise explicitly instead.
            raise NotImplementedError("Not implemented :{}".format(self.dist))

        all_rank_precison, mAP, _ = eval_func(distmat, query_labels.numpy(), gallery_labels.numpy(),
                                              use_cython=self.cython)

        rank1 = all_rank_precison[0]
        average_score = 0.5 * rank1 + 0.5 * mAP
        print('Rank1: {:.2%}, mAP {:.2%}, average score {:.2%}'.format(rank1, mAP, average_score))

        return rank1, mAP, average_score
def __init__(self, config, fold, train_labels_number):
    """Build everything needed to train one fold on ModelArts.

    Same wiring as the local variant, plus copying pretrained backbone
    weights from OBS (via ``mox``) into the local torch cache.

    Args:
        config: parsed configuration object (argparse-style namespace).
        fold: int, index of the current cross-validation fold.
        train_labels_number: list, per-class sample counts for this fold,
            i.e. [number_class0, number_class1, ...].
    """
    self.config = config
    self.fold = fold
    self.epoch = config.epoch
    self.num_classes = config.num_classes
    self.lr_scheduler = config.lr_scheduler
    self.cut_mix = config.cut_mix
    self.beta = config.beta
    self.cutmix_prob = config.cutmix_prob
    self.train_url = config.train_url
    self.bucket_name = config.bucket_name
    self.image_size = config.image_size
    self.multi_scale = config.multi_scale
    self.multi_scale_size = config.multi_scale_size
    self.multi_scale_interval = config.multi_scale_interval
    if self.cut_mix:
        print('Using cut mix.')
    if self.multi_scale:
        print('Using multi scale training.')
    print('USE LOSS: {}'.format(config.loss_name))

    # Ensure pretrained backbone weights exist in the local torch cache,
    # copying each file from OBS once if missing.
    # REFACTOR: the original duplicated this stanza verbatim per file;
    # collapsed into a single loop over the weight filenames.
    cache_dir = '/home/work/.cache/torch/checkpoints/'
    for weight_name in ('se_resnext101_32x4d-3b2fe3d8.pth',
                        'efficientnet-b5-b6417697.pth'):
        print("=> using pre-trained model '{}'".format(config.model_type))
        local_path = cache_dir + weight_name
        if not mox.file.exists(local_path):
            mox.file.copy(
                os.path.join(self.bucket_name, 'model_zoo/' + weight_name),
                local_path)
            print('copy pre-trained model from OBS to: %s success' %
                  (os.path.abspath(local_path)))
        else:
            print('use exist pre-trained model at: %s' %
                  (os.path.abspath(local_path)))

    # Load the model; always wrapped in DataParallel and moved to GPU
    # (this variant assumes CUDA is available on ModelArts).
    prepare_model = PrepareModel()
    self.model = prepare_model.create_model(model_type=config.model_type,
                                            classes_num=self.num_classes,
                                            drop_rate=config.drop_rate,
                                            pretrained=True,
                                            bn_to_gn=config.bn_to_gn)
    self.model = torch.nn.DataParallel(self.model).cuda()

    # Optimizer and learning-rate decay strategy.
    self.optimizer = prepare_model.create_optimizer(
        config.model_type, self.model, config)
    self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
        self.lr_scheduler,
        self.optimizer,
        step_size=config.lr_step_size,
        restart_step=config.restart_step,
        multi_step=config.multi_step)

    # Loss function (per-class counts enable class-balanced weighting).
    self.criterion = Loss(config.model_type, config.loss_name,
                          self.num_classes, train_labels_number,
                          config.beta_CB, config.gamma)

    # Solver helper implementing forward/backward/checkpoint operations.
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.solver = Solver(self.model, self.device)

    # Optionally restore weights: 'last' picks the most recently modified
    # run directory; anything else is an explicit directory name.
    if config.restore:
        weight_path = os.path.join('checkpoints', config.model_type)
        if config.restore == 'last':
            lists = os.listdir(weight_path)
            lists.sort(key=lambda fn: os.path.getmtime(weight_path + '/' + fn))
            weight_path = os.path.join(weight_path, lists[-1], 'model_best.pth')
        else:
            weight_path = os.path.join(weight_path, config.restore, 'model_best.pth')
        self.solver.load_checkpoint(weight_path)

    # Logging: tensorboard writer + timestamped run folder (local path).
    self.writer, self.time_stamp = self.init_log()
    self.model_path = os.path.join(self.config.train_local,
                                   self.config.model_type, self.time_stamp)

    # Classification metric helper, fed with human-readable class names.
    with open(config.local_data_root + 'label_id_name.json', 'r', encoding='utf-8') as json_file:
        self.class_names = list(json.load(json_file).values())
    self.classification_metric = ClassificationMetric(self.class_names,
                                                      self.model_path,
                                                      text_flag=0)

    # Best validation overall-accuracy seen so far.
    self.max_accuracy_valid = 0