def train(net, loss_func, optimizer, train_data, valid_data, n_class, device, model_name, epochs=20):
    """
    Full training loop.
    :param net: the network
    :param loss_func: loss function
    :param optimizer: optimizer
    :param train_data: train data set
    :param valid_data: valid data set
    :param n_class: number of classes
    :param device: torch.device, CPU or GPU
    :param model_name: name used when saving the model weights
    :param epochs: number of epochs to train
    :return:
    """
    for e in range(1, epochs + 1):
        get_logger().info('Epoch: {:02d}'.format(e))
        # Train for one epoch
        _epoch_train(net, loss_func, optimizer, train_data, n_class, device, e)
        # Save the weights after every epoch
        save_dir = save_weight(net, model_name, e)
        get_logger().info(save_dir)  # log the save location
        # Validate for one epoch
        _epoch_valid(net, loss_func, valid_data, n_class, device, e)
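save_weight() is called every epoch but is not defined in this section; below is a minimal sketch of a compatible helper, assuming it stores a plain state_dict. The weights/ directory and file-name pattern are illustrative, not the project's actual layout.

import os
import torch

def save_weight(model, model_name, epoch, weight_dir='weights'):
    # Hypothetical helper: persist the state_dict and return the path, which train() logs.
    os.makedirs(weight_dir, exist_ok=True)
    save_path = os.path.join(weight_dir, '{}-epoch-{:02d}.pth'.format(model_name, epoch))
    torch.save(model.state_dict(), save_path)
    return save_path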
def _epoch_valid(net, loss_func, data, n_class, device, i_epoch):
    """
    Validate for one epoch.
    :param net: the network
    :param loss_func: loss function
    :param data: valid data set
    :param n_class: number of classes
    :param device: torch.device, CPU or GPU
    :param i_epoch: epoch index, used in the progress bar description
    :return: loss, miou
    """
    net.to(device)
    net.eval()  # validation mode
    total_loss = 0.  # accumulated loss over the validation epoch
    total_cm = np.zeros((n_class, n_class))  # ndarray, accumulated confusion matrix
    total_batch_miou = 0.
    with torch.no_grad():  # no gradients needed during validation, saves memory
        bar_format = '{desc}{postfix}|{n_fmt}/{total_fmt}|{percentage:3.0f}%|{bar}|{elapsed}<{remaining}'
        tqdm_data = tqdm(data,
                         ncols=120,  # 120-column bar; must be set on Linux, otherwise the terminal default of 80 is used
                         bar_format=bar_format,  # progress bar format
                         desc='Epoch {:02d} Valid'.format(i_epoch))  # progress bar {desc}
        for i_batch, (im, lb) in enumerate(tqdm_data, start=1):
            im = im.to(device)  # [N,C,H,W] tensor, one validation batch of images
            lb = lb.to(device)  # [N,H,W] tensor, one validation batch of labels

            output = net(im)  # [N,C,H,W] tensor, forward pass for one validation batch
            loss = loss_func(output, lb.type(torch.long))  # loss of one validation batch
            batch_loss = loss.detach().item()  # detach before reading the scalar value
            total_loss += batch_loss  # accumulate the batch loss
            # no backward pass during validation

            pred = torch.argmax(F.softmax(output, dim=1), dim=1)  # [N,H,W] tensor, dense prediction
            batch_cm = get_confusion_matrix(pred.cpu().numpy(),
                                            lb.cpu().numpy(),
                                            n_class)  # confusion matrix of this batch
            total_cm += batch_cm
            batch_miou = get_metrics(batch_cm, metrics='mean_iou')
            total_batch_miou += batch_miou

            tqdm_str = 'Loss={:.4f}|mIoU={:.4f}|bat_mIoU={:.4f}'  # progress bar postfix
            tqdm_data.set_postfix_str(
                tqdm_str.format(total_loss / i_batch,
                                get_metrics(total_cm, metrics='mean_iou'),
                                total_batch_miou / i_batch))

    total_loss /= len(data)  # mean loss over the validation epoch
    mean_iou = get_metrics(total_cm, metrics='mean_iou')  # float, mIoU of the accumulated confusion matrix
    total_batch_miou /= len(data)  # mean of the per-batch mIoU values

    # Log the validation results
    log_str = ('Valid Loss: {:.4f}|'
               'Valid mIoU: {:.4f}|'
               'Valid bat_mIoU: {:.4f}')
    log_str = log_str.format(total_loss, mean_iou, total_batch_miou)
    get_logger().info(log_str)
    return total_loss, mean_iou, total_batch_miou
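get_confusion_matrix() and get_metrics() are used here but defined elsewhere; below is a minimal sketch of equivalent helpers under the usual conventions (row = ground truth, column = prediction, mean IoU = TP / (TP + FP + FN) averaged over observed classes). The names are hypothetical stand-ins, not the project's own implementations.

import numpy as np

def confusion_matrix_sketch(pred, label, n_class):
    # cm[i, j] counts pixels with ground-truth class i predicted as class j
    mask = (label >= 0) & (label < n_class)
    idx = n_class * label[mask].astype(int) + pred[mask].astype(int)
    return np.bincount(idx, minlength=n_class ** 2).reshape(n_class, n_class)

def mean_iou_sketch(cm):
    # per-class IoU = TP / (TP + FP + FN), averaged over classes that actually appear
    tp = np.diag(cm)
    denom = cm.sum(axis=1) + cm.sum(axis=0) - tp
    iou = tp / np.maximum(denom, 1)
    return float(iou[denom > 0].mean())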
def main():
    args = tools.get_args(parser)
    config = tools.get_config(args)
    tools.init(config)
    tb_logger, logger = tools.get_logger(config)
    tools.check_dist_init(config, logger)

    checkpoint = tools.get_checkpoint(config)
    runner = tools.get_model(config, checkpoint)
    loaders = tools.get_data_loader(config)

    if dist.is_master():
        logger.info(config)

    if args.mode == 'train':
        train(config, runner, loaders, checkpoint, tb_logger)
    elif args.mode == 'evaluate':
        evaluate(runner, loaders)
    elif args.mode == 'calc_flops':
        if dist.is_master():
            flops = tools.get_model_flops(config, runner.get_model())
            logger.info('flops: {}'.format(flops))
    elif args.mode == 'calc_params':
        if dist.is_master():
            params = tools.get_model_parameters(runner.get_model())
            logger.info('params: {}'.format(params))
    else:
        assert checkpoint is not None
        from models.dmcp.utils import sample_model
        sample_model(config, runner.get_model())

    if dist.is_master():
        logger.info('Done')
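The module-level parser consumed by tools.get_args() is not shown; below is a minimal argparse sketch consistent with the modes dispatched in main(). The flag names and the 'sample' label for the fallback branch are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', default='train',
                    choices=['train', 'evaluate', 'calc_flops', 'calc_params', 'sample'])
parser.add_argument('--config', type=str, help='path to the config file (assumed flag)')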
def get_model(model_type, in_channels, n_class, device, load_weight=None):
    """
    Build the network.
    :param model_type: network type
    :param in_channels: number of input image channels
    :param n_class: number of classes
    :param device: torch.device, GPU or CPU
    :param load_weight: string, absolute path to an existing weight file; loaded if given (default: None)
    :return:
    """
    if model_type == 'fcn8s':
        model = FCN8s(n_class)
    elif model_type == 'unet_resnet152':
        raise NotImplementedError
        # model = unet_resnet('resnet152', in_channels, n_class, pretrained=True)
    elif model_type == 'deeplabv3p_resnet50':
        model = DeepLabV3P('resnet50', in_channels, n_class)
    elif model_type == 'deeplabv3p_resnet101':
        model = DeepLabV3P('resnet101', in_channels, n_class)
    elif model_type == 'deeplabv3p_xception':
        model = DeepLabV3P('xception', in_channels, n_class)
    else:
        raise ValueError('model name error!')
    get_logger().info('-' * 32 + str(model_type) + '-' * 32)

    model.to(device)
    if load_weight is None:
        get_logger().info('Load weight is not specified!')
    elif os.path.exists(load_weight):  # load the trained weights if the file exists
        get_logger().info(load_weight + ' exists! loading...')
        wt = torch.load(load_weight, map_location=device)
        model.load_state_dict(wt)
    else:
        get_logger().info(load_weight + ' can not be found!')
    return model
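A hypothetical call into get_model(); the model type and class count mirror the lane-segmentation snippets in this section, while in_channels=3 and the weight path are illustrative assumptions.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = get_model('deeplabv3p_resnet50', in_channels=3, n_class=8, device=device,
                load_weight='/path/to/deeplabv3p_resnet50-epoch-20.pth')  # illustrative path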
def test(net, data, device, resize_to=256, n_class=8, compare=False):
    """
    Test.
    :param net: the network
    :param data: test dataset
    :param device: torch.device, GPU or CPU
    :param resize_to: edge size the cropped pair is resized to before inference
    :param n_class: number of classes
    :param compare: whether to save comparison images
    :return:
    """
    net.to(device)
    net.eval()  # test mode
    total_cm = np.zeros((n_class, n_class))  # confusion matrix accumulated over the whole test
    total_batch_miou = 0.  # accumulated per-image mIoU

    offset = 690  # crop away the top 690 rows of the 3384-wide frame
    pair_crop = PairCrop(offsets=(offset, None))  # crop the top strip
    pair_resize = PairResize(size=resize_to)
    pair_norm_to_tensor = PairNormalizeToTensor(norm=True)  # to tensor and normalize

    with torch.no_grad():  # no gradients needed during testing, saves memory
        bar_format = '{desc}{postfix}|{n_fmt}/{total_fmt}|{percentage:3.0f}%|{bar}|{elapsed}<{remaining}'
        tqdm_data = tqdm(data, ncols=120, bar_format=bar_format, desc='Test')
        for i_batch, (im, lb) in enumerate(tqdm_data, start=1):
            im_t, lb_t = pair_crop(im, lb)  # PIL Image, PIL Image
            im_t, lb_t = pair_resize(im_t, lb_t)  # PIL Image, PIL Image
            im_t, lb_t = pair_norm_to_tensor(im_t, lb_t)  # [C,H,W] tensor, [H,W] tensor
            im_t = im_t.to(device)  # move the [C,H,W] tensor to the device
            im_t = im_t.unsqueeze(0)  # add the batch dimension -> [N,C,H,W] tensor

            output = net(im_t)  # [N,C,H,W] tensor, model output
            pred = torch.argmax(F.softmax(output, dim=1), dim=1)  # [N,H,W] tensor
            pred = pred.unsqueeze(1)  # [N,C,H,W] tensor; F.interpolate expects [N,C,H,W]
            pred = pred.type(torch.float)  # F.interpolate is only implemented for float, not int/long
            pred = F.interpolate(pred, size=(lb.size[1] - offset, lb.size[0]),
                                 mode='nearest')  # upsample back with nearest interpolation
            pred = pred.type(torch.uint8)  # back to an integer type
            pred = pred.squeeze(0).squeeze(0)  # [H,W] tensor
            pred = pred.cpu().numpy()  # [H,W] ndarray
            supplement = np.zeros((offset, lb.size[0]), dtype=np.uint8)  # [H,W] ndarray, filled as background
            pred = np.append(supplement, pred,
                             axis=0)  # final prediction, [H,W] ndarray; pad the cropped top rows back along H

            batch_cm = get_confusion_matrix(pred, lb, n_class)  # confusion matrix of this image
            total_cm += batch_cm  # accumulate
            batch_miou = get_metrics(batch_cm, metrics='mean_iou')  # mIoU of this image
            total_batch_miou += batch_miou

            if compare:  # save a comparison figure
                fontsize = 16  # font size of the figure text
                fig, ax = plt.subplots(2, 2, figsize=(20, 15))  # canvas
                ax = ax.flatten()

                ax[0].imshow(im)  # top left: input image
                ax[0].set_title('Input Image', fontsize=fontsize)

                ax[1].imshow(LaneSegDataset.decode_rgb(np.asarray(lb)))  # top right: ground truth
                ax[1].set_title('Ground Truth', fontsize=fontsize)

                fig.suptitle('mIoU:{:.4f}'.format(batch_miou), fontsize=fontsize)  # use the image mIoU as the figure title

                mask = (pred != 0).astype(np.uint8) * 255  # [H,W] ndarray, mask for alpha compositing
                pred = LaneSegDataset.decode_rgb(pred)  # [H,W,C=3] ndarray, RGB
                ax[3].imshow(pred)  # bottom right: prediction
                ax[3].set_title('Pred', fontsize=fontsize)

                mask = mask[..., np.newaxis]  # [H,W,C=1] ndarray
                pred = np.append(pred, mask, axis=2)  # [H,W,C=4] ndarray, RGB + alpha -> RGBA
                im = im.convert('RGBA')
                pred = Image.fromarray(pred).convert('RGBA')
                im_comp = Image.alpha_composite(im, pred)  # alpha compositing
                ax[2].imshow(im_comp)  # bottom left: prediction over the input
                ax[2].set_title('Pred over Input', fontsize=fontsize)

                plt.subplots_adjust(left=0.01, bottom=0.01, right=0.99, top=0.99,
                                    wspace=0.01, hspace=0.01)  # tighten subplot margins and spacing
                plt.savefig('/home/mist/imfolder/pred-{:s}.jpg'.format(now_str()))  # save the figure
                plt.close(fig)

            tqdm_str = 'mIoU={:.4f}|bat_mIoU={:.4f}'  # progress bar postfix
            tqdm_data.set_postfix_str(
                tqdm_str.format(get_metrics(total_cm, metrics='mean_iou'),
                                total_batch_miou / i_batch))

    mean_iou = get_metrics(total_cm, metrics='mean_iou')  # mIoU over the whole test
    total_batch_miou /= len(data)
    logger = get_logger()
    msg = ('Test mIoU : {:.4f}|'
           'Test bat_mIoU : {:.4f}').format(mean_iou, total_batch_miou)
    logger.info(msg)
    return mean_iou
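A toy check of the resize-back-and-pad step used above: the low-resolution prediction is upsampled to the cropped region with nearest interpolation, then the cropped-off top rows are padded back as background. Only the 690-row offset comes from the code; the full 1710x3384 frame size is an assumption.

import numpy as np
import torch
import torch.nn.functional as F

offset, full_h, full_w = 690, 1710, 3384                  # full frame size assumed
pred = torch.randint(0, 8, (1, 1, 256, 256)).float()      # fake [N,C,H,W] prediction
pred = F.interpolate(pred, size=(full_h - offset, full_w), mode='nearest').type(torch.uint8)
pred = pred.squeeze(0).squeeze(0).numpy()                 # [H,W] ndarray
pred = np.append(np.zeros((offset, full_w), dtype=np.uint8), pred, axis=0)
assert pred.shape == (full_h, full_w)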
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

if __name__ == '__main__':
    # Read configuration
    conf = cp.ConfigParser()
    # Project-level configuration
    conf.read('./CleverRec.properties', encoding='utf-8')
    configs = dict(conf.items('default'))
    recommender = configs['recommender']
    # Model-specific configuration
    conf.read(os.path.join('./conf/', recommender + '.properties'), encoding='utf-8')
    configs.update(dict(conf.items('parameters')))

    # Get logger
    logger = get_logger(configs['log.dir'], recommender)
    logger.info('=' * 100)
    logger.info('Current model: %s' % recommender)

    # Read and preprocess data
    if configs['model_type'] == 'ranking':
        data = RankingPreprocess(configs, logger)
    else:
        data = RatingPreprocess(configs, logger)

    # TensorFlow settings
    if configs['gpu.is_gpu']:
        os.environ['CUDA_VISIBLE_DEVICES'] = configs['gpu.id']
        tf_conf = tf.ConfigProto()
        tf_conf.gpu_options.per_process_gpu_memory_fraction = float(configs['gpu.mem_frac'])
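ConfigParser values are plain strings, so `if configs['gpu.is_gpu']:` is truthy even when the property holds the text 'False'; below is a minimal sketch of parsing the flags explicitly from the same configs dict (key names follow the snippet above, defaults are assumptions).

use_gpu = configs.get('gpu.is_gpu', 'false').strip().lower() in ('1', 'true', 'yes')
mem_frac = float(configs.get('gpu.mem_frac', '1.0'))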
def _epoch_train(net, loss_func, optimizer, data, n_class, device, i_epoch):
    """
    Train for one epoch.
    :param net: the network
    :param loss_func: loss function
    :param optimizer: optimizer
    :param data: train data set
    :param n_class: number of classes
    :param device: torch.device, CPU or GPU
    :param i_epoch: epoch index, used in the progress bar description
    :return: loss, miou
    """
    net.to(device)
    net.train()  # training mode
    total_loss = 0.  # accumulated loss over the training epoch
    total_cm = np.zeros((n_class, n_class))  # ndarray, confusion matrix accumulated over the epoch
    total_batch_miou = 0.

    bar_format = '{desc}{postfix}|{n_fmt}/{total_fmt}|{percentage:3.0f}%|{bar}|{elapsed}<{remaining}'
    tqdm_data = tqdm(data,
                     ncols=120,  # 120-column bar; must be set on Linux, otherwise the terminal default of 80 is used
                     bar_format=bar_format,  # progress bar format
                     desc='Epoch {:02d} Train'.format(i_epoch))  # progress bar {desc}
    for i_batch, (im, lb) in enumerate(tqdm_data, start=1):
        im = im.to(device)  # [N,C,H,W] tensor, one training batch of images
        lb = lb.to(device)  # [N,H,W] tensor, one training batch of labels

        optimizer.zero_grad()  # clear the gradients
        output = net(im)  # [N,C,H,W] tensor, forward pass for one training batch
        loss = loss_func(output, lb.type(torch.long))  # loss of one training batch
        batch_loss = loss.detach().item()  # the training graph carries gradients, so detach before reading the value
        total_loss += batch_loss  # accumulate the batch loss
        loss.backward()  # backward pass
        optimizer.step()  # optimizer update

        pred = torch.argmax(F.softmax(output, dim=1), dim=1)  # [N,H,W] tensor, dense prediction (the C dimension is reduced)
        batch_cm = get_confusion_matrix(pred.cpu().numpy(),
                                        lb.cpu().numpy(),
                                        n_class)  # confusion matrix of this batch
        total_cm += batch_cm
        batch_miou = get_metrics(batch_cm, metrics='mean_iou')
        total_batch_miou += batch_miou

        tqdm_str = 'Loss={:.4f}|mIoU={:.4f}|bat_mIoU={:.4f}'  # progress bar postfix
        tqdm_data.set_postfix_str(
            tqdm_str.format(total_loss / i_batch,
                            get_metrics(total_cm, metrics='mean_iou'),
                            total_batch_miou / i_batch))

    total_loss /= len(data)  # float, mean loss over the training epoch
    mean_iou = get_metrics(total_cm, metrics='mean_iou')  # float, mIoU of the accumulated confusion matrix
    total_batch_miou /= len(data)  # mean of the per-batch mIoU values

    # Log the training results
    log_str = ('Train Loss: {:.4f}|'
               'Train mIoU: {:.4f}|'
               'Train bat_mIoU: {:.4f}')
    log_str = log_str.format(total_loss, mean_iou, total_batch_miou)
    get_logger().info(log_str)
    return total_loss, mean_iou, total_batch_miou
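A loss/optimizer pairing consistent with how _epoch_train() calls them: loss_func receives [N,C,H,W] logits and an [N,H,W] long label map, and the optimizer wraps net.parameters(). The specific choices and hyperparameters are illustrative, not the project's confirmed settings.

import torch.nn as nn
import torch.optim as optim

loss_func = nn.CrossEntropyLoss()   # accepts [N,C,H,W] scores and [N,H,W] class indices
optimizer = optim.Adam(net.parameters(), lr=3e-4, weight_decay=1e-4)  # net built e.g. via get_model()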
def worker(gpu, ngpus_per_node, args_in):
    # init
    args = copy.deepcopy(args_in)
    jobid = os.environ["SLURM_JOBID"]
    procid = int(os.environ["SLURM_PROCID"])
    args.gpu = gpu
    if args.gpu is not None:
        logger_name = "{}.{}-{:d}-{:d}.search.log".format(args.name, jobid, procid, gpu)
    else:
        logger_name = "{}.{}-{:d}-all.search.log".format(args.name, jobid, procid)
    logger = tools.get_logger(os.path.join(args.path, logger_name))

    if args.dist_url == "env://" and args.rank == -1:
        args.rank = int(os.environ["RANK"])
    if args.mp_dist:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes
        args.rank = args.rank * ngpus_per_node + gpu
    args.print_params(logger.info)

    # get cuda device
    device = torch.device('cuda', gpu)

    # begin
    logger.info("Logger is set - training start")
    logger.info('back:{}, dist_url:{}, world_size:{}, rank:{}'.format(
        args.dist_backend, args.dist_url, args.world_size, args.rank))
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)

    data_parser = {
        'ETTh1': {'data': 'ETTh1.csv', 'T': 'OT', 'M': [7, 7, 7], 'S': [1, 1, 1], 'MS': [7, 7, 1]},
        'ETTh2': {'data': 'ETTh2.csv', 'T': 'OT', 'M': [7, 7, 7], 'S': [1, 1, 1], 'MS': [7, 7, 1]},
        'ETTm1': {'data': 'ETTm1.csv', 'T': 'OT', 'M': [7, 7, 7], 'S': [1, 1, 1], 'MS': [7, 7, 1]},
        'ETTm2': {'data': 'ETTm2.csv', 'T': 'OT', 'M': [7, 7, 7], 'S': [1, 1, 1], 'MS': [7, 7, 1]},
        'WTH': {'data': 'WTH.csv', 'T': 'WetBulbCelsius', 'M': [12, 12, 12], 'S': [1, 1, 1], 'MS': [12, 12, 1]},
        'ECL': {'data': 'ECL.csv', 'T': 'MT_320', 'M': [321, 321, 321], 'S': [1, 1, 1], 'MS': [321, 321, 1]},
        'Solar': {'data': 'solar_AL.csv', 'T': 'POWER_136', 'M': [137, 137, 137], 'S': [1, 1, 1], 'MS': [137, 137, 1]},
    }
    if args.data in data_parser.keys():
        data_info = data_parser[args.data]
        args.data_path = data_info['data']
        args.target = data_info['T']
        args.enc_in, args.dec_in, args.c_out = data_info[args.features]

    args.s_layers = [int(s_l) for s_l in args.s_layers.replace(' ', '').split(',')]
    args.detail_freq = args.freq
    args.freq = args.freq[-1:]

    Exp = Exp_M_Informer

    for ii in range(args.itr):
        # setting record of experiments
        setting = '{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_at{}_fc{}_eb{}_dt{}_mx{}_{}_{}'.format(
            args.model, args.data, args.features, args.seq_len, args.label_len, args.pred_len,
            args.d_model, args.n_heads, args.e_layers, args.d_layers, args.d_ff, args.attn,
            args.factor, args.embed, args.distil, args.mix, args.des, ii)

        exp = Exp(args)  # set experiments
        logger.info('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting))
        exp.train(ii, logger)

        logger.info('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
        exp.test(setting, logger)

        if args.do_predict:
            logger.info('>>>>>>>predicting : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
            exp.predict(setting, True)

        torch.cuda.empty_cache()
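How a launcher might spawn worker() once per GPU with torch.multiprocessing; this is a sketch based on the fields the function reads (args.mp_dist, args.world_size, args.gpu) and on the standard DDP spawn pattern, not this repository's actual entry point.

import torch
import torch.multiprocessing as mp

def launch(args):
    ngpus_per_node = torch.cuda.device_count()
    if args.mp_dist:
        # one process per local GPU; mp.spawn passes the GPU index as the first argument
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        worker(args.gpu, ngpus_per_node, args)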