def test(**kwargs):
    """Evaluate the HRDN demoire model on the test set.

    ``kwargs`` override attributes on the global ``opt`` config. Every
    network output is saved as a PNG under ``opt.save_prefix`` and the
    average PSNR over the whole test set is printed at the end.
    """
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)

    # FIX: bind vis unconditionally so the reference below is always defined.
    vis = None
    if opt.vis:
        vis = Visualizer(opt.env)

    test_data = Val_MoireData(opt.test_path)
    test_dataloader = DataLoader(test_data,
                                 batch_size=opt.test_batch_size,
                                 shuffle=False,
                                 num_workers=opt.num_workers,
                                 drop_last=False)

    model = get_model("HRDN")
    prefix = "{0}{1}/".format(opt.save_prefix, "HRDN")
    model.eval()
    torch.cuda.empty_cache()

    # criterion_c = L1_Charbonnier_loss()
    # loss_meter = meter.AverageValueMeter()
    psnr_meter = meter.AverageValueMeter()

    # FIX: evaluation must not track gradients — without no_grad() every
    # forward pass keeps its autograd graph alive and leaks GPU memory.
    with torch.no_grad():
        for ii, (moires, clears, labels) in tqdm(enumerate(test_dataloader)):
            moires = moires.to(opt.device)
            clears = clears.to(opt.device)

            output_list, _ = model(moires)
            outputs = output_list[0]

            moires = tensor2im(moires)
            outputs = tensor2im(outputs)
            clears = tensor2im(clears)

            psnr = colour.utilities.metric_psnr(outputs, clears)
            psnr_meter.add(psnr)

            # Save every image of the batch individually.
            bs = moires.shape[0]
            for jj in range(bs):
                output, clear = outputs[jj], clears[jj]
                label = labels[jj]
                img_path = "{0}{1}_output.png".format(prefix, label)
                save_single_image(output, img_path)

            # Log once every 10 iterations (FIX: `is not None`, not `!= None`).
            if opt.vis and vis is not None and (ii + 1) % 10 == 0:
                vis.log(">>>>>>>> batch_psnr:{psnr}<<<<<<<<<<".format(psnr=psnr))
            torch.cuda.empty_cache()

    print("average psnr is {}".format(psnr_meter.value()[0]))
def train(**kwargs):
    """Train a classifier on the 17-class Flower dataset.

    kwargs are merged into the global ``opt`` config. Tracks the best
    validation accuracy and checkpoints the best model.
    """
    # load kwargs
    opt.parse(kwargs)
    print(kwargs)

    # visdom
    vis = Visualizer(opt.env)
    # vis log opt
    vis.log('user config:')
    for k, v in opt.__class__.__dict__.items():
        if not k.startswith('__'):
            vis.log('{} {}'.format(k, getattr(opt, k)))

    # config model
    model = getattr(models, opt.model)()
    if opt.use_pretrained_model:
        model = load_pretrained()
    if opt.load_model_path:
        # load exist model
        model.load(opt.load_model_path)
    elif opt.use_weight_init:
        # we need init weight
        # model.apply(weight_init)
        # FIX: the real body above is commented out; an empty elif is a
        # SyntaxError, so keep an explicit no-op until weight_init is restored.
        pass

    # if use GPU
    if opt.use_gpu:
        model.cuda()

    # generate data
    train_data = Flower(train=True)
    val_data = Flower(train=False)
    test_data = Flower(test=True)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=True, num_workers=opt.num_workers)
    test_dataloader = DataLoader(test_data, opt.batch_size, shuffle=False, num_workers=opt.num_workers)

    # criterion and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    lr = opt.lr
    if 'Dense' in opt.model:
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                                    nesterov=True, weight_decay=opt.weight_decay)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=opt.weight_decay)

    # meters
    loss_meter = meter.AverageValueMeter()
    # 17 classes
    confusion_matrix = meter.ConfusionMeter(17)
    previous_loss = 1e100
    # FIX: was commented out — `val_accuracy > best_accuracy` below raised
    # NameError on the very first epoch.
    best_accuracy = 0

    # start training
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for bactch_index, (data, label) in tqdm(enumerate(train_dataloader)):
            # train model
            input = Variable(data)
            target = Variable(label)
            # gpu update
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # update meter
            # FIX: loss.data[0] is removed in PyTorch >= 0.5; use .item()
            loss_meter.add(loss.item())
            # [batch_size, 17] [batch_size]
            confusion_matrix.add(score.data, target.data)

            # plot
            if bactch_index % opt.print_freq == opt.print_freq - 1:
                # cross_entropy
                print('loss ', loss_meter.value()[0])
                # visualize loss
                vis.plot('loss', loss_meter.value()[0])

        # save model for this epoch
        if opt.use_pretrained_model is False and epoch % opt.save_freq == 0:
            model.save()

        # validate
        val_cm, val_accuracy = val(model, val_dataloader)
        # test
        test_cm, test_accuracy = val(model, test_dataloader)
        # plot validation accuracy
        print('Epoch {}/{}: val_accuracy {}'.format(epoch, opt.max_epoch, val_accuracy))
        # plot vis
        vis.plot('val_accuracy', val_accuracy)
        vis.plot('test_accuracy', test_accuracy)
        vis.log('epoch:{epoch}, lr:{lr}, loss:{loss}'.format(
            epoch=epoch, loss=loss_meter.value()[0], lr=lr))

        # update best validation model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), './checkpoints/best_{}.pth'.format(opt.model))
            if opt.use_pretrained_model is False:
                model.save('./checkpoints/best_{}.pth'.format(model.model_name))

        # update learning rate for this epoch
        if float(loss_meter.value()[0]) > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]

    print('Best model validation accuracy {}'.format(best_accuracy))
def train():
    """Train DoubleSequence on the Kesci v3 23-d feature set.

    Logs loss and F1/precision/recall on train/val/test splits to visdom;
    saves a checkpoint every third epoch.
    """
    vis = Visualizer("Kesci" + time.strftime('%m%d%H%M'))
    train_data = AppData("../kesci/data/data_v3_23d/train_ab.json", iflabel=True)
    val_data = AppData("../kesci/data/data_v3_23d/val_ab.json", iflabel=True)
    train_dataloader = DataLoader(train_data, 256, shuffle=True, num_workers=4)
    val_dataloader = DataLoader(val_data, 512, shuffle=False, num_workers=2)
    test_data = AppData("../kesci/data/data_v3_23d/test_ab.json", iflabel=True)
    test_dataloader = DataLoader(test_data, 512, shuffle=False, num_workers=2)

    criterion = t.nn.BCEWithLogitsLoss().cuda()
    learning_rate = 0.002
    weight_decay = 0.0003
    model = DoubleSequence(31, 128, 1).cuda()
    optimizer = t.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    for epoch in range(400):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, property, target) in tqdm(enumerate(train_dataloader)):
            input = Variable(data).cuda()
            input2 = Variable(property).cuda()
            target = Variable(target).cuda()
            output = model(input, input2)
            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # FIX: loss.data[0] is removed in PyTorch >= 0.5; use .item()
            loss_meter.add(loss.item())
            if ii % 100 == 99:
                vis.plot('loss', loss_meter.value()[0])

        # Evaluate train/val metrics every 3rd epoch.
        if epoch % 3 == 2:
            train_cm, train_f1 = val(model, train_dataloader)
            vis.plot('train_f1', train_f1)
            val_cm, val_f1 = val(model, val_dataloader)
            vis.plot_many({'val_f1': val_f1, 'learning_rate': learning_rate})

        # Decay lr in-place when loss stops falling (keeps Adam moment state).
        if loss_meter.value()[0] > previous_loss:
            learning_rate = learning_rate * 0.9
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
        previous_loss = loss_meter.value()[0]

        if epoch % 3 == 2:
            model.save()
            test_cm, test_f1 = val(model, test_dataloader)
            vis.plot('test_f1', test_f1)
            vis.log(
                "训练集:{train_f1:%}, {train_pre:%}, {train_rec:%} | 验证集:{val_f1:%}, {val_pre:%}, {val_rec:%} | 测试集:{test_f1:%}, {test_pre:%}, {test_rec:%} | {train_true_num:%}, {val_true_num:%}, {test_true_num:%}"
                .format(
                    train_f1=train_f1, val_f1=val_f1, test_f1=test_f1,
                    train_true_num=train_cm.value()[:, 0].sum() / len(train_data),
                    val_true_num=val_cm.value()[:, 0].sum() / len(val_data),
                    test_true_num=test_cm.value()[:, 0].sum() / len(test_data),
                    train_pre=train_cm.value()[0][0] / train_cm.value()[0].sum(),
                    train_rec=train_cm.value()[0][0] / train_cm.value()[:, 0].sum(),
                    val_pre=val_cm.value()[0][0] / val_cm.value()[0].sum(),
                    val_rec=val_cm.value()[0][0] / val_cm.value()[:, 0].sum(),
                    test_pre=test_cm.value()[0][0] / test_cm.value()[0].sum(),
                    test_rec=test_cm.value()[0][0] / test_cm.value()[:, 0].sum()))
def train():
    """Train Sequence on the Kesci 16-d target feature set.

    Logs loss and F1/precision/recall on the three splits to visdom;
    checkpoints every 10th epoch.
    """
    vis = Visualizer("Kesci")
    train_data = AppData("data/data_16d_target/train.json", iflabel=True)
    val_data = AppData("data/data_16d_target/val.json", iflabel=True)
    train_dataloader = DataLoader(train_data, 32, shuffle=True, num_workers=4)
    val_dataloader = DataLoader(val_data, 256, shuffle=False, num_workers=2)
    test_data = AppData("data/data_16d_target/test.json", iflabel=True)
    test_dataloader = DataLoader(test_data, 256, shuffle=False, num_workers=2)

    criterion = t.nn.CrossEntropyLoss().cuda()
    learning_rate = 0.003
    weight_decay = 0.0002
    model = Sequence(15, 128, 1).cuda()
    optimizer = t.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    for epoch in range(500):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, property, label) in tqdm(enumerate(train_dataloader)):
            input = Variable(data).cuda()
            input2 = Variable(property).cuda()
            target = Variable(label).cuda().view(-1)
            output = model(input, input2)
            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # FIX: loss.data[0] is removed in PyTorch >= 0.5; use .item()
            loss_meter.add(loss.item())
            confusion_matrix.add(output.data, target.data)
            if ii % 100 == 99:
                vis.plot('loss', loss_meter.value()[0])

        # Evaluate train/val metrics every 3rd epoch.
        if epoch % 3 == 2:
            train_cm, train_f1 = val(model, train_dataloader)
            vis.plot('train_f1', train_f1)
            val_cm, val_f1 = val(model, val_dataloader)
            vis.plot_many({'val_f1': val_f1, 'learning_rate': learning_rate})
            # vis.log("epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}".format(
            #     epoch=epoch, loss=loss_meter.value()[0], val_cm=str(val_cm.value()),
            #     train_cm=str(confusion_matrix.value()), lr=learning_rate))

        # Decay lr in-place when loss stops falling (keeps Adam moment state).
        if loss_meter.value()[0] > previous_loss:
            learning_rate = learning_rate * 0.95
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
        previous_loss = loss_meter.value()[0]

        if epoch % 10 == 9:
            model.save()
            test_cm, test_f1 = val(model, test_dataloader)
            vis.plot('test_f1', test_f1)
            vis.log(
                "model:{model} | {train_f1}, {train_pre}, {train_rec} | {val_f1}, {val_pre}, {val_rec} | {test_f1}, {test_pre}, {test_rec}"
                .format(train_f1=train_f1, val_f1=val_f1, test_f1=test_f1,
                        model=time.strftime('%m%d %H:%M:%S'),
                        train_pre=str(train_cm.value()[0][0] / train_cm.value()[:, 0].sum()),
                        train_rec=str(train_cm.value()[0][0] / train_cm.value()[0].sum()),
                        val_pre=str(val_cm.value()[0][0] / val_cm.value()[:, 0].sum()),
                        val_rec=str(val_cm.value()[0][0] / val_cm.value()[0].sum()),
                        test_pre=str(test_cm.value()[0][0] / test_cm.value()[:, 0].sum()),
                        test_rec=str(test_cm.value()[0][0] / test_cm.value()[0].sum())))
def train(**kwargs):
    """Train a DogCat classifier driven entirely by the ``opt`` config."""
    opt.parse(**kwargs)

    # step1: configure model
    model = getattr(models, opt.model)(opt.num_class)
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data
    train_data = DogCat(opt.train_data_path, transform=opt.train_transform, train=True)
    val_data = DogCat(opt.train_data_path, transform=opt.test_val_transform, train=False, test=False)
    train_dataloader = DataLoader(train_data, batch_size=opt.batch_size,
                                  shuffle=opt.shuffle, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, batch_size=opt.batch_size,
                                shuffle=opt.shuffle, num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(params=model.parameters(), lr=lr, weight_decay=opt.weight_decay)

    # step4: meters (average loss over an epoch + confusion matrix)
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(opt.num_class)
    previous_loss = 1e6

    # step5: train
    vis = Visualizer(opt.env)
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in tqdm(enumerate(train_dataloader)):
            # train model
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # FIX: adding the raw tensor skews the meter; add the float value
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)
            # ipdb.set_trace()
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot(win='loss', y=loss_meter.value()[0])

        model.save()

        # step6: validate and visualize
        val_confusion_matrix, val_accuracy = val(model, val_dataloader)
        vis.plot(win='val_accuracy', y=val_accuracy)
        # FIX: log the matrix values, not the meter object's repr
        vis.log(win='log_text',
                info='epoch:{epoch}, lr:{lr}, loss:{loss}, train_cm:{train_cm}, val_cm:{val_cm}'.format(
                    epoch=epoch, lr=lr, loss=loss_meter.value()[0],
                    train_cm=str(confusion_matrix.value()),
                    val_cm=str(val_confusion_matrix.value())))

        # step7: update learning_rate when the loss stops falling
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a CWRU bearing-fault classifier.

    :param kwargs: config overrides applied to the global ``opt`` defaults
    :return: None; checkpoints are saved via ``model.save()``
    """
    # Merge command-line overrides into the config.
    opt.parse(kwargs)
    # Visdom plotting helper.
    vis = Visualizer(opt.env, port=opt.vis_port)

    # step1: build the model named by opt.model, optionally restore weights.
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    # Pin to a specific GPU.
    # NOTE(review): hard-codes GPU "2"; should probably come from opt — confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = "2"
    model.to(opt.device)

    # step2: data. The "test" split doubles as the validation set here —
    # it holds the samples not used for training.
    train_data = CWRUDataset2D(opt.train_data_root, train=True)
    test_data = CWRUDataset2D(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, opt.batch_size, shuffle=False)

    # step3: objective and optimizer (cross-entropy + Adam).
    criterion = torch.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=opt.weight_decay)

    # step4: meters — running mean/std of the loss plus a confusion matrix.
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(opt.category)
    previous_loss = 1e10

    # Training loop.
    for epoch in range(opt.max_epoch):
        # Reset per-epoch statistics.
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in tqdm(enumerate(train_dataloader)):
            input = data.to(opt.device)
            target = label.to(opt.device)
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # Update meters; detach so the meter never holds a graph reference.
            loss_meter.add(loss.item())
            confusion_matrix.add(score.detach(), target.detach())
            if (ii + 1) % opt.print_freq == 0:
                vis.plot('loss', loss_meter.value()[0])
                # Drop into the debugger when the sentinel file exists.
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        # Checkpoint after the epoch.
        model.save()

        # Compute test-set metrics and visualize them.
        val_cm, val_accuracy = val(model, test_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log("epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}".format(
            epoch=epoch, loss=loss_meter.value()[0], val_cm=str(val_cm.value()),
            train_cm=str(confusion_matrix.value()), lr=lr))

        # If the loss stopped falling, decay the learning rate in-place
        # (preserves the optimizer's moment estimates).
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a scene-classification model.

    kwargs are merged into the global ``opt`` config before training.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)
    cudnn.enabled = True
    cudnn.benchmark = True

    # step1: configure and load the model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    model.to(opt.device)

    # step2: training and validation data
    train_data = SceneData(opt.train_data_root, opt.labels, train=True)
    val_data = SceneData(opt.train_data_root, opt.labels, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=True, num_workers=opt.num_workers)

    # step3: objective and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)
    """
    # Freeze everything but the final fully-connected layer (finetune):
    for para in list(model.parameters())[:-1]:
        para.requires_grad = False
    optimizer = t.optim.Adam(params=[model.fc.weight, model.fc.bias],
                             lr=opt.lr, weight_decay=opt.weight_decay)
    """

    # step4: meters — smoothed loss plus confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(opt.num_labels)
    previous_loss = 1e100

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        # FIX: tqdm's total is the number of batches, not the number of
        # samples — len(train_data) made the progress bar wildly wrong.
        for step, (data, label) in tqdm.tqdm(enumerate(train_dataloader),
                                             total=len(train_dataloader)):
            train_input = data.to(opt.device)
            label_input = label.to(opt.device)
            optimizer.zero_grad()
            score = model(train_input)
            loss = criterion(score, label_input)
            loss.backward()
            optimizer.step()

            # update meters and visualize
            loss_meter.add(loss.item())
            confusion_matrix.add(score.detach(), label_input.detach())
            if step % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])

        model.save()

        # validation metrics and visualization
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()), lr=lr))

        # decay lr when the loss stops falling
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a DogCat classifier with visdom monitoring and loss-driven lr decay."""
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # step1: configure model
    model = getattr(models, opt.model)()
    if opt.retrain:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size,
                                  shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr, weight_decay=opt.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        # FIX: tqdm's total must be the batch count (len of the dataloader),
        # not the sample count — the progress bar never reached 100% before.
        for ii, (data, label) in tqdm(enumerate(train_dataloader),
                                      total=len(train_dataloader)):
            # train model
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # drop into the debugger when the sentinel file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        # NOTE(review): saving over opt.load_model_path overwrites the
        # checkpoint that was loaded — confirm this is intentional.
        model.save(opt.load_model_path)

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log("epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}".format(
            epoch=epoch, loss=loss_meter.value()[0], val_cm=str(val_cm.value()),
            train_cm=str(confusion_matrix.value()), lr=lr))

        # update learning rate in-place (preserves optimizer moment state)
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train():
    """Train a 3D nodule classifier with multi-GPU support and resumable checkpoints."""
    vis = Visualizer(opt.env + opt.model)
    net = getattr(models, opt.model)()
    print('当前使用的模型为' + opt.model)

    # cross-entropy for classification
    criterion = t.nn.CrossEntropyLoss()
    optimizer = t.optim.Adam(net.parameters(), lr=opt.learning_rate,
                             weight_decay=opt.weight_decay)
    start_epoch = 0
    if opt.load_model_path:
        checkpoint = t.load(opt.load_model_path)
        # strip the 'module.' prefix a DataParallel checkpoint carries
        state_dict = checkpoint['net']
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove `module.`
            new_state_dict[name] = v
        net.load_state_dict(new_state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']

    # step lr down by 10x at each milestone
    if start_epoch == 0:
        scheduler = t.optim.lr_scheduler.MultiStepLR(optimizer, milestones=opt.milestones,
                                                     gamma=0.1, last_epoch=-1)
        print('从头训练 ,学习率为{}'.format(optimizer.param_groups[0]['lr']))
    else:
        scheduler = t.optim.lr_scheduler.MultiStepLR(optimizer, milestones=opt.milestones,
                                                     gamma=0.1, last_epoch=start_epoch)
        print('加载预训练模型{}并从{}轮开始训练,学习率为{}'.format(
            opt.load_model_path, start_epoch, optimizer.param_groups[0]['lr']))

    # move to GPU(s)
    if opt.use_gpu:
        net = t.nn.DataParallel(net, device_ids=opt.device_ids)
        net.cuda()
        cudnn.benchmark = True

    train_data = NodeDataSet(train=True)
    val_data = NodeDataSet(val=True)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=True,
                                num_workers=opt.num_workers)

    for epoch in range(opt.max_epoch - start_epoch):
        print('开始 epoch {}/{}.'.format(start_epoch + epoch + 1, opt.max_epoch))
        epoch_loss = 0
        # advance the milestone scheduler once per epoch
        scheduler.step()
        net.train()  # FIX: restore training mode after the previous epoch's eval

        for ii, (block_3d, truth_label) in enumerate(train_dataloader):
            if opt.use_gpu:
                block_3d = block_3d.cuda()
                truth_label = truth_label.cuda()
            predict_label = net(block_3d)
            loss = criterion(predict_label, truth_label)
            epoch_loss += loss.item()
            if ii % 8 == 0:
                vis.plot('训练集loss', loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        vis.log("epoch:{epoch},lr:{lr},loss:{loss}".format(
            epoch=epoch, loss=loss.item(), lr=optimizer.param_groups[0]['lr']))
        # FIX: averaging over `ii` (last index) undercounted by one batch and
        # crashed with a single batch; divide by the actual batch count.
        vis.plot('每轮epoch的loss均值', epoch_loss / len(train_dataloader))

        # checkpoint model + optimizer + epoch so training can resume
        state = {
            'net': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch
        }
        if not os.path.exists(opt.checkpoint_root):
            os.makedirs(opt.checkpoint_root)
        t.save(state, opt.checkpoint_root + '{}_node.pth'.format(epoch))

        # ============ validation ===================
        net.eval()  # FIX: disable dropout/batch-norm updates during validation
        val_loss = 0
        with t.no_grad():
            for jj, (val_block_3d, val_label) in enumerate(val_dataloader):
                if opt.use_gpu:
                    val_block_3d = val_block_3d.cuda()
                    val_label = val_label.cuda()
                val_predict_label = net(val_block_3d)
                loss = criterion(val_predict_label, val_label)
                val_loss += loss.item()
        # FIX: same off-by-one/ZeroDivision bug as the train average
        vis.plot('验证集loss均值', val_loss / len(val_dataloader))
def train(**kwargs):
    """Train a DogCat classifier (teaching-oriented, heavily annotated)."""
    opt.parse(kwargs)  # apply CLI overrides, e.g. python main.py train --env='env1219'
    vis = Visualizer(opt.env)

    # Step 1: model (equivalent to e.g. models.AlexNet())
    model = getattr(models, opt.model)()
    # model = torchvision.models.resnet34(pretrained=True, num_classes=1000)
    # model.fc = nn.Linear(512, 2)  # swap the head for our binary problem
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # Step 2: data (train/val splits wrapped in DataLoaders)
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_data_loader = DataLoader(train_data, opt.batch_size, shuffle=True,
                                   num_workers=opt.num_workers)
    val_data_loader = DataLoader(val_data, opt.batch_size, shuffle=False,
                                 num_workers=opt.num_workers)

    # Step 3: loss and optimizer
    criterion = nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=opt.weight_decay)

    # Step 4: meters. loss_meter.value() returns (mean, std);
    # confusion_matrix is a 2x2 confusion matrix.
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100  # sentinel: 10^100

    # Step 5: training loop
    for epoch in range(opt.max_epoch):
        loss_meter.reset()       # mean resets to nan
        confusion_matrix.reset() # counts reset to zero
        import math
        for ii, (data, label) in tqdm(enumerate(train_data_loader),
                                      total=math.ceil(len(train_data) / opt.batch_size)):
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()  # gradients accumulate; clear each step
            pred = model(input)
            # pred: (batch_size, n_classes); target: (batch_size) of class ids.
            # CrossEntropyLoss computes -x[y] + log(exp(x[0]) + exp(x[1])).
            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()

            # update meters and visualize
            loss_meter.add(loss.item())
            confusion_matrix.add(pred.data, target.data)
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
            # optional debug hook:
            # if os.path.exists(opt.debug_file):
            #     ipdb.set_trace()

        # checkpoint into checkpoints/
        model.save()

        # validation metrics and visualization
        val_cm, val_accuracy = val(model, val_data_loader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            'epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}'
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()), lr=lr))

        # decay lr when the loss stops falling
        if loss_meter.value()[0] > previous_loss:
            # FIX: decay must compound from the current lr, not reset from
            # opt.lr — `opt.lr * opt.lr_decay` froze lr at one decay step.
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a DogCat classifier.

    kwargs override the defaults in the global ``opt`` config.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # step1: configure model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=False,
                                num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr, weight_decay=opt.weight_decay)

    # step4: meters — smoothed loss plus a 2x2 confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # step5: train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in enumerate(train_dataloader):
            # train model
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update and visualize
            # FIX: loss.data[0] is removed in PyTorch >= 0.5; use .item()
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)

            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # drop into the debugger when the sentinel file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save()

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()), lr=lr))

        # update learning rate in-place when the loss stops falling
        # (preserves optimizer moment state)
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train():
    """Train the grayscale-denoising NetWork with an MSE objective.

    Input is a noisy grayscale image; the target is the clean grayscale
    original. Saves a checkpoint and a test image after each epoch.
    """
    vis = Visualizer(opt.env)

    # build the network
    netWork = NetWork()
    # load checkpoints onto CPU first, then move to GPU
    map_location = lambda storage, loc: storage
    if opt.load_model_path:
        netWork.load_state_dict(t.load(opt.load_model_path, map_location=map_location))
    # move the model to GPU 1
    if opt.use_gpu:
        netWork.cuda(1)

    # step2: data (train=False gives the validation split)
    train_data = DataProcessing(opt.data_root, train=True)
    val_data = DataProcessing(opt.data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, 1, shuffle=False,
                                num_workers=opt.num_workers)

    # step3: MSE loss + Adam optimizer
    criterion = t.nn.MSELoss()
    lr = opt.lr
    optimizer = t.optim.Adam(netWork.parameters(), lr=opt.lr,
                             weight_decay=opt.weight_decay)

    # step4: meter for the epoch-average loss
    loss_meter = meter.AverageValueMeter()
    previous_loss = 1e100

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        for ii, (data_origin, data_grayscale) in enumerate(train_dataloader):
            # input: noisy grayscale image; target: clean grayscale original
            input_img = Variable(data_grayscale)
            output_real_img = Variable(data_origin)
            if opt.use_gpu:
                input_img = input_img.cuda(1)
                output_real_img = output_real_img.cuda(1)

            optimizer.zero_grad()
            output_img = netWork(input_img)
            loss = criterion(output_img, output_real_img)
            loss.backward()
            optimizer.step()

            # FIX: loss.data[0] is removed in PyTorch >= 0.5; use .item()
            loss_meter.add(loss.item())
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])

        # checkpoint and render the per-epoch lena test image
        netWork.save()
        add_every_epoch_lena(netWork, epoch)
        # # validation pass (disabled):
        # val_output_img = val(netWork, val_dataloader)
        # vis.img("val_output_img", val_output_img.data)

        vis.log("epoch:{epoch},lr:{lr},loss:{loss}".format(
            epoch=epoch, loss=loss_meter.value()[0], lr=lr))

        # decay lr in-place when the loss starts rising
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]

    vis.img_many(dict_lena)
    print("============训练完毕=============")
def train(**kwargs):
    """Train a pretrained resnet18 on DogCat with verbose debug printing."""
    # opt.parse(kwargs)
    vis = Visualizer()

    # step1: configure model
    print("come step 1")
    # model = getattr(models, opt.model)()
    model = models.resnet18(pretrained=True)
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data
    print("come here step 2")
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size,
                                  shuffle=False, num_workers=opt.num_workers)
    print(train_dataloader)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)

    # step3: criterion and optimizer
    print("come step 3")
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr, weight_decay=opt.weight_decay)

    # step4: meters
    print("come step 4")
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch):
        print("start training")
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in tqdm(enumerate(train_dataloader)):
            print("label is {}".format(label))
            # train model
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            print("score is ", score)
            print("target is", target)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)

        # scheduled decay every lr_decay_epoch epochs
        print("Now learning rate is {}".format(lr))
        if (epoch % opt.lr_decay_epoch == 0):
            lr = lr * opt.lr_decay
            # update in-place so optimizer moment state is preserved
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # FIX: torchvision's resnet18 has no .save() method — checkpoint
        # via torch.save on the state dict instead (model.save() raised
        # AttributeError every epoch).
        t.save(model.state_dict(), './checkpoints/resnet18_{}.pth'.format(epoch))

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        print("end val function")
        print(val_cm.value(), val_accuracy)
        vis.plot('val_accuracy', val_accuracy)
        vis.log("epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}".format(
            epoch=epoch, loss=loss_meter.value()[0], val_cm=str(val_cm.value()),
            train_cm=str(confusion_matrix.value()), lr=lr))

        # loss-driven decay
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        # FIX: previous_loss was never updated, so it stayed at 1e100 and
        # the loss-driven decay above could never behave as intended.
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train the configured classifier on DogCat (binary classification).

    Parses command-line overrides into ``opt``, builds the model and the
    train/val loaders, runs the optimisation loop, validates once per
    epoch, logs everything to visdom, and decays the learning rate when
    the mean epoch loss stops decreasing.
    """
    # update configuration from command-line keyword arguments
    opt.parse(kwargs=kwargs)
    vis = Visualizer(opt.env)

    # 1. model (note: getattr returns the class — it must be instantiated)
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # 2. data
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(dataset=train_data,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(dataset=val_data,
                                batch_size=opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

    # 3. loss function and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(params=model.parameters(),
                             lr=lr,
                             weight_decay=opt.weight_decay)

    # 4. metrics: smoothed (running-mean) loss plus a confusion matrix
    loss_meter = meter.AverageValueMeter()      # mean of all added values
    confusion_matrix = meter.ConfusionMeter(2)  # binary-classification stats
    previous_loss = 1e100

    # training loop
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in enumerate(train_dataloader):
            # train on one batch
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # update metrics and visualise
            loss_meter.add(loss.item())
            confusion_matrix.add(predicted=score.data, target=target)
            # BUG FIX: the option is spelled print_freq everywhere else in
            # this project; opt.print_frep raised AttributeError here.
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot("loss", loss_meter.value()[0])

                # drop into the debugger when the flag file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save()

        # validation metrics and visualisation
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot("val_accuracy", val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))

        # decay the learning rate when the loss stops decreasing
        # (editing param_groups directly keeps optimizer moment state)
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr

        previous_loss = loss_meter.value()[0]
def train_torch(self, datasetTypeCls, learning_rate_value=None, learning_rate_decay=None, num_epochs=5000, early_stop_epochs=5):
    """Train ``self.model`` with a plain PyTorch loop plus early stopping.

    :param datasetTypeCls: dataset class used to build the train/valid splits
    :param learning_rate_value: initial lr; defaults to ``self.learning_rate_value``
    :param learning_rate_decay: lr decay; defaults to ``self.learning_rate_decay``
    :param num_epochs: maximum number of training epochs
    :param early_stop_epochs: stop when val_loss has not improved for this
        many consecutive epochs
    :return:
    """
    # Load dataset
    self.saved_params = []
    if self.pretrained:
        print('Saved model found. Loading...')
        self.load_model()
    # fall back to the instance-level hyper-parameters when not supplied
    if learning_rate_value is None:
        learning_rate_value = self.learning_rate_value
    if learning_rate_decay is None:
        learning_rate_decay = self.learning_rate_decay
    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    # print(self.sinputs)  # ??? unclear what this was for

    # [Xiao]
    vis = Visualizer('xiao-moddrop-v3.0')

    # step 1: setup model
    model = self.model
    # model = model.cuda()

    # step 2: data
    # these dataset classes take (folder, split) only; the legacy classes in
    # the else-branch take the full multimodal parameter list
    if datasetTypeCls in [DatasetOfDamagedMultimodal, DatasetLQAudio, DatasetLQVideoClassifier, DatasetLQVideoFeatureExtractor, DatasetLQSkeleton]:
        # print(f'In basicClassifier.py, df = {df}')
        train_data = datasetTypeCls(self.input_folder, train_valid_test='train')
        val_data = datasetTypeCls(self.input_folder, train_valid_test='valid')
    else:
        train_data = datasetTypeCls(self.input_folder, self.modality, 'train', self.hand_list, self.seq_per_class, self.nclasses, self.input_size, self.step, self.nframes)
        # validation uses a fixed 200 sequences per class
        val_data = datasetTypeCls(self.input_folder, self.modality, 'valid', self.hand_list, 200, self.nclasses, self.input_size, self.step, self.nframes)
    print('Dataset prepared.')
    # self._load_dataset('train')  # ??
    # num_workers should match the logical CPU core count; check with:
    # cat /proc/cpuinfo | grep "processor" | wc -l
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=12)
    val_loader = DataLoader(val_data, batch_size=32, shuffle=False, num_workers=12)
    print('DataLoader prepared.')
    # val_loader = DataLoader(self.val_data, 32)

    # step 3: criterion and optimizer
    self.criterion = nn.CrossEntropyLoss()
    self.lr = 0.02  # 0.001
    # NOTE(review): weight_decay=1-0.9998 (= 0.0002) is an odd spelling and
    # the learning_rate_value/decay parameters resolved above are not used
    # here — confirm whether this hard-coding is intentional.
    self.optimizer = torch.optim.SGD(model.parameters(), lr=self.lr, weight_decay=1-0.9998, nesterov=True, momentum=0.8)

    # visdom: one window for per-iteration loss, one for per-epoch curves
    win = vis.line(
        X=numpy.array([0, 1]),
        Y=numpy.array([0, 1]),
        name="loss"
    )
    win1 = vis.line(
        X=numpy.array([0, 1]),
        Y=numpy.array([0, 1]),
        name="loss_epoch"
    )

    # step 4: go to GPU
    # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # moved to __init__
    model.to(self.device)
    print('Training begin...')
    # for data in train_loader:
    #     print(data[1])
    #     break

    # NOTE(review): best_val_loss is seeded with self.nclasses — presumably
    # an upper bound for the expected loss; verify this heuristic.
    best_val_loss = self.nclasses
    epochs_no_better_val_loss = 0
    for epoch in range(num_epochs):
        print(f'CURRENT EPOCH: {epoch}')
        # In each epoch, we do a full pass over the training data:
        losses = []
        train_loader_len = len(train_loader)
        # NOTE(review): ten_pct is 0 when the loader has fewer than 10
        # batches, making `ii % ten_pct` raise ZeroDivisionError.
        ten_pct = train_loader_len // 10
        for ii, (data, label) in enumerate(train_loader):
            # progress print roughly every 10% of the epoch
            if ii % ten_pct == 0:
                print(f'ii = {ii}, percentage: {ii/train_loader_len}')
            input = data  # NOTE: shadows the builtin `input`
            target = label.to(torch.int64)
            # [Xiao] multimodal inputs need special handling
            if not isinstance(input, dict):
                # a plain tensor can be moved to the GPU directly
                input, target = input.to(self.device), target.to(self.device)
            else:
                # a dict is a multimodal input: move only the label here; the
                # per-modality tensors are moved inside model.forward()
                target = target.to(self.device)
            # print(f'input.shape is : {input.shape}')
            # print(f'target.shape is : {target.shape}')
            # Create a loss expression for training, i.e., a scalar objective we want
            # to minimize (for our multi-class problem, it is the cross-entropy loss):
            self.optimizer.zero_grad()
            score = model(input)
            # print(f'score shape is: {score.shape}')
            loss = self.criterion(score, target)  # score: length-nclasses logits; target: class index
            loss.backward()
            self.optimizer.step()
            losses.append(loss.data)
            # print(f'loss.data is: {loss.data}')
            # if ii % 10 == 0:
            #     vis.line(X=torch.Tensor([ii + epoch*len(train_loader)]), Y=torch.Tensor([loss]), win=win, update='append', name='train_loss')
        print(f'Computation over epoch {epoch} is OK.')

        # validation metrics and early stopping
        val_loss = self.val(model, val_loader)
        # keep the best model seen so far
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            self.save_model()
            epochs_no_better_val_loss = 0
        else:
            # no improvement: stop after early_stop_epochs stagnant epochs
            epochs_no_better_val_loss += 1
            if epochs_no_better_val_loss >= early_stop_epochs:
                break
        # vis.plot('val_loss', val_loss)
        print(f'Validation over epoch {epoch} is OK.')
        vis.line(X=torch.Tensor([epoch]), Y=torch.Tensor([sum(losses) / len(losses)]), win=win1, update='append', name='mean_train_loss_per_epoch')
        vis.line(X=torch.Tensor([epoch]), Y=torch.Tensor([val_loss]), win=win1, update='append', name='val_loss')
        vis.log("[Train Loss] epoch:{epoch},lr:{lr},loss:{loss}".format(
            epoch=epoch, loss=loss.data, lr=self.lr))
        vis.log("[Valid Loss] epoch:{epoch},lr:{lr},loss:{loss}".format(
            epoch=epoch, loss=val_loss, lr=self.lr))
def train(**kwargs):
    """Train the configured model on TextData with visdom monitoring.

    ``**kwargs`` overrides fields of the global ``opt``.  The model is
    either restored from ``opt.load_model_path`` or weight-initialised;
    checkpoints are written every other epoch.
    """
    opt._parse(kwargs)
    vis = Visualizer(opt.env, port=opt.vis_port)

    # step1: configure model — restore weights if a path is given,
    # otherwise apply the custom weight initialisation
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load_new(opt.load_model_path)
    else:
        print('Initialize the model!')
        model.apply(weight_init)
    model.to(opt.device)

    # step2: data
    train_data = TextData(opt.data_root, opt.train_txt_path)
    val_data = TextData(opt.data_root, opt.val_txt_path)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=False, num_workers=opt.num_workers)

    # step3: criterion and optimizer (the model supplies its own optimizer)
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = model.get_optimizer(lr, opt.weight_decay)

    # step4: meters — smoothed loss and a binary confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10

    # train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in tqdm(enumerate(train_dataloader)):
            # train model on one batch
            input = data.to(opt.device)
            target = label.to(opt.device)
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            # gradient-inspection hooks, kept for debugging:
            #for n, p in model.named_parameters():
            #    print(n)
            #    h = p.register_hook(lambda grad: print(grad))
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)
            if ii % opt.print_freq == 0:
                vis.plot('loss', loss_meter.value()[0])
                # enter debug mode when the flag file exists
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
            # show a batch of inputs every print_freq*10 iterations
            if ii % (opt.print_freq * 10) == 0:
                vis.images(input.cpu().numpy(), opts=dict(title='Label', caption='Label'), win=1)
            print('Epoch: {} Iter: {} Loss: {}'.format(epoch, ii, loss))

        # checkpoint every other epoch
        if epoch % 2 == 0:
            model.save('./checkpoints/' + opt.env + '_' + str(epoch) + '.pth')

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0], val_cm=str(val_cm.value()), train_cm=str(confusion_matrix.value()), lr=lr))

        # training accuracy from the confusion-matrix diagonal
        train_cm = confusion_matrix.value()
        t_accuracy = 100. * (train_cm[0][0] + train_cm[1][1]) / (train_cm.sum())
        vis.plot('train_accuracy', t_accuracy)

        # decay lr when the mean epoch loss stops decreasing
        # (editing param_groups directly keeps optimizer moment state)
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train the HRnet demoireing model with gradient accumulation.

    Supports resuming from an ``opt.model_path`` checkpoint (epoch,
    optimizer state and lr are restored), mixes Charbonnier and Sobel-edge
    losses with an epoch-dependent weight, logs loss/PSNR to visdom and a
    text file, and checkpoints every ``opt.save_every`` epochs.
    """
    # init: apply command-line overrides to opt
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)
    if opt.vis:
        vis = Visualizer(opt.env)
        vis_val = Visualizer('valdemoire')

    # dataset
    # NOTE(review): these two transform pipelines are defined but never
    # used below — presumably MoireData applies its own transforms.
    FiveCrop_transforms = transforms.Compose([
        transforms.FiveCrop(256),
        transforms.Lambda(lambda crops: torch.stack(
            [transforms.ToTensor()(crop) for crop in crops]))
    ])
    data_transforms = transforms.Compose([
        # transforms.RandomCrop(256),
        transforms.ToTensor()
    ])
    train_data = MoireData(opt.train_path)
    test_data = MoireData(opt.test_path, is_val=True)
    train_dataloader = DataLoader(train_data,
                                  batch_size=opt.train_batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  drop_last=True)
    test_dataloader = DataLoader(test_data,
                                 batch_size=opt.val_batch_size,
                                 shuffle=True,
                                 num_workers=opt.num_workers,
                                 drop_last=True)

    last_epoch = 0
    # model_init: build the pose net from the yaml config, optionally
    # seeded with pretrained weights
    cfg.merge_from_file("config/cfg.yaml")
    model = get_pose_net(cfg, pretrained=opt.model_path)  # init weight
    model = model.to(opt.device)

    # baseline validation before any training
    if opt.vis:
        val_loss, val_psnr = val(model, test_dataloader, vis_val)
        print(val_loss, val_psnr)
    else:
        val_loss, val_psnr = val(model, test_dataloader)
        print(val_loss, val_psnr)

    criterion_c = L1_Charbonnier_loss()  # pixel reconstruction loss
    criterion_s = L1_Sobel_Loss()        # edge-preserving loss
    lr = opt.lr
    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=lr,
        weight_decay=0.01  #0.005
    )

    # resume: restore epoch counter, optimizer state and lr from checkpoint
    if opt.model_path:
        map_location = lambda storage, loc: storage  # load onto CPU first
        checkpoint = torch.load(opt.model_path, map_location=map_location)
        last_epoch = checkpoint["epoch"]
        optimizer_state = checkpoint["optimizer"]
        optimizer.load_state_dict(optimizer_state)
        lr = checkpoint["lr"]
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    loss_meter = meter.AverageValueMeter()
    psnr_meter = meter.AverageValueMeter()
    previous_loss = 1e100
    accumulation_steps = opt.accumulation_steps

    for epoch in range(opt.max_epoch):
        # skip epochs already completed before the resume point
        if epoch < last_epoch:
            continue
        loss_meter.reset()
        psnr_meter.reset()
        torch.cuda.empty_cache()
        loss_list = []
        for ii, (moires, clear_list) in tqdm(enumerate(train_dataloader)):
            moires = moires.to(opt.device)
            clears = clear_list[0].to(opt.device)
            output_list, edge_output_list = model(moires)
            outputs, edge_X = output_list[0], edge_output_list[0]

            # schedule: shift weight from edge loss to Charbonnier loss
            # as training progresses (epochs <20 keep the configured value)
            if epoch < 20:
                pass
            elif epoch >= 20 and epoch < 40:
                opt.loss_alpha = 0.9
            else:
                opt.loss_alpha = 1.0

            c_loss = criterion_c(outputs, clears)
            s_loss = criterion_s(edge_X, clears)
            loss = opt.loss_alpha * c_loss + (1 - opt.loss_alpha) * s_loss

            # gradient accumulation: scale down, step every N batches
            loss = loss / accumulation_steps
            loss.backward()
            if (ii + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # undo the scaling so the meter records the true batch loss
            loss_meter.add(loss.item() * accumulation_steps)

            # PSNR is computed on the de-normalised images
            moires = tensor2im(moires)
            outputs = tensor2im(outputs)
            clears = tensor2im(clears)
            psnr = colour.utilities.metric_psnr(outputs, clears)
            psnr_meter.add(psnr)

            if opt.vis and (ii + 1) % opt.plot_every == 0:  # plot every plot_every batches
                vis.images(moires, win='moire_image')
                vis.images(outputs, win='output_image')
                vis.text(
                    "current outputs_size:{outputs_size},<br/> outputs:{outputs}<br/>"
                    .format(outputs_size=outputs.shape, outputs=outputs),
                    win="size")
                vis.images(clears, win='clear_image')
                # record the train loss to txt
                vis.plot('train_loss', loss_meter.value()[0])  # meter.value() returns (mean, std)
                vis.log(
                    "epoch:{epoch}, lr:{lr}, train_loss:{loss}, train_psnr:{train_psnr}"
                    .format(epoch=epoch + 1,
                            loss=loss_meter.value()[0],
                            lr=lr,
                            train_psnr=psnr_meter.value()[0]))
            loss_list.append(str(loss_meter.value()[0]))
        torch.cuda.empty_cache()

        # per-epoch validation
        if opt.vis:
            val_loss, val_psnr = val(model, test_dataloader, vis_val)
            vis.plot('val_loss', val_loss)
            vis.log(
                "epoch:{epoch}, average val_loss:{val_loss}, average val_psnr:{val_psnr}"
                .format(epoch=epoch + 1, val_loss=val_loss, val_psnr=val_psnr))
        else:
            val_loss, val_psnr = val(model, test_dataloader)

        # append this epoch's running losses to the log file
        with open(opt.save_prefix + "loss_list.txt", 'a') as f:
            f.write("\nepoch_{}\n".format(epoch + 1))
            f.write('\n'.join(loss_list))

        # checkpoint every save_every epochs (and after the first epoch)
        if (epoch + 1) % opt.save_every == 0 or epoch == 0:
            prefix = opt.save_prefix + 'HRnet_epoch{}_'.format(epoch + 1)
            file_name = time.strftime(prefix + '%m%d_%H_%M_%S.pth')
            checkpoint = {
                'epoch': epoch + 1,
                "optimizer": optimizer.state_dict(),
                "model": model.state_dict(),
                "lr": lr
            }
            torch.save(checkpoint, file_name)

        # decay lr when loss plateaus, and additionally every 10 epochs
        if (loss_meter.value()[0] > previous_loss) or ((epoch + 1) % 10) == 0:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]

    # final checkpoint after the last epoch
    prefix = opt.save_prefix + 'HRnet_final_'
    file_name = time.strftime(prefix + '%m%d_%H_%M_%S.pth')
    checkpoint = {
        'epoch': epoch + 1,
        "optimizer": optimizer.state_dict(),
        "model": model.state_dict(),
        "lr": lr
    }
    torch.save(checkpoint, file_name)
def train(): vis = Visualizer(opt.env, port=opt.vis_port) # step1 : load model model = getattr(models, opt.model)(pretrained=True) # 加载预训练模型,微调或者特征提取 model = init_extract_model(model, 10) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) # step2: data train_data_list = standard_data(opt.train_data_dir, 'train') val_data_list = standard_data(opt.train_data_dir, 'val') train_dataloader = DataLoader(IcvDataset(train_data_list), batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers) val_dataloader = DataLoader(IcvDataset(val_data_list, train=False), batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers) # step3: criterion and optimizer and scheduler criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay) # 每100个epoch 下降 lr=lr*gamma scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=300, gamma=0.1) # step4: define metrics train_losses = AverageMeter() train_top1 = AverageMeter() # step5.1: some parameters for K-fold and restart model start_epoch = 0 best_top1 = 50 # step5.2: restart the training process # PyTorch 保存断点checkpoints 的格式为 .tar文件扩展名格式 if opt.resum_model_dir is not None: checkpoint = torch.load(opt.resum_model_dir) start_epoch = checkpoint["epoch"] best_top1 = checkpoint["best_top1"] optimizer.load_state_dict(checkpoint["optimizer"]) model.load_state_dict(checkpoint["state_dict"]) # 在恢复训练时,需要调用 model.train() 以确保所有网络层处于训练模式 model.train() # step6 : train for epoch in range(start_epoch, opt.max_epoch): # lr 下降 scheduler.step(epoch) lr = get_learning_rate(optimizer) train_losses.reset() train_top1.reset() for iter, (input, target) in enumerate(train_dataloader): input = input.to(device) target = target.to(device) optimizer.zero_grad() # forword output = model(input) loss = criterion(output, target) precious = accuracy(output, target, topk=(1, )) # loss and acc train_losses.update(loss.item(), input.size(0)) 
train_top1.update(precious[0].item(), input.size(0)) # backword loss.backward() optimizer.step() val_loss, val_top1 = val(model, val_dataloader, criterion, device) is_best = val_top1.avg > best_top1 best_top1 = max(val_top1.avg, best_top1) print("epoch : {}/{}".format(epoch, opt.max_epoch)) print("train-->loss:{},acc:{}".format(train_losses.avg, train_top1.avg)) print("val-->loss:{},acc:{}".format(val_loss.avg, val_top1.avg)) vis.plot_many({ 'train_loss': train_losses.avg, 'val_loss': val_loss.avg }) # vis.plot('train_loss', train_losses.avg) # vis.plot('val_accuracy', val_top1.avg) vis.log( "epoch:{epoch},lr:{lr},train_loss:{train_loss},val_loss:{val_loss},train_acc:{train_acc},val_acc:{val_acc}" .format(epoch=epoch, train_loss=train_losses.avg, val_loss=str(val_loss.avg), train_acc=str(train_top1.avg), val_acc=str(val_top1.avg), lr=lr)) if epoch % 10 == 0: save_checkpoint( { "epoch": epoch + 1, "model": opt.model, "state_dict": model.state_dict(), "best_top1": best_top1, "optimizer": optimizer.state_dict(), "val_loss": val_loss.avg, }, opt.save_model_dir, is_best, epoch)
def train(**kwargs):
    """Fine-tune a torchvision ResNet-34 on DogCat (binary classification).

    ``**kwargs`` overrides the global ``opt`` config.  The classifier head
    is replaced with a 2-way linear layer; weights are checkpointed every
    epoch and the lr is decayed when the mean epoch loss stops improving.
    """
    # update configuration from command-line arguments
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    """(1) step1: build the network; load pretrained weights if available"""
    # model = getattr(models,opt.model)()
    model = models.resnet34(pretrained=True)
    model.fc = nn.Linear(512, 2)  # replace the head: 2-class output
    # if opt.load_model_path:
    #     model.load(opt.load_model_path)
    if opt.use_gpu:  # GPU
        model.cuda()

    """(2) step2: data"""
    train_data = DogCat(opt.train_data_root, train=True)   # training split
    val_data = DogCat(opt.train_data_root, train=False)    # validation split
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

    """(3) step3: loss function and optimizer"""
    criterion = t.nn.CrossEntropyLoss()  # cross-entropy loss
    lr = opt.lr  # learning rate
    optimizer = t.optim.SGD(model.parameters(),
                            lr=opt.lr,
                            weight_decay=opt.weight_decay)

    """(4) step4: metrics — smoothed loss and a confusion matrix"""
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10

    """(5) training loop"""
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in enumerate(train_dataloader):
            # BUG FIX: Python-2 `print` statements are SyntaxErrors in this
            # otherwise Python-3 file (it uses loss.item() / f-strings).
            print("ii:", ii)
            # train on one batch
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            # clear gradients
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()  # backprop
            # update parameters
            optimizer.step()

            # update metrics and visualise
            loss_meter.add(loss.item())
            confusion_matrix.add(score.detach(), target.detach())
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        # model.save()
        # NOTE: ':' in the filename is invalid on Windows file systems.
        name = time.strftime('model' + '%m%d_%H:%M:%S.pth')
        t.save(model.state_dict(), 'checkpoints/' + name)

        """validation metrics and visualisation"""
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))
        # BUG FIX: converted from a Python-2 print statement
        print("epoch:", epoch, "loss:", loss_meter.value()[0],
              "accuracy:", val_accuracy)

        """decay the learning rate when the loss stops decreasing"""
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr

        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a DogCat classifier, persisting running stats via save_dict.

    Loss history, learning rate, confusion matrices and checkpoint names
    are written to ``opt.pars_path`` through ``save_dict`` so a later run
    can resume with the previous loss baseline.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # step1: configure model
    model = getattr(models, opt.model)()
    if os.path.exists(opt.load_model_path):
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # restore the persisted stats dict (loss history etc.) if present
    if os.path.exists(opt.pars_path):
        dic = load_dict(opt.pars_path)
        previous_loss = dic['loss'][-1] if 'loss' in dic.keys() else 1e100
    else:
        dic = {}
        # BUG FIX: previous_loss was unbound on this branch and raised
        # NameError at the plateau check below.
        previous_loss = 1e100

    # step2: data
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=opt.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)

    # train — NOTE(review): the start epoch is hard-coded to 5, apparently
    # to continue a previous run; confirm this is still intended.
    for epoch in range(5, opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in tqdm(enumerate(train_dataloader),
                                      total=len(train_dataloader)):
            # train on one batch
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.data.item())
            confusion_matrix.add(score.data, target.data)
            if ii % opt.print_freq == opt.print_freq - 1:
                # persist the running loss and plot it
                dic = save_dict(opt.pars_path, dic, loss_data=loss_meter.value()[0])
                vis.plot('loss', dic['loss_data'])
                name = model.save()
                if os.path.exists(opt.debug_file):
                    import ipdb
                    # BUG FIX: was ipdb.set_trave() — AttributeError
                    ipdb.set_trace()

        name = model.save()

        # update learning: reduce learning rate when loss no longer decreases
        # (editing param_groups directly keeps optimizer moment state)
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]

        # persist per-epoch statistics
        dic = save_dict(opt.pars_path, dic,
                        name=name,
                        epoch=epoch,
                        lr=lr,
                        loss=loss_meter.value()[0],
                        train_cm=confusion_matrix.value())

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        dic = save_dict(opt.pars_path, dic,
                        val_accuracy=val_accuracy,
                        val_cm=val_cm.value())
        vis.log(dic)
def train(**kwargs):
    """Train a document-pair matching model on the DocumentPair dataset.

    The validation set reuses the vocabulary built from the training set.
    NOTE(review): this block uses the pre-0.4 PyTorch API
    (``loss.data[0]``); on modern PyTorch it must be ``loss.item()``.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # step1: configure model
    model = getattr(models, opt.model)(opt)
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data — lines are comma-separated; val shares the train vocab
    train_data = DocumentPair(opt.train_data_root,
                              doc_type='train',
                              suffix='txt',
                              load=lambda x: x.strip().split(','))
    train_data.initialize(vocab_size=opt.vocab_size)
    val_data = DocumentPair(opt.validate_data_root,
                            doc_type='validate',
                            suffix='txt',
                            load=lambda x: x.strip().split(','),
                            vocab=train_data.vocab)
    val_data.initialize()
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=False,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=opt.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, batch in enumerate(train_dataloader):
            # convert the raw batch into padded numpy arrays using the vocab
            data_left, data_right, label, num_pos = load_data(
                batch, opt, train_data.vocab)
            # train model
            input_data_left, input_data_right = Variable(
                t.from_numpy(data_left)), Variable(t.from_numpy(data_right))
            target = Variable(t.from_numpy(label))
            if opt.use_gpu:
                input_data_left, input_data_right = input_data_left.cuda(
                ), input_data_right.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            # the model consumes the pair as a tuple and also returns
            # hard predictions alongside the raw scores
            scores, predictions = model((input_data_left, input_data_right))
            # target is one-hot; max(1)[1] recovers the class index
            loss = criterion(scores, target.max(1)[1])
            loss.backward()
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.data[0])
            confusion_matrix.add(predictions.data, target.max(1)[1].data)
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])

                # enter debug mode when the flag file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save()

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))

        # update learning rate when the loss stops decreasing
        # (editing param_groups directly keeps optimizer moment state)
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a DogCat classifier with verbose per-batch loss/accuracy prints.

    ``**kwargs`` overrides the global ``opt``.  Weights are saved every
    epoch to ``checkpoints/<model>_a<epoch>.pth`` and the lr decays when
    the mean epoch loss stops improving.
    """
    opt._parse(kwargs)
    vis = Visualizer(opt.env,port = opt.vis_port)

    # step1: configure model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    model.to(opt.device)

    # step2: data
    train_data = DogCat(opt.train_data_root,train=True)
    val_data = DogCat(opt.train_data_root,train=False)
    train_dataloader = DataLoader(train_data,opt.batch_size,
                        shuffle=True,num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data,opt.batch_size,
                        shuffle=False,num_workers=opt.num_workers)

    # step3: criterion and optimizer (the model supplies its own optimizer)
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = model.get_optimizer(lr, opt.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10

    # train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        train_loss = 0.  # summed batch losses for the epoch summary
        train_acc = 0.   # summed correct predictions for the epoch summary
        i = 0            # NOTE(review): duplicates the enumerate index ii
        for ii,(data,label) in tqdm(enumerate(train_dataloader)):

            # train model on one batch
            input = data.to(opt.device)
            target = label.to(opt.device)

            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score,target)
            train_loss += loss.item()
            # per-batch accuracy from the argmax prediction
            pred = t.max(score, 1)[1]
            train_correct = (pred==target).sum()
            train_acc += train_correct.item()
            print('epoch ', epoch, ' batch ', i)
            i+=1
            print('Train Loss: %f, Acc: %f' % (loss.item(), train_correct.item() / float(len(data))))
            loss.backward()
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.item())
            # detach() keeps the meters out of the autograd graph
            confusion_matrix.add(score.detach(), target.detach())

            if (ii + 1)%opt.print_freq == 0:
                vis.plot('loss', loss_meter.value()[0])

                # enter debug mode when the flag file exists
                if os.path.exists(opt.debug_file):
                    import ipdb;
                    ipdb.set_trace()

        # epoch summary: mean loss / accuracy over the whole training set
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(train_loss / (len(
            train_data)), train_acc / (len(train_data))))
        # model.save()
        prefix = 'checkpoints/' + opt.model + '_a'+str(epoch)+'.pth'
        t.save(model.state_dict(), prefix)

        # validate and visualize
        val_cm,val_accuracy = val(model,val_dataloader, criterion, val_data)
        vis.plot('val_accuracy',val_accuracy)
        vis.log("epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}".format(
                    epoch = epoch,loss = loss_meter.value()[0],val_cm = str(val_cm.value()),train_cm=str(confusion_matrix.value()),lr=lr))

        # update learning rate when the loss stops decreasing
        # (editing param_groups directly keeps optimizer moment state)
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]
def train():
    """Train a single-channel UNet for binary mask segmentation.

    Optionally resumes from a checkpoint (remapping DataParallel
    ``module.`` keys onto a single model), trains with BCE (+ optional
    dice) loss, validates with the dice metric each epoch, and evaluates
    test-set recall every 10 epochs.  Progress goes to visdom.
    """
    t.cuda.set_device(1)
    # n_channels: medical images are single-channel greyscale; n_classes: binary
    net = UNet(n_channels=1, n_classes=1)
    optimizer = t.optim.SGD(net.parameters(),
                            lr=opt.learning_rate,
                            momentum=0.9,
                            weight_decay=0.0005)
    # binary cross-entropy (suits masks covering a large image area)
    criterion = t.nn.BCELoss()

    start_epoch = 0
    if opt.load_model_path:
        checkpoint = t.load(opt.load_model_path)
        # remap multi-GPU (DataParallel) weights onto a single model
        state_dict = checkpoint['net']
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove `module.`
            new_state_dict[name] = v
        net.load_state_dict(new_state_dict)                 # restore weights
        optimizer.load_state_dict(checkpoint['optimizer'])  # restore optimizer
        start_epoch = checkpoint['epoch']                   # restore epoch

    # lr is multiplied by gamma each time a milestone epoch is reached
    if start_epoch == 0:
        scheduler = t.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=opt.milestones,
                                                     gamma=0.1,
                                                     last_epoch=-1)  # -1 = fresh schedule
        print('从头训练 ,学习率为{}'.format(optimizer.param_groups[0]['lr']))
    else:
        scheduler = t.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=opt.milestones,
                                                     gamma=0.1,
                                                     last_epoch=start_epoch)
        print('加载预训练模型{}并从{}轮开始训练,学习率为{}'.format(
            opt.load_model_path, start_epoch,
            optimizer.param_groups[0]['lr']))

    # move the network to the GPU(s)
    if opt.use_gpu:
        net = t.nn.DataParallel(net, device_ids=opt.device_ids)
        net.cuda()
        cudnn.benchmark = True

    vis = Visualizer(opt.env)

    train_data = NodeDataSet(train=True)
    val_data = NodeDataSet(val=True)
    test_data = NodeDataSet(test=True)

    # dataset loaders
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=True,
                                num_workers=opt.num_workers)
    test_dataloader = DataLoader(test_data,
                                 opt.test_batch_size,
                                 shuffle=False,
                                 num_workers=opt.num_workers)

    for epoch in range(opt.max_epoch - start_epoch):
        print('开始 epoch {}/{}.'.format(start_epoch + epoch + 1, opt.max_epoch))
        epoch_loss = 0

        # advance the lr schedule once per epoch
        scheduler.step()

        # iterate the loader directly (pytorch>=0.4: no Variable wrapping)
        for ii, (img, mask) in enumerate(train_dataloader):
            if opt.use_gpu:
                img = img.cuda()
                true_masks = mask.cuda()
            masks_pred = net(img)
            # squash logits through sigmoid
            masks_probs = t.sigmoid(masks_pred)
            # loss = binary cross-entropy (+ optional dice loss)
            loss = criterion(masks_probs.view(-1), true_masks.view(-1))
            if opt.use_dice_loss:
                loss += dice_loss(masks_probs, true_masks)
            epoch_loss += loss.item()
            if ii % 2 == 0:
                vis.plot('训练集loss', loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # log the state at the end of the epoch
        vis.log("epoch:{epoch},lr:{lr},loss:{loss}".format(
            epoch=epoch,
            loss=loss.item(),
            lr=optimizer.param_groups[0]['lr']))
        # BUG FIX: dividing by the last batch index `ii` both skewed the
        # mean and raised ZeroDivisionError on a single-batch loader.
        vis.plot('每轮epoch的loss均值', epoch_loss / len(train_dataloader))

        # checkpoint model / optimizer / epoch
        state = {
            'net': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            # BUG FIX: store the absolute epoch so a resumed run continues
            # from the right place (the loop offsets by start_epoch).
            'epoch': start_epoch + epoch + 1
        }
        t.save(state, opt.checkpoint_root + '{}_unet.pth'.format(epoch))

        # ============ validation ===================
        net.eval()
        # dice coefficient measures the similarity of two sets
        tot = 0
        for jj, (img_val, mask_val) in enumerate(val_dataloader):
            img_val = img_val
            true_mask_val = mask_val
            if opt.use_gpu:
                img_val = img_val.cuda()
                true_mask_val = true_mask_val.cuda()
            mask_pred = net(img_val)
            mask_pred = (t.sigmoid(mask_pred) > 0.5).float()  # binarise at 0.5
            tot += dice_loss(mask_pred, true_mask_val).item()
        # BUG FIX: average over the number of batches, not the last index jj
        val_dice = tot / len(val_dataloader)
        vis.plot('验证集 Dice损失', val_dice)

        # ============ test-set recall ===================
        # evaluate test-set recall every 10 epochs
        if epoch % 10 == 0:
            result_test = []
            for kk, (img_test, mask_test) in enumerate(test_dataloader):
                # probing raw segmentation quality: ground-truth masks unused
                if opt.use_gpu:
                    img_test = img_test.cuda()
                mask_pred_test = net(img_test)  # [1,1,512,512]
                probs = t.sigmoid(mask_pred_test).squeeze().squeeze().cpu(
                ).detach().numpy()  # [512,512]
                mask = probs > opt.out_threshold
                result_test.append(mask)
            # collect all predicted test masks and compute 2-D recall
            vis.plot('测试集二维召回率', getRecall(result_test).getResult())

        # restore training mode for the next epoch (eval() was set above)
        net.train()
def train(**kwargs):
    """Fit the configured classifier on DogCat and report progress to visdom.

    Keyword arguments override fields of the global ``opt`` config.  Each
    epoch runs one pass over the training split, checkpoints the model,
    evaluates on the validation split, and decays the learning rate once
    the mean training loss stops improving.
    """
    opt._parse(kwargs)
    vis = Visualizer(opt.env, port=opt.vis_port)

    # --- model --------------------------------------------------------
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    model.to(opt.device)

    # --- data ---------------------------------------------------------
    train_dataloader = DataLoader(DogCat(opt.train_data_root, train=True),
                                  opt.batch_size, shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(DogCat(opt.train_data_root, train=False),
                                opt.batch_size, shuffle=False,
                                num_workers=opt.num_workers)

    # --- loss / optimiser ---------------------------------------------
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = model.get_optimizer(lr, opt.weight_decay)

    # --- meters -------------------------------------------------------
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()

        for batch_idx, (images, labels) in tqdm(enumerate(train_dataloader)):
            inputs = images.to(opt.device)
            targets = labels.to(opt.device)

            optimizer.zero_grad()
            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            batch_loss.backward()
            optimizer.step()

            # bookkeeping: smoothed loss plus confusion counts
            loss_meter.add(batch_loss.item())
            # detach() keeps the meter out of the autograd graph
            confusion_matrix.add(logits.detach(), targets.detach())

            if (batch_idx + 1) % opt.print_freq == 0:
                vis.plot('loss', loss_meter.value()[0])
                # touch opt.debug_file on disk to drop into the debugger
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save()

        # per-epoch validation
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log("epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}".format(
            epoch=epoch, loss=loss_meter.value()[0],
            val_cm=str(val_cm.value()),
            train_cm=str(confusion_matrix.value()), lr=lr))

        # plateau-style decay; editing param_groups directly keeps the
        # optimiser's moment state intact
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train the configured classifier on DogCat and validate each epoch.

    Fixes relative to the previous revision:
      * train set was built from ``opt.load_model_path`` instead of
        ``opt.train_data_root``;
      * the validation loader overwrote ``train_dataloader`` and referenced
        an undefined ``test_data`` — ``val_dataloader`` was never defined
        even though ``val()`` uses it;
      * ``t.optim.Adam(model)`` is not a valid constructor call — it needs
        ``model.parameters()`` plus lr / weight_decay;
      * ``optimizer.stop()`` -> ``optimizer.step()``;
      * stray ``confusion_matrix.add(loss.data[0])`` call removed;
      * ``os.path.exist`` -> ``os.path.exists``;
      * ``loss.data[0]`` (removed in modern PyTorch) -> ``loss.item()``.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # data: both splits share one root, selected by the train flag
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size,
                                  shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)

    # objective and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr,
                             weight_decay=opt.weight_decay)

    # smoothed loss statistics and a 2-class confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()

        for ii, (data, label) in tqdm(enumerate(train_dataloader)):
            # train on one mini-batch
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()

            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # update running statistics and visualisation
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)

            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # drop into the debugger when the sentinel file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save()

        # metrics on the validation split
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            'epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}'
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()), lr=lr))

        # decay lr when the epoch-mean loss stops improving; mutating
        # param_groups in place preserves optimizer momentum state
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a 40-output CGH regression model with MSE loss.

    Fixes relative to the previous revision:
      * the batch loop unpacked ``(data, label)`` but then used the
        never-assigned names ``input`` / ``target`` (NameError on the first
        iteration, and ``input.cuda()`` ran before ``input`` existed);
      * ``loss.data[0]`` (removed in modern PyTorch) -> ``loss.item()``;
      * ``vak_cm`` typo -> ``val_cm``.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # model: class looked up by name; 40 is the output dimension
    Model = getattr(models, opt.model)
    model = Model(40)
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    train_data = CGHData(opt.train_data_root, train=True)
    val_data = CGHData(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size,
                                  shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)

    criterion = nn.MSELoss()
    lr = opt.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=opt.weight_decay)

    loss_meter = meter.AverageValueMeter()
    previous_loss = 1e100

    for epoch in range(opt.max_epoch):
        loss_meter.reset()

        for k, (data, label) in enumerate(train_dataloader):
            # fix: bind the batch to input/target (previously undefined)
            input = data
            target = label
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()

            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            loss_meter.add(loss.item())
            if k % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])

        model.save()

        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log("epoch:{epoch},lr:{lr},loss:{loss}".format(
            epoch=epoch, loss=loss_meter.value()[0], lr=lr))

        # decay lr when the epoch-mean loss stops improving; mutating
        # param_groups in place preserves optimizer momentum state
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a DogCat classifier, evaluating val AND test at the top of
    each epoch (so the final ``epoch == opt.max_epoch`` pass only evaluates
    and returns without training).

    Fixes relative to the previous revision:
      * ``tqdm(..., total=len(train_data) / opt.batch_size)`` passed a float
        that undercounts the last partial batch — use ``len(train_dataloader)``;
      * the two duplicate checkpoint-save conditions are merged into one.
    """
    # opt.parse(kwargs)
    vis = Visualizer(opt.env)
    savingData = []

    # step1: configure model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    test_data = DogCat(opt.test_data_root, test=True)
    train_dataloader = DataLoader(train_data, opt.batch_size,
                                  shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)
    test_dataloader = DataLoader(test_data, opt.batch_size,
                                 shuffle=False, num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr,
                             weight_decay=opt.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch + 1):
        # validate and visualize
        # NOTE(review): on epoch 0 loss_meter holds no samples yet, so the
        # first logged loss is NaN — behaviour preserved from the original.
        val_cm, val_accuracy = val(model, val_dataloader)
        test_cm, test_accuracy = val(model, test_dataloader)
        vis.plot('test_accuracy', test_accuracy)
        vis.plot('lr', lr)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm},test_cm:{test_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    test_cm=str(test_cm.value()), lr=lr))
        print("epoch = ", epoch, " loss = ", loss_meter.value()[0],
              " lr = ", lr)
        batch_results = [(epoch, loss_meter.value()[0], lr,
                          str(val_cm.value()), str(confusion_matrix.value()),
                          str(test_cm.value()), val_accuracy, test_accuracy)]
        # kept disabled, as in the original revision:
        # savingData += batch_results
        # save_training_data(savingData, opt.traingData_file)

        # update learning rate when the epoch-mean loss stopped improving;
        # mutating param_groups in place keeps optimizer momentum state
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # the extra final iteration only evaluates — stop before training
        if epoch == opt.max_epoch:
            return

        previous_loss = loss_meter.value()[0]
        loss_meter.reset()
        confusion_matrix.reset()

        # fix: total must be the number of batches; the old float expression
        # len(train_data) / opt.batch_size undercounted the last batch
        for ii, (data, label) in tqdm(enumerate(train_dataloader),
                                      total=len(train_dataloader)):
            # train model
            input = data
            target = label
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()

            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # drop into the debugger when the sentinel file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        # checkpoint on the first epoch and then every 10th epoch
        prefix = 'checkpoints/'
        name = time.strftime(prefix + '%m%d_%H:%M:%S_' + str(epoch + 1) + '.pth')
        if epoch == 0 or np.mod(epoch + 1, 10) == 0:
            model.save(name)
def train(**kwargs):
    """Train a seizure (Ictal) classifier and report per-epoch validation
    and training accuracy.

    Command-line overrides in ``kwargs`` are folded into ``opt`` first.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # (1) network: instantiate by configured name (pretrained weights would
    # be loaded here as well)
    model = getattr(models, opt.model)()

    # (2) data: train / validation splits share one root
    trn_set = Ictal(opt.train_data_root, opt.model, train=True)
    vld_set = Ictal(opt.train_data_root, opt.model, train=False)
    trn_loader = DataLoader(trn_set, opt.batch_size,
                            shuffle=True, num_workers=opt.num_workers)
    vld_loader = DataLoader(vld_set, opt.batch_size,
                            shuffle=False, num_workers=opt.num_workers)

    # (3) loss function and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.SGD(model.parameters(), lr=opt.lr,
                            weight_decay=opt.weight_decay)

    # (4) smoothed-loss meter and a 2-class confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10
    start = time.time()

    # (5) training loop
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()

        for step, (data, label) in enumerate(trn_loader):
            batch = Variable(data)
            # the 1-D CNN expects channels-first input: (N, C, L)
            if opt.model == 'CNN_1d':
                batch = batch.permute(0, 2, 1)
            truth = Variable(label)

            optimizer.zero_grad()
            score = model(batch)
            loss = criterion(score, truth)
            loss.backward()
            optimizer.step()

            # running statistics; detach keeps the graph out of the meters
            loss_meter.add(loss.item())
            confusion_matrix.add(score.detach(), truth.detach())

            if step % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # drop into the debugger when the sentinel file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save(epoch)

        # validation metrics and visualisation
        val_cm, val_accuracy = val(model, vld_loader, opt.model)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()), lr=lr))
        tra_cm, tra_accuracy = val(model, trn_loader, opt.model)
        print("epoch:", epoch, "loss:", loss_meter.value()[0], "val_accuracy:",
              val_accuracy, "tra_accuracy:", tra_accuracy)

        # decay the learning rate once the epoch-mean loss stops falling
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr

        previous_loss = loss_meter.value()[0]

    end = time.time()
    print(end - start)
def train(args, config):
    """Train an MNIST classifier with per-epoch validation and early stopping.

    Args:
        args: runtime options (model name, ckpts_dir, use_gpu, load_model_path).
        config: hyper-parameters and data paths (lr, batch_size, epoch, ...).

    Fixes relative to the previous revision:
      * ``time.clock()`` was removed in Python 3.8 — use ``perf_counter()``;
      * the early-stop counter incremented even right after a new best
        (patience was effectively off by one) — now only non-improving
        epochs advance it;
      * validation now runs under ``no_grad`` so no autograd graph is built.
    """
    vis = Visualizer()

    train_set = MNIST(data_path=config.train_data_path,
                      label_path=config.train_label_path,
                      config=config, mode='train')
    valid_set = MNIST(data_path=config.train_data_path,
                      label_path=config.train_label_path,
                      config=config, mode='valid')
    train_dataloader = DataLoader(train_set, config.batch_size,
                                  shuffle=True, num_workers=config.num_workers)
    valid_dataloader = DataLoader(valid_set, config.batch_size,
                                  shuffle=False, num_workers=config.num_workers)

    model = getattr(network, args.model)().eval()
    if args.load_model_path:
        model.load(args.load_model_path)
    if args.use_gpu:
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.lr)

    train_loss_meter = meter.AverageValueMeter()
    valid_loss_meter = meter.AverageValueMeter()
    train_confusion_matrix = meter.ConfusionMeter(10)
    valid_confusion_matrix = meter.ConfusionMeter(10)

    best_valid_loss = 1e5
    best_epoch = 0
    dist_to_best = 0  # epochs since the last validation-loss improvement
    # fix: time.clock() no longer exists on Python >= 3.8
    time_begin = time.perf_counter()

    for epoch in range(config.epoch):
        # ---- train ----
        model.train()
        train_loss_meter.reset()
        train_confusion_matrix.reset()
        for _iter, (train_data, train_target) in enumerate(train_dataloader):
            if args.use_gpu:
                train_data = train_data.cuda()
                train_target = train_target.cuda()
            optimizer.zero_grad()
            train_logits, train_output = model(train_data)
            train_loss = criterion(train_logits, train_target)
            train_loss.backward()
            optimizer.step()
            train_loss_meter.add(train_loss.item())
            train_confusion_matrix.add(train_logits.data, train_target.data)
            if _iter % config.print_freq == 0:
                vis.plot('train_loss', train_loss_meter.value()[0])
        model.save(path=os.path.join(args.ckpts_dir,
                                     'model_{0}.pth'.format(str(epoch))))

        # ---- valid ----
        model.eval()
        valid_loss_meter.reset()
        valid_confusion_matrix.reset()
        with t.no_grad():  # inference only — skip autograd bookkeeping
            for _iter, (valid_data, valid_target) in enumerate(valid_dataloader):
                if args.use_gpu:
                    valid_data = valid_data.cuda()
                    valid_target = valid_target.cuda()
                valid_logits, valid_output = model(valid_data)
                valid_loss = criterion(valid_logits, valid_target)
                valid_loss_meter.add(valid_loss.item())
                valid_confusion_matrix.add(valid_logits.detach().squeeze(),
                                           valid_target.type(t.LongTensor))

        # accuracy = trace / total of the 10x10 confusion matrix
        valid_cm = valid_confusion_matrix.value()
        valid_accuracy = 100. * (valid_cm.diagonal().sum()) / (valid_cm.sum())
        vis.plot('valid_accuracy', valid_accuracy)
        vis.log(
            "epoch:{epoch}, train_loss:{train_loss}, train_cm:{train_cm}, valid_loss:{valid_loss}, valid_cm:{valid_cm}, valid_accuracy:{valid_accuracy}"
            .format(epoch=epoch, train_loss=train_loss_meter.value()[0],
                    train_cm=str(train_confusion_matrix.value()),
                    valid_loss=valid_loss_meter.value()[0],
                    valid_cm=str(valid_cm), valid_accuracy=valid_accuracy))
        print(
            "epoch:{epoch}, train_loss:{train_loss}, valid_loss:{valid_loss}, valid_accuracy:{valid_accuracy}"
            .format(epoch=epoch, train_loss=train_loss_meter.value()[0],
                    valid_loss=valid_loss_meter.value()[0],
                    valid_accuracy=valid_accuracy))
        print("train_cm:\n{train_cm}\n\nvalid_cm:\n{valid_cm}".format(
            train_cm=str(train_confusion_matrix.value()),
            valid_cm=str(valid_cm),
        ))

        # early stop — fix: only advance the counter on non-improving epochs
        if valid_loss_meter.value()[0] < best_valid_loss:
            best_epoch = epoch
            best_valid_loss = valid_loss_meter.value()[0]
            dist_to_best = 0
        else:
            dist_to_best += 1
        if dist_to_best > 4:
            break

    model.save(path=os.path.join(args.ckpts_dir, 'model.pth'))
    vis.save()
    print("save model successfully")
    print("best epoch: ", best_epoch)
    print("best valid loss: ", best_valid_loss)
    time_end = time.perf_counter()
    print('time cost: %.2f' % (time_end - time_begin))
def train(**kwargs):
    """Train the configured classifier on DogCat, validating every epoch.

    Fix relative to the previous revision: the per-epoch progress message
    read ``"trian epoch: "`` — typo corrected to ``"train epoch: "``.
    The long-dead commented-out ipdb debug block was removed.
    """
    opt._parse(kwargs)
    vis = Visualizer(opt.env, port=opt.vis_port)

    # step1: configure model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    model.to(opt.device)

    # step2: data — both splits come from the same root via the train flag
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size,
                                  shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)

    # step3: criterion and optimizer (the model supplies its own optimizer)
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = model.get_optimizer(lr, opt.weight_decay)

    # step4: meters — epoch-mean loss and a 2-class confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10

    # train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        print("train epoch: ", epoch)  # fix: was "trian epoch: "

        for ii, (data, label) in tqdm(enumerate(train_dataloader)):
            # train model
            input = data.to(opt.device)
            target = label.to(opt.device)

            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update and visualize; detach keeps the graph out of them
            loss_meter.add(loss.item())
            confusion_matrix.add(score.detach(), target.detach())

            if (ii + 1) % opt.print_freq == 0:
                vis.plot('loss', loss_meter.value()[0])

        model.save()

        # validate and visualize
        print("start eval:")
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()), lr=lr))

        # update learning rate; mutating param_groups in place preserves
        # optimizer momentum state
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            vis.plot('lr', lr)

        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train a DogCat classifier (2-class) and validate once per epoch.

    Fix relative to the previous revision: ``loss.data[0]`` was removed in
    PyTorch >= 0.5 (deprecated since 0.4) and raises at runtime — replaced
    with ``loss.item()``. Original Chinese comments translated to English.
    """
    # configuration would be updated from command-line kwargs here
    vis = Visualizer(opt.env)

    # step1: configure model — look the class up by name in `models`
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data — both splits come from the same root via the train flag
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(
        train_data,
        opt.batch_size,
        # shuffle the training samples each epoch
        shuffle=True,
        num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)

    # step3: criterion and optimizer — cross entropy for classification, Adam
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr,
                             weight_decay=opt.weight_decay)

    # step4: meters — epoch-mean loss and a confusion matrix (2 = two classes)
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch):
        # clear meter and confusion-matrix state from the previous epoch
        loss_meter.reset()
        confusion_matrix.reset()

        for ii, (data, label) in enumerate(train_dataloader):
            # train model parameters on one mini-batch
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()

            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update and visualize
            # fix: loss.data[0] no longer works on modern PyTorch
            loss_meter.add(loss.item())
            # the confusion matrix gives a more detailed picture than accuracy
            confusion_matrix.add(score.data, target.data)

            # plot the smoothed loss every print_freq iterations
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # drop into the debugger when the sentinel file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        # checkpoint the model
        model.save()

        # validate and visualize: val_cm is the validation confusion matrix
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()), lr=lr))

        # update learning rate when the loss stops falling; mutating
        # param_groups in place preserves optimizer momentum state
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]