def train(x_train, x_train_external, y_train):
    """Train a ResNet34 multi-label classifier and return the trained model.

    Args:
        x_train: training ECG signals.
        x_train_external: auxiliary (external-feature) training inputs
            forwarded to ``train_epoch``.
        y_train: multi-hot label matrix of shape (n_samples, n_classes).

    Returns:
        The trained model (moved to ``device``).
    """
    # model -- the number of output classes is inferred from the labels
    num_class = np.shape(y_train)[1]
    model = ResNet34(num_classes=num_class)
    model = model.to(device)
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # Per-class loss weights: rarer classes (smaller positive counts) get
    # larger weights; +1 inside the log keeps zero-count classes finite.
    wc = y_train.sum(axis=0)
    wc = 1. / (np.log(wc + 1) + 1)
    w = torch.tensor(wc, dtype=torch.float).to(device)
    criterion1 = utils.WeightedMultilabel(w)
    lr = config.lr
    start_epoch = 1
    stage = 1
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        train_loss, train_auc = train_epoch(model, optimizer, criterion1,
                                            x_train, x_train_external,
                                            y_train)
        # Step the learning-rate schedule at the configured stage epochs.
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            utils.adjust_learning_rate(optimizer, lr)
    return model
def train(x_train, x_val, x_train_external, x_val_external, y_train, y_val,
          num_class):
    """Train an ECGNet(BasicBlock, [3, 4, 6, 3]) multi-label classifier.

    Args:
        x_train, x_val: training / validation ECG signals.
        x_train_external, x_val_external: auxiliary external features.
        y_train, y_val: multi-hot label matrices of shape (n, num_class).
        num_class: number of output classes.

    Returns:
        The trained model (moved to ``device``).
    """
    # model
    model = ECGNet(BasicBlock, [3, 4, 6, 3], num_classes=num_class)
    model = model.to(device)
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # Per-class loss weights: rarer classes get larger weights.  np.maximum
    # guards against log(0) = -inf when a class has no positive samples;
    # counts >= 1 yield exactly the same weights as before.
    wc = y_train.sum(axis=0)
    wc = 1. / (np.log(np.maximum(wc, 1)) + 1)
    w = torch.tensor(wc, dtype=torch.float).to(device)
    criterion1 = utils.WeightedMultilabel(w)  # weighted loss for training
    criterion2 = nn.BCEWithLogitsLoss()  # unweighted loss for validation
    lr = config.lr
    start_epoch = 1
    stage = 1
    # =========> start training <=========
    print("*" * 10, "step into stage %02d lr %.5f" % (stage, lr))
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_auc = train_epoch(model, optimizer, criterion1,
                                            x_train, x_train_external,
                                            y_train, num_class)
        val_loss, val_auc = val_epoch(model, criterion2, x_val,
                                      x_val_external, y_val, num_class)
        print(
            '#epoch:%02d stage:%d train_loss:%.4f train_auc:%.4f val_loss:%.4f val_auc:%.4f time:%s'
            % (epoch, stage, train_loss, train_auc, val_loss, val_auc,
               utils.print_time_cost(since)))
        # Step the learning-rate schedule at the configured stage epochs.
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            print("*" * 10, "step into stage %02d lr %.5f" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
    return model
def train(x_train, x_val, x_train_external, x_val_external, y_train, y_val,
          num_class):
    """Train an ECGNet(BasicBlock1d, BasicBlock2d) multi-label classifier.

    Args:
        x_train, x_val: training / validation ECG signals.
        x_train_external, x_val_external: auxiliary external features.
        y_train, y_val: multi-hot label matrices of shape (n, num_class).
        num_class: number of output classes.

    Returns:
        The trained model (moved to ``device``).
    """
    # model
    model = ECGNet(BasicBlock1d, BasicBlock2d, num_classes=num_class)
    model = model.to(device)
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # Per-class loss weights: rarer classes get larger weights.  np.maximum
    # guards against log(0) = -inf when a class has no positive samples;
    # counts >= 1 yield exactly the same weights as before.
    wc = y_train.sum(axis=0)
    wc = 1. / (np.log(np.maximum(wc, 1)) + 1)
    w = torch.tensor(wc, dtype=torch.float).to(device)
    criterion1 = utils.WeightedMultilabel(w)  # weighted loss for training
    criterion2 = nn.BCEWithLogitsLoss()  # unweighted loss for validation
    lr = config.lr
    start_epoch = 1
    stage = 1
    # =========> start training <=========
    print("*" * 10, "step into stage %02d lr %.5f" % (stage, lr))
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_auc = train_epoch(model, optimizer, criterion1,
                                            x_train, x_train_external,
                                            y_train, num_class)
        val_loss, val_auc = val_epoch(model, criterion2, x_val,
                                      x_val_external, y_val, num_class)
        print(
            '#epoch:%02d stage:%d train_loss:%.4f train_auc:%.4f val_loss:%.4f val_auc:%.4f time:%s'
            % (epoch, stage, train_loss, train_auc, val_loss, val_auc,
               utils.print_time_cost(since)))
        # Step the learning-rate schedule at the configured stage epochs.
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            print("*" * 10, "step into stage %02d lr %.5f" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
    return model
def val(mode, ckpt):
    """Evaluate a saved checkpoint on the split selected by *mode*.

    Loads the model weights from *ckpt*, runs one validation pass, prints
    the aggregate metrics, and writes a per-class precision/recall/F1
    report to ``../user_data/<mode>_f1.csv``.
    """
    net = getattr(resnet, config.model_name)(num_classes=config.num_classes,
                                             input_dim=config.input_dim)
    net.load_state_dict(torch.load(ckpt, map_location='cpu')['state_dict'])
    net = net.to(device)
    dataset = ECGDataset(data_path=config.train_data, mode=mode)
    criterion = utils.WeightedMultilabel(config.groups, dataset.count, device)
    loader = DataLoader(dataset, batch_size=config.batch_size, num_workers=6)
    loss, precision, recall, f1, report = val_epoch(net, criterion, loader,
                                                    False)
    print('val_loss:%0.3e val_precision:%.4f val_recall:%.4f val_f1:%.4f\n' %
          (loss, precision, recall, f1))
    # Replace class indices with human-readable names before exporting.
    report['arry'] = report['arry'].map(dataset.idx2name)
    report.to_csv('../user_data/%s_f1.csv' % mode, encoding='gbk')
    display.display(report)
def train(args):
    """Train a classifier (plain backbone or ResMlp fusion model).

    Args:
        args: parsed CLI namespace; uses ``ckpt`` (pretrained weights or
            resume directory), ``resume``, ``output`` (save root) and ``ex``
            (experiment-name suffix).

    Side effects: writes checkpoints under ``args.output`` and TensorBoard
    scalars under ``args.output/logs``.
    """
    # model -- choose backbone vs. fusion model from the config flag
    if config.fuse == 'False':
        model = getattr(models, config.model_name)()
    elif config.fuse == 'True':
        model = ResMlp(ResMlpParams)
    else:
        raise ValueError(
            'Not supported type of fuse item in train initialization phase!')
    if args.ckpt and not args.resume:
        # warm-start from a pretrained checkpoint (not a resume)
        state = torch.load(args.ckpt, map_location='cpu')
        model.load_state_dict(state['state_dict'])
        print('train with pretrained weight val_f1', state['f1'])
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data, train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=4)
    val_dataset = ECGDataset(data_path=config.train_data, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=2)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)
    # Model save directory.  Fix: append the experiment suffix BEFORE
    # creating the directory -- the original called utils.mkdirs first and
    # then renamed model_save_dir, so checkpoints were written to a path
    # that had never been created.
    model_save_dir = '%s/%s_%s' % (args.output, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    if args.ex:
        model_save_dir += args.ex
    utils.mkdirs(model_save_dir)
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    logdir = os.path.join(args.output, 'logs',
                          current_time + '_' + config.fuse)
    writer = SummaryWriter(logdir)
    # resume training from the last break point
    if args.resume:
        if os.path.exists(args.ckpt):  # args.ckpt is the weights directory
            current_w = torch.load(os.path.join(args.ckpt, config.current_w))
            best_w = torch.load(os.path.join(args.ckpt, config.best_w))
            # Fix: restore the best F1, not the loss -- the saved state has
            # both keys, and comparing future val_f1 against a loss value
            # corrupted best-checkpoint tracking after a resume.
            best_f1 = best_w['f1']
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            # if the break point is exactly a stage-transition epoch
            if start_epoch - 1 in config.stage_epoch:
                stage += 1
                lr /= config.lr_decay
                utils.adjust_learning_rate(optimizer, lr)
                # once resumed at a stage boundary, restart from the best
                # model so far
                model.load_state_dict(best_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_f1 = train_epoch(model, optimizer, criterion,
                                           train_dataloader,
                                           show_interval=100)
        val_loss, val_f1 = val_epoch(model, criterion, val_dataloader)
        print(
            '#epoch:%02d stage:%d train_loss:%.3e train_f1:%.3f val_loss:%0.3e val_f1:%.3f time:%s\n'
            % (epoch, stage, train_loss, train_f1, val_loss, val_f1,
               utils.print_time_cost(since)))
        # coarse per-epoch logging; could be refined per iteration
        writer.add_scalar('scalar/train_loss', train_loss, epoch)
        writer.add_scalar('scalar/train_f1', train_f1, epoch)
        writer.add_scalar('scalar/val_loss', val_loss, epoch)
        writer.add_scalar('scalar/val_f1', val_f1, epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        save_ckpt(state, best_f1 < val_f1, model_save_dir)
        best_f1 = max(best_f1, val_f1)
        if epoch in config.stage_epoch:
            # Greedy: enter each new stage from the best model of the
            # previous stage.
            stage += 1
            lr /= config.lr_decay
            best_w = os.path.join(model_save_dir, config.best_w)
            model.load_state_dict(torch.load(best_w)['state_dict'])
            # Fix: '%.3ef' printed a stray literal 'f' after the number.
            print("*" * 10, "step into stage%02d lr %.3e" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
    writer.close()
def train(args):
    """Train a classifier, optionally warm-starting or resuming from a
    checkpoint.

    Args:
        args: parsed CLI namespace; uses ``ckpt`` (pretrained weights file,
            or resume directory when ``resume`` is set), ``resume`` and
            ``ex`` (experiment-name suffix).

    Side effects: writes checkpoints under ``config.ckpt`` and scalars via
    ``Logger``.
    """
    # model
    model = getattr(models, config.model_name)()
    if args.ckpt and not args.resume:
        # warm-start from a pretrained checkpoint (not a resume)
        state = torch.load(args.ckpt, map_location='cpu')
        model.load_state_dict(state['state_dict'])
        print('train with pretrained weight val_f1', state['f1'])
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data, train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=4)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)
    # model save directory
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    if args.ex:
        model_save_dir += args.ex
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # resume training from the last break point
    if args.resume:
        if os.path.exists(args.ckpt):  # args.ckpt is the weights directory
            model_save_dir = args.ckpt
            current_w = torch.load(os.path.join(args.ckpt, config.current_w))
            best_w = torch.load(os.path.join(model_save_dir, config.best_w))
            # Fix: restore the best F1, not the loss -- the saved state has
            # both keys, and comparing future val_f1 against a loss value
            # corrupted best-checkpoint tracking after a resume.
            best_f1 = best_w['f1']
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            # if the break point is exactly a stage-transition epoch
            if start_epoch - 1 in config.stage_epoch:
                stage += 1
                lr /= config.lr_decay
                utils.adjust_learning_rate(optimizer, lr)
                # restart the new stage from the best model so far
                model.load_state_dict(best_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_f1 = train_epoch(model, optimizer, criterion,
                                           train_dataloader,
                                           show_interval=100)
        val_loss, val_f1 = val_epoch(model, criterion, val_dataloader)
        print(
            '#epoch:%02d stage:%d train_loss:%.3e train_f1:%.3f val_loss:%0.3e val_f1:%.3f time:%s\n'
            % (epoch, stage, train_loss, train_f1, val_loss, val_f1,
               utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        save_ckpt(state, best_f1 < val_f1, model_save_dir)
        best_f1 = max(best_f1, val_f1)
        if epoch in config.stage_epoch:
            # greedy: enter each new stage from the best model so far
            stage += 1
            lr /= config.lr_decay
            best_w = os.path.join(model_save_dir, config.best_w)
            model.load_state_dict(torch.load(best_w)['state_dict'])
            # Fix: '%.3ef' printed a stray literal 'f' after the number.
            print("*" * 10, "step into stage%02d lr %.3e" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
def train(args):
    """Train one fold, dispatching on ``args.model_kind``.

    NOTE(review): this function mutates module-level ``config`` state
    (``config.train_data``, ``config.model_name``) and reassigns
    ``args.ckpt`` -- calling it twice in one process compounds the path
    suffixes; confirm callers invoke it once per process.

    Args:
        args: parsed CLI namespace; uses ``model_name``, ``fold``,
            ``model_kind`` (1 selects dataset2/utils with a custom collate
            function), ``resume`` and ``ckpt``.
    """
    # model
    print(args.model_name)
    config.train_data = config.train_data + str(args.fold) + '.pth'
    # config.train_data = config.train_data + 'trainsfer_' + str(args.fold) + '.pth'
    config.model_name = args.model_name
    model = getattr(models, config.model_name)()
    model = model.to(device)
    # data -- model_kind 1 uses the dataset2 pipeline with a custom collate fn
    if args.model_kind == 1:
        import dataset2
        train_dataset = dataset2.ECGDataset(data_path=config.train_data,
                                            train=True,
                                            transform=True)
        train_dataloader = DataLoader(train_dataset,
                                      collate_fn=my_collate_fn,
                                      batch_size=config.batch_size,
                                      shuffle=True,
                                      num_workers=6)
    else:
        train_dataset = ECGDataset(data_path=config.train_data,
                                   train=True,
                                   transform=True)
        train_dataloader = DataLoader(
            train_dataset,
            #collate_fn=my_collate_fn,
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=6)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # optimizer = optim.RMSprop(model.parameters(), lr=config.lr)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    # loss variant also follows model_kind (utils vs. utils2)
    if args.model_kind == 1:
        criterion = utils.WeightedMultilabel(w)
        print(1)  # leftover debug marker for the model_kind==1 branch
    else:
        criterion = utils2.WeightedMultilabel(w)
    # criterion = utils.My_loss(w)
    # model save directory (also becomes the resume directory below)
    model_save_dir = '%s/%s' % (config.ckpt + str(args.model_kind),
                                config.model_name + '_' + str(args.fold))
    args.ckpt = model_save_dir
    # if args.ex: model_save_dir += args.ex
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # resume training from the last break point
    if args.resume:
        if os.path.exists(args.ckpt):  # directory holding the weights
            # model_save_dir = args.ckpt
            current_w = torch.load(os.path.join(args.ckpt, config.current_w))
            best_w = torch.load(os.path.join(model_save_dir, config.best_w))
            best_f1 = best_w['best_f']
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            # if the break point is exactly a stage-transition epoch
            if start_epoch - 1 in config.stage_epoch:
                stage += 1
                lr /= config.lr_decay
                utils.adjust_learning_rate(optimizer, lr)
                # restart the new stage from the best model so far
                model.load_state_dict(best_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
        else:
            # no resume dir: fall back to transfer-learning weights
            path = '%s/%s' % (config.ckpt, config.model_name + '_transfer')
            print(path)
            current_w = torch.load(os.path.join(path, config.best_w))
            model.load_state_dict(current_w['state_dict'])
    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    val_loss = 10
    val_f1 = -1
    state = {}
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        # NOTE(review): this train_epoch variant also receives best_f1, the
        # val loader, the save dir and the previous state dict -- presumably
        # it checkpoints mid-epoch; confirm against its definition.
        train_loss, train_f1, best_f1 = train_epoch(
            model, optimizer, criterion, train_dataloader, epoch, lr, best_f1,
            val_dataloader, model_save_dir, state, 0)
        # if epoch % 2 == 1:
        val_loss, val_f1, _, _ = val_epoch(model, criterion, val_dataloader)
        print(
            '#epoch:%02d stage:%d train_loss:%.3e train_f1:%.3f val_loss:%0.3e val_f1:%.3f time:%s'
            % (epoch, stage, train_loss, train_f1, val_loss, val_f1,
               utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage,
            "best_f": best_f1
        }
        # save every epoch; flag it as "best" only on improvement
        if best_f1 < val_f1:
            save_ckpt(state, best_f1 < val_f1, model_save_dir)
            print('save best')
        else:
            save_ckpt(state, False, model_save_dir)
        best_f1 = max(best_f1, val_f1)
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            # best_w = os.path.join(model_save_dir, config.best_w)
            # model.load_state_dict(torch.load(best_w)['state_dict'])
            print("*" * 10, "step into stage%02d lr %.3ef" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
def train_cv(input_directory, output_directory):
    """K-fold cross-validated training.

    Trains one model per fold (``config.kfold`` folds), saving per-fold
    checkpoints with ``save_ckpt_cv`` and early-stopping a fold after 12
    epochs without a ``val_cm`` improvement.

    Args:
        input_directory: directory with the raw data files.
        output_directory: directory where fold checkpoints are exported.
    """
    # one shared, timestamped save directory for all folds
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name + "_cv",
                                   time.strftime("%Y%m%d%H%M"))
    for fold in range(config.kfold):
        print("***************************fold : {}***********************".
              format(fold))
        model = getattr(models, config.model_name)(fold=fold)
        # replace the final FC layer to match the target class count
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, config.num_classes)
        model = model.to(device)
        # data
        train_dataset = ECGDataset(
            data_path=config.train_data_cv.format(fold),
            data_dir=input_directory,
            train=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=config.batch_size,
                                      shuffle=True,
                                      drop_last=True,
                                      num_workers=6)
        val_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                 data_dir=input_directory,
                                 train=False)
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=config.batch_size,
                                    drop_last=True,
                                    num_workers=4)
        print("fold_{}_train_datasize".format(fold), len(train_dataset),
              "fold_{}_val_datasize".format(fold), len(val_dataset))
        # optimizer and loss
        optimizer = radam.RAdam(model.parameters(), lr=config.lr)
        w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
        criterion = utils.WeightedMultilabel(w)
        # Fix: the scheduler construction was commented out while
        # scheduler.step(val_cm) was still called every epoch below, which
        # raised NameError on the first epoch.  Recreate it as originally
        # configured: reduce LR when val_cm plateaus ('max' mode).
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         'max',
                                                         verbose=True,
                                                         factor=0.1,
                                                         patience=5,
                                                         min_lr=1e-06,
                                                         eps=1e-08)
        best_cm = -1
        lr = config.lr
        start_epoch = 1
        stage = 1
        epoch_cum = 0  # epochs since the last val_cm improvement
        logger = Logger(logdir=model_save_dir, flush_secs=2)
        # =========> start training <=========
        for epoch in range(start_epoch, config.max_epoch + 1):
            since = time.time()
            train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
                model, optimizer, criterion, train_dataloader,
                show_interval=100)
            val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
                model, criterion, val_dataloader)
            print(
                '#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, train_f1:%.3f, train_f2:%.3f, train_g2:%.3f,train_cm:%.3f,\n'
                ' val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, val_g2:%.3f, val_cm:%.3f,time:%s\n'
                % (epoch, stage, train_loss, train_acc, train_f1, train_f2,
                   train_g2, train_cm, val_loss, val_acc, val_f1, val_f2,
                   val_g2, val_cm, utils.print_time_cost(since)))
            logger.log_value('fold{}_train_loss'.format(fold),
                             train_loss,
                             step=epoch)
            logger.log_value('fold{}_train_f1'.format(fold),
                             train_f1,
                             step=epoch)
            logger.log_value('fold{}_val_loss'.format(fold),
                             val_loss,
                             step=epoch)
            logger.log_value('fold{}_val_f1'.format(fold),
                             val_f1,
                             step=epoch)
            state = {
                "state_dict": model.state_dict(),
                "epoch": epoch,
                "loss": val_loss,
                'f1': val_f1,
                'lr': lr,
                'stage': stage
            }
            # flag as "best" before updating best_cm
            save_ckpt_cv(state, best_cm < val_cm, model_save_dir, fold,
                         output_directory)
            best_cm = max(best_cm, val_cm)
            scheduler.step(val_cm)
            # count consecutive epochs without improvement (after the max
            # update, equality means this epoch WAS the new best)
            if val_cm < best_cm:
                epoch_cum += 1
            else:
                epoch_cum = 0
            # early-stop this fold after 12 stale epochs
            # (fix: '%.3ef' printed a stray literal 'f' after the number)
            if epoch_cum >= 12:
                print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
                break
def train(input_directory, output_directory):
    """Train a single model with RAdam + ReduceLROnPlateau on val_cm.

    Early-stops after 12 consecutive epochs without a ``val_cm``
    improvement.

    Args:
        input_directory: directory with the raw data files.
        output_directory: directory where checkpoints are exported.
    """
    # model
    model = getattr(models, config.model_name)()
    # replace the final FC layer to match the target class count
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, config.num_classes)
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data,
                               data_dir=input_directory,
                               train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data,
                             data_dir=input_directory,
                             train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=4)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = radam.RAdam(model.parameters(),
                            lr=config.lr,
                            weight_decay=1e-4)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)
    # reduce LR when val_cm plateaus ('max' mode: higher val_cm is better)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     'max',
                                                     verbose=True,
                                                     factor=0.1,
                                                     patience=5,
                                                     min_lr=1e-06,
                                                     eps=1e-08)
    # model save directory
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    best_cm = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # Fix: epoch_cum was never initialized before the loop; it previously
    # worked only because the first epoch always hit the else-branch.
    epoch_cum = 0  # epochs since the last val_cm improvement
    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
            model, optimizer, criterion, train_dataloader, show_interval=100)
        val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
            model, criterion, val_dataloader)
        print(
            '#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, train_f1:%.3f, train_f2:%.3f, train_g2:%.3f,train_cm:%.3f,\n'
            ' val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, val_g2:%.3f, val_cm:%.3f,time:%s\n'
            % (epoch, stage, train_loss, train_acc, train_f1, train_f2,
               train_g2, train_cm, val_loss, val_acc, val_f1, val_f2, val_g2,
               val_cm, utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        # flag as "best" before updating best_cm
        save_ckpt(state, best_cm < val_cm, model_save_dir, output_directory)
        best_cm = max(best_cm, val_cm)
        scheduler.step(val_cm)
        # count consecutive epochs without improvement (after the max
        # update, equality means this epoch WAS the new best)
        if val_cm < best_cm:
            epoch_cum += 1
        else:
            epoch_cum = 0
        # early-stop after 12 stale epochs
        # (fix: '%.3ef' printed a stray literal 'f' after the number)
        if epoch_cum >= 12:
            print("*" * 20, "step into stage%02d lr %.3e" % (stage, lr))
            break
def train(mode='train', ckpt=None, resume=False):
    """Train a resnet-family classifier, saving a checkpoint every epoch.

    Args:
        mode: dataset split used for training (passed to ECGDataset).
        ckpt: optional checkpoint path -- pretrained weights when
            ``resume`` is False, the resume state when True.
        resume: continue training from ``ckpt`` (epoch/lr/stage restored).
    """
    # model
    model = getattr(resnet, config.model_name)(num_classes=config.num_classes,
                                               input_dim=config.input_dim)
    if ckpt is not None and not resume:
        # warm-start from a pretrained checkpoint (not a resume)
        state = torch.load(ckpt, map_location='cpu')
        model.load_state_dict(state['state_dict'])
        print('train with pretrained weight val_f1', state['f1'])
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data, mode=mode)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data, mode='val')
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=6)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    groups = config.groups
    count = train_dataset.count
    criterion = utils.WeightedMultilabel(groups, count, device)
    # Model save directory.  Fix: os.makedirs creates all intermediate
    # directories and tolerates an existing path, replacing the fragile
    # exists-check + two os.mkdir calls (which crashed if the timestamped
    # directory already existed).
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    os.makedirs(model_save_dir, exist_ok=True)
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # resume training from the last break point
    if resume:
        if os.path.exists(ckpt):  # ckpt is the saved-state path
            current_w = torch.load(os.path.join(ckpt))
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    # logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    # baseline validation pass before any training
    val_loss, val_p, val_r, val_f1 = val_epoch(model, criterion,
                                               val_dataloader)
    print('start training')
    print('val_loss:%.3e val_precision:%.4f val_recall:%.4f val_f1:%.4f \n' %
          (val_loss, val_p, val_r, val_f1))
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_p, train_r, train_f1 = train_epoch(
            model, optimizer, criterion, train_dataloader,
            show_interval=config.show_interval)
        val_loss, val_p, val_r, val_f1, pr_df = val_epoch(model,
                                                          criterion,
                                                          val_dataloader,
                                                          simple_mode=False)
        # per-class report: replace class indices with readable names
        pr_df['arry'] = pr_df['arry'].map(val_dataset.idx2name)
        print('#epoch:%02d stage:%d time:%s' %
              (epoch, stage, utils.print_time_cost(since)))
        print(
            'train_loss:%.3e train_precision:%.4f train_recall:%.4f train_f1:%.4f'
            % (train_loss, train_p, train_r, train_f1))
        print(
            'val_loss:%.3e val_precision:%.4f val_recall:%.4f val_f1:%.4f \n'
            % (val_loss, val_p, val_r, val_f1))
        display.display(pr_df)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        # one checkpoint per epoch: e1, e2, ...
        torch.save(state, os.path.join(model_save_dir, 'e%i' % (epoch)))
        best_f1 = max(best_f1, val_f1)
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            # Fix: '%.3ef' printed a stray literal 'f' after the number.
            print("*" * 10, "step into stage%02d lr %.3e" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
def train(args):
    """Train the myecgnet model with AdamW and staged LR decay.

    Args:
        args: parsed CLI namespace; uses ``ckpt`` (pretrained weights, or
            resume directory when ``resume`` is set), ``resume`` and ``ex``
            (experiment-name suffix).

    Side effects: writes checkpoints under ``config.ckpt``.
    """
    model = models.myecgnet()
    if args.ckpt and not args.resume:
        # warm-start from a pretrained checkpoint (not a resume)
        state = torch.load(args.ckpt, map_location='cpu')
        model.load_state_dict(state['state_dict'])
        print('train with pretrained weight val_f1', state['f1'])
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data, train=True)
    train_dataloader = DataLoader(train_dataset,
                                  collate_fn=my_collate_fn,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=8)
    val_dataset = ECGDataset(data_path=config.train_data, train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=8)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = AdamW(model.parameters(), lr=config.lr)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)
    # Model save directory.  Fix: append the experiment suffix BEFORE
    # creating the directory (the original ran os.mkdir first and then
    # renamed model_save_dir, so the actual save path never existed), and
    # use makedirs(exist_ok=True) so a pre-existing path does not crash.
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    if args.ex:
        model_save_dir += args.ex
    os.makedirs(model_save_dir, exist_ok=True)
    best_f1 = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # resume training from the last break point
    if args.resume:
        if os.path.exists(args.ckpt):  # args.ckpt is the weights directory
            model_save_dir = args.ckpt
            current_w = torch.load(os.path.join(args.ckpt, config.current_w))
            best_w = torch.load(os.path.join(model_save_dir, config.best_w))
            # Fix: restore the best F1, not the loss -- comparing future
            # val_f1 against a loss value corrupted best-checkpoint tracking
            # after a resume.
            best_f1 = best_w['f1']
            start_epoch = current_w['epoch'] + 1
            lr = current_w['lr']
            stage = current_w['stage']
            model.load_state_dict(current_w['state_dict'])
            # if the break point is exactly a stage-transition epoch
            if start_epoch - 1 in config.stage_epoch:
                stage += 1
                lr /= config.lr_decay
                utils.adjust_learning_rate(optimizer, lr)
                # restart the new stage from the best model so far
                model.load_state_dict(best_w['state_dict'])
            print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_f1 = train_epoch(model, optimizer, criterion,
                                           train_dataloader,
                                           show_interval=10)
        val_loss, val_f1 = val_epoch(model, criterion, val_dataloader)
        print(
            '#epoch:%03d\tstage:%d\ttrain_loss:%.4f\ttrain_f1:%.3f\tval_loss:%0.4f\tval_f1:%.3f\ttime:%s\n'
            % (epoch, stage, train_loss, train_f1, val_loss, val_f1,
               utils.print_time_cost(since)))
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        save_ckpt(state, best_f1 < val_f1, model_save_dir)
        best_f1 = max(best_f1, val_f1)
        if epoch in config.stage_epoch:
            # greedy: enter each new stage from the best model so far
            stage += 1
            lr /= config.lr_decay
            best_w = os.path.join(model_save_dir, config.best_w)
            model.load_state_dict(torch.load(best_w)['state_dict'])
            # Fix: '%.3ef' printed a stray literal 'f' after the number.
            print("*" * 10, "step into stage%02d lr %.3e" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)