def train(self):
    '''
    Train the model with 5-fold cross-validation (required entry point).

    For each fold: re-initialise the model via ``self.__init__``, build
    train/val DataLoaders from train.csv, run ``args.EPOCHS`` epochs with
    ``self.train_one_epoch``, log metrics through the platform's
    ``train_log`` hook, and checkpoint the model whenever validation
    accuracy improves.
    :return: None
    '''
    # pass
    df = pd.read_csv(os.path.join(DATA_PATH, DataID, 'train.csv'))
    # FIX: scikit-learn raises ValueError when random_state is set while
    # shuffle=False (random_state has no effect without shuffling), so the
    # redundant random_state=42 is removed. Split order is unchanged.
    kf = KFold(n_splits=5, shuffle=False)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        # # abandon cross validation
        # if fold > 0:
        #     break
        # Reset model/optimizer state so each fold starts fresh.
        self.__init__(self.model_name)
        print(
            f'fold:{fold+1}...', 'train_size: %d, val_size: %d' %
            (len(train_idx), len(val_idx)))
        # generate dataloder
        train_data = ImageData(df, train_idx, mode='train')
        val_data = ImageData(df, val_idx, mode='valid')
        train_loader = DataLoader(
            train_data,
            batch_size=args.BATCH,
            shuffle=True,
            # drop_last=True
        )
        val_loader = DataLoader(val_data,
                                batch_size=args.BATCH,
                                shuffle=False,
                                drop_last=True)
        max_correct = 0
        for epoch in range(args.EPOCHS):
            # NOTE(review): stepping the scheduler before the optimizer is the
            # legacy epoch-indexed calling convention — confirm self.scheduler
            # expects an epoch argument.
            self.scheduler.step(epoch)
            train_loss, train_acc, val_loss, val_acc = self.train_one_epoch(
                train_loader, val_loader)
            start = time.strftime("%H:%M:%S")
            print(f'fold:{fold + 1}',
                  f"epoch:{epoch + 1}/{args.EPOCHS} | ⏰: {start} ",
                  f"Training Loss: {train_loss:.6f}.. ",
                  f"Training Acc: {train_acc:.6f}.. ",
                  f"validation Acc: {val_acc:.6f}.. ")
            # Platform hook: renders live train/val curves online.
            train_log(train_loss=train_loss,
                      train_acc=train_acc,
                      val_loss=val_loss,
                      val_acc=val_acc)
            # Checkpoint the best model (by validation accuracy) per fold.
            if val_acc > max_correct:
                max_correct = val_acc
                torch.save(
                    self.model,
                    MODEL_PATH + '/' +
                    f"{self.model_name}_best_fold{fold+1}.pth")
                # torch.save(self.model, MODEL_PATH + '/' + "best.pth")
                print('find optimal model')
def train_epochs(self):
    """Run the outer training loop until ``self.max_epoch`` epochs complete.

    Each iteration: advance the epoch counter, run one training pass
    (``self.train``) and one validation pass (``self.validate``), optionally
    step the LR scheduler on the validation accuracy, then report all four
    metrics via the platform's ``train_log`` hook.
    """
    while self.epoch < self.max_epoch:
        self.epoch += 1
        s_time = time.time()
        acc, loss = self.train()
        val_acc, val_loss = self.validate()
        # FIX: identity comparison against None (PEP 8), was `!= None`.
        if self.scheduler is not None:
            # NOTE(review): stepped with val_acc, so this presumably is a
            # metric-driven scheduler (e.g. ReduceLROnPlateau) — confirm.
            self.scheduler.step(val_acc)
        s_time = time.time() - s_time
        # Runtime string kept verbatim (Chinese: "this took this long: %f").
        print("这次花了这么长时间:%f" % s_time)
        train_log(train_loss=loss,
                  train_acc=acc,
                  val_acc=val_acc,
                  val_loss=val_loss)
Implement your own model-saving logic
'''
# --- One training step (fragment: enclosing function header not visible) ---
# initialise gradients
optimizer.zero_grad()
# generate predictions
outputs = net(x_train)
# calculate loss
loss = criterion(outputs, y_train)
# compute loss gradients (backprop)
loss.backward()
# Clip gradients to max_grad before the update to guard against explosion.
nn.utils.clip_grad_norm_(net.parameters(), max_grad)
# update parameters using gradients
optimizer.step()
scheduler.step()
total_loss += loss.item()
# Per-sample correctness mask for this batch; accumulated into train_acc.
train_acc_ts = (outputs.argmax(1) == y_train)
train_acc += (train_acc_ts).sum().item()
train_bar.set_description('{}/{} loss: {:.4f}'.format(
    step + 1, dataset.get_step(), loss.item()))
# Every 20 steps: evaluate, log, checkpoint on improvement, reset counters.
if step % 20 == 0:
    # NOTE(review): assumes exactly 20 full batches since the last reset —
    # the averages are off on the very first step (step == 0). Confirm.
    data_num = 20 * args.BATCH
    val_loss, val_acc = model.evaluate()
    train_log(total_loss / data_num, train_acc / data_num, val_loss, val_acc)
    if val_acc > best_score:
        model.save_model(net, MODEL_PATH, overwrite=True)
        best_score = val_acc
        print("Model saved!")
    total_loss, train_acc = 0., 0.
# --- Train step + evaluation (fragment: enclosing loop header not visible) ---
# Move the held-out test tensors onto the compute device.
x_test = torch.from_numpy(x_test)
y_test = torch.from_numpy(y_test)
x_test = x_test.float().to(device)
y_test = y_test.long().to(device)
outputs = cnn(x_train)
_, prediction = torch.max(outputs.data, 1)
optimizer.zero_grad()
# print(x_train.shape,outputs.shape,y_train.shape)
loss = loss_fn(outputs, y_train)
loss.backward()
optimizer.step()  # optimizer updates the parameters from the gradients
scheduler.step(loss.item())  # scheduler adjusts the learning rate from the loss
print(loss.detach())
# If validation accuracy beats the current best, save the model.
# NOTE(review): `eval` here shadows the Python builtin — presumably a
# project-local evaluation helper; verify against the rest of the file.
val_acc, val_loss = eval(model, x_test, y_test)
# NOTE(review): clamping val_loss to 0.8 falsifies the logged metric —
# looks like a display hack for the online curve; confirm it is intended.
if val_loss >= 1:
    val_loss = 0.8
train_log(train_loss=loss.item(), val_loss=val_loss, val_acc=val_acc)
if val_acc >= best_accuracy:
    best_accuracy = val_acc
    model.save_model(cnn, MODEL_PATH, overwrite=True)
    print("step %d, best accuracy %g" % (i, best_accuracy))
print(str(i) + "/" + str(args.EPOCHS))
print(best_accuracy)
# --- Tail of vgg_19(): collapse the 1x1 spatial dims of the final fc8 layer.
net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
return net

# --- Graph construction + training session (TF 1.x, fragment) ---
g = tf.Graph()
prediction = tf.add(vgg_19(x_image), 0, name='y_conv')
loss = slim.losses.softmax_cross_entropy(prediction, y)  # cross-entropy vs labels
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0001)
train_op = slim.learning.create_train_op(loss, optimizer)  # training / optimization op
# compute accuracy:
correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
merged = tf.summary.merge_all()
saver = tf.train.Saver()
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    train_writer = tf.summary.FileWriter(LOG_PATH, sess.graph)
    # NOTE(review): a single batch is fetched once, outside the loop — every
    # "epoch" trains on the same batch. Confirm this is intentional.
    x_train, y_train, x_test, y_test = dataset.next_batch(args.BATCH)
    for i in range(args.EPOCHS):
        train_dict = {x: x_train, y: y_train, keep_prob: 0.7}
        sess.run(train_op, feed_dict=train_dict)
        losses, acc_ = sess.run([loss, accuracy], feed_dict=train_dict)
        # Platform hook: renders live training curves online.
        train_log(train_loss=losses, train_acc=acc_)
        y_convs = sess.run(prediction, feed_dict=train_dict)
        print("step:{}, loss:{}, acc:{}".format(i + 1, losses, acc_))
    # Save once after the loop completes.
    model.save_model(sess, MODEL_PATH, overwrite=True)
# --- Seq2seq train step + validation (TF 1.x session fragment) ---
_, tra_loss, logits, train_acc = sess.run(fetches, feed_dict=feed_dict)
val_que_x, val_que_len = que_val
val_ans_x, val_ans_len = ans_val
# Pad/trim the answer batch to the longest answer length in this batch.
val_ans_x = process_ans_batch(
    val_ans_x, ans_dict, int(sorted(list(val_ans_len), reverse=True)[0]))
feed_dict = {
    input_data: val_que_x,
    targets: val_ans_x,
    lr: learning_rate,
    target_sequence_length: val_ans_len,
    source_sequence_length: val_que_len
}
val_loss, val_acc = sess.run([cost, ans_accuracy], feed_dict=feed_dict)
summary = sess.run(summary_op, feed_dict=feed_dict)
train_writer.add_summary(summary, step)
# Call the platform's log hook so live train/val loss & accuracy curves
# are visible online.
train_log(train_loss=tra_loss,
          train_acc=train_acc,
          val_loss=val_loss,
          val_acc=val_acc)
# Implement your own model-saving logic
if step % 200 == 0:
    model.save_model(sess, MODEL_PATH, overwrite=True)
# NOTE(review): this unconditional save presumably sits after the training
# loop (final checkpoint) — indentation lost in extraction; confirm.
model.save_model(sess, MODEL_PATH, overwrite=True)
}
# --- Validation + best-F1 checkpointing (TF 1.x session fragment) ---
valLoss, valAcc_, y_pre, wrong_ = sess.run(
    [loss, accuracy, pred, not_correct_pred], feed_dict=feed_dict_val)
# Weighted F1 over the first label column of the validation set.
val_f1score = metrics.f1_score(y_val[:, 0], y_pre, average='weighted')
#if step%100 == 0:
#print(x_text_val[wrong_],y_val[wrong_])
# -------save and print
summary = sess.run(merged_summary, feed_dict=feed_dict)
train_writer.add_summary(summary, step)
print(' ')
# cur_step = str(step + 1) + "/" + str(all_train_steps)
print('steps: {0}'.format(
    str(current_step) + '/' + str(all_train_steps)))
# Mean of validation F1 and training F1, used both as the logged "val_acc"
# and as the checkpoint criterion below.
f1_mean = (val_f1score + train_f1) / 2
train_log(train_loss=loss_,
          train_acc=acc_,
          val_loss=valLoss,
          val_acc=f1_mean)
print("val_f1:{}".format(f1_mean))
current_step += 1
#if current_step % 100 == 0:
#modelpp.save_model(sess, MODEL_PATH, overwrite=True)
#if current_step % 10 == 0:  # validate every N steps (translated)
# Save whenever the mean F1 improves on the best seen so far.
if acc_flag < f1_mean:
    acc_flag = f1_mean
    modelpp.save_model(sess, MODEL_PATH, overwrite=True)
    last_provement = current_step
    print('the save model steps is : {0}'.format(
        str(current_step) + '/' + str(all_train_steps)))
    print('the model f1score is {0}'.format(f1_mean))
# --- FCN16s segmentation training loop (script fragment) ---
cnn = FCN16s(1).to(device)
optimizer = SGD(cnn.parameters(), lr=0.0005, momentum=0.9, weight_decay=0.0005)
criterion = nn.BCELoss()  # binary cross-entropy loss (translated)
'''
dataset.get_step() returns the total number of data iterations
'''
# Sentinel "best loss" — any real loss will be lower than 1e5.
lowest_loss = 1e5
for i in range(data.get_step()):
    print('----------------' + str(i) + "/" + str(data.get_step()) +
          '-------------------')
    cnn.train()
    x_train, y_train = data.next_train_batch()
    x_train = torch.from_numpy(x_train)
    y_train = torch.from_numpy(y_train)
    x_train = x_train.float().to(device)
    y_train = y_train.float().to(device)
    # Add a channel dim so the target matches the model's (N, 1, H, W) output.
    y_train = y_train.unsqueeze(1)
    optimizer.zero_grad()
    outputs = cnn(x_train)
    # BCELoss expects probabilities, so squash logits through sigmoid first.
    pred = torch.sigmoid(outputs)
    loss = criterion(pred, y_train)
    loss.backward()
    optimizer.step()
    print("now loss is : %f, lowest loss %f" % (loss.data, lowest_loss))
    # Platform hook: live training-loss curve online (translated).
    train_log(train_loss=loss.data.cpu().numpy())
    # Save the model whenever the training loss hits a new low (translated —
    # original comment said "accuracy" but the criterion is the loss).
    if loss.data < lowest_loss:
        lowest_loss = loss.data
        model.save_model(cnn, MODEL_PATH, overwrite=True)
        print("saved model!!!")
overwrite=True)
# --- Best-score bookkeeping after a Keras fit (fragment: starts mid-call) ---
best_score_by_acc = history_train.history['val_accuracy'][0]
best_score_by_loss = history_train.history['val_loss'][0]
best_epoch = epoch
# Runtime string kept verbatim ("saved best: same acc, lower loss").
print('【保存了best:acc相同,loss降低】')
# if history_train.history['val_acc'][0] > 0.80 and \
#         round(best_score_by_loss/save_boundary, 2) >= round(history_train.history['val_loss'][0] /save_boundary, 2):
if best_score_by_acc == 0:
    # Runtime string kept verbatim ("best_score condition not met").
    print('未能满足best_score的条件')
else:
    print('当前【best】:acc:%.2f, loss:%.2f, epoch:%d' %
          (best_score_by_acc, best_score_by_loss, best_epoch + 1))
# Call the platform's log hook so live train/val loss & accuracy curves
# are visible online (translated).
train_log(train_loss=history_train.history['loss'][0],
          train_acc=history_train.history['accuracy'][0],
          val_loss=history_train.history['val_loss'][0],
          val_acc=history_train.history['val_accuracy'][0])
'''
4/ adjust learning rate and optimize the model
'''
# Project-local LR policy: may return a freshly-configured optimizer.
tmp_opt = wangyiOpt.reduce_lr_by_loss_and_epoch(
    history_train.history['loss'][0], epoch)
# Apply the new learning rate by recompiling with the returned optimizer.
if tmp_opt is not None:
    model_cnn.model_cnn.compile(loss='categorical_crossentropy',
                                optimizer=tmp_opt,
                                metrics=['accuracy'])
# TODO new learning-rate handling, not finished yet
# if optimzer_custom.compareHistoryList( history_train_all['loss'] ,pationce= 5 ,min_delta=0.001) :
def train_epoch(cfg,
                model,
                loader,
                optimizer,
                optimizer_center,
                center_criterion,
                loss_fun,
                epoch,
                n_epochs,
                grid,
                writer,
                logger,
                print_freq=50):
    """Train ``model`` for one epoch over ``loader``.

    Supports GridMask augmentation (``grid``), optional mixup
    (``cfg.INPUT.MIXUP``), and a separate center-loss optimizer whose
    gradients are rescaled by 1/CENTER_LOSS_WEIGHT before stepping.
    Per-batch averages are written to TensorBoard via ``writer`` and a
    progress line is logged every ``print_freq`` batches.

    Returns (avg batch time, avg total loss, avg top-1 error rate).
    """
    batch_time = AverageMeter()
    losses = AverageMeter()          # combined loss (id + center)
    losses_id = AverageMeter()       # identity (classification) loss
    losses_center = AverageMeter()   # center loss, 0 when disabled
    error = AverageMeter()           # top-1 error rate (1 - accuracy)
    # Model on train mode
    model.train()
    # GridMask probability is scheduled as a function of training progress.
    grid.set_prob(epoch, cfg.SOLVER.MAX_EPOCHS)
    end = time.time()
    writer.add_scalar('data/lr', optimizer.param_groups[0]['lr'], epoch)
    for batch_idx, (input, target) in enumerate(loader):
        # Create vaiables; clear both optimizers before the new batch.
        optimizer.zero_grad()
        optimizer_center.zero_grad()
        if torch.cuda.is_available():
            input = input.cuda()
            target = target.cuda()
        if cfg.INPUT.GRID_PRO > 0:
            input = grid(input)
        # compute output
        if not cfg.INPUT.MIXUP:
            output, feat = model(input, target)
            all_loss, id_loss, cen_loss = loss_fun(output, feat, target)
        else:
            # Mixup: blend pairs of inputs/targets with coefficient lam.
            input, targets_a, targets_b, lam = mixup_data(input,
                                                          target,
                                                          0.5,
                                                          use_cuda=True)
            input, targets_a, targets_b = map(Variable,
                                              (input, targets_a, targets_b))
            output, feat = model(input, target)
            all_loss, id_loss, cen_loss = mixup_criterion(
                loss_fun, output, feat, targets_a, targets_b, lam)
        # measure accuracy and record loss
        batch_size = target.size(0)
        _, pred = output.data.cpu().topk(1, dim=1)
        error.update(
            torch.ne(pred.squeeze(), target.cpu()).float().sum().item() /
            batch_size, batch_size)
        losses.update(all_loss.item(), batch_size)
        losses_id.update(id_loss.item(), batch_size)
        # cen_loss may be a plain 0 when the center loss is disabled.
        if isinstance(cen_loss, int) or isinstance(cen_loss, float):
            losses_center.update(0, batch_size)
        else:
            losses_center.update(cen_loss.item(), batch_size)
        writer.add_scalar('data/loss', losses.avg,
                          (epoch) * len(loader) + batch_idx)
        writer.add_scalar('data/loss_id', losses_id.avg,
                          (epoch) * len(loader) + batch_idx)
        writer.add_scalar('data/loss_center', losses_center.avg,
                          (epoch) * len(loader) + batch_idx)
        writer.add_scalar('data/train_error', error.avg,
                          (epoch) * len(loader) + batch_idx)
        # compute gradient and do SGD step
        all_loss.backward()
        optimizer.step()
        if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
            # Undo the center-loss weight on the gradients of the centers so
            # the center optimizer steps at its nominal learning rate.
            for param in center_criterion.parameters():
                param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
            optimizer_center.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        # print stats
        if batch_idx % print_freq == 0:
            res = '\t'.join([
                'Epoch: [%d/%d]' % (epoch + 1, n_epochs),
                'Iter: [%d/%d]' % (batch_idx + 1, len(loader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                'Error %.4f (%.4f)' % (error.val, error.avg),
            ])
            # NOTE(review): train_acc here receives the ERROR rate, not the
            # accuracy — confirm the online curve is meant to plot error.
            train_log(train_loss=losses.val, train_acc=error.val)
            logger.info(res)
    # Return summary statistics
    return batch_time.avg, losses.avg, error.avg
# --- Validation pass + best-acc checkpoint (fragment: loop header not visible) ---
right_num = 0   # correctly-predicted samples across the validation set
n_smp = 0       # total validation samples seen
for val_step in val_step_iterator:
    x_val, y_val = next(val_gen)
    n_smp += y_val.size()[0]
    TI_net.eval()
    # Model returns (loss, logits) when labels are supplied.
    outputs = TI_net(x_val, labels=y_val)
    val_loss += outputs[0].item()
    logits = outputs[1]
    pred = logits.argmax(dim=-1)
    right_num += torch.eq(pred, y_val).sum().item()
'''
Implement your own model-saving logic
'''
# Average the accumulated loss over the number of validation steps.
val_loss = val_loss / (val_step + 1)
val_acc = right_num / n_smp
print("step " + str(step + 1) + "/" + str(steps_per_epoch) + ", " +
      "epoch " + str(epoch + 1) + "/" + str(args.EPOCHS) + ", " +
      "val loss is " + str(val_loss) + ", val acc is " + str(val_acc))
# Call the platform's log hook so live train/val loss & accuracy curves
# are visible online (translated).
# NOTE(review): train_acc is hard-coded to 0.5 — training accuracy is
# apparently not tracked; confirm this placeholder is intended.
train_log(train_loss=train_loss,
          train_acc=0.5,
          val_loss=val_loss,
          val_acc=val_acc)
if max_acc < val_acc:
    print("acc improved from {0} to {1}, model saved.".format(
        max_acc, val_acc))
    max_acc = val_acc
    mymodel.save_model()
def train(self):
    """Train an EfficientNet-b1 binary classifier with warmup + LR scheduling.

    Reads train.csv, splits 90/10 into train/val, trains for ``args.EPOCHS``
    epochs with label smoothing, validates every epoch, logs metrics via the
    platform's ``train_log`` hook and checkpoints the best model (by
    validation correct-count) to MODEL_PATH/best.pth.
    """
    # pass
    df = pd.read_csv(os.path.join(DATA_PATH, DataID, 'train.csv'))
    image_path_list = df['image_path'].values
    label_list = df['label'].values
    # Split into training and validation sets (90/10, no shuffling).
    all_size = len(image_path_list)
    train_size = int(all_size * 0.9)
    train_image_path_list = image_path_list[:train_size]
    train_label_list = label_list[:train_size]
    val_image_path_list = image_path_list[train_size:]
    val_label_list = label_list[train_size:]
    print(
        'train_size: %d, val_size: %d' %
        (len(train_image_path_list), len(val_image_path_list)))
    train_transform, val_trainsform = self.deal_with_data()
    train_data = ImageData(train_image_path_list, train_label_list,
                           train_transform)
    val_data = ImageData(val_image_path_list, val_label_list,
                         val_trainsform)
    train_loader = DataLoader(train_data,
                              batch_size=args.BATCH,
                              num_workers=0,
                              shuffle=True)
    val_loader = DataLoader(val_data,
                            batch_size=args.BATCH,
                            num_workers=0,
                            shuffle=False)
    model = EfficientNet.from_pretrained('efficientnet-b1')
    # Replace the head with a 2-way classifier.
    model.fc = nn.Linear(1280, 2)
    if use_gpu:
        model.to(DEVICE)
    criteration = nn.CrossEntropyLoss()
    criteration.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.LR,
                                momentum=0.9,
                                weight_decay=5e-4)
    # Scheduler choice is driven by args.SCHE; anything else aborts.
    if args.SCHE == "cos":
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=5,
                                                         eta_min=4e-08)
    elif args.SCHE == "red":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode="min",
            factor=0.1,
            patience=3,
            verbose=False,
            threshold=0.0001)
    else:
        sys.exit(-1)
    max_correct = 0
    #scheduler_steplr = StepLR(optimizer, step_size=10, gamma=0.1)
    # Warmup wrapper: ramps LR over 5 epochs, then hands off to `scheduler`.
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=1,
                                              total_epoch=5,
                                              after_scheduler=scheduler)
    for epoch in range(args.EPOCHS):
        #scheduler_warmup.step(epoch)
        model.train()
        correct = 0
        # Train losses
        train_losses = []
        for img, label in train_loader:
            img, label = img.to(DEVICE), label.to(DEVICE)
            optimizer.zero_grad()
            output = model(img)
            #loss = criteration(output, label)
            loss = self.label_smoothing(output, label, epsilon=0.1)
            loss.backward()
            optimizer.step()
            # Train Metric
            train_pred = output.detach().cpu().max(1, keepdim=True)[1]
            correct += train_pred.eq(
                label.detach().cpu().view_as(train_pred)).sum().item()
            train_losses.append(loss.item())
            del train_pred
            # print("Epoch {}, Loss {:.4f}".format(epoch, loss.item()))
            del img, label
        # Train loss curve
        train_avg_loss = np.mean(train_losses)
        acc = 100 * correct / len(train_image_path_list)
        scheduler_warmup.step_ReduceLROnPlateau(train_avg_loss)
        # Validation runs every epoch (epoch % 1 == 0 is always true, kept
        # so the interval can be changed in one place).
        if epoch % 1 == 0 or epoch == args.EPOCHS - 1:
            correct = 0
            with torch.no_grad():
                model.eval()
                # Val losses
                val_losses = []
                for val_img, val_label in val_loader:
                    # FIX: original had a stray trailing comma after
                    # `val_img.to(DEVICE)`, turning val_img into a 1-tuple
                    # that then had to be unpacked with `val_img[0]`.
                    val_img = val_img.to(DEVICE)
                    val_label = val_label.to(DEVICE)
                    val_output = model(val_img)
                    loss = criteration(val_output, val_label)
                    val_pred = val_output.detach().cpu().\
                        max(1, keepdim=True)[1]
                    correct += val_pred.eq(
                        val_label.detach().cpu().view_as(val_pred)).\
                        sum().item()
                    val_losses.append(loss.item())
                    del val_img, val_label, val_output, val_pred
            # Val loss curve
            val_avg_loss = np.mean(val_losses)
            val_acc = 100 * correct / len(val_image_path_list)
            # Checkpoint on a new best validation correct-count.
            if (correct > max_correct):
                max_correct = correct
                torch.save(model, MODEL_PATH + '/' + "best.pth")
            print("Epoch {}, Accuracy {:.0f}%".format(
                epoch, 100 * correct / len(val_image_path_list)))
        # LR curve
        # NOTE(review): val_avg_loss/val_acc would be unbound here if the
        # validation block above were ever skipped — safe only while it
        # runs every epoch.
        train_log(train_loss=train_avg_loss,
                  train_acc=acc,
                  val_loss=val_avg_loss,
                  val_acc=val_acc)