import os
import time

import torch
import yaml
from torch.cuda.amp import autocast
from tqdm import tqdm

# The loss/LR helpers (focal_loss, reg_l1_loss, get_lr) and the training-state
# globals used below (optimizer, backbone, model — the unwrapped module that
# `net` may wrap in DataParallel — and, for the logging variant further down,
# writer and log) are assumed to be defined elsewhere in the repo.

# Backbone-branching variant: resnet50 returns a single (hm, wh, offset) triple,
# while hourglass-style backbones return one head per stack, so their losses
# are averaged over the stacks.
def fit_one_epoch(net, epoch, epoch_size, epoch_size_val, gen, genval, Epoch, cuda):
    total_r_loss = 0
    total_c_loss = 0
    total_loss   = 0
    val_loss     = 0
    start_time   = time.time()

    net.train()
    with tqdm(total=epoch_size, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(gen):
            if iteration >= epoch_size:
                break
            with torch.no_grad():
                # torch.autograd.Variable is a no-op since PyTorch 0.4; plain tensors suffice
                if cuda:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in batch]
                else:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in batch]
            batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks = batch

            optimizer.zero_grad()

            if backbone == "resnet50":
                hm, wh, offset = net(batch_images)
                c_loss   = focal_loss(hm, batch_hms)
                wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)

                loss = c_loss + wh_loss + off_loss

                total_loss   += loss.item()
                total_c_loss += c_loss.item()
                total_r_loss += wh_loss.item() + off_loss.item()
            else:
                outputs    = net(batch_images)
                loss       = 0
                c_loss_all = 0
                r_loss_all = 0
                index      = 0
                for output in outputs:
                    hm, wh, offset = output["hm"].sigmoid(), output["wh"], output["reg"]
                    c_loss   = focal_loss(hm, batch_hms)
                    wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                    off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)

                    loss       += c_loss + wh_loss + off_loss
                    c_loss_all += c_loss
                    r_loss_all += wh_loss + off_loss
                    index      += 1
                total_loss   += loss.item() / index
                total_c_loss += c_loss_all.item() / index
                total_r_loss += r_loss_all.item() / index

            loss.backward()
            optimizer.step()

            waste_time = time.time() - start_time
            pbar.set_postfix(**{'total_r_loss': total_r_loss / (iteration + 1),
                                'total_c_loss': total_c_loss / (iteration + 1),
                                'lr'          : get_lr(optimizer),
                                's/step'      : waste_time})
            pbar.update(1)
            start_time = time.time()

    net.eval()
    print('Start Validation')
    with tqdm(total=epoch_size_val, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(genval):
            if iteration >= epoch_size_val:
                break
            with torch.no_grad():
                if cuda:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in batch]
                else:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in batch]
                batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks = batch

                if backbone == "resnet50":
                    hm, wh, offset = net(batch_images)
                    c_loss   = focal_loss(hm, batch_hms)
                    wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                    off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
                    loss = c_loss + wh_loss + off_loss
                    val_loss += loss.item()
                else:
                    outputs = net(batch_images)
                    index   = 0
                    loss    = 0
                    for output in outputs:
                        hm, wh, offset = output["hm"].sigmoid(), output["wh"], output["reg"]
                        c_loss   = focal_loss(hm, batch_hms)
                        wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                        off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
                        loss    += c_loss + wh_loss + off_loss
                        index   += 1
                    val_loss += loss.item() / index

            pbar.set_postfix(**{'total_loss': val_loss / (iteration + 1)})
            pbar.update(1)

    print('Finish Validation')
    print('Epoch:' + str(epoch + 1) + '/' + str(Epoch))
    print('Total Loss: %.4f || Val Loss: %.4f ' % (total_loss / (epoch_size + 1), val_loss / (epoch_size_val + 1)))
    print('Saving state, iter:', str(epoch + 1))
    # `model` is the unwrapped module at module scope, so the checkpoint keys
    # carry no DataParallel 'module.' prefix
    torch.save(model.state_dict(),
               'logs/Epoch%d-Total_Loss%.4f-Val_Loss%.4f.pth'
               % ((epoch + 1), total_loss / (epoch_size + 1), val_loss / (epoch_size_val + 1)))
    return val_loss / (epoch_size_val + 1)
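# Neither focal_loss nor reg_l1_loss is defined in this file. Below is a minimal
# sketch of what these helpers typically look like in CenterNet implementations
# (penalty-reduced pixelwise focal loss on the heatmap, masked L1 on the
# width/height and offset maps); the repo's own definitions may differ in
# details such as clamping or mask expansion.
import torch.nn.functional as F

def focal_loss(pred, target, alpha=2, beta=4):
    # pred/target: (B, C, H, W) heatmaps; pred is already sigmoid-activated
    pred        = torch.clamp(pred, 1e-6, 1 - 1e-6)   # guard log(0)
    pos_mask    = target.eq(1).float()                # ground-truth center pixels
    neg_mask    = target.lt(1).float()
    neg_weights = torch.pow(1 - target, beta)         # down-weight pixels near a peak
    pos_loss = torch.log(pred) * torch.pow(1 - pred, alpha) * pos_mask
    neg_loss = torch.log(1 - pred) * torch.pow(pred, alpha) * neg_weights * neg_mask
    num_pos = pos_mask.sum()
    if num_pos == 0:
        return -neg_loss.sum()
    return -(pos_loss.sum() + neg_loss.sum()) / num_pos

def reg_l1_loss(pred, target, mask):
    # pred: (B, 2, H, W) regression map; target: (B, H, W, 2); mask: (B, H, W)
    pred        = pred.permute(0, 2, 3, 1)            # channel-last, to match target
    expand_mask = torch.unsqueeze(mask, -1).repeat(1, 1, 1, 2)
    loss = F.l1_loss(pred * expand_mask, target * expand_mask, reduction='sum')
    return loss / (mask.sum() + 1e-4)                 # normalize by object count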
# tensorboardX/YAML-logging variant (resnet50 head only): writes train/val loss
# and lr scalars each epoch, appends to a `log` dict persisted as logs/log.yaml,
# and keeps rolling best.pth / last.pth checkpoints.
def fit_one_epoch(net, epoch, epoch_size, epoch_size_val, gen, genval, Epoch, cuda):
    total_r_loss = 0
    total_c_loss = 0
    total_loss   = 0
    val_loss     = 0

    net.train()
    with tqdm(total=epoch_size, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(gen):
            start_time = time.time()
            if iteration >= epoch_size:
                break
            with torch.no_grad():
                if cuda:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in batch]
                else:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in batch]
            batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks = batch

            optimizer.zero_grad()

            hm, wh, offset = net(batch_images)
            c_loss   = focal_loss(hm, batch_hms)
            wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
            off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
            loss = c_loss + wh_loss + off_loss

            total_loss   += loss.item()
            total_c_loss += c_loss.item()
            total_r_loss += wh_loss.item() + off_loss.item()

            loss.backward()
            optimizer.step()

            waste_time = time.time() - start_time
            pbar.set_postfix(**{'total_r_loss': total_r_loss / (iteration + 1),
                                'total_c_loss': total_c_loss / (iteration + 1),
                                'lr'          : get_lr(optimizer),
                                's/step'      : waste_time})
            pbar.update(1)

    net.eval()
    print('Start Validation')
    with tqdm(total=epoch_size_val, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(genval):
            if iteration >= epoch_size_val:
                break
            with torch.no_grad():
                if cuda:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in batch]
                else:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in batch]
                batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks = batch

                hm, wh, offset = net(batch_images)
                c_loss   = focal_loss(hm, batch_hms)
                wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
                loss = c_loss + wh_loss + off_loss
                val_loss += loss.item()

            pbar.set_postfix(**{'total_loss': val_loss / (iteration + 1)})
            pbar.update(1)

    # tensorboardX
    writer.add_scalars('loss', {'train': total_loss / (epoch_size + 1),
                                'val'  : val_loss / (epoch_size_val + 1)}, epoch)
    writer.add_scalar('lr', get_lr(optimizer), epoch)
    writer.flush()

    print('Finish Validation')
    print('Epoch:' + str(epoch + 1) + '/' + str(Epoch))
    print('Total Loss: %.4f || Val Loss: %.4f ' % (total_loss / (epoch_size + 1), val_loss / (epoch_size_val + 1)))
    print('Saving state, iter:', str(epoch + 1))

    # log.yaml — total_loss/val_loss are plain Python floats (accumulated via
    # .item()), so they can be dumped to YAML directly
    avg_train_loss = total_loss / (epoch_size + 1)
    avg_val_loss   = val_loss / (epoch_size_val + 1)
    log['epoch_number'] += 1
    log['Epoch%03d' % (epoch + 1)] = [avg_train_loss, avg_val_loss]
    if log['best_val_loss'] < 0 or avg_val_loss < log['best_val_loss']:
        log['best_val_loss'] = avg_val_loss
        torch.save(model.state_dict(), 'logs/best.pth')
    with open('logs/log.yaml', 'w', encoding='utf-8') as f:
        yaml.dump(log, f)
    torch.save(model.state_dict(), 'logs/last.pth')
    return val_loss / (epoch_size_val + 1)
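# The variant above assumes a tensorboardX SummaryWriter `writer` and a `log`
# dict at module scope. A minimal setup sketch consistent with the usage above
# (the initial-value conventions here are assumptions, not the repo's exact code):
from tensorboardX import SummaryWriter

writer = SummaryWriter(log_dir='logs')
log = {'epoch_number': 0, 'best_val_loss': -1.0}     # best_val_loss < 0 marks "no best yet"
if os.path.exists('logs/log.yaml'):                  # resume bookkeeping across runs
    with open('logs/log.yaml', 'r', encoding='utf-8') as f:
        log = yaml.safe_load(f)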
# Single-head variant using str.format for the tqdm description; the progress
# bar shows the running average total loss.
def fit_one_epoch(net, epoch, epoch_size, epoch_size_val, gen, gen_val, Epoch, cuda):
    total_r_loss = 0
    total_c_loss = 0
    total_loss   = 0
    val_loss     = 0
    start_time   = time.time()

    net.train()
    with tqdm(total=epoch_size, desc='Epoch {}/{}'.format(epoch + 1, Epoch), postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(gen):
            if iteration >= epoch_size:
                break
            with torch.no_grad():
                if cuda:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in batch]
                else:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in batch]
            batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks = batch

            optimizer.zero_grad()
            hm, wh, offset = net(batch_images)
            c_loss   = focal_loss(hm, batch_hms)
            wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
            off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
            loss = c_loss + wh_loss + off_loss

            total_loss   += loss.item()
            total_c_loss += c_loss.item()
            total_r_loss += wh_loss.item() + off_loss.item()

            loss.backward()
            optimizer.step()

            waste_time = time.time() - start_time
            pbar.set_postfix(**{'Total_Loss': total_loss / (iteration + 1),
                                'lr'        : get_lr(optimizer),
                                's/step'    : waste_time})   # seconds per step
            pbar.update(1)
            start_time = time.time()

    net.eval()
    print('Start Validation')
    with tqdm(total=epoch_size_val, desc='Epoch {}/{}'.format(epoch + 1, Epoch), postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(gen_val):
            if iteration >= epoch_size_val:
                break
            with torch.no_grad():
                if cuda:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in batch]
                else:
                    batch = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in batch]
                batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks = batch

                # no optimizer.zero_grad() needed: nothing accumulates gradients under no_grad
                hm, wh, offset = net(batch_images)
                c_loss   = focal_loss(hm, batch_hms)
                wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
                loss = c_loss + wh_loss + off_loss
                val_loss += loss.item()

            pbar.set_postfix(**{'Val_Loss': val_loss / (iteration + 1)})
            pbar.update(1)

    print('Finish Validation')
    print('Epoch:' + str(epoch + 1) + '/' + str(Epoch))
    print('Total Loss: %.4f || Val Loss: %.4f ' % (total_loss / (epoch_size + 1), val_loss / (epoch_size_val + 1)))
    print('Saving state, iter:', str(epoch + 1))
    torch.save(model.state_dict(),
               'logs/Epoch%d-Total_Loss%.4f-Val_Loss%.4f.pth'
               % ((epoch + 1), total_loss / (epoch_size + 1), val_loss / (epoch_size_val + 1)))
    return val_loss / (epoch_size_val + 1)
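# Every variant reads the current learning rate through get_lr, which is not
# defined in this file. A minimal sketch consistent with the calls above:
def get_lr(optimizer):
    # the setups above use a single parameter group, so the first group's lr suffices
    for param_group in optimizer.param_groups:
        return param_group['lr']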
# AMP/DDP-era variant: takes everything it needs as parameters (no globals),
# supports fp16 via a GradScaler, restricts console/pbar output to local_rank 0,
# and checkpoints through loss_history plus periodic / best / last weights.
def fit_one_epoch(model_train, model, loss_history, eval_callback, optimizer, epoch,
                  epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, fp16, scaler,
                  backbone, save_period, save_dir, local_rank=0):
    total_r_loss = 0
    total_c_loss = 0
    total_loss   = 0
    val_loss     = 0

    if local_rank == 0:
        print('Start Train')
        pbar = tqdm(total=epoch_step, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3)
    model_train.train()
    for iteration, batch in enumerate(gen):
        if iteration >= epoch_step:
            break
        with torch.no_grad():
            if cuda:
                batch = [ann.cuda(local_rank) for ann in batch]
        batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks = batch

        #----------------------#
        #   Zero the gradients
        #----------------------#
        optimizer.zero_grad()
        if not fp16:
            if backbone == "resnet50":
                hm, wh, offset = model_train(batch_images)
                c_loss   = focal_loss(hm, batch_hms)
                wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
                loss = c_loss + wh_loss + off_loss

                total_loss   += loss.item()
                total_c_loss += c_loss.item()
                total_r_loss += wh_loss.item() + off_loss.item()
            else:
                outputs    = model_train(batch_images)
                loss       = 0
                c_loss_all = 0
                r_loss_all = 0
                index      = 0
                for output in outputs:
                    hm, wh, offset = output["hm"].sigmoid(), output["wh"], output["reg"]
                    c_loss   = focal_loss(hm, batch_hms)
                    wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                    off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)

                    loss       += c_loss + wh_loss + off_loss
                    c_loss_all += c_loss
                    r_loss_all += wh_loss + off_loss
                    index      += 1
                total_loss   += loss.item() / index
                total_c_loss += c_loss_all.item() / index
                total_r_loss += r_loss_all.item() / index

            loss.backward()
            optimizer.step()
        else:
            with autocast():
                if backbone == "resnet50":
                    hm, wh, offset = model_train(batch_images)
                    c_loss   = focal_loss(hm, batch_hms)
                    wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                    off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
                    loss = c_loss + wh_loss + off_loss

                    total_loss   += loss.item()
                    total_c_loss += c_loss.item()
                    total_r_loss += wh_loss.item() + off_loss.item()
                else:
                    outputs    = model_train(batch_images)
                    loss       = 0
                    c_loss_all = 0
                    r_loss_all = 0
                    index      = 0
                    for output in outputs:
                        hm, wh, offset = output["hm"].sigmoid(), output["wh"], output["reg"]
                        c_loss   = focal_loss(hm, batch_hms)
                        wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                        off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)

                        loss       += c_loss + wh_loss + off_loss
                        c_loss_all += c_loss
                        r_loss_all += wh_loss + off_loss
                        index      += 1
                    total_loss   += loss.item() / index
                    total_c_loss += c_loss_all.item() / index
                    total_r_loss += r_loss_all.item() / index

            #----------------------#
            #   Backpropagation
            #----------------------#
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        if local_rank == 0:
            pbar.set_postfix(**{'total_r_loss': total_r_loss / (iteration + 1),
                                'total_c_loss': total_c_loss / (iteration + 1),
                                'lr'          : get_lr(optimizer)})
            pbar.update(1)

    if local_rank == 0:
        pbar.close()
        print('Finish Train')
        print('Start Validation')
        pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3)
    model_train.eval()
    for iteration, batch in enumerate(gen_val):
        if iteration >= epoch_step_val:
            break
        with torch.no_grad():
            if cuda:
                batch = [ann.cuda(local_rank) for ann in batch]
            batch_images, batch_hms, batch_whs, batch_regs, batch_reg_masks = batch

            if backbone == "resnet50":
                hm, wh, offset = model_train(batch_images)
                c_loss   = focal_loss(hm, batch_hms)
                wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
                loss = c_loss + wh_loss + off_loss
                val_loss += loss.item()
            else:
                outputs = model_train(batch_images)
                index   = 0
                loss    = 0
                for output in outputs:
                    hm, wh, offset = output["hm"].sigmoid(), output["wh"], output["reg"]
                    c_loss   = focal_loss(hm, batch_hms)
                    wh_loss  = 0.1 * reg_l1_loss(wh, batch_whs, batch_reg_masks)
                    off_loss = reg_l1_loss(offset, batch_regs, batch_reg_masks)
                    loss    += c_loss + wh_loss + off_loss
                    index   += 1
                val_loss += loss.item() / index

        if local_rank == 0:
            pbar.set_postfix(**{'val_loss': val_loss / (iteration + 1)})
            pbar.update(1)

    if local_rank == 0:
        pbar.close()
        print('Finish Validation')
        loss_history.append_loss(epoch + 1, total_loss / epoch_step, val_loss / epoch_step_val)
        eval_callback.on_epoch_end(epoch + 1, model_train)
        print('Epoch:' + str(epoch + 1) + '/' + str(Epoch))
        print('Total Loss: %.3f || Val Loss: %.3f ' % (total_loss / epoch_step, val_loss / epoch_step_val))

        #-----------------------------------------------#
        #   Save the weights
        #-----------------------------------------------#
        if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch:
            torch.save(model.state_dict(),
                       os.path.join(save_dir, 'ep%03d-loss%.3f-val_loss%.3f.pth'
                                    % (epoch + 1, total_loss / epoch_step, val_loss / epoch_step_val)))

        if len(loss_history.val_loss) <= 1 or (val_loss / epoch_step_val) <= min(loss_history.val_loss):
            print('Save best model to best_epoch_weights.pth')
            torch.save(model.state_dict(), os.path.join(save_dir, "best_epoch_weights.pth"))

        torch.save(model.state_dict(), os.path.join(save_dir, "last_epoch_weights.pth"))
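# The fp16 path above expects the caller to create the GradScaler. A minimal
# wiring sketch — the flag values and the commented-out setup are illustrative
# assumptions; only the scaler handling follows standard torch.cuda.amp usage:
from torch.cuda.amp import GradScaler

fp16   = True
scaler = GradScaler() if fp16 else None
# ... build model / model_train, optimizer, gen / gen_val, loss_history and
#     eval_callback, then call per epoch:
# fit_one_epoch(model_train, model, loss_history, eval_callback, optimizer, epoch,
#               epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda=True,
#               fp16=fp16, scaler=scaler, backbone="resnet50", save_period=10,
#               save_dir="logs")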