def build_reid_train_loader(cfg):
    train_transforms = build_transforms(cfg, is_train=True)
    logger = logging.getLogger(__name__)

    # Gather training items from every dataset named in the config.
    train_items = list()
    for d in cfg.DATASETS.NAMES:
        logger.info('prepare training set {}'.format(d))
        dataset = DATASET_REGISTRY.get(d)(cfg)
        logger.info(dataset)
        train_items.extend(dataset.train)

    train_set = BlackreidDataset(train_items, train_transforms, mode='train', relabel=True)

    num_workers = cfg.DATALOADER.NUM_WORKERS
    batch_size = cfg.SOLVER.IMS_PER_BATCH
    num_instance = cfg.DATALOADER.NUM_INSTANCE

    # PK sampling draws identities with a fixed number of instances each;
    # otherwise fall back to a plain shuffled sampler over the whole set.
    if cfg.DATALOADER.PK_SAMPLER:
        data_sampler = samplers.RandomIdentitySampler(train_set.img_items, batch_size, num_instance)
    else:
        data_sampler = samplers.TrainingSampler(len(train_set))
    batch_sampler = torch.utils.data.sampler.BatchSampler(data_sampler, batch_size, drop_last=True)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=fast_batch_collator,
    )
    return data_prefetcher(cfg, train_loader)
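# `data_prefetcher` is referenced throughout this file but not defined in it,
# and its signature varies per call site (some callers pass a cfg, one passes a
# list of field indices). Below is a minimal sketch of the usual CUDA-stream
# prefetcher pattern (as popularized by the NVIDIA Apex examples), assuming the
# simplest form: batches arrive as (input, target) pairs and the DataLoader was
# built with pin_memory=True so non_blocking copies can overlap with compute.
class data_prefetcher:
    """Overlap host-to-device copies of the next batch with compute on the current one."""

    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()  # side stream for the async copies
        self.preload()

    def preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            # Signal exhaustion the way the loops in this file expect: next() -> (None, None).
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def next(self):
        # Block the compute stream until the queued copies have finished.
        torch.cuda.current_stream().wait_stream(self.stream)
        input, target = self.next_input, self.next_target
        if input is not None:
            input.record_stream(torch.cuda.current_stream())
        if target is not None:
            target.record_stream(torch.cuda.current_stream())
        self.preload()
        return input, target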
def get_train_loader(self):
    opt = self.opt
    transforms = torchvision.transforms.Compose([
        torchvision.transforms.Resize(opt.pretrained_image_size),
        torchvision.transforms.ToTensor(),
    ])
    train_dataset = PFADataset(age_group=opt.age_group,
                               max_iter=opt.max_iter,
                               batch_size=opt.batch_size * len(opt.device_ids),
                               dataset_name=opt.dataset_name,
                               source=opt.source,
                               transforms=transforms)
    train_sampler = tordata.distributed.DistributedSampler(train_dataset, shuffle=False)
    train_loader = tordata.DataLoader(dataset=train_dataset,
                                      batch_size=opt.batch_size,
                                      drop_last=True,
                                      num_workers=opt.num_workers,
                                      pin_memory=True,
                                      sampler=train_sampler)
    # Batch layout: source_img, true_img, source_label, target_label,
    # true_label, true_age, mean_age; prefetch only fields 0 and 1 to the GPU.
    return data_prefetcher(train_loader, [0, 1])
def build_reid_test_loader(cfg, dataset_name):
    test_transforms = build_transforms(cfg, is_train=False)
    logger = logging.getLogger(__name__)
    logger.info('prepare test set {}'.format(dataset_name))

    dataset = DATASET_REGISTRY.get(dataset_name)(cfg)
    logger.info(dataset)

    # Query and gallery are evaluated in one pass; the query length is returned
    # so the caller can split the extracted features back apart.
    test_items = dataset.query + dataset.gallery
    test_set = BlackreidDataset(test_items, test_transforms, mode='test', relabel=False)

    num_workers = cfg.DATALOADER.NUM_WORKERS
    batch_size = cfg.TEST.IMS_PER_BATCH
    data_sampler = samplers.InferenceSampler(len(test_set))
    batch_sampler = torch.utils.data.BatchSampler(data_sampler, batch_size, drop_last=False)
    test_loader = DataLoader(
        test_set,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        collate_fn=fast_batch_collator)
    return data_prefetcher(cfg, test_loader), len(dataset.query)
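# `fast_batch_collator` replaces the DataLoader's default collate function. A
# plausible sketch in the spirit of fastreid's collator, assuming each item is
# a tensor, a dict of fields, an int, or a string, and that torch is imported
# at module level as elsewhere in this file:
def fast_batch_collator(batched_inputs):
    """Collate a list of dataset items into batched tensors with minimal overhead."""
    elem = batched_inputs[0]
    if isinstance(elem, torch.Tensor):
        # Pre-allocate the batch tensor and copy items in directly.
        out = torch.zeros((len(batched_inputs), *elem.size()), dtype=elem.dtype)
        for i, tensor in enumerate(batched_inputs):
            out[i] += tensor
        return out
    elif isinstance(elem, dict):
        # Recurse per field so dict-style dataset items stay dicts of batches.
        return {key: fast_batch_collator([d[key] for d in batched_inputs]) for key in elem}
    elif isinstance(elem, int):
        return torch.tensor(batched_inputs)
    elif isinstance(elem, str):
        return batched_inputs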
def train(train_loader, val_loader, model, optimizer, criterion, lr_scheduler, device, opt):
    '''Model training loop.

    :param train_loader: training DataLoader
    :param val_loader: validation DataLoader
    :param model: network to train
    :param optimizer: optimizer
    :param criterion: loss function (e.g. weighted binary cross-entropy)
    :param lr_scheduler: LRScheduler
    :param device: torch.device
    :param opt: options namespace
    '''
    total_step = len(train_loader)
    best_acc = -1
    losses = AverageMeter()
    batch_time = AverageMeter()
    end = time.time()
    print_freq = 20
    iter_per_epoch = len(train_loader)
    iter_sum = iter_per_epoch * opt.epochs
    fast_train = hasattr(opt, 'fast_train')
    writer = SummaryWriter(opt.model_save_path)

    for epoch in range(opt.epochs):
        model.train()
        prefetcher = data_prefetcher(train_loader)
        datas, ages, sexs, labels = prefetcher.next()
        i = 0
        while datas is not None:
            i += 1
            lr_scheduler.update(i, epoch)
            # Defined every step (not only on print steps) so the epoch-end
            # writer calls below never hit an undefined variable.
            iter_used = epoch * iter_per_epoch + i
            datas = datas.to(device)
            ages = ages.to(device)
            sexs = sexs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(datas, ages, sexs)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update meters for tensorboard
            batch_time.update(time.time() - end)
            losses.update(loss.item(), datas.size(0))

            # Periodic logging
            if (i + 1) % print_freq == 0:
                used_time = batch_time.sum
                total_time = used_time / iter_used * iter_sum
                used_time = str(datetime.timedelta(seconds=used_time))
                total_time = str(datetime.timedelta(seconds=total_time))
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, LR: {:.5f}, Time [{:.7s}/{:.7s}]'
                      .format(epoch + 1, opt.epochs, i + 1, total_step, loss.item(),
                              optimizer.param_groups[0]['lr'], used_time, total_time),
                      flush=True)
                writer.add_scalar('Learning_rate', optimizer.param_groups[0]['lr'], iter_used)
                writer.add_scalar('Train/Avg_Loss', losses.avg, iter_used)
            end = time.time()
            datas, ages, sexs, labels = prefetcher.next()

        if not fast_train:
            # Accuracy on the training set
            acc_train = val(train_loader, model, device)
            print('Train Accuracy: {} %'.format(acc_train), flush=True)
            writer.add_scalar('Train/F1_Score', acc_train, iter_used)
            # Accuracy on the validation set
            acc_val = val(val_loader, model, device)
            if acc_val > best_acc:
                best_acc = acc_val
                # Save the model checkpoint only late in training
                if epoch > int(opt.epochs * 0.8):
                    save_name = opt.model + '_e{}.ckpt'.format(epoch)
                    save_path = opt.model_save_path + save_name
                    torch.save(model.state_dict(), save_path)
            print('Validation Accuracy: {} %'.format(acc_val), flush=True)
            writer.add_scalar('Validation/F1_Score', acc_val, iter_used)
        else:
            if epoch > int(opt.epochs * 0.8):
                acc_val = val(val_loader, model, device)
                if acc_val > best_acc:
                    best_acc = acc_val
                    save_name = opt.model + '_e{}.ckpt'.format(epoch)
                    save_path = opt.model_save_path + save_name
                    torch.save(model.state_dict(), save_path)
    return
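# `AverageMeter` is used by train() above but not defined in this snippet. A
# minimal sketch of the standard utility (as in the PyTorch ImageNet example),
# providing the .sum, .avg, and .update(...) members the loop above relies on:
class AverageMeter:
    """Track the latest value, running sum, count, and average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        # `n` is how many samples `val` was averaged over (e.g. the batch size).
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count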
def train():
    model, pe = make_model(class_num)

    # Fresh start: Kaiming init for weight matrices, constants for biases and
    # norm parameters; otherwise resume from the previous stage's checkpoint.
    if opt["previous_stage"] is None:
        for name, param in model.named_parameters():
            if param.dim() > 1:
                init.kaiming_normal_(param)
            elif 'weight' in name:
                init.constant_(param, 1)
            else:
                init.constant_(param, 0)
    else:
        model.load_state_dict(torch.load("./" + opt["model_name"] + opt["previous_stage"] + ".pkl"))
    model = model.cuda()
    pe = pe.cuda()

    criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda()
    if opt["optimizer"] == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=opt["lr"])
    elif opt["optimizer"] == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=opt["lr"], momentum=0.9)
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=3, verbose=True)
    else:
        optimizer = adabound.AdaBound(model.parameters())
    if opt["previous_stage"] is not None:
        optimizer.load_state_dict(torch.load("./" + opt["model_name"] + opt["previous_stage"] + opt["optimizer"] + ".pkl"))
        if opt["optimizer"] == "SGD":
            lr_scheduler.load_state_dict(torch.load("./" + opt["model_name"] + opt["previous_stage"] + opt["optimizer"] + "_lrScheduler.pkl"))

    TrainSet = dataset("/data1/luofuyou/ReCTS_lmdb", split_ratio=0.8, img_size=opt["img_size"])
    num_batch = int(TrainSet.len / opt["batch_size"])
    num_iter = opt["num_epoch"] * num_batch
    print(f"TrainSet_len:{TrainSet.len}")
    print("batch_size:%d;num_batch:%d;num_epoch:%d;iter:%d;optimizer:%s;lr:%.2f;train_stage:%s" % (
        opt["batch_size"], num_batch, opt["num_epoch"], num_iter, opt["optimizer"],
        optimizer.param_groups[0]["lr"], opt["train_stage"]))
    ValidateSet = dataset("/data1/luofuyou/ReCTS_lmdb", bias=0.01, img_size=opt["img_size"])
    print(f"ValidateSet_len:{ValidateSet.len}")

    totalTimeS = time.time()
    mask = make_mask(opt["max_len"] + 2)  # presumably +2 leaves room for the [GO]/[s] tokens
    if torch.cuda.device_count() > 1:
        updater = MultiGPUTrainer(model, criterion, optimizer, pe, mask)
    else:
        updater = OneGPUTrainer(model, criterion, optimizer, pe, mask)

    for epoch in range(1, opt["num_epoch"] + 1):
        torch.cuda.empty_cache()
        model.train()
        total_loss = torch.zeros(1).cuda()
        loader = DataLoader(TrainSet, batch_size=opt["batch_size"], shuffle=True,
                            num_workers=8, pin_memory=True, drop_last=True)
        prefetcher = data_prefetcher(loader)
        start = time.time()
        X, Y = prefetcher.next()
        while X is not None:
            loss = updater.update(X, Y)
            total_loss += loss
            X, Y = prefetcher.next()
        end = time.time()
        print(f"epoch:{epoch};avg_loss:{total_loss.item() / num_batch};time consumed:{time_interval(end - start)}")

        torch.save(model.state_dict(), opt["model_name"] + opt["train_stage"] + ".pkl")
        CCR = test(model, pe, ValidateSet, opt["max_len"], make_mask, class_num, converter, error_analysis=False)
        torch.save(optimizer.state_dict(), opt["model_name"] + opt["train_stage"] + opt["optimizer"] + ".pkl")
        if opt["optimizer"] == "SGD":
            lr_scheduler.step(total_loss.item() / num_batch)
            torch.save(lr_scheduler.state_dict(), opt["model_name"] + opt["train_stage"] + opt["optimizer"] + "_lrScheduler.pkl")

    totalTimeE = time.time()
    print(f"the training has finished;time consumed is {time_interval(totalTimeE - totalTimeS)}")
    TrainSet.env.close()
    ValidateSet.env.close()
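# `make_mask` is not defined in this snippet. For a Transformer-style decoder
# it is typically a causal (no-peeking) attention mask; below is a minimal
# sketch under that assumption, returning True where attention is allowed.
# Whether the real mask lives on the GPU, or uses the opposite convention, is
# a detail of the actual repo.
def make_mask(size):
    """Square mask that blocks attention to future positions."""
    subsequent = torch.triu(torch.ones((1, size, size), dtype=torch.uint8), diagonal=1)
    return (subsequent == 0).cuda()  # shape (1, size, size); True = may attend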
def train():
    model = LSTMBase(opt)
    if opt["train_stage"] == 1.0:
        for name, param in model.named_parameters():
            if 'localization_fc2' in name:
                print(f'Skip {name} as it is already initialized')
                continue
            try:
                if 'bias' in name:
                    init.constant_(param, 0.0)
                elif 'weight' in name:
                    init.kaiming_normal_(param)
            except Exception:
                # BatchNorm weights are 1-D, so kaiming_normal_ raises; fill with 1 instead.
                if 'weight' in name:
                    param.data.fill_(1)
                continue
    else:
        model.load_state_dict(
            torch.load("./LSTMBase_params_{}.pkl".format(opt["train_stage"] - 1.0)))

    criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda()
    optimizer = optim.Adadelta(model.parameters(), lr=opt["lr"], rho=opt["rho"], eps=opt["eps"])

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    else:
        model = model.cuda()
    model.train()

    ReCTS = dataset("/data1/luofuyou/ReCTS_lmdb", split_ratio=0.96, img_size=opt["img_size"])
    num_batch = int(ReCTS.len / opt["batch_size"])
    num_iter = opt["num_epoch"] * num_batch
    print("dataSet_len:%d;batch_size:%d;num_batch:%d;num_epoch:%d;iter:%d;lr=%f"
          % (ReCTS.len, opt["batch_size"], num_batch, opt["num_epoch"], num_iter, opt["lr"]))

    totalTimeS = time.time()
    p = 0  # only used by the disabled LR-decay block below
    for epoch in range(1, opt["num_epoch"] + 1):
        torch.cuda.empty_cache()
        total_loss = 0.0
        dataLoader = DataLoader(ReCTS, batch_size=opt["batch_size"], shuffle=True,
                                num_workers=8, pin_memory=True, drop_last=True)
        prefetcher = data_prefetcher(dataLoader)
        epochTimeS = time.time()
        X, Y = prefetcher.next()
        while X is not None:
            try:
                text = converter.encode(Y, opt["batch_max_length"])
                preds = model(X, text)
                target = text[:, 1:]  # without [GO] symbol
                cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1))

                optimizer.zero_grad()
                cost.backward()
                clip_grad_norm_(model.parameters(), opt["grad_clip"])
                optimizer.step()

                total_loss += cost.item()
                X, Y = prefetcher.next()
            except Exception as e:
                # Assume the failure came from a label longer than batch_max_length
                # and retry the same batch with a larger limit.
                print(e)
                opt["batch_max_length"] += 4
        epochTimeE = time.time()
        print("epoch:%d;avg_loss:%f;time consumed:%s"
              % (epoch, total_loss / num_batch, time_interval(epochTimeE - epochTimeS)))

        # Step-decay of the LR at 60% / 80% of training is disabled:
        # if epoch % int(0.6 * opt["num_epoch"]) == 0 or epoch % int(0.8 * opt["num_epoch"]) == 0:
        #     p += 1
        #     lr = opt["lr"] * np.power(0.1, p)
        #     optimizer.param_groups[0]["lr"] = lr

        if torch.cuda.device_count() > 1:
            torch.save(model.module.state_dict(), "LSTMBase_params_{}.pkl".format(opt["train_stage"]))
        else:
            torch.save(model.state_dict(), "LSTMBase_params_{}.pkl".format(opt["train_stage"]))

    ReCTS.env.close()
    totalTimeE = time.time()
    print("the training has finished;the total time consumed is %s" % time_interval(totalTimeE - totalTimeS))
def train(args):
    start_epoch = 0
    data_loader = DataLoader(dataset=FaceDetectSet(416, True), batch_size=args.batch,
                             shuffle=True, num_workers=16)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    model = MSSD()
    print("add graph")
    writer.add_graph(model, torch.zeros((1, 3, 416, 416)))
    print("add graph over")

    if args.pretrained and os.path.exists(MODEL_SAVE_PATH):
        print("loading ...")
        state = torch.load(MODEL_SAVE_PATH)
        model.load_state_dict(state['net'])
        start_epoch = state['epoch']
        print("loading over")
    model = torch.nn.DataParallel(model, device_ids=[0, 1])  # multi-GPU
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)
    scheduler = StepLR(optimizer, step_size=args.step, gamma=args.gama)
    train_loss = 0
    loss_func = MLoss().to(device)
    to_pil_img = tfs.ToPILImage()
    to_tensor = tfs.ToTensor()

    for epoch in range(start_epoch, start_epoch + args.epoes):
        model.train()
        prefetcher = data_prefetcher(data_loader)
        img_tensor, label_tensor = prefetcher.next()
        last_img_tensor = img_tensor
        last_label_tensor = label_tensor
        optimizer.zero_grad()
        i_batch = 0
        while img_tensor is not None:
            last_img_tensor = img_tensor
            last_label_tensor = label_tensor
            output = model(img_tensor)
            loss = loss_func(output, label_tensor)
            if loss is None:
                img_tensor, label_tensor = prefetcher.next()
                continue
            loss.backward()
            # Gradient accumulation: step only every `mini_batch` iterations.
            if i_batch % args.mini_batch == 0:
                optimizer.step()
                optimizer.zero_grad()
            train_loss = loss.item()
            global_step = epoch * len(data_loader) + i_batch
            progress_bar(i_batch, len(data_loader), 'loss: %f, epoch: %d' % (train_loss, epoch))
            writer.add_scalar("loss", train_loss, global_step=global_step)
            img_tensor, label_tensor = prefetcher.next()
            i_batch += 1

        # Save one picture with the last batch's detections drawn on it.
        pil_img = to_pil_img(last_img_tensor[0].cpu())
        bboxes = tensor2bbox(output[0], 416, [52, 26, 13], thresh=0.5)
        # bboxes = nms(bboxes, 0.6, 0.5)
        draw = ImageDraw.Draw(pil_img)
        for bbox in bboxes:
            # bbox layout (score, cx, cy, w, h): draw the score above the box,
            # then a two-pixel rectangle around the detection.
            draw.text((bbox[1] - bbox[3] / 2, bbox[2] - bbox[4] / 2 - 10),
                      str(round(bbox[0].item(), 2)), fill=(255, 0, 0))
            draw.rectangle((bbox[1] - bbox[3] / 2, bbox[2] - bbox[4] / 2,
                            bbox[1] + bbox[3] / 2, bbox[2] + bbox[4] / 2),
                           outline=(0, 255, 0))
            draw.rectangle((bbox[1] - bbox[3] / 2 + 1, bbox[2] - bbox[4] / 2 + 1,
                            bbox[1] + bbox[3] / 2 - 1, bbox[2] + bbox[4] / 2 - 1),
                           outline=(0, 255, 0))
        writer.add_image("img: " + str(epoch), to_tensor(pil_img))

        scheduler.step()
        if epoch % 10 == 0:
            print('Saving..')
            state = {
                'net': model.module.state_dict(),
                'epoch': epoch,
            }
            torch.save(state, "./data/mssd_face_detect" + str(epoch) + ".pt")

    if not os.path.isdir('data'):
        os.mkdir('data')
    print('Saving..')
    state = {
        'net': model.module.state_dict(),
        'epoch': epoch,
    }
    torch.save(state, MODEL_SAVE_PATH)
    writer.close()
def train():
    model, pe = make_model(class_num)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda()

    if opt["optimizer"] == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=opt["lr"])
    elif opt["optimizer"] == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=opt["lr"], momentum=0.9)
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=5, verbose=True)
    else:
        optimizer = adabound.AdaBound(model.parameters())

    # Fresh start: Kaiming init for weight matrices, zeros otherwise;
    # else resume model, optimizer, and scheduler from the previous stage.
    if opt["previous_stage"] is None:
        for param in model.parameters():
            if param.dim() > 1:
                init.kaiming_normal_(param)
            else:
                init.constant_(param, 0)
    else:
        model.load_state_dict(torch.load("./" + opt["model_name"] + opt["previous_stage"] + ".pkl"))
        optimizer.load_state_dict(torch.load("./" + opt["model_name"] + opt["previous_stage"] + opt["optimizer"] + ".pkl"))
        if opt["optimizer"] == "SGD":
            lr_scheduler.load_state_dict(torch.load("./" + opt["model_name"] + opt["previous_stage"] + opt["optimizer"] + "_lrScheduler.pkl"))
    model = model.cuda()
    pe = pe.cuda()

    TrainSet = dataset("/data1/luofuyou/ReCTS_lmdb", split_ratio=0.99, img_size=opt["img_size"])
    num_batch = int(TrainSet.len / opt["batch_size"])
    num_iter = opt["num_epoch"] * num_batch
    print(f"TrainSet_len:{TrainSet.len}")
    print("batch_size:%d;num_batch:%d;num_epoch:%d;iter:%d;optimizer:%s;lr:%.2f;train_stage:%s" % (
        opt["batch_size"], num_batch, opt["num_epoch"], num_iter, opt["optimizer"], opt["lr"], opt["train_stage"]))
    ValidateSet = dataset("/data1/luofuyou/ReCTS_lmdb", bias=0.99, img_size=opt["img_size"])
    print(f"ValidateSet_len:{ValidateSet.len}")

    totalTimeS = time.time()
    updater = OneGPUTrainer(model, criterion, optimizer, pe)

    for epoch in range(1, opt["num_epoch"] + 1):
        model.train()
        total_loss = torch.zeros(1).cuda()
        loader = DataLoader(TrainSet, batch_size=opt["batch_size"], shuffle=True,
                            num_workers=8, pin_memory=True, drop_last=True)
        prefetcher = data_prefetcher(loader)
        start = time.time()
        X, Y = prefetcher.next()
        while X is not None:
            loss = updater.update(X, Y)
            total_loss += loss
            X, Y = prefetcher.next()
        end = time.time()
        print(f"epoch:{epoch};avg_loss:{total_loss.item() / num_batch};time consumed:{time_interval(end - start)}")
        torch.cuda.empty_cache()
        torch.save(model.state_dict(), opt["model_name"] + opt["train_stage"] + ".pkl")

        # Validation: greedy decode, then score with the normalized edit distance (1 - NED).
        loader = DataLoader(ValidateSet, batch_size=44, shuffle=False,
                            num_workers=8, pin_memory=True, drop_last=False)
        prefetcher = data_prefetcher(loader)
        CCR = 0.0
        start = time.time()
        model.eval()
        with torch.no_grad():  # evaluation needs no gradients
            X, Y = prefetcher.next()
            while X is not None:
                preds = model(X, pe)
                text_indexs = torch.argmax(preds, dim=-1)
                texts = converter.decode(text_indexs)
                for text, label in zip(texts, Y):
                    text = text[:text.find('[s]')]  # cut at the end-of-sequence token
                    try:
                        NED = 1 - editdistance.distance(text, label) / max(len(text), len(label))
                        CCR += NED
                    except ZeroDivisionError:  # both prediction and label empty
                        pass
                X, Y = prefetcher.next()
        CCR /= ValidateSet.len
        CCR = round(100 * CCR, 2)
        end = time.time()
        print(f"CCR:{CCR}%;time consumed:{time_interval(end - start)}")
        torch.cuda.empty_cache()

        torch.save(optimizer.state_dict(), opt["model_name"] + opt["train_stage"] + opt["optimizer"] + ".pkl")
        if opt["optimizer"] == "SGD":
            lr_scheduler.step(total_loss.item() / num_batch)
            torch.save(lr_scheduler.state_dict(), opt["model_name"] + opt["train_stage"] + opt["optimizer"] + "_lrScheduler.pkl")

    totalTimeE = time.time()
    print(f"the training has finished;time consumed is {time_interval(totalTimeE - totalTimeS)}")
    TrainSet.env.close()
    ValidateSet.env.close()
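# `OneGPUTrainer` / `MultiGPUTrainer` are not shown in this file. Below is a
# speculative single-GPU sketch consistent with how `updater.update(X, Y)` is
# called above; the label-encoding call, the forward signature, and the use of
# `opt["max_len"]` are all assumptions patterned on the LSTM loop earlier in
# this file, not the repo's actual implementation.
class OneGPUTrainer:
    """One optimization step per update() call; returns a detached loss for accumulation."""

    def __init__(self, model, criterion, optimizer, pe, mask=None):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.pe = pe
        self.mask = mask

    def update(self, X, Y):
        # Hypothetical: encode string labels to index tensors, as in the LSTM loop.
        text = converter.encode(Y, opt["max_len"])
        # Hypothetical forward signature: images, positional encoding,
        # shifted targets, and (optionally) the causal mask.
        preds = self.model(X, self.pe, text[:, :-1], self.mask)
        loss = self.criterion(preds.view(-1, preds.shape[-1]),
                              text[:, 1:].contiguous().view(-1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.detach()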