def __init__(self, model, dataset, ctx=-1, batch_size=128, optimizer='sgd', lambdas=None, print_freq=32):
    """Set up the trainer: two SGD optimizers, loss recorders, and the device.

    Args:
        model: network exposing age_classifier / RFM / margin_fc / backbone / DAL
            submodules (each providing .parameters()).
        dataset: dataset description consumed later by the training loop.
        ctx: GPU index; any negative value selects CPU.
        batch_size: mini-batch size.
        optimizer: kept for interface compatibility (currently unused; SGD is hard-coded).
        lambdas: weights for the auxiliary loss terms; defaults to [0.1, 0.1].
        print_freq: iterations between progress prints.
    """
    # Fix: the original signature used a mutable default argument
    # (lambdas=[0.1, 0.1]); replaced with a None sentinel.
    if lambdas is None:
        lambdas = [0.1, 0.1]
    self.model = model
    self.dataset = dataset
    self.batch_size = batch_size
    # One optimizer for everything except the DAL branch.
    self.optbb = optim.SGD(
        chain(
            self.model.age_classifier.parameters(),
            self.model.RFM.parameters(),
            self.model.margin_fc.parameters(),
            self.model.backbone.parameters()
        ),
        lr=0.01, momentum=0.9
    )
    # Separate optimizer for the DAL submodule.
    self.optDAL = optim.SGD(self.model.DAL.parameters(), lr=0.01, momentum=0.9)
    self.lambdas = lambdas
    self.print_freq = print_freq
    self.id_recorder = Recorder()
    self.age_recorder = Recorder()
    self.trainingDAL = False
    # Negative ctx means CPU; otherwise a specific CUDA device.
    self.ctx = torch.device('cpu') if ctx < 0 else torch.device(f'cuda:{ctx}')
def __init__(self, arg):
    """Store parsed arguments and build the full training context.

    Construction is order-sensitive: save_arg() persists the config to disk
    first, then device/recorder/statistics are created, and finally Loading()
    builds the model and optimizer (which may read everything set up before it).
    """
    self.arg = arg
    self.save_arg()  # persist the run configuration before anything else
    if self.arg.random_fix:
        # Seeded RNG requested via the random_fix flag.
        self.rng = RandomState(seed=self.arg.random_seed)
    self.device = GpuDataParallel()
    self.recoder = Recorder(self.arg.work_dir, self.arg.print_log)
    self.data_loader = {}
    self.topk = (1, 5)  # top-k accuracies reported during evaluation
    self.stat = Stat(self.arg.model_args['num_classes'], self.topk)
    # Loading() returns (model, optimizer); it runs last on purpose.
    self.model, self.optimizer = self.Loading()
    self.loss = self.criterion()
def __init__(self, worker_id, num_env, game_name, n_stack, child_conn, args):
    """Worker process owning `num_env` Atari environments.

    Args:
        worker_id: index of this worker; used to derive global env indices.
        num_env: number of environments this worker manages.
        game_name: game identifier passed to the `atari` env factory.
        n_stack: number of stacked frames per observation.
        child_conn: pipe endpoint for communicating with the parent process.
        args: run configuration (only `record` is read here).
    """
    super(worker, self).__init__()
    self.daemon = True  # worker dies with the parent process
    self.worker_id = worker_id
    self.num_env = num_env
    self.n_stack = n_stack
    self.child_conn = child_conn
    self.args = args
    self.envs = []
    # Global index of this worker's first environment.
    self.index_base = worker_id * num_env
    self.episode_length = [0] * num_env
    for _ in range(num_env):
        # Stagger env creation slightly; presumably avoids emulator
        # initialization races -- TODO confirm.
        time.sleep(0.1)
        env = atari(game_name, n_stack)
        env.reset()
        self.envs.append(env)
    # Fixes: idiomatic truthiness test instead of `== True`; removed the
    # unused `access_index` local; reuse index_base instead of recomputing
    # worker_id * num_env.
    if args.record:
        self.recorder = [
            Recorder(int(self.index_base + i), game_name)
            for i in range(num_env)
        ]
def __init__(
    self, model, dataset, ctx=-1, batch_size=128, optimizer='sgd',
    grad_accu=1, lambdas=None, print_freq=32, train_head_only=True
):
    """Trainer setup: RAdam optimizers, AMP grad scalers, fine-tune layer choice.

    Args:
        model: network exposing age_classifier / RFM / margin_fc / backbone / DAL.
        dataset: dataset description consumed by the training loop.
        ctx: GPU index; any negative value selects CPU.
        batch_size: mini-batch size.
        optimizer: kept for interface compatibility (currently unused; RAdam is hard-coded).
        grad_accu: number of batches over which gradients are accumulated.
        lambdas: auxiliary-loss weights; defaults to [0.05, 0.1].
        print_freq: iterations between progress prints.
        train_head_only: when False, the tail of the backbone
            (last_bn / last_linear / block8) is also optimized, at a 10x
            smaller learning rate.
    """
    # Fix: the original signature used a mutable default argument
    # (lambdas=[0.05, 0.1]); replaced with a None sentinel.
    if lambdas is None:
        lambdas = [0.05, 0.1]
    self.model = model
    self.dataset = dataset
    self.batch_size = batch_size
    # Backbone layers eligible for fine-tuning when train_head_only is False.
    self.finetune_layers = (
        self.model.backbone.last_bn,
        self.model.backbone.last_linear,
        self.model.backbone.block8
    )
    # Param group for the heads; a second, lower-LR group is appended for
    # the backbone tail when fine-tuning is enabled.
    first_group = [
        {
            "params": chain(
                self.model.age_classifier.parameters(),
                self.model.RFM.parameters(),
                self.model.margin_fc.parameters(),
            ),
            "lr": 5e-4
        }
    ]
    if not train_head_only:
        first_group.append(
            {
                "params": chain(
                    *(x.parameters() for x in self.finetune_layers)
                ),
                "lr": 5e-5
            }
        )
    self.optbb = RAdam(first_group)
    self.optDAL = RAdam(self.model.DAL.parameters(), lr=5e-4)
    self.lambdas = lambdas
    self.print_freq = print_freq
    self.id_recorder = Recorder()
    self.age_recorder = Recorder()
    self.trainingDAL = False
    # Negative ctx means CPU; otherwise a specific CUDA device.
    self.ctx = torch.device('cpu') if ctx < 0 else torch.device(f'cuda:{ctx}')
    # One AMP scaler per optimizer so their scale factors evolve independently.
    self.scaler1 = GradScaler()
    self.scaler2 = GradScaler()
    self.grad_accu = grad_accu
    self.train_head_only = train_head_only
def test(args):
    """Load the newest checkpoint for `args.tag`, play 10 games, record a video.

    Side effects: writes an .avi through Recorder and prints per-game and
    total scores/run times.
    """
    # Latest checkpoint wins (paths sort lexicographically).
    model_path = sorted(glob(os.path.join('ckpt', args.tag, '*.pth')))[-1]
    model = torch.load(model_path, map_location='cpu').eval()
    print('Loaded model: {}'.format(model_path))
    model_name = os.path.basename(os.path.splitext(model_path)[0])
    # initialize video writer
    video_filename = 'output_{}_{}.avi'.format(args.tag, model_name)
    dict_screen_shape = {"flappy": (288, 512), "pixelcopter": (48, 48)}
    out = Recorder(video_filename=video_filename, fps=30,
                   width=dict_screen_shape[args.game][0],
                   height=dict_screen_shape[args.game][1])
    score_list = []
    time_list = []
    game = Game(game=args.game)
    # Fixes: removed unused locals `elapsed_Time` and the two dead `action`
    # tensor assignments (only `action_index` is ever used); unused loop
    # variable renamed to `_`.
    for _ in range(10):  # 10 evaluation episodes
        terminal = game.game_over()
        start = time.time()
        score = 0
        image_data = game.get_torch_image()
        # Initial state: the first frame repeated 4 times.
        state = torch.cat(
            (image_data, image_data, image_data, image_data)).unsqueeze(0)
        while not terminal:
            output = model(state)[0]
            action_index = torch.argmax(output)
            score += game.act(action_index)
            terminal = game.game_over()
            image_data_1 = game.get_torch_image()
            # Slide the 4-frame window: drop the oldest frame, append the newest.
            state = torch.cat(
                (state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)
            out.write(game.get_image())
        game.reset_game()
        score_list.append(score)
        time_list.append(time.time() - start)
        print('Game Ended!')
        print('Score: {} !'.format(score))
    # Add summary
    out.write_score(sum(score_list), sum(time_list))
    out.save()
    print('Total Score: {}'.format(sum(score_list)))
    print('Total Run Time: {:.3f}'.format(sum(time_list)))
    print('Saved video: {}'.format(video_filename))
def train():
    """Train a Classifier for constants.EPOCHS and plot the recorded curves."""
    # Build the pieces one at a time rather than in a tuple assignment.
    model = mdl.Classifier()
    recorder = Recorder()
    optimizer = torch.optim.Adam(
        model.parameters(),
        weight_decay=constants.WEIGHT_DECAY,
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        constants.EPOCHS,
    )
    # The Trainer drives the whole fit/save cycle.
    trainer = Trainer(model, optimizer, scheduler, recorder)
    trainer.fit(constants.EPOCHS)
    trainer.save_model()
    recorder.plot()
def main():
    """Record audio sessions in a loop until the user types 'stop', then upload.

    Relies on module globals `ssn` and `folder`; `ssn` is presumably populated
    by ssnreceived() -- confirm against that helper.
    """
    global ssn, folder
    runbool = True
    while runbool:
        mkdir()
        ssn = None
        print("Program started")
        # Busy-wait until a session id is available.
        while not ssnreceived():
            print('waiting for ssn')
        rec = Recorder(ssn, folder)
        rec.run()
        # input() blocks and always returns a str, so this loop runs exactly
        # once: any entry stops the recording.
        stop = None
        while stop is None:
            stop = input('Enter something to stop the recording')
        print(' ')
        # Fix: input() already returns str -- the redundant str() wrapper was dropped.
        if stop.strip() == 'stop':
            runbool = False
        time.sleep(0.2)
        print(stop)
        # Fix: `rec` is always bound at this point, so the dead
        # `if rec is not None` guard was removed.
        rec.stop()
        filename = ssn
        # Only register the session in the database if the WAV actually exists.
        if os.path.isfile(folder + filename + '.wav'):
            db = Database(folder, ssn, filename)
            db.upload()
            db.adduser()
            db.addfile()
            db.addfiletouser()
        else:
            print('Not added to database')
def train(args):
    """Train UNetVgg16: one train + one validation pass per epoch, with
    metric logging and best-model checkpointing."""
    Arguments.save_args(args, args.args_path)
    train_loader, val_loader, _ = get_dataloaders(args)
    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    optimizer = get_optimizer(args.optimizer, model)
    lr_scheduler = LRScheduler(args.lr_scheduler, optimizer)
    loss_fn = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    saver = ModelSaver(args.model_path)
    history = Recorder(['train_miou', 'train_acc', 'train_loss',
                        'val_miou', 'val_acc', 'val_loss'])
    for epoch in range(args.n_epochs):
        print(f"{args.experim_name} Epoch {epoch+1}:")
        # One pass over the training set.
        train_loss, train_acc, train_miou, train_ious = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            criterion=loss_fn,
            device=args.device,
        )
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")
        # One pass over the validation set.
        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=loss_fn,
            device=args.device,
        )
        val_miou = val_scores['mIoU']
        val_ious = val_scores['IoUs']
        val_acc = val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")
        history.update([train_miou, train_acc, train_loss,
                        val_miou, val_acc, val_loss])
        history.save(args.record_path)
        # Selection metric: either one class IoU (e.g. "IoU_3") or overall mIoU.
        metric = (val_ious[int(args.metric.split('_')[1])]
                  if args.metric.startswith("IoU") else val_miou)
        saver.save_models(metric, epoch+1, model,
                          ious={'train': train_ious, 'val': val_ious})
    print(f"best model at epoch {saver.best_epoch} with miou {saver.best_score:.5f}")
def run(rank, args):
    """Per-process training entry point (single-GPU or DDP rank).

    Seeds all RNGs, builds the dataset/dataloader pair (token-id or raw
    variants), constructs the Refactor model, then runs gradient-accumulated
    training with an inverse-sqrt warmup LR schedule.  Only rank 0
    ("is_master") owns the Recorder and writes checkpoints/logs.
    """
    base_setting(args)
    # Seed every RNG so all ranks shuffle/initialize identically.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    gpuid = args.gpuid[rank]
    is_master = rank == 0
    is_mp = len(args.gpuid) > 1  # multi-process (DDP) mode
    world_size = len(args.gpuid)
    if is_master:
        # Only the master rank logs and saves; `recorder` is undefined on
        # other ranks, so every use below is guarded by is_master.
        recorder = Recorder(args.log)
    tok = BertTokenizer.from_pretrained(args.model_type)
    # Two dataset flavors share the same downstream interface.
    if args.use_ids:
        collate_fn = partial(collate_mp_ids, pad_token_id=tok.pad_token_id, is_test=False)
        collate_fn_val = partial(collate_mp_ids, pad_token_id=tok.pad_token_id, is_test=True)
        train_set = RefactoringIDsDataset(
            f"./{args.dataset}/{args.datatype}/train", args.model_type,
            maxlen=args.max_len, max_num=args.max_num)
        val_set = RefactoringIDsDataset(
            f"./{args.dataset}/{args.datatype}/val", args.model_type,
            is_test=True, maxlen=512, is_sorted=False)
    else:
        collate_fn = partial(collate_mp, pad_token_id=tok.pad_token_id, is_test=False)
        collate_fn_val = partial(collate_mp, pad_token_id=tok.pad_token_id, is_test=True)
        train_set = RefactoringDataset(
            f"./{args.dataset}/{args.datatype}/train", args.model_type,
            maxlen=args.max_len, maxnum=args.max_num)
        val_set = RefactoringDataset(f"./{args.dataset}/{args.datatype}/val",
                                     args.model_type, is_test=True, maxlen=512,
                                     is_sorted=False, maxnum=args.max_num)
    if is_mp:
        # DDP: the sampler shards the data, so the loader itself must not shuffle.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_set, num_replicas=world_size, rank=rank, shuffle=True)
        dataloader = DataLoader(train_set, batch_size=args.batch_size,
                                shuffle=False, num_workers=4,
                                collate_fn=collate_fn, sampler=train_sampler)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_set, num_replicas=world_size, rank=rank)
        val_dataloader = DataLoader(val_set, batch_size=8, shuffle=False,
                                    num_workers=4, collate_fn=collate_fn_val,
                                    sampler=val_sampler)
    else:
        dataloader = DataLoader(train_set, batch_size=args.batch_size,
                                shuffle=True, num_workers=4,
                                collate_fn=collate_fn)
        val_dataloader = DataLoader(val_set, batch_size=8, shuffle=False,
                                    num_workers=4, collate_fn=collate_fn_val)
    # build models
    model_path = args.pretrained if args.pretrained is not None else args.model_type
    model = Refactor(model_path, num_layers=args.num_layers)
    if args.model_pt is not None:
        # Warm-start from an existing fine-tuned state dict.
        model.load_state_dict(
            torch.load(args.model_pt, map_location=f'cuda:{gpuid}'))
    if args.cuda:
        if len(args.gpuid) == 1:
            model = model.cuda()
        else:
            dist.init_process_group("nccl", rank=rank, world_size=world_size)
            model = nn.parallel.DistributedDataParallel(
                model.to(gpuid), [gpuid], find_unused_parameters=True)
    model.train()
    # LR starts at max_lr/warmup_steps and is re-derived every optimizer step.
    init_lr = args.max_lr / args.warmup_steps
    optimizer = optim.Adam(model.parameters(), lr=init_lr)
    if is_master:
        recorder.write_config(args, [model], __file__)
    minimum_loss = 100  # sentinel; assumes real validation loss is below this
    all_step_cnt = 0    # optimizer steps taken across all epochs
    # start training
    for epoch in range(args.epoch):
        optimizer.zero_grad()
        step_cnt = 0  # micro-batches accumulated toward the next optimizer step
        steps = 0     # optimizer steps taken this epoch
        avg_loss = 0
        for (i, batch) in enumerate(dataloader):
            if args.cuda:
                to_cuda(batch, gpuid)
            step_cnt += 1
            output = model(batch["src_input_ids"], batch["candidate_ids"],
                           batch["tgt_input_ids"])
            similarity, gold_similarity = output['score'], output[
                'summary_score']
            loss = args.scale * RankingLoss(similarity, gold_similarity,
                                            args.margin, args.gold_margin,
                                            args.gold_weight,
                                            no_gold=args.no_gold)
            # Scale down so accumulated gradients match a full-batch update.
            loss = loss / args.accumulate_step
            avg_loss += loss.item()
            loss.backward()
            if step_cnt == args.accumulate_step:
                if args.grad_norm > 0:
                    nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm)
                step_cnt = 0
                steps += 1
                all_step_cnt += 1
                # Inverse-sqrt schedule with linear warmup (Transformer-style).
                lr = args.max_lr * min(
                    all_step_cnt**(-0.5),
                    all_step_cnt * (args.warmup_steps**(-1.5)))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                optimizer.step()
                optimizer.zero_grad()
            # Report only right after an optimizer step (step_cnt == 0).
            if steps % args.report_freq == 0 and step_cnt == 0 and is_master:
                recorder.print("epoch: %d, batch: %d, avg loss: %.6f"
                               % (epoch + 1, steps, avg_loss / args.report_freq))
                recorder.print(f"learning rate: {lr:.6f}")
                recorder.plot("loss", {"loss": avg_loss / args.report_freq},
                              all_step_cnt)
                recorder.print()
                avg_loss = 0
            # Free graph references before a potential validation pass.
            del similarity, gold_similarity, loss
            if all_step_cnt % args.test_freq == 0 and all_step_cnt != 0 and step_cnt == 0:
                loss = test(val_dataloader, model, args, gpuid)
                if loss < minimum_loss and is_master:
                    minimum_loss = loss
                    # Under DDP, save the unwrapped module.
                    if is_mp:
                        recorder.save(model.module, "model.bin")
                    else:
                        recorder.save(model, "model.bin")
                    recorder.save(optimizer, "optimizer.bin")
                    recorder.print("best - epoch: %d, batch: %d"
                                   % (epoch + 1, i / args.accumulate_step + 1))
                if is_master:
                    # Always keep a "current" checkpoint, best or not.
                    if is_mp:
                        recorder.save(model.module, "model_cur.bin")
                    else:
                        recorder.save(model, "model_cur.bin")
                    recorder.save(optimizer, "optimizer_cur.bin")
                    recorder.print("val score: %.6f" % (1 - loss))
cfg['saveto'] = './model_200/' cfg['report_interval'] = args.report print(cfg) train_data = batchify(corpus.train, cfg['batch_size']) val_data = batchify(corpus.valid, cfg['batch_size']) test_data = batchify(corpus.test, cfg['batch_size']) with open(cfg['init'], 'rb') as f: policy = torch.load(f) print(policy) reinforce_model = Reinforce(policy=policy, sigma=cfg['sigma'], gamma=cfg['gamma']) recorder = Recorder(output_path=cfg['output_file']) valid_loss = [] loss = evaluate(val_data, reinforce_model.policy, cfg) print('start from valid loss = ', loss) valid_loss.append(loss) ntokens = cfg['dict_size'] optimizer = optim.Adam(reinforce_model.parameters(), lr=cfg['lr']) start_time = time.time() for epoch in range(cfg['epochs']): total_loss = 0.0 total_LM_loss = 0.0 for i in range(0, train_data.size(0) - 1, cfg['max_len']):
class Trainer():
    """Trains a face model with identity and age heads plus a DAL branch.

    update() alternates phases by batch index: a "canonical maximization"
    phase (set_train_mode(False) -> trainingDAL=True) that steps optDAL on the
    negated correlation term `cc`, and an "RFM optimize" phase that steps optbb
    on the combined loss.  Mixed precision is used throughout (autocast plus
    one GradScaler per optimizer), with optional gradient accumulation.
    """

    def __init__(
        self, model, dataset, ctx=-1, batch_size=128, optimizer='sgd',
        grad_accu=1, lambdas=[0.05, 0.1], print_freq=32, train_head_only=True
    ):
        # NOTE(review): `lambdas` is a mutable default argument; harmless while
        # only read, but should become a None sentinel.  `optimizer` is unused.
        self.model = model
        self.dataset = dataset
        self.batch_size = batch_size
        # Backbone layers eligible for fine-tuning when train_head_only=False.
        self.finetune_layers = (
            # self.model.backbone.repeat_3[-1:],
            self.model.backbone.last_bn,
            self.model.backbone.last_linear,
            self.model.backbone.block8
        )
        # Heads always train at 5e-4; a lower-LR group is appended for the
        # backbone tail when fine-tuning is enabled.
        first_group = [
            {
                "params": chain(
                    self.model.age_classifier.parameters(),
                    self.model.RFM.parameters(),
                    self.model.margin_fc.parameters(),
                ),
                "lr": 5e-4
            }
        ]
        if not train_head_only:
            # first_group[0]["lr"] = 1e-4
            first_group.append(
                {
                    "params": chain(
                        *(x.parameters() for x in self.finetune_layers)
                    ),
                    "lr": 5e-5
                }
            )
        self.optbb = RAdam(first_group)
        self.optDAL = RAdam(self.model.DAL.parameters(), lr=5e-4)
        self.lambdas = lambdas
        self.print_freq = print_freq
        self.id_recorder = Recorder()
        self.age_recorder = Recorder()
        self.trainingDAL = False
        # Negative ctx means CPU; otherwise a specific CUDA device.
        if ctx < 0:
            self.ctx = torch.device('cpu')
        else:
            self.ctx = torch.device(f'cuda:{ctx}')
        # One AMP scaler per optimizer.
        self.scaler1 = GradScaler()
        self.scaler2 = GradScaler()
        self.grad_accu = grad_accu
        self.train_head_only = train_head_only

    def train(self, epochs, start_epoch, save_path=None):
        """Build loaders and LR schedulers, freeze/unfreeze the backbone,
        then run `epochs` epochs of update() (+ validate() when a val set
        exists), saving a checkpoint per epoch when save_path is given."""
        self.train_ds = ImageFolderWithAges(
            self.dataset['pat'], self.dataset['pos'],
            transforms=Compose(
                [
                    HorizontalFlip(p=0.5),
                    OneOf([
                        IAAAdditiveGaussianNoise(),
                        GaussNoise(),
                    ], p=0.25),
                    Resize(200, 200, cv2.INTER_AREA),
                    ToTensor(normalize=dict(
                        mean=[0.5, 0.5, 0.5],
                        std=[0.50196, 0.50196, 0.50196])
                    )
                ]
            ),
            root=self.dataset['train_root'],
        )
        self.train_ld = DataLoader(
            self.train_ds, shuffle=True, batch_size=self.batch_size,
            num_workers=2, drop_last=True, pin_memory=True
        )
        print("# Batches:", len(self.train_ld))
        if self.dataset['val_root'] is not None:
            # Validation uses only deterministic transforms.
            self.val_ds = ImageFolderWithAges(
                self.dataset['pat'], self.dataset['pos'],
                root=self.dataset['val_root'],
                transforms=Compose([
                    Resize(200, 200, cv2.INTER_AREA),
                    ToTensor(normalize=dict(
                        mean=[0.5, 0.5, 0.5],
                        std=[0.50196, 0.50196, 0.50196])
                    )
                ])
            )
            self.val_ld = DataLoader(self.val_ds, shuffle=False,
                                     batch_size=self.batch_size,
                                     pin_memory=True, num_workers=1)
        self.model = self.model.to(self.ctx)
        # Schedulers are stepped per batch: 5% linear warmup, 95% cosine decay.
        total_steps = len(self.train_ld) * epochs
        lr_durations = [
            int(total_steps*0.05),
            int(np.ceil(total_steps*0.95))
        ]
        break_points = [0] + list(np.cumsum(lr_durations))[:-1]
        self.schedulers = [
            MultiStageScheduler(
                [
                    LinearLR(self.optbb, 0.01, lr_durations[0]),
                    CosineAnnealingLR(self.optbb, lr_durations[1], eta_min=1e-6)
                ],
                start_at_epochs=break_points
            ),
            MultiStageScheduler(
                [
                    LinearLR(self.optDAL, 0.01, lr_durations[0]),
                    CosineAnnealingLR(self.optDAL, lr_durations[1], eta_min=1e-6)
                ],
                start_at_epochs=break_points
            )
        ]
        if self.train_head_only:
            # Freeze the whole backbone, including its BN running stats.
            set_trainable(self.model.backbone, False)
            for module in self.model.backbone.modules():
                if isinstance(module, (nn.BatchNorm2d, nn.BatchNorm1d)):
                    module.track_running_stats = False
        else:
            # Freeze everything first, then re-enable only the tail layers.
            set_trainable(self.model.backbone, False)
            # for module in self.model.backbone.modules():
            #     if isinstance(module, (nn.BatchNorm2d, nn.BatchNorm1d)):
            #         module.track_running_stats = False
            for module in self.finetune_layers:
                set_trainable(module, True)
            # for submodule in chain([module], module.modules()):
            #     if isinstance(submodule, (nn.BatchNorm2d, nn.BatchNorm1d)):
            #         submodule.track_running_stats = True
        count_model_parameters(self.model)
        # print(self.optbb.param_groups[-1]["lr"])
        # print(self.optDAL.param_groups[-1]["lr"])
        for epoch in range(epochs):
            print(f'---- epoch {epoch} ----')
            self.update()
            if self.dataset['val_root'] is not None:
                acc = self.validate()
            else:
                acc = -1.  # sentinel accuracy when no validation set exists
            if save_path is not None:
                # Checkpoint name encodes the absolute epoch and val accuracy.
                torch.save(self.model.state_dict(),
                           os.path.join(save_path,
                                        f'{start_epoch+epoch}_{acc:.4f}.pth'))

    def update(self):
        """One training epoch, alternating DAL and backbone/head updates."""
        print(' -- Training --')
        self.model.train()
        self.model.backbone.eval()  # backbone stays in eval mode (frozen BN)
        if not self.train_head_only:
            # Only the fine-tuned tail layers go back to train mode.
            for module in self.finetune_layers:
                module.train()
            # for submodule in chain([module], module.modules()):
            #     if isinstance(submodule, (nn.BatchNorm2d, nn.BatchNorm1d)):
            #         submodule.eval()
        self.id_recorder.reset()
        self.age_recorder.reset()
        for i, (xs, ys, agegrps) in enumerate(self.train_ld):
            # Phase switching by batch index within every 80-batch cycle.
            if i % 80 == 0:
                # canonical maximization procesure
                self.set_train_mode(False)
            elif i % 80 == 28:
                # RFM optimize procesure
                self.set_train_mode(True)
            xs, ys, agegrps = xs.to(self.ctx), ys.to(self.ctx), agegrps.to(self.ctx)
            with autocast():
                # NOTE(review): the forward pass is executed twice and the
                # first result discarded -- looks like an accidental duplicate;
                # confirm before removing.
                self.model(xs, ys, agegrps=agegrps)
                idLoss, id_acc, ageLoss, age_acc, cc = self.model(xs, ys, agegrps=agegrps)
                #print(f' ---\n{idLoss}\n{id_acc}\n{ageLoss}\n{age_acc}\n{cc}')
                total_loss = idLoss + ageLoss*self.lambdas[0] + cc*self.lambdas[1]
                total_loss /= self.grad_accu
            self.id_recorder.gulp(len(agegrps), idLoss.detach().item(),
                                  id_acc.detach().item())
            self.age_recorder.gulp(len(agegrps), ageLoss.detach().item(),
                                   age_acc.detach().item())
            if i % self.print_freq == 0:
                print(
                    f' iter: {i} {i%70} total loss: {total_loss.item():.4f} ({idLoss.item():.4f}, {id_acc.item():.4f}, {ageLoss.item():.4f}, {age_acc.item():.4f}, {cc.item():.8f}) {self.optbb.param_groups[-1]["lr"]:.6f}')
            if self.trainingDAL:
                # DAL phase: maximize the correlation term (hence the -1 factor).
                self.scaler1.scale(-1 * cc*self.lambdas[1]).backward()
                # total_loss.backward()
                # Trainer.flip_grads(self.model.DAL)
                if (i + 1) % self.grad_accu == 0:
                    # self.optDAL.step()
                    self.scaler1.step(self.optDAL)
                    self.scaler1.update()
                    self.optDAL.zero_grad()
            else:
                # Backbone/head phase: minimize the combined loss.
                self.scaler2.scale(total_loss).backward()
                # total_loss.backward()
                # self.optbb.step()
                if (i + 1) % self.grad_accu == 0:
                    self.scaler2.step(self.optbb)
                    self.scaler2.update()
                    self.optbb.zero_grad()
            # Both schedulers step once per batch.
            for scheduler in self.schedulers:
                scheduler.step()
        # show average training meta after epoch
        print(f' {self.id_recorder.excrete().result_as_string()}')
        print(f' {self.age_recorder.excrete().result_as_string()}')

    def validate(self):
        """Evaluate on the validation loader; returns identity accuracy."""
        print(' -- Validating --')
        self.model.eval()
        self.id_recorder.reset()
        self.age_recorder.reset()
        for i, (xs, ys, agegrps) in enumerate(self.val_ld):
            xs, ys, agegrps = xs.to(self.ctx), ys.to(self.ctx), agegrps.to(self.ctx)
            with torch.no_grad():
                with autocast():
                    idLoss, id_acc, ageLoss, age_acc, cc = self.model(xs, ys, agegrps)
                    # total_loss = idLoss + ageLoss*self.lambdas[0] + cc*self.lambdas[1]
            self.id_recorder.gulp(len(agegrps), idLoss.item(), id_acc.item())
            self.age_recorder.gulp(len(agegrps), ageLoss.item(), age_acc.item())
        # show average validation meta after epoch
        print(f' {self.id_recorder.excrete().result_as_string()}')
        print(f' {self.age_recorder.excrete().result_as_string()}')
        return self.id_recorder.acc

    def set_train_mode(self, state):
        """Toggle which branch is being optimized; trainingDAL is the inverse
        of `state` (state=False selects the DAL phase)."""
        self.trainingDAL = not state
        # Trainer.set_grads(self.model.RFM, state)
        # # Trainer.set_grads(self.model.backbone, state)
        # Trainer.set_grads(self.model.margin_fc, state)
        # Trainer.set_grads(self.model.age_classifier, state)
        # Trainer.set_grads(self.model.DAL, not state)

    @staticmethod
    def set_grads(mod, state):
        # Enable/disable gradient tracking for every parameter of `mod`.
        for para in mod.parameters():
            para.requires_grad = state

    @staticmethod
    def flip_grads(mod):
        # Negate accumulated gradients in place (gradient-ascent helper).
        for para in mod.parameters():
            if para.requires_grad:
                para.grad = - para.grad
class Processor():
    """End-to-end train/eval driver configured entirely by a parsed `arg`
    namespace: builds device, logger (Recorder), model+optimizer (with
    checkpoint resume), and the train/valid/test dataloaders."""

    def __init__(self, arg):
        # Construction is order-sensitive: save_arg() persists the config,
        # then Loading() builds the model/optimizer and also loads the data.
        self.arg = arg
        self.save_arg()
        if self.arg.random_fix:
            # Seeded RNG so checkpoint resumes can restore RNG state.
            self.rng = RandomState(seed=self.arg.random_seed)
        self.device = GpuDataParallel()
        self.recoder = Recorder(self.arg.work_dir, self.arg.print_log)
        self.data_loader = {}
        self.topk = (1, 5)  # top-k accuracies reported during evaluation
        self.stat = Stat(self.arg.model_args['num_classes'], self.topk)
        self.model, self.optimizer = self.Loading()
        self.loss = self.criterion()

    def criterion(self):
        """Per-sample cross-entropy (reduction='none'), moved to the device."""
        loss = nn.CrossEntropyLoss(reduction="none")
        return self.device.criterion_to_device(loss)

    def train(self, epoch):
        """One training epoch with timing breakdown and periodic logging."""
        self.model.train()
        self.recoder.print_log('Training epoch: {}'.format(epoch + 1))
        loader = self.data_loader['train']
        loss_value = []
        self.recoder.timer_reset()
        # Snapshot LRs once per epoch (only used for logging below).
        current_learning_rate = [
            group['lr'] for group in self.optimizer.optimizer.param_groups
        ]
        for batch_idx, data in enumerate(loader):
            self.recoder.record_timer("dataloader")
            image = self.device.data_to_device(data[0])
            label = self.device.data_to_device(data[1])
            self.recoder.record_timer("device")
            output = self.model(image)
            self.recoder.record_timer("forward")
            # criterion() is per-sample; average it for the update.
            loss = torch.mean(self.loss(output, label))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.recoder.record_timer("backward")
            loss_value.append(loss.item())
            if batch_idx % self.arg.log_interval == 0:
                # self.viz.append_loss(epoch * len(loader) + batch_idx, loss.item())
                self.recoder.print_log(
                    '\tEpoch: {}, Batch({}/{}) done. Loss: {:.8f} lr:{:.6f}'.
                    format(epoch, batch_idx, len(loader), loss.item(),
                           current_learning_rate[0]))
                self.recoder.print_time_statistics()
        # LR schedule advances once per epoch.
        self.optimizer.scheduler.step()
        self.recoder.print_log('\tMean training loss: {:.10f}.'.format(
            np.mean(loss_value)))

    def eval(self, loader_name):
        """Evaluate on the named loaders; accumulates top-k stats in self.stat."""
        self.model.eval()
        for l_name in loader_name:
            loader = self.data_loader[l_name]
            loss_mean = []
            for batch_idx, data in enumerate(loader):
                image = self.device.data_to_device(data[0])
                label = self.device.data_to_device(data[1])
                # Cal = CalculateParasAndFLOPs()
                # Cal.reset()
                # Cal.calculate_all(self.model, image)
                with torch.no_grad():
                    output = self.model(image)
                # loss = torch.mean(self.loss(output, label))
                # Keep per-sample losses so the mean is sample-weighted.
                loss_mean += self.loss(output,
                                       label).cpu().detach().numpy().tolist()
                self.stat.update_accuracy(output.data.cpu(), label.cpu(),
                                          topk=self.topk)
            self.recoder.print_log('mean loss: ' + str(np.mean(loss_mean)))

    def Loading(self):
        """Build model (+optimizer), optionally restoring weights.

        Two restore paths: a plain state_dict (with ignore_weights stripped),
        or -- when that raises RuntimeError -- a full training checkpoint that
        also restores RNG state, optimizer, scheduler and start_epoch.
        """
        self.device.set_device(self.arg.device)
        print("Loading model")
        if self.arg.model:
            model_class = import_class(self.arg.model)
            model = self.device.model_to_device(
                model_class(**self.arg.model_args))
            if self.arg.weights:
                try:
                    print("Loading pretrained model...")
                    state_dict = torch.load(self.arg.weights)
                    for w in self.arg.ignore_weights:
                        if state_dict.pop(w, None) is not None:
                            print('Sucessfully Remove Weights: {}.'.format(w))
                        else:
                            print('Can Not Remove Weights: {}.'.format(w))
                    model.load_state_dict(state_dict, strict=True)
                    optimizer = Optimizer(model, self.arg.optimizer_args)
                except RuntimeError:
                    # Fall back to a full checkpoint (dict with epoch/rng/etc.).
                    print("Loading from checkpoint...")
                    state_dict = torch.load(self.arg.weights)
                    self.rng.set_rng_state(state_dict['rng_state'])
                    self.arg.optimizer_args[
                        'start_epoch'] = state_dict["epoch"] + 1
                    self.recoder.print_log(
                        "Resuming from checkpoint: epoch {}".format(
                            self.arg.optimizer_args['start_epoch']))
                    model = self.device.load_weights(model, self.arg.weights,
                                                     self.arg.ignore_weights)
                    optimizer = Optimizer(model, self.arg.optimizer_args)
                    optimizer.optimizer.load_state_dict(
                        state_dict["optimizer_state_dict"])
                    optimizer.scheduler.load_state_dict(
                        state_dict["scheduler_state_dict"])
            else:
                optimizer = Optimizer(model, self.arg.optimizer_args)
        else:
            raise ValueError("No Models.")
        print("Loading model finished.")
        self.load_data()
        return model, optimizer

    def load_data(self):
        """Build the train/valid/test DataLoaders configured in self.arg."""
        print("Loading data")
        Feeder = import_class(self.arg.dataloader)
        self.data_loader = dict()
        if self.arg.train_loader_args != {}:
            self.data_loader['train'] = torch.utils.data.DataLoader(
                dataset=Feeder(**self.arg.train_loader_args),
                batch_size=self.arg.batch_size,
                shuffle=True,
                drop_last=True,
                num_workers=self.arg.num_worker,
            )
        if self.arg.valid_loader_args != {}:
            self.data_loader['valid'] = torch.utils.data.DataLoader(
                dataset=Feeder(**self.arg.valid_loader_args),
                batch_size=self.arg.test_batch_size,
                shuffle=False,
                drop_last=False,
                num_workers=self.arg.num_worker,
            )
        if self.arg.test_loader_args != {}:
            test_dataset = Feeder(**self.arg.test_loader_args)
            # Stat needs the test-set size to turn counts into percentages.
            self.stat.test_size = len(test_dataset)
            self.data_loader['test'] = torch.utils.data.DataLoader(
                dataset=test_dataset,
                batch_size=self.arg.test_batch_size,
                shuffle=False,
                drop_last=False,
                num_workers=self.arg.num_worker,
            )
        print("Loading data finished.")

    def start(self):
        """Run the configured phase: full training loop or evaluation only."""
        if self.arg.phase == 'train':
            self.recoder.print_log('Parameters:\n{}\n'.format(
                str(vars(self.arg))))
            for epoch in range(self.arg.optimizer_args['start_epoch'],
                               self.arg.num_epoch):
                # Save/eval on their own intervals, and always on the last epoch.
                save_model = ((epoch + 1) % self.arg.save_interval == 0) or \
                    (epoch + 1 == self.arg.num_epoch)
                eval_model = ((epoch + 1) % self.arg.eval_interval == 0) or \
                    (epoch + 1 == self.arg.num_epoch)
                self.train(epoch)
                if save_model:
                    model_path = '{}/epoch{}_model.pt'.format(
                        self.arg.work_dir, epoch + 1)
                    self.save_model(epoch, self.model, self.optimizer,
                                    model_path)
                if eval_model:
                    if self.arg.valid_loader_args != {}:
                        self.stat.reset_statistic()
                        self.eval(loader_name=['valid'])
                        self.print_inf_log(epoch + 1, "Valid")
                    if self.arg.test_loader_args != {}:
                        self.stat.reset_statistic()
                        self.eval(loader_name=['test'])
                        self.print_inf_log(epoch + 1, "Test")
        elif self.arg.phase == 'test':
            if self.arg.weights is None:
                raise ValueError('Please appoint --weights.')
            self.recoder.print_log('Model: {}.'.format(self.arg.model))
            self.recoder.print_log('Weights: {}.'.format(self.arg.weights))
            if self.arg.valid_loader_args != {}:
                self.stat.reset_statistic()
                self.eval(loader_name=['valid'])
                self.print_inf_log(self.arg.optimizer_args['start_epoch'],
                                   "Valid")
            if self.arg.test_loader_args != {}:
                self.stat.reset_statistic()
                self.eval(loader_name=['test'])
                self.print_inf_log(self.arg.optimizer_args['start_epoch'],
                                   "Test")
            self.recoder.print_log('Evaluation Done.\n')

    def print_inf_log(self, epoch, mode):
        """Log prec@1/prec@5 (percent) and dump the confusion matrix."""
        static = self.stat.show_accuracy('{}/{}_confusion_mat'.format(
            self.arg.work_dir, mode))
        prec1 = static[str(self.topk[0])] / self.stat.test_size * 100
        prec5 = static[str(self.topk[1])] / self.stat.test_size * 100
        self.recoder.print_log(
            "Epoch {}, {}, Evaluation: prec1 {:.4f}, prec5 {:.4f}".format(
                epoch, mode, prec1, prec5),
            '{}/{}.txt'.format(self.arg.work_dir, self.arg.phase))

    def save_model(self, epoch, model, optimizer, save_path):
        """Write a full resume checkpoint (model/optimizer/scheduler/RNG)."""
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.optimizer.state_dict(),
                'scheduler_state_dict': optimizer.scheduler.state_dict(),
                'rng_state': self.rng.save_rng_state()
            }, save_path)

    def save_arg(self):
        """Dump the run configuration to <work_dir>/config.yaml."""
        arg_dict = vars(self.arg)
        if not os.path.exists(self.arg.work_dir):
            os.makedirs(self.arg.work_dir)
        with open('{}/config.yaml'.format(self.arg.work_dir), 'w') as f:
            yaml.dump(arg_dict, f)
def main():
    """Train a ResNet-50 re-ID classifier on the Market-style dataset, with
    optional checkpoint resume, then evaluate gallery vs. probe."""
    opts = BaseOptions()
    args = opts.parse()
    logger = Logger(args.save_path)
    opts.print_options(logger)
    # NOTE(review): canonical ImageNet mean is [0.485, 0.456, 0.406]; the 2nd
    # and 3rd entries here look swapped -- confirm against how the pretrained
    # checkpoint was produced before changing.
    mean = np.array([0.485, 0.406, 0.456])
    std = np.array([0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop((224, 224), padding=7),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    test_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mean, std)])
    train_data = Market('data/{}.mat'.format(args.dataset),
                        state='train', transform=train_transform)
    gallery_data = Market('data/{}.mat'.format(args.dataset),
                          state='gallery', transform=test_transform)
    probe_data = Market('data/{}.mat'.format(args.dataset),
                        state='probe', transform=test_transform)
    num_classes = train_data.return_num_class()
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True, num_workers=2,
                                               pin_memory=True, drop_last=True)
    gallery_loader = torch.utils.data.DataLoader(gallery_data,
                                                 batch_size=args.batch_size,
                                                 shuffle=False, num_workers=2,
                                                 pin_memory=True)
    probe_loader = torch.utils.data.DataLoader(probe_data,
                                               batch_size=args.batch_size,
                                               shuffle=False, num_workers=2,
                                               pin_memory=True)
    net = resnet.resnet50(pretrained=False, num_classes=num_classes).cuda()
    # Load the pretrained features but keep the freshly-initialized fc head.
    checkpoint = torch.load(args.pretrain_path)
    fixed_layers = ('fc', )
    state_dict = reset_state_dict(checkpoint, net, *fixed_layers)
    net.load_state_dict(state_dict)
    logger.print_log('loaded pre-trained feature net')
    criterion_CE = nn.CrossEntropyLoss().cuda()
    # BN parameters are exempt from weight decay.
    bn_params, conv_params = partition_params(net, 'bn')
    optimizer = torch.optim.SGD([{
        'params': bn_params,
        'weight_decay': 0
    }, {
        'params': conv_params
    }], lr=args.lr, momentum=0.9, weight_decay=args.wd)
    train_stats = ('acc', 'loss')
    val_stats = ('acc', )
    recorder = Recorder(args.epochs, val_stats[0], train_stats, val_stats)
    logger.print_log(
        'observing training stats: {} \nvalidation stats: {}'.format(
            train_stats, val_stats))
    start_epoch = 0
    if args.resume:
        if os.path.isfile(args.resume):
            # Full resume: recorder, epoch counter, weights and optimizer state.
            logger.print_log("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            recorder = checkpoint['recorder']
            start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.print_log("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            logger.print_log("=> no checkpoint found at '{}'".format(
                args.resume))
    # Main loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, args.epochs):
        # ETA estimate from the running average epoch duration.
        need_hour, need_mins, need_secs = convert_secs2time(
            epoch_time.avg * (args.epochs - epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(
            need_hour, need_mins, need_secs)
        logger.print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s}'.format(
            time_string(), epoch, args.epochs, need_time))
        lr, _ = adjust_learning_rate(optimizer, (args.lr, args.lr), epoch,
                                     args.epochs, args.lr_strategy)
        print(" lr:{}".format(lr))
        train(train_loader, net, criterion_CE, optimizer, epoch, recorder,
              logger, args)
        # Checkpoint after every epoch (is_best is always False here).
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': net.state_dict(),
                'recorder': recorder,
                'optimizer': optimizer.state_dict(),
            }, False, args.save_path, 'checkpoint.pth.tar')
        recorder.plot_curve(os.path.join(args.save_path, 'curve.png'))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
    evaluate(gallery_loader, probe_loader, net, args.epochs - 1, recorder,
             logger)
def main(config, resume):
    """Build the GAN model, data loader and both optimizers from `config`,
    then hand everything to Trainer and start training."""
    set_seed(config['seed'])
    recorder = Recorder()
    # setup data_loader instances
    dl_cfg = config['dataloader']
    # Dataset class is resolved by name from the data module.
    train_data = getattr(module_data, dl_cfg['type'])(
        data_path=dl_cfg['args']['train_data'],
        data_quota=dl_cfg['args']['data_quota'])
    logging.info('using %d examples to train. ' % len(train_data))
    loader = DataLoader(
        dataset=train_data,
        batch_size=dl_cfg['args']['batch_size'])
    # val_data = getattr(module_data, config['dataloader']['type'])(
    #     data_path = config['dataloader']['args']['val_data'],
    #     data_quota = config['dataloader']['args']['data_quota']
    # )
    # logging.info('using %d examples to val. ' % len(val_data))
    # valid_data_loader = DataLoader(dataset = val_data,
    #     batch_size = config['data_loader']['batch_size'])
    # build model architecture (class resolved by name from the models module)
    model = getattr(models, config['model']['type'])(config['model']['args'],
                                                     device=config['device'])
    logging.info(['my PID is: ', os.getpid()])
    # get function handles of loss and metrics
    criterion = getattr(module_loss, config['loss'])()

    def _build_optimizer(params, opt_cfg):
        # Optimizer class resolved by name from torch.optim; only parameters
        # with requires_grad are handed over.
        trainable = filter(lambda p: p.requires_grad, params)
        return getattr(torch.optim, opt_cfg['type'])(trainable,
                                                     **opt_cfg['args'])

    # Separate optimizers for generator and discriminator.  No lr_scheduler
    # is configured (scheduling disabled).
    g_optimizer = _build_optimizer(model.G.parameters(),
                                   config['optimizer']['generator'])
    d_optimizer = _build_optimizer(model.D.parameters(),
                                   config['optimizer']['discriminator'])
    trainer = Trainer(model, criterion, g_optimizer, d_optimizer,
                      resume=resume,
                      config=config,
                      data_loader=loader,
                      valid_data_loader=None,
                      metrics=None,
                      lr_scheduler=None,
                      train_recorder=recorder)
    logging.info('begin training. ')
    trainer.train()
model = getattr(model_def, args.arch)() model.cuda() train_loader, test_loader = data_loader(batch_size=args.batch_size, n_workers=args.workers, dataset=args.dataset) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, nesterov=True) prev_state = None if args.resume: prev_state = torch.load('references/{}_checkpoint.th'.format(args.arch)) epoch_time = AverageMeter() rec = Recorder() all_start_time = time.time() start_epoch = 0 if prev_state: print() model.load_state_dict(prev_state['model_state']) optimizer.load_state_dict(prev_state['optimizer_state']) epoch_time = prev_state['epoch_time'] rec = prev_state['records'] all_start_time -= prev_state['training_time'] print('Overriding provided arg with prev_state args: ', prev_state['args']) args = prev_state['args'] start_epoch = prev_state['epoch'] scheduler = None if args.scheduler == 'exponential':
def test(self, data_fetcher, num_samples, if_baseline=False, if_return_each=False, img_save_folder=None, if_train=True):
    """Run validation/testing over `data_fetcher` and build a report string.

    val (in training): idx_out=0/1/2/3/4 parsed from the file-name suffix.
    test: idx_out=-2 (IQAM judges); also records time without the IQA module.

    Args:
        data_fetcher: provides .reset()/.next(); yields dicts with 'lq',
            'name' and (when metrics are computed) 'gt'. Batch size must be 1.
        num_samples: total sample count, used only for the progress bar.
        if_baseline: evaluate the degraded input itself (dst vs. src).
        if_return_each: also append each per-image metric line to the report.
        img_save_folder: if given (a Path), save each output as '<name>.png'.
        if_train: True for validation during training, False for testing.

    Returns:
        msg (testing), or (msg, write_dict_lst, report_dict) when if_train.
    """
    if if_baseline or if_train:
        assert self.crit_lst is not None, 'NO METRICS!'

    if self.crit_lst is not None:
        if_tar_only = False
        msg = 'dst vs. src | ' if if_baseline else 'tar vs. src | '
    else:
        if_tar_only = True  # no metrics configured: only produce outputs
        msg = 'only get dst | '

    report_dict = None

    recorder_dict = dict()
    # BUGFIX: guard against crit_lst being None (the if_tar_only case);
    # iterating None raised TypeError before any sample was processed.
    for crit_name in (self.crit_lst or {}):
        recorder_dict[crit_name] = Recorder()

    write_dict_lst = []
    timer = CUDATimer()

    # validation baseline: no iqa, no parse name
    # validation, not baseline: no iqa, parse name
    # test baseline: no iqa, no parse name
    # test, no baseline: iqa, no parse name
    if_iqa = True if (not if_train) and (not if_baseline) else False
    if if_iqa:
        timer_wo_iqam = Recorder()
        idx_out = -2  # testing; judge by IQAM

    if_parse_name = True if if_train and (not if_baseline) else False

    self.set_eval_mode()
    data_fetcher.reset()
    test_data = data_fetcher.next()
    assert len(test_data['name']) == 1, 'ONLY SUPPORT bs==1!'

    pbar = tqdm(total=num_samples, ncols=100)
    while test_data is not None:
        im_lq = test_data['lq'].cuda(non_blocking=True)  # assume bs=1
        im_name = test_data['name'][0]  # assume bs=1

        if if_parse_name:
            # Map the quality suffix (JPEG qf / HEVC qp) to a subnet index.
            im_type = im_name.split('_')[-1].split('.')[0]
            if im_type in ['qf50', 'qp22']:
                idx_out = 0
            elif im_type in ['qf40', 'qp27']:
                idx_out = 1
            elif im_type in ['qf30', 'qp32']:
                idx_out = 2
            elif im_type in ['qf20', 'qp37']:
                idx_out = 3
            elif im_type in ['qf10', 'qp42']:
                idx_out = 4
            else:
                raise Exception(f"im_type IS {im_type}, NO MATCHING TYPE!")

        timer.start_record()
        if if_tar_only:
            if if_iqa:
                # BUGFIX: the net returns (time_wo_iqa, im_out) here; the old
                # code called .clamp_() on that tuple before unpacking, which
                # raised. Unpack first, then clamp — mirroring the metric
                # branch below.
                time_wo_iqa, im_out = self.model.net[self.model.infer_subnet](inp_t=im_lq, idx_out=idx_out)
                im_out = im_out.clamp_(0., 1.)
            else:
                im_out = self.model.net[self.model.infer_subnet](inp_t=im_lq, idx_out=idx_out).clamp_(0., 1.)
            timer.record_inter()
        else:
            im_gt = test_data['gt'].cuda(non_blocking=True)  # assume bs=1
            if if_baseline:
                im_out = im_lq
            else:
                if if_iqa:
                    time_wo_iqa, im_out = self.model.net[self.model.infer_subnet](inp_t=im_lq, idx_out=idx_out)
                    im_out = im_out.clamp_(0., 1.)
                else:
                    im_out = self.model.net[self.model.infer_subnet](inp_t=im_lq, idx_out=idx_out).clamp_(0., 1.)
            timer.record_inter()

            # Evaluate every criterion; squeeze away the bs==1 batch dim.
            _msg = f'{im_name} | '
            for crit_name in self.crit_lst:
                crit_fn = self.crit_lst[crit_name]['fn']
                crit_unit = self.crit_lst[crit_name]['unit']
                perfm = crit_fn(torch.squeeze(im_out, 0), torch.squeeze(im_gt, 0))
                recorder_dict[crit_name].record(perfm)
                _msg += f'[{perfm:.3e}] {crit_unit:s} | '
            _msg = _msg[:-3]  # drop the trailing ' | '
            if if_return_each:
                msg += _msg + '\n'
            pbar.set_description(_msg)

        if if_iqa:
            timer_wo_iqam.record(time_wo_iqa)

        if img_save_folder is not None:  # save im
            im = tensor2im(torch.squeeze(im_out, 0))
            save_path = img_save_folder / (str(im_name) + '.png')
            cv2.imwrite(str(save_path), im)

        pbar.update()
        test_data = data_fetcher.next()
    pbar.close()

    if not if_tar_only:
        # Fold per-criterion averages into the report and tensorboard list.
        for crit_name in self.crit_lst:
            crit_unit = self.crit_lst[crit_name]['unit']
            crit_if_focus = self.crit_lst[crit_name]['if_focus']
            ave_perfm = recorder_dict[crit_name].get_ave()
            msg += f'{crit_name} | [{ave_perfm:.3e}] {crit_unit} | '
            write_dict_lst.append(dict(tag=f'{crit_name} (val)', scalar=ave_perfm))
            if crit_if_focus:
                report_dict = dict(ave_perfm=ave_perfm, lsb=self.crit_lst[crit_name]['fn'].lsb)

    ave_fps = 1. / timer.get_ave_inter()
    msg += f'ave. fps | [{ave_fps:.1f}]'

    if if_iqa:
        ave_time_wo_iqam = timer_wo_iqam.get_ave()
        fps_wo_iqam = 1. / ave_time_wo_iqam
        msg += f' | ave. fps wo. IQAM | [{fps_wo_iqam:.1f}]'

    if if_train:
        assert report_dict is not None
        return msg.rstrip(), write_dict_lst, report_dict
    else:
        return msg.rstrip()
def main():
    """Entry point: tune DB knobs with RL (ddpg) and/or a PSO heuristic.

    Returns -1 on a configuration error, 0 after saving a trained model;
    otherwise falls through after writing the best reward to the recorder.
    """
    # fetch arguments
    args = parse_args()
    # initialize logger and the result recorder
    logger = SysLogger(LOGFILE)
    recorder = Recorder(RECORDER_FILE)
    logger.info('starting...')

    rl_knobs = knobs.get_rl_knobs(args.scenario)
    pso_knobs = knobs.get_pso_knobs(args.scenario)
    bm = benchmark.get_benchmark_instance(args.benchmark)
    env = DB_Env(db_info=args.db_info, benchmark=bm, recorder=recorder)

    # Training requires at least one RL-tunable knob.
    if not rl_knobs and args.is_train:
        # (removed stray debug `print(SysLogger)` that dumped the class repr)
        logger.print(
            'current mode is training, so you must set reinforcement learning knobs.',
            fd=SysLogger.stderr)
        return -1

    # reinforcement learning
    if rl_knobs:
        env.set_tuning_knobs(rl_knobs)
        # lazy loading. Because loading tensorflow has to cost too much time.
        from algorithms.rl_agent import RLAgent
        rl = RLAgent(env, agent='ddpg')
        if args.is_train:
            rl.fit(STEPS, nb_max_episode_steps=NB_MAX_EPISODE_STEPS)
            rl.save(args.model_path)
            logger.print('saved model at %s' % args.model_path)
            return 0  # training mode stop here.
        if not args.model_path:
            import sys
            print('have no model path, you can use --model-path argument.',
                  file=sys.stderr, flush=True)
            # sys.exit instead of the interactive-only builtin exit().
            sys.exit(-1)
        rl.load(args.model_path)
        rl.test(TEST_EPISODES, nb_max_episode_steps=NB_MAX_EPISODE_STEPS)
        recorder.write_best_val('reward')

    # heuristic algorithm
    if pso_knobs:
        env.set_tuning_knobs(pso_knobs)

        def heuristic_callback(v):
            # PSO minimizes, so return the negated reward; state and done
            # flags are unused here.
            _, reward, _, _ = env.step(v, False)
            return -reward

        pso = Pso(func=heuristic_callback, dim=len(pso_knobs),
                  particle_nums=3, max_iteration=100,
                  x_min=0, x_max=1, max_vel=0.5)
        pso.update()
        # if you have other approaches, you can code here.
        recorder.write_best_val('reward')

    logger.print('please see result at logfile: %s.' % RECORDER_FILE)
class Trainer():
    """Trainer for a face model with joint identity/age objectives and an
    adversarial DAL branch trained on an alternating 70-iteration schedule."""

    def __init__(self, model, dataset, ctx=-1, batch_size=128, optimizer='sgd', lambdas=[0.1, 0.1], print_freq=32):
        # NOTE(review): `lambdas` is a mutable default argument; it is only
        # read (indexed) below, but a None-sentinel default would be safer.
        # NOTE(review): `optimizer` is unused — both optimizers are
        # hard-coded SGD(lr=0.01, momentum=0.9).
        self.model = model
        self.dataset = dataset
        self.batch_size = batch_size
        # Backbone-side optimizer: every sub-module except the DAL branch.
        self.optbb = optim.SGD(chain(self.model.age_classifier.parameters(),
                                     self.model.RFM.parameters(),
                                     self.model.margin_fc.parameters(),
                                     self.model.backbone.parameters()),
                               lr=0.01, momentum=0.9)
        # Adversary-side optimizer: DAL parameters only.
        self.optDAL = optim.SGD(self.model.DAL.parameters(), lr=0.01, momentum=0.9)
        # lambdas[0] weights the age loss, lambdas[1] the correlation term.
        self.lambdas = lambdas
        self.print_freq = print_freq
        self.id_recorder = Recorder()
        self.age_recorder = Recorder()
        # Toggled by set_train_mode(); selects which optimizer steps.
        self.trainingDAL = False
        if ctx < 0:
            self.ctx = torch.device('cpu')
        else:
            self.ctx = torch.device(f'cuda:{ctx}')

    def train(self, epochs, start_epoch, save_path=None):
        """Run `epochs` epochs of update()+validate().

        `start_epoch` only offsets the checkpoint file names. When
        `save_path` is given, saves `<start_epoch+epoch>_<acc>.state` after
        every epoch; acc is -1 when no validation root is configured.
        """
        self.train_ds = ImageFolderWithAgeGroup(
            self.dataset['pat'], self.dataset['pos'], age_cutoffs,
            self.dataset['train_root'],
            transform=transforms.Compose(
                [transforms.RandomHorizontalFlip(p=0.5),
                 transforms.ToTensor(),
                 transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))]))
        self.train_ld = DataLoader(self.train_ds, shuffle=True, batch_size=self.batch_size)
        if self.dataset['val_root'] is not None:
            # Validation pipeline: same normalization, no augmentation.
            self.val_ds = ImageFolderWithAgeGroup(
                self.dataset['pat'], self.dataset['pos'], age_cutoffs,
                self.dataset['val_root'],
                transform=transforms.Compose(
                    [transforms.ToTensor(),
                     transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))]))
            self.val_ld = DataLoader(self.val_ds, shuffle=True, batch_size=self.batch_size)
        self.model = self.model.to(self.ctx)
        for epoch in range(epochs):
            print(f'---- epoch {epoch} ----')
            self.update()
            if self.dataset['val_root'] is not None:
                acc = self.validate()
            else:
                acc = -1.
            if save_path is not None:
                torch.save(
                    self.model.state_dict(),
                    os.path.join(save_path, f'{start_epoch+epoch}_{acc:.4f}.state'))

    def update(self):
        """One training epoch. Every 70 iterations the phase alternates:
        iters [0,20) of a cycle train the DAL adversary (with flipped
        gradients), iters [20,70) train the backbone side."""
        print(' -- Training --')
        self.model.train()
        self.id_recorder.reset()
        self.age_recorder.reset()
        for i, (xs, ys, agegrps) in enumerate(self.train_ld):
            if i % 70 == 0:
                # canonical maximization procedure: switch to DAL training
                self.set_train_mode(False)
            elif i % 70 == 20:
                # RFM optimization procedure: switch back to the backbone side
                self.set_train_mode(True)
            xs, ys, agegrps = xs.to(self.ctx), ys.to(self.ctx), agegrps.to(
                self.ctx)
            idLoss, id_acc, ageLoss, age_acc, cc = self.model(xs, ys, agegrps)
            # Weighted sum: id loss + w0 * age loss + w1 * correlation.
            total_loss = idLoss + ageLoss * self.lambdas[
                0] + cc * self.lambdas[1]
            self.id_recorder.gulp(len(agegrps), idLoss.item(), id_acc.item())
            self.age_recorder.gulp(len(agegrps), ageLoss.item(), age_acc.item())
            if i % self.print_freq == 0:
                print(
                    f' iter: {i} {i%70} total loss: {total_loss.item():.4f} ({idLoss.item():.4f}, {id_acc.item():.4f}, {ageLoss.item():.4f}, {age_acc.item():.4f}, {cc.item():.8f})'
                )
            if self.trainingDAL:
                self.optDAL.zero_grad()
                total_loss.backward()
                # Negate DAL gradients so the SGD step ascends this loss —
                # presumably the adversarial maximization; confirm intent.
                Trainer.flip_grads(self.model.DAL)
                self.optDAL.step()
            else:
                self.optbb.zero_grad()
                total_loss.backward()
                self.optbb.step()
        # show average training meta after epoch
        print(f' {self.id_recorder.excrete().result_as_string()}')
        print(f' {self.age_recorder.excrete().result_as_string()}')

    def validate(self):
        """One pass over the validation set; returns the id accuracy."""
        print(' -- Validating --')
        self.model.eval()
        self.id_recorder.reset()
        self.age_recorder.reset()
        for i, (xs, ys, agegrps) in enumerate(self.val_ld):
            xs, ys, agegrps = xs.to(self.ctx), ys.to(self.ctx), agegrps.to(
                self.ctx)
            with torch.no_grad():
                idLoss, id_acc, ageLoss, age_acc, cc = self.model(
                    xs, ys, agegrps)
                # NOTE(review): total_loss is computed for parity with
                # update() but never used here.
                total_loss = idLoss + ageLoss * self.lambdas[
                    0] + cc * self.lambdas[1]
            self.id_recorder.gulp(len(agegrps), idLoss.item(), id_acc.item())
            self.age_recorder.gulp(len(agegrps), ageLoss.item(), age_acc.item())
        # show average validation meta after epoch
        print(f' {self.id_recorder.excrete().result_as_string()}')
        print(f' {self.age_recorder.excrete().result_as_string()}')
        return self.id_recorder.acc

    def set_train_mode(self, state):
        # state=True: backbone side trainable, DAL frozen; state=False: reverse.
        self.trainingDAL = not state
        Trainer.set_grads(self.model.RFM, state)
        Trainer.set_grads(self.model.backbone, state)
        Trainer.set_grads(self.model.margin_fc, state)
        Trainer.set_grads(self.model.age_classifier, state)
        Trainer.set_grads(self.model.DAL, not state)

    @staticmethod
    def set_grads(mod, state):
        # Enable/disable gradient tracking for every parameter of `mod`.
        for para in mod.parameters():
            para.requires_grad = state

    @staticmethod
    def flip_grads(mod):
        # Negate existing gradients of trainable parameters.
        for para in mod.parameters():
            if para.requires_grad:
                para.grad = -para.grad
def main(args):
    """Train or evaluate a segmentation model chosen by the config.

    args.mode selects 'train' (fit + checkpoint best-IoU model) or
    'evaluate' (run inference on args.test_folder, saving predictions).
    """
    if args.seed:
        np.random.seed(int(args.seed))
        torch.backends.cudnn.deterministic = True
        # NOTE(review): torch seed is hard-coded to 0 while numpy uses
        # args.seed — confirm whether torch.manual_seed(int(args.seed))
        # was intended.
        torch.manual_seed(0)
    config = get_config(args.dataset, args.version)
    method = config['model']
    criterion = nn.CrossEntropyLoss().cuda()
    # Map configured method name to its network class; exit on unknown name.
    try:
        model = model_mappings[method](K=config['n_class']).cuda()
    except KeyError:
        print('%s model does not exist' % method)
        sys.exit(1)
    model_dir = './saved/%s_%s.pth' % (config['name'], method)
    if args.mode == 'train':
        log_dir = './log/%s_%s.log' % (config['name'], method)
        train_loader, validation_loader = get_dataloader(config)
        # Optimizer choice with hard-coded momentum/weight-decay.
        if config['optimizer'] == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=config['lr'],
                                   weight_decay=5e-4)
        elif config['optimizer'] == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                                  momentum=0.9, weight_decay=5e-4)
        else:
            print('cannot found %s optimizer' % config['optimizer'])
            sys.exit(1)
        # Reduce LR when training loss plateaus for 3 epochs.
        scheduler = ReduceLROnPlateau(optimizer, patience=3)
        recorder = Recorder(('loss_train', 'acc_train', 'loss_val', 'acc_val'))
        iou_val_max = 0
        for epoch in range(1, config['epoch'] + 1):
            print('Epoch %s:' % epoch)
            loss_train, acc_train = train(config, model, criterion, optimizer,
                                          train_loader, method=method)
            loss_val, acc_val, iou_val = evaluate(config, model, criterion,
                                                  validation_loader, method=method)
            scheduler.step(loss_train)
            # update loss and accuracy per epoch
            recorder.update((loss_train, acc_train, loss_val, acc_val))
            # save model with higher iou
            if iou_val > iou_val_max and args.save:
                # The metric record goes to the log path, weights to model_dir.
                torch.save(recorder.record, log_dir)
                torch.save(
                    {
                        'epoch': epoch,
                        'version': args.version,
                        'model_state_dict': model.state_dict(),
                    }, model_dir)
                print(
                    'validation iou improved from %.5f to %.5f. Model Saved.'
                    % (iou_val_max, iou_val))
                iou_val_max = iou_val
    elif args.mode == 'evaluate':
        # Load test data with no augmentation, one image per batch.
        test_dir = '%s/%s' % (config['root'], args.test_folder)
        test_set = Dataset(test_dir, config['size'],
                           *get_transform(config, is_train=False))
        test_loader = DataLoader(test_set, batch_size=1, shuffle=False,
                                 num_workers=0, drop_last=False)
        model.load_state_dict(torch.load(model_dir)['model_state_dict'])
        # save prediction results, make directory if not exists
        save_dir = '%s/predictions/%s_%s' % (test_dir, args.version, method)
        if not os.path.isdir('%s/predictions' % test_dir):
            os.mkdir('%s/predictions' % test_dir)
        if not os.path.isdir(save_dir):
            os.mkdir(save_dir)
        evaluate(config, model, criterion, test_loader, method=method,
                 test_flag=True, save_dir=save_dir)
    else:
        print('%s mode does not exist' % args.mode)
def main(args):
    """Train or evaluate a segmentation network (MatSeg pipeline).

    'train': fit with optional class balancing and LR plateau scheduling,
    checkpoint on best mean IoU, stop early once the LR has decayed to
    1e-3 of its initial value, then plot the run. 'evaluate': run inference
    on args.test_folder and optionally write label overlays.
    """
    # Defines configuration dictionary and network architecture to use.
    config = get_config(args.dataset, args.version)
    method = config['model']
    # Loss function; class-balancing weights come from the balance script,
    # GPU placement depends on availability and the --gpu flag.
    if config['balance'] and args.gpu and torch.cuda.is_available():
        criterion = nn.CrossEntropyLoss(weight=balance(config)).cuda()
    elif config['balance']:
        criterion = nn.CrossEntropyLoss(weight=balance(config))
    elif args.gpu and torch.cuda.is_available():
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = nn.CrossEntropyLoss()
    # Maps configuration method to network class defined in models.py.
    try:
        if args.gpu and torch.cuda.is_available():
            model = model_mappings[method](K=config['n_class']).cuda()
        else:
            model = model_mappings[method](K=config['n_class'])
    except KeyError:
        print('%s model does not exist' % method)
        sys.exit(1)
    if args.mode == 'train':
        # Starts training time, completed at end of the conditional.
        start = time.time()
        # Output locations for weights, metric log and the training plot;
        # these directories must already exist in MatSeg.
        model_dir = './saved/%s_%s.pth' % (config['name'], method)
        log_dir = './log/%s_%s.log' % (config['name'], method)
        plot_dir = './plots/%s_%s.png' % (config['name'], method)
        train_loader, validation_loader = get_dataloader(config)
        # Optimizer choice; momentum/weight-decay are hard-coded.
        if config['optimizer'] == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=config['lr'],
                                   weight_decay=5e-4)
        elif config['optimizer'] == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                                  momentum=0.9, weight_decay=5e-4)
        else:
            print('cannot found %s optimizer' % config['optimizer'])
            sys.exit(1)
        # Reduce LR when training loss stalls for config['patience'] epochs.
        scheduler = ReduceLROnPlateau(optimizer, patience=config['patience'])
        # Recorder entries match the tuple produced per epoch below.
        recorder = Recorder(('loss_train', 'acc_train', 'loss_val', 'acc_val',
                             'mean_iou', 'class_precision', 'class_iou'))
        iou_val_max = 0
        for epoch in range(1, config['epoch'] + 1):
            gc.collect()
            print('Epoch %s:' % epoch)
            loss_train, acc_train = train(config, model, criterion, optimizer,
                                          train_loader, method=method,
                                          gpu=args.gpu)
            loss_val, acc_val, iou_val, class_precision, class_iou = evaluate(
                config, model, criterion, validation_loader, gpu=args.gpu,
                method=method)
            # Update learning rate scheduler based on training loss.
            scheduler.step(loss_train)
            # Update metrics in Recorder object for each epoch.
            recorder.update((loss_train, acc_train, loss_val, acc_val,
                             iou_val, class_precision, class_iou))
            # Save model with higher mean IoU.
            if iou_val > iou_val_max and args.save:
                torch.save(recorder.record, log_dir)
                torch.save(
                    {
                        'epoch': epoch,
                        'version': args.version,
                        'model_state_dict': model.state_dict(),
                    }, model_dir)
                print(
                    'validation iou improved from %.5f to %.5f. Model Saved.'
                    % (iou_val_max, iou_val))
                iou_val_max = iou_val
            # Stop once the LR has been reduced to <= 1e-3 of the original
            # value; the alternative 20-epoch validation-loss criterion is
            # kept below, disabled.
            if (optimizer.param_groups[0]['lr'] / config['lr']) <= 1e-3:
                print('Learning Rate Reduced to 1e-3 of Original Value',
                      'Training Stopped', sep='\n')
                epochs = epoch
                break
            # elif all(recorder['loss_val'][-20:][i] <= recorder['loss_val'][-20:][i+1] for i in range(19)):
            #     print('Loss has not decreased for previous 20 epochs', 'Training Stopped', sep='\n')
            #     epochs = epoch
            #     break
            else:
                epochs = epoch
                continue
        # Total training time; print and plot the recorded run.
        end = time.time()
        time_taken = end - start
        print(recorder.record)
        plotting(recorder.record, config, start, time_taken, plot_dir, epochs)
    elif args.mode == 'evaluate':
        # Test data: no augmentation, verbose metrics, one image per batch.
        test_dir = '%s/%s' % (config['root'], args.test_folder)
        test_set = Dataset(test_dir, config['size'],
                           *get_transform(config, is_train=False))
        test_loader = DataLoader(test_set, batch_size=1, shuffle=False,
                                 num_workers=0, drop_last=False)
        # Load the previously trained network from the saved directory.
        model_dir = './saved/%s_%s.pth' % (config['name'], method)
        model.load_state_dict(torch.load(model_dir)['model_state_dict'])
        # Prediction/overlay output directories, created on demand.
        save_dir = '%s/predictions/%s_%s' % (test_dir, args.version, method)
        overlay_dir = '%s/overlays/%s_%s' % (test_dir, args.version, method)
        labels_dir = os.path.join(test_dir, 'labels_npy')
        if not os.path.isdir('%s/predictions' % test_dir):
            os.mkdir('%s/predictions' % test_dir)
        if not os.path.isdir(save_dir):
            os.mkdir(save_dir)
        evaluate(config, model, criterion, test_loader, gpu=args.gpu,
                 method=method, test_flag=True, save_dir=save_dir)
        # Creates overlays if requested and ground-truth labels exist.
        if os.path.isdir(labels_dir) and args.overlay:
            if not os.path.isdir(overlay_dir):
                os.makedirs(overlay_dir)
            overlay(labels_dir, save_dir, overlay_dir, config['n_class'])
    else:
        print('%s mode does not exist' % args.mode)
def train_eval_save(car_id_list, dest_term, model_id, n_save_viz=0):
    """TRAIN and EVAL for given car and experimental settings.

    Trains (if needed) one model on all cars in `car_id_list`, then per car:
    optionally records train/test mean-distance losses to RECORD_FNAME and,
    when n_save_viz > 0, saves prediction-vs-truth plots under VIZ_DIR.
    """
    # Load datasets for the whole car list (train split ratio 0.8).
    path_trn, meta_trn, dest_trn, dt_trn, full_path_trn, \
        path_tst, meta_tst, dest_tst, dt_tst, full_path_tst = \
        unified_latest_seqdata(car_id_list, proportion_list, dest_term,
                               train_ratio=0.8,
                               seq_len=FLAGS.seq_len,
                               data_dir=DATA_DIR)
    print('trn_data:', path_trn.shape, dest_trn.shape)
    print('tst_data:', path_tst.shape, dest_tst.shape)

    # Define model dir; force training when no checkpoint exists yet.
    model_dir = os.path.join(MODEL_DIR, 'dest_type_%d' % dest_term,
                             'minibatch', model_id)
    model = Model(model_dir)
    FLAGS.train = FLAGS.train or model.latest_checkpoint is None

    # Build graph and initialize all variables.
    model.build_graph()
    model.init_or_restore_all_variables(restart=FLAGS.restart)

    # TRAIN PART
    if FLAGS.train:
        # model.print_all_trainable_variables()
        model.train(path_trn, meta_trn, dest_trn)

    # TEST EVALUATION PART — for each targeted car individually.
    for car_id in car_id_list:
        # Reload data restricted to this single car.
        path_trn, meta_trn, dest_trn, dt_trn, full_path_trn, \
            path_tst, meta_tst, dest_tst, dt_tst, full_path_tst = \
            unified_latest_seqdata([car_id], proportion_list, dest_term,
                                   train_ratio=0.8,
                                   seq_len=FLAGS.seq_len,
                                   data_dir=DATA_DIR)

        # (disabled) per-path distance recording:
        # dist_tst = model.eval_dist(path_tst, meta_tst, dest_tst)
        # recorder = Recorder('PATHWISE_' + RECORD_FNAME)
        # for i in tqdm(range(len(dist_tst))):
        #     recorder.append_values(
        #         ['car{:03}'.format(car_id) if isinstance(car_id, int) else 'car' + car_id,
        #          dt_tst[i], *meta_tst[i], dist_tst[i]])
        #     recorder.next_line()

        if FLAGS.record:
            log.info('save the results to %s', RECORD_FNAME)
            global_step = model.latest_step
            loss_trn = model.eval_mean_distance(path_trn, meta_trn, dest_trn)
            loss_tst = model.eval_mean_distance(path_tst, meta_tst, dest_tst)
            print('car_id:', car_id, 'trn_data:', path_trn.shape,
                  dest_trn.shape, end='--')
            print(loss_trn, loss_tst)
            # SAVE THE RESULT INTO CSV: one row per (car, model).
            recorder = Recorder(RECORD_FNAME)
            recorder.append_values(
                ['car{:03}'.format(car_id) if isinstance(car_id, int) else 'car' + car_id,
                 model_id, len(path_trn), len(path_tst),
                 global_step, loss_trn, loss_tst])
            recorder.next_line()

        if n_save_viz > 0:
            # DEFINE PLOT AND GET PRED POINTS
            pred_tst = model.predict(path_tst, meta_tst)
            myplot = ResultPlot()
            # Background context: all training paths and destinations in gray.
            myplot.add_point(
                path_trn, label=None, color='lightgray', marker='.',
                s=10, alpha=1, must_contain=False)
            myplot.add_point(
                dest_trn, label=None, color='gray', marker='.',
                s=10, alpha=1, must_contain=False)

            # PLOT ALL TEST ERRORS: a segment joining truth and prediction.
            for i in range(pred_tst.shape[0]):
                difference = np.stack([dest_tst[i], pred_tst[i]], axis=0)
                myplot.add_tmp_path(
                    difference, label=None, color='lightblue', marker=None,
                    must_contain=True)
                myplot.add_tmp_point(
                    dest_tst[i], label=None, color='mediumblue', marker='*',
                    s=100, alpha=1, must_contain=True)
                myplot.add_tmp_point(
                    pred_tst[i], label=None, color='crimson', marker='*',
                    s=100, alpha=1, must_contain=True)
            dist_km = dist(dest_tst, pred_tst, to_km=True)

            # Define details to save the aggregate plot.
            save_dir = os.path.join(VIZ_DIR, 'path_and_prediction',
                                    'dest_term_%d' % dest_term,
                                    'car_%03d' % car_id)
            fname = model_id + '.png'
            title = '{fname}\ndist={dist_km}km'
            title = title.format(fname=fname,
                                 dist_km='N/A' if dist_km is None else '%.1f' % dist_km)
            myplot.draw_and_save(title, save_dir, fname)

            # FOR EACH TRIP: one plot per test trip, up to n_save_viz.
            for i in range(n_save_viz):
                myplot.add_tmp_path(
                    full_path_tst[i], label=None, color='lightblue',
                    marker='.', must_contain=True)
                myplot.add_tmp_path(
                    path_tst[i], label='input_path', color='mediumblue',
                    marker='.', must_contain=True)
                dest_true, dest_pred = dest_tst[i], pred_tst[i]
                myplot.add_tmp_point(
                    dest_true, label='true_destination', color='mediumblue',
                    marker='*', s=100, alpha=1, must_contain=True)
                myplot.add_tmp_point(
                    dest_pred, label='pred_destination', color='crimson',
                    marker='*', s=100, alpha=1, must_contain=True)
                start_time = convert_time_for_fname(dt_tst[i])
                dist_km = dist(dest_pred, dest_true, to_km=True)
                # Define details to save the per-trip plot.
                save_dir = os.path.join(VIZ_DIR, 'path_and_prediction',
                                        'dest_term_%d' % dest_term,
                                        'car_%03d' % car_id,
                                        'start_%s' % start_time)
                fname = model_id + '.png'
                title = '{datetime}\n{fname}\ndist={dist_km}km'
                title = title.format(fname=fname, datetime=start_time,
                                     dist_km='N/A' if dist_km is None else '%.1f' % dist_km)
                myplot.draw_and_save(title, save_dir, fname)

    # Close tf session to release GPU memory.
    model.close_session()