def main():
    args = parse_args()
    make_output_dir(args)
    config_for_multi_gpu(args)
    set_seed(args)
    with Timer('load input'):
        train_data_loader, dev_data_loader, test_data_loader = load_data_for_nlu_task(
            args, train=True, dev=True, test=False)
    print(f'train batch size: {args.train_batch_size}')
    print(f'train data batch num: {len(train_data_loader)}')
    # Run dev evaluation twice per epoch:
    args.eval_interval = len(train_data_loader) // 2
    print(f'eval interval: {args.eval_interval}')
    # Note: this parameter affects the learning-rate warm-up
    args.max_train_steps = len(train_data_loader) * args.max_train_epochs
    print(f'max steps: {args.max_train_steps}')
    if not args.early_stop:
        print(f'do not use early stop, training will last {args.max_train_epochs} epochs')
    with Timer('load trainer'):
        trainer = load_trainer(args)
    with Timer('Train'):
        trainer.train(train_data_loader, dev_data_loader)
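# ---------------------------------------------------------------------------
# The scripts in this file all call a set_seed helper whose definition is not
# shown. A minimal sketch, assuming the usual transformers-style signature in
# which `args` carries `seed` and `n_gpu` (hence the rule below that set_seed
# must run after the multi-GPU configuration); other snippets in this file
# use an integer-seed variant instead.
import random

import numpy as np
import torch


def set_seed(args):
    # Seed every RNG the training code touches.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
# ---------------------------------------------------------------------------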
def train(self, train_data_loader, dev_data_loader=None):
    best_result = BestResult()
    self.model.zero_grad()
    set_seed(self.args)
    train_stop = False
    summary_writer = SummaryWriter(log_dir=self.args.summary_dir)
    global_step = 0
    for epoch in range(self.args.max_train_epochs + 1):
        epoch_train_loss = 0
        train_data_loader = tqdm(train_data_loader, desc=f'Training epoch {epoch}')
        for step, batch in enumerate(train_data_loader):
            batch = tuple(t.to(self.args.device) for t in batch)
            self.model.train()
            inputs, y_trues = self._unpack_batch(self.args, batch)
            logits = self.model(inputs)
            loss, _ = self._update_and_predict(logits, y_trues,
                                               calc_loss=True,
                                               update=True,
                                               calc_prediction=False)
            global_step += 1
            if loss is not None:
                epoch_train_loss += loss
            if global_step % self.args.eval_interval == 0:
                summary_writer.add_scalar('loss/train', loss, global_step)
                if dev_data_loader:
                    f1, report = self.dev(dev_data_loader)
                    summary_writer.add_scalar('metrics/f1', f1, global_step)
                    if best_result.is_new_record(f1, global_step, epoch):
                        best_result.best_report = report
                        print(f"\n## NEW BEST RESULT in epoch {epoch} ##")
                        print(best_result)
                    if self.args.early_stop and (epoch - best_result.best_epoch) > self.args.early_stop_epoch:
                        print(f'\n## Early stop in epoch:{epoch} ##')
                        train_stop = True
                        break
        if train_stop:
            break
        summary_writer.add_scalar('epoch_average_loss',
                                  epoch_train_loss / len(train_data_loader),
                                  epoch)
    with open(self.args.dev_result_path, 'w', encoding='utf-8') as f:
        f.write(str(best_result) + '\n')
    print("\n## BEST RESULT in Training ##")
    print(best_result)
    summary_writer.close()
    print('train stop')
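# ---------------------------------------------------------------------------
# Hypothetical sketch of the BestResult tracker the trainer above assumes;
# only the attributes that training loop touches are reproduced. The parser
# trainer further below uses a UAS/LAS variant with a different signature
# (is_new_record(LAS=..., UAS=..., global_step=...) and a best_LAS_step field).
class BestResult:
    def __init__(self):
        self.best_f1 = float('-inf')
        self.best_step = 0
        self.best_epoch = 0
        self.best_report = None

    def is_new_record(self, f1, global_step, epoch):
        # Remember the new best score and report whether it improved.
        if f1 > self.best_f1:
            self.best_f1, self.best_step, self.best_epoch = f1, global_step, epoch
            return True
        return False

    def __str__(self):
        return (f'best f1: {self.best_f1:.4f} '
                f'(step {self.best_step}, epoch {self.best_epoch})')
# ---------------------------------------------------------------------------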
def main():
    with Timer('parse args'):
        args = parse_args()
    # Add configuration for multi-GPU runs.
    # BERT training should run on multiple GPUs; a single GPU is very slow.
    config_for_multi_gpu(args)
    # set_seed must be called after n_gpu has been set.
    set_seed(args)
    if args.run_mode == 'train':
        train(args)
    elif args.run_mode == 'dev':
        dev(args)
    elif args.run_mode == 'inference':
        inference(args)
def __init__(self, args):
    self.args = args
    self.model = None
    self.optimizer = None
    self.scheduler = None
    self.epoch = 0
    # s = State(args)
    # NB: 'cudnn_behavoir' reproduces the config key's (misspelled) name.
    set_seed(self.args.seed, self.args.cudnn_behavoir)
    self.log = Log(self.args.log_path)
    self.writer = Tensorboard(self.args.tensorboard_path)
    self.stati = Statistic(self.args.expernameid, self.args.experid_path,
                           self.args.root_path)
    self.stati.add('hparam', self.args.dict())
    # s.writer.add_hparams(hparam_dict=s.args.dict(), metric_dict={})
    self.record = Record()
def run(self):
    """
    Run process.
    :return: None
    """
    # Set random seed for this process
    set_seed(self.seed)
    # Create eval agent
    eval_agent = copy.deepcopy(self.agent)
    eval_results = []
    while not self.stop_flag.is_set():
        # Wait for evaluation
        self.__wait_for_eval__()
        # Find out current step
        current_step = sum(self.workers_steps)
        # Copy current agent's state to eval agent
        eval_agent.model.load_state_dict(self.agent.model.state_dict())
        # Evaluate agent for given number of episodes
        result = self.__eval__(eval_agent, self.eval_episodes)
        # Store evaluation result
        eval_results.append(result)
        # Log evaluation result
        log_eval_result(current_step, result)
        # If termination condition passed given evaluation result, finish training
        if self.goal and self.goal(result):
            logger.info("")
            logger.info("Termination condition passed")
            logger.info("")
            self.stop_flag.set()
        # If workers reached total number of training steps, finish training
        if self.__workers_finished__():
            logger.info("")
            self.stop_flag.set()
    # Put result to the queue
    self.result_queue.put(RunResult([], eval_results))
def run(self):
    """
    Run process.
    :return: None
    """
    # Set random seed for this process
    set_seed(self.seed)
    # Initialize worker's current step
    self.workers_steps[self.worker.worker_id] = 0
    # Train until stop flag is set or number of training steps is reached
    while (not self.stop_flag.is_set()
           and self.workers_steps[self.worker.worker_id] < self.train_steps):
        # Train worker for batch steps
        self.__train__(self.batch_steps)
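# ---------------------------------------------------------------------------
# Illustrative wiring for the two run() methods above (assumed, not from the
# source): the evaluator and the workers coordinate through a shared stop
# Event, a shared per-worker step array, and a result queue.
import multiprocessing as mp

num_workers = 4  # hypothetical worker count

stop_flag = mp.Event()                      # evaluator sets this to end training
workers_steps = mp.Array('i', num_workers)  # one step counter per worker
result_queue = mp.Queue()                   # evaluator reports RunResult here
# Each worker/eval process would receive these handles at construction time,
# providing the self.stop_flag / self.workers_steps / self.result_queue used above.
# ---------------------------------------------------------------------------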
def main():
    with Timer('parse args'):
        args = parse_args()
    # Add configuration for multi-GPU runs.
    # Setup CUDA, GPU & distributed training
    config_for_multi_gpu(args)
    # set_seed must be called after n_gpu has been set.
    set_seed(args)
    # Create the output directory for run results, config files, and model weights
    if args.run_mode == 'train' and args.local_rank in [-1, 0]:
        make_output_dir(args)
    if args.run_mode == 'train':
        train(args)
    elif args.run_mode == 'dev':
        dev(args)
    elif args.run_mode == 'inference':
        inference(args)
def worker(self, env_fn_serialized, seed, remote, parent_remote):
    # Set random seed for this process
    set_seed(seed)
    # Close the parent's end of the pipe in this process
    parent_remote.close()
    # Create environment
    env = deserialize(env_fn_serialized)()
    while True:
        # Wait for data
        cmd, data = remote.recv()
        if cmd == 'state':
            # Return current state
            remote.send(env.state)
        elif cmd == 'step':
            # Perform action
            reward, next_state, done = env.step(data)
            # Reset environment if done flag is set
            if done:
                env.reset()
            # Return observation
            remote.send((reward, next_state, done))
        elif cmd == 'reset':
            # Reset environment
            state = env.reset()
            remote.send(state)
        elif cmd == 'close':
            # Close pipe
            remote.close()
            break
        else:
            raise NotImplementedError
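# ---------------------------------------------------------------------------
# Hedged sketch of the parent side of the worker's command protocol above,
# assuming `serialize` mirrors the project's `deserialize` helper and that
# `vec_env`, `env_fn`, `seed`, and `action` come from the surrounding code:
import multiprocessing as mp

parent_remote, child_remote = mp.Pipe()
proc = mp.Process(target=vec_env.worker,
                  args=(serialize(env_fn), seed, child_remote, parent_remote))
proc.start()
child_remote.close()                 # the child's copy stays open in the worker

parent_remote.send(('reset', None))  # ask the worker to reset its environment
state = parent_remote.recv()
parent_remote.send(('step', action))
reward, next_state, done = parent_remote.recv()
parent_remote.send(('close', None))  # shut the worker down
proc.join()
# ---------------------------------------------------------------------------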
def setUp(self): set_seed(self.seed)
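# ---------------------------------------------------------------------------
# The fixture above in context: a minimal unittest.TestCase that reseeds
# before every test. Class name and seed value are assumptions, and set_seed
# is assumed to seed Python's `random` module among others:
import random
import unittest


class ReproducibleTest(unittest.TestCase):
    seed = 42

    def setUp(self):
        set_seed(self.seed)

    def test_sampling_is_deterministic(self):
        first = random.random()
        set_seed(self.seed)
        self.assertEqual(first, random.random())
# ---------------------------------------------------------------------------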
def train(self, train_data_loader, dev_data_loader=None, dev_CoNLLU_file=None):
    self.optimizer, self.optim_scheduler = get_optimizer(self.args, self.model)
    global_step = 0
    best_result = BestResult()
    self.model.zero_grad()
    set_seed(self.args)  # Added here for reproducibility (even between python 2 and 3)
    train_stop = False
    summary_writer = SummaryWriter(log_dir=self.args.summary_dir)
    for epoch in range(1, self.args.max_train_epochs + 1):
        epoch_ave_loss = 0
        train_data_loader = tqdm(train_data_loader, desc=f'Training epoch {epoch}')
        # Some models may need custom operations during training; the default is a no-op.
        # See the _custom_train_operations implementations in the subclasses.
        self._custom_train_operations(epoch)
        for step, batch in enumerate(train_data_loader):
            batch = tuple(t.to(self.args.device) for t in batch)
            self.model.train()
            # debug_print(batch)
            # word_mask: word-level mask, 1 for real input, 0 for PAD
            inputs, word_mask, _, dep_ids = self._unpack_batch(self.args, batch)
            # word_pad_mask: word-level mask, 1 for PAD, 0 for real input
            word_pad_mask = torch.eq(word_mask, 0)
            unlabeled_scores, labeled_scores = self.model(inputs)
            labeled_target = dep_ids
            unlabeled_target = labeled_target.ge(1).to(unlabeled_scores.dtype)
            # Calc loss and update:
            loss, _ = self._update_and_predict(
                unlabeled_scores, labeled_scores, unlabeled_target,
                labeled_target, word_pad_mask,
                label_loss_ratio=self.model.label_loss_ratio
                if not self.args.parallel_train
                else self.model.module.label_loss_ratio,
                calc_loss=True, update=True, calc_prediction=False)
            global_step += 1
            if loss is not None:
                epoch_ave_loss += loss
            if global_step % self.args.eval_interval == 0:
                summary_writer.add_scalar('loss/train', loss, global_step)
                # Log learning rates
                for i, param_group in enumerate(self.optimizer.param_groups):
                    summary_writer.add_scalar(f'lr/group_{i}',
                                              param_group['lr'], global_step)
                if dev_data_loader:
                    UAS, LAS = self.dev(dev_data_loader, dev_CoNLLU_file)
                    summary_writer.add_scalar('metrics/uas', UAS, global_step)
                    summary_writer.add_scalar('metrics/las', LAS, global_step)
                    if best_result.is_new_record(LAS=LAS, UAS=UAS,
                                                 global_step=global_step):
                        self.logger.info(f"\n## NEW BEST RESULT in epoch {epoch} ##")
                        self.logger.info('\n' + str(best_result))
                        # Save the best model:
                        if hasattr(self.model, 'module'):
                            # Multi-GPU: model is wrapped in torch.nn.DataParallel
                            self.model.module.save_pretrained(self.args.output_model_dir)
                        else:
                            self.model.save_pretrained(self.args.output_model_dir)
                    if self.args.early_stop and global_step - best_result.best_LAS_step > self.args.early_stop_steps:
                        self.logger.info(f'\n## Early stop in step:{global_step} ##')
                        train_stop = True
                        break
        if train_stop:
            break
        # print(f'\n- Epoch {epoch} average loss : {epoch_ave_loss / len(train_data_loader)}')
        summary_writer.add_scalar('epoch_loss',
                                  epoch_ave_loss / len(train_data_loader), epoch)
    with open(self.args.dev_result_path, 'w', encoding='utf-8') as f:
        f.write(str(best_result) + '\n')
    self.logger.info("\n## BEST RESULT in Training ##")
    self.logger.info('\n' + str(best_result))
    summary_writer.close()
def train(args):
    def _get_dataloader(datasubset, tokenizer, device, args, subset_classes=True):
        """
        Get specific dataloader.

        Args:
            datasubset ([type]): [description]
            tokenizer ([type]): [description]
            device ([type]): [description]
            args ([type]): [description]

        Returns:
            dataloader
        """
        if subset_classes:
            dataloader = StratifiedLoaderwClassesSubset(
                datasubset, k=args['k'],
                max_classes=args['max_classes'],
                max_batch_size=args['max_batch_size'],
                tokenizer=tokenizer, device=device,
                shuffle=True, verbose=False)
        else:
            dataloader = StratifiedLoader(
                datasubset, k=args['k'],
                max_batch_size=args['max_batch_size'],
                tokenizer=tokenizer, device=device,
                shuffle=True, verbose=False)
        return dataloader

    def _adapt_and_fit(support_labels, support_input, query_labels, query_input,
                       loss_fn, model_init, args, mode="train"):
        """
        Adapts the init model to a support set and computes loss on query set.

        Args:
            support_labels ([type]): [description]
            support_input ([type]): [description]
            query_labels ([type]): [description]
            query_input ([type]): [description]
            loss_fn ([type]): [description]
            model_init ([type]): [description]
            args
            mode
        """
        #####################
        # Create model_task #
        #####################
        if (not args['dropout']) and mode == "train":
            for module in model_init.modules():
                if isinstance(module, nn.Dropout):
                    module.eval()
                else:
                    module.train()
        elif mode != "train":
            model_init.eval()
        else:
            model_init.train()

        model_task = deepcopy(model_init)
        for name, param in model_task.encoder.model.named_parameters():
            transformer_layer = re.search(r"(?:encoder\.layer\.)([0-9]+)", name)
            if transformer_layer and (int(transformer_layer.group(1)) > args['inner_nu']):
                param.requires_grad = True
            elif 'pooler' in name:
                param.requires_grad = False
            elif args['inner_nu'] < 0:
                param.requires_grad = True
            else:
                param.requires_grad = False

        model_task_optimizer = optim.SGD(model_task.parameters(), lr=args['inner_lr'])
        model_task.zero_grad()

        #######################
        # Generate prototypes #
        #######################
        labs = torch.sort(torch.unique(support_labels))[0]
        if not args['kill_prototypes']:
            y = model_init(support_input)
            prototypes = torch.stack(
                [torch.mean(y[support_labels == c], dim=0) for c in labs])
            W_init = 2 * prototypes
            b_init = -torch.norm(prototypes, p=2, dim=1)**2
        else:
            W_init = torch.empty(
                (labs.size()[0], model_init.out_dim)).to(model_task.get_device())
            nn.init.kaiming_normal_(W_init)
            b_init = torch.zeros((labs.size()[0])).to(model_task.get_device())

        W_task, b_task = W_init.detach(), b_init.detach()
        W_task.requires_grad, b_task.requires_grad = True, True

        #################
        # Adapt to data #
        #################
        for _ in range(args['n_inner']):
            y = model_task(support_input)
            logits = F.linear(y, W_task, b_task)
            inner_loss = loss_fn(logits, support_labels)

            W_task_grad, b_task_grad = torch.autograd.grad(
                inner_loss, [W_task, b_task], retain_graph=True)
            inner_loss.backward()

            if args['clip_val'] > 0:
                torch.nn.utils.clip_grad_norm_(model_task.parameters(),
                                               args['clip_val'])
            model_task_optimizer.step()

            W_task = W_task - args['output_lr'] * W_task_grad
            b_task = b_task - args['output_lr'] * b_task_grad

            if args['print_inner_loss']:
                print(f"\tInner Loss: {inner_loss.detach().cpu().item()}")

        #########################
        # Validate on query set #
        #########################
        if mode == "train":
            for module in model_task.modules():
                if isinstance(module, nn.Dropout):
                    module.eval()
            W_task = W_init + (W_task - W_init).detach()
            b_task = b_init + (b_task - b_init).detach()

        y = model_task(query_input)
        logits = F.linear(y, W_task, b_task)
        outer_loss = loss_fn(logits, query_labels)

        if mode == "train":
"train": model_task_params = [ param for param in model_task.parameters() if param.requires_grad ] model_task_grads = torch.autograd.grad(outer_loss, model_task_params, retain_graph=True) model_init_params = [ param for param in model_init.parameters() if param.requires_grad ] model_init_grads = torch.autograd.grad(outer_loss, model_init_params, retain_graph=False, allow_unused=True) model_init_grads = model_init_grads + model_task_grads for param, grad in zip(model_init_params, model_init_grads): if param.grad != None and grad != None: param.grad += grad.detach() elif grad != None: param.grad = grad.detach() else: param.grad = None else: del model_task, W_task, b_task, W_task_grad, b_task_grad, W_init, b_init if outer_loss.detach().cpu().item() > 10: print(outer_loss.detach().cpu().item(), inner_loss.detach().cpu().item()) return logits.detach(), outer_loss.detach() ####################### # Logging Directories # ####################### log_dir = os.path.join(args['checkpoint_path'], args['version']) os.makedirs(log_dir, exist_ok=True) os.makedirs(os.path.join(log_dir, 'tensorboard'), exist_ok=True) os.makedirs(os.path.join(log_dir, 'checkpoint'), exist_ok=True) #print(f"Saving models and logs to {log_dir}") checkpoint_save_path = os.path.join(log_dir, 'checkpoint') with open(os.path.join(log_dir, 'checkpoint', 'hparams.pickle'), 'wb') as file: pickle.dump(args, file) ########################## # Device, Logging, Timer # ########################## set_seed(args['seed']) timer = Timer() device = torch.device('cuda' if ( torch.cuda.is_available() and args['gpu']) else 'cpu') # Build the tensorboard writer writer = SummaryWriter(os.path.join(log_dir, 'tensorboard')) ################### # Load in dataset # ################### print("Data Prep") dataset = meta_dataset(include=args['include'], verbose=True) dataset.prep(text_tokenizer=manual_tokenizer) print("") #################### # Init models etc. 
    model_init = SeqTransformer(args)
    tokenizer = AutoTokenizer.from_pretrained(args['encoder_name'])
    tokenizer.add_special_tokens({'additional_special_tokens': specials()})
    model_init.encoder.model.resize_token_embeddings(len(tokenizer.vocab))

    if args['optimizer'] == "Adam":
        meta_optimizer = optim.Adam(model_init.parameters(), lr=args['meta_lr'])
    elif args['optimizer'] == "SGD":
        meta_optimizer = optim.SGD(model_init.parameters(), lr=args['meta_lr'])

    meta_scheduler = get_constant_schedule_with_warmup(meta_optimizer,
                                                       args['warmup_steps'])
    reduceOnPlateau = optim.lr_scheduler.ReduceLROnPlateau(
        meta_optimizer, mode='max', factor=args['lr_reduce_factor'],
        patience=args['patience'], verbose=True)

    model_init = model_init.to(device)
    loss_fn = nn.CrossEntropyLoss()

    #################
    # Training loop #
    #################
    best_overall_acc_s = 0.0
    for episode in range(1, args['max_episodes'] + 1):
        outer_loss_agg, acc_agg, f1_agg = 0.0, 0.0, 0.0
        outer_loss_s_agg, acc_s_agg, f1_s_agg = 0.0, 0.0, 0.0

        for ii in range(1, args['n_outer'] + 1):
            #################
            # Sample a task #
            #################
            task = dataset_sampler(dataset, sampling_method='sqrt')
            datasubset = dataset.datasets[task]['train']
            dataloader = _get_dataloader(datasubset, tokenizer, device, args,
                                         subset_classes=args['subset_classes'])
            support_labels, support_input, query_labels, query_input = next(dataloader)

            logits, outer_loss = _adapt_and_fit(support_labels, support_input,
                                                query_labels, query_input,
                                                loss_fn, model_init, args,
                                                mode="train")

            ######################
            # Inner Loop Logging #
            ######################
            with torch.no_grad():
                mets = logging_metrics(logits.detach().cpu(),
                                       query_labels.detach().cpu())
                outer_loss_ = outer_loss.detach().cpu().item()
                acc = mets['acc']
                f1 = mets['f1']

                outer_loss_s = outer_loss_ / np.log(dataloader.n_classes)
                acc_s = acc / (1 / dataloader.n_classes)
                f1_s = f1 / (1 / dataloader.n_classes)

                outer_loss_agg += outer_loss_ / args['n_outer']
                acc_agg += acc / args['n_outer']
                f1_agg += f1 / args['n_outer']
                outer_loss_s_agg += outer_loss_s / args['n_outer']
                acc_s_agg += acc_s / args['n_outer']
                f1_s_agg += f1_s / args['n_outer']

            print("{:} | Train | Episode {:04}.{:02} | Task {:^20s}, N={:} | "
                  "Loss {:5.2f}, Acc {:5.2f}, F1 {:5.2f} | Mem {:5.2f} GB".format(
                      timer.dt(), episode, ii, task, dataloader.n_classes,
                      outer_loss_s if args['print_scaled'] else outer_loss_,
                      acc_s if args['print_scaled'] else acc,
                      f1_s if args['print_scaled'] else f1,
                      psutil.Process(os.getpid()).memory_info().rss / 1024**3))

            writer.add_scalars('Loss/Train', {task: outer_loss_}, episode)
            writer.add_scalars('Accuracy/Train', {task: acc}, episode)
            writer.add_scalars('F1/Train', {task: f1}, episode)
            writer.add_scalars('LossScaled/Train', {task: outer_loss_s}, episode)
            writer.add_scalars('AccuracyScaled/Train', {task: acc_s}, episode)
            writer.add_scalars('F1Scaled/Train', {task: f1_s}, episode)
            writer.flush()

        ############################
        # Init Model Backward Pass #
        ############################
        model_init_params = [
            param for param in model_init.parameters() if param.requires_grad
        ]
        # for param in model_init_params:
        #     param.grad = param.grad  # / args['n_outer']
        if args['clip_val'] > 0:
            torch.nn.utils.clip_grad_norm_(model_init_params, args['clip_val'])

        meta_optimizer.step()
        meta_scheduler.step()
        if args['warmup_steps'] <= episode + 1:
            meta_optimizer.zero_grad()

        #####################
        # Aggregate Logging #
        #####################
        print("{:} | MACRO-AGG | Train | Episode {:04} | "
              "Loss {:5.2f}, Acc {:5.2f}, F1 {:5.2f}\n".format(
                  timer.dt(),
                  episode,
                  outer_loss_s_agg if args['print_scaled'] else outer_loss_agg,
                  acc_s_agg if args['print_scaled'] else acc_agg,
                  f1_s_agg if args['print_scaled'] else f1_agg))

        writer.add_scalar('Loss/MacroTrain', outer_loss_agg, episode)
        writer.add_scalar('Accuracy/MacroTrain', acc_agg, episode)
        writer.add_scalar('F1/MacroTrain', f1_agg, episode)
        writer.add_scalar('LossScaled/MacroTrain', outer_loss_s_agg, episode)
        writer.add_scalar('AccuracyScaled/MacroTrain', acc_s_agg, episode)
        writer.add_scalar('F1Scaled/MacroTrain', f1_s_agg, episode)
        writer.flush()

        ##############
        # Evaluation #
        ##############
        if (episode % args['eval_every_n']) == 0 or episode == 1:
            overall_loss, overall_acc, overall_f1 = [], [], []
            overall_loss_s, overall_acc_s, overall_f1_s = [], [], []

            ###################
            # Individual Task #
            ###################
            for task in dataset.lens.keys():
                datasubset = dataset.datasets[task]['validation']

                task_loss, task_acc, task_f1 = [], [], []
                task_loss_s, task_acc_s, task_f1_s = [], [], []
                for _ in range(args['n_eval_per_task']):
                    dataloader = _get_dataloader(
                        datasubset, tokenizer, device, args,
                        subset_classes=args['subset_classes'])
                    support_labels, support_input, query_labels, query_input = next(dataloader)

                    logits, loss = _adapt_and_fit(support_labels, support_input,
                                                  query_labels, query_input,
                                                  loss_fn, model_init, args,
                                                  mode="eval")

                    mets = logging_metrics(logits.detach().cpu(),
                                           query_labels.detach().cpu())
                    task_loss.append(loss.detach().cpu().item())
                    task_acc.append(mets['acc'])
                    task_f1.append(mets['f1'])

                    task_loss_s.append(loss.detach().cpu().item() /
                                       np.log(dataloader.n_classes))
                    task_acc_s.append(mets['acc'] / (1 / dataloader.n_classes))
                    task_f1_s.append(mets['f1'] / (1 / dataloader.n_classes))

                overall_loss.append(np.mean(task_loss))
                overall_acc.append(np.mean(task_acc))
                overall_f1.append(np.mean(task_f1))
                overall_loss_s.append(np.mean(task_loss_s))
                overall_acc_s.append(np.mean(task_acc_s))
                overall_f1_s.append(np.mean(task_f1_s))

                print("{:} | Eval | Episode {:04} | Task {:^20s} | "
                      "Loss {:5.2f}, Acc {:5.2f}, F1 {:5.2f} | Mem {:5.2f} GB".format(
                          timer.dt(), episode, task,
                          overall_loss_s[-1] if args['print_scaled'] else overall_loss[-1],
                          overall_acc_s[-1] if args['print_scaled'] else overall_acc[-1],
                          overall_f1_s[-1] if args['print_scaled'] else overall_f1[-1],
                          psutil.Process(os.getpid()).memory_info().rss / 1024**3))

                writer.add_scalars('Loss/Eval', {task: overall_loss[-1]}, episode)
                writer.add_scalars('Accuracy/Eval', {task: overall_acc[-1]}, episode)
                writer.add_scalars('F1/Eval', {task: overall_f1[-1]}, episode)
                writer.add_scalars('LossScaled/Eval', {task: overall_loss_s[-1]}, episode)
                writer.add_scalars('AccuracyScaled/Eval', {task: overall_acc_s[-1]}, episode)
                writer.add_scalars('F1Scaled/Eval', {task: overall_f1_s[-1]}, episode)
                writer.flush()

            #######################
            # All Tasks Aggregate #
            #######################
            overall_loss = np.mean(overall_loss)
            overall_acc = np.mean(overall_acc)
            overall_f1 = np.mean(overall_f1)
            overall_loss_s = np.mean(overall_loss_s)
            overall_acc_s = np.mean(overall_acc_s)
            overall_f1_s = np.mean(overall_f1_s)

            print("{:} | MACRO-AGG | Eval | Episode {:04} | "
                  "Loss {:5.2f}, Acc {:5.2f}, F1 {:5.2f}\n".format(
                      timer.dt(), episode,
                      overall_loss_s if args['print_scaled'] else overall_loss,
                      overall_acc_s if args['print_scaled'] else overall_acc,
                      overall_f1_s if args['print_scaled'] else overall_f1))

            writer.add_scalar('Loss/MacroEval', overall_loss, episode)
            writer.add_scalar('Accuracy/MacroEval', overall_acc, episode)
            writer.add_scalar('F1/MacroEval', overall_f1, episode)
            writer.add_scalar('LossScaled/MacroEval', overall_loss_s, episode)
            writer.add_scalar('AccuracyScaled/MacroEval', overall_acc_s, episode)
            writer.add_scalar('F1Scaled/MacroEval', overall_f1_s, episode)
            writer.flush()

            #####################
            # Best Model Saving #
            #####################
            if overall_acc_s >= best_overall_acc_s:
                for file in os.listdir(checkpoint_save_path):
                    if 'best_model' in file:
                        ep = re.match(r".+macroaccs_\[(.+)\]", file)
                        if float(ep.group(1)):
                            os.remove(os.path.join(checkpoint_save_path, file))

                save_name = "best_model-episode_[{:}]-macroaccs_[{:.2f}].checkpoint".format(
                    episode, overall_acc_s)
                with open(os.path.join(checkpoint_save_path, save_name), 'wb') as f:
                    torch.save(model_init.state_dict(), f)
                print(f"New best scaled accuracy. Saving model as {save_name}\n")

                best_overall_acc_s = overall_acc_s
                curr_patience = args['patience']
            else:
                if episode > args['min_episodes']:
                    curr_patience -= 1
                # print(f"Model did not improve with macroaccs_={overall_acc_s}. "
                #       f"Patience is now {curr_patience}\n")

            #######################
            # Latest Model Saving #
            #######################
            for file in os.listdir(checkpoint_save_path):
                if 'latest_model' in file:
                    ep = re.match(r".+episode_\[([a-zA-Z0-9\.]+)\].+", file)
                    if ep is not None and int(ep.group(1)) <= episode:
                        os.remove(os.path.join(checkpoint_save_path, file))

            save_name = "latest_model-episode_[{:}]-macroaccs_[{:.2f}].checkpoint".format(
                episode, overall_acc_s)
            with open(os.path.join(checkpoint_save_path, save_name), 'wb') as f:
                torch.save(model_init.state_dict(), f)

            with open(os.path.join(checkpoint_save_path, "latest_trainer.pickle"), 'wb') as f:
                pickle.dump({
                    'episode': episode,
                    'overall_acc_s': overall_acc_s,
                    'best_overall_acc_s': best_overall_acc_s
                }, f)

            if episode >= args['min_episodes']:
                reduceOnPlateau.step(overall_acc_s)
                curr_lr = meta_optimizer.param_groups[0]['lr']
                if curr_lr < args['min_meta_lr']:
                    print("Patience spent.\nEarly stopping.")
                    raise KeyboardInterrupt

        writer.add_scalar('Meta-lr', meta_optimizer.param_groups[0]['lr'], episode)
    generator = NavieGenerator(input_dim=Config.z_dim)
    generator.load_weights(weight_file)
    pseudo_imgs = generator(z_val, training=False)
    mean, std = get_cifar10_mean_std()
    # put back mean and std
    ret = pseudo_imgs * std + mean  # pseudo_imgs
    return ret


# ------------------------------------------------------------------------------
if __name__ == '__main__':
    set_seed(Config.seed)
    files = [
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i100.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i200.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i300.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i400.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i500.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i700.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i1000.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i1500.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i2000.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i2500.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i10000.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i15000.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i20000.h5",
        "zeroshot_cifar10_T-40-2_S-16-1_seed_45/generator_i50000.h5",
def train(depth, width, seed=42, data_per_class=-1, dataset='cifar10',
          savedir='saved_models', is_continue=False):
    set_seed(seed)

    # Load data
    if dataset == 'cifar10':
        # TODO: sampling for Fig2 green line
        (x_train, y_train_lbl), (x_test, y_test_lbl) = get_cifar10_data()
        # x_train, y_train_lbl = balance_sampling(x_train, y_train_lbl, data_per_class=200)
        shape = (32, 32, 3)
        classes = 10
    elif dataset == 'fashion_mnist':
        (x_train, y_train_lbl), (x_test, y_test_lbl) = get_fashion_mnist_data()
        shape = (32, 32, 1)
        classes = 10
    else:
        raise NotImplementedError("TODO: SVHN")

    # ====================================================================
    # make sampling
    if data_per_class > 0:
        # sample
        x_train_sample, y_train_lbl_sample = \
            balance_sampling(x_train, y_train_lbl, data_per_class=data_per_class)
        # repeat the sampled data to be as large as the full data set for convenience
        x_train = np.repeat(x_train_sample,
                            Config.n_data_per_class / data_per_class, axis=0)
        y_train_lbl = np.repeat(y_train_lbl_sample,
                                Config.n_data_per_class / data_per_class, axis=0)
    # ====================================================================

    # To one-hot
    y_train = to_categorical(y_train_lbl)
    y_test = to_categorical(y_test_lbl)

    # Setup model
    model_type = 'WRN-%d-%d-seed%d' % (depth, width, seed)
    wrn_model = WideResidualNetwork(depth, width, classes=classes,
                                    input_shape=shape,
                                    weight_decay=Config.weight_decay)

    # Prepare model saving directory.
    save_dir = os.path.join(os.getcwd(), savedir)
    mkdir(save_dir)

    # Set up model name and path
    model_name = '%s_%s_model.{epoch:03d}.h5' % (dataset, model_type)
    model_filepath = os.path.join(save_dir, model_name)

    # set up log file
    log_fname = '{}-wrn-{}-{}-seed{}_log.csv'.format(dataset, depth, width, seed)
    log_filepath = os.path.join(save_dir, log_fname)

    # =================================================================
    if is_continue:
        for i in range(Config.epochs, 0, -1):
            fname = model_filepath.format(epoch=i)
            if os.path.isfile(fname):
                print("Using ", fname, " as the save point.")
                break
        if i <= 1:
            raise RuntimeError("Cannot continue the training")
        # ======================================================
        initial_epoch = i
        wrn_model = load_model(fname)
        is_log_append = True
    else:
        initial_epoch = 0
        # compile model
        optim = SGD(learning_rate=lr_schedule(initial_epoch),
                    momentum=Config.momentum, decay=0.0, nesterov=True)
        wrn_model.compile(loss='categorical_crossentropy',
                          optimizer=optim, metrics=['accuracy'])
        is_log_append = False

    logger = CSVLogger(filename=log_filepath, separator=',', append=is_log_append)

    # Prepare callbacks for model saving and for learning rate adjustment.
    lr_scheduler = LearningRateScheduler(lr_schedule)
    checkpointer = ModelCheckpoint(filepath=model_filepath, monitor='val_acc',
                                   verbose=1, save_best_only=True)
    callbacks = [lr_scheduler, checkpointer, logger]

    datagen = ImageDataGenerator(
        width_shift_range=4,
        height_shift_range=4,
        horizontal_flip=True,
        vertical_flip=False,
        rescale=None,
        fill_mode='reflect',
    )
    datagen.fit(x_train)

    wrn_model.fit_generator(datagen.flow(x_train, y_train,
                                         batch_size=Config.batch_size,
                                         shuffle=True),
                            validation_data=(x_test, y_test),
                            epochs=Config.epochs,
                            initial_epoch=initial_epoch,
                            verbose=1,
                            callbacks=callbacks)

    scores = wrn_model.evaluate(x_test, y_test, verbose=1)
    print('Test loss:', scores[0])
    print('Test accuracy:', scores[1])

    # =================================================
    # use the final one as teachers
    wrn_model.save(model_filepath.format(epoch=Config.epochs - 1))
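# ---------------------------------------------------------------------------
# The trainer above relies on an lr_schedule(epoch) function for both the
# initial optimizer learning rate and the LearningRateScheduler callback.
# A plausible sketch in the usual WRN step-decay style; the actual
# breakpoints and factors in the source may differ:
def lr_schedule(epoch, base_lr=0.1):
    # Decay the learning rate in three steps over training.
    if epoch < 60:
        return base_lr
    elif epoch < 120:
        return base_lr * 0.2
    elif epoch < 160:
        return base_lr * 0.04
    return base_lr * 0.008
# ---------------------------------------------------------------------------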
project_name="cpc-nlp") experiment.set_name(run_name) experiment.log_parameters({ **config.training.to_dict(), **config.dataset.to_dict(), **config.cpc_model.to_dict() }) else: experiment = None # define if gpu or cpu use_cuda = not config.training.no_cuda and torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") logger.info('===> use_cuda is {}'.format(use_cuda)) # set seed for reproducibility set_seed(config.training.seed, use_cuda) # create a CPC model for NLP model = CPCv1(config=config) # load model if resume mode if config.training.resume_name: logger.info('===> loading a checkpoint') checkpoint = torch.load('{}/{}-{}'.format(config.training.logging_dir, run_name, 'model_best.pth')) model.load_state_dict(checkpoint['state_dict']) # line for multi-gpu if config.training.multigpu and torch.cuda.device_count() > 1: logger.info("===> let's use {} GPUs!".format(torch.cuda.device_count())) model = nn.DataParallel(model) # move to device model.to(device)
import numpy as np
from numpy import tile
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

from utils.log import get_logger
from utils.compare import compare, count
from utils.lr_scheduler import cos_lr_scheduler, exp_lr_scheduler
from utils.dataset import roadDataset, roadDatasetInfer
from utils.create_dir import create_dir
from utils.seed import set_seed
from cnn_finetune import make_model
from efficientnet_pytorch import EfficientNet
from config.default import cfg

set_seed(2020)


class BASE():
    def __init__(self, cfg):
        self.gpu_id = cfg.SYSTEM.GPU_ID
        self.num_workers = cfg.SYSTEM.NUM_WORKERS
        self.train_dir = cfg.DATASET.TRAIN_DIR
        self.val_dir = cfg.DATASET.VAL_DIR
        self.test_dir = cfg.DATASET.TEST_DIR
        self.sub_dir = cfg.OUTPUT_DIR.SUB_DIR
        self.log_dir = cfg.OUTPUT_DIR.LOG_DIR
        self.out_dir = cfg.OUTPUT_DIR.OUT_DIR
        self.model_name = cfg.MODEL.MODEL_NAME
        self.train_batch_size = cfg.TRAIN_PARAM.TRAIN_BATCH_SIZE
def run(seed=42, lr=3e-5, bs=config.TRAIN_BATCH_SIZE, epoch=config.EPOCHS,
        threshold=.3, eps=.1):
    set_seed(seed)
    main_df = pd.read_csv(config.TRAINING_FILE)
    folds = main_df['kfold'].unique()
    scores = []
    for fold in sorted(folds):
        print(f'Fold {fold}')
        df_train = main_df[main_df['kfold'] != fold].reset_index(drop=True)
        df_valid = main_df[main_df['kfold'] == fold].reset_index(drop=True)

        train_dataset = TweetDataset(
            tweets=df_train['text'].values,
            selected_texts=df_train['selected_text'].values,
            sentiments=df_train['sentiment'].values,
            threshold=threshold)
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        shuffle=True,
                                                        batch_size=bs,
                                                        num_workers=6)

        valid_dataset = TweetDataset(
            tweets=df_valid['text'].values,
            selected_texts=df_valid['selected_text'].values,
            sentiments=df_valid['sentiment'].values,
            threshold=0)
        valid_data_loader = torch.utils.data.DataLoader(
            valid_dataset, shuffle=False,
            batch_size=config.VALID_BATCH_SIZE, num_workers=6)

        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        print("device: ", device)
        model = Transformer(nb_layers=2)
        model.to(device)

        best_jaccard = 0
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.001
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]
        num_train_steps = int(len(df_train) / bs * epoch)
        optimizer = AdamW(optimizer_parameters, lr=lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        for _ in range(epoch):
            engine.training(train_data_loader, model, optimizer, device,
                            scheduler, eps)
            jaccard = engine.evaluating(valid_data_loader, model, device)
            print(f'Jaccard validation score: {jaccard}')
            if jaccard > best_jaccard:
                # torch.save(
                #     model.state_dict(),
                #     os.path.join(config.SAVED_MODEL_PATH, f'model_{fold}.bin'))
                best_jaccard = jaccard

        scores.append(best_jaccard)

    print(f'Cross validation score: {np.mean(scores)} +/-{np.std(scores)}')
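# ---------------------------------------------------------------------------
# engine.evaluating above is assumed to average a word-level Jaccard score
# over the validation set; a hedged sketch of that metric as commonly defined
# for this tweet-extraction task:
def jaccard(str1: str, str2: str) -> float:
    # Word-level Jaccard similarity between predicted and target spans.
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a & b
    return float(len(c)) / (len(a) + len(b) - len(c))
# ---------------------------------------------------------------------------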
def run(self):
    """
    Run an experiment.
    """
    args = self.parser.parse_args()

    # Set random seed
    set_seed(args.seed)

    def run_op(op):
        # Create task
        task = self.define_task()
        # Create agent
        agent = self.define_agent(task.width, task.height, len(task.get_actions()))
        # Log experiment info
        self.log_info(task, agent)
        # Loading the agent state
        if args.load:
            if os.path.isfile(args.load):
                agent.load(args.load)
                logger.info("Agent loaded from {}".format(args.load))
            else:
                logger.error("Agent couldn't be loaded. File {} doesn't exist".format(args.load))
            logger.info("")
        # Run op and return its result
        return op(lambda: GridWorldEnv(task), agent)

    def run_train():
        def train_op(env, agent):
            # Train agent on environment
            result = self.train(env, agent, args.seed)
            # Saving the agent state
            if args.save:
                agent.save(args.save)
                logger.info("Agent saved to {}".format(args.save))
                logger.info("")
            return result

        # Run train op and return its result
        return run_op(train_op)

    def run_eval():
        def eval_op(env, agent):
            # Evaluate agent on environment
            return self.eval(env, agent, args.seed)

        # Run eval op and return its result
        return run_op(eval_op)

    if args.train:
        # Train agent
        avg_result = AverageRunner(run_train).run(args.runs)
        log_average_run_result(avg_result)
    elif args.eval:
        # Evaluate agent
        avg_result = AverageRunner(run_eval).run(args.runs)
        log_average_run_result(avg_result)
def main():
    """
    YOLOv3 trainer. See README for details.
    """
    args = parse_args()
    print("Setting Arguments.. : ", args)

    cuda = torch.cuda.is_available() and args.use_cuda
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    # Parse config settings
    with open(args.cfg, 'r') as f:
        cfg = yaml.safe_load(f)

    print("successfully loaded config file: ", cfg)

    momentum = cfg['TRAIN']['MOMENTUM']
    decay = cfg['TRAIN']['DECAY']
    burn_in = cfg['TRAIN']['BURN_IN']
    iter_size = cfg['TRAIN']['MAXITER']
    steps = eval(cfg['TRAIN']['STEPS'])
    batch_size = cfg['TRAIN']['BATCHSIZE']
    subdivision = cfg['TRAIN']['SUBDIVISION']
    ignore_thre = cfg['TRAIN']['IGNORETHRE']
    random_resize = cfg['AUGMENTATION']['RANDRESIZE']
    base_lr = cfg['TRAIN']['LR'] / batch_size / subdivision
    gradient_clip = cfg['TRAIN']['GRADIENT_CLIP']

    print('effective_batch_size = batch_size * subdivision = %d * %d' %
          (batch_size, subdivision))

    # Make trainer behavior deterministic
    set_seed(seed=0)
    setup_cudnn(deterministic=True)

    # Learning rate setup
    def burnin_schedule(i):
        if i < burn_in:
            factor = pow(i / burn_in, 4)
        elif i < steps[0]:
            factor = 1.0
        elif i < steps[1]:
            factor = 0.1
        else:
            factor = 0.01
        return factor

    # Initiate model
    model = YOLOv3(cfg['MODEL'], ignore_thre=ignore_thre)

    if args.weights_path:
        print("loading darknet weights....", args.weights_path)
        parse_yolo_weights(model, args.weights_path)
    elif args.checkpoint:
        print("loading pytorch ckpt...", args.checkpoint)
        state = torch.load(args.checkpoint)
        if 'model_state_dict' in state.keys():
            model.load_state_dict(state['model_state_dict'])
        else:
            model.load_state_dict(state)

    if cuda:
        print("using cuda")
        model = model.cuda()

    if args.tfboard_dir:
        print("using tfboard")
        from tensorboardX import SummaryWriter
        tblogger = SummaryWriter(args.tfboard_dir)

    model.train()

    imgsize = cfg['TRAIN']['IMGSIZE']
    dataset = COCODataset(model_type=cfg['MODEL']['TYPE'],
                          data_dir='COCO/',
                          img_size=imgsize,
                          augmentation=cfg['AUGMENTATION'],
                          debug=args.debug)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=args.n_cpu)
    dataiterator = iter(dataloader)

    evaluator = COCOAPIEvaluator(model_type=cfg['MODEL']['TYPE'],
                                 data_dir='COCO/',
                                 img_size=cfg['TEST']['IMGSIZE'],
                                 confthre=cfg['TEST']['CONFTHRE'],
                                 nmsthre=cfg['TEST']['NMSTHRE'])

    dtype = torch.cuda.FloatTensor if cuda else torch.FloatTensor

    # optimizer setup
    # set weight decay only on conv.weight
    params_dict = dict(model.named_parameters())
    params = []
    for key, value in params_dict.items():
        if 'conv.weight' in key:
            params += [{'params': value,
                        'weight_decay': decay * batch_size * subdivision}]
        else:
            params += [{'params': value, 'weight_decay': 0.0}]
    optimizer = optim.SGD(params, lr=base_lr, momentum=momentum,
                          dampening=0,
                          weight_decay=decay * batch_size * subdivision)

    iter_state = 0
    if args.checkpoint:
        if 'optimizer_state_dict' in state.keys():
            optimizer.load_state_dict(state['optimizer_state_dict'])
            iter_state = state['iter'] + 1

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)

    # start training loop
    for iter_i in range(iter_state, iter_size + 1):
        # COCO evaluation
        if iter_i % args.eval_interval == 0:
            print('evaluating...')
            ap = evaluator.evaluate(model)
            model.train()
            if args.tfboard_dir:
                # val/aP
                tblogger.add_scalar('val/aP50', ap['aP50'], iter_i)
                tblogger.add_scalar('val/aP75', ap['aP75'], iter_i)
                tblogger.add_scalar('val/aP5095', ap['aP5095'], iter_i)
                tblogger.add_scalar('val/aP5095_S', ap['aP5095_S'], iter_i)
                tblogger.add_scalar('val/aP5095_M', ap['aP5095_M'], iter_i)
                tblogger.add_scalar('val/aP5095_L', ap['aP5095_L'],
                                    iter_i)

        # subdivision loop
        optimizer.zero_grad()
        for inner_iter_i in range(subdivision):
            try:
                imgs, targets, _, _ = next(dataiterator)  # load a batch
            except StopIteration:
                dataiterator = iter(dataloader)
                imgs, targets, _, _ = next(dataiterator)  # load a batch
            imgs = Variable(imgs.type(dtype))
            targets = Variable(targets.type(dtype), requires_grad=False)
            loss = model(imgs, targets)
            loss.backward()

        if gradient_clip >= 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)

        optimizer.step()
        scheduler.step()

        if iter_i % 10 == 0:
            # logging
            current_lr = scheduler.get_lr()[0] * batch_size * subdivision
            print('[Iter %d/%d] [lr %f] '
                  '[Losses: xy %f, wh %f, conf %f, cls %f, total %f, imgsize %d]'
                  % (iter_i, iter_size, current_lr,
                     model.loss_dict['xy'], model.loss_dict['wh'],
                     model.loss_dict['conf'], model.loss_dict['cls'],
                     loss, imgsize),
                  flush=True)

            if args.tfboard_dir:
                # lr
                tblogger.add_scalar('lr', current_lr, iter_i)
                # train/loss
                tblogger.add_scalar('train/loss_xy', model.loss_dict['xy'], iter_i)
                tblogger.add_scalar('train/loss_wh', model.loss_dict['wh'], iter_i)
                tblogger.add_scalar('train/loss_conf', model.loss_dict['conf'], iter_i)
                tblogger.add_scalar('train/loss_cls', model.loss_dict['cls'], iter_i)
                tblogger.add_scalar('train/loss', loss, iter_i)

            # random resizing
            if random_resize:
                imgsize = (random.randint(0, 9) % 10 + 10) * 32
                dataset.img_shape = (imgsize, imgsize)
                dataset.img_size = imgsize
                dataloader = torch.utils.data.DataLoader(dataset,
                                                         batch_size=batch_size,
                                                         shuffle=True,
                                                         num_workers=args.n_cpu)
                dataiterator = iter(dataloader)

        # save checkpoint
        if args.checkpoint_dir and iter_i > 0 and (iter_i % args.checkpoint_interval == 0):
            torch.save({'iter': iter_i,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        },
                       os.path.join(args.checkpoint_dir,
                                    "snapshot" + str(iter_i) + ".ckpt"))

    if args.tfboard_dir:
        tblogger.close()
def eval(args):
    def _get_dataloader(datasubset, tokenizer, device, args, subset_classes=True):
        """
        Get specific dataloader.

        Args:
            datasubset ([type]): [description]
            tokenizer ([type]): [description]
            device ([type]): [description]
            args ([type]): [description]

        Returns:
            dataloader
        """
        if subset_classes:
            dataloader = StratifiedLoaderwClassesSubset(
                datasubset, k=args['k'],
                max_classes=args['max_classes'],
                max_batch_size=args['max_batch_size'],
                tokenizer=tokenizer, device=device,
                shuffle=True, verbose=False)
        else:
            dataloader = StratifiedLoader(
                datasubset, k=args['k'],
                max_batch_size=args['max_batch_size'],
                tokenizer=tokenizer, device=device,
                shuffle=True, verbose=False)
        return dataloader

    def _adapt_and_fit(support_labels_list, support_input_list, query_labels_list,
                       query_input_list, loss_fn, model_init, args, mode):
        """
        Adapts the init model to a support set and computes loss on query set.

        Args:
            support_labels_list ([type]): [description]
            support_input_list ([type]): [description]
            query_labels_list ([type]): [description]
            query_input_list ([type]): [description]
            loss_fn ([type]): [description]
            model_init ([type]): [description]
            args
            mode
        """
        #####################
        # Create model_task #
        #####################
        model_init.eval()
        model_task = deepcopy(model_init)
        model_task_optimizer = optim.SGD(model_task.parameters(),
                                         lr=args['inner_lr'])
        model_task.zero_grad()

        #######################
        # Generate prototypes #
        #######################
        with torch.no_grad():
            prototypes = 0.0
            for support_labels, support_input in zip(support_labels_list,
                                                     support_input_list):
                if mode != "baseline":
                    y = model_init(support_input)
                else:
                    y = model_init.encode(support_input)
                labs = torch.sort(torch.unique(support_labels))[0]
                prototypes += torch.stack(
                    [torch.mean(y[support_labels == c], dim=0) for c in labs])
            prototypes = prototypes / len(support_labels_list)

        W_init = 2 * prototypes
        b_init = -torch.norm(prototypes, p=2, dim=1)**2

        W_task, b_task = W_init.detach(), b_init.detach()
        W_task.requires_grad, b_task.requires_grad = True, True

        #################
        # Adapt to data #
        #################
        for _ in range(args['n_inner']):
            for support_labels, support_input in zip(support_labels_list,
                                                     support_input_list):
                if mode != "baseline":
                    y = model_task(support_input)
                else:
                    y = model_task.encode(support_input)
                logits = F.linear(y, W_task, b_task)
                inner_loss = loss_fn(logits, support_labels)

                W_task_grad, b_task_grad = torch.autograd.grad(
                    inner_loss, [W_task, b_task], retain_graph=True)
                inner_loss.backward()

                if args['clip_val'] > 0:
                    torch.nn.utils.clip_grad_norm_(model_task.parameters(),
                                                   args['clip_val'])
                model_task_optimizer.step()

                W_task = W_task - args['output_lr'] * W_task_grad
                b_task = b_task - args['output_lr'] * b_task_grad

        #########################
        # Validate on query set #
        #########################
        logits_list, outer_loss_list = [], []
        for query_labels, query_input in zip(query_labels_list, query_input_list):
            with torch.no_grad():
                if mode != "baseline":
                    y = model_task(query_input)
                else:
                    y = model_task.encode(query_input)
                logits = F.linear(y, W_task, b_task)
                outer_loss = loss_fn(logits, query_labels)
            logits_list.append(logits)
            outer_loss_list.append(outer_loss)

        return logits_list, outer_loss_list

    #######################
    # Logging Directories #
    #######################
    log_dir = os.path.join(args['checkpoint_path'], args['version'])
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, args['save_version']), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'checkpoint'), exist_ok=True)
    # print(f"Saving models and logs to {log_dir}")

    checkpoint_save_path = os.path.join(log_dir, 'checkpoint')
    if args['mode'] != "baseline":
        with open(os.path.join("./", checkpoint_save_path, "hparams.pickle"),
                  mode='rb+') as f:
            hparams = pickle.load(f)
    else:
        with open(os.path.join("./", args['checkpoint_path'], "hparams.pickle"),
                  mode='rb+') as f:
            hparams = pickle.load(f)

    ##########################
    # Device, Logging, Timer #
    ##########################
    set_seed(args['seed'])
    timer = Timer()
    device = torch.device('cuda' if (torch.cuda.is_available() and args['gpu']) else 'cpu')

    # Build the tensorboard writer
    writer = SummaryWriter(os.path.join(log_dir, args['save_version']))

    ###################
    # Load in dataset #
    ###################
    print("Data Prep")
    dataset = meta_dataset(include=args['include'], verbose=True)
    dataset.prep(text_tokenizer=manual_tokenizer)
    print("")

    ####################
    # Init models etc. #
    ####################
    if args['mode'] != "baseline":
        model_init = SeqTransformer(hparams)
        tokenizer = AutoTokenizer.from_pretrained(hparams['encoder_name'])
    else:
        model_init = CustomBERT(num_classes=task_label_dict[args['version']])
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    tokenizer.add_special_tokens({'additional_special_tokens': specials()})
    model_init.encoder.model.resize_token_embeddings(len(tokenizer.vocab))

    for file in os.listdir(checkpoint_save_path):
        if 'best_model' in file:
            fp = os.path.join(checkpoint_save_path, file)
            with open(fp, mode='rb+') as f:
                print(f"Found pre-trained file at {fp}")
                if args['mode'] != "baseline":
                    model_init.load_state_dict(torch.load(f, map_location=device))
                    for name, param in model_init.encoder.model.named_parameters():
                        transformer_layer = re.search(r"(?:encoder\.layer\.)([0-9]+)", name)
                        if transformer_layer and (int(transformer_layer.group(1)) > args['nu']):
                            param.requires_grad = True
                        elif 'pooler' in name:
                            param.requires_grad = False
                        elif args['nu'] < 0:
                            param.requires_grad = True
                        else:
                            param.requires_grad = False
                else:
                    model_init.load_state_dict(
                        torch.load(f, map_location=device)["bert_state_dict"])

    model_init = model_init.to(device)
    loss_fn = nn.CrossEntropyLoss()

    ##############
    # Evaluation #
    ##############
    results_dict = defaultdict(dict)
    for split in args['splits']:
        overall_loss, overall_acc, overall_f1 = [], [], []
        overall_loss_s, overall_acc_s, overall_f1_s = [], [], []

        ###################
        # Individual Task #
        ###################
        for task in dataset.lens.keys():
            datasubset = dataset.datasets[task][split]

            task_loss, task_acc, task_f1 = [], [], []
            task_loss_s, task_acc_s, task_f1_s = [], [], []
            for _ in range(args['n_eval_per_task']):
                dataloader = _get_dataloader(datasubset, tokenizer, device, args,
                                             subset_classes=args['subset_classes'])

                total_size = args['k'] * dataloader.n_classes
                n_sub_batches = total_size / args['max_batch_size']
                reg_k = int(args['k'] // n_sub_batches)
                left_over = args['k'] * dataloader.n_classes - \
                    int(n_sub_batches) * reg_k * dataloader.n_classes
                last_k = int(left_over / dataloader.n_classes)

                support_labels_list, support_input_list, query_labels_list, query_input_list = [], [], [], []

                dataloader.k = reg_k
                for _ in range(int(n_sub_batches)):
                    support_labels, support_text, query_labels, query_text = next(dataloader)
                    support_labels_list.append(support_labels)
                    support_input_list.append(support_text)
                    query_labels_list.append(query_labels)
                    query_input_list.append(query_text)

                if last_k > 0.0:
                    dataloader.k = last_k
                    support_labels, support_text, query_labels, query_text = next(dataloader)
                    support_labels_list.append(support_labels)
                    support_input_list.append(support_text)
                    query_labels_list.append(query_labels)
                    query_input_list.append(query_text)

                logits_list, loss_list = _adapt_and_fit(
                    support_labels_list, support_input_list,
                    query_labels_list, query_input_list,
                    loss_fn, model_init, hparams, args['mode'])

                for logits, query_labels, loss in zip(logits_list,
                                                      query_labels_list,
                                                      loss_list):
                    mets = logging_metrics(logits.detach().cpu(),
                                           query_labels.detach().cpu())
                    task_loss.append(loss.detach().cpu().item())
                    task_acc.append(mets['acc'])
                    task_f1.append(mets['f1'])

                    task_loss_s.append(loss.detach().cpu().item() /
                                       np.log(dataloader.n_classes))
                    task_acc_s.append(mets['acc'] / (1 / dataloader.n_classes))
                    task_f1_s.append(mets['f1'] / (1 / dataloader.n_classes))

            overall_loss.append(np.mean(task_loss))
            overall_acc.append(np.mean(task_acc))
            overall_f1.append(np.mean(task_f1))
            overall_loss_s.append(np.mean(task_loss_s))
            overall_acc_s.append(np.mean(task_acc_s))
            overall_f1_s.append(np.mean(task_f1_s))

            print("{:} | Eval | Split {:^8s} | Task {:^20s} | "
                  "Loss {:5.2f} ({:4.2f}), Acc {:5.2f} ({:4.2f}), F1 {:5.2f} ({:4.2f}) | "
                  "Mem {:5.2f} GB".format(
                      timer.dt(), split, task,
                      overall_loss_s[-1] if args['print_scaled'] else overall_loss[-1],
                      np.std(task_loss_s) if args['print_scaled'] else np.std(task_loss),
                      overall_acc_s[-1] if args['print_scaled'] else overall_acc[-1],
                      np.std(task_acc_s) if args['print_scaled'] else np.std(task_acc),
                      overall_f1_s[-1] if args['print_scaled'] else overall_f1[-1],
                      np.std(task_f1_s) if args['print_scaled'] else np.std(task_f1),
                      psutil.Process(os.getpid()).memory_info().rss / 1024**3))

            writer.add_scalars(f'Loss/{split}', {task: overall_loss[-1]}, 0)
            writer.add_scalars(f'Accuracy/{split}', {task: overall_acc[-1]}, 0)
            writer.add_scalars(f'F1/{split}', {task: overall_f1[-1]}, 0)
            writer.add_scalars(f'LossScaled/{split}', {task: overall_loss_s[-1]}, 0)
            writer.add_scalars(f'AccuracyScaled/{split}', {task: overall_acc_s[-1]}, 0)
            writer.add_scalars(f'F1Scaled/{split}', {task: overall_f1_s[-1]}, 0)
            writer.flush()

            results_dict[task][split] = {
                "loss": "{:.2f} ({:.2f})".format(overall_loss[-1], np.std(task_loss)),
                "acc": "{:.2f} ({:.2f})".format(overall_acc[-1], np.std(task_acc)),
                "f1": "{:.2f} ({:.2f})".format(overall_f1[-1], np.std(task_f1)),
                "loss_scaled": "{:.2f} ({:.2f})".format(overall_loss_s[-1], np.std(task_loss_s)),
                "acc_scaled": "{:.2f} ({:.2f})".format(overall_acc_s[-1], np.std(task_acc_s)),
                "f1_scaled": "{:.2f} ({:.2f})".format(overall_f1_s[-1], np.std(task_f1_s)),
            }

        #######################
        # All Tasks Aggregate #
        #######################
        overall_loss = np.mean(overall_loss)
        overall_acc = np.mean(overall_acc)
        overall_f1 = np.mean(overall_f1)
        overall_loss_s = np.mean(overall_loss_s)
        overall_acc_s = np.mean(overall_acc_s)
        overall_f1_s = np.mean(overall_f1_s)

        print("{:} | MACRO-AGG | Eval | Split {:^8s} | "
              "Loss {:5.2f}, Acc {:5.2f}, F1 {:5.2f}\n".format(
                  timer.dt(), split,
                  overall_loss_s if args['print_scaled'] else overall_loss,
                  overall_acc_s if args['print_scaled'] else overall_acc,
                  overall_f1_s if args['print_scaled'] else overall_f1))

        writer.add_scalar(f'Loss/Macro{split}', overall_loss, 0)
        writer.add_scalar(f'Accuracy/Macro{split}', overall_acc, 0)
        writer.add_scalar(f'F1/Macro{split}', overall_f1, 0)
        writer.add_scalar(f'LossScaled/Macro{split}', overall_loss_s, 0)
        writer.add_scalar(f'AccuracyScaled/Macro{split}', overall_acc_s, 0)
        writer.add_scalar(f'F1Scaled/Macro{split}', overall_f1_s, 0)
        writer.flush()

    with open(os.path.join(log_dir, args['save_version'], 'results.pickle'),
              'wb+') as file:
        pickle.dump(results_dict, file)
def zeroshot_train(t_depth, t_width, t_wght_path, s_depth, s_width, seed=42,
                   savedir=None, dataset='cifar10', sample_per_class=0):
    set_seed(seed)

    train_name = '%s_T-%d-%d_S-%d-%d_seed_%d' % (dataset, t_depth, t_width,
                                                 s_depth, s_width, seed)
    if sample_per_class > 0:
        train_name += "-m%d" % sample_per_class
    log_filename = train_name + '_training_log.csv'

    # save dir
    if not savedir:
        savedir = 'zeroshot_' + train_name
    full_savedir = os.path.join(os.getcwd(), savedir)
    mkdir(full_savedir)

    log_filepath = os.path.join(full_savedir, log_filename)
    logger = CustomizedCSVLogger(log_filepath)

    # Teacher
    teacher = WideResidualNetwork(t_depth, t_width,
                                  input_shape=Config.input_dim,
                                  dropout_rate=0.0,
                                  output_activations=True,
                                  has_softmax=False)
    teacher.load_weights(t_wght_path)
    teacher.trainable = False

    # Student
    student = WideResidualNetwork(s_depth, s_width,
                                  input_shape=Config.input_dim,
                                  dropout_rate=0.0,
                                  output_activations=True,
                                  has_softmax=False)

    if sample_per_class > 0:
        s_decay_steps = Config.n_outer_loop * Config.n_s_in_loop + Config.n_outer_loop
    else:
        s_decay_steps = Config.n_outer_loop * Config.n_s_in_loop

    s_optim = Adam(learning_rate=CosineDecay(Config.student_init_lr,
                                             decay_steps=s_decay_steps))
    # ---------------------------------------------------------------------------
    # Generator
    generator = NavieGenerator(input_dim=Config.z_dim)
    g_optim = Adam(learning_rate=CosineDecay(Config.generator_init_lr,
                                             decay_steps=Config.n_outer_loop * Config.n_g_in_loop))
    # ---------------------------------------------------------------------------
    # Test data
    if dataset == 'cifar10':
        (x_train, y_train_lbl), (x_test, y_test) = get_cifar10_data()
    elif dataset == 'fashion_mnist':
        (x_train, y_train_lbl), (x_test, y_test) = get_fashion_mnist_data()
    else:
        raise ValueError("Only Cifar-10 and Fashion-MNIST supported !!")
    test_data_loader = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(200)
    # ---------------------------------------------------------------------------
    # Train data (if using train data)
    train_dataflow = None
    if sample_per_class > 0:
        # sample first
        x_train, y_train_lbl = \
            balance_sampling(x_train, y_train_lbl, data_per_class=sample_per_class)
        datagen = ImageDataGenerator(width_shift_range=4,
                                     height_shift_range=4,
                                     horizontal_flip=True,
                                     vertical_flip=False,
                                     rescale=None,
                                     fill_mode='reflect')
        datagen.fit(x_train)
        y_train = to_categorical(y_train_lbl)
        train_dataflow = datagen.flow(x_train, y_train,
                                      batch_size=Config.batch_size,
                                      shuffle=True)

    # Generator loss metrics
    g_loss_met = tf.keras.metrics.Mean()
    # Student loss metrics
    s_loss_met = tf.keras.metrics.Mean()
    #
    n_cls_t_pred_metric = tf.keras.metrics.Mean()
    n_cls_s_pred_metric = tf.keras.metrics.Mean()

    max_g_grad_norm_metric = tf.keras.metrics.Mean()
    max_s_grad_norm_metric = tf.keras.metrics.Mean()

    test_data_loader = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(200)

    teacher.trainable = False

    # checkpoint
    chkpt_dict = {
        'teacher': teacher,
        'student': student,
        'generator': generator,
        's_optim': s_optim,
        'g_optim': g_optim,
    }
    # Saving checkpoint
    ckpt = tf.train.Checkpoint(**chkpt_dict)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              os.path.join(savedir, 'chpt'),
                                              max_to_keep=2)
    # ==========================================================================
    # if a checkpoint exists, restore the latest checkpoint.
    start_iter = 0
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')
        with open(os.path.join(savedir, 'chpt', 'iteration'), 'r') as f:
            start_iter = int(f.read())
        logger = CustomizedCSVLogger(log_filepath, append=True)

    for iter_ in range(start_iter, Config.n_outer_loop):
        iter_stime = time.time()

        max_s_grad_norm = 0
        max_g_grad_norm = 0
        # sample from latent space to have an image
        z_val = tf.random.normal([Config.batch_size, Config.z_dim])

        # Generator training
        loss = 0
        for ng in range(Config.n_g_in_loop):
            loss, g_grad_norm = train_gen(generator, g_optim, z_val, teacher, student)
            max_g_grad_norm = max(max_g_grad_norm, g_grad_norm.numpy())
            g_loss_met(loss)
        # ==========================================================================
        # Student training
        loss = 0
        pseudo_imgs, t_logits, t_acts = prepare_train_student(generator, z_val, teacher)
        for ns in range(Config.n_s_in_loop):
            # pseudo_imgs, t_logits, t_acts = prepare_train_student(generator, z_val, teacher)
            loss, s_grad_norm, s_logits = train_student(pseudo_imgs, s_optim,
                                                        t_logits, t_acts, student)
            max_s_grad_norm = max(max_s_grad_norm, s_grad_norm.numpy())

            n_cls_t_pred = len(np.unique(np.argmax(t_logits, axis=-1)))
            n_cls_s_pred = len(np.unique(np.argmax(s_logits, axis=-1)))
            # logging
            s_loss_met(loss)
            n_cls_t_pred_metric(n_cls_t_pred)
            n_cls_s_pred_metric(n_cls_s_pred)
        # ==========================================================================
        # train if provided n samples
        if train_dataflow:
            x_batch_train, y_batch_train = next(train_dataflow)
            t_logits, t_acts = forward(teacher, x_batch_train, training=False)
            loss = train_student_with_labels(student, s_optim, x_batch_train,
                                             t_logits, t_acts, y_batch_train)
        # ==========================================================================

        # --------------------------------------------------------------------
        iter_etime = time.time()
        max_g_grad_norm_metric(max_g_grad_norm)
        max_s_grad_norm_metric(max_s_grad_norm)
        # --------------------------------------------------------------------
        is_last_epoch = (iter_ == Config.n_outer_loop - 1)

        if iter_ != 0 and (iter_ % Config.print_freq == 0 or is_last_epoch):
            n_cls_t_pred_avg = n_cls_t_pred_metric.result().numpy()
            n_cls_s_pred_avg = n_cls_s_pred_metric.result().numpy()
            time_per_epoch = iter_etime - iter_stime

            s_loss = s_loss_met.result().numpy()
            g_loss = g_loss_met.result().numpy()
            max_g_grad_norm_avg = max_g_grad_norm_metric.result().numpy()
            max_s_grad_norm_avg = max_s_grad_norm_metric.result().numpy()

            # build ordered dict
            row_dict = OrderedDict()
            row_dict['time_per_epoch'] = time_per_epoch
            row_dict['epoch'] = iter_
            row_dict['generator_loss'] = g_loss
            row_dict['student_kd_loss'] = s_loss
            row_dict['n_cls_t_pred_avg'] = n_cls_t_pred_avg
            row_dict['n_cls_s_pred_avg'] = n_cls_s_pred_avg
            row_dict['max_g_grad_norm_avg'] = max_g_grad_norm_avg
            row_dict['max_s_grad_norm_avg'] = max_s_grad_norm_avg

            if sample_per_class > 0:
                s_optim_iter = iter_ * (Config.n_s_in_loop + 1)
            else:
                s_optim_iter = iter_ * Config.n_s_in_loop
            row_dict['s_optim_lr'] = s_optim.learning_rate(s_optim_iter).numpy()
            row_dict['g_optim_lr'] = g_optim.learning_rate(iter_).numpy()

            pprint.pprint(row_dict)
        # ======================================================================
        if iter_ != 0 and (iter_ % Config.log_freq == 0 or is_last_epoch):
            # calculate acc
            test_accuracy = evaluate(test_data_loader, student).numpy()
            row_dict['test_acc'] = test_accuracy
            logger.log_with_order(row_dict)
            print('Test Accuracy: ', test_accuracy)

            # checkpointing
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(iter_ + 1,
                                                                ckpt_save_path))
            with open(os.path.join(savedir, 'chpt', 'iteration'), 'w') as f:
                f.write(str(iter_ + 1))

            s_loss_met.reset_states()
            g_loss_met.reset_states()
            max_g_grad_norm_metric.reset_states()
            max_s_grad_norm_metric.reset_states()

        if iter_ != 0 and (iter_ % 5000 == 0 or is_last_epoch):
            generator.save_weights(join(full_savedir, "generator_i{}.h5".format(iter_)))
            student.save_weights(join(full_savedir, "student_i{}.h5".format(iter_)))
    # print beta
    Config.beta = args.beta

    # print out config
    for attr, v in vars(Config).items():
        if attr.startswith('__'):
            continue
        print(attr, ": ", v)

    # calculate iterations
    iter_per_epoch = math.ceil(Config.total_iteration / Config.epochs)
    print("Iteration per epoch: ", iter_per_epoch)
    print("-------------------------------------")

    # Set seed
    set_seed(args.seed)

    # ===================================
    # Go to training
    # load cifar 10, sampling if needed; TODO: make one for SVHN
    (x_train, y_train_lbl), (x_test, y_test_lbl) = get_cifar10_data()
    if args.sample_per_class < 5000:
        x_train, y_train_lbl = balance_sampling(
            x_train, y_train_lbl, data_per_class=args.sample_per_class)

    # For evaluation
    test_data_loader = tf.data.Dataset.from_tensor_slices(
        (x_test, y_test_lbl)).batch(200)
    # y_test = to_categorical(y_test_lbl)
    y_train = to_categorical(y_train_lbl)