def test_write_scalar(self):
    summary_writer = SummaryWriter(self._log_dir)
    tag_name = "learning_rate"
    learning_rate = torch.tensor(0.01)
    for i in range(10):
        summary_writer.add_scalar(tag_name, learning_rate, i)
        learning_rate -= 0.005
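# A minimal standalone sketch of the same logging pattern, assuming the
# torch.utils.tensorboard backend (the test above may import SummaryWriter
# from elsewhere, e.g. tensorboardX) and a hypothetical "./runs/lr_demo" dir:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("./runs/lr_demo")
lr = 0.01
for step in range(10):
    writer.add_scalar("learning_rate", lr, step)  # (tag, scalar_value, global_step)
    lr -= 0.005
writer.close()  # flush pending events to disk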
def train(self, load_model=False, model_path=None):
    if load_model:
        if model_path is not None:
            self.load_weights(model_path)

    ## Training utterances
    all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
        self.train_examples, self.label_list, args.max_seq_length,
        self.tokenizer, args.max_turn_length)

    num_train_batches = all_input_ids.size(0)
    num_train_steps = int(num_train_batches / args.train_batch_size /
                          args.gradient_accumulation_steps * args.num_train_epochs)

    logger.info("***** training *****")
    logger.info(" Num examples = %d", len(self.train_examples))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_steps)

    all_input_ids, all_input_len, all_label_ids = \
        all_input_ids.to(DEVICE), all_input_len.to(DEVICE), all_label_ids.to(DEVICE)

    train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    ## Validation utterances
    all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features(
        self.dev_examples, self.label_list, args.max_seq_length,
        self.tokenizer, args.max_turn_length)

    logger.info("***** validation *****")
    logger.info(" Num examples = %d", len(self.dev_examples))
    logger.info(" Batch size = %d", args.dev_batch_size)

    all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \
        all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE)

    dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data, sampler=dev_sampler,
                                batch_size=args.dev_batch_size)

    logger.info("Loaded data!")

    if args.fp16:
        self.sumbt_model.half()
    self.sumbt_model.to(DEVICE)

    ## Get domain-slot-type embeddings
    slot_token_ids, slot_len = get_label_embedding(
        self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE)
    # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot):
    #     self.idx2slot[slot_idx] = slot_str

    ## Get slot-value embeddings
    label_token_ids, label_len = [], []
    for slot_idx, labels in zip(slot_token_ids, self.label_list):
        # self.idx2value[slot_idx] = {}
        token_ids, lens = get_label_embedding(labels, args.max_label_length,
                                              self.tokenizer, DEVICE)
        label_token_ids.append(token_ids)
        label_len.append(lens)
        # for label, token_id in zip(labels, token_ids):
        #     self.idx2value[slot_idx][token_id] = label

    logger.info('embeddings prepared')

    if USE_CUDA and N_GPU > 1:
        self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
    else:
        self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids)

    def get_optimizer_grouped_parameters(model):
        param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01,
                'lr': args.learning_rate
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                'lr': args.learning_rate
            },
        ]
        return optimizer_grouped_parameters

    if not USE_CUDA or N_GPU == 1:
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model)
    else:
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module)

    t_total = num_train_steps
    scheduler = None

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.fp16_loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion * t_total,
            num_training_steps=t_total)
    logger.info(optimizer)

    # Training code
    ###########################################################################
    logger.info("Training...")

    global_step = 0
    last_update = None
    best_loss = None
    model = self.sumbt_model

    if args.do_not_use_tensorboard:
        summary_writer = None
    else:
        summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/")

    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        # Train
        model.train()
        tr_loss = 0
        nb_tr_examples = 0
        nb_tr_steps = 0

        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = tuple(t.to(DEVICE) for t in batch)
            input_ids, input_len, label_ids = batch

            # Forward
            if N_GPU == 1:
                loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len,
                                                          label_ids, N_GPU)
            else:
                loss, _, acc, acc_slot, _ = model(input_ids, input_len,
                                                  label_ids, N_GPU)
                # average over multiple GPUs
                loss = loss.mean()
                acc = acc.mean()
                acc_slot = acc_slot.mean(0)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # Backward
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            # tensorboard logging
            if summary_writer is not None:
                summary_writer.add_scalar("Epoch", epoch, global_step)
                summary_writer.add_scalar("Train/Loss", loss, global_step)
                summary_writer.add_scalar("Train/JointAcc", acc, global_step)
                if N_GPU == 1:
                    for i, slot in enumerate(self.processor.target_slot):
                        summary_writer.add_scalar(
                            "Train/Loss_%s" % slot.replace(' ', '_'),
                            loss_slot[i], global_step)
                        summary_writer.add_scalar(
                            "Train/Acc_%s" % slot.replace(' ', '_'),
                            acc_slot[i], global_step)

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with the special warm-up schedule BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / t_total, args.warmup_proportion)
                if summary_writer is not None:
                    summary_writer.add_scalar("Train/LearningRate",
                                              lr_this_step, global_step)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                if scheduler is not None:
                    torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0)
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
                global_step += 1

        # Perform evaluation on the validation dataset
        model.eval()
        dev_loss = 0
        dev_acc = 0
        dev_loss_slot, dev_acc_slot = None, None
        nb_dev_examples, nb_dev_steps = 0, 0

        for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")):
            batch = tuple(t.to(DEVICE) for t in batch)
            input_ids, input_len, label_ids = batch
            if input_ids.dim() == 2:
                input_ids = input_ids.unsqueeze(0)
                input_len = input_len.unsqueeze(0)
                label_ids = label_ids.unsqueeze(0)

            with torch.no_grad():
                if N_GPU == 1:
                    loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len,
                                                              label_ids, N_GPU)
                else:
                    loss, _, acc, acc_slot, _ = model(input_ids, input_len,
                                                      label_ids, N_GPU)
                    # average over multiple GPUs
                    loss = loss.mean()
                    acc = acc.mean()
                    acc_slot = acc_slot.mean(0)

            num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item()
            dev_loss += loss.item() * num_valid_turn
            dev_acc += acc.item() * num_valid_turn

            if N_GPU == 1:
                if dev_loss_slot is None:
                    dev_loss_slot = [l * num_valid_turn for l in loss_slot]
                    dev_acc_slot = acc_slot * num_valid_turn
                else:
                    for i, l in enumerate(loss_slot):
                        dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn
                    dev_acc_slot += acc_slot * num_valid_turn

            nb_dev_examples += num_valid_turn

        dev_loss = dev_loss / nb_dev_examples
        dev_acc = dev_acc / nb_dev_examples

        if N_GPU == 1:
            dev_acc_slot = dev_acc_slot / nb_dev_examples

        # tensorboard logging
        if summary_writer is not None:
            summary_writer.add_scalar("Validate/Loss", dev_loss, global_step)
            summary_writer.add_scalar("Validate/Acc", dev_acc, global_step)
            if N_GPU == 1:
                for i, slot in enumerate(self.processor.target_slot):
                    summary_writer.add_scalar(
                        "Validate/Loss_%s" % slot.replace(' ', '_'),
                        dev_loss_slot[i] / nb_dev_examples, global_step)
                    summary_writer.add_scalar(
                        "Validate/Acc_%s" % slot.replace(' ', '_'),
                        dev_acc_slot[i], global_step)

        dev_loss = round(dev_loss, 6)

        output_model_file = os.path.join(
            os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")
        if last_update is None or dev_loss < best_loss:
            if not USE_CUDA or N_GPU == 1:
                torch.save(model.state_dict(), output_model_file)
            else:
                torch.save(model.module.state_dict(), output_model_file)
            last_update = epoch
            best_loss = dev_loss
            best_acc = dev_acc
            logger.info(
                "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***"
                % (last_update, best_loss, best_acc, global_step))
        else:
            logger.info(
                "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***"
                % (epoch, dev_loss, dev_acc, global_step))

        if last_update + args.patience <= epoch:
            break
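# For reference, a minimal sketch of the `warmup_linear` schedule referenced
# above, assuming it follows the classic pytorch-pretrained-bert definition
# (the helper actually imported by this module may differ):
def warmup_linear(x, warmup=0.002):
    # x is the fraction of total training steps completed so far
    if x < warmup:
        return x / warmup  # ramp up linearly to the peak learning rate
    return 1.0 - x         # then decay linearly toward zero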
def train(agents, params, num_processes):
    """Training loop for value-based RL methods.

    Params
    ======
        agents (object) --- the agent(s) to train
        params (dict) --- the dictionary of parameters
        num_processes (int) --- number of parallel agents/environments
    """
    n_episodes = params['episodes']
    maxlen = params['maxlen']
    name = params['agent_params']['name']
    brain_name = params['brain_name']
    env = params['environment']
    add_noise = params['agent_params']['add_noise']
    pretrain = params['pretrain']
    pretrain_length = params['pretrain_length']
    num_agents = num_processes

    scores = np.zeros(num_agents)         # scores from each episode
    scores_window = deque(maxlen=maxlen)  # last N scores
    scores_episode = []

    writer = SummaryWriter(log_dir=params['log_dir'] + name)

    env_info = env.reset(train_mode=True)[brain_name]
    tic = time.time()
    timesteps = 0
    achievement_length = 0

    episode_start = 1
    if params['load_agent']:
        episode_start, timesteps = agents.load_agent()

    for i_episode in range(episode_start, n_episodes + 1):
        tic = time.time()
        # reset the environment before reading the initial observations
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        while True:
            states = torch.tensor(states)
            if pretrain and pretrain_length < len(agents.memory.memory):
                pretrain = False

            actions, noise_epsilon = agents.act(states, add_noise, pretrain=pretrain)
            env_info = env.step(actions)[brain_name]    # send the actions to the environment
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards                  # get the rewards
            dones = env_info.local_done                 # see if the episode has finished

            adjusted_rewards = np.array(env_info.rewards)
            if params['hack_rewards']:
                if adjusted_rewards[0] != 0:
                    adjusted_rewards[1] = adjusted_rewards[0] * params['alternative_reward_scalar']
                elif adjusted_rewards[1] != 0:
                    adjusted_rewards[0] = adjusted_rewards[1] * params['alternative_reward_scalar']

            actor_loss, critic_loss = agents.step(states, actions, adjusted_rewards,
                                                  next_states, dones, pretrain=pretrain)

            if actor_loss is not None and critic_loss is not None:
                if params['agent_params']['schedule_lr']:
                    actor_lr, critic_lr = agents.get_lr()
                else:
                    actor_lr = params['agent_params']['actor_params']['lr']
                    critic_lr = params['agent_params']['critic_params']['lr']
                writer.add_scalar('noise_epsilon', noise_epsilon, timesteps)
                writer.add_scalar('actor_loss', actor_loss, timesteps)
                writer.add_scalar('critic_loss', critic_loss, timesteps)
                writer.add_scalar('actor_lr', actor_lr, timesteps)
                writer.add_scalar('critic_lr', critic_lr, timesteps)

            print('\rTimestep {}\tMax: {:.2f}'.format(timesteps, np.max(scores)), end="")
            scores += rewards        # update the scores
            states = next_states     # roll over the states to the next time step
            if np.any(dones):        # exit the loop if the episode finished
                break

            timesteps += 1
            # Periodically refill the buffer with experiences from random actions
            # to encourage exploration; the threshold is extended past the current
            # buffer size so the pretrain flag stays on for a while
            if timesteps % params['random_fill_every'] == 0:
                pretrain = True
                pretrain_length = len(agents.memory.memory) + params['pretrain_length']

        score = np.mean(scores)
        scores_episode.append(score)
        scores_window.append(score)  # save the most recent score

        print('\rEpisode {}\tMax: {:.2f} \t Time: {:.2f}'.format(
            i_episode, np.max(scores), time.time() - tic), end="\n")

        if i_episode % params['save_every'] == 0:
            agents.save_agent(np.mean(scores_window), i_episode, timesteps,
                              save_history=True)
        else:
            agents.save_agent(np.mean(scores_window), i_episode, timesteps,
                              save_history=False)

        writer.add_scalars('scores', {
            'mean': np.mean(scores),
            'min': np.min(scores),
            'max': np.max(scores)
        }, timesteps)

        update_csv(name, i_episode, np.mean(scores), np.mean(scores))
        agents.step_lr(np.mean(scores))

        if np.mean(scores) > params['achievement']:
            achievement_length += 1
            if achievement_length > params['achievement_length']:
                toc = time.time()
                print("\n\nCongratulations! The agent has managed to solve the "
                      "environment in {} episodes with {} training time\n\n"
                      .format(i_episode, toc - tic))
                writer.close()
                return scores
        else:
            achievement_length = 0

    writer.close()
    return scores
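# A hypothetical invocation sketch: every `params` key mirrors one read inside
# train() above, while `agents` and `env` (a Unity ML-Agents handle) are
# assumed to be constructed elsewhere; all values are illustrative.
params = {
    'episodes': 2000, 'maxlen': 100, 'brain_name': 'TennisBrain',
    'environment': env, 'log_dir': './runs/',
    'pretrain': True, 'pretrain_length': 5000, 'random_fill_every': 50000,
    'hack_rewards': False, 'alternative_reward_scalar': 0.5,
    'load_agent': False, 'save_every': 100,
    'achievement': 0.5, 'achievement_length': 100,
    'agent_params': {
        'name': 'maddpg', 'add_noise': True, 'schedule_lr': False,
        'actor_params': {'lr': 1e-4}, 'critic_params': {'lr': 1e-3},
    },
}
scores = train(agents, params, num_processes=2)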
sh.rm('-rf', 'logs')

import logging
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

from tensorboardX.writer import SummaryWriter

swriter = SummaryWriter('logs')
add_scalar_old = swriter.add_scalar

# Wrap add_scalar so every scalar is also mirrored to the log stream
def add_scalar_and_log(key, value, global_step=0):
    logging.info('{}:{}: {}'.format(global_step, key, value))
    add_scalar_old(key, value, global_step)

swriter.add_scalar = add_scalar_and_log

def str2bool(x):
    return x.lower() == 'true'

def new_inception_conv2d_forward(self, x):
    x = self.conv(x)
    x = self.bn(x)
    return F.relu(x, inplace=False)

# Patch torchvision's Inception BasicConv2d to use a non-inplace ReLU
tv.models.inception.BasicConv2d.forward = new_inception_conv2d_forward

import argparse
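# Quick smoke test of the patched writer: each call should both append a
# tensorboard event under ./logs and emit a line like
# "INFO:root:0:demo/loss: 1.0" (tag and values here are illustrative).
for step in range(3):
    swriter.add_scalar('demo/loss', 1.0 / (step + 1), step)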
class TensorBoard(Callback):
    """Callback that streams epoch results to the tensorboard events folder.

    Supports all values that can be represented as a string, including
    1D iterables such as `np.ndarray`.

    Example:
        ```python
        tensorboard_logger = TensorBoard('runs')
        model.fit(X_train, Y_train, callbacks=[tensorboard_logger])
        ```
    """

    def __init__(self,
                 logdir: Optional[str] = None,
                 update_freq: Union[str, int] = "epoch",
                 **kwargs) -> None:
        """
        Arguments:
            logdir: Save directory location. Default is
                runs/**CURRENT_DATETIME_HOSTNAME**, which changes after each run.
                Use a hierarchical folder structure to compare between runs easily,
                e.g. pass in 'runs/exp1', 'runs/exp2', etc. for each new experiment.
            update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
                writes the losses and metrics to TensorBoard after each batch.
                The same applies for `'epoch'`.
            **kwargs: Options to pass to the `SummaryWriter` object.
        """
        self.logdir = logdir
        self.writer = None
        self.keys = None
        if update_freq == "batch":
            self.update_freq = 1
        else:
            self.update_freq = update_freq
        self._open_args = kwargs if kwargs else {}
        super(TensorBoard, self).__init__()

    def on_train_begin(self, logs=None):
        self.writer = SummaryWriter(self.logdir, **self._open_args)

    def on_train_batch_end(self, batch: int, logs):
        if self.update_freq == "epoch":
            return
        logs = logs or {}

        def handle_value(k):
            is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
            if isinstance(k, six.string_types):
                return k
            elif isinstance(k, tp.Iterable) and not is_zero_dim_ndarray:
                return '"[%s]"' % (", ".join(map(str, k)))
            else:
                return k

        if self.update_freq != "epoch" and batch % self.update_freq == 0:
            if self.keys is None:
                self.keys = sorted(logs.keys())
            row_dict = collections.OrderedDict({"batch": batch})
            row_dict.update((key + "batch", handle_value(logs[key])) for key in self.keys)
            for key, value in row_dict.items():
                self.writer.add_scalar(key, value, batch)

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}

        def handle_value(k):
            is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
            if isinstance(k, six.string_types):
                return k
            elif isinstance(k, tp.Iterable) and not is_zero_dim_ndarray:
                return '"[%s]"' % (", ".join(map(str, k)))
            else:
                return k

        if self.keys is None:
            self.keys = sorted(logs.keys())
        row_dict = collections.OrderedDict({"epoch": epoch})
        row_dict.update((key, handle_value(logs[key])) for key in self.keys)
        for key, value in row_dict.items():
            self.writer.add_scalar(key, value, epoch)

    def on_train_end(self, logs=None):
        self.writer.close()
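# Hypothetical usage sketch: log every 100 batches instead of once per epoch.
# `model`, `X_train`, and `Y_train` follow the fit/callback API shown in the
# class docstring above.
tb = TensorBoard(logdir="runs/exp1", update_freq=100)
model.fit(X_train, Y_train, callbacks=[tb])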
class Trainer:
    experiment_name = None

    def __init__(
        self,
        net,
        criterion=None,
        metric=cal_accuracy,
        train_dataloader=None,
        val_dataloader=None,
        test_dataloader=None,
        optimizer=None,
        lr_scheduler=None,
        tensorboard_dir="./pinkblack_tb/",
        ckpt="./ckpt/ckpt.pth",
        experiment_id=None,
        clip_gradient_norm=False,
        is_data_dict=False,
    ):
        """
        :param net: nn.Module network
        :param criterion: loss function. __call__(prediction, *batch_y)
        :param metric: metric function __call__(prediction, *batch_y).
            *note*: bigger is better (early stopping keeps the larger metric value).
        :param train_dataloader:
        :param val_dataloader:
        :param test_dataloader:
        :param optimizer: torch.optim
        :param lr_scheduler:
        :param tensorboard_dir: tensorboard log directory
        :param ckpt: weight path
        :param experiment_id: shown on tensorboard
        :param clip_gradient_norm: False or a scalar value
            (if a number is given, gradients are clipped to that norm).
        :param is_data_dict: whether the dataloaders yield dicts
            (if not, they are expected to yield (x, y) tuples).
        """
        self.net = net
        self.criterion = nn.CrossEntropyLoss() if criterion is None else criterion
        self.metric = metric

        self.dataloader = dict()
        if train_dataloader is not None:
            self.dataloader["train"] = train_dataloader
        if val_dataloader is not None:
            self.dataloader["val"] = val_dataloader
        if test_dataloader is not None:
            self.dataloader["test"] = test_dataloader

        if train_dataloader is None or val_dataloader is None:
            logging.warning("Init Trainer :: Two dataloaders are needed!")

        self.optimizer = (Adam(filter(lambda p: p.requires_grad, self.net.parameters()))
                          if optimizer is None else optimizer)
        self.lr_scheduler = lr_scheduler
        self.ckpt = ckpt

        self.config = defaultdict(float)
        self.config["max_train_metric"] = -1e8
        self.config["max_val_metric"] = -1e8
        self.config["max_test_metric"] = -1e8
        self.config["tensorboard_dir"] = tensorboard_dir
        self.config["timestamp"] = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.config["clip_gradient_norm"] = clip_gradient_norm
        self.config["is_data_dict"] = is_data_dict

        if experiment_id is None:
            self.config["experiment_id"] = self.config["timestamp"]
        else:
            self.config["experiment_id"] = experiment_id

        self.dataframe = pd.DataFrame()

        self.device = Trainer.get_model_device(self.net)
        if self.device == torch.device("cpu"):
            logging.warning(
                "Init Trainer :: Do you really want to train the network on CPU instead of GPU?")

        if self.config["tensorboard_dir"] is not None:
            self.tensorboard = SummaryWriter(self.config["tensorboard_dir"])
        else:
            self.tensorboard = None

        self.callbacks = defaultdict(list)

    def register_callback(self, func, phase="val"):
        self.callbacks[phase].append(func)

    def save(self, f=None):
        if f is None:
            f = self.ckpt
        os.makedirs(os.path.dirname(f), exist_ok=True)
        if isinstance(self.net, nn.DataParallel):
            state_dict = self.net.module.state_dict()
        else:
            state_dict = self.net.state_dict()
        torch.save(state_dict, f)
        torch.save(self.optimizer.state_dict(), f + ".optimizer")
        if self.lr_scheduler is not None:
            torch.save(self.lr_scheduler.state_dict(), f + ".scheduler")
        with open(f + ".config", "w") as fp:
            json.dump(self.config, fp)
        self.dataframe.to_csv(f + ".csv", float_format="%.6f", index=False)

    def load(self, f=None):
        if f is None:
            f = self.ckpt
        if isinstance(self.net, nn.DataParallel):
            self.net.module.load_state_dict(torch.load(f, map_location=self.device))
        else:
            self.net.load_state_dict(torch.load(f, map_location=self.device))

        if os.path.exists(f + ".config"):
            with open(f + ".config", "r") as fp:
                dic = json.loads(fp.read())
            self.config = defaultdict(float, dic)
            print("Loaded,", self.config)

        if os.path.exists(f + ".optimizer"):
            self.optimizer.load_state_dict(torch.load(f + ".optimizer"))

        if os.path.exists(f + ".scheduler") and self.lr_scheduler is not None:
            self.lr_scheduler.load_state_dict(torch.load(f + ".scheduler"))

        if os.path.exists(f + ".csv"):
            self.dataframe = pd.read_csv(f + ".csv")

        if self.config["tensorboard_dir"] is not None:
            self.tensorboard = SummaryWriter(self.config["tensorboard_dir"])
        else:
            self.tensorboard = None

    def train(self, epoch=None, phases=None, step=None,
              validation_interval=1, save_every_validation=False):
        """
        :param epoch: number of passes over the train dataloader
        :param phases: any unneeded phase can be dropped from ['train', 'val', 'test'],
            e.g. trainer.train(1, phases=['val'])
        :param step: total number of steps, when training by step instead of by epoch
        :param validation_interval: interval between validations
        :param save_every_validation: if True, save a checkpoint at every validation
        :return: None
        """
        if phases is None:
            phases = list(self.dataloader.keys())

        if epoch is None and step is None:
            raise ValueError("PinkBlack.trainer :: epoch or step should be specified.")

        train_unit = "epoch" if step is None else "step"
        self.config[train_unit] = int(self.config[train_unit])
        num_unit = epoch if step is None else step
        validation_interval = 1 if validation_interval <= 0 else validation_interval

        kwarg_list = [train_unit]
        for phase in phases:
            kwarg_list += [f"{phase}_loss", f"{phase}_metric"]
        kwarg_list += ["lr", "time"]

        print_row(kwarg_list=[""] * len(kwarg_list), pad="-")
        print_row(kwarg_list=kwarg_list, pad=" ")
        print_row(kwarg_list=[""] * len(kwarg_list), pad="-")

        start = self.config[train_unit]
        for i in range(start, start + num_unit, validation_interval):
            start_time = time()
            if train_unit == "epoch":
                for phase in phases:
                    self.config[f"{phase}_loss"], self.config[f"{phase}_metric"] = \
                        self._train(phase, num_steps=len(self.dataloader[phase]))
                    for func in self.callbacks[phase]:
                        func()
                self.config[train_unit] += 1
            elif train_unit == "step":
                for phase in phases:
                    if phase == "train":
                        # handle num_unit not being divisible by validation_interval
                        num_steps = min((start + num_unit - i), validation_interval)
                        self.config[train_unit] += num_steps
                    else:
                        num_steps = len(self.dataloader[phase])
                    self.config[f"{phase}_loss"], self.config[f"{phase}_metric"] = \
                        self._train(phase, num_steps=num_steps)
                    for func in self.callbacks[phase]:
                        func()
            else:
                raise NotImplementedError

            if self.lr_scheduler is not None:
                if isinstance(self.lr_scheduler, ReduceLROnPlateau):
                    self.lr_scheduler.step(self.config["val_metric"])
                else:
                    self.lr_scheduler.step()

            i_str = str(self.config[train_unit])
            is_best = self.config["max_val_metric"] < self.config["val_metric"]
            if is_best:
                for phase in phases:
                    self.config[f"max_{phase}_metric"] = max(
                        self.config[f"max_{phase}_metric"], self.config[f"{phase}_metric"])
                i_str = str(self.config[train_unit]) + "-best"

            elapsed_time = time() - start_time
            if self.tensorboard is not None:
                _loss, _metric = {}, {}
                for phase in phases:
                    _loss[phase] = self.config[f"{phase}_loss"]
                    _metric[phase] = self.config[f"{phase}_metric"]
                self.tensorboard.add_scalars(
                    f"{self.config['experiment_id']}/loss", _loss, self.config[train_unit])
                self.tensorboard.add_scalars(
                    f"{self.config['experiment_id']}/metric", _metric, self.config[train_unit])
                self.tensorboard.add_scalar(
                    f"{self.config['experiment_id']}/time", elapsed_time, self.config[train_unit])
                self.tensorboard.add_scalar(
                    f"{self.config['experiment_id']}/lr",
                    self.optimizer.param_groups[0]["lr"],
                    self.config[train_unit],
                )

            print_kwarg = [i_str]
            for phase in phases:
                print_kwarg += [self.config[f"{phase}_loss"], self.config[f"{phase}_metric"]]
            print_kwarg += [self.optimizer.param_groups[0]["lr"], elapsed_time]

            print_row(kwarg_list=print_kwarg, pad=" ")
            print_row(kwarg_list=[""] * len(kwarg_list), pad="-")
            self.dataframe = self.dataframe.append(
                dict(zip(kwarg_list, print_kwarg)), ignore_index=True)

            if is_best:
                self.save(self.ckpt)

            if Trainer.experiment_name is not None:
                self.update_experiment()

            if save_every_validation:
                self.save(self.ckpt + f"-{self.config[train_unit]}")

    def _step(self, phase, iterator, only_inference=False):
        if self.config["is_data_dict"]:
            batch_dict = next(iterator)
            batch_size = batch_dict[list(batch_dict.keys())[0]].size(0)
            for k, v in batch_dict.items():
                batch_dict[k] = v.to(self.device)
        else:
            batch_x, batch_y = next(iterator)
            if isinstance(batch_x, list):
                batch_x = [x.to(self.device) for x in batch_x]
            else:
                batch_x = [batch_x.to(self.device)]
            if isinstance(batch_y, list):
                batch_y = [y.to(self.device) for y in batch_y]
            else:
                batch_y = [batch_y.to(self.device)]
            batch_size = batch_x[0].size(0)

        self.optimizer.zero_grad()
        with torch.set_grad_enabled(phase == "train"):
            if self.config["is_data_dict"]:
                outputs = self.net(batch_dict)
                if not only_inference:
                    loss = self.criterion(outputs, batch_dict)
            else:
                outputs = self.net(*batch_x)
                if not only_inference:
                    loss = self.criterion(outputs, *batch_y)

            if only_inference:
                return outputs

            if phase == "train":
                loss.backward()
                if self.config["clip_gradient_norm"]:
                    clip_grad_norm_(self.net.parameters(), self.config["clip_gradient_norm"])
                self.optimizer.step()

        with torch.no_grad():
            if self.config["is_data_dict"]:
                metric = self.metric(outputs, batch_dict)
            else:
                metric = self.metric(outputs, *batch_y)

        return {"loss": loss.item(), "batch_size": batch_size, "metric": metric.item()}

    def _train(self, phase, num_steps=0):
        running_loss = AverageMeter()
        running_metric = AverageMeter()

        if phase == "train":
            self.net.train()
        else:
            self.net.eval()

        dataloader = self.dataloader[phase]
        step_iterator = iter(dataloader)
        tq = tqdm(range(num_steps), leave=False)
        for st in tq:
            # restart the iterator once the dataloader is exhausted
            if st != 0 and st % len(dataloader) == 0:
                step_iterator = iter(dataloader)
            results = self._step(phase=phase, iterator=step_iterator)
            tq.set_description(f"Loss:{results['loss']:.4f}, Metric:{results['metric']:.4f}")
            running_loss.update(results["loss"], results["batch_size"])
            running_metric.update(results["metric"], results["batch_size"])
        return running_loss.avg, running_metric.avg

    def eval(self, dataloader=None):
        self.net.eval()
        if dataloader is None:
            dataloader = self.dataloader["val"]

        output_list = []
        step_iterator = iter(dataloader)
        num_steps = len(dataloader)
        for st in tqdm(range(num_steps), leave=False):
            results = self._step(phase="val", iterator=step_iterator, only_inference=True)
            output_list.append(results)
        output_cat = torch.cat(output_list)
        return output_cat

    def add_external_config(self, args):
        """
        args: a dict-like object which contains key-value configurations.
        """
        new_d = defaultdict(float)
        for k, v in args.items():
            new_d[f"config_{k}"] = v
        self.config.update(new_d)

    def update_experiment(self):
        """
        Update experiment statistics by name (csv file).
        """
        assert Trainer.experiment_name is not None
        df_config = pd.DataFrame(pd.Series(self.config)).T.set_index("experiment_id")
        if os.path.exists(Trainer.experiment_name + ".csv"):
            df_ex = pd.read_csv(Trainer.experiment_name + ".csv", index_col=0)
            if self.config["experiment_id"] in df_ex.index:
                df_ex = df_ex.drop(self.config["experiment_id"])
            df_ex = df_ex.append(df_config, sort=False)
        else:
            df_ex = df_config
        df_ex.to_csv(Trainer.experiment_name + ".csv")
        return df_ex

    def swa_apply(self, bn_update=True):
        assert hasattr(self.optimizer, "swap_swa_sgd")
        self.optimizer.swap_swa_sgd()
        if bn_update:
            self.swa_bn_update()

    def swa_bn_update(self):
        r"""Updates the BatchNorm running_mean and running_var buffers in the model.

        Performs one pass over the data in the train loader to estimate the
        activation statistics for the BatchNorm layers in the model.
        Original source: torchcontrib.
        """
        if not check_bn(self.net):
            return
        was_training = self.net.training
        self.net.train()
        momenta = {}
        self.net.apply(reset_bn)
        self.net.apply(lambda module: get_momenta(module, momenta))
        n = 0
        for input in self.dataloader['train']:
            if isinstance(input, (list, tuple)):
                input = input[0]
                b = input.size(0)
                input = input.to(self.device)
            elif self.config['is_data_dict']:
                b = input[list(input.keys())[0]].size(0)
                for k, v in input.items():
                    input[k] = v.to(self.device)
            else:
                b = input.size(0)
                input = input.to(self.device)

            momentum = b / float(n + b)
            for module in momenta.keys():
                module.momentum = momentum

            self.net(input)
            n += b

        self.net.apply(lambda module: set_momenta(module, momenta))
        self.net.train(was_training)

    @staticmethod
    def get_model_device(net):
        device = torch.device("cpu")
        for param in net.parameters():
            device = param.device
            break
        return device

    @staticmethod
    def set_experiment_name(name):
        Trainer.experiment_name = name
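# Hypothetical usage sketch: `resnet`, `train_loader`, and `val_loader` are
# placeholders; the constructor arguments mirror those documented in __init__.
trainer = Trainer(
    resnet,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    tensorboard_dir="./pinkblack_tb/",
    ckpt="./ckpt/resnet.pth",
    experiment_id="resnet_baseline",
)
trainer.train(epoch=30)  # or, e.g., trainer.train(step=10000, validation_interval=500)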
class TensorBoard(Callback):
    """Callback that streams epoch results to the tensorboard events folder.

    Supports all values that can be represented as a string, including
    1D iterables such as `np.ndarray`.

    ```python
    tensorboard_logger = TensorBoard('runs')
    model.fit(X_train, Y_train, callbacks=[tensorboard_logger])
    ```
    """

    def __init__(
        self,
        logdir: Optional[str] = None,
        *,
        update_freq: Union[str, int] = "epoch",
        purge_step: Optional[int] = None,
        comment: str = "",
    ) -> None:
        """
        Arguments:
            logdir: Save directory location. Default is
                runs/**CURRENT_DATETIME_HOSTNAME**/{train, val}, which changes
                after each run. Use a hierarchical folder structure to compare
                between runs easily, e.g. pass in 'runs/exp1', 'runs/exp2', etc.
                for each new experiment.
            update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
                writes the losses and metrics to TensorBoard after each batch.
                The same applies for `'epoch'`. If using an integer, say `1000`,
                the callback writes the metrics and losses to TensorBoard every
                1000 batches. Note that writing too frequently to TensorBoard
                can slow down your training.
            purge_step (int): When logging crashes at step :math:`T+X` and
                restarts at step :math:`T`, any events whose global_step is
                larger than or equal to :math:`T` will be purged and hidden from
                TensorBoard. Note that crashed and resumed experiments should
                have the same ``logdir``.
            comment (string): Comment logdir suffix appended to the default
                ``logdir``. If ``logdir`` is assigned, this argument has no effect.
        """
        if not logdir:
            import socket
            from datetime import datetime

            current_time = datetime.now().strftime("%b%d_%H-%M-%S")
            self.logdir = os.path.join(
                "runs", current_time + "_" + socket.gethostname() + comment)
        else:
            self.logdir = logdir

        self.train_writer = None
        self.val_writer = None
        self.keys = None
        self.write_per_batch = True
        try:
            self.update_freq = int(update_freq)
        except ValueError as e:
            self.update_freq = 1
            if update_freq == "batch":
                self.write_per_batch = True
            elif update_freq == "epoch":
                self.write_per_batch = False
            else:
                raise e
        self.purge_step = purge_step
        super(TensorBoard, self).__init__()

    def on_train_begin(self, logs=None):
        self.train_writer = SummaryWriter(
            os.path.join(self.logdir, "train"), purge_step=self.purge_step)
        self.val_writer = SummaryWriter(
            os.path.join(self.logdir, "val"), purge_step=self.purge_step)
        self.steps = self.params["steps"]
        self.global_step = 0

    def on_train_batch_end(self, batch: int, logs=None):
        if not self.write_per_batch:
            return
        logs = logs or {}
        self.global_step = batch + self.current_epoch * self.steps
        if self.global_step % self.update_freq == 0:
            if self.keys is None:
                self.keys = logs.keys()
            for key in self.keys:
                self.train_writer.add_scalar(key, logs[key], self.global_step)

    def on_epoch_begin(self, epoch: int, logs=None):
        self.current_epoch = epoch

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        if self.keys is None:
            self.keys = logs.keys()

        # logs passed to on_{train, test}_batch_end do not include val metrics
        if self.write_per_batch:
            for key in logs:
                if "val" in key:
                    self.val_writer.add_scalar(
                        key.replace("val_", ""), logs[key], self.global_step)
            return
        elif epoch % self.update_freq == 0:
            for key in self.keys:
                if "val" in key:
                    self.val_writer.add_scalar(key.replace("val_", ""), logs[key], epoch)
                else:
                    self.train_writer.add_scalar(key, logs[key], epoch)

    def on_train_end(self, logs=None):
        self.train_writer.close()
        self.val_writer.close()
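# Hypothetical usage sketch: the separate train/ and val/ writers let
# TensorBoard overlay both curves on a single chart. `model` and the
# Keras-style `validation_data` kwarg are assumptions about the host API.
tb = TensorBoard(logdir="runs/exp2", update_freq=1000)
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), callbacks=[tb])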