class TensorBoard(Callback):
    """Callback that streams training progress to TensorBoard.

    Writes the model graph at the start of training, then scalar metrics and
    parameter histograms every ``update_frequency`` epochs.

    TODO: add option to write images; find fix for graph
    """

    def __init__(self, log_dir, update_frequency=10):
        # FIX: the original called super(Callback, self).__init__(), which
        # starts the MRO lookup *after* Callback and therefore skips
        # Callback's own initializer. super() resolves correctly.
        super().__init__()
        self.log_dir = log_dir                    # root directory for event files
        self.writer = None                        # created lazily in on_train_begin
        self.update_frequency = update_frequency  # log every N epochs

    def on_train_begin(self, **_):
        """Open a timestamped SummaryWriter and trace the model graph."""
        # One sub-directory per run, named by wall-clock start time.
        self.writer = SummaryWriter(os.path.join(self.log_dir, str(datetime.datetime.now())))
        # A random tensor of the model's declared input shape is enough for tracing.
        rndm_input = torch.autograd.Variable(
            torch.rand(1, *self.model.input_shape), requires_grad=True
        ).to(self.logger['device'])
        # fwd_pass = self.model(rndm_input)
        self.writer.add_graph(self.model, rndm_input)
        return self

    def on_epoch_end(self, **_):
        """Every `update_frequency` epochs, log scalar metrics and parameter histograms."""
        if (self.logger['epoch'] % self.update_frequency) == 0:
            epoch_metrics = self.logger['epoch_metrics'][self.logger['epoch']]
            # FIX: dict.iteritems() is Python 2 only; .items() is the Python 3 API.
            for e_metric, e_metric_dct in epoch_metrics.items():
                for e_metric_split, e_metric_val in e_metric_dct.items():
                    self.writer.add_scalar('{}/{}'.format(e_metric_split, e_metric),
                                           e_metric_val, self.logger['epoch'])
            for name, param in self.model.named_parameters():
                # '/' groups parameter histograms hierarchically in the TB UI.
                self.writer.add_histogram(name.replace('.', '/'),
                                          param.clone().cpu().data.numpy(),
                                          self.logger['epoch'])
        return self

    def on_train_end(self, **_):
        """Flush and close the event writer."""
        return self.writer.close()
def learn(learning_rate, iterations, x, y, validation=None, stop_early=False, run_comment=''):
    """Fit a single linear layer to (x, y) with full-batch Adam steps.

    :param learning_rate: Adam learning rate
    :param iterations: number of gradient steps over the full batch
    :param x: training inputs
    :param y: training targets
    :param validation: optional (inputs, targets) pair for validation loss
    :param stop_early: stop (and restore the previous weights) as soon as
        validation loss rises
    :param run_comment: suffix appended to the TensorBoard run name
    :return: the trained model
    """
    writer = SummaryWriter(comment=run_comment)
    # Define a neural network using high-level modules:
    # n inputs -> 1 output per target column.
    model = Sequential(Linear(len(x[0]), len(y[0]), bias=True))
    # reduction=mean converges slower.
    # TODO: Add an option to twiddle pos_weight, which lets us trade off
    # precision and recall. Maybe also graph using add_pr_curve(), which can
    # show how that tradeoff is going.
    loss_fn = BCEWithLogitsLoss(reduction='sum')
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if validation:
        validation_ins, validation_outs = validation
        previous_validation_loss = None

    with progressbar(range(iterations)) as bar:
        for step in bar:
            predictions = model(x)  # Make predictions.
            loss = loss_fn(predictions, y)
            writer.add_scalar('loss', loss, step)

            if validation:
                validation_loss = loss_fn(model(validation_ins), validation_outs)
                if stop_early:
                    rose = (previous_validation_loss is not None
                            and previous_validation_loss < validation_loss)
                    if rose:
                        print('Stopping early at iteration {t} because validation error rose.'.format(t=step))
                        model.load_state_dict(previous_model)
                        break
                    # Remember the current state so we can roll back one step.
                    previous_validation_loss = validation_loss
                    previous_model = model.state_dict()
                writer.add_scalar('validation_loss', validation_loss, step)

            writer.add_scalar('training_accuracy_per_tag', accuracy_per_tag(model, x, y), step)

            optimizer.zero_grad()  # Zero the gradients.
            loss.backward()        # Compute gradients.
            optimizer.step()

            # Horizontal axis is what confidence. Vertical is how many samples
            # were that confidence.
            writer.add_histogram('confidence', confidences(model, x), step)

    writer.close()
    return model
optim_reglar.zero_grad() loss_normal.backward() loss_wdecay.backward() optim_normal.step() optim_reglar.step() if (epoch + 1) % disp_interval == 0: net_prob_0.eval() net_prob_05.eval() # 可视化 for name, layer in net_prob_0.named_parameters(): writer.add_histogram(name + '_grad_normal', layer.grad, epoch) writer.add_histogram(name + '_data_normal', layer, epoch) for name, layer in net_prob_05.named_parameters(): writer.add_histogram(name + '_grad_regularization', layer.grad, epoch) writer.add_histogram(name + '_data_regularization', layer, epoch) test_pred_prob_0, test_pred_prob_05 = net_prob_0(test_x), net_prob_05( test_x) # 绘图 plt.scatter(train_x.data.numpy(), train_y.data.numpy(), c='blue', s=50,
class UNet3DTrainer:
    """3D UNet trainer.

    Args:
        model (Unet3D): UNet 3D model to be trained
        optimizer (nn.optim.Optimizer): optimizer used for training
        lr_scheduler (torch.optim.lr_scheduler._LRScheduler): learning rate scheduler
            WARN: bear in mind that lr_scheduler.step() is invoked after every validation step
            (i.e. validate_after_iters) not after every epoch. So e.g. if one uses StepLR with step_size=30
            the learning rate will be adjusted after every 30 * validate_after_iters iterations.
        loss_criterion (callable): loss function
        eval_criterion (callable): used to compute training/validation metric (such as Dice, IoU, AP or Rand score)
            saving the best checkpoint is based on the result of this function on the validation set
        device (torch.device): device to train on
        loaders (dict): 'train' and 'val' loaders
        checkpoint_dir (string): dir for saving checkpoints and tensorboard logs
        max_num_epochs (int): maximum number of epochs
        max_num_iterations (int): maximum number of iterations
        validate_after_iters (int): validate after that many iterations
        log_after_iters (int): number of iterations before logging to tensorboard
        validate_iters (int): number of validation iterations, if None validate
            on the whole validation set
        eval_score_higher_is_better (bool): if True higher eval scores are considered better
        best_eval_score (float): best validation score so far (higher better)
        num_iterations (int): useful when loading the model from the checkpoint
        num_epoch (int): useful when loading the model from the checkpoint
        tensorboard_formatter (callable): converts a given batch of input/output/target image to a series of images
            that can be displayed in tensorboard
        skip_train_validation (bool): if True eval_criterion is not evaluated on the training set (used mostly when
            evaluation is expensive)
    """

    def __init__(self, model, optimizer, lr_scheduler, loss_criterion,
                 eval_criterion, device, loaders, checkpoint_dir,
                 max_num_epochs=100, max_num_iterations=1e5,
                 validate_after_iters=100, log_after_iters=100,
                 validate_iters=None, num_iterations=1, num_epoch=0,
                 eval_score_higher_is_better=True, best_eval_score=None,
                 tensorboard_formatter=None, skip_train_validation=False):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = lr_scheduler
        self.loss_criterion = loss_criterion
        self.eval_criterion = eval_criterion
        self.device = device
        self.loaders = loaders
        self.checkpoint_dir = checkpoint_dir
        self.max_num_epochs = max_num_epochs
        self.max_num_iterations = max_num_iterations
        self.validate_after_iters = validate_after_iters
        self.log_after_iters = log_after_iters
        self.validate_iters = validate_iters
        self.eval_score_higher_is_better = eval_score_higher_is_better

        logger.info(model)
        logger.info(f'eval_score_higher_is_better: {eval_score_higher_is_better}')

        if best_eval_score is not None:
            # resuming: keep the best score recorded in the checkpoint
            self.best_eval_score = best_eval_score
        else:
            # initialize the best_eval_score so the first validation always improves it
            if eval_score_higher_is_better:
                self.best_eval_score = float('-inf')
            else:
                self.best_eval_score = float('+inf')

        self.writer = SummaryWriter(log_dir=os.path.join(checkpoint_dir, 'logs'))

        assert tensorboard_formatter is not None, 'TensorboardFormatter must be provided'
        self.tensorboard_formatter = tensorboard_formatter

        self.num_iterations = num_iterations
        self.num_epoch = num_epoch
        self.skip_train_validation = skip_train_validation

    @classmethod
    def from_checkpoint(cls, checkpoint_path, model, optimizer, lr_scheduler, loss_criterion, eval_criterion,
                        loaders, tensorboard_formatter=None, skip_train_validation=False):
        """Alternate constructor: restore model/optimizer state and trainer
        counters from a saved checkpoint; logs go next to the checkpoint."""
        logger.info(f"Loading checkpoint '{checkpoint_path}'...")
        state = utils.load_checkpoint(checkpoint_path, model, optimizer)
        logger.info(
            f"Checkpoint loaded. Epoch: {state['epoch']}. Best val score: {state['best_eval_score']}. "
            f"Num_iterations: {state['num_iterations']}")
        checkpoint_dir = os.path.split(checkpoint_path)[0]
        return cls(model, optimizer, lr_scheduler,
                   loss_criterion, eval_criterion,
                   torch.device(state['device']),
                   loaders, checkpoint_dir,
                   eval_score_higher_is_better=state['eval_score_higher_is_better'],
                   best_eval_score=state['best_eval_score'],
                   num_iterations=state['num_iterations'],
                   num_epoch=state['epoch'],
                   max_num_epochs=state['max_num_epochs'],
                   max_num_iterations=state['max_num_iterations'],
                   validate_after_iters=state['validate_after_iters'],
                   log_after_iters=state['log_after_iters'],
                   validate_iters=state['validate_iters'],
                   tensorboard_formatter=tensorboard_formatter,
                   skip_train_validation=skip_train_validation)

    @classmethod
    def from_pretrained(cls, pre_trained, model, optimizer, lr_scheduler, loss_criterion, eval_criterion,
                        device, loaders,
                        max_num_epochs=100, max_num_iterations=1e5,
                        validate_after_iters=100, log_after_iters=100,
                        validate_iters=None, num_iterations=1, num_epoch=0,
                        eval_score_higher_is_better=True, best_eval_score=None,
                        tensorboard_formatter=None, skip_train_validation=False):
        """Alternate constructor: load only the model weights from `pre_trained`
        (optimizer state is not restored) and start a fresh training run."""
        logger.info(f"Logging pre-trained model from '{pre_trained}'...")
        utils.load_checkpoint(pre_trained, model, None)
        checkpoint_dir = os.path.split(pre_trained)[0]
        return cls(model, optimizer, lr_scheduler,
                   loss_criterion, eval_criterion,
                   device, loaders, checkpoint_dir,
                   eval_score_higher_is_better=eval_score_higher_is_better,
                   best_eval_score=best_eval_score,
                   num_iterations=num_iterations,
                   num_epoch=num_epoch,
                   max_num_epochs=max_num_epochs,
                   max_num_iterations=max_num_iterations,
                   validate_after_iters=validate_after_iters,
                   log_after_iters=log_after_iters,
                   validate_iters=validate_iters,
                   tensorboard_formatter=tensorboard_formatter,
                   skip_train_validation=skip_train_validation)

    def fit(self):
        """Run training epochs until the stopping criterion or max epochs is reached."""
        for _ in range(self.num_epoch, self.max_num_epochs):
            # train for one epoch
            should_terminate = self.train(self.loaders['train'])

            if should_terminate:
                logger.info('Stopping criterion is satisfied. Finishing training')
                return

            self.num_epoch += 1
        logger.info(f"Reached maximum number of epochs: {self.max_num_epochs}. Finishing training...")

    def train(self, train_loader):
        """Trains the model for 1 epoch.

        Args:
            train_loader (torch.utils.data.DataLoader): training data loader

        Returns:
            True if the training should be terminated immediately, False otherwise
        """
        train_losses = utils.RunningAverage()
        train_eval_scores = utils.RunningAverage()

        # sets the model in training mode
        self.model.train()

        for i, t in enumerate(train_loader):
            logger.info(
                f'Training iteration {self.num_iterations}. Batch {i}. Epoch [{self.num_epoch}/{self.max_num_epochs - 1}]')

            input, target, weight = self._split_training_batch(t)

            output, loss = self._forward_pass(input, target, weight)
            train_losses.update(loss.item(), self._batch_size(input))

            # compute gradients and update parameters
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.num_iterations % self.validate_after_iters == 0:
                # set the model in eval mode
                self.model.eval()
                # evaluate on validation set
                eval_score = self.validate(self.loaders['val'])
                # set the model back to training mode
                self.model.train()

                # adjust learning rate if necessary
                if isinstance(self.scheduler, ReduceLROnPlateau):
                    # NOTE(review): eval_score is the running average returned by
                    # validate(); indexing [1] presumably selects one component of a
                    # per-class score array — confirm against eval_criterion's output.
                    self.scheduler.step(eval_score[1])
                else:
                    self.scheduler.step()
                # log current learning rate in tensorboard
                self._log_lr()
                # remember best validation metric
                is_best = self._is_best_eval_score(eval_score[1])

                # save checkpoint
                self._save_checkpoint(is_best)

            if self.num_iterations % self.log_after_iters == 0:
                # if model contains final_activation layer for normalizing logits apply it, otherwise both
                # the evaluation metric as well as images in tensorboard will be incorrectly computed
                if hasattr(self.model, 'final_activation') and self.model.final_activation is not None:
                    output = self.model.final_activation(output)

                # compute eval criterion
                if not self.skip_train_validation:
                    eval_score = self.eval_criterion(output, target)
                    train_eval_scores.update(eval_score.numpy(), self._batch_size(input))

                # log stats, params and images
                logger.info(
                    f'Training stats. Loss: {train_losses.avg}. Evaluation score: {train_eval_scores.avg}')
                self._log_stats('train', [train_losses.avg], train_eval_scores.avg)
                self._log_params()
                self._log_images(input, target, output, 'train_')

            if self.should_stop():
                return True

            self.num_iterations += 1

        return False

    def should_stop(self):
        """
        Training will terminate if maximum number of iterations is exceeded or the learning rate drops below
        some predefined threshold (1e-6 in our case)
        """
        if self.max_num_iterations < self.num_iterations:
            logger.info(f'Maximum number of iterations {self.max_num_iterations} exceeded.')
            return True

        min_lr = 1e-6
        lr = self.optimizer.param_groups[0]['lr']
        if lr < min_lr:
            logger.info(f'Learning rate below the minimum {min_lr}.')
            return True

        return False

    def validate(self, val_loader):
        """Run a (possibly truncated) pass over the validation loader and return
        the running-average evaluation score."""
        logger.info('Validating...')

        val_losses = utils.RunningAverage()
        val_scores = utils.RunningAverage()

        with torch.no_grad():
            for i, t in enumerate(val_loader):
                logger.info(f'Validation iteration {i}')

                input, target, weight = self._split_training_batch(t)

                output, loss = self._forward_pass(input, target, weight)
                val_losses.update(loss.item(), self._batch_size(input))

                # if model contains final_activation layer for normalizing logits apply it, otherwise
                # the evaluation metric will be incorrectly computed
                if hasattr(self.model, 'final_activation') and self.model.final_activation is not None:
                    output = self.model.final_activation(output)

                if i % 100 == 0:
                    self._log_images(input, target, output, 'val_')

                eval_score = self.eval_criterion(output, target)
                val_scores.update(eval_score.numpy(), self._batch_size(input))

                if self.validate_iters is not None and self.validate_iters <= i:
                    # stop validation
                    break

            self._log_stats('val', [val_losses.avg], val_scores.avg)
            logger.info(f'Validation finished. Loss: {val_losses.avg}. Evaluation score: {val_scores.avg}')
            return val_scores.avg

    def _split_training_batch(self, t):
        """Move a loader batch to self.device and unpack it into
        (input, target, weight); weight is None for 2-tuples."""
        def _move_to_device(input):
            # recurse into (nested) tuples/lists of tensors
            if isinstance(input, tuple) or isinstance(input, list):
                return tuple([_move_to_device(x) for x in input])
            else:
                return input.to(self.device)

        t = _move_to_device(t)
        weight = None
        if len(t) == 2:
            input, target = t
        else:
            input, target, weight = t
        return input, target, weight

    def _forward_pass(self, input, target, weight=None):
        """Run the model and compute the loss; weight is forwarded to the
        criterion only when present."""
        # forward pass
        output = self.model(input)

        # compute the loss
        if weight is None:
            loss = self.loss_criterion(output, target)
        else:
            loss = self.loss_criterion(output, target, weight)

        return output, loss

    def _is_best_eval_score(self, eval_score):
        """Compare against the best score so far (direction depends on
        eval_score_higher_is_better) and record a new best if found."""
        if self.eval_score_higher_is_better:
            is_best = eval_score > self.best_eval_score
        else:
            is_best = eval_score < self.best_eval_score

        if is_best:
            logger.info(f'Saving new best evaluation metric: {eval_score}')
            self.best_eval_score = eval_score

        return is_best

    def _save_checkpoint(self, is_best):
        # remove `module` prefix from layer names when using `nn.DataParallel`
        # see: https://discuss.pytorch.org/t/solved-keyerror-unexpected-key-module-encoder-embedding-weight-in-state-dict/1686/20
        if isinstance(self.model, nn.DataParallel):
            state_dict = self.model.module.state_dict()
        else:
            state_dict = self.model.state_dict()

        utils.save_checkpoint({
            'epoch': self.num_epoch + 1,
            'num_iterations': self.num_iterations,
            'model_state_dict': state_dict,
            'best_eval_score': self.best_eval_score,
            'eval_score_higher_is_better': self.eval_score_higher_is_better,
            'optimizer_state_dict': self.optimizer.state_dict(),
            'device': str(self.device),
            'max_num_epochs': self.max_num_epochs,
            'max_num_iterations': self.max_num_iterations,
            'validate_after_iters': self.validate_after_iters,
            'log_after_iters': self.log_after_iters,
            'validate_iters': self.validate_iters
        }, is_best, checkpoint_dir=self.checkpoint_dir, logger=logger)

    def _log_lr(self):
        """Write the current learning rate of the first param group to tensorboard."""
        lr = self.optimizer.param_groups[0]['lr']
        self.writer.add_scalar('learning_rate', lr, self.num_iterations)

    def _log_stats(self, phase, loss_avg, eval_score_avg):
        """Log per-phase loss/eval averages; multi-element values are fanned out
        into one scalar per class under a common tag."""
        tag_value = {
            f'{phase}_loss_avg': loss_avg,
            f'{phase}_eval_score_avg': eval_score_avg
        }

        for tag, value in tag_value.items():
            # NOTE(review): assumes both values support len(); loss_avg is passed
            # as a one-element list by the callers in this class, eval_score_avg
            # presumably a per-class array — confirm for scalar eval criteria.
            if len(value) > 1:
                value_dict = {}
                for i in range(len(value)):
                    value_dict[tag + '_class' + str(i)] = value[i]
                self.writer.add_scalars(tag, value_dict, self.num_iterations)
            else:
                self.writer.add_scalar(tag, value, self.num_iterations)

    def _log_params(self):
        """Write histograms of all parameters and their gradients to tensorboard."""
        logger.info('Logging model parameters and gradients')
        for name, value in self.model.named_parameters():
            self.writer.add_histogram(name, value.data.cpu().numpy(), self.num_iterations)
            self.writer.add_histogram(name + '/grad', value.grad.data.cpu().numpy(), self.num_iterations)

    def _log_images(self, input, target, prediction, prefix=''):
        """Render input/target/prediction batches through the tensorboard
        formatter and write them as images."""
        inputs_map = {
            'inputs': input,
            'targets': target,
            'predictions': prediction
        }
        img_sources = {}
        for name, batch in inputs_map.items():
            if isinstance(batch, list) or isinstance(batch, tuple):
                for i, b in enumerate(batch):
                    img_sources[f'{name}{i}'] = b.data.cpu().numpy()
            else:
                img_sources[name] = batch.data.cpu().numpy()

        for name, batch in img_sources.items():
            for tag, image in self.tensorboard_formatter(name, batch):
                self.writer.add_image(prefix + tag, image, self.num_iterations, dataformats='CHW')

    @staticmethod
    def _batch_size(input):
        # for multi-input models the first tensor defines the batch size
        if isinstance(input, list) or isinstance(input, tuple):
            return input[0].size(0)
        else:
            return input.size(0)
class TBCallback(TrainingCallback):
    """Training callback that logs losses/metrics, parameter histograms,
    gradient histograms and (optionally) the model graph to TensorBoard."""

    def __init__(self, log_dir, input_dim=None):
        # input_dim: shape of the dummy input used to export the model graph;
        # when None the graph export in before_training is skipped.
        self.log_dir = log_dir
        self.input_dim = input_dim
        self.writer = SummaryWriter(log_dir)
        super().__init__()

    def before_training(self, model_trainer):
        """Export the model as ONNX and add its graph to TensorBoard, if
        input_dim was provided."""
        if self.input_dim is not None:
            dummy_input = cuda_move(Variable(torch.zeros(self.input_dim)))
            # NOTE(review): assumes log_dir ends with a path separator —
            # otherwise the proto file lands beside the log dir. Verify.
            model_file = self.log_dir + 'onnx_model.proto'
            torch.onnx.export(model_trainer.model, dummy_input, model_file, verbose=True)
            self.writer.add_graph_onnx(model_file)
        pass

    def after_epoch(self, model_trainer, train_data, validation_data):
        """Log the latest train/val loss and metric each epoch; every
        `validation_steps` iterations also log parameter/gradient histograms."""
        n_iter = model_trainer.global_step
        train_loss, train_metric = model_trainer.train_losses[-1], model_trainer.train_metrics[-1]
        val_loss, val_metric = model_trainer.val_losses[-1], model_trainer.val_metrics[-1]

        # data grouping by `slash`
        self.writer.add_scalar('data/train_loss', train_loss, n_iter)
        self.writer.add_scalar('data/train_metric', train_metric, n_iter)
        self.writer.add_scalar('data/val_loss', val_loss, n_iter)
        self.writer.add_scalar('data/val_metric', val_metric, n_iter)

        if n_iter % model_trainer.validation_steps == 0:
            # self.writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
            for name, param in model_trainer.model.named_parameters():
                self.writer.add_histogram('param/' + name, param.clone().cpu().data.numpy(),
                                          n_iter, bins='sturges')
            self._save_gradient_histograms(model_trainer, train_data)

    def after_training(self, model_trainer):
        """ Export scalar data to JSON for external processing
            and save final weights as images.
        """
        # for name, param in model_trainer.model.named_parameters():
        #     param = param.data.clone().cpu()
        #     if len(param.size()) == 2:
        #         # images should have size (width, height, channel)
        #         param = param.unsqueeze(2)
        #     elif len(param.size()) == 1:
        #         param = param.unsqueeze(1)
        #         param = param.unsqueeze(2)
        #     self.writer.add_image(name, param, model_trainer.global_step)
        self.writer.export_scalars_to_json("./all_scalars.json")
        self.writer.close()

    def _save_gradient_histograms(self, model_trainer, train_data):
        """Accumulate per-sample gradients over a random subset of the training
        data and log one histogram per parameter."""
        # Add gradient norm histogram
        n_iter = model_trainer.global_step
        random_shuffle = list(train_data.get_one_hot_list())
        random.shuffle(random_shuffle)

        # temporary per-parameter accumulator attached to each tensor
        for par in model_trainer.model.parameters():
            par.accumulated_grad = []

        n_samples = 100  # cap the number of per-sample backward passes
        for X_i, y_i in random_shuffle[:n_samples]:
            X_data, y_data = cuda_move(X_i), cuda_move(y_i)
            # TODO: backprop through thousand of time steps
            y_out = model_trainer.model.forward(X_data, logits=True)
            loss = F.binary_cross_entropy_with_logits(y_out, y_data)
            model_trainer.model.zero_grad()
            loss.backward()
            for par in model_trainer.model.parameters():
                par.accumulated_grad.append(par.grad)

        for name, par in model_trainer.model.named_parameters():
            t = torch.stack(par.accumulated_grad, 0)
            self.writer.add_histogram('grad/' + name, t.clone().cpu().data.numpy(),
                                      n_iter, bins='sturges')
            par.accumulated_grad = None  # release the accumulated tensors

    def __str__(self):
        return "TBCallback(logdir={})".format(self.log_dir)
class DeepQTrainer:
    """ A Deep Q Network trainer. Supports TD learning, Q learning, and Double Q learning

    NOTE(review): this class uses pre-0.4 PyTorch idioms throughout
    (`Variable(..., volatile=True)`, `.data[0]`, assigning `.volatile`);
    they are kept as-is because the surrounding code depends on that API.
    """

    def __init__(self, data, batch_size, epoch_limit, criterion, save_loc,
                 name, log_dir, gpu_device, lr, clip, on_policy, double_q,
                 num_workers, reset_rate, validate_rate):
        """
        :param data: A pytorch Dataset
        :param batch_size: training/validation batch size
        :param epoch_limit: number of training epochs
        :param criterion: training loss
        :param save_loc: save directory
        :param name: experiment name
        :param log_dir: save directory for tensorboard log files
        :param gpu_device: numbered gpu device on which to run
        :param lr: learning rate for ADAM optimizer
        :param clip: parameter for gradient clipping
        :param on_policy: boolean switch to use TD learning instead of Q learning
        :param double_q: boolean switch to use double Q learning instead of Q learning
        :param num_workers: number of data-loading threads to use
        :param reset_rate: rate to cache target network
        :param validate_rate: rate to check validation performance
        """
        self.writer = SummaryWriter(log_dir='{}/{}'.format(log_dir, name))

        # set various attributes
        attribute_dict = {
            'epoch_limit': epoch_limit,
            'save_loc': save_loc,
            'batch_size': batch_size,
            'criterion': criterion,
            'name': name,
            'gpu_device': gpu_device,
            'lr': lr,
            'clip': clip,
            'data': data,
            'on_policy': on_policy,
            'double_q': double_q,
            'reset_rate': reset_rate,
            'validate_rate': validate_rate
        }
        for key in attribute_dict:
            setattr(self, key, attribute_dict[key])

        self.total_steps = 0
        self.model = None      # set in train()
        self.old_model = None  # frozen target network, refreshed every reset_rate steps

        # One loader per split, all drawing from the same Dataset via index subsets.
        sampler = torch.utils.data.sampler.SubsetRandomSampler
        train_sampler = sampler(data.train_ind)
        valid_sampler = sampler(data.valid_ind)
        test_sampler = sampler(data.test_ind)
        # FIX: the original assigned self.train_data twice with identical
        # arguments; the redundant duplicate DataLoader was removed.
        self.train_data = DataLoader(data, batch_size=batch_size,
                                     sampler=train_sampler,
                                     num_workers=num_workers)
        self.valid_data = DataLoader(data, batch_size=batch_size,
                                     sampler=valid_sampler,
                                     num_workers=num_workers)
        self.test_data = DataLoader(data, batch_size=batch_size,
                                    sampler=test_sampler,
                                    num_workers=num_workers)

    def time_to_reset(self):
        """True when the target network should be refreshed/checkpointed."""
        return self.total_steps % self.reset_rate == 0

    def time_to_validate(self):
        """True when a validation pass is due."""
        return self.total_steps % self.validate_rate == 0

    def train(self, model, gamma, optimizer=None):
        """ Training loop

        :param model: pytorch model to be trained
        :param gamma: discount factor
        :param optimizer: training optimizer, defaults to ADAM
        :return:
        """
        print('starting train')
        # snapshot the untrained weights for reproducibility
        torch.save(model.state_dict(), '{}/{}_start.pt'.format(self.save_loc, self.name))
        self.model = to_cuda(model, self.gpu_device)
        self.old_model = copy.deepcopy(model)
        if optimizer is None:
            optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)

        for epoch_num in range(self.epoch_limit):
            print('epoch {}'.format(epoch_num))
            running_train_loss = 0
            for state, action, reward, next_state, next_action, feasible_mask in self.train_data:
                if self.time_to_validate():
                    print('validating')
                    self.validate(gamma=gamma)
                if self.time_to_reset():
                    # refresh the frozen target network and checkpoint progress
                    print('saving model at epoch {}, step {}'.format(epoch_num, self.total_steps))
                    self.old_model = copy.deepcopy(self.model)
                    torch.save(
                        self.model.state_dict(),
                        '{}/{}_epoch_{}_step_{}.pt'.format(
                            self.save_loc, self.name, epoch_num, self.total_steps))
                self.total_steps += 1
                train_loss = self.optimize(state=state,
                                           action=action,
                                           reward=reward,
                                           next_state=next_state,
                                           next_action=next_action,
                                           gamma=gamma,
                                           optimizer=optimizer,
                                           feasible_mask=feasible_mask)
                running_train_loss += train_loss
            self.writer.add_scalar(tag='data/train_epoch_loss',
                                   scalar_value=running_train_loss / len(self.train_data),
                                   global_step=self.total_steps)
        torch.save(self.model.state_dict(), '{}/{}_final.pt'.format(self.save_loc, self.name))

    def validate(self, gamma):
        """ Evaluates on validation set. Used to monitor training performance

        :param gamma: discount factor
        :return:
        """
        self.model.eval()
        running_valid_loss = 0
        for state, action, reward, next_state, next_action, feasible_mask in self.valid_data:
            valid_loss = self.optimize(state=state,
                                       action=action,
                                       reward=reward,
                                       next_state=next_state,
                                       next_action=next_action,
                                       gamma=gamma,
                                       optimizer=None,
                                       feasible_mask=feasible_mask,
                                       valid=True)
            running_valid_loss += valid_loss
        self.writer.add_scalar(tag='data/valid_epoch_loss',
                               scalar_value=running_valid_loss / len(self.valid_data),
                               global_step=self.total_steps)
        print('Validation loss: {:.2f}'.format(running_valid_loss / len(self.valid_data)))
        self.model.train()

    def _q_state_value_estimate(self, state, nonterminal_mask, non_final_states, feasible_mask):
        """ Helper function which calculates the target state value using a Q value
        estimation procedure

        :param state: a state tensor
        :param nonterminal_mask: a boolean mask denoting the terminal states
        :param non_final_states: the next state for all non-terminal states
        :param feasible_mask: a boolean mask indicating which actions are allowed
            in the current state
        :return: An estimate of state value, using the maximal Q values derived
            from self.old_model
        """
        next_state_values = to_variable(
            to_cuda(torch.zeros(state[0].size(0)).float(), self.gpu_device))
        nonterminal_feasible_mask = feasible_mask[nonterminal_mask.nonzero().view(-1)]
        predictions = self.old_model(to_cuda(non_final_states, self.gpu_device))
        # modifying predictions by adjuster to ensure max value is within feasible action set
        adjuster = 2 * max(abs(predictions.min().data[0]), predictions.max().data[0])
        adjusted_predictions = predictions - adjuster
        adjusted_predictions[nonterminal_feasible_mask] += adjuster
        next_state_values[nonterminal_mask] = adjusted_predictions.max(1)[0]
        next_state_values.volatile = False  # legacy (pre-0.4) volatile handling
        return next_state_values

    def _double_q_state_value_estimate(self, state, nonterminal_mask, non_final_states, feasible_mask):
        """ Helper function which calculates the target state value using a Double Q
        value estimation procedure. Essentially the same as _q_state_value_estimate,
        except we use the current network to choose the actions which inform next
        state value.

        :param state: a state tensor
        :param nonterminal_mask: a boolean mask denoting the terminal states
        :param non_final_states: the next state for all non-terminal states
        :param feasible_mask: a boolean mask indicating which actions are allowed
            in the current state
        :return: An estimate of state value, using the maximal Q values derived
            from self.old_model
        """
        next_state_values = to_variable(
            to_cuda(torch.zeros(state[0].size(0)).float(), self.gpu_device))
        nonterminal_feasible_mask = feasible_mask[nonterminal_mask.nonzero().view(-1)]
        # action selection with the *current* network ...
        predictions_dq = self.model(to_cuda(non_final_states, self.gpu_device))
        # modifying predictions by adjuster to ensure max value is within feasible action set
        adjuster = 2 * max(abs(predictions_dq.min().data[0]), predictions_dq.max().data[0])
        adjusted_predictions_dq = predictions_dq - adjuster
        adjusted_predictions_dq[nonterminal_feasible_mask] += adjuster
        max_vals_dq, max_inds_dq = adjusted_predictions_dq.max(1)
        # ... value evaluation with the frozen target network
        predictions = self.old_model(to_cuda(non_final_states, self.gpu_device))
        next_state_values[nonterminal_mask] = predictions.gather(1, max_inds_dq.view(-1, 1))
        next_state_values.volatile = False  # legacy (pre-0.4) volatile handling
        return next_state_values

    def _on_policy_state_value_estimate(self, state, next_action, nonterminal_mask, non_final_states):
        """ Helper function which calculates the target state value using a TD value
        estimation procedure. Essentially the same as _q_state_value_estimate, except
        we use the observed next action to choose the actions which inform next state
        value. Note this corresponds to on-policy TD value estimation.

        :param state: a state tensor
        :param next_action: the next action taken at non-terminal states
        :param nonterminal_mask: a boolean mask denoting the terminal states
        :param non_final_states: the next state for all non-terminal states
        :return: An estimate of state value, using the maximal Q values derived
            from self.old_model
        """
        next_state_values = Variable(
            to_cuda(torch.zeros(state[0].size(0)).float(), self.gpu_device))
        predictions = self.old_model(to_cuda(non_final_states, self.gpu_device))
        next_state_values[nonterminal_mask] = predictions.gather(
            1, Variable(
                to_cuda(next_action, self.gpu_device)[nonterminal_mask].view(-1, 1)))
        next_state_values.volatile = False  # legacy (pre-0.4) volatile handling
        return next_state_values

    def log_q_diff(self, pred):
        """ A convenience function for logging Q value distribution. Can be helpful
        in diagnosing value collapse.

        :param pred: Predicted Q values
        :return:
        """
        pred_sample = pred.data.cpu().numpy()
        # gaps between consecutive quartiles of the per-sample Q distribution
        q_diffs = np.diff(np.percentile(pred_sample, [0, 25, 50, 75, 100], axis=1), axis=0)
        dim_names = ['0_25', '25_50', '50_75', '75_100']
        for dim in range(len(q_diffs)):
            self.writer.add_histogram(tag='predictions/q_diff_{}'.format(dim_names[dim]),
                                      values=q_diffs[dim],
                                      global_step=self.total_steps,
                                      bins='auto')

    def log_values_and_advantages(self, state):
        """ Convenience function for logging state value and action-advantage values

        :param state: vector of states
        :return:
        """
        advantage = self.model.get_advantage(
            to_variable(to_cuda(state, self.gpu_device)))
        self.writer.add_histogram(tag='predictions/advantages',
                                  values=advantage.view(-1),
                                  global_step=self.total_steps,
                                  bins='auto')
        value = self.model.get_value(
            to_variable(to_cuda(state, self.gpu_device)))
        self.writer.add_histogram(tag='predictions/values',
                                  values=value.view(-1),
                                  global_step=self.total_steps,
                                  bins='auto')

    def optimize(self, state, action, reward, next_state, next_action, gamma,
                 optimizer, feasible_mask, valid=False):
        """ Runs a step of optimization in the training/validation loops

        :param state: batch of current states
        :param reward: batch of observed rewards
        :param action: batch of actions taken
        :param next_state: batch of successor states
        :param next_action: batch of next actions (used for on-policy TD)
        :param gamma: discount factor
        :param optimizer: optimizer to step; ignored when valid=True
        :param feasible_mask: boolean mask of allowed actions
        :param valid: when True, only compute the loss (no backward/step)
        :return: scalar loss value for this batch
        """
        # each state is a tuple of spatial and flat information
        next_state_court = next_state[0]
        next_state_flat = next_state[1]

        # get non final states; a terminal state is encoded as an all-zero
        # (or non-positive) court tensor, so min > 0 means non-terminal
        nonterminal_mask = []
        for batch_id in range(next_state_court.shape[0]):
            nonterminal_mask.append(np.min(next_state_court[batch_id].numpy()))
        nonterminal_mask = to_cuda(
            torch.from_numpy(np.array(nonterminal_mask)) > 0, self.gpu_device)

        non_final_states_court = Variable(torch.cat([
            next_state_court[batch_id].view(1,
                                            next_state_court[batch_id].size(0),
                                            next_state_court[batch_id].size(1),
                                            next_state_court[batch_id].size(2))
            for batch_id in range(len(next_state_court))
            if nonterminal_mask[batch_id]
        ], dim=0), volatile=True)
        non_final_states_flat = Variable(torch.cat([
            next_state_flat[batch_id].view(1, next_state_flat[batch_id].size(0))
            for batch_id in range(len(next_state_flat))
            if nonterminal_mask[batch_id]
        ], dim=0), volatile=True)
        non_final_states = [non_final_states_court, non_final_states_flat]

        # get q values for observed actions
        predictions = self.model(to_variable(to_cuda(state, self.gpu_device)))
        state_action_values = predictions.gather(
            1, Variable(to_cuda(action, self.gpu_device)))

        # for non-final states, get V(s'), 0 for terminal states
        feasible_mask = to_cuda(feasible_mask, self.gpu_device)
        if self.on_policy:
            next_state_values = self._on_policy_state_value_estimate(
                state=state,
                next_action=next_action,
                nonterminal_mask=nonterminal_mask,
                non_final_states=non_final_states)
        elif self.double_q:
            next_state_values = self._double_q_state_value_estimate(
                state=state,
                nonterminal_mask=nonterminal_mask,
                non_final_states=non_final_states,
                feasible_mask=feasible_mask)
        else:
            next_state_values = self._q_state_value_estimate(
                state=state,
                nonterminal_mask=nonterminal_mask,
                non_final_states=non_final_states,
                feasible_mask=feasible_mask)

        # combine discounted next state values with reward to get expected state value
        expected_state_action_values = (next_state_values * gamma) + Variable(
            to_cuda(reward.view(-1), self.gpu_device))

        # loss is between expected and predicted state-action values
        loss = self.criterion(state_action_values, expected_state_action_values)

        if not valid:
            optimizer.zero_grad()
            loss.backward()
            # gradient clipping
            total_grad = 0
            for param in self.model.parameters():
                param.grad.data.clamp_(-1 * self.clip, self.clip)
                total_grad += np.sum(np.abs(to_np(param.grad)))
            if self.total_steps % 10 == 0:
                self.writer.add_scalar(tag='data/train_loss',
                                       scalar_value=to_np(loss),
                                       global_step=self.total_steps)
                self.writer.add_scalar(tag='data/gradient',
                                       scalar_value=total_grad,
                                       global_step=self.total_steps)
            if self.total_steps % 1000 == 0:
                self.log_q_diff(predictions)
                self.log_values_and_advantages(state)
                self.writer.add_histogram(tag='predictions/q_taken',
                                          values=state_action_values,
                                          global_step=self.total_steps,
                                          bins='auto')
                self.writer.add_histogram(tag='predictions/qs',
                                          values=predictions.view(-1),
                                          global_step=self.total_steps,
                                          bins='auto')
            optimizer.step()
        return loss.cpu().data[0]
if args.ckpt:
    # resuming from a checkpoint: skip the one-off graph/sample logging below
    pass
else:
    # save graph and clips_order samples
    for data in train_dataloader:
        tuple_clips, tuple_orders = data
        for i in range(args.tl):
            writer.add_video('train/tuple_clips', tuple_clips[:, i, :, :, :, :], i, fps=8)
            writer.add_text('train/tuple_orders', str(tuple_orders[:, i].tolist()), i)
        tuple_clips = tuple_clips.to(device)
        writer.add_graph(vcopn, tuple_clips)
        break  # one batch is enough for the graph/sample dump
    # save init params at step 0
    for name, param in vcopn.named_parameters():
        writer.add_histogram('params/{}'.format(name), param, 0)

### loss function, optimizer and scheduler ###
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(vcopn.parameters(), lr=args.lr,
                      momentum=args.momentum, weight_decay=args.wd)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                 min_lr=1e-5, patience=50, factor=0.1)

prev_best_val_loss = float('inf')
prev_best_model_path = None
for epoch in range(args.start_epoch, args.start_epoch + args.epochs):
    time_start = time.time()
    train(args, vcopn, criterion, optimizer, device, train_dataloader, writer, epoch)
    print('Epoch time: {:.2f} s.'.format(time.time() - time_start))
    val_loss = validate(args, vcopn, criterion, device, val_dataloader, writer, epoch)
    # scheduler.step(val_loss)
    writer.add_scalar('train/lr', optimizer.param_groups[0]['lr'], epoch)
def EnsembleTrain():
    """Train one Resnet base model for the ensemble.

    Reads train/val sample-name lists from CSV files under ``data_root``,
    builds augmented data loaders, and runs an ``NLLLoss`` training loop
    with ``ReduceLROnPlateau`` scheduling and early stopping.  Per-epoch
    losses/AUCs and parameter histograms are written to TensorBoard under
    ``<model_folder>/log``.
    """
    # Surface autograd errors at the op that produced the bad value.
    torch.autograd.set_detect_anomaly(True)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    input_shape = (192, 192)
    total_epoch = 10000  # upper bound; early stopping normally ends training first
    batch_size = 24
    model_folder = MakeFolder(model_root + '/Resnet')
    # Random-augmentation config: each entry is [distribution, *params];
    # presumably consumed by the transform pipeline inside _GetLoader — TODO confirm.
    param_config = {
        RotateTransform.name: {'theta': ['uniform', -10, 10]},
        ShiftTransform.name: {'horizontal_shift': ['uniform', -0.05, 0.05],
                              'vertical_shift': ['uniform', -0.05, 0.05]},
        ZoomTransform.name: {'horizontal_zoom': ['uniform', 0.95, 1.05],
                             'vertical_zoom': ['uniform', 0.95, 1.05]},
        FlipTransform.name: {'horizontal_flip': ['choice', True, False]},
        BiasTransform.name: {'center': ['uniform', -1., 1., 2],
                             'drop_ratio': ['uniform', 0., 1.]},
        NoiseTransform.name: {'noise_sigma': ['uniform', 0., 0.03]},
        ContrastTransform.name: {'factor': ['uniform', 0.8, 1.2]},
        GammaTransform.name: {'gamma': ['uniform', 0.8, 1.2]},
        ElasticTransform.name: ['elastic', 1, 0.1, 256]
    }
    sub_train_path = data_root + '/train_name_basemodel.csv'
    sub_val_path = data_root + '/val_name_basemodel.csv'
    # First CSV row holds the list of case names for each split.
    sub_train = pd.read_csv(sub_train_path).values.tolist()[0]
    sub_val = pd.read_csv(sub_val_path).values.tolist()[0]
    train_loader, train_batches = _GetLoader(sub_train, param_config, input_shape, batch_size, True)
    val_loader, val_batches = _GetLoader(sub_val, param_config, input_shape, batch_size, True)

    model = Resnet(3, 2).to(device)
    model.apply(HeWeightInit)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # NLLLoss expects log-probabilities from the model — TODO confirm Resnet's head.
    cr = torch.nn.NLLLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=10,
                                                           factor=0.5, verbose=True)
    early_stopping = EarlyStopping(store_path=os.path.join(model_folder, '{}-{:.6f}.pt'),
                                   patience=50, verbose=True)
    writer = SummaryWriter(log_dir=os.path.join(model_folder, 'log'), comment='Net')

    for epoch in range(total_epoch):
        train_loss, val_loss = 0., 0.

        # ---- training pass ----
        model.train()
        pred_list, label_list = [], []
        for ind, (inputs, outputs) in enumerate(train_loader):
            optimizer.zero_grad()
            inputs = MoveTensorsToDevice(inputs, device)
            outputs = MoveTensorsToDevice(outputs, device)
            preds = model(*inputs)
            loss = cr(preds, outputs.long())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # Column 1 is treated as the positive-class score for AUC.
            pred_list.extend(preds[:, 1].cpu().data.numpy().tolist())
            label_list.extend(outputs.cpu().data.numpy().tolist())
        train_auc = roc_auc_score(label_list, pred_list)

        # ---- validation pass ----
        model.eval()
        pred_list, label_list = [], []
        with torch.no_grad():
            for ind, (inputs, outputs) in enumerate(val_loader):
                inputs = MoveTensorsToDevice(inputs, device)
                outputs = MoveTensorsToDevice(outputs, device)
                preds = model(*inputs)
                loss = cr(preds, outputs.long())
                val_loss += loss.item()
                pred_list.extend(preds[:, 1].cpu().data.numpy().tolist())
                label_list.extend(outputs.cpu().data.numpy().tolist())
        val_auc = roc_auc_score(label_list, pred_list)

        # Save Tensor Board
        for index, (name, param) in enumerate(model.named_parameters()):
            if 'bn' not in name:  # skip batch-norm parameters
                writer.add_histogram(name + '_data', param.cpu().data.numpy(), epoch + 1)
        writer.add_scalars('Loss',
                           {'train_loss': train_loss / train_batches,
                            'val_loss': val_loss / val_batches}, epoch + 1)
        writer.add_scalars('Auc',
                           {'train_auc': train_auc,
                            'val_auc': val_auc}, epoch + 1)

        print('Epoch {}: loss: {:.3f}, val-loss: {:.3f}, auc: {:.3f}, val-auc: {:.3f}'
              .format(epoch + 1, train_loss / train_batches, val_loss / val_batches,
                      train_auc, val_auc))

        # NOTE(review): scheduler and early stopping receive the *summed*
        # validation loss, while the TensorBoard curve above uses the
        # batch-averaged value — consistent with each other, but confirm
        # this is intentional.
        scheduler.step(val_loss)
        early_stopping(val_loss, model, (epoch + 1, val_loss))
        if early_stopping.early_stop:
            print("Early stopping")
            break
        writer.flush()
    writer.close()
X=x_axis.unsqueeze(0).expand(y_axis.size(1), x_axis.size(0)).transpose(0, 1), # Visdom fix Y=y_axis, win=viz_window, update='replace', ) if args.tensorboard and main_proc: values = { 'Avg Train Loss': avg_loss, 'Avg WER': wer, 'Avg CER': cer } tensorboard_writer.add_scalars(args.id, values, epoch + 1) if args.log_params: for tag, value in model.named_parameters(): tag = tag.replace('.', '/') tensorboard_writer.add_histogram(tag, to_np(value), epoch + 1) tensorboard_writer.add_histogram(tag + '/grad', to_np(value.grad), epoch + 1) if args.checkpoint and main_proc: file_path = '%s/deepspeech_%d.pth' % (save_folder, epoch + 1) torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, wer_results=wer_results, cer_results=cer_results), file_path) # anneal lr optim_state = optimizer.state_dict() optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal optimizer.load_state_dict(optim_state) print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr'])) if (best_wer is None or best_wer > wer) and main_proc: print("Found better validated model, saving to %s" % args.model_path) torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
landmarks = landmarks.cuda() out = net(images) # backprop optimizer.zero_grad() loss = criterion(out, landmarks) loss.backward() optimizer.step() if iteration % 100 == 0: image = images.cpu().data.numpy()[0] gt = landmarks.cpu().data.numpy() pr = out.cpu().data.numpy() # 绿色的真实landmark image = draw_landmarks(image, gt[0], (0, 255, 0)) # 红色的预测landmark image = draw_landmarks(image, pr[0], (0, 0, 255)) image = image[::-1, ...] nme = metrics.nme.update(np.reshape(gt, (-1, gt.shape[1]//2, 2)), np.reshape(pr, (-1, gt.shape[1]//2, 2))) metrics.auc.update(nme) metrics.loss.update(loss) writer.add_scalar("watch/NME", metrics.nme.value * 100, iteration) writer.add_scalar("watch/AUC", metrics.auc.value * 100, iteration) writer.add_scalar("watch/loss", metrics.loss.value, iteration) writer.add_scalar("watch/learning_rate", lr, iteration) writer.add_image("result", image, iteration) writer.add_histogram("predictionx", out.cpu().data.numpy()[:, 0:212:2], iteration) state = net.state_dict() saver.save(state, iteration)
def write_tensorboard_histograms(self, model:nn.Module, iter_count:int, tbwriter:SummaryWriter):
    """Write one TensorBoard histogram per named parameter of *model*.

    Each parameter tensor is logged under the tag
    ``/weights/<parameter name>`` at global step *iter_count* using
    *tbwriter*.
    """
    for param_name, param_value in model.named_parameters():
        tag = '/weights/' + param_name
        tbwriter.add_histogram(tag, param_value, iter_count)
observation = get_policy_observation(observation) expert_loss_grad = [] expert_loss = [] for expert_trajectory in expert_trajectories: expert_loss_grad.append( sum(2 * (observation - expert_trajectory[0]["state"])) / state_dim) expert_loss.append( sum((observation - expert_trajectory[0]["state"])**2) / state_dim) loss_grad.append(sum(expert_loss_grad) / num_experts) writer.add_histogram( f"Imitation loss {timestep}", np.array(expert_loss), global_step=(max_timestep - timestep) * num_iterations + iteration) writer.add_scalar( f"Imitation loss mean {timestep}", sum(expert_loss) / num_experts, global_step=(max_timestep - timestep) * num_iterations + iteration) observation = torch.Tensor(observation) episode_reward += reward timestep += 1 writer.add_scalar("Episode reward", episode_reward, global_step=max_timestep * num_iterations +
def main():
    """Train an RNN language model (RNNLM) with truncated BPTT.

    Merges the YAML config into the global ``args``, builds train/dev/eval
    datasets, constructs an ``RNNLM`` whose name encodes its
    hyper-parameters, then runs the training loop with per-step TensorBoard
    logging, per-epoch perplexity evaluation, learning-rate decay, optional
    conversion to SGD fine-tuning, and early stopping.

    Returns:
        The model save path (``model.module.save_path``).
    """
    # Load a config file
    if args.resume_model is None:
        config = load_config(args.config)
    else:
        # Restart from the last checkpoint
        config = load_config(os.path.join(args.resume_model, 'config.yml'))

    # Check differences between args and yaml configuration
    for k, v in vars(args).items():
        if k not in config.keys():
            warnings.warn("key %s is automatically set to %s" % (k, str(v)))

    # Merge config with args
    for k, v in config.items():
        setattr(args, k, v)

    # Load dataset
    train_set = Dataset(csv_path=args.train_set,
                        dict_path=args.dict,
                        label_type=args.label_type,
                        batch_size=args.batch_size * args.ngpus,
                        bptt=args.bptt,
                        eos=args.eos,
                        max_epoch=args.num_epochs,
                        shuffle=True)
    dev_set = Dataset(csv_path=args.dev_set,
                      dict_path=args.dict,
                      label_type=args.label_type,
                      batch_size=args.batch_size * args.ngpus,
                      bptt=args.bptt,
                      eos=args.eos,
                      shuffle=True)
    eval_sets = []
    # NOTE(review): the loop variable ``set`` shadows the builtin.
    for set in args.eval_sets:
        eval_sets += [Dataset(csv_path=set,
                              dict_path=args.dict,
                              label_type=args.label_type,
                              batch_size=1,
                              bptt=args.bptt,
                              eos=args.eos,
                              is_test=True)]
    args.num_classes = train_set.num_classes

    # Model setting: the model name encodes the main hyper-parameters.
    model = RNNLM(args)
    model.name = args.rnn_type
    model.name += str(args.num_units) + 'H'
    model.name += str(args.num_projs) + 'P'
    model.name += str(args.num_layers) + 'L'
    model.name += '_emb' + str(args.emb_dim)
    model.name += '_' + args.optimizer
    model.name += '_lr' + str(args.learning_rate)
    model.name += '_bs' + str(args.batch_size)
    if args.tie_weights:
        model.name += '_tie'
    if args.residual:
        model.name += '_residual'
    if args.backward:
        model.name += '_bwd'

    if args.resume_model is None:
        # Set save path
        save_path = mkdir_join(args.model, '_'.join(os.path.basename(args.train_set).split('.')[:-1]), model.name)
        model.set_save_path(save_path)  # avoid overwriting
        # Save the config file as a yaml file
        save_config(vars(args), model.save_path)
        # Save the dictionary & wp_model
        shutil.copy(args.dict, os.path.join(save_path, 'dict.txt'))
        if args.label_type == 'wordpiece':
            shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model'))

        # Setting for logging
        logger = set_logger(os.path.join(model.save_path, 'train.log'), key='training')

        for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for name in sorted(list(model.num_params_dict.keys())):
            num_params = model.num_params_dict[name]
            logger.info("%s %d" % (name, num_params))
        logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000))

        # Set optimizer
        model.set_optimizer(optimizer=args.optimizer,
                            learning_rate_init=float(args.learning_rate),
                            weight_decay=float(args.weight_decay),
                            clip_grad_norm=args.clip_grad_norm,
                            lr_schedule=False,
                            factor=args.decay_rate,
                            patience_epoch=args.decay_patient_epoch)

        epoch, step = 1, 0
        learning_rate = float(args.learning_rate)
        metric_dev_best = 10000  # sentinel "worst" perplexity
    else:
        # Resuming from a checkpoint is not implemented yet.
        raise NotImplementedError()

    train_set.epoch = epoch - 1

    # GPU setting
    if args.ngpus >= 1:
        model = CustomDataParallel(model,
                                   device_ids=list(range(0, args.ngpus, 1)),
                                   deterministic=True,
                                   benchmark=False)
        model.cuda()

    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])

    # Set process name
    # setproctitle(args.job_name)

    # Set learning rate controller
    lr_controller = Controller(learning_rate_init=learning_rate,
                               decay_type=args.decay_type,
                               decay_start_epoch=args.decay_start_epoch,
                               decay_rate=args.decay_rate,
                               decay_patient_epoch=args.decay_patient_epoch,
                               lower_better=True,
                               best_value=metric_dev_best)

    # Set reporter
    reporter = Reporter(model.module.save_path, max_loss=10)

    # Set the updater
    updater = Updater(args.clip_grad_norm)

    # Setting for tensorboard
    tf_writer = SummaryWriter(model.module.save_path)

    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    not_improved_epoch = 0
    loss_train_mean, acc_train_mean = 0., 0.
    pbar_epoch = tqdm(total=len(train_set))
    pbar_all = tqdm(total=len(train_set) * args.num_epochs)
    while True:
        # Compute loss in the training set (including parameter update)
        ys_train, is_new_epoch = train_set.next()
        model, loss_train, acc_train = updater(model, ys_train, args.bptt)
        loss_train_mean += loss_train
        acc_train_mean += acc_train
        pbar_epoch.update(np.sum([len(y) for y in ys_train]))

        if (step + 1) % args.print_step == 0:
            # Compute loss in the dev set
            ys_dev = dev_set.next()[0]
            model, loss_dev, acc_dev = updater(model, ys_dev, args.bptt, is_eval=True)

            loss_train_mean /= args.print_step
            acc_train_mean /= args.print_step
            reporter.step(step, loss_train_mean, loss_dev, acc_train_mean, acc_dev)

            # Logging by tensorboard
            tf_writer.add_scalar('train/loss', loss_train_mean, step + 1)
            tf_writer.add_scalar('dev/loss', loss_dev, step + 1)
            for n, p in model.module.named_parameters():
                n = n.replace('.', '/')
                if p.grad is not None:
                    tf_writer.add_histogram(n, p.data.cpu().numpy(), step + 1)
                    tf_writer.add_histogram(n + '/grad', p.grad.data.cpu().numpy(), step + 1)

            duration_step = time.time() - start_time_step
            logger.info("...Step:%d(ep:%.2f) loss:%.2f(%.2f)/acc:%.2f(%.2f)/ppl:%.2f(%.2f)/lr:%.5f/bs:%d (%.2f min)" % (step + 1, train_set.epoch_detail, loss_train_mean, loss_dev, acc_train_mean, acc_dev, math.exp(loss_train_mean), math.exp(loss_dev), learning_rate, len(ys_train), duration_step / 60))
            start_time_step = time.time()
            loss_train_mean, acc_train_mean = 0., 0.
        step += args.ngpus

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('===== EPOCH:%d (%.2f min) =====' % (epoch, duration_epoch / 60))

            # Save figures of loss and accuracy
            reporter.epoch()

            if epoch < args.eval_start_epoch:
                # Save the model
                model.module.save_checkpoint(model.module.save_path, epoch, step, learning_rate, metric_dev_best)
            else:
                start_time_eval = time.time()
                # dev
                ppl_dev = eval_ppl([model.module], dev_set, args.bptt)
                logger.info(' PPL (%s): %.3f' % (dev_set.set, ppl_dev))

                if ppl_dev < metric_dev_best:
                    metric_dev_best = ppl_dev
                    not_improved_epoch = 0
                    logger.info('||||| Best Score |||||')

                    # Update learning rate
                    model.module.optimizer, learning_rate = lr_controller.decay_lr(
                        optimizer=model.module.optimizer,
                        learning_rate=learning_rate,
                        epoch=epoch,
                        value=ppl_dev)

                    # Save the model
                    model.module.save_checkpoint(model.module.save_path, epoch, step, learning_rate, metric_dev_best)

                    # test
                    ppl_test_mean = 0.
                    for eval_set in eval_sets:
                        ppl_test = eval_ppl([model.module], eval_set, args.bptt)
                        logger.info(' PPL (%s): %.3f' % (eval_set.set, ppl_test))
                        ppl_test_mean += ppl_test
                    if len(eval_sets) > 0:
                        logger.info(' PPL (mean): %.3f' % (ppl_test_mean / len(eval_sets)))
                else:
                    # Update learning rate
                    model.module.optimizer, learning_rate = lr_controller.decay_lr(
                        optimizer=model.module.optimizer,
                        learning_rate=learning_rate,
                        epoch=epoch,
                        value=ppl_dev)
                    not_improved_epoch += 1

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

                # Early stopping
                if not_improved_epoch == args.not_improved_patient_epoch:
                    break

                if epoch == args.convert_to_sgd_epoch:
                    # Convert to fine-tuning stage
                    model.module.set_optimizer(
                        'sgd',
                        learning_rate_init=float(args.learning_rate),  # TODO: ?
                        weight_decay=float(args.weight_decay),
                        clip_grad_norm=args.clip_grad_norm,
                        lr_schedule=False,
                        factor=args.decay_rate,
                        patience_epoch=args.decay_patient_epoch)
                    logger.info('========== Convert to SGD ==========')

            pbar_epoch = tqdm(total=len(train_set))
            pbar_all.update(len(train_set))

            # NOTE(review): this checks args.num_epoch while the datasets
            # above use args.num_epochs — confirm both attributes exist.
            if epoch == args.num_epoch:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()
            epoch += 1

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.2f hour' % (duration_train / 3600))

    tf_writer.close()
    pbar_epoch.close()
    pbar_all.close()

    return model.module.save_path
axis=1)).mean() log = 'epoch: %d, train loss: %.6f, test loss: %.6f, train acc: %.6f, test acc: %.6f' % \ (ep, train_loss.item(), test_loss.item(), train_acc, test_acc) print(log) if use_tensorboard: writer.add_text('training log', log, ep) writer.add_scalars('data/loss', { 'train loss': train_loss.data, 'test loss': test_loss.data }, ep) writer.add_scalars('data/accuracy', { 'train accuracy': train_acc, 'test accuracy': test_acc }, ep) for name, param in dnn.named_parameters(): writer.add_histogram(name, param.clone().data.cpu().numpy(), ep) y_final_test = dnn(data_final_test).detach().numpy().argmax(1) data_save['Accept'] = y_final_test data_save.to_csv('result.csv') y_true = y_test.numpy().argmax(1) y_score = dnn(X_test).detach().numpy()[:, 1].squeeze() print(y_score) y_pred = dnn(X_test).detach().numpy().argmax(1) label = ['not accepted', 'accepted'] print('precision:', metrics.precision_score(y_true, y_pred)) print('recall:', metrics.recall_score(y_true, y_pred)) print('f1 score:', metrics.f1_score(y_true, y_pred)) print('confusion matrix: ', metrics.confusion_matrix(y_true, y_pred)) print(metrics.classification_report(y_true, y_pred, target_names=label)) cm = metrics.confusion_matrix(y_true, y_pred)
def main(args):
    """Train a character-level LM, optionally pruning SRU layers via HardConcrete gates.

    When ``args.prune`` is set, linear ops inside the SRU are replaced with
    HardConcrete-gated versions and a Lagrangian sparsity penalty (with
    multipliers ``lambda_1``/``lambda_2`` trained by gradient *ascent*) drives
    the expected sparsity toward ``args.prune_sparsity``.  Training and dev
    statistics go to two TensorBoard writers; the best (optionally
    sparsity-qualified) dev checkpoint is restored and evaluated on dev/test
    at the end.
    """
    log_path = "{}_{}".format(args.log, random.randint(1, 100))
    train_writer = SummaryWriter(log_dir=log_path + "/train")
    dev_writer = SummaryWriter(log_dir=log_path + "/dev")

    train, dev, test, words = read_corpus(args.data)
    dev_, test_ = dev, test  # keep unbatched copies for final evaluation
    train = create_batches(train, args.batch_size)
    dev = create_batches(dev, args.batch_size)
    test = create_batches(test, args.batch_size)

    model = Model(words, args)
    if args.load:
        model.load_state_dict(torch.load(args.load))
    model.cuda()
    print(model)
    print("vocab size: {}".format(model.n_V))

    # Noam-style base LR when args.noam is set, else 1.0.
    lr = 1.0 if not args.noam else 1.0 / (args.n_d**0.5) / (args.warmup_steps**
                                                            1.5)

    if args.prune:
        # in place substitution of linear ops in SRU
        flop.make_hard_concrete(model.rnn, in_place=True)
        model.cuda()
        print("model after inserting hardconcrete:")
        print(model)
        hc_modules = flop.get_hardconcrete_modules(model)
        hc_parameters = [
            p for m in hc_modules for p in m.parameters() if p.requires_grad
        ]
        optimizer_hc = Adam(hc_parameters, lr=lr * args.prune_lr, weight_decay=0)
        num_hardconcrete_params = sum(x.numel() for x in hc_parameters)
        print("num of hardconcrete paramters: {}".format(
            num_hardconcrete_params))
        # Lagrangian multipliers; optimizer_max runs gradient ASCENT on them
        # (negative learning rate below).
        lambda_1 = nn.Parameter(torch.tensor(0.).cuda())
        lambda_2 = nn.Parameter(torch.tensor(0.).cuda())
        optimizer_max = Adam([lambda_1, lambda_2], lr=lr, weight_decay=0)
        optimizer_max.param_groups[0]['lr'] = -lr * args.prune_lr
        hc_linear_modules = flop.get_hardconcrete_linear_modules(model)
        num_prunable_params = sum(m.num_prunable_parameters()
                                  for m in hc_linear_modules)
        print("num of prunable paramters: {}".format(num_prunable_params))
    else:
        # Disable pruning by pushing its start past the last epoch.
        args.prune_start_epoch = args.max_epoch

    # Main-model parameters exclude the HardConcrete gate parameters (log_alpha).
    m_parameters = [
        i[1] for i in model.named_parameters()
        if i[1].requires_grad and 'log_alpha' not in i[0]
    ]
    optimizer = Adam(m_parameters, lr=lr * args.lr, weight_decay=args.weight_decay)
    num_params = sum(x.numel() for x in m_parameters if x.requires_grad)
    print("num of parameters: {}".format(num_params))

    nbatch = 1
    niter = 1
    best_dev = 1e+8
    unroll_size = args.unroll_size
    batch_size = args.batch_size
    N = (len(train[0]) - 1) // unroll_size + 1  # number of BPTT segments per epoch
    criterion = nn.CrossEntropyLoss()

    model.zero_grad()
    if args.prune:
        optimizer_max.zero_grad()
        optimizer_hc.zero_grad()

    for epoch in range(args.max_epoch):
        start_time = time.time()
        model.train()
        total_loss = 0.0  # NOTE(review): assigned but never used
        hidden = model.init_hidden(batch_size)
        start_prune = epoch >= args.prune_start_epoch

        for i in range(N):
            x = train[0][i * unroll_size:(i + 1) * unroll_size]
            y = train[1][i * unroll_size:(i + 1) * unroll_size].view(-1)
            # Truncate BPTT: carry hidden state across segments without graph.
            hidden.detach_()

            # language model forward and backward
            output, hidden = model(x, hidden)
            loss = criterion(output, y)
            # Gradients are accumulated over update_param_freq segments.
            (loss / args.update_param_freq).backward()
            loss = loss.item()

            lagrangian_loss = 0
            target_sparsity = 0
            expected_sparsity = 0
            # add lagrangian loss (regularization) when pruning
            if start_prune:
                # compute target sparsity with (optionally) linear warmup
                target_sparsity = args.prune_sparsity
                if args.prune_warmup > 0:
                    niter_ = niter - args.prune_start_epoch * N
                    target_sparsity *= min(1.0, niter_ / args.prune_warmup)
                # compute expected model size and sparsity
                expected_size = sum(
                    m.num_parameters(train=True) for m in hc_linear_modules)
                expected_sparsity = 1.0 - expected_size / num_prunable_params
                # compute lagrangian loss
                lagrangian_loss = lambda_1 * (expected_sparsity - target_sparsity) + \
                    lambda_2 * (expected_sparsity - target_sparsity)**2
                (lagrangian_loss / args.update_param_freq).backward()
                expected_sparsity = expected_sparsity.item()
                lagrangian_loss = lagrangian_loss.item()

            # log training stats
            if (niter - 1) % 100 == 0 and nbatch % args.update_param_freq == 0:
                if args.prune:
                    train_writer.add_scalar('sparsity/expected_sparsity',
                                            expected_sparsity, niter)
                    train_writer.add_scalar('sparsity/target_sparsity',
                                            target_sparsity, niter)
                    train_writer.add_scalar('loss/lagrangian_loss',
                                            lagrangian_loss, niter)
                    train_writer.add_scalar('lambda/1', lambda_1.item(), niter)
                    train_writer.add_scalar('lambda/2', lambda_2.item(), niter)
                    if (niter - 1) % 3000 == 0:
                        for index, layer in enumerate(hc_modules):
                            train_writer.add_histogram(
                                'log_alpha/{}'.format(index),
                                layer.log_alpha,
                                niter,
                                bins='sqrt',
                            )
                sys.stderr.write("\r{:.4f} {:.2f} {:.2f}".format(
                    loss,
                    lagrangian_loss,
                    expected_sparsity,
                ))
                train_writer.add_scalar('loss/lm_loss', loss, niter)
                train_writer.add_scalar('loss/total_loss',
                                        loss + lagrangian_loss, niter)
                train_writer.add_scalar(
                    'parameter_norm',
                    calc_norm([x.data for x in m_parameters]), niter)
                train_writer.add_scalar(
                    'gradient_norm',
                    calc_norm(
                        [x.grad for x in m_parameters if x.grad is not None]),
                    niter)

            # perform gradient descent every few number of backward()
            if nbatch % args.update_param_freq == 0:
                if args.clip_grad > 0:
                    # NOTE(review): clip_grad_norm is the deprecated
                    # (non-underscore) variant in newer torch versions.
                    torch.nn.utils.clip_grad_norm(m_parameters, args.clip_grad)
                optimizer.step()
                if start_prune:
                    optimizer_max.step()
                    optimizer_hc.step()
                # clear gradient
                model.zero_grad()
                if args.prune:
                    optimizer_max.zero_grad()
                    optimizer_hc.zero_grad()
                niter += 1

            if nbatch % args.log_period == 0 or i == N - 1:
                elapsed_time = (time.time() - start_time) / 60.0
                dev_ppl, dev_loss = eval_model(model, dev)
                dev_writer.add_scalar('loss/lm_loss', dev_loss, niter)
                dev_writer.add_scalar('bpc', np.log2(dev_ppl), niter)
                sparsity = 0
                if args.prune:
                    pruned_size = sum(
                        m.num_parameters(train=False)
                        for m in hc_linear_modules)
                    sparsity = 1.0 - pruned_size / num_prunable_params
                    dev_writer.add_scalar('sparsity/hard_sparsity', sparsity,
                                          niter)
                    dev_writer.add_scalar('model_size/total_prunable',
                                          num_prunable_params, niter)
                    dev_writer.add_scalar('model_size/current_prunable',
                                          pruned_size, niter)
                    dev_writer.add_scalar('model_size/total', num_params,
                                          niter)
                    dev_writer.add_scalar(
                        'model_size/current',
                        num_params - num_prunable_params + pruned_size, niter)
                sys.stdout.write(
                    "\rIter={} lr={:.5f} train_loss={:.4f} dev_loss={:.4f}"
                    " dev_bpc={:.2f} sparsity={:.2f}\teta={:.1f}m\t[{:.1f}m]\n"
                    .format(niter, optimizer.param_groups[0]['lr'], loss,
                            dev_loss, np.log2(dev_ppl), sparsity,
                            elapsed_time * N / (i + 1), elapsed_time))
                # Keep the best checkpoint; when pruning, only accept it once
                # sparsity is close enough to the target.
                if dev_ppl < best_dev:
                    if (not args.prune
                            ) or sparsity > args.prune_sparsity - 0.02:
                        best_dev = dev_ppl
                        # NOTE(review): ``checkpoint`` stays unbound if dev
                        # never improves; load_state_dict below would raise.
                        checkpoint = copy_model(model)
                sys.stdout.write("\n")
                sys.stdout.flush()

            nbatch += 1
            # Noam LR schedule for the main optimizer.
            if args.noam:
                lr = min(1.0 / (niter**0.5), niter / (args.warmup_steps**1.5))
                optimizer.param_groups[0]['lr'] = lr * args.lr / (args.n_d**
                                                                  0.5)
            # Separate warmup clock for the pruning optimizers.
            if args.noam and start_prune:
                niter_ = niter - args.prune_start_epoch * N
                lr = min(1.0 / (niter_**0.5), niter_ / (args.warmup_steps**1.5))
                optimizer_max.param_groups[0]['lr'] = -lr * args.prune_lr / (
                    args.n_d**0.5)
                optimizer_hc.param_groups[0]['lr'] = lr * args.lr / (args.n_d**
                                                                     0.5)

        if args.save and (epoch + 1) % 10 == 0:
            torch.save(
                copy_model(model),
                "{}.{}.{:.3f}.pt".format(args.save, epoch + 1, sparsity))

    train_writer.close()
    dev_writer.close()

    # Restore the best checkpoint and report final dev/test bits-per-character.
    model.load_state_dict(checkpoint)
    model.cuda()
    dev = create_batches(dev_, 1)
    test = create_batches(test_, 1)
    dev_ppl, dev_loss = eval_model(model, dev)
    test_ppl, test_loss = eval_model(model, test)
    sys.stdout.write("dev_bpc={:.3f} test_bpc={:.3f}\n".format(
        np.log2(dev_ppl), np.log2(test_ppl)))
class Train:
    """Trainer for the CNNDriver steering model over an LMDB driving dataset.

    Builds the model (graph logged to TensorBoard at construction), then
    :meth:`train` runs the MSE training loop with augmentation, step LR
    decay, per-iteration TensorBoard scalars/histograms, and per-epoch
    image/text samples plus checkpointing.
    """

    # NOTE(review): class-level mutable list defaults are an anti-pattern;
    # they are all rebound per-instance in __init__/train, so they act only
    # as (misleading) placeholders here.
    __device = []
    __writer = []
    __model = []
    __transformations = []
    __dataset_train = []
    __train_loader = []
    __loss_func = []
    __optimizer = []
    __exp_lr_scheduler = []

    def __init__(self, gpu='0'):
        """Create the model and TensorBoard writer.

        Args:
            gpu: CUDA device index as a string, e.g. ``'0'``.
        """
        # Device configuration
        self.__device = torch.device('cuda:'+gpu if torch.cuda.is_available() else 'cpu')
        self.__writer = SummaryWriter('logs')
        self.__model = CNNDriver()
        # Set model to train mode
        self.__model.train()
        print(self.__model)
        # Trace the graph with a dummy batch of 10 RGB 66x200 frames.
        self.__writer.add_graph(self.__model, torch.rand(10, 3, 66, 200))
        # Put model on GPU
        self.__model = self.__model.to(self.__device)

    def train(self, num_epochs=100, batch_size=400, lr=0.0001, l2_norm=0.001, save_dir='./save', input='./DataLMDB'):
        """Run the training loop.

        Args:
            num_epochs: number of passes over the dataset.
            batch_size: samples per batch.
            lr: Adam learning rate.
            l2_norm: Adam weight decay.
            save_dir: directory for per-epoch checkpoints.
            input: LMDB dataset path.  NOTE(review): shadows the ``input``
                builtin — rename when the API can change.
        """
        # Create log/save directory if it does not exist
        if not os.path.exists('./logs'):
            os.makedirs('./logs')
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.__transformations = transforms.Compose([AugmentDrivingTransform(),
                                                     RandomBrightness(), ConvertToGray(),
                                                     ConvertToSepia(), AddNoise(),
                                                     DrivingDataToTensor(),])
        self.__dataset_train = DriveData_LMDB(input, self.__transformations)
        self.__train_loader = DataLoader(self.__dataset_train, batch_size=batch_size,
                                         shuffle=True, num_workers=4)

        # Loss and Optimizer
        self.__loss_func = nn.MSELoss()
        # self.__loss_func = nn.SmoothL1Loss()
        self.__optimizer = torch.optim.Adam(self.__model.parameters(), lr=lr, weight_decay=l2_norm)

        # Decay LR by a factor of 0.1 every 10 epochs
        self.__exp_lr_scheduler = lr_scheduler.StepLR(self.__optimizer, step_size=15, gamma=0.1)

        print('Train size:', len(self.__dataset_train), 'Batch size:', batch_size)
        print('Batches per epoch:', len(self.__dataset_train) // batch_size)

        # Train the Model
        iteration_count = 0
        for epoch in range(num_epochs):
            for batch_idx, samples in enumerate(self.__train_loader):
                # Send inputs/labels to GPU
                images = samples['image'].to(self.__device)
                labels = samples['label'].to(self.__device)
                self.__optimizer.zero_grad()

                # Forward + Backward + Optimize
                outputs = self.__model(images)
                loss = self.__loss_func(outputs, labels.unsqueeze(dim=1))
                loss.backward()
                self.__optimizer.step()
                # NOTE(review): stepping an epoch-based StepLR inside the
                # batch loop — confirm this is intentional.
                self.__exp_lr_scheduler.step(epoch)

                # Send loss to tensorboard
                self.__writer.add_scalar('loss/', loss.item(), iteration_count)
                self.__writer.add_histogram('steering_out',
                                            outputs.clone().detach().cpu().numpy(),
                                            iteration_count, bins='doane')
                self.__writer.add_histogram('steering_in',
                                            labels.unsqueeze(dim=1).clone().detach().cpu().numpy(),
                                            iteration_count, bins='doane')

                # Get current learning rate (To display on Tensorboard)
                for param_group in self.__optimizer.param_groups:
                    curr_learning_rate = param_group['lr']
                    self.__writer.add_scalar('learning_rate/', curr_learning_rate, iteration_count)

                # Display on each epoch
                if batch_idx == 0:
                    # Send image to tensorboard
                    self.__writer.add_image('Image', images, epoch)
                    self.__writer.add_text('Steering',
                                           'Steering:' + str(outputs[batch_idx].item()), epoch)
                    # Print Epoch and loss
                    print('Epoch [%d/%d] Loss: %.4f' % (epoch + 1, num_epochs, loss.item()))
                    # Save the Trained Model parameters
                    torch.save(self.__model.state_dict(), save_dir+'/cnn_' + str(epoch) + '.pkl')

                iteration_count += 1
'imagebox_label', torch.ones(3, 240, 240) * 0.5, torch.Tensor([[10, 10, 100, 100], [101, 101, 200, 200]]), n_iter, labels=['abcde' + str(n_iter), 'fgh' + str(n_iter)]) x = torch.zeros(sample_rate * 2) for i in range(x.size(0)): # sound amplitude should in [-1, 1] x[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate)) writer.add_audio('myAudio', x, n_iter) writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter) writer.add_text('markdown Text', '''a|b\n-|-\nc|d''', n_iter) for name, param in resnet18.named_parameters(): if 'bn' not in name: writer.add_histogram(name, param, n_iter) writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter) # needs tensorboard 0.4RC or later writer.add_pr_curve_raw('prcurve with raw data', true_positive_counts, false_positive_counts, true_negative_counts, false_negative_counts, precision, recall, n_iter) # export scalar data to JSON for external processing writer.export_scalars_to_json("./all_scalars.json") dataset = datasets.MNIST('mnist', train=False, download=True) images = dataset.test_data[:100].float() label = dataset.test_labels[:100] features = images.view(100, 784) writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))
class SADQ_GQF(object):
    """Adaptive agent which uses the SADQ algorithm.

    Wraps a ``feature_q_model`` pair (eval/target) with an epsilon-greedy
    policy, a decomposed replay buffer, and TensorBoard summaries.

    Fix applied in this revision: the class previously defined
    ``load_model`` twice (identical bodies) and ``load_weight`` twice with
    *different* signatures; Python silently keeps only the last binding, so
    the one-argument ``load_weight(weight_dict)`` variant was dead code. The
    shadowed duplicates have been removed; the effective interface is
    unchanged.
    """

    def __init__(self, name, state_length, network_config, reinforce_config,
                 feature_len, combine_decomposed_func, is_sigmoid=False,
                 memory_resotre=True):
        # NOTE(review): `memory_resotre` (sic) is kept as-is — renaming the
        # keyword parameter would break existing callers.
        # `combine_decomposed_func` and `is_sigmoid` are accepted but never
        # read in this class (presumably consumed by subclasses — TODO confirm).
        super(SADQ_GQF, self).__init__()
        self.name = name
        #self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.memory = ReplayBuffer_decom(self.reinforce_config.memory_size)
        self.learning = True
        self.explanation = False
        self.state_length = state_length
        self.features = 0
        self.feature_len = feature_len

        # Global bookkeeping (persisted via save()/restore_state()).
        self.steps = 0
        self.reward_history = []
        self.episode_time_history = []
        self.best_reward_mean = -maxsize
        self.episode = 0
        self.feature_len = feature_len
        self.features = None  # reset() below leaves this as None
        self.reset()
        self.memory_resotre = memory_resotre

        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name
        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()
        self.summary = SummaryWriter(log_dir=reinforce_summary_path)

        self.eval_model = feature_q_model(name, state_length, self.feature_len,
                                          self.network_config.output_shape,
                                          network_config)
        self.target_model = feature_q_model(name, state_length, self.feature_len,
                                            self.network_config.output_shape,
                                            network_config)
        # self.target_model.eval_mode()

        self.beta_schedule = LinearSchedule(
            self.reinforce_config.beta_timesteps,
            initial_p=self.reinforce_config.beta_initial,
            final_p=self.reinforce_config.beta_final)
        self.epsilon_schedule = LinearSchedule(
            self.reinforce_config.epsilon_timesteps,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=self.reinforce_config.final_epsilon)

    # def __del__(self):
    #     self.save()
    #     self.summary.close()

    def should_explore(self):
        """Sample the exploration decision for the current step.

        Updates ``self.epsilon`` from the linear schedule, logs it, and
        returns True with probability epsilon.
        """
        self.epsilon = self.epsilon_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Epsilon' % self.name,
                                scalar_value=self.epsilon,
                                global_step=self.steps)
        return random.random() < self.epsilon

    def predict(self, state, isGreedy=False, is_random=False):
        """Choose an action for `state` and drive the learning loop.

        Side effects while learning: records the previous transition into
        replay memory, periodically syncs the target network, and runs
        update() on the configured cadence.

        Returns (choice, fv) where fv is the feature vector of the chosen
        row (None when the exploratory branch was taken).
        NOTE(review): `is_random` is accepted but unused — kept for
        interface compatibility.
        """
        if self.learning:
            self.steps += 1

        # Add the completed transition (previous_state -> state) to memory.
        if self.previous_state is not None and self.learning and self.current_reward is not None:
            state_crr = np.unique(state, axis=0)
            self.memory.add(self.previous_state, None, self.current_reward,
                            state_crr.reshape(-1, self.state_length), 0,
                            self.features)

        if self.learning and self.should_explore() and not isGreedy:
            # Explore: uniform choice over the candidate rows of `state`.
            q_values = None
            fv = None
            choice = random.choice(list(range(len(state))))
            action = choice
        else:
            # Exploit: greedy argmax over the eval model's Q-values.
            with torch.no_grad():
                features_vector, q_values = self.eval_model.predict_batch(
                    Tensor(state))
                q_values = FloatTensor(q_values).view(-1)
                _, choice = q_values.max(0)
                action = choice
                fv = features_vector[choice]

        # Periodically refresh the target network (soft replace when the
        # cadence is every step, hard replace otherwise).
        if self.learning and self.steps % self.reinforce_config.replace_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            if self.reinforce_config.replace_frequency != 1:
                self.target_model.replace(self.eval_model)
            else:
                self.target_model.replace_soft(self.eval_model)
            # self.target_model.eval_mode()

        if (self.learning
                and self.steps > self.reinforce_config.update_start
                and self.steps % self.reinforce_config.update_steps == 0):
            self.update_time -= time.time()
            self.update()
            self.update_time += time.time()

        self.current_reward = 0
        self.previous_state = state[action]
        #self.previous_action = action
        return choice, fv  #,q_values

    def disable_learning(self, is_save=False):
        """Stop learning; optionally force-save the models first."""
        logger.info("Disabled Learning for %s agent" % self.name)
        if is_save:
            # self.save()
            self.save(force=True)
        self.learning = False
        self.episode = 0

    def enable_learning(self):
        """Re-enable learning and reset per-episode state."""
        logger.info("enabled Learning for %s agent" % self.name)
        self.learning = True
        self.reset()

    def end_episode(self, state):
        """Close out an episode: log stats, store the terminal transition,
        save (if due), and reset per-episode state. No-op when not learning."""
        if not self.learning:
            return
        episode_time = time.time() - self.episode_time
        self.reward_history.append(self.total_reward)
        self.episode_time_history.append(episode_time)
        total_time = sum(self.episode_time_history)
        avg_time = total_time / len(self.episode_time_history)
        logger.info("End of Episode %d, "
                    "Total reward %.2f, "
                    "Epsilon %.2f" % (self.episode + 1, self.total_reward,
                                      self.epsilon))
        logger.debug("Episode Time: %.2fs (%.2fs), "
                     "Prediction Time: %.2f, "
                     "Update Time %.2f" % (episode_time, avg_time,
                                           self.prediction_time,
                                           self.update_time))
        self.episode += 1
        self.summary.add_scalar(tag='%s/Episode Reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)
        # Terminal flag (5th positional arg) is 1 here vs 0 in predict().
        self.memory.add(self.previous_state, None, self.current_reward,
                        state.reshape(-1, self.state_length), 1, self.features)
        self.save()
        self.reset()

    def reset(self):
        """Reset all per-episode counters, timers and cached state."""
        self.episode_time = time.time()
        self.current_reward = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None
        self.prediction_time = 0
        self.update_time = 0
        self.features = None

    def restore_state(self):
        """Restore steps/episode counters and replay memory from disk,
        if a checkpoint exists and memory restore is enabled."""
        restore_path = self.network_config.network_path + "/adaptive.info"
        if (self.network_config.network_path
                and os.path.exists(restore_path) and self.memory_resotre):
            logger.info("Restoring state from %s" %
                        self.network_config.network_path)
            with open(restore_path, "rb") as file:
                info = pickle.load(file)
            self.steps = info["steps"]
            # self.best_reward_mean = info["best_reward_mean"]
            self.episode = info["episode"]
            self.memory.load(self.network_config.network_path)
            print("lenght of memeory: ", len(self.memory))

    def save(self, force=False, appendix=""):
        """Persist networks, counters and replay memory.

        NOTE(review): in the current code the networks are written only when
        `force` is True — the reward-mean comparison is commented out, so a
        periodic (non-forced) call that reaches the save window only logs
        "not saving".
        """
        info = {
            "steps": self.steps,
            "best_reward_mean": self.best_reward_mean,
            "episode": self.episode
        }
        if (len(self.reward_history) >= self.network_config.save_steps
                and self.episode % self.network_config.save_steps == 0) or force:
            total_reward = sum(
                self.reward_history[-self.network_config.save_steps:])
            current_reward_mean = total_reward / self.network_config.save_steps
            if force:  #or current_reward_mean >= self.best_reward_mean:
                print("*************saved*****************",
                      current_reward_mean, self.best_reward_mean)
                if not force:
                    self.best_reward_mean = current_reward_mean
                logger.info("Saving network. Found new best reward (%.2f)"
                            % total_reward)
                self.eval_model.save_network(appendix=appendix)
                self.target_model.save_network(appendix=appendix)
                with open(self.network_config.network_path + "/adaptive.info",
                          "wb") as file:
                    pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL)
                self.memory.save(self.network_config.network_path)
                print("lenght of memeory: ", len(self.memory))
            else:
                logger.info("The best reward is still %.2f. Not saving"
                            % self.best_reward_mean)

    def reward(self, r):
        """Accumulate reward `r` into the episode and transition totals."""
        self.total_reward += r
        self.current_reward += r

    def passFeatures(self, features):
        """Store a copy of an externally computed feature vector; it is
        attached to the next transition written to replay memory."""
        self.features = features.copy()
        return

    def summary_test(self, reward, epoch):
        """Log an evaluation reward (x-axis scaled by 40 epochs per point)."""
        self.summary.add_scalar(tag='%s/eval reward' % self.name,
                                scalar_value=reward, global_step=epoch * 40)

    def summary_GVFs_loss(self, loss, epoch):
        """Log the GVF loss (x-axis scaled by 40 epochs per point)."""
        self.summary.add_scalar(tag='%s/GVFs loss' % self.name,
                                scalar_value=loss, global_step=epoch * 40)

    def update(self):
        """One gradient step: sample a batch, build Q and feature targets
        from the target network, scrub NaNs, and fit the eval model."""
        if len(self.memory._storage) <= self.reinforce_config.batch_size:
            return
        # self.eval_model.train_mode()
        beta = self.beta_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Beta' % self.name,
                                scalar_value=beta, global_step=self.steps)

        if self.reinforce_config.use_prior_memory:
            batch = self.memory.sample(self.reinforce_config.batch_size, beta)
            (states, actions, reward, next_states,
             is_terminal, weights, batch_idxes) = batch
            self.summary.add_histogram(tag='%s/Batch Indices' % self.name,
                                       values=Tensor(batch_idxes),
                                       global_step=self.steps)
        else:
            batch = self.memory.sample(self.reinforce_config.batch_size)
            (states, actions, reward, next_states,
             is_terminal, features_vector) = batch

        states = FloatTensor(states)
        # next_states stays a per-sample list: each entry may hold a variable
        # number of candidate next rows, handled one-by-one below.
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        features_vector = FloatTensor(features_vector)
        batch_index = torch.arange(self.reinforce_config.batch_size,
                                   dtype=torch.long)

        # Current Q values from the eval model.
        feature_values, q_values = self.eval_model.predict_batch(states)
        q_values = q_values.flatten()

        q_max = []
        f_max = []
        for i, ns in enumerate(next_states):
            feature_n, q_n = self.target_model.predict_batch(
                FloatTensor(ns).view(-1, self.state_length))
            q_value_max, idx = q_n.max(0)
            features_max = feature_n[idx]
            q_max.append(q_value_max)
            if self.network_config.version in ["v10", "v11"]:
                # Rescale feature groups by the ratio of unit counts between
                # the chosen next state and the current state (columns 63-66
                # hold those counts — presumably; TODO confirm schema).
                features_max[:, :3] = (features_max[:, :3] * ns[idx, 65]) / states[i, 65]
                features_max[:, 3:6] = (features_max[:, 3:6] * ns[idx, 66]) / states[i, 66]
                features_max[:, 6:9] = (features_max[:, 6:9] * ns[idx, 63]) / states[i, 63]
                features_max[:, 9:12] = (features_max[:, 9:12] * ns[idx, 64]) / states[i, 64]
                # Division by a zero count yields inf; zero those out.
                features_max[features_max == float('inf')] = 0
            f_max.append(features_max.view(-1))

        q_max = torch.stack(q_max, dim=1).view(-1)
        f_max = torch.stack(f_max)
        # Mask out bootstrap terms for terminal transitions.
        q_max = (1 - terminal) * q_max
        f_max = (1 - terminal.view(-1, 1)) * f_max

        q_target = reward + self.reinforce_config.discount_factor * q_max
        f_target = features_vector + self.reinforce_config.discount_factor * f_max

        # NaN guard: `x != x` is true only for NaN entries.
        if (torch.sum(feature_values != feature_values).item()
                + torch.sum(f_target != f_target)).item() > 0:
            f_target[f_target != f_target] = 0
        self.eval_model.fit(q_values, q_target, feature_values, f_target)

        # Update priorities when prioritized replay is in use.
        if self.reinforce_config.use_prior_memory:
            td_errors = q_values - q_target
            new_priorities = torch.abs(td_errors) + 1e-6  # prioritized_replay_eps
            self.memory.update_priorities(batch_idxes, new_priorities.data)

    def load_model(self, model):
        """Replace the eval model's weights with those of `model`."""
        self.eval_model.replace(model)

    def load_weight(self, new_feature_weights, new_q_weights):
        """Load state dicts into the feature and Q sub-models of the eval
        model. (The old one-argument overload was shadowed dead code and
        has been removed.)"""
        self.eval_model.feautre_model.load_state_dict(new_feature_weights)
        self.eval_model.q_model.load_state_dict(new_q_weights)
val_losses.append(val_epoch_loss) val_accuracy.append(val_epoch_accuracy) writer_train.add_scalars("losses", { 'train_ln': epoch_loss, 'val_ln': val_epoch_loss }, int(epoch)) writer_train.add_scalars("accuracies", { 'train_ln': epoch_accuracy, 'val_ln': val_epoch_accuracy }, int(epoch)) # Learning rate scheduler update scheduler.step(val_epoch_loss) writer_train.add_histogram("error_ln", np.array(train_losses)) elapsed = clock() - start print(elapsed) # ----------------------------------------------------------------------------------- # Model classification metrics classes = [ 'A', 'B', 'C', 'D', 'del', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'nothing', 'O', 'P', 'Q', 'R', 'S', 'space', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' ] correct = 0
loss.backward() optimizer.step() if epoch % 30 == 0: writeIntermediateState(0, model, epoch, nx, ny, log_writer, coordinateSystem) writeIntermediateState(100, model, epoch, nx, ny, log_writer, coordinateSystem) writeIntermediateState(200, model, epoch, nx, ny, log_writer, coordinateSystem) writeIntermediateState(250, model, epoch, nx, ny, log_writer, coordinateSystem) writeIntermediateState(300, model, epoch, nx, ny, log_writer, coordinateSystem) writeIntermediateState(400, model, epoch, nx, ny, log_writer, coordinateSystem) writeIntermediateState(500, model, epoch, nx, ny, log_writer, coordinateSystem) writeValidationLoss(model, log_writer, 250, epoch, coordinateSystem) writeValidationLoss(model, log_writer, 500, epoch, coordinateSystem) sys.stdout.flush() print("PDE Loss at Epoch: ", epoch + 1, loss.item()) if log_writer: log_writer.add_histogram( 'First Layer Grads', model.lin_layers[0].weight.grad.view(-1, 1), epoch) save_checkpoint(model, optimizer, modelPath, epoch)
def main():
    """Entry point: build an SSL video-classification model (C3D backbone),
    split the dataset, log one sample batch plus initial parameter
    histograms to TensorBoard, then run the train/validate loop with
    best-accuracy / best-loss / periodic checkpointing.

    Relies on module-level globals not visible here: `params` (config dict),
    `save_path`, and the `train` / `validation` functions — TODO confirm.
    """
    base = c3d.C3D(with_classifier=False)
    model = ssl_net.SSLNET(base, with_classifier=True, num_classes=12)
    start_epoch = 1
    # pretrain_weight = loadcontinur_weights(pretrain_path)
    # model.load_state_dict(pretrain_weight, strict=False)
    # train
    train_dataset = UntrimmedVideoDataset(params['root'], mode="train")
    # Fixed validation-split size per dataset.
    if params['data'] == 'UCF-101':
        val_size = 800
    elif params['data'] == 'hmdb':
        val_size = 400
    elif params['data'] == 'Thumos14':
        val_size = 400
    # NOTE(review): an unrecognized params['data'] leaves val_size unbound
    # and raises NameError on the next line.
    train_dataset, val_dataset = random_split(
        train_dataset, (len(train_dataset) - val_size, val_size))
    print("num_works:{:d}".format(params['num_workers']))
    print("batch_size:{:d}".format(params['batch_size']))
    train_loader = DataLoader(train_dataset,
                              batch_size=params['batch_size'],
                              shuffle=True,
                              num_workers=params['num_workers'])
    val_loader = DataLoader(val_dataset,
                            batch_size=params['batch_size'],
                            shuffle=True,
                            num_workers=params['num_workers'])
    model = nn.DataParallel(model)  #multi-gpu
    model = model.cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=params['learning_rate'],
                          momentum=params['momentum'],
                          weight_decay=params['weight_decay'])
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                     min_lr=1e-5,
                                                     patience=50,
                                                     factor=0.1)
    #pretrain_model = pretrain_path.split('/')[-1].split('.')[0] + 'pth'
    model_save_dir = os.path.join(save_path,
                                  '_' + time.strftime('%m-%d-%H-%M'))
    writer = SummaryWriter(model_save_dir)
    # Log exactly one batch (video clip + labels) for visual inspection,
    # then break out of the loop.
    for data in train_loader:
        clip, label = data
        writer.add_video('train/clips', clip, 0, fps=8)
        writer.add_text('train/idx', str(label.tolist()), 0)
        clip = clip.cuda()
        #writer.add_graph(model, (clip, clip));
        break
    # Initial (step-0) parameter histograms.
    for name, param in model.named_parameters():
        writer.add_histogram('params/{}'.format(name), param, 0)
    # NOTE(review): SummaryWriter above may already have created this dir.
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    prev_best_val_loss = float('inf')
    prev_best_loss_model_path = None
    prev_best_acc_model_path = None
    best_acc = 0
    best_epoch = 0
    for epoch in tqdm(range(start_epoch, start_epoch + params['epoch_num'])):
        train(train_loader, model, criterion, optimizer, epoch, writer)
        val_loss, top1_avg = validation(val_loader, model, criterion,
                                        optimizer, epoch)
        # Checkpoint on new best top-1 accuracy.
        if top1_avg >= best_acc:
            best_acc = top1_avg
            best_epoch = epoch
            model_path = os.path.join(
                model_save_dir, 'best_acc_model_{}.pth.tar'.format(epoch))
            torch.save(model.state_dict(), model_path)
            prev_best_acc_model_path = model_path
        # Checkpoint on new best validation loss.
        if val_loss < prev_best_val_loss:
            model_path = os.path.join(
                model_save_dir, 'best_loss_model_{}.pth.tar'.format(epoch))
            torch.save(model.state_dict(), model_path)
            prev_best_val_loss = val_loss
            prev_best_loss_model_path = model_path
        scheduler.step(val_loss)
        # Unconditional snapshot every 20 epochs.
        if epoch % 20 == 0:
            checkpoints = os.path.join(model_save_dir,
                                       str(epoch) + ".pth.tar")
            torch.save(model.state_dict(), checkpoints)
            print("save_to:", checkpoints)
    print("best is :", best_acc, best_epoch)
def main():
    """Entry point for Invariance-Propagation self-supervised training.

    Parses CLI args, builds dataset loaders / network / optimizer / memory
    bank / criteria, optionally restores a pretrained checkpoint, then runs
    the epoch loop with kNN evaluation, checkpointing, and TensorBoard
    logging. Ctrl-C saves results before exiting.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--stage', default='train', type=str)
    parser.add_argument('--gpus', default='0,1,2,3', type=str)
    parser.add_argument('--max_epoch', default=200, type=int)
    parser.add_argument('--lr_decay_steps', default='160,190,200', type=str)
    parser.add_argument('--exp', default='', type=str)
    parser.add_argument('--res_path', default='', type=str)
    parser.add_argument('--resume_path', default='', type=str)
    parser.add_argument('--pretrain_path', default='', type=str)
    parser.add_argument('--dataset', default='imagenet', type=str)
    parser.add_argument('--lr', default=0.03, type=float)
    parser.add_argument('--lr_decay_rate', default=0.1, type=float)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--weight_decay', default=5e-4, type=float)
    parser.add_argument('--n_workers', default=32, type=int)
    parser.add_argument('--n_background', default=4096, type=int)
    parser.add_argument('--t', default=0.07, type=float)
    parser.add_argument('--m', default=0.5, type=float)
    parser.add_argument('--dropout', action='store_true')
    parser.add_argument('--blur', action='store_true')
    parser.add_argument('--cos', action='store_true')
    parser.add_argument('--network', default='resnet18', type=str)
    parser.add_argument('--mix', action='store_true')
    parser.add_argument('--not_hardpos', action='store_true')
    parser.add_argument('--InvP', type=int, default=1)
    parser.add_argument('--ramp_up', default='binary', type=str)
    parser.add_argument('--lam_inv', default=0.6, type=float)
    parser.add_argument('--lam_mix', default=1.0, type=float)
    parser.add_argument('--diffusion_layer', default=3, type=int)
    # for cifar 10 the best diffusion_layer is 3 and cifar 100 is 4
    # for imagenet I have only tested when diffusion_layer = 3
    parser.add_argument('--K_nearst', default=4, type=int)
    parser.add_argument('--n_pos', default=50, type=int)
    # for cifar10 the best n_pos is 20, for cifar 100 the best is 10 or 20
    parser.add_argument('--exclusive', default=1, type=int)
    parser.add_argument('--nonlinearhead', default=0, type=int)
    # exclusive best to be 0

    global args
    args = parser.parse_args()

    # Derive a run identifier from the hyperparameters that matter.
    exp_identifier = get_expidentifier([
        'mix', 'network', 'lam_inv', 'lam_mix', 'diffusion_layer',
        'K_nearst', 'n_pos', 'exclusive', 'max_epoch', 'ramp_up',
        'nonlinearhead', 't', 'weight_decay'
    ], args)
    if not args.InvP:
        exp_identifier = 'hard'
    args.exp = os.path.join(args.exp, exp_identifier)

    # Create the experiment directory tree.
    if not os.path.exists(args.exp):
        os.makedirs(args.exp)
    if not os.path.exists(os.path.join(args.exp, 'runs')):
        os.makedirs(os.path.join(args.exp, 'runs'))
    if not os.path.exists(os.path.join(args.exp, 'models')):
        os.makedirs(os.path.join(args.exp, 'models'))
    if not os.path.exists(os.path.join(args.exp, 'logs')):
        os.makedirs(os.path.join(args.exp, 'logs'))

    logger = getLogger(args.exp)
    device_ids = list(map(lambda x: int(x), args.gpus.split(',')))
    # NOTE(review): 'cuda: 0' contains a space — torch tolerates it, but
    # the conventional form is 'cuda:0'; confirm before changing.
    device = torch.device('cuda: 0')

    # Dataset-specific loader factories.
    if args.dataset.startswith('cifar'):
        train_loader, val_loader, train_ordered_labels, train_dataset, val_dataset = cifar.get_dataloader(
            args)
    elif args.dataset.startswith('imagenet'):
        train_loader, val_loader, train_ordered_labels, train_dataset, val_dataset = imagenet.get_instance_dataloader(
            args)
    elif args.dataset == 'svhn':
        train_loader, val_loader, train_ordered_labels, train_dataset, val_dataset = svhn.get_dataloader(
            args)

    # create model
    # NOTE(review): the first branch is `if`, the second starts a new
    # `if/elif` chain — 'alexnet' still works (no later branch matches),
    # but the chain should arguably be a single if/elif ladder.
    if args.network == 'alexnet':
        network = alexnet(128)
    if args.network == 'alexnet_cifar':
        network = AlexNet_cifar(128)
    elif args.network == 'resnet18_cifar':
        network = ResNet18_cifar(128, dropout=args.dropout,
                                 non_linear_head=args.nonlinearhead)
    elif args.network == 'resnet50_cifar':
        network = ResNet50_cifar(128, dropout=args.dropout)
    elif args.network == 'wide_resnet28':
        network = WideResNetInstance(28, 2)
    elif args.network == 'resnet18':
        network = resnet18(non_linear_head=args.nonlinearhead)
    elif args.network == 'pre-resnet18':
        network = PreActResNet18(128)
    elif args.network == 'resnet50':
        network = resnet50(non_linear_head=args.nonlinearhead)
    elif args.network == 'pre-resnet50':
        network = PreActResNet50(128)
    network = nn.DataParallel(network, device_ids=device_ids)
    network.to(device)

    # create optimizer
    if args.network == 'pre-resnet18' or args.network == 'pre-resnet50':
        logging.info(
            colorful(
                'Exclude bns from weight decay, copied from LocalAggregation proposed by Zhuang et al [ICCV 2019]'
            ))
        parameters = exclude_bn_weight_bias_from_weight_decay(
            network, weight_decay=args.weight_decay)
    else:
        parameters = network.parameters()

    optimizer = torch.optim.SGD(
        parameters,
        lr=args.lr,
        momentum=0.9,
        weight_decay=args.weight_decay,
    )
    cudnn.benchmark = True

    # create memory_bank
    global writer
    writer = SummaryWriter(comment='InvariancePropagation',
                           logdir=os.path.join(args.exp, 'runs'))
    memory_bank = objective.MemoryBank_v1(len(train_dataset),
                                          train_ordered_labels, writer,
                                          device, m=args.m)

    # create criterion
    criterionA = objective.InvariancePropagationLoss(
        args.t,
        diffusion_layer=args.diffusion_layer,
        k=args.K_nearst,
        n_pos=args.n_pos,
        exclusive=args.exclusive,
        InvP=args.InvP,
        hard_pos=(not args.not_hardpos))
    criterionB = objective.MixPointLoss(args.t)

    # Epoch-indexed loss ramp-up schedule.
    if args.ramp_up == 'binary':
        ramp_up = lambda i_epoch: objective.BinaryRampUp(i_epoch, 30)
    elif args.ramp_up == 'gaussian':
        ramp_up = lambda i_epoch: objective.GaussianRampUp(i_epoch, 30, 5)
    elif args.ramp_up == 'zero':
        ramp_up = lambda i_epoch: 1

    logging.info(beautify(args))
    start_epoch = 0
    if args.pretrain_path != '' and args.pretrain_path != 'none':
        # Resume from a pretrained checkpoint.
        logging.info('loading pretrained file from {}'.format(
            args.pretrain_path))
        checkpoint = torch.load(args.pretrain_path)
        network.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        _memory_bank = checkpoint['memory_banks']
        # NOTE(review): bare `except:` — a missing 'neigh' key raises
        # KeyError, but this also swallows any other error here; consider
        # narrowing to `except KeyError:`.
        try:
            _neigh = checkpoint['neigh']
            memory_bank.neigh = _neigh
        except:
            logging.info(
                colorful(
                    'The Pretrained Path has No NEIGH and require a epoch to re-calculate'
                ))
        memory_bank.points = _memory_bank
        start_epoch = checkpoint['epoch']
    else:
        initialize_memorybank(network, train_loader, device, memory_bank)

    logging.info('start training')
    best_acc = 0.0
    try:
        for i_epoch in range(start_epoch, args.max_epoch):
            adjust_learning_rate(args.lr,
                                 args.lr_decay_steps,
                                 optimizer,
                                 i_epoch,
                                 lr_decay_rate=args.lr_decay_rate,
                                 cos=args.cos,
                                 max_epoch=args.max_epoch)
            train(i_epoch, network, criterionA, criterionB, optimizer,
                  train_loader, device, memory_bank, ramp_up)

            # Always write the rolling checkpoint for this epoch.
            save_name = 'checkpoint.pth'
            checkpoint = {
                'epoch': i_epoch + 1,
                'state_dict': network.state_dict(),
                'optimizer': optimizer.state_dict(),
                'memory_banks': memory_bank.points,
                'neigh': memory_bank.neigh,
            }
            torch.save(checkpoint, os.path.join(args.exp, 'models', save_name))

            # scheduler.step()
            # validate(network, memory_bank, val_loader, train_ordered_labels, device)
            # Evaluate with weighted kNN over the memory bank.
            acc = kNN(i_epoch,
                      network,
                      memory_bank,
                      val_loader,
                      train_ordered_labels,
                      K=200,
                      sigma=0.07)
            if acc >= best_acc:
                best_acc = acc
                torch.save(checkpoint,
                           os.path.join(args.exp, 'models', 'best.pth'))
            # Milestone snapshots at fixed epochs.
            if i_epoch in [30, 60, 120, 160, 200, 400, 600]:
                torch.save(
                    checkpoint,
                    os.path.join(args.exp, 'models',
                                 '{}.pth'.format(i_epoch + 1)))

            args.y_best_acc = best_acc
            logging.info(
                colorful('[Epoch: {}] val acc: {:.4f}'.format(i_epoch, acc)))
            logging.info(
                colorful('[Epoch: {}] best acc: {:.4f}'.format(
                    i_epoch, best_acc)))
            writer.add_scalar('acc', acc, i_epoch + 1)
            # Per-epoch parameter histograms (batch-norm params excluded).
            with torch.no_grad():
                for name, param in network.named_parameters():
                    if 'bn' not in name:
                        writer.add_histogram(name, param, i_epoch)
            # cluster
    except KeyboardInterrupt as e:
        # Persist results even on manual interruption.
        logging.info('KeyboardInterrupt at {} Epochs'.format(i_epoch))
        save_result(args)
        exit()
    save_result(args)
class DQNAdaptive(object):
    """Adaptive which uses the DQN algorithm.

    Epsilon-greedy agent over a discrete `choices` list, backed by a pair
    of DQNModel networks (eval/target), a prioritized replay buffer, and
    TensorBoard summaries. Counters are checkpointed to
    `<network_path>/adaptive.info` via pickle.
    """

    def __init__(self, name, choices, network_config, reinforce_config):
        super(DQNAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        # 0.6 is the prioritization exponent (alpha) for the replay buffer.
        self.memory = PrioritizedReplayBuffer(self.reinforce_config.memory_size,
                                              0.6)
        self.learning = True
        self.explanation = False

        # Global bookkeeping (persisted via save()/restore_state()).
        self.steps = 0
        self.reward_history = []
        self.episode_time_history = []
        self.best_reward_mean = -maxsize
        self.episode = 0

        self.reset()

        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name
        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()
        self.summary = SummaryWriter(log_dir=reinforce_summary_path)

        self.target_model = DQNModel(self.name + "_target",
                                     self.network_config, use_cuda)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config,
                                   use_cuda)

        self.beta_schedule = LinearSchedule(self.reinforce_config.beta_timesteps,
                                            initial_p=self.reinforce_config.beta_initial,
                                            final_p=self.reinforce_config.beta_final)
        self.epsilon_schedule = LinearSchedule(self.reinforce_config.epsilon_timesteps,
                                               initial_p=self.reinforce_config.starting_epsilon,
                                               final_p=self.reinforce_config.final_epsilon)

    def __del__(self):
        # NOTE(review): saving from __del__ is fragile — interpreter
        # shutdown may have torn down globals this relies on; confirm this
        # is intentional.
        self.save()
        self.summary.close()

    def should_explore(self):
        """Sample the exploration decision for the current step.

        Updates ``self.epsilon`` from the linear schedule, logs it, and
        returns True with probability epsilon.
        """
        self.epsilon = self.epsilon_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Epsilon' % self.name,
                                scalar_value=self.epsilon,
                                global_step=self.steps)
        return random.random() < self.epsilon

    def predict(self, state):
        """Choose an action for `state`; returns (choice, q_values).

        q_values is None when the exploratory branch was taken. While
        learning, also records the previous transition, periodically syncs
        the target model, and runs update() on the configured cadence.
        """
        self.steps += 1

        # add to experience: the completed previous transition, non-terminal.
        if self.previous_state is not None:
            self.memory.add(self.previous_state, self.previous_action,
                            self.current_reward, state, 0)

        if self.learning and self.should_explore():
            # Explore: uniform choice over the discrete action list.
            q_values = None
            choice = random.choice(self.choices)
            action = self.choices.index(choice)
        else:
            # Exploit: greedy action from the eval model (timed).
            self.prediction_time -= time.time()
            _state = Tensor(state).unsqueeze(0)
            action, q_values = self.eval_model.predict(_state, self.steps,
                                                       self.learning)
            choice = self.choices[action]
            self.prediction_time += time.time()

        # Hard-replace the target network on the configured cadence.
        if self.learning and self.steps % self.reinforce_config.replace_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        if (self.learning
                and self.steps > self.reinforce_config.update_start
                and self.steps % self.reinforce_config.update_steps == 0):
            self.update_time -= time.time()
            self.update()
            self.update_time += time.time()

        self.current_reward = 0
        self.previous_state = state
        self.previous_action = action
        return choice, q_values

    def disable_learning(self, is_save = True):
        """Stop learning; optionally save models/counters first."""
        logger.info("Disabled Learning for %s agent" % self.name)
        if is_save:
            self.save()
        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        """Close out an episode: log stats, store the terminal transition,
        save (if due), and reset per-episode state. No-op when not learning."""
        if not self.learning:
            return
        episode_time = time.time() - self.episode_time
        self.reward_history.append(self.total_reward)
        self.episode_time_history.append(episode_time)
        total_time = sum(self.episode_time_history)
        avg_time = total_time / len(self.episode_time_history)
        logger.info("End of Episode %d, "
                    "Total reward %.2f, "
                    "Epsilon %.2f" % (self.episode + 1, self.total_reward,
                                      self.epsilon))
        logger.debug("Episode Time: %.2fs (%.2fs), "
                     "Prediction Time: %.2f, "
                     "Update Time %.2f" % (episode_time, avg_time,
                                           self.prediction_time,
                                           self.update_time))
        self.episode += 1
        self.summary.add_scalar(tag='%s/Episode Reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)
        # Terminal flag (last positional arg) is 1 here vs 0 in predict().
        self.memory.add(self.previous_state, self.previous_action,
                        self.current_reward, state, 1)
        self.save()
        self.reset()

    def reset(self):
        """Reset all per-episode counters, timers and cached state."""
        self.episode_time = time.time()
        self.current_reward = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None
        self.prediction_time = 0
        self.update_time = 0

    def restore_state(self):
        """Restore steps/best-reward/episode counters from the pickled
        `adaptive.info` checkpoint, if present."""
        restore_path = self.network_config.network_path + "/adaptive.info"
        if self.network_config.network_path and os.path.exists(restore_path):
            logger.info("Restoring state from %s" %
                        self.network_config.network_path)
            with open(restore_path, "rb") as file:
                info = pickle.load(file)
            self.steps = info["steps"]
            self.best_reward_mean = info["best_reward_mean"]
            self.episode = info["episode"]

    def save(self, force=False):
        """Persist networks and counters when the recent mean reward beats
        the best seen so far.

        NOTE(review): the `force` parameter is accepted but never read in
        this body — confirm whether forced saves were intended here.
        """
        info = {
            "steps": self.steps,
            "best_reward_mean": self.best_reward_mean,
            "episode": self.episode
        }
        if (len(self.reward_history) >= self.network_config.save_steps
                and self.episode % self.network_config.save_steps == 0):
            total_reward = sum(
                self.reward_history[-self.network_config.save_steps:])
            current_reward_mean = total_reward / self.network_config.save_steps
            if current_reward_mean >= self.best_reward_mean:
                self.best_reward_mean = current_reward_mean
                logger.info("Saving network. Found new best reward (%.2f)"
                            % current_reward_mean)
                self.eval_model.save_network()
                self.target_model.save_network()
                with open(self.network_config.network_path + "/adaptive.info",
                          "wb") as file:
                    pickle.dump(info, file,
                                protocol=pickle.HIGHEST_PROTOCOL)
            else:
                logger.info("The best reward is still %.2f. Not saving"
                            % self.best_reward_mean)

    def reward(self, r):
        """Accumulate reward `r` into the episode and transition totals."""
        self.total_reward += r
        self.current_reward += r

    def update(self):
        """One gradient step: sample a prioritized batch, build the DQN
        target from the target network, fit the eval model, and refresh
        replay priorities from the TD errors."""
        if self.steps <= self.reinforce_config.batch_size:
            return
        beta = self.beta_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Beta' % self.name,
                                scalar_value=beta, global_step=self.steps)

        batch = self.memory.sample(self.reinforce_config.batch_size, beta)
        (states, actions, reward, next_states,
         is_terminal, weights, batch_idxes) = batch
        self.summary.add_histogram(tag='%s/Batch Indices' % self.name,
                                   values=Tensor(batch_idxes),
                                   global_step=self.steps)

        states = FloatTensor(states)
        next_states = FloatTensor(next_states)
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.reinforce_config.batch_size,
                                   dtype=torch.long)

        # Current Q Values for the actions actually taken.
        q_actions, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target.
        # NOTE(review): this rebinds `actions` with the target model's
        # greedy actions; harmless here (the batch actions were already
        # consumed above) but worth renaming if this method is touched.
        actions, q_next = self.target_model.predict_batch(next_states)
        q_max = q_next.max(1)[0].detach()
        # Mask out bootstrap terms for terminal transitions.
        q_max = (1 - terminal) * q_max
        q_target = reward + self.reinforce_config.discount_factor * q_max

        # update model
        self.eval_model.fit(q_values, q_target, self.steps)

        # Update priorities from absolute TD errors.
        td_errors = q_values - q_target
        new_priorities = torch.abs(td_errors) + 1e-6  # prioritized_replay_eps
        self.memory.update_priorities(batch_idxes, new_priorities.data)
def train_model(model, datasets, batch_size, epochs, learning_rate,
                weight_decay=0, metadata=None, weights=None, checkpoint=None):
    """Train a sequence model on the Emoji Dataset.

    Args:
        model (torch.nn.Module): the model to be trained
        datasets (tuple): contains 3 datasets (TweetsBaseDataset)
            corresponding to train, dev and test splits
        batch_size (int): mini-batch size for training
        epochs (int): number of iterations over the training set
        learning_rate (float): used in the optimizer
        weight_decay (float): regularization factor for the optimizer
        metadata (dict): contains keys and values of any type with a valid
            string representation, which are saved for visualization in
            TensorBoard. Use to log model name and hyperparameters
        weights (dict): maps strings to weights (torch.tensor) to be
            visualized as histograms in TensorBoard
        checkpoint (str): path of an existing checkpoint (.pt) file

    Returns:
        tuple, containing best validation F1 score and test F1 score
    """
    train_set, dev_set, test_set = datasets
    train_loader = DataLoader(train_set, batch_size, shuffle=True,
                              num_workers=4,
                              collate_fn=TweetsBaseDataset.collate_fn)

    model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)

    # Optionally resume model/optimizer state from an earlier run.
    if checkpoint is not None:
        load_training_state(model, optimizer, checkpoint, eval_model=False)

    # A writer to save TensorBoard events
    # NOTE(review): the writer is never closed, so the last events may not be
    # flushed to disk — consider writer.close() in a finally block.
    writer = SummaryWriter()
    logdir = writer.file_writer.get_logdir()

    # Write hyperparameters to summary
    if metadata is None:
        metadata = {}
    metadata['Batch size'] = batch_size
    metadata['Learning rate'] = learning_rate
    text_summary = _build_text_summary(metadata)
    writer.add_text('metadata', text_summary)

    best_score = 0
    test_f1 = 0
    # Symlink that always points at the best checkpoint of this run.
    best_ckpt_link = os.path.join(logdir, 'best-ckpt.pt')

    # The whole loop runs inside try/except so Ctrl-C returns the scores
    # collected so far instead of losing the run.
    try:
        steps = 0
        for epoch in range(1, epochs + 1):
            model.train()
            print('Epoch {:d}/{:d}'.format(epoch, epochs))
            n_batches = 0
            for inputs, labels, lengths, indices in train_loader:
                steps += 1
                n_batches += 1
                inputs = inputs.to(device)
                labels = labels.to(device)
                lengths = lengths.to(device)

                # Initialize the gradients to zero
                optimizer.zero_grad()

                # Run the model
                outputs = model(inputs, lengths)
                loss = criterion(outputs, labels)

                # Optimize
                loss.backward()
                optimizer.step()

                # Log scores on training set (every 100 mini-batches)
                if n_batches % 100 == 0:
                    f1 = _get_score(outputs, labels)
                    print("\r{}/{}: loss = {:.4f}, f1_score = {:.4f}".format(
                        n_batches, len(train_loader), loss, f1),
                        end='', flush=True)

                    # Write metrics to TensorBoard
                    writer.add_scalar('training/loss', loss, steps)
                    writer.add_scalar('training/f1_score', f1, steps)

                    # Write histograms of any externally supplied weights
                    if weights is not None:
                        for name, data in weights.items():
                            writer.add_histogram('weights/' + name, data,
                                                 steps)

            # Evaluate on dev set
            eval_loss, eval_f1 = evaluate(model, criterion, dev_set)
            print("\nvalidation loss = {:.4f}, validation f1_score = {:.4f}".
                  format(eval_loss, eval_f1))

            # Write to Tensorboard
            writer.add_scalar('validation/loss', eval_loss, steps)
            writer.add_scalar('validation/f1_score', eval_f1, steps)

            # Save the checkpoint
            ckpt_path = os.path.join(logdir, 'ckpt-{:d}.pt'.format(epoch))
            save_model(model, optimizer, epoch, ckpt_path)

            # Create a symbolic link to the best model
            if eval_f1 > best_score:
                best_score = eval_f1
                if os.path.islink(best_ckpt_link):
                    os.unlink(best_ckpt_link)
                os.symlink(os.path.basename(ckpt_path), best_ckpt_link)

        print("Training Completed. Evaluating on test set...")

        # Evaluate on test set
        test_loss, test_f1 = evaluate(model, criterion, test_set)
        _, test_precision = evaluate(model, criterion, test_set,
                                     score="precision")
        _, test_recall = evaluate(model, criterion, test_set, score="recall")
        print(
            "\ntest loss = {:.4f}, test f1_score = {:.4f}, test precision = {:.4f}, test recall = {:.4f}"
            .format(test_loss, test_f1, test_precision, test_recall))

        # Write to Tensorboard (step 0: test metrics are logged once)
        writer.add_scalar('test/loss', test_loss, 0)
        writer.add_scalar('test/f1_score', test_f1, 0)
    except KeyboardInterrupt:
        print('Interrupted training.')

    return best_score, test_f1
# For testing just do everything in one giant batch testloader = torch.utils.data.DataLoader( dataset_test, batch_size=len(dataset_test), shuffle=False, num_workers=0, ) model = FeedForward(dim=dataset_train.dim, hidden_size=args.hidden_size, output_size=dataset_train.dim) # Open a tensorboard writer if a logging directory is given if args.logdir != '': current_time = datetime.now().strftime('%b%d_%H-%M-%S') save_dir = osp.join(args.logdir, current_time) writer = SummaryWriter(log_dir=save_dir) if args.weight_histogram: # Log the initial parameters for name, param in model.named_parameters(): writer.add_histogram('parameters/' + name, param.clone().cpu().data.numpy(), 0) mse_criterion = nn.MSELoss() cosine_criterion = nn.CosineEmbeddingLoss() optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) for e in range(args.epochs): print('Epoch: {0}'.format(e + 1)) avg_mse_loss = 0 avg_cosine_loss = 0 n_batches = 0 for i, data in enumerate(trainloader): noisy, clean = data
single_mt.eval() eval_x, eval_y = dataset.slide_seq2seq_batch( 2, config.max_seq, 'eval') eval_x = torch.from_numpy(eval_x).contiguous().to(config.device, dtype=torch.int) eval_y = torch.from_numpy(eval_y).contiguous().to(config.device, dtype=torch.int) eval_preiction, weights = single_mt.forward(eval_x) eval_metrics = eval_metric_set(eval_preiction, eval_y) torch.save(single_mt.state_dict(), args.model_dir + '/train-{}.pth'.format(e)) if b == 0: train_summary_writer.add_histogram("target_analysis", batch_y, global_step=e) train_summary_writer.add_histogram("source_analysis", batch_x, global_step=e) for i, weight in enumerate(weights): attn_log_name = "attn/layer-{}".format(i) utils.attention_image_summary(attn_log_name, weight, step=idx, writer=eval_summary_writer) eval_summary_writer.add_scalar('loss', eval_metrics['loss'], global_step=idx) eval_summary_writer.add_scalar('accuracy',
class TBXLogger(Logger):
    """TensorBoardX Logger.

    Note that hparams will be written only after a trial has terminated.
    This logger automatically flattens nested dicts to show on TensorBoard:

        {"a": {"b": 1, "c": 2}} -> {"a/b": 1, "a/c": 2}
    """

    def _init(self):
        # Import lazily so a missing tensorboardX only fails when this
        # logger is actually instantiated.
        try:
            from tensorboardX import SummaryWriter
        except ImportError:
            logger.error("pip install 'ray[tune]' to see TensorBoard files.")
            raise
        self._file_writer = SummaryWriter(self.logdir, flush_secs=30)
        self.last_result = None

    def on_result(self, result):
        """Flatten `result` and write its numeric/list values as TBX events."""
        # Prefer total timesteps for the x-axis; fall back to iteration count.
        step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]

        tmp = result.copy()
        for k in [
                "config", "pid", "timestamp", TIME_TOTAL_S, TRAINING_ITERATION
        ]:
            if k in tmp:
                del tmp[k]  # not useful to log these

        flat_result = flatten_dict(tmp, delimiter="/")
        path = ["ray", "tune"]
        valid_result = {}
        for attr, value in flat_result.items():
            full_attr = "/".join(path + [attr])
            if type(value) in VALID_SUMMARY_TYPES:
                # Scalars go straight to add_scalar.
                valid_result[full_attr] = value
                self._file_writer.add_scalar(full_attr, value,
                                             global_step=step)
            elif type(value) is list and len(value) > 0:
                # Non-empty lists become histograms (best effort).
                valid_result[full_attr] = value
                try:
                    self._file_writer.add_histogram(full_attr, value,
                                                    global_step=step)
                # In case TensorboardX still doesn't think it's a valid value
                # (e.g. `[[]]`), warn and move on.
                except (ValueError, TypeError):
                    if log_once("invalid_tbx_value"):
                        logger.warning(
                            "You are trying to log an invalid value ({}={}) "
                            "via {}!".format(full_attr, value,
                                             type(self).__name__))
        # Keep the last logged result around so close() can attach it to the
        # hparams summary.
        self.last_result = valid_result
        self._file_writer.flush()

    def flush(self):
        if self._file_writer is not None:
            self._file_writer.flush()

    def close(self):
        if self._file_writer is not None:
            # Write hparams once, at trial termination, and only when there
            # are evaluated params and at least one logged result.
            if self.trial and self.trial.evaluated_params and self.last_result:
                self._try_log_hparams(self.last_result)
            self._file_writer.close()

    def _try_log_hparams(self, result):
        # TBX currently errors if the hparams value is None.
        scrubbed_params = {
            k: v
            for k, v in self.trial.evaluated_params.items() if v is not None
        }
        from tensorboardX.summary import hparams
        experiment_tag, session_start_tag, session_end_tag = hparams(
            hparam_dict=scrubbed_params, metric_dict=result)
        # Write the three hparams protobufs directly via the underlying
        # event writer (the public API has no step-free hparams call here).
        self._file_writer.file_writer.add_summary(experiment_tag)
        self._file_writer.file_writer.add_summary(session_start_tag)
        self._file_writer.file_writer.add_summary(session_end_tag)
class GAN(object): """GAN class.""" def __init__(self, opt, dataset_load=None, exp_dir=None): """Constructor.""" # Save variables self.opt = opt self.dataset_load = dataset_load self.opt.out_dir = exp_dir # Define other variables self.real_label = 1 self.fake_label = 0 # Losses file file_name = os.path.join(self.opt.out_dir, 'losses.txt') self.output_loss_file = open(file_name, "wt") # TODO: Add comment if self.opt.full_sphere_sampling: self.opt.phi = None self.opt.theta = None self.opt.cam_dist = self.opt.cam_dist + 0.2 else: self.opt.angle = None self.opt.axis = None # TensorboardX self.writer = SummaryWriter(self.opt.vis_monitoring) print(self.opt.vis_monitoring) print(self.opt.out_dir) # Create dataset loader self.create_dataset_loader() # Create the networks # Create create_tensors self.create_tensors() # Create criterion # Create create optimizers # Create splats rendering scene self.create_scene() def create_dataset_loader(self, ): """Create dataset leader.""" # Define camera positions if self.opt.same_view: # self.cam_pos = uniform_sample_sphere(radius=self.opt.cam_dist, # num_samples=1) arrays = [ np.asarray([3., 3., 3.]) for _ in range(self.opt.batchSize) ] # TODO: Magic numbers self.cam_pos = np.stack(arrays, axis=0) # Create dataset loader self.dataset_load.initialize_dataset() self.dataset = self.dataset_load.get_dataset() self.dataset_load.initialize_dataset_loader(1) # TODO: Hack self.dataset_loader = self.dataset_load.get_dataset_loader() def create_networks(self, ): """Create networks.""" self.netG, self.netG2, self.netD, self.netD2 = create_networks( self.opt, verbose=True, depth_only=True) # TODO: Remove D2 and G2 # Create the normal estimation network which takes pointclouds in the # camera space and outputs the normals assert self.netG2 is None self.sph_normals = True self.netG2 = NEstNetV1_2(sph=self.sph_normals) print(self.netG2) if not self.opt.no_cuda: self.netD = self.netD.cuda() self.netG = self.netG.cuda() self.netG2 = self.netG2.cuda() 
def create_scene(self, ): """Create a semi-empty scene with camera parameters.""" self.scene = create_scene(self.opt.splats_img_size, self.opt.splats_img_size, self.opt.fovy, self.opt.focal_length, self.opt.n_splats) def create_tensors(self, ): """Create the tensors.""" # Create tensors self.input = torch.FloatTensor(self.opt.batchSize, self.opt.render_img_nc, self.opt.render_img_size, self.opt.render_img_size) self.input_depth = torch.FloatTensor(self.opt.batchSize, 1, self.opt.render_img_size, self.opt.render_img_size) self.input_normal = torch.FloatTensor(self.opt.batchSize, 1, self.opt.render_img_size, self.opt.render_img_size) self.input_cond = torch.FloatTensor(self.opt.batchSize, 3) self.noise = torch.FloatTensor(self.opt.batchSize, int(self.opt.nz), 1, 1) self.fixed_noise = torch.FloatTensor(self.opt.batchSize, int(self.opt.nz), 1, 1).normal_(0, 1) self.label = torch.FloatTensor(2 * self.opt.batchSize) self.one = torch.FloatTensor([1]) self.mone = self.one * -1 # Move them to the GPU if not self.opt.no_cuda: self.input = self.input.cuda() self.input_depth = self.input_depth.cuda() self.input_normal = self.input_normal.cuda() self.input_cond = self.input_cond.cuda() self.label = self.label.cuda() self.noise = self.noise.cuda() self.fixed_noise = self.fixed_noise.cuda() self.one = self.one.cuda() self.mone = self.mone.cuda() self.fixed_noise = Variable(self.fixed_noise) # TODO: Why? 
    def create_criterion(self, ):
        """Create criterion."""
        self.criterion = nn.BCELoss()
        if not self.opt.no_cuda:
            self.criterion = self.criterion.cuda()

    def create_optimizers(self, ):
        """Create optimizers and LR schedulers for G, G2 and D."""
        if self.opt.optimizer == 'adam':
            self.optimizerD = optim.Adam(self.netD.parameters(),
                                         lr=self.opt.lr,
                                         betas=(self.opt.beta1, 0.999))
            self.optimizerG = optim.Adam(self.netG.parameters(),
                                         lr=self.opt.lr,
                                         betas=(self.opt.beta1, 0.999))
            self.optimizerG2 = optim.Adam(self.netG2.parameters(),
                                          lr=self.opt.lr,
                                          betas=(self.opt.beta1, 0.999))
        elif self.opt.optimizer == 'rmsprop':
            self.optimizerD = optim.RMSprop(self.netD.parameters(),
                                            lr=self.opt.lr)
            self.optimizerG = optim.RMSprop(self.netG.parameters(),
                                            lr=self.opt.lr)
            self.optimizerG2 = optim.RMSprop(self.netG2.parameters(),
                                             lr=self.opt.lr)
        else:
            raise ValueError('Unknown optimizer: ' + self.opt.optimizer)

        # Create the schedulers
        if self.opt.lr_sched_type == 'step':
            LR_fn = optim.lr_scheduler.StepLR
        elif self.opt.lr_sched_type == 'exp':
            LR_fn = optim.lr_scheduler.ExponentialLR
        elif self.opt.lr_sched_type is None:
            LR_fn = None
        else:
            raise ValueError('Unknown scheduler')
        # NOTE(review): if lr_sched_type is None, LR_fn is None and the two
        # calls below raise TypeError. Also, ExponentialLR does not accept
        # step_size. Confirm only 'step' is used in practice.
        self.optG_z_lr_scheduler = LR_fn(self.optimizerG,
                                         step_size=self.opt.z_lr_sched_step,
                                         gamma=self.opt.z_lr_sched_gamma)
        self.optG2_normal_lr_scheduler = LR_fn(
            self.optimizerG2,
            step_size=self.opt.normal_lr_sched_step,
            gamma=self.opt.normal_lr_sched_gamma)
        self.LR_SCHED_MAP = [
            self.optG_z_lr_scheduler, self.optG2_normal_lr_scheduler
        ]
        self.OPT_MAP = [self.optimizerG, self.optimizerG2]

    def get_samples(self):
        """Fetch the next batch, restarting the loader iterator when drained.

        NOTE: `.next()` is the Python-2 iterator protocol (or a very old
        DataLoader API) — under Python 3 this would be `next(self.data_iter)`.
        The AttributeError branch lazily creates the iterator on first call.
        """
        try:
            samples = self.data_iter.next()
        except StopIteration:
            del self.data_iter
            self.data_iter = iter(self.dataset_loader)
            samples = self.data_iter.next()
        except AttributeError:
            self.data_iter = iter(self.dataset_loader)
            samples = self.data_iter.next()
        return samples

    def get_real_samples(self):
        """Get a real sample."""
        # Define the camera poses
        if not self.opt.same_view:
            if self.opt.full_sphere_sampling:
                self.cam_pos = uniform_sample_sphere(
                    radius=self.opt.cam_dist,
                    num_samples=self.opt.batchSize,
                    axis=self.opt.axis,
                    angle=np.deg2rad(self.opt.angle),
                    theta_range=self.opt.theta,
                    phi_range=self.opt.phi)
            else:
                self.cam_pos = uniform_sample_sphere(
                    radius=self.opt.cam_dist,
                    num_samples=self.opt.batchSize,
                    axis=self.opt.axis,
                    angle=self.opt.angle,
                    theta_range=np.deg2rad(self.opt.theta),
                    phi_range=np.deg2rad(self.opt.phi))
        if self.opt.full_sphere_sampling_light:
            self.light_pos1 = uniform_sample_sphere(
                radius=self.opt.cam_dist,
                num_samples=self.opt.batchSize,
                axis=self.opt.axis,
                angle=np.deg2rad(44),
                theta_range=self.opt.theta,
                phi_range=self.opt.phi)
            # self.light_pos2 = uniform_sample_sphere(radius=self.opt.cam_dist, num_samples=self.opt.batchSize,
            #                                         axis=self.opt.axis, angle=np.deg2rad(40),
            #                                         theta_range=self.opt.theta, phi_range=self.opt.phi)
        else:
            print("inbox")
            light_eps = 0.15
            self.light_pos1 = np.random.rand(self.opt.batchSize,
                                             3) * self.opt.cam_dist + light_eps
            self.light_pos2 = np.random.rand(self.opt.batchSize,
                                             3) * self.opt.cam_dist + light_eps

        # TODO: deg2rad in all the angles????
        # (continuation of get_real_samples)
        # Create a splats rendering scene
        large_scene = create_scene(self.opt.width, self.opt.height,
                                   self.opt.fovy, self.opt.focal_length,
                                   self.opt.n_splats)
        lookat = self.opt.at if self.opt.at is not None else [
            0.0, 0.0, 0.0, 1.0
        ]
        large_scene['camera']['at'] = tch_var_f(lookat)

        # Render scenes
        data, data_depth, data_normal, data_cond = [], [], [], []
        inpath = self.opt.vis_images + '/'
        inpath2 = self.opt.vis_input + '/'
        for idx in range(self.opt.batchSize):
            # Save the splats into the rendering scene
            if self.opt.use_mesh:
                # Swap any sphere/disk geometry out for a triangle mesh.
                if 'sphere' in large_scene['objects']:
                    del large_scene['objects']['sphere']
                if 'disk' in large_scene['objects']:
                    del large_scene['objects']['disk']
                if 'triangle' not in large_scene['objects']:
                    large_scene['objects'] = {
                        'triangle': {
                            'face': None,
                            'normal': None,
                            'material_idx': None
                        }
                    }
                samples = self.get_samples()

                large_scene['objects']['triangle']['material_idx'] = tch_var_l(
                    np.zeros(samples['mesh']['face'][0].shape[0],
                             dtype=int).tolist())
                large_scene['objects']['triangle']['face'] = Variable(
                    samples['mesh']['face'][0].cuda(), requires_grad=False)
                large_scene['objects']['triangle']['normal'] = Variable(
                    samples['mesh']['normal'][0].cuda(), requires_grad=False)
            else:
                # NOTE(review): this branch reads `samples` but never calls
                # self.get_samples() — NameError on the first iteration when
                # use_mesh is False; confirm this path is exercised.
                if 'sphere' in large_scene['objects']:
                    del large_scene['objects']['sphere']
                if 'triangle' in large_scene['objects']:
                    del large_scene['objects']['triangle']
                if 'disk' not in large_scene['objects']:
                    large_scene['objects'] = {
                        'disk': {
                            'pos': None,
                            'normal': None,
                            'material_idx': None
                        }
                    }
                large_scene['objects']['disk']['radius'] = tch_var_f(
                    np.ones(self.opt.n_splats) * self.opt.splats_radius)
                large_scene['objects']['disk']['material_idx'] = tch_var_l(
                    np.zeros(self.opt.n_splats, dtype=int).tolist())
                large_scene['objects']['disk']['pos'] = Variable(
                    samples['splats']['pos'][idx].cuda(), requires_grad=False)
                large_scene['objects']['disk']['normal'] = Variable(
                    samples['splats']['normal'][idx].cuda(),
                    requires_grad=False)

            # Set camera position
            if not self.opt.same_view:
                large_scene['camera']['eye'] = tch_var_f(self.cam_pos[idx])
            else:
                large_scene['camera']['eye'] = tch_var_f(self.cam_pos[0])

            large_scene['lights']['pos'][0, :3] = tch_var_f(
                self.light_pos1[idx])
            #large_scene['lights']['pos'][1,:3]=tch_var_f(self.light_pos2[idx])

            # Render scene
            res = render(large_scene,
                         norm_depth_image_only=self.opt.norm_depth_image_only,
                         double_sided=True,
                         use_quartic=self.opt.use_quartic)

            # Get rendered output
            # NOTE(review): `im`, `im_n` and `target_normalmap_img_` are only
            # assigned in the else branch, but are used unconditionally below —
            # render_img_nc == 1 would raise; confirm it is never 1 here.
            if self.opt.render_img_nc == 1:
                depth = res['depth']
                im_d = depth.unsqueeze(0)
            else:
                depth = res['depth']
                im_d = depth.unsqueeze(0)
                im = res['image'].permute(2, 0, 1)
                im_ = get_data(res['image'])
                #im_img_ = get_normalmap_image(im_)
                target_normal_ = get_data(res['normal'])
                target_normalmap_img_ = get_normalmap_image(target_normal_)
                im_n = tch_var_f(target_normalmap_img_).view(
                    im.shape[1], im.shape[2], 3).permute(2, 0, 1)

            # Add depth image to the output structure
            file_name = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_{:05d}.txt'.format(idx)
            text_file = open(file_name, "w")
            text_file.write('%s\n' % (str(large_scene['camera']['eye'].data)))
            text_file.close()
            out_file_name = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_{:05d}.npy'.format(idx)
            np.save(out_file_name, self.cam_pos[idx])
            out_file_name2 = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_light{:05d}.npy'.format(idx)
            np.save(out_file_name2, self.light_pos1[idx])
            out_file_name3 = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_im{:05d}.npy'.format(idx)
            np.save(out_file_name3, get_data(res['image']))
            out_file_name4 = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_depth{:05d}.npy'.format(idx)
            np.save(out_file_name4, get_data(res['depth']))
            out_file_name5 = inpath2 + str(self.iterationa_no) + "_" + str(
                self.critic_iter) + 'input_normal{:05d}.npy'.format(idx)
            np.save(out_file_name5, get_data(res['normal']))

            # Dump PNG previews only every few save intervals.
            if self.iterationa_no % (self.opt.save_image_interval * 5) == 0:
                imsave((inpath + str(self.iterationa_no) +
                        'real_normalmap_{:05d}.png'.format(idx)),
                       target_normalmap_img_)
                imsave((inpath + str(self.iterationa_no) +
                        'real_depth_{:05d}.png'.format(idx)), get_data(depth))
                # imsave(inpath + str(self.iterationa_no) + 'real_depthmap_{:05d}.png'.format(idx), im_d)
                # imsave(inpath + str(self.iterationa_no) + 'world_normalmap_{:05d}.png'.format(idx), target_worldnormalmap_img_)

            data.append(im)
            data_depth.append(im_d)
            data_normal.append(im_n)
            data_cond.append(large_scene['camera']['eye'])

        # Stack real samples
        real_samples = torch.stack(data)
        real_samples_depth = torch.stack(data_depth)
        real_samples_normal = torch.stack(data_normal)
        real_samples_cond = torch.stack(data_cond)
        self.batch_size = real_samples.size(0)
        if not self.opt.no_cuda:
            real_samples = real_samples.cuda()
            real_samples_depth = real_samples_depth.cuda()
            real_samples_normal = real_samples_normal.cuda()
            real_samples_cond = real_samples_cond.cuda()

        # Set input/output variables
        self.input.resize_as_(real_samples.data).copy_(real_samples.data)
        self.input_depth.resize_as_(real_samples_depth.data).copy_(
            real_samples_depth.data)
        self.input_normal.resize_as_(real_samples_normal.data).copy_(
            real_samples_normal.data)
        self.input_cond.resize_as_(real_samples_cond.data).copy_(
            real_samples_cond.data)
        self.label.resize_(self.batch_size).fill_(self.real_label)

        # TODO: Remove Variables
        self.inputv = Variable(self.input)
        self.inputv_depth = Variable(self.input_depth)
        self.inputv_normal = Variable(self.input_normal)
        self.inputv_cond = Variable(self.input_cond)
        self.labelv = Variable(self.label)

    def generate_noise_vector(self, ):
        """Generate a noise vector."""
        self.noise.resize_(self.batch_size, int(self.opt.nz), 1,
                           1).normal_(0, 1)
        self.noisev = Variable(self.noise)  # TODO: Add volatile=True???
    def generate_normals(self, z_batch, cam_pos, camera):
        """Generate normals from depth.

        For each depth map / eye-position pair, back-projects the depth to a
        camera-space point cloud and runs the normal-estimation net (netG2).
        Returns the stacked per-sample normal tensors.
        """
        W, H = camera['viewport'][2:]
        normals = []
        for z, eye in zip(z_batch, cam_pos):
            camera['eye'] = eye
            pcl = z_to_pcl_CC(z.squeeze(), camera)
            n = self.netG2(pcl.view(H, W, 3).permute(2, 0, 1)[np.newaxis, ...])
            n = n.squeeze().permute(1, 2, 0).view(-1, 3).contiguous()
            normals.append(n)
        return torch.stack(normals)

    def tensorboard_pos_hook(self, grad):
        # Backward hook: log per-channel stats of the splat-position gradient.
        self.writer.add_image("position_gradient_im",
                              torch.sqrt(torch.sum(grad ** 2, dim=-1)),
                              self.iterationa_no)
        self.writer.add_scalar("position_mean_channel1",
                               get_data(torch.mean(torch.abs(grad[:, :, 0]))),
                               self.iterationa_no)
        self.writer.add_scalar("position_gradient_mean_channel2",
                               get_data(torch.mean(torch.abs(grad[:, :, 1]))),
                               self.iterationa_no)
        self.writer.add_scalar("position_gradient_mean_channel3",
                               get_data(torch.mean(torch.abs(grad[:, :, 2]))),
                               self.iterationa_no)
        self.writer.add_scalar("position_gradient_mean",
                               get_data(torch.mean(grad)),
                               self.iterationa_no)
        self.writer.add_histogram("position_gradient_hist_channel1",
                                  grad[:, :, 0].clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_histogram("position_gradient_hist_channel2",
                                  grad[:, :, 1].clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_histogram("position_gradient_hist_channel3",
                                  grad[:, :, 2].clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_histogram(
            "position_gradient_hist_norm",
            torch.sqrt(torch.sum(grad ** 2,
                                 dim=-1)).clone().cpu().data.numpy(),
            self.iterationa_no)
        #print('grad', grad)

    def tensorboard_normal_hook(self, grad):
        # Backward hook: same logging as above, for the normal gradient.
        self.writer.add_image("normal_gradient_im",
                              torch.sqrt(torch.sum(grad ** 2, dim=-1)),
                              self.iterationa_no)
        self.writer.add_scalar("normal_gradient_mean_channel1",
                               get_data(torch.mean(torch.abs(grad[:, :, 0]))),
                               self.iterationa_no)
        self.writer.add_scalar("normal_gradient_mean_channel2",
                               get_data(torch.mean(torch.abs(grad[:, :, 1]))),
                               self.iterationa_no)
        self.writer.add_scalar("normal_gradient_mean_channel3",
                               get_data(torch.mean(torch.abs(grad[:, :, 2]))),
                               self.iterationa_no)
        self.writer.add_scalar("normal_gradient_mean",
                               get_data(torch.mean(grad)),
                               self.iterationa_no)
        self.writer.add_histogram("normal_gradient_hist_channel1",
                                  grad[:, :, 0].clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_histogram("normal_gradient_hist_channel2",
                                  grad[:, :, 1].clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_histogram("normal_gradient_hist_channel3",
                                  grad[:, :, 2].clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_histogram(
            "normal_gradient_hist_norm",
            torch.sqrt(torch.sum(grad ** 2,
                                 dim=-1)).clone().cpu().data.numpy(),
            self.iterationa_no)
        #print('grad', grad)

    def tensorboard_z_hook(self, grad):
        # Backward hook: log stats of the depth (z) gradient.
        self.writer.add_scalar("z_gradient_mean",
                               get_data(torch.mean(torch.abs(grad))),
                               self.iterationa_no)
        self.writer.add_histogram("z_gradient_hist_channel",
                                  grad.clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_image("z_gradient_im", grad, self.iterationa_no)

    def tensorboard_hook(self, grad):
        # Backward hook: log stats of the first sample's gradient, reshaped
        # into the splat image layout for visualization.
        self.writer.add_scalar("z_gradient_mean",
                               get_data(torch.mean(grad[0])),
                               self.iterationa_no)
        self.writer.add_histogram("z_gradient_hist_channel",
                                  grad[0].clone().cpu().data.numpy(),
                                  self.iterationa_no)
        self.writer.add_image(
            "z_gradient_im",
            grad[0].view(self.opt.splats_img_size, self.opt.splats_img_size),
            self.iterationa_no)

    def train(self, ):
        """Train network."""
        # Load pretrained model if required
        # NOTE(review): torch.load(open(path, 'rb')) never closes the file
        # handles — `with open(...)` (or passing the path) would be safer.
        if self.opt.gen_model_path is not None:
            print("Reloading networks from")
            print(' > Generator', self.opt.gen_model_path)
            self.netG.load_state_dict(
                torch.load(open(self.opt.gen_model_path, 'rb')))
            print(' > Generator2', self.opt.gen_model_path2)
            self.netG2.load_state_dict(
                torch.load(open(self.opt.gen_model_path2, 'rb')))
            print(' > Discriminator', self.opt.dis_model_path)
            self.netD.load_state_dict(
                torch.load(open(self.opt.dis_model_path, 'rb')))
            print(' > Discriminator2', self.opt.dis_model_path2)
            self.netD2.load_state_dict(
                torch.load(open(self.opt.dis_model_path2, 'rb')))

        # Start training
        file_name = os.path.join(self.opt.out_dir, 'L2.txt')
        with open(file_name, 'wt') as l2_file:
            curr_generator_idx = 0
            for iteration in range(self.opt.n_iter):
                self.iterationa_no = iteration
                self.critic_iter = 0

                # Train Discriminator critic_iters times
                for j in range(self.opt.critic_iters):
                    # Train with real
                    #################
                    self.in_critic = 1
                    self.get_real_samples()
                    self.critic_iter += 1

    def save_networks(self, epoch):
        """Save networks to hard disk."""
        torch.save(self.netG.state_dict(),
                   '%s/netG_epoch_%d.pth' % (self.opt.out_dir, epoch))
        torch.save(self.netG2.state_dict(),
                   '%s/netG2_epoch_%d.pth' % (self.opt.out_dir, epoch))
        torch.save(self.netD.state_dict(),
                   '%s/netD_epoch_%d.pth' % (self.opt.out_dir, epoch))
        torch.save(self.netD2.state_dict(),
                   '%s/netD2_epoch_%d.pth' % (self.opt.out_dir, epoch))

    def save_images(self, epoch, input, output):
        """Save input/output image pairs as 8-bit PNGs.

        Single-channel renders are squeezed; multi-channel renders are
        transposed from CHW to HWC before saving.
        """
        if self.opt.render_img_nc == 1:
            imsave(self.opt.out_dir + '/input2' + str(epoch) + '.png',
                   np.uint8(255. * input.cpu().data.numpy().squeeze()))
            imsave(self.opt.out_dir + '/fz' + str(epoch) + '.png',
                   np.uint8(255. * output.cpu().data.numpy().squeeze()))
        else:
            imsave(
                self.opt.out_dir + '/input2' + str(epoch) + '.png',
                np.uint8(255. * input.cpu().data.numpy().transpose((1, 2, 0))))
            imsave(
                self.opt.out_dir + '/output2' + str(epoch) + '.png',
                np.uint8(255. * output.cpu().data.numpy().transpose(
                    (1, 2, 0))))
writer.add_image('Image', x, n_iter) dummy_audio = torch.zeros(sample_rate * 2) for i in range(x.size(0)): # amplitude of sound should in [-1, 1] dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate)) writer.add_audio('myAudio', dummy_audio, n_iter, sample_rate=sample_rate) writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter) for name, param in resnet18.named_parameters(): writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter) # needs tensorboard 0.4RC or later writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter) dataset = datasets.MNIST('mnist', train=False, download=True) images = dataset.test_data[:100].float() label = dataset.test_labels[:100] features = images.view(100, 784) writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1)) # export scalar data to JSON for external processing writer.export_scalars_to_json("./all_scalars.json") writer.close()
class UNet3DTrainer:
    """3D UNet trainer.

    Args:
        model (Unet3D): UNet 3D model to be trained
        optimizer (nn.optim.Optimizer): optimizer used for training
        lr_scheduler (torch.optim.lr_scheduler._LRScheduler): learning rate scheduler
            WARN: bear in mind that lr_scheduler.step() is invoked after every
            validation step (i.e. validate_after_iters) not after every epoch.
            So e.g. if one uses StepLR with step_size=30 the learning rate will
            be adjusted after every 30 * validate_after_iters iterations.
        loss_criterion (callable): loss function
        eval_criterion (callable): used to compute training/validation metric
            (such as Dice, IoU, AP or Rand score) saving the best checkpoint is
            based on the result of this function on the validation set
        device (torch.device): device to train on
        loaders (dict): 'train' and 'val' loaders
        checkpoint_dir (string): dir for saving checkpoints and tensorboard logs
        max_num_epochs (int): maximum number of epochs
        max_num_iterations (int): maximum number of iterations
        validate_after_iters (int): validate after that many iterations
        log_after_iters (int): number of iterations before logging to tensorboard
        validate_iters (int): number of validation iterations, if None validate
            on the whole validation set
        eval_score_higher_is_better (bool): if True higher eval scores are considered better
        best_eval_score (float): best validation score so far (higher better)
        num_iterations (int): useful when loading the model from the checkpoint
        num_epoch (int): useful when loading the model from the checkpoint
    """

    def __init__(self, model, optimizer, lr_scheduler, loss_criterion,
                 eval_criterion, device, loaders, checkpoint_dir,
                 max_num_epochs=100, max_num_iterations=1e5,
                 validate_after_iters=100, log_after_iters=100,
                 validate_iters=None, num_iterations=1, num_epoch=0,
                 eval_score_higher_is_better=True, best_eval_score=None,
                 logger=None):
        if logger is None:
            self.logger = utils.get_logger('UNet3DTrainer', level=logging.DEBUG)
        else:
            self.logger = logger

        self.logger.info(model)
        self.model = model
        self.optimizer = optimizer
        self.scheduler = lr_scheduler
        self.loss_criterion = loss_criterion
        self.eval_criterion = eval_criterion
        self.device = device
        self.loaders = loaders
        self.checkpoint_dir = checkpoint_dir
        self.max_num_epochs = max_num_epochs
        self.max_num_iterations = max_num_iterations
        self.validate_after_iters = validate_after_iters
        self.log_after_iters = log_after_iters
        self.validate_iters = validate_iters
        self.eval_score_higher_is_better = eval_score_higher_is_better
        # BUG FIX: the original called `logger.info(...)` on the raw argument,
        # which is None by default (self.logger is the safe handle built above).
        self.logger.info(
            f'eval_score_higher_is_better: {eval_score_higher_is_better}')

        if best_eval_score is not None:
            self.best_eval_score = best_eval_score
        else:
            # initialize best_eval_score so that any real score improves on it
            if eval_score_higher_is_better:
                self.best_eval_score = float('-inf')
            else:
                self.best_eval_score = float('+inf')

        self.writer = SummaryWriter(
            log_dir=os.path.join(checkpoint_dir, 'logs'))

        self.num_iterations = num_iterations
        self.num_epoch = num_epoch

    @classmethod
    def from_checkpoint(cls, checkpoint_path, model, optimizer, lr_scheduler,
                        loss_criterion, eval_criterion, loaders, logger=None):
        """Alternate constructor: resume training state from a checkpoint file."""
        # BUG FIX: guard against the default logger=None before logging
        if logger is not None:
            logger.info(f"Loading checkpoint '{checkpoint_path}'...")
        state = utils.load_checkpoint(checkpoint_path, model, optimizer)
        if logger is not None:
            logger.info(
                f"Checkpoint loaded. Epoch: {state['epoch']}. "
                f"Best val score: {state['best_eval_score']}. "
                f"Num_iterations: {state['num_iterations']}")
        checkpoint_dir = os.path.split(checkpoint_path)[0]
        return cls(
            model, optimizer, lr_scheduler, loss_criterion, eval_criterion,
            torch.device(state['device']), loaders, checkpoint_dir,
            eval_score_higher_is_better=state['eval_score_higher_is_better'],
            best_eval_score=state['best_eval_score'],
            num_iterations=state['num_iterations'],
            num_epoch=state['epoch'],
            max_num_epochs=state['max_num_epochs'],
            max_num_iterations=state['max_num_iterations'],
            validate_after_iters=state['validate_after_iters'],
            log_after_iters=state['log_after_iters'],
            validate_iters=state['validate_iters'],
            logger=logger)

    @classmethod
    def from_pretrained(cls, pre_trained, model, optimizer, lr_scheduler,
                        loss_criterion, eval_criterion, device, loaders,
                        max_num_epochs=100, max_num_iterations=1e5,
                        validate_after_iters=100, log_after_iters=100,
                        validate_iters=None, num_iterations=1, num_epoch=0,
                        eval_score_higher_is_better=True, best_eval_score=None,
                        logger=None):
        """Alternate constructor: load pre-trained weights, then train from scratch state."""
        # BUG FIX: guard against the default logger=None before logging
        if logger is not None:
            logger.info(f"Logging pre-trained model from '{pre_trained}'...")
        utils.load_checkpoint(pre_trained, model, None)
        checkpoint_dir = os.path.split(pre_trained)[0]
        return cls(model, optimizer, lr_scheduler, loss_criterion,
                   eval_criterion, device, loaders, checkpoint_dir,
                   eval_score_higher_is_better=eval_score_higher_is_better,
                   best_eval_score=best_eval_score,
                   num_iterations=num_iterations,
                   num_epoch=num_epoch,
                   max_num_epochs=max_num_epochs,
                   max_num_iterations=max_num_iterations,
                   validate_after_iters=validate_after_iters,
                   log_after_iters=log_after_iters,
                   validate_iters=validate_iters,
                   logger=logger)

    def fit(self):
        """Run epochs until max_num_epochs is reached or train() requests termination."""
        for _ in range(self.num_epoch, self.max_num_epochs):
            # train for one epoch
            should_terminate = self.train(self.loaders['train'])

            if should_terminate:
                break

            self.num_epoch += 1

    def train(self, train_loader):
        """Trains the model for 1 epoch.

        Args:
            train_loader (torch.utils.data.DataLoader): training data loader

        Returns:
            True if the training should be terminated immediately, False otherwise
        """
        train_losses = utils.RunningAverage()
        train_eval_scores = utils.RunningAverage()

        # sets the model in training mode
        self.model.train()

        for i, t in enumerate(train_loader):
            self.logger.info(
                f'Training iteration {self.num_iterations}. Batch {i}. '
                f'Epoch [{self.num_epoch}/{self.max_num_epochs - 1}]')

            input, target, weight = self._split_training_batch(t)

            output, loss = self._forward_pass(input, target, weight)

            train_losses.update(loss.item(), self._batch_size(input))

            # compute gradients and update parameters
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.num_iterations % self.validate_after_iters == 0:
                # evaluate on validation set
                eval_score = self.validate(self.loaders['val'])
                # adjust learning rate if necessary
                if isinstance(self.scheduler, ReduceLROnPlateau):
                    self.scheduler.step(eval_score)
                else:
                    self.scheduler.step()
                # log current learning rate in tensorboard
                self._log_lr()
                # remember best validation metric
                is_best = self._is_best_eval_score(eval_score)

                # save checkpoint
                self._save_checkpoint(is_best)

            if self.num_iterations % self.log_after_iters == 0:
                # if model contains final_activation layer for normalizing logits apply it,
                # otherwise both the evaluation metric as well as images in tensorboard
                # will be incorrectly computed
                if hasattr(self.model, 'final_activation'):
                    output = self.model.final_activation(output)

                # compute eval criterion
                eval_score = self.eval_criterion(output, target)
                train_eval_scores.update(eval_score.item(),
                                         self._batch_size(input))

                # log stats, params and images
                self.logger.info(
                    f'Training stats. Loss: {train_losses.avg}. '
                    f'Evaluation score: {train_eval_scores.avg}')
                self._log_stats('train', train_losses.avg,
                                train_eval_scores.avg)
                self._log_params()
                self._log_images(input, target, output)

            if self.max_num_iterations < self.num_iterations:
                self.logger.info(
                    f'Maximum number of iterations {self.max_num_iterations} exceeded. '
                    f'Finishing training...')
                return True

            self.num_iterations += 1

        return False

    def validate(self, val_loader):
        """Run one pass over the validation loader; returns the average eval score."""
        self.logger.info('Validating...')

        val_losses = utils.RunningAverage()
        val_scores = utils.RunningAverage()

        try:
            # set the model in evaluation mode; final_activation doesn't need to
            # be called explicitly
            self.model.eval()
            with torch.no_grad():
                for i, t in enumerate(val_loader):
                    self.logger.info(f'Validation iteration {i}')

                    input, target, weight = self._split_training_batch(t)

                    output, loss = self._forward_pass(input, target, weight)
                    val_losses.update(loss.item(), self._batch_size(input))

                    eval_score = self.eval_criterion(output, target)
                    val_scores.update(eval_score.item(),
                                      self._batch_size(input))

                    if self.validate_iters is not None and self.validate_iters <= i:
                        # stop validation
                        break

                self._log_stats('val', val_losses.avg, val_scores.avg)
                self.logger.info(
                    f'Validation finished. Loss: {val_losses.avg}. '
                    f'Evaluation score: {val_scores.avg}')
                return val_scores.avg
        finally:
            # set back in training mode
            self.model.train()

    def _split_training_batch(self, t):
        """Split a loader batch into (input, target, weight); weight is None for 2-tuples."""
        def _move_to_device(input):
            if isinstance(input, tuple) or isinstance(input, list):
                return tuple([_move_to_device(x) for x in input])
            else:
                return input.to(self.device)

        t = _move_to_device(t)
        weight = None
        if len(t) == 2:
            input, target = t
        else:
            input, target, weight = t
        return input, target, weight

    def _forward_pass(self, input, target, weight=None):
        """Forward the batch and compute the (optionally weighted) loss."""
        # forward pass
        output = self.model(input)

        # compute the loss
        if weight is None:
            loss = self.loss_criterion(output, target)
        else:
            loss = self.loss_criterion(output, target, weight)

        return output, loss

    def _is_best_eval_score(self, eval_score):
        """Update best_eval_score if eval_score improves on it; return True when it does."""
        if self.eval_score_higher_is_better:
            is_best = eval_score > self.best_eval_score
        else:
            is_best = eval_score < self.best_eval_score

        if is_best:
            self.logger.info(
                f'Saving new best evaluation metric: {eval_score}')
            self.best_eval_score = eval_score

        return is_best

    def _save_checkpoint(self, is_best):
        """Persist the full training state via utils.save_checkpoint."""
        utils.save_checkpoint(
            {
                'epoch': self.num_epoch + 1,
                'num_iterations': self.num_iterations,
                'model_state_dict': self.model.state_dict(),
                'best_eval_score': self.best_eval_score,
                'eval_score_higher_is_better': self.eval_score_higher_is_better,
                'optimizer_state_dict': self.optimizer.state_dict(),
                'device': str(self.device),
                'max_num_epochs': self.max_num_epochs,
                'max_num_iterations': self.max_num_iterations,
                'validate_after_iters': self.validate_after_iters,
                'log_after_iters': self.log_after_iters,
                'validate_iters': self.validate_iters
            },
            is_best,
            checkpoint_dir=self.checkpoint_dir,
            logger=self.logger)

    def _log_lr(self):
        """Log the learning rate of the first param group to tensorboard."""
        lr = self.optimizer.param_groups[0]['lr']
        self.writer.add_scalar('learning_rate', lr, self.num_iterations)

    def _log_stats(self, phase, loss_avg, eval_score_avg):
        """Log average loss and eval score for the given phase ('train' or 'val')."""
        tag_value = {
            f'{phase}_loss_avg': loss_avg,
            f'{phase}_eval_score_avg': eval_score_avg
        }
        for tag, value in tag_value.items():
            self.writer.add_scalar(tag, value, self.num_iterations)

    def _log_params(self):
        """Log histograms of every model parameter and its gradient."""
        self.logger.info('Logging model parameters and gradients')
        for name, value in self.model.named_parameters():
            self.writer.add_histogram(name, value.data.cpu().numpy(),
                                      self.num_iterations)
            self.writer.add_histogram(name + '/grad',
                                      value.grad.data.cpu().numpy(),
                                      self.num_iterations)

    def _log_images(self, input, target, prediction):
        """Log middle slices of inputs/targets/predictions as grayscale images."""
        inputs_map = {
            'inputs': input,
            'targets': target,
            'predictions': prediction
        }
        img_sources = {}
        for name, batch in inputs_map.items():
            if isinstance(batch, list) or isinstance(batch, tuple):
                for i, b in enumerate(batch):
                    img_sources[f'{name}{i}'] = b.data.cpu().numpy()
            else:
                img_sources[name] = batch.data.cpu().numpy()

        for name, batch in img_sources.items():
            for tag, image in self._images_from_batch(name, batch):
                self.writer.add_image(tag, image, self.num_iterations,
                                      dataformats='HW')

    def _images_from_batch(self, name, batch):
        """Extract the middle depth-slice of each (batch, channel) as (tag, image) pairs."""
        tag_template = '{}/batch_{}/channel_{}/slice_{}'

        tagged_images = []

        if batch.ndim == 5:
            # NCDHW
            slice_idx = batch.shape[2] // 2  # get the middle slice
            for batch_idx in range(batch.shape[0]):
                for channel_idx in range(batch.shape[1]):
                    tag = tag_template.format(name, batch_idx, channel_idx,
                                              slice_idx)
                    img = batch[batch_idx, channel_idx, slice_idx, ...]
                    tagged_images.append((tag, self._normalize_img(img)))
        else:
            # batch has no channel dim: NDHW
            slice_idx = batch.shape[1] // 2  # get the middle slice
            for batch_idx in range(batch.shape[0]):
                tag = tag_template.format(name, batch_idx, 0, slice_idx)
                img = batch[batch_idx, slice_idx, ...]
                tagged_images.append((tag, self._normalize_img(img)))

        return tagged_images

    @staticmethod
    def _normalize_img(img):
        """Min-max normalize an image into the [0, 1] range."""
        return (img - np.min(img)) / np.ptp(img)

    @staticmethod
    def _batch_size(input):
        """Return the batch size; input may be a tensor or a list/tuple of tensors."""
        if isinstance(input, list) or isinstance(input, tuple):
            return input[0].size(0)
        else:
            return input.size(0)
class TensorboardWriter(FromParams):
    """
    Class that handles Tensorboard (and other) logging.

    Parameters
    ----------
    get_batch_num_total : Callable[[], int]
        A thunk that returns the number of batches so far. Most likely this will
        be a closure around an instance variable in your ``Trainer`` class.
    serialization_dir : str, optional (default = None)
        If provided, this is where the Tensorboard logs will be written.
    summary_interval : int, optional (default = 100)
        Most statistics will be written out only every this many batches.
    histogram_interval : int, optional (default = None)
        If provided, activation histograms will be written out every this many batches.
        If None, activation histograms will not be written out.
    should_log_parameter_statistics : bool, optional (default = True)
        Whether to log parameter statistics.
    should_log_learning_rate : bool, optional (default = False)
        Whether to log learning rate.
    """
    def __init__(self,
                 get_batch_num_total: Callable[[], int],
                 serialization_dir: Optional[str] = None,
                 summary_interval: int = 100,
                 histogram_interval: int = None,
                 should_log_parameter_statistics: bool = True,
                 should_log_learning_rate: bool = False) -> None:
        if serialization_dir is not None:
            # train and validation events go to separate subdirectories so they
            # appear as separate runs in the Tensorboard UI
            self._train_log = SummaryWriter(os.path.join(serialization_dir, "log", "train"))
            self._validation_log = SummaryWriter(os.path.join(serialization_dir, "log", "validation"))
        else:
            # no serialization dir: all logging methods become no-ops
            self._train_log = self._validation_log = None

        self._summary_interval = summary_interval
        self._histogram_interval = histogram_interval
        self._should_log_parameter_statistics = should_log_parameter_statistics
        self._should_log_learning_rate = should_log_learning_rate
        self._get_batch_num_total = get_batch_num_total

    @staticmethod
    def _item(value: Any):
        """Unwrap anything with an ``.item()`` method (e.g. a 0-dim tensor) to a plain Python scalar."""
        if hasattr(value, 'item'):
            val = value.item()
        else:
            val = value
        return val

    def should_log_this_batch(self) -> bool:
        """Return True every ``summary_interval`` batches."""
        return self._get_batch_num_total() % self._summary_interval == 0

    def should_log_histograms_this_batch(self) -> bool:
        """Return True every ``histogram_interval`` batches; always False when the interval is None."""
        return self._histogram_interval is not None and self._get_batch_num_total() % self._histogram_interval == 0

    def add_train_scalar(self, name: str, value: float, timestep: int = None) -> None:
        """Write a scalar to the train log; timestep defaults to the current batch number."""
        timestep = timestep or self._get_batch_num_total()
        # get the scalar
        if self._train_log is not None:
            self._train_log.add_scalar(name, self._item(value), timestep)

    def add_train_histogram(self, name: str, values: torch.Tensor) -> None:
        """Write a histogram of ``values`` to the train log; silently skips non-tensors."""
        if self._train_log is not None:
            if isinstance(values, torch.Tensor):
                values_to_write = values.cpu().data.numpy().flatten()
                self._train_log.add_histogram(name, values_to_write, self._get_batch_num_total())

    def add_graph(self, model, inputs) -> None:
        """Write the model graph to the train log."""
        if self._train_log is not None:
            self._train_log.add_graph(model, inputs)

    def add_validation_scalar(self, name: str, value: float, timestep: int = None) -> None:
        """Write a scalar to the validation log; timestep defaults to the current batch number."""
        timestep = timestep or self._get_batch_num_total()

        if self._validation_log is not None:
            self._validation_log.add_scalar(name, self._item(value), timestep)

    def log_parameter_and_gradient_statistics(self,  # pylint: disable=invalid-name
                                              model: Model,
                                              batch_grad_norm: float) -> None:
        """
        Send the mean and std of all parameters and gradients to tensorboard, as well
        as logging the average gradient norm.
        """
        if self._should_log_parameter_statistics:
            # Log parameter values to Tensorboard
            for name, param in model.named_parameters():
                self.add_train_scalar("parameter_mean/" + name, param.data.mean())
                # std() is undefined for a single element, so guard on numel
                if param.data.numel() > 1:
                    self.add_train_scalar("parameter_std/" + name, param.data.std())
                if param.grad is not None:
                    if param.grad.is_sparse:
                        # pylint: disable=protected-access
                        grad_data = param.grad.data._values()
                    else:
                        grad_data = param.grad.data

                    # skip empty gradients
                    if torch.prod(torch.tensor(grad_data.shape)).item() > 0:  # pylint: disable=not-callable
                        self.add_train_scalar("gradient_mean/" + name, grad_data.mean())
                        if grad_data.numel() > 1:
                            self.add_train_scalar("gradient_std/" + name, grad_data.std())
                    else:
                        # no gradient for a parameter with sparse gradients
                        logger.info("No gradient for %s, skipping tensorboard logging.", name)
            # norm of gradients
            if batch_grad_norm is not None:
                self.add_train_scalar("gradient_norm", batch_grad_norm)

    def log_learning_rates(self,
                           model: Model,
                           optimizer: torch.optim.Optimizer):
        """
        Send current parameter specific learning rates to tensorboard
        """
        if self._should_log_learning_rate:
            # optimizer stores lr info keyed by parameter tensor
            # we want to log with parameter name
            names = {param: name for name, param in model.named_parameters()}
            for group in optimizer.param_groups:
                if 'lr' not in group:
                    continue
                rate = group['lr']
                for param in group['params']:
                    # check whether params has requires grad or not
                    effective_rate = rate * float(param.requires_grad)
                    self.add_train_scalar("learning_rate/" + names[param], effective_rate)

    def log_histograms(self, model: Model, histogram_parameters: Set[str]) -> None:
        """
        Send histograms of parameters to tensorboard.
        """
        for name, param in model.named_parameters():
            if name in histogram_parameters:
                self.add_train_histogram("parameter_histogram/" + name, param)

    def log_metrics(self,
                    train_metrics: dict,
                    val_metrics: dict = None,
                    epoch: int = None,
                    log_to_console: bool = False) -> None:
        """
        Sends all of the train metrics (and validation metrics, if provided) to tensorboard.
        """
        # union of metric names across both dicts, so metrics present in only
        # one of them still get logged
        metric_names = set(train_metrics.keys())
        if val_metrics is not None:
            metric_names.update(val_metrics.keys())
        val_metrics = val_metrics or {}

        # For logging to the console
        if log_to_console:
            dual_message_template = "%s | %8.3f | %8.3f"
            no_val_message_template = "%s | %8.3f | %8s"
            no_train_message_template = "%s | %8s | %8.3f"
            header_template = "%s | %-10s"
            # NOTE(review): max() raises on an empty metric set — presumably
            # callers always pass at least one metric; confirm if this can be empty.
            name_length = max([len(x) for x in metric_names])
            logger.info(header_template, "Training".rjust(name_length + 13), "Validation")

        for name in metric_names:
            # Log to tensorboard
            train_metric = train_metrics.get(name)
            if train_metric is not None:
                self.add_train_scalar(name, train_metric, timestep=epoch)
            val_metric = val_metrics.get(name)
            if val_metric is not None:
                self.add_validation_scalar(name, val_metric, timestep=epoch)

            # And maybe log to console
            if log_to_console and val_metric is not None and train_metric is not None:
                logger.info(dual_message_template, name.ljust(name_length), train_metric, val_metric)
            elif log_to_console and val_metric is not None:
                logger.info(no_train_message_template, name.ljust(name_length), "N/A", val_metric)
            elif log_to_console and train_metric is not None:
                logger.info(no_val_message_template, name.ljust(name_length), train_metric, "N/A")

    def enable_activation_logging(self, model: Model) -> None:
        """Register forward hooks that histogram the activations of modules opting in
        via a truthy ``should_log_activations`` attribute."""
        if self._histogram_interval is not None:
            # To log activation histograms to the forward pass, we register
            # a hook on forward to capture the output tensors.
            # This uses a closure to determine whether to log the activations,
            # since we don't want them on every call.
            for _, module in model.named_modules():
                if not getattr(module, 'should_log_activations', False):
                    # skip it
                    continue

                def hook(module_, inputs, outputs):
                    # pylint: disable=unused-argument,cell-var-from-loop
                    log_prefix = 'activation_histogram/{0}'.format(module_.__class__)
                    if self.should_log_histograms_this_batch():
                        self.log_activation_histogram(outputs, log_prefix)
                module.register_forward_hook(hook)

    def log_activation_histogram(self, outputs, log_prefix: str) -> None:
        """Histogram a module's forward outputs; handles tensor, list/tuple and dict outputs."""
        if isinstance(outputs, torch.Tensor):
            log_name = log_prefix
            self.add_train_histogram(log_name, outputs)
        elif isinstance(outputs, (list, tuple)):
            for i, output in enumerate(outputs):
                log_name = "{0}_{1}".format(log_prefix, i)
                self.add_train_histogram(log_name, output)
        elif isinstance(outputs, dict):
            for k, tensor in outputs.items():
                log_name = "{0}_{1}".format(log_prefix, k)
                self.add_train_histogram(log_name, tensor)
        else:
            # skip it
            pass

    def close(self) -> None:
        """
        Calls the ``close`` method of the ``SummaryWriter`` s which makes sure that pending
        scalars are flushed to disk and the tensorboard event files are closed properly.
        """
        if self._train_log is not None:
            self._train_log.close()
        if self._validation_log is not None:
            self._validation_log.close()
def train(opts):
    """Train an LPSRNN model on log-power-spectrum (LPS) pairs.

    Trains with Adam on an L1 or L2 reconstruction loss, logs losses and
    tensor histograms to tensorboard, validates once per epoch via
    ``eval_epoch``, checkpoints on validation improvement, and stops early
    when validation loss fails to improve for ``opts.patience`` epochs.

    Args:
        opts: parsed options namespace; fields read here include no_cuda, seed,
            dropout, save_path, lr, loss ('l1'|'l2'), dataset, maxlen,
            batch_size, num_workers, epoch, save_freq, patience.

    Raises:
        TypeError: if ``opts.loss`` is neither 'l1' nor 'l2'.
    """
    CUDA = torch.cuda.is_available() and not opts.no_cuda
    device = 'cuda' if CUDA else 'cpu'
    # seed initialization
    random.seed(opts.seed)
    np.random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if CUDA:
        torch.cuda.manual_seed_all(opts.seed)
    # model build
    model = LPSRNN(dropout=opts.dropout)
    model.to(device)
    print(model)
    writer = SummaryWriter(os.path.join(opts.save_path, 'train'))
    opt = optim.Adam(model.parameters(), lr=opts.lr)
    if opts.loss == 'l2':
        criterion = nn.MSELoss()
    elif opts.loss == 'l1':
        criterion = nn.L1Loss()
    else:
        raise TypeError('Loss function {} not understood'.format(opts.loss))
    dset = WavPairDataset(opts.dataset,
                          transform=wav2stft(logpower=True))
    va_dset = WavPairDataset(opts.dataset, split='valid',
                             transform=wav2stft(logpower=True))
    collater = SeqLPSCollater(maxlen=opts.maxlen)
    dloader = DataLoader(dset, batch_size=opts.batch_size,
                         shuffle=True, num_workers=opts.num_workers,
                         collate_fn=collater)
    va_dloader = DataLoader(va_dset, batch_size=opts.batch_size,
                            shuffle=False, num_workers=opts.num_workers,
                            collate_fn=collater)
    timings = []
    global_step = 0
    patience = opts.patience
    min_va_loss = np.inf
    for epoch in range(opts.epoch):
        model.train()
        beg_t = timeit.default_timer()
        for bidx, batch in enumerate(dloader, start=1):
            # split into (X, Y) pairs
            lps_x, lps_y = batch
            # last dim holds [magnitude, phase]; keep only the magnitude half
            # (the *_pha phase halves are intentionally discarded)
            lps_x, lps_x_pha = torch.chunk(lps_x, 2, dim=3)
            lps_x = lps_x.squeeze(3)
            lps_y, lps_y_pha = torch.chunk(lps_y, 2, dim=3)
            lps_y = lps_y.squeeze(3)
            lps_x = lps_x.to(device)
            lps_y = lps_y.to(device)
            opt.zero_grad()
            # model returns (prediction, recurrent state); state is unused here
            y_, state = model(lps_x)
            loss = criterion(y_, lps_y)
            loss.backward()
            opt.step()
            # per-batch wall-clock timing for the progress printout
            end_t = timeit.default_timer()
            timings.append(end_t - beg_t)
            beg_t = timeit.default_timer()
            if bidx % opts.save_freq == 0 or bidx >= len(dloader):
                print('Batch {}/{} (epoch {}) loss: {:.3f} '
                      'btime: {:.3f} s, mbtime: {:.3f}'
                      ''.format(bidx, len(dloader), epoch, loss.item(),
                                timings[-1], np.mean(timings)))
                writer.add_scalar('training/loss', loss.item(),
                                  global_step)
                writer.add_histogram('training/lps_x', lps_x.cpu().data,
                                     global_step, bins='sturges')
                writer.add_histogram('training/lps_y', lps_y.cpu().data,
                                     global_step, bins='sturges')
                writer.add_histogram('training/pred_y', y_.cpu().data,
                                     global_step, bins='sturges')
            global_step += 1
        # full validation pass for this epoch
        va_losses = eval_epoch(va_dloader, model, criterion, epoch, writer,
                               opts.save_freq, device)
        mva_loss = np.mean(va_losses)
        if min_va_loss > mva_loss:
            # validation improved: checkpoint and reset patience
            print('Val loss improved {:.3f} --> {:.3f}'.format(min_va_loss,
                                                               mva_loss))
            min_va_loss = mva_loss
            torch.save(model.state_dict(),
                       os.path.join(opts.save_path,
                                    'model-e{}.ckpt'.format(epoch)))
            patience = opts.patience
        else:
            # no improvement: burn one unit of patience, maybe stop early
            patience -= 1
            print('Val loss did not improve. Curr patience'
                  '{}/{}'.format(patience, opts.patience))
            if patience <= 0:
                print('Finishing training, out of patience')
                break
class MSGAN: ''' Class that include all the parameters of the optimisation and save the model after each epoch ''' def __init__(self, data_folder, Nepochs=1000, SlopLRelu=0.2, use_cuda=True): ''' SamplesFile: Location of the hdf5 file with all the samples Nepochs: Number of epochs balance: Balance in the final loss for the generator WParam: Parameter that attirbute more weigth to the diagonal because of the increase in difficulty startcounter: Start the counter of the number of iteration at startcounter. Allow to resume the learning of a model ''' self.use_cuda = use_cuda self.latent_dim = 10 self.Nepochs = Nepochs self.StartEpochs = 0 #Different from zeros if the optimisation is resuming self.SlopLRelu = SlopLRelu self.NF = 32 self.Ndepth = 5 self.Ndepth_max = 5 self.scales = [4, 8, 16, 32] self.Nfeatures = [16, 32, 64, 128] # self.len_ohe = 10 self.depths = [i for i in range(self.Ndepth)] self.Nscales = len(self.scales) self.G = MSGenerator(NF=self.NF, scales=self.scales, depths=self.depths, Ndepth_max=self.Ndepth_max, SlopLRelu=self.SlopLRelu, latent_dim=self.latent_dim) # len_ohe=self.len_ohe) self.D = MSDiscriminator(NF=self.NF, scales=self.scales, depths=self.depths, Ndepth_max=self.Ndepth_max, SlopLRelu=self.SlopLRelu, Nfeatures=self.Nfeatures) # len_ohe=self.len_ohe) self.optim_G = optim.Adam(self.G.parameters(), lr=0.0002, betas=(0.5, 0.999)) self.optim_D = optim.Adam(self.D.parameters(), lr=0.0002, betas=(0.5, 0.999)) self.batchsize = 8 self.compute_adv_loss = WGANLoss() #.cuda() if self.use_cuda: self.G = self.G.cuda() self.D = self.D.cuda() self.compute_adv_loss = self.compute_adv_loss.cuda() self.reg_param = 10. 
#regularization parameter #Init Logger for the tensorboard # self.logger = Logger('./logs' + "/") #Logger for the scalar & histograms self.writer = SummaryWriter('./logs/') #Writter for image saving self.g_loss_record = {key: [] for key in ['train', 'val']} self.g_adv_loss_record = {key: [] for key in ['train', 'val']} self.d_loss_record = {key: [] for key in ['train', 'val']} self.d_adv_loss_fake_record = {key: [] for key in ['train', 'val']} self.d_adv_loss_real_record = {key: [] for key in ['train', 'val']} #Generator/Discriminator loss mean as indicator for the optimal model self.best_g_loss = np.power(10, 20) #np.inf self.best_d_loss = np.power(10, 20) #np.inf self.get_training_images = True self.ImagesDir = './TrainingImages/' if not os.path.exists(self.ImagesDir): os.makedirs(self.ImagesDir) self.ModelDir = './TrainingModels/' if not os.path.exists(self.ModelDir): os.makedirs(self.ModelDir) #Define dataloader train_dataset, test_dataset = load_CIFAR10_datasets( data_folder, self.latent_dim, self.Nscales) self.train_loader = DataLoader(train_dataset, batch_size=self.batchsize, shuffle=True, num_workers=30, pin_memory=True) self.test_loader = DataLoader(test_dataset, batch_size=self.batchsize, shuffle=True, num_workers=30, pin_memory=True) self.train_len = len(train_dataset) self.test_len = len(test_dataset) #Load label names self.label_names = load_CIFAR10_label_names(data_folder) #Fixed latent code to generate always the same image # to follow the evolution of the training self.fixed_LC = [ np.random.randn(1, self.latent_dim).astype(np.float32) for _ in range(self.Nscales) ] print("Initialize the networks weigths...") self.G.apply(self.he_init) self.D.apply(self.he_init) def train(self): for self.epoch in tqdm(range(self.StartEpochs, self.Nepochs)): self.G.train() self.D.train() self.phase = 'train' self.clear_loss_records() total_iter = np.ceil(self.train_len / self.batchsize) for self.counter, (self.X, _, self.latent_codes) in enumerate( 
tqdm(self.train_loader, total=total_iter, desc='train')): self.X = Variable(self.X, requires_grad=True) #.cuda() self.latent_codes = [ Variable(latent_code) for latent_code in self.latent_codes ] if self.use_cuda: self.X = self.X.cuda() self.latent_codes = [ latent_code.cuda() for latent_code in self.latent_codes ] #Pooling Real image to fit generator Output factors = [ int(self.scales[-1] / scale) for scale in self.scales ] self.Img_real = [ nn.AvgPool2d(factor, stride=factor, padding=0)(self.X) for factor in factors ] # ===update D=== self.optim_D.zero_grad() self.forward_D() self.backward_D() # ===update G=== self.forward_D() self.optim_G.zero_grad() self.forward_G() self.backward_G() # print 'record loss' self.record_loss() self.StartEpochs = self.epoch # if self.counter > 1000: # break # ===validation=== self.validate() # ===tensorboard visualization=== self.tensorboard() # ===save model=== self.save() def forward_D(self): self.Img_fake = self.G(self.latent_codes) _, self.d_real = self.D(self.Img_real) _, self.d_fake = self.D([Img.detach() for Img in self.Img_fake]) #detach means that the netork is using HRfake but block the optimization #to the network that generated it, ie clone the variable as it's a new one def forward_G(self): _, self.d_fake = self.D(self.Img_fake) def backward_G(self): self.g_loss = self.compute_G_loss() self.g_loss.backward() self.optim_G.step() def backward_D(self): self.d_loss = self.compute_D_loss() #retain_graph=False because the loss function # for the gradient and the discriminator are not # the same and therefore the gradients are differents # self.d_loss.backward(retain_graph=False) self.d_loss.backward(retain_graph=True) self.d_real_reg = sum([ self.reg_param * compute_grad2(d_real_i, Img_real_i).mean() for d_real_i, Img_real_i in zip(self.d_real, self.Img_real) ]) self.d_real_reg.backward() # R1_reg(dloss_real,d_real,x_real) self.optim_D.step() def compute_G_loss(self): #Make the discr find True when fake Nlayers = 
len(self.d_fake) # learning_factors = [numpy2var((Nlayers-(i+1))*((self.epoch+1)/self.Nmax_epoch),use_cuda=True) for i in range((Nlayers))] # self.g_adv_loss = [self.compute_adv_loss(self.d_fake[i], True)*learning_factors[i] for i in range(len(self.d_fake))] self.g_adv_loss = [ self.compute_adv_loss(d_fake_i, True) for d_fake_i in self.d_fake ] #Concatenate all losses self.g_adv_loss = sum(self.g_adv_loss) return self.g_adv_loss def compute_D_loss(self): # Nlayers = len(self.d_real) # self.d_adv_loss = [] # for i in range(len(self.d_real)): # self.d_adv_loss_real = self.compute_adv_loss(self.d_real[i], True) # self.d_adv_loss_fake = self.compute_adv_loss(self.d_fake[i], False) # # learning_factor = numpy2var((Nlayers-(i+1))*((self.epoch+1)/self.Nmax_epoch),use_cuda=True) # # self.d_adv_loss.append((self.d_adv_loss_real + self.d_adv_loss_fake)*learning_factor) # self.d_adv_loss.append((self.d_adv_loss_real + self.d_adv_loss_fake)) # # #Concatenate all losses # self.d_adv_loss = sum(self.d_adv_loss) self.d_adv_loss_real = sum([ self.compute_adv_loss(d_real_i, True) for d_real_i in self.d_real ]) self.d_adv_loss_fake = sum([ self.compute_adv_loss(d_fake_i, False) for d_fake_i in self.d_fake ]) #Regularization on the gradient of real samples # self.d_adv_loss_real = self.d_adv_loss_real + self.d_real_reg self.d_adv_loss = (self.d_adv_loss_real + self.d_adv_loss_fake) / (2. 
                                       * len(self.d_real))
        return self.d_adv_loss

    def record_loss(self):
        # Append the current generator/discriminator loss scalars (moved to
        # host memory via var2numpy) to the records of the active phase.
        p = self.phase
        self.g_loss_record[p].append(
            var2numpy(self.g_loss.mean(), use_cuda=self.use_cuda))
        self.d_loss_record[p].append(
            var2numpy(self.d_loss.mean(), use_cuda=self.use_cuda))
        self.d_adv_loss_fake_record[p].append(
            var2numpy(self.d_adv_loss_fake.mean(), use_cuda=self.use_cuda))
        self.d_adv_loss_real_record[p].append(
            var2numpy(self.d_adv_loss_real.mean(), use_cuda=self.use_cuda))

    def clear_loss_records(self):
        # Reset every per-phase loss record to an empty list.
        for p in ['train', 'val']:
            self.g_loss_record[p] = []
            self.d_loss_record[p] = []
            self.d_adv_loss_fake_record[p] = []
            self.d_adv_loss_real_record[p] = []

    def validate(self):
        """Run one full pass over the test loader in eval mode, computing and
        recording G/D losses under 'val' (no weight updates)."""
        self.G.eval()
        self.D.eval()
        self.phase = 'val'
        total_iter = np.ceil(self.test_len / self.batchsize)
        for self.counter, (self.X, _, self.latent_codes) in enumerate(
                tqdm(self.test_loader, total=total_iter, desc='validation')):
            # Generate latent code
            self.X = Variable(self.X, requires_grad=True)  #.cuda()
            # self.X = Variable(self.X)#.cuda()
            self.latent_codes = [
                Variable(latent_code) for latent_code in self.latent_codes
            ]
            if self.use_cuda:
                self.X = self.X.cuda()
                self.latent_codes = [
                    latent_code.cuda() for latent_code in self.latent_codes
                ]
            with torch.no_grad():
                # Pooling real image to fit generator output: one average-pooled
                # copy of X per scale of the pyramid.
                # factors = [int(self.ImgSizes[-1]/imgsize) for imgsize in self.ImgSizes]
                factors = [
                    int(self.scales[-1] / scale) for scale in self.scales
                ]
                self.Img_real = [
                    nn.AvgPool2d(factor, stride=factor, padding=0)(self.X)
                    for factor in factors
                ]
                self.forward_D()
                self.forward_G()
                self.g_loss = self.compute_G_loss()
                self.d_loss = self.compute_D_loss()
                self.record_loss()
            # if self.counter > 1000:
            #     break

    def predict(self, img, labels, batchsize=1):
        """Run the discriminator over `img` and return its per-scale feature
        lists, each stacked into one numpy array.

        NOTE(review): assumes batchsize=1 — only element [0] of each feature
        batch is kept; confirm before calling with a larger batch.
        """
        dataset = dataset_h5(img, labels, self.latent_dim, self.Nscales)
        data_loader = DataLoader(dataset,
                                 batch_size=batchsize,
                                 shuffle=False,
                                 num_workers=30,
                                 pin_memory=True)
        self.D.eval()
        total_iter = np.ceil(img.shape[0] / batchsize)
        features = [[] for _ in self.scales]
        for counter, (X, _, _) in enumerate(
                tqdm(data_loader, total=total_iter, desc='prediction')):
            X = Variable(X)
            if self.use_cuda:
                X = X.cuda()
            with torch.no_grad():
                # Pooling real image to fit generator output (one per scale).
                factors = [
                    int(self.scales[-1] / scale) for scale in self.scales
                ]
                Img_real = [
                    nn.AvgPool2d(factor, stride=factor, padding=0)(X)
                    for factor in factors
                ]
                # D(...)[0] is the per-scale feature list; move to host memory.
                for i, feat in enumerate(self.D(Img_real)[0]):
                    if self.use_cuda:
                        feat = feat.cpu()
                    features[i].append(feat.data.numpy()[0])
        features = [np.vstack(feat) for feat in features]
        return features

    def generate(self, Nimages=1, latent_codes=None):
        """Sample `Nimages` image pyramids from the generator; each entry is a
        list of HWC numpy images, one per scale. Fresh latent codes are drawn
        unless `latent_codes` is supplied."""
        self.G.eval()
        # NOTE(review): `== None` should be `is None`; left unchanged here.
        get_latent = True if latent_codes == None else False
        gen_image = [[] for i in range(Nimages)]
        for i in range(Nimages):
            # Generate latent code.
            # NOTE(review): np.random.seed returns None, so `seed` is unused;
            # the call only reseeds the global RNG.
            seed = np.random.seed(i + datetime.now().second +
                                  datetime.now().microsecond)
            if get_latent:
                latent_codes = [
                    np.random.randn(1, self.latent_dim).astype(np.float32)
                    for _ in range(self.Nscales)
                ]
            # NOTE(review): torch.from_numpy is re-applied to latent_codes on
            # every iteration; with caller-supplied codes and Nimages > 1 the
            # second pass would receive already-wrapped tensors — confirm.
            latent_codes = [
                Variable(torch.from_numpy(latent_code))
                for latent_code in latent_codes
            ]
            if self.use_cuda:
                latent_codes = [
                    latent_code.cuda() for latent_code in latent_codes
                ]
            with torch.no_grad():
                # Collect every scale emitted by the generator as HWC numpy.
                # factors = [int(self.ImgSizes[-1]/imgsize) for imgsize in self.ImgSizes]
                for X in self.G(latent_codes):
                    if self.use_cuda:
                        X = X.cpu()
                    gen_image[i].append(X.data.numpy()[0].transpose(
                        (1, 2, 0)))
        return gen_image

    def save(self):
        """Checkpoint G and D (weights + optimizer state + epoch) under
        ModelDir/Epoch<N>-{G,D}.pth and update the best-loss trackers.

        NOTE(review): the `if True:` guards replace commented-out
        "save only on improvement" conditions, so every epoch is saved.
        """
        file_name = os.path.join(self.ModelDir, 'Epoch%d' % (self.epoch))
        g_file = file_name + '-G.pth'
        d_file = file_name + '-D.pth'
        g_loss_mean = np.array(self.g_loss_record['val']).mean()
        d_loss_mean = np.array(self.d_loss_record['val']).mean()
        # if g_loss_mean<self.best_g_loss:
        if True:
            state = {
                'state_dict': self.G.state_dict(),
                'optimizer': self.optim_G.state_dict(),
                'epoch': self.epoch,
            }
            torch.save(state, g_file)
            self.best_g_loss = g_loss_mean
        # if d_loss_mean<self.best_d_loss:
        if True:
            state = {
                'state_dict': self.D.state_dict(),
                'optimizer': self.optim_D.state_dict(),
                'epoch': self.epoch,
            }
            torch.save(state, d_file)
            self.best_d_loss = d_loss_mean

    def load(self, Gpath, Dpath):
        """Restore G and D weights and optimizer states from checkpoints
        written by save()."""
        state_g = torch.load(Gpath)
        self.G.load_state_dict(state_g['state_dict'])
        self.optim_G.load_state_dict(state_g['optimizer'])
        state_d = torch.load(Dpath)
        self.D.load_state_dict(state_d['state_dict'])
        self.optim_D.load_state_dict(state_d['optimizer'])
        # Reset the best loss for the generator and discriminator.
        self.best_g_loss = np.power(10, 20)  #np.inf
        self.best_d_loss = np.power(10, 20)  #np.inf

    def tensorboard(self):
        """Write epoch-level loss scalars, parameter/gradient histograms and
        (optionally) a sample image grid."""
        # ===Add scalar losses===
        for p in ['train', 'val']:
            prefix = p + '/'
            info = {
                prefix + 'G_loss':
                np.array(self.g_loss_record[p]).mean(),
                prefix + 'D_loss':
                np.array(self.d_loss_record[p]).mean(),
                prefix + 'D_adv_loss_fake':
                np.array(self.d_adv_loss_fake_record[p]).mean(),
                prefix + 'D_adv_loss_real':
                np.array(self.d_adv_loss_real_record[p]).mean()
            }
            # self.writer.add_scalars(p, info, self.epoch)
            for tag, value in info.items():
                # NOTE(review): add_scalars with a single-entry {tag: value}
                # dict — add_scalar would suffice; confirm intended.
                self.writer.add_scalars(tag, {tag: value}, self.epoch)
                # self.logger.scalar_summary(tag, value, self.epoch)
        # ===Add gradient histogram===
        # NOTE(review): `prefix` here is the leftover value from the loop
        # above (always 'val/'), so all histograms are tagged under val/ —
        # confirm this is intended.
        for tag, value in self.G.named_parameters():
            tag = tag.replace('.', '/')
            self.writer.add_histogram('G/' + prefix + tag, var2numpy(value),
                                      self.epoch)
            # self.logger.histo_summary('G/' + prefix +tag, var2numpy(value), self.epoch)
            if value.grad is not None:
                self.writer.add_histogram('G/' + prefix + tag + '/grad',
                                          var2numpy(value.grad), self.epoch)
                # self.logger.histo_summary('G/' + prefix +tag + '/grad', var2numpy(value.grad), self.epoch)
        for tag, value in self.D.named_parameters():
            tag = tag.replace('.', '/')
            self.writer.add_histogram('D/' + prefix + tag, var2numpy(value),
                                      self.epoch)
            # self.logger.histo_summary('D/' + prefix + tag, var2numpy(value), self.epoch)
            if value.grad is not None:
                self.writer.add_histogram('D/' + prefix + tag + '/grad',
                                          var2numpy(value.grad), self.epoch)
                # self.logger.histo_summary('D/' + prefix + tag + '/grad',var2numpy(value.grad), self.epoch)
        #===generate sample images===
        if self.get_training_images == True:
            # # K = np.random.randint(self.batchsize)
            # # f,ax = plt.subplots(1,len(self.Img_fake),figsize=(int(5*len(self.Img_fake)),5))
            # for i in range(len(self.Img_fake)):
            #     if self.use_cuda:
            #         img = self.Img_fake[i].cpu()
            #     else:
            #         img = self.Img_fake[i]
            # # img = (img.data.numpy()[K]).transpose((1,2,0))
            # Plot one generated pyramid (fixed latent codes) and save to disk.
            gen_img = self.generate(Nimages=1, latent_codes=self.fixed_LC)[0]
            f, ax = plt.subplots(1,
                                 len(gen_img),
                                 figsize=(5 * len(gen_img), 5))
            for i, img in enumerate(gen_img):
                ax[i].imshow(img)
            plt.savefig(os.path.join(self.ImagesDir,
                                     'Img-Epoch%d.png' % (self.epoch)),
                        format='png')

    def he_init(self, layer, nonlinearity='conv2d'):
        """Apply Kaiming-normal initialization to `layer` if it is a Conv
        layer; `nonlinearity` selects the gain."""
        classname = layer.__class__.__name__
        # Check if the layer is a convolution.
        # If True, apply Kaiming initialization.
        if classname.find('Conv') != -1:
            nonlinearity = nonlinearity.lower()
            if nonlinearity not in [
                    'linear', 'conv1d', 'conv2d', 'conv3d', 'relu',
                    'leaky_relu', 'sigmoid', 'tanh'
            ]:
                # Unknown nonlinearity: fall back to a layer-provided gain.
                if not hasattr(layer, 'gain') or layer.gain is None:
                    gain = 0  # default
                else:
                    gain = layer.gain
            elif nonlinearity == 'leaky_relu':
                # assert param is not None, 'Negative_slope(param) should be given.'
                gain = calculate_gain(nonlinearity, self.SlopLRelu)
            else:
                gain = calculate_gain(nonlinearity)
            # NOTE(review): kaiming_normal is the pre-0.4 name (now
            # kaiming_normal_) and its `a` parameter is the negative slope,
            # not a gain — confirm this usage is intended.
            kaiming_normal(layer.weight, a=gain)

    def copy(self, model):
        '''
        Copy all instance attributes from another (pretrained) model object
        into this one.
        '''
        for key in model.__dict__.keys():
            self.__dict__[key] = model.__dict__[key]
class SummaryWorker(multiprocessing.Process):
    """Background process that drains a queue of pre-copied summary payloads
    and writes them to TensorBoard.

    The training process calls the worker like a function
    (``worker('scalar', **kwargs)``): if the matching rate-limit timer fires,
    the payload is detached from the graph/GPU by the ``copy_<name>`` method
    and queued; the worker process then renders it with ``summary_<name>``.
    """

    def __init__(self, env):
        super(SummaryWorker, self).__init__()
        self.env = env
        self.config = env.config
        self.queue = multiprocessing.Queue()
        # Each summary family has its own rate-limiting timer; a missing
        # config option disables that family (timer always returns False).
        try:
            self.timer_scalar = utils.train.Timer(env.config.getfloat('summary', 'scalar'))
        except configparser.NoOptionError:
            self.timer_scalar = lambda: False
        try:
            self.timer_image = utils.train.Timer(env.config.getfloat('summary', 'image'))
        except configparser.NoOptionError:
            self.timer_image = lambda: False
        try:
            self.timer_histogram = utils.train.Timer(env.config.getfloat('summary', 'histogram'))
        except configparser.NoOptionError:
            self.timer_histogram = lambda: False
        # Regex whitelist of parameter names eligible for histograms.
        with open(os.path.expanduser(os.path.expandvars(env.config.get('summary_histogram', 'parameters'))), 'r') as f:
            self.histogram_parameters = utils.RegexList([line.rstrip() for line in f])
        self.draw_bbox = utils.visualize.DrawBBox(env.config, env.category)
        self.draw_iou = utils.visualize.DrawIou(env.config)

    def __call__(self, name, **kwargs):
        """Queue a summary of kind `name` if its rate-limit timer allows it."""
        if getattr(self, 'timer_' + name)():
            kwargs = getattr(self, 'copy_' + name)(**kwargs)
            self.queue.put((name, kwargs))

    def stop(self):
        """Ask the worker loop to exit by queueing the sentinel message."""
        self.queue.put((None, {}))

    def run(self):
        # The writer must be created in the child process, not in __init__.
        self.writer = SummaryWriter(os.path.join(self.env.model_dir, self.env.args.run))
        while True:
            name, kwargs = self.queue.get()
            if name is None:  # stop() sentinel
                break
            func = getattr(self, 'summary_' + name)
            # Keep the worker alive on rendering errors, but do not swallow
            # SystemExit/KeyboardInterrupt (was a bare `except:`).
            try:
                func(**kwargs)
            except Exception:
                traceback.print_exc()

    def copy_scalar(self, **kwargs):
        """Detach loss tensors to host numpy arrays for safe queueing."""
        step, loss_total, loss, loss_hparam = (kwargs[key] for key in 'step, loss_total, loss, loss_hparam'.split(', '))
        loss_total = loss_total.data.clone().cpu().numpy()
        loss = {key: loss[key].data.clone().cpu().numpy() for key in loss}
        loss_hparam = {key: loss_hparam[key].data.clone().cpu().numpy() for key in loss_hparam}
        return dict(
            step=step,
            loss_total=loss_total,
            loss=loss, loss_hparam=loss_hparam,
        )

    def summary_scalar(self, **kwargs):
        """Write per-component losses, optional hyperparameter-weighted
        losses, and the total loss at `step`."""
        step, loss_total, loss, loss_hparam = (kwargs[key] for key in 'step, loss_total, loss, loss_hparam'.split(', '))
        for key in loss:
            self.writer.add_scalar('loss/' + key, loss[key][0], step)
        if self.config.getboolean('summary_scalar', 'loss_hparam'):
            self.writer.add_scalars('loss_hparam', {key: loss_hparam[key][0] for key in loss_hparam}, step)
        self.writer.add_scalar('loss_total', loss_total[0], step)

    def copy_image(self, **kwargs):
        """Detach image batch, predictions and the positive/negative matching
        mask to host numpy arrays for safe queueing."""
        step, height, width, rows, cols, data, pred, debug = (kwargs[key] for key in 'step, height, width, rows, cols, data, pred, debug'.split(', '))
        data = {key: data[key].clone().cpu().numpy() for key in 'image, yx_min, yx_max, cls'.split(', ')}
        pred = {key: pred[key].data.clone().cpu().numpy() for key in 'yx_min, yx_max, iou, logits'.split(', ') if key in pred}
        # Map positive -> 1, negative -> 0, neither -> 0.5.
        matching = (debug['positive'].float() - debug['negative'].float() + 1) / 2
        matching = matching.data.clone().cpu().numpy()
        return dict(
            step=step, height=height, width=width, rows=rows, cols=cols,
            data=data, pred=pred,
            matching=matching,
        )

    def summary_image(self, **kwargs):
        """Render ground-truth and predicted bounding boxes (and optional IoU
        heatmaps) onto image grids and write them at `step`."""
        step, height, width, rows, cols, data, pred, matching = (kwargs[key] for key in 'step, height, width, rows, cols, data, pred, matching'.split(', '))
        image = data['image']
        limit = min(self.config.getint('summary_image', 'limit'), image.shape[0])
        image = image[:limit, :, :, :]
        yx_min, yx_max, iou = (pred[key] for key in 'yx_min, yx_max, iou'.split(', '))
        # Cell coordinates -> pixel coordinates.
        scale = [height / rows, width / cols]
        yx_min, yx_max = (a * scale for a in (yx_min, yx_max))
        if 'logits' in pred:
            cls = np.argmax(F.softmax(torch.autograd.Variable(torch.from_numpy(pred['logits'])), -1).data.cpu().numpy(), -1)
        else:
            # np.int was removed in NumPy 1.24; it was an alias of builtin int.
            cls = np.zeros(iou.shape, int)
        if self.config.getboolean('summary_image', 'bbox'):
            # data
            canvas = np.copy(image)
            canvas = pybenchmark.profile('bbox/data')(self.draw_bbox_data)(canvas, *(data[key] for key in 'yx_min, yx_max, cls'.split(', ')))
            self.writer.add_image('bbox/data', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
            # pred
            canvas = np.copy(image)
            canvas = pybenchmark.profile('bbox/pred')(self.draw_bbox_pred)(canvas, yx_min, yx_max, cls, iou, nms=True)
            self.writer.add_image('bbox/pred', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
        if self.config.getboolean('summary_image', 'iou'):
            # bbox
            canvas = np.copy(image)
            canvas_data = self.draw_bbox_data(canvas, *(data[key] for key in 'yx_min, yx_max, cls'.split(', ')), colors=['g'])
            # data
            for i, canvas in enumerate(pybenchmark.profile('iou/data')(self.draw_bbox_iou)(list(map(np.copy, canvas_data)), yx_min, yx_max, cls, matching, rows, cols, colors=['w'])):
                canvas = np.stack(canvas)
                canvas = torch.from_numpy(canvas).permute(0, 3, 1, 2)
                canvas = torchvision.utils.make_grid(canvas.float(), normalize=True, scale_each=True)
                self.writer.add_image('iou/data%d' % i, canvas, step)
            # pred
            for i, canvas in enumerate(pybenchmark.profile('iou/pred')(self.draw_bbox_iou)(list(map(np.copy, canvas_data)), yx_min, yx_max, cls, iou, rows, cols, colors=['w'])):
                canvas = np.stack(canvas)
                canvas = torch.from_numpy(canvas).permute(0, 3, 1, 2)
                canvas = torchvision.utils.make_grid(canvas.float(), normalize=True, scale_each=True)
                self.writer.add_image('iou/pred%d' % i, canvas, step)

    def draw_bbox_data(self, canvas, yx_min, yx_max, cls, colors=None):
        """Draw ground-truth boxes on each canvas of the batch."""
        batch_size = len(canvas)
        # One-hot class tensors have the same rank as the box tensors.
        if len(cls.shape) == len(yx_min.shape):
            cls = np.argmax(cls, -1)
        yx_min, yx_max, cls = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls))
        # np.int -> int (alias removed in NumPy 1.24).
        return [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(canvas, yx_min, yx_max, cls)]

    def draw_bbox_pred(self, canvas, yx_min, yx_max, cls, iou, colors=None, nms=False):
        """Draw predicted boxes above the IoU threshold (optionally after
        non-maximum suppression) on each canvas of the batch."""
        batch_size = len(canvas)
        mask = iou > self.config.getfloat('detect', 'threshold')
        yx_min, yx_max = (np.reshape(a, [a.shape[0], -1, 2]) for a in (yx_min, yx_max))
        cls, iou, mask = (np.reshape(a, [a.shape[0], -1]) for a in (cls, iou, mask))
        yx_min, yx_max, cls, iou, mask = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls, iou, mask))
        yx_min, yx_max, cls, iou = ([a[m] for a, m in zip(l, mask)] for l in (yx_min, yx_max, cls, iou))
        if nms:
            overlap = self.config.getfloat('detect', 'overlap')
            keep = [pybenchmark.profile('nms')(utils.postprocess.nms)(torch.Tensor(iou), torch.Tensor(yx_min), torch.Tensor(yx_max), overlap) if iou.shape[0] > 0 else [] for yx_min, yx_max, iou in zip(yx_min, yx_max, iou)]
            # np.int -> int (alias removed in NumPy 1.24).
            keep = [np.array(k, int) for k in keep]
            yx_min, yx_max, cls = ([a[k] for a, k in zip(l, keep)] for l in (yx_min, yx_max, cls))
        return [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(canvas, yx_min, yx_max, cls)]

    def draw_bbox_iou(self, canvas_share, yx_min, yx_max, cls, iou, rows, cols, colors=None):
        """For each anchor slot, draw thresholded boxes plus a rows x cols IoU
        heatmap on a copy of the shared canvases; returns one canvas list per
        anchor slot."""
        batch_size = len(canvas_share)
        # Split per anchor slot (last box/score axis).
        yx_min, yx_max = ([np.squeeze(a, -2) for a in np.split(a, a.shape[-2], -2)] for a in (yx_min, yx_max))
        cls, iou = ([np.squeeze(a, -1) for a in np.split(a, a.shape[-1], -1)] for a in (cls, iou))
        results = []
        for i, (yx_min, yx_max, cls, iou) in enumerate(zip(yx_min, yx_max, cls, iou)):
            mask = iou > self.config.getfloat('detect', 'threshold')
            yx_min, yx_max = (np.reshape(a, [a.shape[0], -1, 2]) for a in (yx_min, yx_max))
            cls, iou, mask = (np.reshape(a, [a.shape[0], -1]) for a in (cls, iou, mask))
            yx_min, yx_max, cls, iou, mask = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls, iou, mask))
            yx_min, yx_max, cls = ([a[m] for a, m in zip(l, mask)] for l in (yx_min, yx_max, cls))
            canvas = [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(np.copy(canvas_share), yx_min, yx_max, cls)]
            iou = [np.reshape(a, [rows, cols]) for a in iou]
            canvas = [self.draw_iou(_canvas, iou) for _canvas, iou in zip(canvas, iou)]
            results.append(canvas)
        return results

    def copy_histogram(self, **kwargs):
        """Detach any tensor-valued kwargs to host numpy for safe queueing."""
        return {key: kwargs[key].data.clone().cpu().numpy() if torch.is_tensor(kwargs[key]) else kwargs[key] for key in 'step, dnn'.split(', ')}

    def summary_histogram(self, **kwargs):
        """Write a histogram for every whitelisted parameter of `dnn`."""
        step, dnn = (kwargs[key] for key in 'step, dnn'.split(', '))
        for name, param in dnn.named_parameters():
            if self.histogram_parameters(name):
                self.writer.add_histogram(name, param, step)
def train(model_path, epochs):
    """Fine-tune an ImageNet-pretrained AlexNet with a fresh 200-class head on
    the Places dataset, logging losses/accuracy/histograms to TensorBoard.

    model_path: directory for TensorBoard logs and the saved "alexnet.pkl".
    epochs: number of passes over the training set.
    Relies on module-level data_path, batch_size, learning_rate and eval().
    """
    trans = DataUtill.get_ImageNet_transform(random_horizontal_flip=True)
    train_data = DataUtill.Placesdataset(data_path, transforms=trans)
    train_data_loader = datas.DataLoader(train_data,
                                         batch_size,
                                         shuffle=True,
                                         num_workers=8)
    # Visualize the data (debug helper):
    # torchvision.utils.save_image(valid_batch["img"], "pic.png", normalize=True)
    encoder = alexnet(True)  # pretrained weights
    # Replace the final fully-connected layer with a 200-class head.
    num_fea = encoder.classifier[6].in_features
    features = list(encoder.classifier.children())[:-1]
    ofc = nn.Linear(num_fea, 200)
    nn.init.normal_(ofc.weight, 0, 0.01)  # normal_ replaces deprecated nn.init.normal
    features.append(ofc)
    encoder.classifier = nn.Sequential(*features)
    encoder = encoder.cuda()
    global_step = 0
    optimizer = optim.SGD(encoder.parameters(),
                          learning_rate,
                          0.9,
                          weight_decay=0.0005)
    # BUG FIX: the ExponentialLR scheduler was assigned back to `optimizer`,
    # so optimizer.step() below actually stepped the LR scheduler and the
    # model weights were never updated. Keep them in separate variables.
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.998)
    critizen = nn.CrossEntropyLoss()
    Writer = SummaryWriter(log_dir=model_path)
    # Evaluate once up front to establish the accuracy baseline.
    max_acc, min_eval_loss = eval(trans, encoder, critizen)
    print("初始准确率为{}%".format(max_acc))
    Writer.add_scalar("/eval/eval_loss", min_eval_loss, global_step)
    Writer.add_scalar("/eval/accuracy", max_acc, global_step)
    for epoch in range(epochs):
        for step, batch in enumerate(train_data_loader):
            global_step = global_step + 1
            image = batch["img"]  # renamed from `input` (shadowed a builtin)
            label = batch["class"]
            # torchvision.utils.save_image(image, "pic.png", normalize=True)
            image = autograd.Variable(image)
            label = autograd.Variable(label)
            image = image.cuda()
            label = label.cuda()
            encoder.zero_grad()
            output = encoder(image)
            loss = critizen(output, label.squeeze())
            # retain_graph=True was unnecessary (a fresh graph is built every
            # iteration) and only wasted memory.
            loss.backward()
            optimizer.step()
            # Decay the learning rate once per batch, matching the original
            # per-step cadence of the 0.998 exponential schedule.
            scheduler.step()
            if global_step % 100 == 0:
                Writer.add_scalar("train_loss", loss, global_step)
            if global_step % 1000 == 0:
                Writer.add_histogram("/conv1/grad", encoder.features[0].weight.grad, global_step)
                Writer.add_histogram("/conv1/weight", encoder.features[0].weight, global_step)
                Writer.add_histogram("/fc6/grad", encoder.classifier[6].weight.grad, global_step)
                Writer.add_histogram("/fc6/weight", encoder.classifier[6].weight, global_step)
                acc, eval_loss = eval(trans, encoder, critizen)
                Writer.add_scalar("/eval/accuracy", acc, global_step)
                Writer.add_scalar("/eval/eval_loss", eval_loss, global_step)
                if acc > max_acc:
                    # `max(acc, max_acc)` was redundant under acc > max_acc.
                    max_acc = acc
                    DataUtill.save_param(encoder, model_path + "alexnet.pkl")
                    print(
                        "save params in {} epoch {} step with accuracy {}% , and the loss is {}"
                        .format(epoch, step, acc, eval_loss))