def _process_batch(self, input, targets, phase): nodes = input.nodes.to(self.device) edge_sources = input.edge_sources.to(self.device) edge_targets = input.edge_targets.to(self.device) edge_distance = input.edge_distance.to(self.device) graph_indices = input.graph_indices.to(self.device) node_counts = input.node_counts.to(self.device) combine_sets = input.combine_sets.to(self.device) plane_wave = input.plane_wave.to(self.device) targets = targets.to(self.device) self.optimizer.zero_grad() with torch.set_grad_enabled(phase == 'train'): outputs = self.model(nodes, edge_sources, edge_targets, edge_distance, graph_indices, node_counts, combine_sets, plane_wave) metric_tensors = [ metric.add_batch_metric(outputs, targets) for metric in self.metrics ] if phase == 'train': loss = metric_tensors[0] loss.backward() if self.clip_value is not None: clip_grad_value_(self.model.parameters(), self.clip_value) self.optimizer.step() return metric_tensors, outputs
def update(self, ob_no, ac_na, reward_n, next_ob_no, terminal_n):
    """One double-Q update: online net selects the greedy next action, the
    target net evaluates it; minimises self.loss(Q(s,a), bellman target).

    Fixes vs. previous version:
      * the Bellman target is detached as a whole (previously only
        logical_not(terminal_n) was detached, so gradients flowed into the
        target network through q_target_plug_in);
      * gradient clipping is applied to self.q_net (the module that produced
        `q` and is being optimised) instead of self.q_decoder.
    """
    # everything else should be numpy arrays up til this point
    # NOTE(review): observations stay as numpy arrays here while actions/rewards
    # are converted to tensors — confirm self.q_net accepts numpy input.
    ob_no = np.array(ob_no)
    next_ob_no = np.array(next_ob_no)
    ac_na = ptu.from_numpy(ac_na).to(torch.long).to(self.device)
    reward_n = ptu.from_numpy(reward_n).to(self.device)
    terminal_n = ptu.from_numpy(terminal_n).to(self.device)

    # Q(s, a) for the actions actually taken.
    q = torch.gather(self.q_net(ob_no), 1, ac_na.unsqueeze(1)).squeeze()

    # Double-Q: pick the argmax action with the online net, evaluate with the target net.
    ac_qmax = torch.argmax(self.q_net(next_ob_no), dim=1).unsqueeze(1)
    q_target = self.q_net_target(next_ob_no)
    q_target_plug_in = q_target.gather(1, ac_qmax).squeeze()

    # NOTE(review): no discount factor is applied (gamma == 1 behaviour);
    # confirm that is intentional for this agent.
    target = (reward_n + q_target_plug_in * torch.logical_not(terminal_n)).detach()

    loss = self.loss(q, target)
    self.optimizer.zero_grad()
    loss.backward()
    utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
    self.optimizer.step()
def train_model(model, train_iter, val_iter, max_epoch, last_epoch=0):
    """Train a 3-level category classifier for epochs (last_epoch, max_epoch],
    validating and checkpointing after every epoch.

    Fix: `step` is clamped to at least 1 — previously `len(train_iter) // 10`
    was 0 for iterators with fewer than 10 batches, and `i % step` raised
    ZeroDivisionError.
    """
    optimizer = tr.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()))  # ,lr=0.001
    for epoch_now in range(1 + last_epoch, 1 + max_epoch):
        model.train()
        start_time = time()
        learning_rate_dacay(optimizer, epoch_now)
        # Progress marker roughly every 10% of the epoch; never zero.
        step = max(1, len(train_iter) // 10)
        for i, batch in enumerate(train_iter):
            text = batch.w[0]
            optimizer.zero_grad()
            y_true = [batch.cate1_id, batch.cate2_id, batch.cate3_id]
            y_pred = model(text.to(DEVICE))
            # One NLL loss per category level.
            nll_loss_list = [
                loss_fn(y_pred[k], y_true[k].to(DEVICE)) for k in range(3)
            ]
            tot_loss = wei_criterion(nll_loss_list)
            tot_loss.backward()
            clip_grad_value_(model.parameters(), GRAD_CLIP)
            optimizer.step()
            if i % step == 0:
                print(i * BATCH_SIZE, end=' ', flush=True)
        # NOTE(review): logs the losses of the LAST batch of the epoch —
        # confirm this placement matches the original intent.
        util.log_and_print(nll_loss_list + [tot_loss])
        print("\n %.1f min,turns:%d " % ((time() - start_time) / 60, epoch_now))
        pred_list = util.get_pred_list(model, val_iter)
        res = util.creterion_val(pred_list)
        util.log_and_print(res)
        tr.save(model.state_dict(), prodirectory + "/{}.pth".format(epoch_now))
def step(self, q, d, match, l2_ratio=0):
    """Do a training step maximising mutual information between the
    match/non-match label and the kernelised soft-Hamming similarity.

    Parameters
    ----------
    q : torch.Tensor
        A batch of queries.
    d : torch.Tensor
        A batch of documents.
    match : torch.Tensor (dtype=torch.bool)
        A matrix (2D tensor) with match[i,j] indicating if q[i] match with d[j]

    Returns
    -------
    loss : torch.Tensor (size 1)
        The loss (negative mutual information) of the current batch.
    """
    self.zero_grad()
    # Continuous relaxation of binary codes.
    code_q = torch.tanh(self.fq(q))
    code_d = torch.tanh(self.fd(d))
    # Pairwise soft Hamming similarity; shape = (#queries, #documents).
    soft_ham = torch_soft_hamming(code_q[:, None], code_d[None, :])
    histogram = self.kernel(soft_ham)
    # Average kernel bins over matching and non-matching pairs separately.
    pos_cat = histogram[match].mean(dim=0)
    neg_cat = histogram[~match].mean(dim=0)
    # Maximise mutual information by minimising its negation.
    loss = -mi_categorical_bernoulli(pos_cat, neg_cat, self.match_prob)
    loss.backward()
    clip_grad_value_(self.parameters(), 5)
    self.optim.step()
    return loss
def step(self, transitions):
    r"""Performs a single learning step.

    :param transitions: a list of :class:`rlcc.Transition`
    :type transitions: :class:`rlcc.Transition`
    """
    loss = self.loss(transitions)

    # Minimise the loss on the local network.
    self.optimizer.zero_grad()
    loss.backward()
    # Optional gradient clipping: by norm and/or by value.
    if self.clip_norm:
        nn_utils.clip_grad_norm_(self.local.parameters(), self.clip_norm)
    if self.clip_value:
        nn_utils.clip_grad_value_(self.local.parameters(), self.clip_value)
    self.optimizer.step()

    # Let every observer inspect this learning step.
    for observer in self.observers:
        observer(transitions=transitions,
                 local=self.local,
                 loss=loss,
                 learning_steps=self.learning_steps)

    # Polyak-average the target network toward the local network.
    soft_update(self.target, self.local, self.tau)
    self.learning_steps += 1
def train_gen_pg_each(policy, env, epoch, optimizer, num_clicks, recom_number,
                      max_length, batch_size=256, total_size=10000):
    """One policy-gradient training epoch for the recommendation policy.

    Returns (all_costs, mean total reward of the last sampled batch).
    NOTE(review): see the inline notes — the sampling loop executes exactly
    once and the logging branch appears unreachable; confirm intent.
    """
    policy.train()
    env.eval()
    print('\nTRAINING : Epoch ' + str(epoch))
    all_costs = []
    logs = []
    decay = 0.95
    last_time = time.time()
    # Exponential learning-rate decay from the second epoch onward.
    if epoch > 1:
        optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * decay
        print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr']))
    # NOTE(review): the step equals total_size, so this loop body runs exactly
    # once (stidx == 0). Confirm a smaller step (e.g. batch_size) wasn't intended.
    for stidx in range(0, total_size, total_size):
        real_size = total_size
        # Sample a batch of trajectories from the environment with the current policy.
        batch_replay = ReplayMemory(env, policy, real_size, max_length,
                                    num_clicks, recom_number)
        batch_replay.gen_sample(real_size)
        click_batch, reward_batch, action_batch, prob_batch = Variable(batch_replay.clicks), Variable(batch_replay.rewards), Variable(batch_replay.actions), Variable(batch_replay.probs, requires_grad=True)
        value_batch = env.value(reward_batch)
        # REINFORCE-style objective: -sum(log pi(a) * value).
        loss = -(torch.log(prob_batch) * (value_batch)).sum()
        all_costs.append(loss.data.cpu().numpy())
        optimizer.zero_grad()
        loss.backward()
        # Clip only trainable parameters' gradients to [-1, 1].
        clip_grad_value_(filter(lambda p: p.requires_grad, policy.parameters()), 1)
        optimizer.step()
        # NOTE(review): with the single-iteration loop above, len(all_costs)
        # can never reach 10000, so this logging block looks dead — confirm.
        if len(all_costs) == 10000:
            logs.append('{0} ; loss {1} ; seq/s {2}'.format(
                stidx,
                round(np.mean(all_costs), 2),
                int(len(all_costs) * batch_size / (time.time() - last_time))))
            print(logs[-1])
            last_time = time.time()
            all_costs = []
    return all_costs, reward_batch.float().sum(1).mean().data.cpu().numpy()
def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
    """
        Update the parameters of the critic.

        let sum_of_path_lengths be the sum of the lengths of the paths sampled from
            Agent.sample_trajectories
        let num_paths be the number of paths sampled from Agent.sample_trajectories

        arguments:
            ob_no: shape: (sum_of_path_lengths, ob_dim)
            next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
            reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
                the reward for each timestep
            terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
                at that timestep of 0 if the episode did not end

        returns:
            dict with the training loss (as a numpy scalar)
    """
    ob_no = ptu.from_numpy(ob_no)
    ac_na = ptu.from_numpy(ac_na).to(torch.long)
    next_ob_no = ptu.from_numpy(next_ob_no)
    reward_n = ptu.from_numpy(reward_n)
    terminal_n = ptu.from_numpy(terminal_n)

    # Q(s_t, a_t) for the actions actually taken.
    qa_t_values = self.q_net(ob_no)
    q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)

    # Q-values of the next observation from the target network.
    qa_tp1_values = self.q_net_target(next_ob_no)

    if self.double_q:
        # Double Q-learning: the best NEXT action is selected with the online
        # q_net, but its value comes from the target network
        # (https://arxiv.org/pdf/1509.06461.pdf).
        # BUG FIX: the argmax was previously taken over qa_t_values (Q-values of
        # the CURRENT observations); it must be over the online net's Q-values
        # for next_ob_no.
        best_next_actions = self.q_net(next_ob_no).argmax(dim=1)
        q_tp1 = torch.gather(qa_tp1_values, 1,
                             best_next_actions.unsqueeze(1)).squeeze(1)
    else:
        q_tp1, _ = qa_tp1_values.max(dim=1)

    # Bellman target: r + gamma * Q(s', a') * (not terminal); no gradient flows
    # through the target.
    target = reward_n + self.gamma * (q_tp1 * (1 - terminal_n))
    target = target.detach()

    assert q_t_values.shape == target.shape
    loss = self.loss(q_t_values, target)

    self.optimizer.zero_grad()
    loss.backward()
    utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
    self.optimizer.step()

    return {
        'Training Loss': ptu.to_numpy(loss),
    }
def step(self, q, d, r, stage=1):
    """Do a HashNet training step.

    Parameters
    ----------
    q : torch.Tensor
        A batch of queries.
    d : torch.Tensor
        A batch of documents.
    r : torch.Tensor (dtype=torch.bool)
        A matrix (2D tensor) with r[i,j] indicating if q[i] match with d[j]
    stage : float (optional)
        The stage for computing the tanh(stage*beta*x). (default=1)

    Returns
    -------
    loss : torch.Tensor (size 1)
        The loss of HashNet.
    """
    self.zero_grad()
    # Continuous relaxation of binary codes; `stage` sharpens the tanh over training.
    code_q = torch.tanh(stage * self.beta * self.fq(q))
    code_d = torch.tanh(stage * self.beta * self.fd(d))
    # Pairwise inner-product similarity, scaled by alpha.
    sim = (code_q[:, None] * code_d[None, :]).sum(dim=2)
    scaled_sim = self.alpha * sim
    # Class-balance weights: matches weighted by 1/p, non-matches by 1/(1-p).
    p = self.match_prob
    weights = r / p + ~r / (1 - p)
    # Weighted logistic loss on the scaled similarities.
    pair_losses = weights * (softplus(scaled_sim) - r * scaled_sim)
    loss = pair_losses.mean()
    loss.backward()
    clip_grad_value_(self.parameters(), 5)
    self.optim.step()
    return loss
def _step(self, loader: DataLoader, training: bool = True) -> float: running_loss = 0.0 self.net.train() if training else self.net.eval() for _, (_, data) in enumerate(loader): inputs, labels = data inputs = inputs.to(self.dev) labels = labels.to(self.dev) if training: self.optimizer.zero_grad() outputs = self.net(inputs.float()) loss = self.loss_function(outputs.squeeze(), labels.squeeze().float()) if training: loss.backward() clip_grad_value_(self.net.parameters(), clip_value=0.05) self.optimizer.step() running_loss += loss.item() * self.batch_size if self.use_hvd: running_loss = hvd.allreduce(torch.tensor(running_loss), name="avg_loss").item() inputs.detach() labels.detach() return running_loss
def backward(self, batch):
    """Backprop the language-model objective for `batch` and step its optimiser."""
    self.lmOpt.zero_grad()
    # backwardAgent computes the LM loss and calls .backward() internally.
    backwardAgent(batch, device=self.config.device,
                  n_quant=self.config.N_QUANT_LM)
    # Clip per-element gradients into [-10, 10] before stepping.
    clip_grad_value_(self.lm.parameters(), 10)
    self.lmOpt.step()
def step(self, q, d, match, l2_ratio=0.01, nbatch=None):
    """Do a training step.

    Parameters
    ----------
    q : torch.Tensor
        A batch of queries.
    d : torch.Tensor
        A batch of documents.
    match : torch.Tensor (dtype=torch.bool)
        A matrix (2D tensor) with match[i,j] indicating if q[i] match with d[j]
    l2_ratio : float (optional)
        The wanted ratio between the Fbeta Loss and L2 regularization.
        (default 0.01)
    nbatch : int or None (optional)
        Give the number of batch, this is uses for ramping. If None, this
        uses the final ramping value.

    Returns
    -------
    loss : torch.Tensor (size 1)
        The loss of the current batch.
    """
    self.zero_grad()
    batch_loss = self.loss(q, d, match, l2_ratio=l2_ratio, nbatch=nbatch)
    batch_loss.backward()
    # Clip per-element gradients into [-5, 5] before the optimiser step.
    clip_grad_value_(self.parameters(), 5)
    self.optim.step()
    return batch_loss
def train(self, train_x_dict=None, **kwargs):
    """Train the model.

    Parameters
    ----------
    train_x_dict : dict, optional
        Input data. Defaults to an empty dict.
    **kwargs
        Forwarded to the loss evaluation.

    Returns
    -------
    loss : torch.Tensor
        Train loss value
    """
    # Idiom fix: the previous signature used a mutable default ({}); use None
    # and materialise a fresh dict per call. Callers passing a dict are unaffected.
    if train_x_dict is None:
        train_x_dict = {}
    self.distributions.train()

    self.optimizer.zero_grad()
    loss = self.loss_cls.eval(train_x_dict, **kwargs)

    # backprop
    loss.backward()

    # Optional gradient clipping (norm and/or value).
    if self.clip_norm:
        clip_grad_norm_(self.distributions.parameters(), self.clip_norm)
    if self.clip_value:
        clip_grad_value_(self.distributions.parameters(), self.clip_value)

    # update params
    self.optimizer.step()

    return loss
def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
    """One (double) Q-learning step; returns {'Training Loss': numpy scalar}."""
    obs = ptu.from_numpy(ob_no)
    acts = ptu.from_numpy(ac_na).to(torch.long)
    next_obs = ptu.from_numpy(next_ob_no)
    rewards = ptu.from_numpy(reward_n)
    dones = ptu.from_numpy(terminal_n)

    # Q(s, a) for the actions actually taken.
    q_all = self.q_net(obs)
    q_taken = q_all.gather(1, acts.unsqueeze(1)).squeeze(1)

    q_next_all = self.q_net_target(next_obs)
    if self.double_q:
        # Double DQN: action chosen by the online net, valued by the target net.
        greedy_next = self.q_net(next_obs).argmax(dim=1)
        q_next = q_next_all.gather(1, greedy_next.unsqueeze(1)).squeeze(1)
    else:
        q_next = q_next_all.max(dim=1)[0]

    # Bellman target; detached so no gradient reaches the target network.
    bellman_target = (rewards + self.gamma * q_next * (1 - dones)).detach()

    loss = self.loss(q_taken, bellman_target)
    self.optimizer.zero_grad()
    loss.backward()
    utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
    self.optimizer.step()
    return {'Training Loss': ptu.to_numpy(loss)}
def train_step(
    self, inputs, labels, device="cpu", grad_clip_value=1, weight_clip_value=1
):
    """One optimisation step with gradient clipping and latent-weight clamping.

    Parameters: `inputs`/`labels` form one batch; `grad_clip_value` bounds each
    gradient element; `weight_clip_value` bounds the latent weight copies.
    Returns (loss tensor, accuracy as a percentage).
    Fix: use the canonical `keepdim` keyword for Tensor.argmax — `keepdims` is
    not accepted by all torch versions.
    """
    self.optim.zero_grad()
    output = self.model(inputs.to(device))
    loss = self.criterion(output, labels.to(device))
    loss.backward()
    clip_grad_value_(self.model.parameters(), grad_clip_value)
    # Restore the full-precision ("latent") weights before stepping, so the
    # update applies to the latent copy rather than the quantised values.
    for p in self.model.parameters():
        if hasattr(p, "latent_"):
            p.data.copy_(p.latent_)
    self.optim.step()
    # Clamp the updated weights and stash them back into the latent buffer.
    for p in self.model.parameters():
        if hasattr(p, "latent_"):
            p.latent_.copy_(p.data.clamp_(-weight_clip_value, weight_clip_value))
    pred = output.argmax(dim=1, keepdim=True)
    correct = (
        pred.eq(labels.to(device).view_as(pred)).sum().item() / labels.shape[0]
    ) * 100
    return loss, correct
def optimize(self, loss, clip_norm_args=None, clip_val_args=None):
    """Backpropagate `loss` and step the optimizer, optionally clipping gradients.

    Parameters
    ----------
    loss : torch.Tensor
    clip_norm_args : list, tuple
        If provided the norm of the gradients will be clipped. First value
        represents the max. grad. value, second the norm (optional).
    clip_val_args : int
        If provided the gradients will be clipped in range
        (-clip_val_args, clip_val_args)

    Note
    ----------
    clip_norm_args and clip_val_args are mutually exclusive.
    """
    both_given = clip_norm_args is not None and clip_val_args is not None
    if both_given:
        raise ValueError(
            "'clip_norm_args' and 'clip_val_args' are mutually exclusive.")
    self.optimizer.zero_grad()
    loss.backward()
    if clip_norm_args is not None:
        clip_grad_norm_(self.parameters, *clip_norm_args)
    if clip_val_args is not None:
        clip_grad_value_(self.parameters, clip_val_args)
    self.optimizer.step()
def train(epoch, loss_fn, train_loader, model, optimizer, gclip=5, schd=None):
    """Train `model` for one epoch over `train_loader`; returns the mean batch loss."""
    model.train()
    loader = progress_bar(train_loader, parent=epoch)
    total_loss = 0
    for data, target in loader:
        if is_cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        # The model returns (output, attention, extra); loss takes [output, att].
        output, att, _ = model(data)
        batch_loss = loss_fn([output, att], target)
        total_loss += batch_loss.item()
        batch_loss.backward()
        if gclip > 0:
            utils.clip_grad_value_(model.parameters(), gclip)
        optimizer.step()
        # Per-batch LR schedulers expose a `batch_step` hook.
        if schd and hasattr(schd, 'batch_step'):
            schd.batch_step()
        loader.comment = f'Loss: {batch_loss.item():.4f}'
    return total_loss / len(train_loader)
def get_objf(batch: Dict, model: AcousticModel, P: k2.Fsa, device: torch.device, graph_compiler: MmiTrainingGraphCompiler, is_training: bool, tb_writer: Optional[SummaryWriter] = None, global_batch_idx_train: Optional[int] = None, optimizer: Optional[torch.optim.Optimizer] = None): feature = batch['inputs'] # at entry, feature is [N, T, C] feature = feature.permute(0, 2, 1) # now feature is [N, C, T] assert feature.ndim == 3 feature = feature.to(device) supervisions = batch['supervisions'] supervision_segments, texts = encode_supervisions(supervisions, model.subsampling_factor) loss_fn = LFMMILoss(graph_compiler=graph_compiler, P=P, den_scale=den_scale) grad_context = nullcontext if is_training else torch.no_grad with grad_context(): nnet_output = model(feature) # nnet_output is [N, C, T] nnet_output = nnet_output.permute(0, 2, 1) # now nnet_output is [N, T, C] mmi_loss, tot_frames, all_frames = loss_fn(nnet_output, texts, supervision_segments) if is_training: def maybe_log_gradients(tag: str): if (tb_writer is not None and global_batch_idx_train is not None and global_batch_idx_train % 200 == 0): tb_writer.add_scalars(tag, measure_gradient_norms(model, norm='l1'), global_step=global_batch_idx_train) optimizer.zero_grad() (-mmi_loss).backward() maybe_log_gradients('train/grad_norms') clip_grad_value_(model.parameters(), 5.0) maybe_log_gradients('train/clipped_grad_norms') if tb_writer is not None and global_batch_idx_train % 200 == 0: # Once in a time we will perform a more costly diagnostic # to check the relative parameter change per minibatch. deltas = optim_step_and_measure_param_change(model, optimizer) tb_writer.add_scalars('train/relative_param_change_per_minibatch', deltas, global_step=global_batch_idx_train) else: optimizer.step() ans = -mmi_loss.detach().cpu().item(), tot_frames.cpu().item( ), all_frames.cpu().item() return ans
def clip_grad_func(parameters, method: str, **kwargs):
    """Dispatch gradient clipping over `parameters` by `method`.

    `method` is 'norm' (kwargs forwarded to clip_grad_norm_) or 'value'
    (kwargs forwarded to clip_grad_value_).

    Raises
    ------
    ValueError
        If `method` is neither 'norm' nor 'value'.
    """
    if method == 'value':
        clip_grad_value_(parameters, **kwargs)
        return
    if method == 'norm':
        clip_grad_norm_(parameters, **kwargs)
        return
    raise ValueError("Wrong gradient clip type!")
def _train_batch(self, batch: CollateBatch) -> Dict[str, Any]:
    """Run one training batch: backprop, clip gradients, step, collect metrics."""
    batch: Dict[str, torch.Tensor] = batch.to_device(
        device=self._cuda_device, non_blocking=True
    )
    output_dict = self._pytorch_model(**batch).pop("loss_info")
    loss = output_dict.get("batch-loss")
    loss.backward()
    # Gradient clipping — norm and/or value, whichever is configured.
    if self._grad_norm is not None:
        clip_grad_norm_(self._model.parameters(), self._grad_norm)
    if self._grad_clip is not None:
        clip_grad_value_(self._model.parameters(), self._grad_clip)
    # Update step
    self._perform_one_step()
    metrics = self._model.get_metrics()
    # Fold scalars from the output dict into the metrics (tensors -> floats).
    for key, value in output_dict.items():
        metrics[key] = value.item() if isinstance(value, torch.Tensor) else value
    # Record the current learning rate(s).
    if self._encoder_scheduler is not None:
        metrics["encoder_lr"] = self._encoder_scheduler.get_current_lr()[0]
        metrics["decoder_lr"] = self._decoder_scheduler.get_current_lr()[0]
    else:
        metrics["lr"] = self._scheduler.get_current_lr()[0]
    return metrics
def run(config):
    """Build the model, loss, optimizer, scheduler and data loaders from
    `config`, then launch training starting at epoch 0."""
    model = get_model(config[MODEL_NAME], config[MODEL_PARAMS]).cuda()
    criterion = get_loss(config[LOSS_NAME], config[LOSS_PARAMS])
    optimizer = get_optimizer(config[OPTIM_NAME], model.parameters(),
                              optimizer_params=config[OPTIM_PARAMS])
    last_epoch = -1  # fresh run; no checkpoint resume here
    scheduler = get_scheduler(config[SCHEDULER_NAME], optimizer, last_epoch,
                              config[SCHEDULER_PARAMS])
    # One dataset / dataloader pair per stage.
    datasets = {
        stage: CustomDataset(DATA_DIR, stage, config[FOLD_ID],
                             config[DATA_PREFIX], config[INPUT_SIZE])
        for stage in ['train', 'test']
    }
    dataloaders = {
        stage: get_dataloader(datasets[stage], config[BATCH_SIZE])
        for stage in ['train', 'test']
    }
    writer = SummaryWriter(config[TRAIN_DIR])
    # NOTE(review): clip_grad_value_ is called once here, before any backward
    # pass, so all grads are still None and this is a no-op. If per-step value
    # clipping is intended it must happen inside the training loop — confirm.
    clip_grad_value_(model.parameters(), 2.0)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          last_epoch + 1)
def train_epoch(epoch, args, model, dataloader_train, optimizer, scheduler,
                feature_map):
    """Train every sub-network in `model` for one epoch; returns mean batch loss.

    `model`, `optimizer` and `scheduler` are dicts of named sub-components.
    """
    for net in model.values():
        net.train()
    batch_count = len(dataloader_train)

    total_loss = 0.0
    for batch_id, data in enumerate(dataloader_train):
        for net in model.values():
            net.zero_grad()

        loss = evaluate_loss(args, model, data, feature_map)
        loss.backward()
        total_loss += loss.data.item()

        # Clipping gradients
        if args.gradient_clipping:
            for net in model.values():
                clip_grad_value_(net.parameters(), 1.0)

        # Update params of rnn and mlp
        for opt in optimizer.values():
            opt.step()
        for sched in scheduler.values():
            sched.step()

        if args.log_tensorboard:
            log_value('train_batch_loss ' + args.fname, loss,
                      batch_id + batch_count * epoch)

    return total_loss / batch_count
def iterate_func(engine, batch):
    """One siamese training iteration (engine event handler).

    Computes a contrastive loss on the pair distance plus one identity
    classification loss per branch, backprops their sum, clips gradients and
    steps the closure-captured optimizer.

    Returns (total, contrast, class_A, class_B) losses as Python floats.
    """
    optimizer.zero_grad()
    inputA, inputB, target, personA, personB, ind, _, _ = batch
    # Promote an unbatched 4-D clip pair to a batch of size one.
    if len(inputA.shape) == len(inputB.shape) == 4:
        inputA = torch.unsqueeze(inputA, 0)
        inputB = torch.unsqueeze(inputB, 0)
    # Both sequences must have the configured temporal length.
    assert inputA.shape[1] == inputB.shape[1] == opt.sampleSeqLength, \
        ValueError(f"ind: {ind}, inputA {inputA.shape}, inputB {inputB.shape}, required seq lenth {opt.sampleSeqLength}")
    if torch.cuda.is_available():
        inputA = inputA.float().cuda()
        inputB = inputB.float().cuda()
        target = target.float().cuda()
        personA = personA.long().cuda()
        personB = personB.long().cuda()
    distance, outputA, outputB = model(inputA, inputB)
    contrast_loss = contrast_criterion(distance, target)
    class_loss_A = class_criterion_A(outputA, personA)
    class_loss_B = class_criterion_B(outputB, personB)
    loss = contrast_loss + class_loss_A + class_loss_B
    loss.backward()
    # When opt.gradClip is unset/0, fall back to sys.maxsize (effectively no clipping).
    clip_grad_value_(model.parameters(), clip_value=opt.gradClip or sys.maxsize)
    optimizer.step()
    return loss.item(), contrast_loss.item(), class_loss_A.item(
    ), class_loss_B.item()
def train_lfmmi_one_iter(model, egs_file, den_fst_path, training_opts, feat_dim,
                         minibatch_size="64", use_gpu=True, lr=0.0001,
                         weight_decay=0.25, frame_shift=0, print_interval=10):
    """Run one LF-MMI training iteration over the egs in `egs_file`.

    Trains with SGD (lr, weight_decay) and value-clipped gradients; prints the
    mean objective every `print_interval` minibatches. Returns the model moved
    back to CPU.
    """
    pkwrap.kaldi.InstantiateKaldiCuda()
    if training_opts is None:
        training_opts = pkwrap.kaldi.chain.CreateChainTrainingOptionsDefault()
    den_graph = pkwrap.kaldi.chain.LoadDenominatorGraph(den_fst_path,
                                                        model.output_dim)
    criterion = pkwrap.chain.KaldiChainObjfFunction.apply
    if use_gpu:
        model = model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Running sum of the objective, reported every print_interval minibatches.
    acc_sum = torch.tensor(0., requires_grad=False)
    for mb_id, merged_egs in enumerate(
            pkwrap.chain.prepare_minibatch(egs_file, minibatch_size)):
        features = pkwrap.kaldi.chain.GetFeaturesFromEgs(merged_egs)
        # 1+frame_shift because we generated a context of 14, not 13 as required by the model
        features = features[:, 1 + frame_shift:1 + 140 + 25 + frame_shift, :]
        features = features.cuda()
        output, xent_output = model(features)
        sup = pkwrap.kaldi.chain.GetSupervisionFromEgs(merged_egs)
        # deriv[0] holds the objective value; backward() propagates the chain derivs.
        deriv = criterion(training_opts, den_graph, sup, output, xent_output)
        acc_sum.add_(deriv[0])
        if mb_id > 0 and mb_id % print_interval == 0:
            # Report the mean objective and reset the accumulator.
            sys.stderr.write("Overall objf={}\n".format(acc_sum / print_interval))
            acc_sum.zero_()
        optimizer.zero_grad()
        deriv.backward()
        clip_grad_value_(model.parameters(), 5.0)
        optimizer.step()
    sys.stdout.flush()
    model = model.cpu()
    return model
def on_backward_end(self, *args, **kwargs):
    """After backward: clip gradients of every registered module, either by
    value or by norm depending on the configured mode."""
    clip = clip_grad_value_ if self.__mode == "value" else clip_grad_norm_
    for module in self.__modules:
        clip(module.parameters(), **self.__kwargs)
def train_epoch(
        epoch, args, model, dataloader_train, optimizer, scheduler,
        feature_map, summary_writer=None):
    """Train every sub-network in `model` for one epoch, optionally logging
    per-batch losses to TensorBoard; returns the mean batch loss."""
    for net in model.values():
        net.train()
    batch_count = len(dataloader_train)

    total_loss = 0.0
    for batch_id, data in enumerate(dataloader_train):
        for net in model.values():
            net.zero_grad()

        loss = evaluate_loss(args, model, data, feature_map)
        loss.backward()
        total_loss += loss.data.item()

        # Clipping gradients
        if args.gradient_clipping:
            for net in model.values():
                clip_grad_value_(net.parameters(), 1.0)

        # Update params of rnn and mlp
        for opt in optimizer.values():
            opt.step()
        for sched in scheduler.values():
            sched.step()

        if args.log_tensorboard:
            summary_writer.add_scalar(
                '{} {} Loss/train batch'.format(args.note, args.graph_type),
                loss, batch_id + batch_count * epoch)

    return total_loss / batch_count
def train(loader, model, crit, optimizer, lr_scheduler, opt, rl_crit=None):
    """Train a video-captioning model, switching to self-critical RL training
    after opt["self_crit_after"] epochs; checkpoints periodically.

    Fix: `optimizer.zero_grad()` was never called, so gradients accumulated
    across all iterations and epochs; gradients are now cleared before every
    backward pass.
    """
    model.train()
    # model = nn.DataParallel(model)
    for epoch in range(opt["epochs"]):
        lr_scheduler.step()
        iteration = 0
        # If start self crit training
        if opt["self_crit_after"] != -1 and epoch >= opt["self_crit_after"]:
            sc_flag = True
            init_cider_scorer(opt["cached_tokens"])
        else:
            sc_flag = False

        for data in loader:
            # torch.cuda.synchronize()
            i3d_feats = data['i3d_feats'].squeeze(1)  # .cuda()
            labels = data['labels']  # .cuda()
            masks = data['masks']  # .cuda()

            if not sc_flag:
                # Standard cross-entropy training (teacher forcing).
                seq_probs, _ = model(i3d_feats, labels, 'train')
                loss = crit(seq_probs, labels[:, 1:], masks[:, 1:])
            else:
                # Self-critical sequence training: reward = CIDEr advantage.
                seq_probs, seq_preds = model(i3d_feats, mode='inference', opt=opt)
                reward = get_self_critical_reward(model, i3d_feats, data,
                                                  seq_preds)
                print(reward.shape)
                loss = rl_crit(seq_probs, seq_preds,
                               torch.from_numpy(reward).float())  # .cuda()

            # BUG FIX: clear stale gradients before accumulating new ones.
            optimizer.zero_grad()
            loss.backward()
            clip_grad_value_(model.parameters(), opt['grad_clip'])
            optimizer.step()
            train_loss = loss.item()
            # torch.cuda.synchronize()
            iteration += 1

            if not sc_flag:
                print("iter %d (epoch %d), train_loss = %.6f" %
                      (iteration, epoch, train_loss))
            else:
                print("iter %d (epoch %d), avg_reward = %.6f" %
                      (iteration, epoch, np.mean(reward[:, 0])))

        if epoch % opt["save_checkpoint_every"] == 0:
            model_path = os.path.join(opt["checkpoint_path"],
                                      'model_%d.pth' % (epoch))
            model_info_path = os.path.join(opt["checkpoint_path"],
                                           'model_score.txt')
            torch.save(model.state_dict(), model_path)
            print("model saved to %s" % (model_path))
            with open(model_info_path, 'a') as f:
                f.write("model_%d, loss: %.6f\n" % (epoch, train_loss))
def train_epoch(epoch, args, model, dataloader_train, optimizer, scheduler,
                feature_map, summary_writer=None):
    """One training epoch over all sub-networks in `model`.

    Returns (mean batch loss, binary accuracy of thresholded predictions).
    """
    for net in model.values():
        net.train()
    batch_count = len(dataloader_train)  # number of batches

    y_preds, y_trues = [], []
    total_loss = 0.0
    for batch_id, data in enumerate(dataloader_train):
        for net in model.values():
            net.zero_grad()

        loss, y_pred, y_true = evaluate_loss(args, model, data, feature_map)
        y_preds.extend(y_pred)
        y_trues.extend(y_true)

        loss.backward()
        total_loss += loss.data.item()

        # Clipping gradients
        if args.gradient_clipping:
            for net in model.values():
                clip_grad_value_(net.parameters(), 1.0)

        # Update params of rnn and mlp
        for opt in optimizer.values():
            opt.step()
        for sched in scheduler.values():
            sched.step()

        if args.log_tensorboard:
            summary_writer.add_scalar(
                '{} {} Loss/train batch'.format(args.note, args.graph_type),
                loss, batch_id + batch_count * epoch)

    # Threshold raw scores at 0.5 to obtain hard binary predictions.
    y_preds = [1. if score >= 0.5 else 0. for score in y_preds]
    return total_loss / batch_count, accuracy_score(y_trues, y_preds)
def train(self):
    """One training epoch for (optionally multi-exit) super-resolution.

    Logs per-window losses through self.ckp and records the last loss in
    self.error_last.
    """
    self.optimizer.schedule()
    self.loss.step()
    epoch = self.optimizer.get_last_epoch() + 1
    lr = self.optimizer.get_lr()

    self.ckp.write_log(
        '[Epoch {}]\tLearning rate: {:.2e}'.format(epoch, Decimal(lr)))
    self.loss.start_log()
    self.model.train()

    timer_data, timer_model = utility.timer(), utility.timer()
    # Per-exit running losses, accumulated over one print window (multi-exit only).
    loss_exits = None
    for batch, (lr, hr, _, idx_scale) in enumerate(self.loader_train):
        # NOTE(review): `lr` (low-res batch) shadows the learning rate read above.
        lr, hr = self.prepare(lr, hr)
        timer_data.hold()
        timer_model.tic()

        self.optimizer.zero_grad()
        sr = self.model(lr, idx_scale)
        loss = self.loss(sr, hr)
        if loss_exits is None and self.args.multi_exit:
            loss_exits = [0 for _ in range(len(sr))]
        if self.args.multi_exit:
            # loss[0] is the combined loss; loss[1:] are the per-exit losses,
            # averaged over the print window.
            temp_loss = loss[1:]
            for i in range(len(temp_loss)):
                loss_exits[i] += temp_loss[i] / self.args.print_every
            loss = loss[0]
        loss.backward()
        if self.args.gclip > 0:
            utils.clip_grad_value_(self.model.parameters(), self.args.gclip)
        self.optimizer.step()

        timer_model.hold()
        if (batch + 1) % self.args.print_every == 0:
            if loss_exits:
                # Render " E0: x E1: y ..." for the per-exit window averages.
                exits_loss_str = ""
                exits_loss_str += "E0" + ": %.4f" % loss_exits[0].item()
                for i in range(1, len(loss_exits)):
                    exits_loss_str += " E" + str(i) + ": %.4f" % loss_exits[i].item()
                self.ckp.write_log('[{}/{}]\t{}\t{}\t{:.1f}+{:.1f}s'.format(
                    (batch + 1) * self.args.batch_size,
                    len(self.loader_train.dataset),
                    self.loss.display_loss(batch),
                    exits_loss_str,
                    timer_model.release(),
                    timer_data.release()))
                # Reset the window accumulator; it is re-created next batch.
                loss_exits = None
            else:
                self.ckp.write_log('[{}/{}]\t{}\t{:.1f}+{:.1f}s'.format(
                    (batch + 1) * self.args.batch_size,
                    len(self.loader_train.dataset),
                    self.loss.display_loss(batch),
                    timer_model.release(),
                    timer_data.release()))
        timer_data.tic()

    self.loss.end_log(len(self.loader_train))
    self.error_last = self.loss.log[-1, -1]
def _clip_grad( params: Union[Tensor, Iterable[Tensor]], clip_norm: Optional[Union[int, float]] = None, clip_value: Optional[Union[int, float]] = None, ) -> None: if clip_norm is not None: clip_grad_norm_(params, clip_norm) if clip_value is not None: clip_grad_value_(params, clip_value)
def train(model, data_loader, data_loader_test, optimizer, epoch=5, thres=0.5,
          weight=1, agg='mean', save_name=None, multilayer=True, local=False,
          graphsage=False):
    """Train with hard-negative mining, checkpoint each epoch, and evaluate on
    the test loader during the final two epochs.

    Fixes: the bare `except: continue` also swallowed SystemExit and
    KeyboardInterrupt and hid real bugs — narrowed to `except Exception`;
    removed the unused `min_loss` local.
    """
    for i in range(epoch):
        total_loss = 0
        print('epoch: ', i)
        for index, d in enumerate(data_loader):
            try:
                model.train()
                model.to(Device)
                hidden_vec, label = model(d[0])
                # Positive term: -log p over true-positive entries.
                loss = torch.sum(-1. * torch.log(hidden_vec[label == 1]))
                N = torch.sum(label == 1).item()
                # hard negative mining for loss back prop:
                # keep only the 3N hardest negatives.
                neg_loss = torch.sort(
                    -1. * torch.log(1 - hidden_vec[label == 0]).view(-1, ),
                    descending=True)[0][:3 * N]
                neg_N = len(neg_loss)
                loss += torch.sum(neg_loss)
                loss = loss / (N + neg_N)
                # Skip degenerate batches (e.g. log(0) producing nan/inf).
                if math.isnan(loss.item()) or math.isinf(loss.item()):
                    continue
                loss.backward()
                utils.clip_grad_value_(model.parameters(), 4)
                optimizer.step()
                optimizer.zero_grad()
                total_loss += loss.item()
                current_loss = total_loss / (index + 1)
                if index % 50 == 0:
                    print(f'current loss for index:{index} is: {current_loss}')
            except Exception:
                # Best-effort: a bad batch must not kill the whole run.
                continue
        path = f'{save_name}/checkpoints.pt'
        torch.save(model.state_dict(), path)
        if i >= epoch - 2:
            pr_rec = eval_model(model, data_loader_test)
            path = f'{save_name}/pr_rec'
            with open(path, 'wb') as f:
                pickle.dump(pr_rec, f)