def train_emb(self, images, captions, lengths, ids=None, instance_ids=None, *args):
    """One training step given images and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    img_emb, cap_emb = self.forward_emb(images, captions, lengths)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    l_list = [int(i) for i in lengths]
    # NOTE: max_length is assumed to be defined at module scope;
    # captions are padded to max_length + 3 positions.
    mask = Variable(torch.ByteTensor(
        [i * [1] + (max_length + 3 - i) * [0] for i in l_list])).cuda()
    loss = self.forward_loss(img_emb, cap_emb, instance_ids, mask)

    # compute gradient and do an optimization step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    if self.embed_mask is not None:
        # zero the gradients of frozen embedding rows
        # (renamed loop variable so it does not shadow the padding mask)
        for i, m in enumerate(self.embed_mask):
            if m:
                self.txt_enc.module.embed.weight.grad.data[i].zero_()
    self.optimizer.step()
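# A minimal sketch (not from the original repo) of the same padding-mask
# construction using broadcasting instead of a Python list comprehension.
# Like the snippet above, it assumes every caption is padded to
# max_length + 3 positions; `build_padding_mask` is a hypothetical helper.
import torch

def build_padding_mask(lengths, max_length):
    lengths = torch.as_tensor(lengths)
    positions = torch.arange(max_length + 3).unsqueeze(0)       # (1, L)
    return (positions < lengths.unsqueeze(1)).to(torch.uint8)   # (B, L)

# Example: lengths [3, 5] with max_length = 4 give a (2, 7) mask whose
# first row has three ones and whose second row has five.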
def train_emb(self, videos_1, videos_2, ids=None, *args):
    """One training step given two views of the same videos."""
    self.Eiters += 1

    # zero the gradient buffers
    self.optimizer.zero_grad()

    # compute the embeddings
    videos_emb_1 = self.forward_emb(videos_1)
    videos_emb_2 = self.forward_emb(videos_2)

    # measure accuracy and record loss
    loss = self.forward_loss(videos_emb_1, videos_emb_2)
    loss_value = loss.data[0]  # use loss.item() on PyTorch >= 0.4

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()

    return videos_emb_1.size(0), loss_value
def train_emb(self, oimages, images, captions, lengths, ids=None, *args):
    """One training step given images and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    img_emb, cap_emb, oimg_emb, scores, decode_lengths, captions = self.forward_emb(
        oimages, images, captions, lengths)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss_vse = self.forward_loss(img_emb, cap_emb)
    loss_de = self.forward_decode_loss(scores, decode_lengths, captions)
    loss = loss_vse + loss_de
    self.logger.update('La', loss.data[0], captions.size(0))

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def train_emb(self, images, captions, bboxes, depends, lengths, ids=None, *args):
    """One training step given images and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    img_emb, cap_emb, cap_lens = self.forward_emb(images, captions, lengths)
    scores = self.forward_sim(img_emb, cap_emb, bboxes, depends, cap_lens)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss = self.forward_loss(scores)

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def train_emb(self, images, captions, lengths, ids=None, pre=False, *args):
    """One training step given images and captions."""
    self.Eiters += 1
    print('Eiters:{}, lr:{}'.format(self.Eiters,
                                    self.optimizer.param_groups[0]['lr']))

    # compute the embeddings
    img_emb, cap_emb = self.forward_emb(images, captions, lengths)

    # measure accuracy and record loss
    if pre:
        self.pre_optimizer.zero_grad()
        self.MI_pre_opt.zero_grad()
    else:
        self.optimizer.zero_grad()
        self.MI_opt.zero_grad()
    loss = self.forward_loss(img_emb, cap_emb)

    # compute gradient and do SGD step
    loss.backward(retain_graph=True)
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    if pre:
        self.pre_optimizer.step()
        self.MI_pre_opt.step()
    else:
        self.optimizer.step()
        self.MI_opt.step()

    return img_emb, cap_emb
def train_emb(self, videos, captions, lengths, *args):
    """One training step given videos and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    vid_emb, cap_emb = self.forward_emb(videos, captions, False)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss = self.forward_loss(cap_emb, vid_emb)
    if torch.__version__ == '0.3.1':
        loss_value = loss.data[0]
    else:
        loss_value = loss.item()

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()

    return vid_emb.size(0), loss_value
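# Several snippets in this section branch on torch.__version__ to read a
# scalar out of a 0-dim loss tensor. A minimal sketch of a shared helper
# that would cover both cases (the name `to_scalar` is hypothetical, not
# from any of the original repos):
def to_scalar(t):
    """Return the Python number held by a 0-dim tensor / old Variable."""
    if hasattr(t, 'item'):   # Tensor.item() exists on PyTorch >= 0.4
        return t.item()
    return t.data[0]         # PyTorch <= 0.3.x

# loss_value = to_scalar(loss)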
def _train_on_batch(self, batch: Tuple) -> Tuple:
    """
    Compute loss depending on settings, compute gradients and apply
    optimization step.

    Args:
        batch: batch of training data
    """
    # evaluate loss
    batch_x, batch_y, input_lengths, target_lengths = batch
    if self.custom_model_eval:
        loss, model_output = self.loss(batch, self.model)
    else:
        model_output = self.model(batch_x, input_lengths)
        loss = self.loss(model_output, batch_y)

    self.optimizer.zero_grad()  # reset gradients
    loss.backward()             # backpropagation

    # gradient clipping
    if self.clip_grads is not None:
        grads.clip_grad_norm(self.model.parameters(), self.clip_grads)

    grad_norm = self._comp_gradients()  # compute average gradient norm
    self.optimizer.step()               # apply optimization step
    return loss, model_output, grad_norm
def step(self):
    params = []
    for group in self.optimizer.param_groups:
        for p in group['params']:
            params.append(p)
    clip_grad_norm(params, self.grad_clip)
    self.optimizer.step()
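# A minimal usage sketch for a clipping-aware step() wrapper like the one
# above. The wrapper class name `ClippedOptimizer` is hypothetical; the
# pattern is: gather every parameter group, clip, then delegate to the
# wrapped optimizer.
#
#   opt = ClippedOptimizer(torch.optim.SGD(model.parameters(), lr=0.1),
#                          grad_clip=2.0)
#   loss = criterion(model(x), y)
#   opt.optimizer.zero_grad()
#   loss.backward()
#   opt.step()   # clips all parameter groups, then steps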
def train_emb(self, video_whole, video_part, captions_whole, captions_part,
              lengths_whole, lengths_part, ids=None, *args):
    """One training step given whole/part videos and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    img_emb, cap_emb = self.forward_emb(video_whole, video_part,
                                        captions_whole, captions_part,
                                        lengths_whole, lengths_part)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss = self.forward_loss(img_emb, cap_emb)

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def _train_on_batch(self, batch):
    """
    Compute loss, compute gradients and apply optimization step for the
    given batch.
    """
    # run lr scheduler
    if self.scheduler is not None:
        self.scheduler.step()

    # evaluate loss
    if self._custom_model_eval:
        # custom evaluation
        loss, model_output = self.loss(batch, self.model)
    else:
        # regular supervised learning
        batch_x, batch_y = batch
        model_output = self.model(batch_x)
        loss = self.loss(model_output, batch_y)

    self.optimizer.zero_grad()  # reset gradients
    loss.backward()             # backpropagation

    # gradient clipping
    if self._clip_grads is not None:
        Grads.clip_grad_norm(self.model.parameters(), self._clip_grads)

    grad_norm = self._comp_gradient_norm()  # compute average gradient norm
    self.optimizer.step()                   # apply optimization step
    return loss, model_output, grad_norm
def train_emb(self, images, captions, concept_labels, concept_input_embs,
              lengths, ids=None, *args):
    """One training step given images and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])
    self.logger.update('GCN_lr', self.optimizer.param_groups[4]['lr'])

    # compute the embeddings; forward_emb also takes the input w2v dict
    # for the GCN attribute predictor
    v_emb, t_emb, predict_score_v, predict_score_t = self.forward_emb(
        images, captions, concept_labels, concept_input_embs, lengths,
        self.fuse_weight)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss = self.forward_loss(v_emb, t_emb, predict_score_v,
                             predict_score_t, self.dataset_name)

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def train_emb(self, images, captions, lengths, caption_masks, images_lengths,
              images_masks, query_id, query, num_boxes, boxes, class_labels,
              *args):
    """One training step given images and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    img_emb, cap_emb, cap_lens, im_masks, GCN_img_emd, class_scores = \
        self.forward_emb(images, captions, lengths, images_masks,
                         caption_masks, boxes)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss = self.forward_loss(img_emb, cap_emb, cap_lens, im_masks,
                             query_id, class_scores, class_labels)
    self.logger.update('Le', loss.item(), img_emb.size(0))

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def train_emb(self, images, captions, target_mask, vision_mask, ids=None, *args):
    """One training step given images and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # measure accuracy and record loss
    scores = self.forward_emb(images, captions, target_mask, vision_mask)
    self.optimizer.zero_grad()
    if scores is not None:
        loss = scores.sum()
        self.logger.update('Le', loss, images.size(0))
    else:
        return

    # compute gradient (scaled by apex amp) and do SGD step
    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
        scaled_loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
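# The snippet above relies on NVIDIA Apex mixed-precision training. A
# minimal self-contained sketch of the setup it assumes; amp.initialize
# and amp.scale_loss are the actual Apex APIs, while the model, optimizer
# and loss here are placeholders:
import torch
from apex import amp

model = torch.nn.Linear(8, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

x = torch.randn(4, 8).cuda()
loss = model(x).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()  # backprop through the loss-scaled graph
optimizer.step()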
def train_emb(self, images, captions, lengths, ids, caption_labels,
              caption_masks, *args):
    """One training step given images and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    img_emb, cap_emb, GCN_img_emd = self.forward_emb(images, captions, lengths)

    # calculate the captioning and retrieval losses
    self.optimizer.zero_grad()
    caption_loss = self.calcualte_caption_loss(GCN_img_emd, caption_labels,
                                               caption_masks)
    retrieval_loss = self.forward_loss(img_emb, cap_emb)
    loss = retrieval_loss + caption_loss
    self.logger.update('Le_caption', caption_loss.data[0], img_emb.size(0))
    self.logger.update('Le', loss.data[0], img_emb.size(0))

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def after_train_iter(self, runner):
    runner.optimizer.zero_grad()
    runner.outputs['loss'].backward()
    clip_grad_norm(
        filter(lambda p: p.requires_grad, runner.model.parameters()),
        max_norm=self.max_norm,
        norm_type=self.norm_type)
    runner.optimizer.step()
def train_emb(self, images, captions, lengths, ids=None, *args):
    """One training step given images and captions."""
    # compute the embeddings
    img_emb, cap_emb = self.forward_emb(images, captions, lengths)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss = self.forward_loss(img_emb, cap_emb)

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def forward(model, data, training=True, optimizer=None):
    # infer the device from the model; the original line
    # (`use_cuda = 'cuda' in type`) referenced the builtin `type` and
    # could not run as written
    use_cuda = next(model.parameters()).is_cuda
    loss = nn.CrossEntropyLoss()
    perplexity = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()

    if training:
        model.train()
    else:
        model.eval()

    end = time.time()
    for i, (imgs, (captions, lengths)) in enumerate(data):
        data_time.update(time.time() - end)
        if use_cuda:
            imgs = imgs.cuda()
            # `async` is only valid on PyTorch <= 0.3 / Python 2;
            # newer versions use non_blocking=True
            captions = captions.cuda(async=True)

        imgs = Variable(imgs, volatile=not training)
        captions = Variable(captions, volatile=not training)
        input_captions = captions[:-1]
        target_captions = pack_padded_sequence(captions, lengths)[0]

        pred, _ = model(imgs, input_captions, lengths)
        err = loss(pred, target_captions)
        perplexity.update(math.exp(err.data[0]))

        if training:
            optimizer.zero_grad()
            err.backward()
            # grad_clip is assumed to be a module-level constant
            clip_grad_norm(model.rnn.parameters(), grad_clip)
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print_freq and epoch are assumed to be defined at module scope
        if i % print_freq == 0:
            logging.info(
                '{phase} - Epoch: [{0}][{1}/{2}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                'Perplexity {perp.val:.4f} ({perp.avg:.4f})'.format(
                    epoch, i, len(data),
                    phase='TRAINING' if training else 'EVALUATING',
                    batch_time=batch_time, data_time=data_time,
                    perp=perplexity))

    return perplexity.avg
def optimize_loop_wrapper(*args, **kwargs):
    if 'optimizers' not in kwargs:
        raise ValueError(
            "When using @optimize, must pass in list of optimizers")
    for opt in kwargs['optimizers']:
        opt.zero_grad()
    loss = f(*args, **kwargs)  # `f` is the function wrapped by the decorator
    loss.backward()
    for opt in kwargs['optimizers']:
        all_params = (p for group in opt.param_groups
                      for p in group['params'])
        clip_grad_norm(all_params, max_norm=5.0, norm_type=2)
        opt.step()
    return loss.data[0]
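# A minimal usage sketch for the @optimize decorator whose inner wrapper is
# shown above: the decorated function computes and returns a loss, and the
# decorator drives zero_grad / backward / clip / step. The training
# function below is hypothetical:
#
#   @optimize
#   def train_step(batch, model, optimizers=None):
#       x, y = batch
#       return F.cross_entropy(model(x), y)
#
#   loss_value = train_step(batch, model, optimizers=[opt])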
def train_emb(self, images, captions, lengths):
    """One training step given images and captions."""
    self.Eiters += 1

    # compute the embeddings
    img_emb, cap_emb = self.forward_emb(images, captions, lengths)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss = self.forward_loss(img_emb, cap_emb)
    print('loss', loss.item())

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def train(batch_size, data_size, in_channels, start_epoch, num_epochs,
          learning_rate, train_loader, test_loader, model, writer, use_gpu,
          model_save_format):
    num_batches = len(train_loader)
    optimizer = torch.optim.Adam(model.parameters(), betas=(0.999, 0.999999),
                                 lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)
    global_step = start_epoch * num_batches
    model.train()

    for epoch in range(start_epoch, start_epoch + num_epochs):
        scheduler.step()
        print("Learning Rate:", scheduler.get_lr()[0])
        writer.add_scalar("train/learning_rate", scheduler.get_lr()[0],
                          global_step)
        start = time.perf_counter()  # time.clock() was removed in Python 3.8

        for batch_index, (x, logo_image, y, logo_index) in enumerate(train_loader):
            x = x.type(torch.FloatTensor)
            y = y.type(torch.FloatTensor)
            logo_image = logo_image.type(torch.FloatTensor)
            global_step = batch_index + epoch * num_batches

            x = Variable(x)
            y = Variable(y)
            if torch.cuda.is_available() and use_gpu:
                x = x.cuda()
                y = y.cuda()

            optimizer.zero_grad()
            output = model(x)
            loss = model.loss(output, y)
            if (loss == loss).all():  # NaN != NaN, so this skips NaN losses
                loss.backward()
            else:
                print("broken")
            # clip_grad_norm returns the total norm before clipping
            print("Total Gradient Norm:",
                  clip_grad.clip_grad_norm(model.parameters(), 100))
            optimizer.step()

            writer.add_scalar("train/loss", loss.data[0], global_step)
            print("Epoch: {}\tBatch: {}/{}\tLoss: {:10.6f}".format(
                epoch, batch_index, num_batches, loss.data[0]))
            if cv2.waitKey(1) == 96:  # backtick aborts training
                raise KeyboardInterrupt("Interrupted")

        print("Epoch %d Took %f seconds" % (epoch, time.perf_counter() - start))
        loss, accuracy = test(test_loader, model, writer, global_step, use_gpu)
        torch.save(model, model_save_format % (epoch, loss, accuracy))
def train_emb(self, images, captions, lengths, ids=None, *args):
    """One training step given images and captions."""
    self.Eiters += 1
    self.logger.update('iterations', self.Eiters)
    self.logger.update('current learning rate',
                       self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    img_emb, cap_emb, _ = self.forward_emb(images, captions, lengths)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss = self.forward_loss(img_emb, cap_emb, None, None, None)

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def finish_episode(self):
    if self.verbose:
        print('Inside finish episode:')
    R = 0
    saved_actions = self.policy.saved_actions
    policy_losses = []
    value_losses = []
    rewards = []

    # compute discounted returns, walking backwards through the episode
    for r in self.policy.rewards[::-1]:
        R = r + self.gamma * R
        rewards.insert(0, R)
    rewards = torch.tensor(rewards).to(self.device)
    rewards = (rewards - rewards.mean()) / (rewards.std() + self.eps)
    if self.verbose:
        print('rewards:', rewards)

    for (log_prob, value), r in zip(saved_actions, rewards):
        advantage = r - value.item()
        policy_losses.append(-log_prob * advantage)
        value_losses.append(
            F.smooth_l1_loss(value,
                             torch.tensor([r]).to(self.device).unsqueeze(0)))

    self.optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    clip_grad.clip_grad_norm(self.policy.parameters(), 100)
    self.optimizer.step()

    del self.policy.rewards[:]
    del self.policy.saved_actions[:]
    return loss
def train_embed_2(self, video_feat1, video_feat2, captions, length, vids=None):
    self.batch_num += 1

    # compute the embeddings (only the second video view is used here)
    video_feats_1, cap_embed_1 = self.forward_emb(video_feat2, captions, length)

    self.optimizer.zero_grad()
    loss = self.forward_loss(video_feats_1, cap_embed_1)
    loss.backward()
    clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()

    self.batch_loss.append(loss.cpu().data.tolist())
def _train_on_batch(self, batch):
    """
    Compute loss depending on settings, compute gradients and apply
    optimization step.
    """
    # evaluate loss
    batch_x, batch_y = batch
    if self._custom_model_eval:
        loss, model_output = self.loss(batch, self.model)
    else:
        model_output = self.model(batch_x)
        loss = self.loss(model_output, batch_y)

    self.optimizer.zero_grad()  # reset gradients
    loss.backward()             # backpropagation

    # gradient clipping
    if self._clip_grads is not None:
        Grads.clip_grad_norm(self.model.parameters(), self._clip_grads)

    grad_norm = self._comp_gradients()  # compute average gradient norm
    self.optimizer.step()               # apply optimization step
    return loss, model_output, grad_norm
def train_emb(self, videos, captions, lengths, cap_ids, *args):
    """One training step given videos and captions."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings; for MSR-VTT both vid_emb and cap_emb have
    # shape (128, 2048)
    vid_emb, cap_emb = self.forward_emb(videos, captions, volatile=False)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    loss, pos_score, neg_score = self.forward_loss(cap_emb, vid_emb,
                                                   cap_ids=cap_ids)
    if torch.__version__ == '0.3.1':
        loss_value = loss.data[0]
        pos_value = pos_score.data[0]
        neg_value = neg_score.data[0]
    else:
        loss_value = loss.item()
        pos_value = pos_score.item()
        neg_value = neg_score.item()

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()

    return vid_emb.size(0), loss_value, pos_value, neg_value
def train_emb(self, train_with_audio, images, captions, audios, lengths,
              ids=None, *args):
    """One training step given images, captions and audio."""
    self.Eiters += 1
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    # compute the embeddings
    img_emb, cap_emb, aud_emb = self.forward_emb(images, captions, audios,
                                                 lengths=lengths)

    # measure accuracy and record loss
    self.optimizer.zero_grad()
    if train_with_audio:
        img_emb, aud_emb = attention(self.embed_size, True, img_emb, aud_emb)
        loss = self.forward_loss(img_emb, aud_emb)
    else:
        img_emb, cap_emb = attention(self.embed_size, False, img_emb, cap_emb)
        loss = self.forward_loss(img_emb, cap_emb)

    # compute gradient and do SGD step
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm(self.params, self.grad_clip)
    self.optimizer.step()
def _clip_grad_norm(self) -> None:
    clip_norm_params = [
        p for p in self.trainer.model.parameters()
        if p.requires_grad and p.grad is not None
    ]
    if len(clip_norm_params) == 0:
        return
    if hasattr(self._grad_clip, 'clip_norm_mode'):
        scale = self._scaler.get_scale() if self._user_scale else 1.0
        max_norm = self._grad_clip.max_grad_l2_norm * scale
        grad_norm = clip_grad.clip_grad_norm(clip_norm_params, max_norm)
    else:
        grad_norm = clip_grad.clip_grad_norm_(clip_norm_params,
                                              **self._grad_clip)
    self.trainer.log_buffer.put_scalar('grad_norm', float(grad_norm))
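# Note on the two names used above: clip_grad.clip_grad_norm (no trailing
# underscore) is the deprecated alias, while clip_grad.clip_grad_norm_ is
# the in-place version kept by current PyTorch. Both return the total norm
# computed *before* clipping. A minimal self-contained check:
import torch
from torch.nn.utils import clip_grad_norm_

layer = torch.nn.Linear(4, 4)
layer(torch.randn(2, 4)).sum().backward()
total_norm = clip_grad_norm_(layer.parameters(), max_norm=1.0)
print(float(total_norm))  # pre-clipping norm; grads now have norm <= 1.0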
def train_forwad(self, feature, label):
    self.Eiters += 1
    if torch.cuda.is_available():
        feature = feature.cuda()
        label = label.cuda()
    self.logger.update('Eit', self.Eiters)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

    out = self.forward(feature)
    loss = self.forward_loss(out, label)
    loss.backward()

    # gradient accumulation: only update every self.iter_size iterations
    if self.Eiters % self.iter_size == 0:
        if self.iter_size != 1:
            # average the accumulated gradients
            for g in self.optimizer.param_groups:
                for p in g['params']:
                    p.grad /= self.iter_size
        if self.grad_clip > 0:
            total_norm = clip_grad_norm(self.params, self.grad_clip)
            if total_norm > self.grad_clip:
                print('clipping gradient: {} with coef {}'.format(
                    total_norm, self.grad_clip / total_norm))
        self.optimizer.step()
        self.optimizer.zero_grad()
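# A minimal standalone sketch of the same gradient-accumulation pattern
# (accumulate for iter_size steps, average, clip, then update); every name
# here is local to this example:
import torch
from torch.nn.utils import clip_grad_norm_

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
iter_size = 4

for step in range(1, 101):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()  # gradients accumulate across iterations
    if step % iter_size == 0:
        for p in model.parameters():
            p.grad /= iter_size          # average accumulated gradients
        clip_grad_norm_(model.parameters(), max_norm=2.0)
        optimizer.step()
        optimizer.zero_grad()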
def _step(self, closure=None):
    """Gradient clipping aware step()."""
    clip_grad_norm(self.params, self.gclip)
    self.optim.step(closure)
def train(args, model_args, lrate):
    print("Copying the dataset to the current node's dir...")
    tmp = '/Tmp/vermavik/'
    home = '/u/vermavik/'
    dataset = args.dataset
    data_source_dir = home + 'data/' + dataset + '/'

    # set up the experiment directories
    exp_name = experiment_name(dataset=args.dataset,
                               act=args.activation,
                               meta_steps=args.meta_steps,
                               sigma=args.sigma,
                               temperature_factor=args.temperature_factor,
                               alpha1=args.alpha1,
                               alpha2=args.alpha2,
                               alpha3=args.alpha3,
                               grad_norm_max=args.grad_max_norm,
                               epochs=args.epochs,
                               job_id=args.job_id,
                               add_name=args.add_name)
    model_dir = home + 'experiments/HVWB/' + dataset + '/model/' + exp_name
    result_dir = home + 'experiments/HVWB/' + dataset + '/results/' + exp_name
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # TODO: batches_per_epoch should not be hard coded
    lrate = args.lr
    import sys
    sys.setrecursionlimit(10000000)
    args, model_args = parse_args()
    print(args)

    # load the training data
    print('loading mnist')
    train_loader, test_loader = load_mnist(
        data_aug=0, batch_size=100, test_batch_size=100, cuda=True,
        data_target_dir="/u/vermavik/DARC/mnist")
    n_colors = 1
    spatial_width = 28

    # estimate scale/shift statistics from the first batch
    # TODO: calculate the statistics on the whole dataset
    for batch_idx, (data, target) in enumerate(train_loader):
        Xbatch = data.numpy()
        scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2))
        shft = -np.mean(Xbatch * scl)
        break

    # WIDTH is assumed to be a module-level constant
    print("Width", WIDTH, spatial_width)

    model = Net(args)
    if args.cuda:
        model.cuda()
    loss_fn = nn.BCELoss()

    if args.optimizer == 'sgd':
        optimizer_encoder = optim.SGD(model.encoder_params, lr=args.lr,
                                      momentum=args.momentum, weight_decay=0)
        optimizer_transition = optim.SGD(model.transition_params, lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=0)
        optimizer_decoder = optim.SGD(model.decoder_params, lr=args.lr,
                                      momentum=args.momentum, weight_decay=0)
    elif args.optimizer == 'adam':
        optimizer_encoder = optim.Adam(model.parameters(), lr=args.lr,
                                       betas=(0.9, 0.999), eps=1e-08,
                                       weight_decay=0)
        optimizer_transition = optim.Adam(model.transition_params, lr=args.lr,
                                          betas=(0.9, 0.999), eps=1e-08,
                                          weight_decay=0)
        optimizer_decoder = optim.Adam(model.decoder_params, lr=args.lr,
                                       betas=(0.9, 0.999), eps=1e-08,
                                       weight_decay=0)

    uidx = 0
    estop = False
    bad_counter = 0
    n_samples = 0
    print('Number of steps....')
    print(args.num_steps)
    print('Number of metasteps....')
    print(args.meta_steps)
    print('Done')
    count_sample = 1

    # metrics aggregated over all steps
    train_loss = []
    train_x_loss = []
    train_log_p_reverse = []
    train_kld = []
    # metrics for each step individually
    train_loss_each_step = [[]]
    train_x_loss_each_step = [[]]
    train_log_p_reverse_each_step = [[]]
    for i in range(args.meta_steps - 1):
        train_loss_each_step.append([])
        train_x_loss_each_step.append([])
        train_log_p_reverse_each_step.append([])

    for epoch in range(args.epochs):
        print('epoch', epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
            data = data.view(-1, 1 * 28 * 28)
            t0 = time.time()
            n_samples += data.data.shape[0]

            temperature_forward = args.temperature
            meta_cost = []
            x = data
            z = None
            encode = True
            for meta_step in range(0, args.meta_steps):
                loss, x_loss, log_p_reverse, KLD, z, z_tilde, x_tilde = compute_loss(
                    x, z, model, loss_fn, temperature_forward, meta_step,
                    encode=encode)
                optimizer_encoder.zero_grad()
                optimizer_transition.zero_grad()
                optimizer_decoder.zero_grad()
                loss.backward()
                total_norm = clip_grad_norm(model.parameters(),
                                            args.grad_max_norm)
                if encode == True:
                    optimizer_encoder.step()
                optimizer_transition.step()
                optimizer_decoder.step()

                # store the metrics
                train_loss.append(loss.data[0])
                train_x_loss.append(x_loss.data[0])
                train_log_p_reverse.append(-log_p_reverse.data[0])
                if KLD is not None:
                    train_kld.append(KLD.data[0])
                # store the metrics for each step separately
                train_loss_each_step[meta_step].append(loss.data[0])
                train_x_loss_each_step[meta_step].append(x_loss.data[0])
                train_log_p_reverse_each_step[meta_step].append(
                    -log_p_reverse.data[0])

                if args.meta_steps > 1:
                    x = Variable(x_tilde.data, requires_grad=False)
                    z = Variable(z_tilde.data, requires_grad=False)
                    if args.encode_every_step == 0:
                        encode = False
                    temperature_forward *= args.temperature_factor

            if np.isnan(loss.data.cpu()[0]) or np.isinf(loss.data.cpu()[0]):
                print(loss.data)
                print('NaN detected')
                return 1.

            if batch_idx % 100 == 0:
                plot_loss(model_dir, train_loss, train_x_loss,
                          train_log_p_reverse, train_kld,
                          train_loss_each_step, train_x_loss_each_step,
                          train_log_p_reverse_each_step, args.meta_steps)
                count_sample += 1

                # visualize the forward diffusion of the current batch
                temperature = args.temperature * (args.temperature_factor**(
                    args.num_steps * args.meta_steps - 1))
                temperature_forward = args.temperature
                data_forward_diffusion = data
                for num_step in range(args.num_steps * args.meta_steps):
                    data_forward_diffusion, _, _, _, _, _, _ = forward_diffusion(
                        data_forward_diffusion, model, loss_fn,
                        temperature_forward, num_step)
                    data_ = data_forward_diffusion.view(
                        -1, 1, spatial_width, spatial_width)
                    if num_step % 2 == 1:
                        plot_images(
                            data_.data.cpu().numpy(),
                            model_dir + '/' + "batch_" + str(batch_idx) +
                            '_corrupted_' + 'epoch_' + str(epoch) +
                            '_time_step_' + str(num_step))
                    temperature_forward = temperature_forward * args.temperature_factor

                print("PLOTTING ORIGINAL IMAGE")
                temp = data.view(-1, 1, spatial_width, spatial_width)
                plot_images(
                    temp.data.cpu().numpy(),
                    model_dir + '/' + 'orig_' + 'epoch_' + str(epoch) +
                    '_batch_index_' + str(batch_idx))
                print("DONE PLOTTING ORIGINAL IMAGE")

                # sample from the model by reversing the diffusion
                if args.noise == "gaussian":
                    z_sampled = np.random.normal(
                        0.0, 1.0, size=(args.batch_size, args.nl))
                else:
                    z_sampled = np.random.binomial(
                        1, 0.5, size=(args.batch_size, args.nl))
                temperature = args.temperature * (args.temperature_factor**(
                    args.num_steps * args.meta_steps - 1))
                z = torch.from_numpy(np.asarray(z_sampled).astype('float32'))
                if args.cuda:
                    z = z.cuda()
                z = Variable(z)
                for i in range(args.num_steps * args.meta_steps):
                    z_new_to_x, z_to_x, z_new = model.sample(
                        z, temperature,
                        args.num_steps * args.meta_steps - i - 1)
                    if i % 2 == 1:
                        reverse_time(
                            scl, shft, z_new_to_x.data.cpu().numpy(),
                            model_dir + '/batch_index_' + str(batch_idx) +
                            '_inference_' + 'epoch_' + str(epoch) +
                            '_step_' + str(i))
                    if temperature != args.temperature:
                        temperature /= args.temperature_factor
                    z = z_new
def main():
    args = parser.parse_args()
    print(args)

    # for now, batch_size should match the number of gpus
    assert args.batch_size == torch.cuda.device_count()

    # create model
    model = detector(arch=args.cnn_arch,
                     base_cnn_pkl_file=args.cnn_pkl,
                     mapping_file=args.cnn_mapping,
                     output_prob=False,
                     return_rois=False,
                     return_img_features=False)
    model = model.cuda()

    # freeze part of the net
    stop_grad = ['conv1', 'bn1', 'relu', 'maxpool', 'layer1']
    model_no_grad = torch.nn.Sequential(
        *[getattr(model.model, l) for l in stop_grad])
    for param in model_no_grad.parameters():
        param.requires_grad = False

    # define optimizer
    optimizer = torch.optim.SGD(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=args.base_lr, momentum=args.momentum, weight_decay=args.wd)

    # create dataset
    train_dataset = CocoDataset(
        ann_file=args.dset_ann,
        img_dir=args.dset_path,
        proposal_file=args.dset_rois,
        mode='train',
        sample_transform=preprocess_sample(
            target_sizes=[800], sample_proposals_for_training=True))
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              shuffle=False, num_workers=args.workers,
                              collate_fn=collate_custom)

    training_stats = TrainingStats(losses=['loss_cls', 'loss_bbox'],
                                   metrics=['accuracy_cls'],
                                   solver_max_iters=args.max_iter)

    iter = args.start_iter
    print('starting training')
    while iter < args.max_iter:
        for i, batch in enumerate(train_loader):
            if args.batch_size == 1:
                batch = to_cuda_variable(batch, volatile=False)
            else:
                # when using multiple GPUs, convert to cuda later in
                # data_parallel and list_to_tensor
                batch = to_variable(batch, volatile=False)

            # update lr
            lr = get_lr_at_iter(iter)
            adjust_learning_rate(optimizer, lr)

            # start measuring time
            training_stats.IterTic()

            # forward pass
            if args.batch_size == 1:
                cls_score, bbox_pred = model(batch['image'], batch['rois'])
                list_to_tensor = lambda x: x
            else:
                # run the model distributed over gpus and concatenate the
                # outputs for the whole batch
                cls_score, bbox_pred = data_parallel(
                    model, (batch['image'], batch['rois']))
                # convert gt data from lists to concatenated tensors
                list_to_tensor = lambda x: torch.cat(
                    tuple([i.cuda() for i in x]), 0)

            cls_labels = list_to_tensor(batch['labels_int32']).long()
            bbox_targets = list_to_tensor(batch['bbox_targets'])
            bbox_inside_weights = list_to_tensor(batch['bbox_inside_weights'])
            bbox_outside_weights = list_to_tensor(batch['bbox_outside_weights'])

            # compute loss
            loss_cls = cross_entropy(cls_score, cls_labels)
            loss_bbox = smooth_L1(bbox_pred, bbox_targets,
                                  bbox_inside_weights, bbox_outside_weights)

            # compute classification accuracy (for stats reporting)
            acc = accuracy(cls_score, cls_labels)

            # get final loss
            loss = loss_cls + loss_bbox

            # update
            optimizer.zero_grad()
            loss.backward()
            # Without gradient clipping we get infs and NaNs. Caffe's SGD
            # solver appears to perform grad clipping by default:
            # https://github.com/BVLC/caffe/blob/master/src/caffe/solvers/sgd_solver.cpp
            # Matterport's Mask R-CNN required grad clipping as well
            # (see README in https://github.com/matterport/Mask_RCNN).
            # The value max_norm=35 was taken from
            # https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto
            clip_grad_norm(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           max_norm=35, norm_type=2)
            optimizer.step()

            # stats
            training_stats.IterToc()
            training_stats.UpdateIterStats(
                losses_dict={'loss_cls': loss_cls.data.cpu().numpy().item(),
                             'loss_bbox': loss_bbox.data.cpu().numpy().item()},
                metrics_dict={'accuracy_cls': acc.data.cpu().numpy().item()})
            training_stats.LogIterStats(iter, lr)

            # save checkpoint
            if (iter + 1) % args.checkpoint_period == 0:
                save_checkpoint({
                    'iter': iter,
                    'args': args,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, args.checkpoint_fn)

            if iter == args.start_iter + 20:
                # reset the iteration timer to remove outliers from the
                # first few SGD iterations
                training_stats.ResetIterTimer()

            # allow finishing in the middle of an epoch
            if iter > args.max_iter:
                break

            # advance iteration
            iter += 1
def step(self, closure=None):
    """Gradient clipping aware step()."""
    if self.gclip > 0:
        clip_grad_norm(self.params, self.gclip)
    self.optim.step(closure)
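# Nearly every snippet in this section follows the same canonical loop:
# zero_grad -> forward -> loss -> backward -> clip -> step. A minimal
# self-contained version using the current in-place clipping API (all
# names below are local to this example):
import torch
from torch.nn.utils import clip_grad_norm_

model = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
grad_clip = 2.0

x, y = torch.randn(8, 16), torch.randint(0, 4, (8,))
optimizer.zero_grad()
loss = torch.nn.functional.cross_entropy(model(x), y)
loss.backward()
if grad_clip > 0:
    clip_grad_norm_(model.parameters(), grad_clip)
optimizer.step()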