# --- Experiment setup (span NER/RE) ---------------------------------------
# Read hyperparameters from the ConfigParser-style `config` object defined
# earlier in the file; ConfigParser returns strings, hence the str handling.
batch = config.get('main', 'BATCH_SIZE_TRAIN')
lr_str = config.get('main', 'LEARNING_RATE')
attention_mask_is = config.get('CNNs', 'ATTENTION_MASK_IS')
# 0/1 flag ("only keep the largest nested entity" mode, per the path text below).
TARGET_only_LARGE_NEST_flag = strtobool(config.get('main', 'NEST'))
# Log directories encode every hyperparameter in the path so runs don't collide.
# `network_structure` and `weight` are defined earlier in the file.
writer_log_dir = '../../data/TensorboardGraph/span_NER_RE/correcteval_answerchanged2-4_TARGET_only_LARGE_NEST_flag_is_{5}/batch_size_{0}/learning_rate_{1}/network_{2}_{3}/0_logit_weight_{4}'.format(
    batch, lr_str, network_structure, attention_mask_is, weight,
    str(TARGET_only_LARGE_NEST_flag))
# Same naming scheme for the brat-visualisation output directory.
brat_log_dir = '../../brat/brat-v1.3_Crunchy_Frog/data/model_preds/span_NER_RE/correcteval_answerchanged2-4_TARGET_only_LARGE_NEST_flag_is_{5}/batch_size_{0}/learning_rate_{1}/network_{2}_{3}/0_logit_weight_{4}'.format(
    batch, lr_str, network_structure, attention_mask_is, weight,
    str(TARGET_only_LARGE_NEST_flag))
# NOTE(review): `hoge_dir` ("hoge" = Japanese placeholder name) is computed but
# not used in this chunk — presumably a debug path; confirm before removing.
hoge_dir = '../../data/TensorboardGraph/span_NER_RE/LARGE_NEST_{}_debug'.format(
    str(TARGET_only_LARGE_NEST_flag))
writer = tb.SummaryWriter(logdir=writer_log_dir)
# Fixed seeds for reproducibility (numpy and torch only; CUDA not seeded here).
np.random.seed(1)
torch.manual_seed(1)
# pdb.set_trace()
print('\nCreate Environment...\n')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('\nCreate data...')
# Pre-tokenised corpus is stored in a shelve database keyed by dataset name.
database = shelve.open(config.get('path', 'SHELVE_PATH'))
# pdb.set_trace()
vocab, REL_DIC, corpus, filename_lst = database[dataname]
database.close()
# (doc[0], indx_tokens, output_film_size1, output_film_size2, output_film_size3, output_film_size4, attention_mask, spmed, (n,doc,Entdic, Reldic))
shape=tuple(test_shape)) train_davis_dataset = SimaseDavis(train_davis_json, train_davis_memmap) train_davis_dataloader = torch.utils.data.DataLoader( train_davis_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) test_davis_dataset = SimaseDavis(test_davis_json, test_davis_memmap) test_davis_dataloader = torch.utils.data.DataLoader(test_davis_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True) tensorboard_writer = tensorboardX.SummaryWriter() model_save_path = './encoder_dense_1.pth' ''' #showing pairs itf = next(iter(davis_dataloader)) img1, img2, target, class_1, class_2 = itf img1 = img1.squeeze(dim=1) img2 = img2.squeeze(dim=1) for k in range(4): i = img1[k].numpy().transpose(2,1,0) cv2_imshow(i) j = img2[k].numpy().transpose(2,1,0) cv2_imshow(j) '''
def train(opt):
    """Train an image-captioning model described by ``opt``.

    Sets up the data loader and model (plain or joint caption+retrieval,
    depending on ``opt.use_joint``), optionally resumes from
    ``opt.start_from``, then runs the training loop: per-iteration forward/
    backward, scheduled-sampling and LR decay per epoch, optional
    self-critical training, TensorBoard summaries, periodic validation and
    checkpointing.  On RuntimeError/KeyboardInterrupt a final checkpoint is
    saved before the traceback is printed.

    Fixes over the previous revision (all were NameErrors at runtime):
      * ``best_val_score`` is now always initialised (previously only bound
        when ``opt.load_best_score == 1``).
      * ``best_val_score_vse`` is now initialised from ``infos`` (previously
        never bound, crashing the joint-model branch).
      * ``prt_str`` is now initialised before the ``+=`` accumulation.
    """
    # Deal with feature things before anything
    opt.use_fc, opt.use_att = utils.if_use_feat(opt.caption_model)
    if opt.use_box:
        opt.att_feat_size = opt.att_feat_size + 5
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    # `tb` may be None when tensorboard is unavailable; short-circuit keeps this safe.
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)
    #opt.ss_prob=0.0
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')):
            with open(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl'), 'rb') as f:
                histories = utils.pickle_load(f)
    else:
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['iterators'] = loader.iterators
        infos['split_ix'] = loader.split_ix
        infos['vocab'] = loader.get_vocab()
        infos['pix_perss'] = loader.get_personality()
    infos['opt'] = opt

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    print("current epoch: ", epoch)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # BUGFIX: both "best" scores must exist before the comparison below, even
    # when not resuming / not reloading the best score (was a NameError).
    best_val_score = None
    best_val_score_vse = infos.get('best_val_score_vse', None)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    opt.vocab = loader.get_vocab()
    opt.xpersonality = loader.get_personality()
    if opt.use_joint == 0:
        #torch.cuda.set_device(0)
        model = models.setup(opt).cuda()
    elif opt.use_joint == 1:
        model = models.JointModel(opt)
        model.cuda()
    #model=models.setup(opt)
    del opt.vocab
    if opt.start_from is not None:
        opt.model = os.path.join(opt.start_from, 'model'+'.pth')
        model.load_state_dict(torch.load(opt.model))
    dp_model = torch.nn.DataParallel(model)
    lw_model = LossWrapper(model, opt)
    dp_lw_model = torch.nn.DataParallel(lw_model)
    #dp_lw_model=LossWrapper(model, opt) # this is for no cuda
    epoch_done = True
    # Assure in training mode
    #dp_lw_model=lw_model
    dp_lw_model.train()

    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer([p for p in model.parameters() if p.requires_grad], opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer([p for p in model.parameters() if p.requires_grad], opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
    else:
        print('Optimizer param group number not matched? There must be new parameters. Reinit the optimizer.')

    def save_checkpoint(model, infos, optimizer, histories=None, append=''):
        # Save model weights, optimizer state and bookkeeping dicts under
        # opt.checkpoint_path, with an optional '-<append>' suffix.
        if len(append) > 0:
            append = '-' + append
        # if checkpoint_path doesn't exist
        if not os.path.isdir(opt.checkpoint_path):
            os.makedirs(opt.checkpoint_path)
        checkpoint_path = os.path.join(opt.checkpoint_path, 'model%s.pth' % (append))
        torch.save(model.state_dict(), checkpoint_path)
        print("model saved to {}".format(checkpoint_path))
        optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer%s.pth' % (append))
        torch.save(optimizer.state_dict(), optimizer_path)
        with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'%s.pkl' % (append)), 'wb') as f:
            utils.pickle_dump(infos, f)
        if histories:
            with open(os.path.join(opt.checkpoint_path, 'histories_'+opt.id+'%s.pkl' % (append)), 'wb') as f:
                utils.pickle_dump(histories, f)

    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate ** frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac, opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob
                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False
                # Assign retrieval loss weight
                if epoch > opt.retrieval_reward_weight_decay_start and opt.retrieval_reward_weight_decay_start >= 0:
                    frac = (epoch - opt.retrieval_reward_weight_decay_start) // opt.retrieval_reward_weight_decay_every
                    model.retrieval_reward_weight = opt.retrieval_reward_weight * (opt.retrieval_reward_weight_decay_rate ** frac)
                epoch_done = False

            start = time.time()
            # Load data from train split (0)
            data = loader.get_batch('train')
            print('Read data:', time.time() - start)
            torch.cuda.synchronize()
            start = time.time()

            with torch.autograd.set_detect_anomaly(True):
                tmp = [data['fc_feats'], data['att_feats'], data['densecap'], data['labels'], data['masks'], data['att_masks'], data['personality']]
                tmp = [_ if _ is None else _.cuda() for _ in tmp]
                fc_feats, att_feats, densecap, labels, masks, att_masks, personality = tmp
                optimizer.zero_grad()
                model_out = dp_lw_model(fc_feats, att_feats, densecap, labels, masks, att_masks, personality, data['gts'], torch.arange(0, len(data['gts'])), sc_flag)
                loss = model_out['loss'].mean()
                loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()

            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f},train_loss = {:.3f}" \
                    .format(iteration, epoch, model_out['reward'].mean(), end - start, train_loss))
            if opt.use_joint == 1:
                # BUGFIX: accumulator must start from an empty string (was unbound).
                prt_str = ""
                for k, v in model.loss().items():
                    prt_str += "{} = {:.3f} ".format(k, v)
                print(prt_str)

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
                add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
                if sc_flag:
                    add_summary_value(tb_summary_writer, 'avg_reward', model_out['reward'].mean(), iteration)
                loss_history[iteration] = train_loss if not sc_flag else model_out['reward'].mean()
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    dp_model, lw_model.crit, loader, eval_kwargs)
                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        add_summary_value(tb_summary_writer, k, v, iteration)
                val_result_history[iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    if opt.use_joint == 1:
                        current_score = lang_stats['SPICE']*100
                    elif opt.use_joint == 0:
                        current_score = lang_stats['CIDEr']  # could use SPICE
                else:
                    if opt.use_joint == 0:
                        current_score = - val_loss
                    elif opt.use_joint == 1:
                        current_score = - val_loss['loss_cap']
                if opt.use_joint == 1:
                    current_score_vse = val_loss.get(opt.vse_eval_criterion, 0)*100

                best_flag = False
                best_flag_vse = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                if opt.use_joint == 1:
                    if best_val_score_vse is None or current_score_vse > best_val_score_vse:
                        best_val_score_vse = current_score_vse
                        best_flag_vse = True
                    infos['best_val_score_vse'] = best_val_score_vse

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history

                save_checkpoint(model, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    save_checkpoint(model, infos, optimizer, append=str(iteration))
                if best_flag:
                    save_checkpoint(model, infos, optimizer, append='best')
                if best_flag_vse:
                    save_checkpoint(model, infos, optimizer, append='vse-best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        # Best-effort checkpoint on crash/interrupt, then surface the traceback.
        print('Save ckpt on exception ...')
        save_checkpoint(model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
dataset_test, word_dict = tokenize(os.path.join(args.data_dir, 'valid.txt'), \ train=False, word_dict=word_dict, char_level=args.character_level) # fetch one minibatch of data train_batch = next(minibatch_generator(dataset_train, args, shuffle=False)) test_batch = next(minibatch_generator(dataset_test, args, shuffle=False)) # load model that will be evaluated gen, loaded_epoch = load_model_from_file(args.model_path, epoch=args.model_epoch) gen.args.alpha_test = args.alpha_test gen.eval() print('switching the temperature to {}'.format(gen.args.alpha_test)) # args.model_path = 'our_GAN_stats_for_real_this_time' # Logging writer = tensorboardX.SummaryWriter(log_dir=os.path.join(args.model_path, \ 'A_TB_alpha{}'.format(gen.args.alpha_test))) writes = 0 if args.cuda: gen = gen.cuda() def save_samples_for_bleu(gen, input, word_dict, epoch, sample_size=10000): maybe_create_dir(os.path.join(args.model_path, 'samples')) file_name = os.path.join(args.model_path, 'samples/gen_for_bleu_{}_{:.4f}.txt'.format(epoch, gen.args.alpha_test)) print('saving in {}'.format(file_name)) with torch.no_grad(): with open(file_name, 'w') as f: tot_sent=0 while tot_sent < sample_size: _, fake_sentences = gen(input[:, [0]]) sentences = id_to_words(fake_sentences.cpu().data.numpy(), word_dict)
sys.exit("Only support MUNIT|UNIT") trainer.cuda() train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders( config) train_display_images_a = torch.stack( [train_loader_a.dataset[i]['data'] for i in range(display_size)]).cuda() train_display_images_b = torch.stack( [train_loader_b.dataset[i]['data'] for i in range(display_size)]).cuda() test_display_images_a = torch.stack( [test_loader_a.dataset[i]['data'] for i in range(display_size)]).cuda() test_display_images_b = torch.stack( [test_loader_b.dataset[i]['data'] for i in range(display_size)]).cuda() model_name = os.path.splitext(os.path.basename(opts.config))[0] train_writer = tensorboardX.SummaryWriter( os.path.join(opts.output_path + "/logs", model_name)) output_directory = os.path.join(opts.output_path + "/outputs", model_name) checkpoint_directory, image_directory = prepare_sub_folder(output_directory) shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml')) # training iterations = trainer.resume(checkpoint_directory, hyperparameters=config) if opts.resume else 0 while True: for it, (images_a, images_b) in enumerate(zip(train_loader_a, train_loader_b)): trainer.update_learning_rate() images_a, images_b = images_a['data'].cuda().detach( ), images_b['data'].cuda().detach() with Timer("Elapsed time in update: %f"):
def train(self):
    """Train a NICE flow with the configured score-matching objective.

    Builds CIFAR10/MNIST loaders with a deterministic 90/10 train/val split
    (seeded with 2020, RNG state restored afterwards), dequantises and
    logit-transforms inputs, optimises one of many objectives selected by
    ``self.config.training.algo``, logs to TensorBoard and text files,
    tracks the best model under three criteria (val loss, log-likelihood,
    exact score matching), checkpoints periodically, and finally evaluates
    the final and best models via ``self.evaluate_model``.
    """
    transform = transforms.Compose([
        transforms.Resize(self.config.data.image_size),
        transforms.ToTensor()
    ])
    if self.config.data.dataset == 'CIFAR10':
        dataset = CIFAR10(os.path.join(self.args.run, 'datasets', 'cifar10'), train=True, download=True, transform=transform)
        test_dataset = CIFAR10(os.path.join(self.args.run, 'datasets', 'cifar10'), train=False, download=True, transform=transform)
        num_items = len(dataset)
        indices = list(range(num_items))
        # Fixed-seed shuffle for a reproducible split; global RNG state is
        # saved and restored so training randomness is unaffected.
        random_state = np.random.get_state()
        np.random.seed(2020)
        np.random.shuffle(indices)
        np.random.set_state(random_state)
        train_indices, val_indices = indices[:int(num_items * 0.9)], indices[int(num_items * 0.9):]
        val_dataset = Subset(dataset, val_indices)
        dataset = Subset(dataset, train_indices)
    elif self.config.data.dataset == 'MNIST':
        dataset = MNIST(os.path.join(self.args.run, 'datasets', 'mnist'), train=True, download=True, transform=transform)
        num_items = len(dataset)
        indices = list(range(num_items))
        random_state = np.random.get_state()
        np.random.seed(2020)
        np.random.shuffle(indices)
        np.random.set_state(random_state)
        train_indices, val_indices = indices[:int(num_items * 0.9)], indices[int(num_items * 0.9):]
        val_dataset = Subset(dataset, val_indices)
        dataset = Subset(dataset, train_indices)
        test_dataset = MNIST(os.path.join(self.args.run, 'datasets', 'mnist'), train=False, download=True, transform=transform)
    dataloader = DataLoader(dataset, batch_size=self.config.training.batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=self.config.training.batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=self.config.training.batch_size, shuffle=True, num_workers=2)
    val_iter = iter(val_loader)
    # Flattened input dimensionality: H*W*C for square images.
    self.config.input_dim = self.config.data.image_size**2 * self.config.data.channels

    # Fresh TensorBoard / results directories for every run (old ones deleted).
    tb_path = os.path.join(self.args.run, 'tensorboard', self.args.doc)
    if os.path.exists(tb_path):
        shutil.rmtree(tb_path)
    model_path = os.path.join(self.args.run, 'results', self.args.doc)
    if os.path.exists(model_path):
        shutil.rmtree(model_path)
    os.makedirs(model_path)
    ## save txt files
    txtfiles = os.path.join('txtresults', self.args.doc)
    if not os.path.exists(txtfiles):
        os.makedirs(txtfiles)
    tb_logger = tensorboardX.SummaryWriter(log_dir=tb_path)

    flow = NICE(self.config.input_dim, self.config.model.hidden_size, self.config.model.num_layers).to(self.config.device)
    optimizer = self.get_optimizer(flow.parameters())
    # Set up test data
    noise_sigma = self.config.data.noise_sigma
    step = 1

    # Closures over `flow`: energy, various gradient estimators, and sampling.
    def energy_net(inputs):
        # Negative log-density up to a constant (flow returns the energy's sign flip).
        energy, _ = flow(inputs, inv=False)
        return -energy

    def grad_net_kingma(inputs):
        energy, _ = flow(inputs, inv=False)
        grad1, grad2 = flow.grads_backward(inv=False)
        return -grad1, -grad2

    def grad_net_UT(inputs):
        energy, _ = flow(inputs, inv=False)
        grad1, T, U = flow.grads_backward_TU(inv=False)
        grad2 = T * U / 2.
        return -grad1, -grad2

    def grad_net_S(inputs):
        energy, _ = flow(inputs, inv=False)
        grad1, S_r, S_i = flow.grads_backward_S(inv=False)
        grad2 = (S_r**2 - S_i**2)
        return -grad1, -grad2

    def sample_net(z):
        # Inverse flow then inverse logit: latent -> pixel space.
        samples, _ = flow(z, inv=True)
        samples, _ = Logit()(samples, mode='inverse')
        return samples

    # Use this to select the sigma for DSM losses
    # if self.config.training.algo == 'dsm':
    #     sigma = self.args.dsm_sigma
    #     if noise_sigma is None:
    #         sigma = select_sigma(iter(dataloader), iter(val_loader))
    #     else:
    #         sigma = select_sigma(iter(dataloader), iter(val_loader), noise_sigma=noise_sigma)

    if self.args.load_path != "":
        flow.load_state_dict(torch.load(self.args.load_path))

    # Track the best snapshot under three criteria; 'll' is maximised, others minimised.
    best_model = {"val": None, "ll": None, "esm": None}
    best_val_loss = {"val": 1e+10, "ll": -1e+10, "esm": 1e+10}
    best_val_iter = {"val": 0, "ll": 0, "esm": 0}
    time_record = []
    time_culm_record = []
    val_logp_record = []
    val_sm_record = []

    for _ in range(self.config.training.n_epochs):
        for _, (X, y) in enumerate(dataloader):
            # Uniform dequantisation, then clamp + logit transform to unbounded space.
            noises = torch.zeros_like(X)
            X = X + (noises.uniform_(0, 1) - 0.5) / 256.
            flattened_X = X.type(torch.float32).to(self.config.device).view(X.shape[0], -1)
            flattened_X.clamp_(1e-3, 1 - 1e-3)
            flattened_X, _ = Logit()(flattened_X, mode='direct')
            if noise_sigma is not None:
                flattened_X += torch.randn_like(flattened_X) * noise_sigma
            flattened_X.requires_grad_(True)
            logp = -energy_net(flattened_X)
            logp = logp.mean()
            # NOTE(review): the first three branches are `if` rather than `elif`,
            # so the later `elif` chain attaches to the 'S' check only. With
            # mutually exclusive algo strings the net effect is the same, but
            # this looks accidental — confirm before refactoring.
            if self.config.training.algo == 'kingma':
                t = time.time()
                loss = approx_backprop_score_matching(grad_net_kingma, flattened_X)
            if self.config.training.algo == 'UT':
                t = time.time()
                loss = approx_backprop_score_matching(grad_net_UT, flattened_X)
            if self.config.training.algo == 'S':
                t = time.time()
                loss = approx_backprop_score_matching(grad_net_S, flattened_X)
            elif self.config.training.algo == 'mle':
                t = time.time()
                loss = energy_net(flattened_X)
                loss = loss.mean()
            elif self.config.training.algo == 'ssm':
                t = time.time()
                loss, *_ = single_sliced_score_matching(energy_net, flattened_X, noise_type=self.config.training.noise_type)
            elif self.config.training.algo == 'ssm_vr':
                t = time.time()
                loss, *_ = sliced_VR_score_matching(energy_net, flattened_X, noise_type=self.config.training.noise_type)
            elif self.config.training.algo == 'dsm':
                t = time.time()
                loss = dsm(energy_net, flattened_X, sigma=self.args.dsm_sigma)
            elif self.config.training.algo == 'dsm_tracetrick':
                t = time.time()
                loss = dsm_tracetrick(energy_net, flattened_X, sigma=self.args.dsm_sigma)
            elif self.config.training.algo == 'dsm_tracetrick_FD':
                t = time.time()
                loss = dsm_tracetrick_FD(energy_net, flattened_X, sigma=self.args.dsm_sigma)
            elif self.config.training.algo == "exact":
                t = time.time()
                loss = exact_score_matching(energy_net, flattened_X, train=True).mean()
            elif self.config.training.algo == 'efficient_sm':
                t = time.time()
                loss = single_efficient_score_matching(energy_net, flattened_X, eps=self.args.ESM_eps, noise_type=self.config.training.noise_type)
            elif self.config.training.algo == 'efficient_sm_conjugate':
                t = time.time()
                loss = efficient_score_matching_conjugate(energy_net, flattened_X, eps=self.args.ESM_eps, noise_type=self.config.training.noise_type)
            elif self.config.training.algo == 'MLE_efficient_sm_conjugate':
                t = time.time()
                loss = MLE_efficient_score_matching_conjugate(energy_net, flattened_X, eps=self.args.ESM_eps, mle_ratio=self.args.MLE_ratio, noise_type=self.config.training.noise_type)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Per-step wall time (loss computation + optimiser step).
            t = time.time() - t
            time_record.append(t)

            if step % 100 == 0:
                # Pull one validation batch, cycling the iterator when exhausted.
                try:
                    val_X, _ = next(val_iter)
                except:
                    val_iter = iter(val_loader)
                    val_X, _ = next(val_iter)
                noises = torch.zeros_like(val_X)
                val_X = val_X + (noises.uniform_(0, 1) - 0.5) / 256.
                val_X = val_X.type(torch.float32).to(self.config.device)
                val_X.clamp_(1e-3, 1 - 1e-3)
                val_X, _ = Logit()(val_X, mode='direct')
                val_X = val_X.view(val_X.shape[0], -1)
                if noise_sigma is not None:
                    val_X += torch.randn_like(val_X) * noise_sigma
                val_logp = -energy_net(val_X)
                val_logp = val_logp.mean()
                # Same if/elif-chain caveat as the training branch above.
                if self.config.training.algo == 'kingma':
                    val_loss = approx_backprop_score_matching(grad_net_kingma, val_X)
                if self.config.training.algo == 'UT':
                    val_loss = approx_backprop_score_matching(grad_net_UT, val_X)
                if self.config.training.algo == 'S':
                    val_loss = approx_backprop_score_matching(grad_net_S, val_X)
                elif self.config.training.algo == 'ssm':
                    val_loss, *_ = single_sliced_score_matching(energy_net, val_X, noise_type=self.config.training.noise_type)
                elif self.config.training.algo == 'ssm_vr':
                    val_loss, *_ = sliced_VR_score_matching(energy_net, val_X, noise_type=self.config.training.noise_type)
                elif self.config.training.algo == 'dsm':
                    val_loss = dsm(energy_net, val_X, sigma=self.args.dsm_sigma)
                elif self.config.training.algo == 'dsm_tracetrick':
                    val_loss = dsm_tracetrick(energy_net, val_X, sigma=self.args.dsm_sigma)
                elif self.config.training.algo == 'dsm_tracetrick_FD':
                    val_loss = dsm_tracetrick_FD(energy_net, val_X, sigma=self.args.dsm_sigma)
                elif self.config.training.algo == 'mle':
                    val_loss = -val_logp
                elif self.config.training.algo == "exact":
                    val_loss = exact_score_matching(energy_net, val_X, train=False).mean()
                elif self.config.training.algo == 'efficient_sm':
                    val_loss = single_efficient_score_matching(energy_net, val_X, eps=self.args.ESM_eps)
                elif self.config.training.algo == 'efficient_sm_conjugate':
                    val_loss = efficient_score_matching_conjugate(energy_net, val_X, eps=self.args.ESM_eps)
                elif self.config.training.algo == 'MLE_efficient_sm_conjugate':
                    val_loss = MLE_efficient_score_matching_conjugate(energy_net, val_X, eps=self.args.ESM_eps, mle_ratio=self.args.MLE_ratio)
                logging.info(
                    "logp: {:.3f}, val_logp: {:.3f}, loss: {:.3f}, val_loss: {:.3f}, time per step: {:.3f} +- {:.3f} ms"
                    .format(logp.item(), val_logp.item(), loss.item(), val_loss.item(),
                            np.mean(time_record) * 1e3, np.std(time_record) * 1e3))
                tb_logger.add_scalar('logp', logp, global_step=step)
                tb_logger.add_scalar('loss', loss, global_step=step)
                tb_logger.add_scalar('val_logp', val_logp, global_step=step)
                tb_logger.add_scalar('val_loss', val_loss, global_step=step)
                # save records in txt
                val_logp_record.append(val_logp.item())
                time_culm = sum(time_record)
                time_culm_record.append(time_culm)
                np.savetxt(txtfiles + '/val_logp_record.txt', np.array(val_logp_record))
                np.savetxt(txtfiles + '/time_culm_record.txt', np.array(time_culm_record))
                if val_loss < best_val_loss['val']:
                    best_val_loss['val'] = val_loss
                    best_val_iter['val'] = step
                    best_model['val'] = copy.deepcopy(flow.state_dict())
                if val_logp > best_val_loss['ll']:
                    best_val_loss['ll'] = val_logp
                    best_val_iter['ll'] = step
                    best_model['ll'] = copy.deepcopy(flow.state_dict())

            if step % 100 == 0:
                # Sampling + exact score-matching evaluation (no gradients needed).
                with torch.no_grad():
                    z = torch.normal(torch.zeros(100, flattened_X.shape[1], device=self.config.device))
                    samples = sample_net(z)
                    samples = samples.view(100, self.config.data.channels, self.config.data.image_size, self.config.data.image_size)
                    samples = torch.clamp(samples, 0.0, 1.0)
                    image_grid = make_grid(samples, 10)
                    tb_logger.add_image('samples', image_grid, global_step=step)
                    data = X
                    data_grid = make_grid(data[:100], 10)
                    tb_logger.add_image('data', data_grid, global_step=step)
                    logging.info("Computing exact score matching....")
                    try:
                        val_X, _ = next(val_iter)
                    except:
                        val_iter = iter(val_loader)
                        val_X, _ = next(val_iter)
                    noises = torch.zeros_like(val_X)
                    val_X = val_X + (noises.uniform_(0, 1) - 0.5) / 256.
                    val_X = val_X.type(torch.float32).to(self.config.device)
                    val_X.clamp_(1e-3, 1 - 1e-3)
                    val_X, _ = Logit()(val_X, mode='direct')
                    val_X = val_X.view(val_X.shape[0], -1)
                    if noise_sigma is not None:
                        val_X += torch.randn_like(val_X) * noise_sigma
                    sm_loss = exact_score_matching(energy_net, val_X, train=False).mean()
                    if sm_loss < best_val_loss['esm']:
                        best_val_loss['esm'] = sm_loss
                        best_val_iter['esm'] = step
                        best_model['esm'] = copy.deepcopy(flow.state_dict())
                    logging.info('step: {}, exact score matching loss: {}'.format(step, sm_loss.item()))
                    tb_logger.add_scalar('exact_score_matching_loss', sm_loss, global_step=step)
                    # save records in txt
                    val_sm_record.append(sm_loss.item())
                    np.savetxt(txtfiles + '/val_smloss_record.txt', np.array(val_sm_record))

            if step % 500 == 0:
                torch.save(flow.state_dict(), os.path.join(model_path, 'nice.pth'))
            step += 1

    # Final evaluation: current weights plus the three tracked best snapshots.
    self.results = {}
    self.evaluate_model(flow.state_dict(), "final", val_loader, test_loader, model_path)
    self.evaluate_model(best_model['val'], "best_on_val", val_loader, test_loader, model_path)
    self.evaluate_model(best_model['ll'], "best_on_ll", val_loader, test_loader, model_path)
    self.evaluate_model(best_model['esm'], "best_on_esm", val_loader, test_loader, model_path)
    self.results['final']['num_iters'] = step
    self.results['best_on_val']['num_iters'] = best_val_iter['val']
    self.results['best_on_ll']['num_iters'] = best_val_iter['ll']
    self.results['best_on_esm']['num_iters'] = best_val_iter['esm']
    pickle_out = open(model_path + "/results.pkl", "wb")
    pickle.dump(self.results, pickle_out)
    pickle_out.close()
def _main():
    """Train one level of a feature-to-feature GAN generator.

    Loads an image folder, extracts multi-level features with a frozen
    classifier, and adversarially trains the generator for the level
    selected by ``args.train_block_input`` (lower-level generators are
    loaded frozen; higher ones are freshly initialised). Logs losses to
    TensorBoard, periodically saves sample grids and model checkpoints.
    """
    print_gpu_details()
    device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
    train_root = args.train_path
    image_size = 256
    cropped_image_size = 256
    print("set image folder")
    train_set = dset.ImageFolder(root=train_root, transform=transforms.Compose([
        transforms.Resize(image_size),
        transforms.CenterCrop(cropped_image_size),
        transforms.ToTensor()
    ]))
    # ImageNet normalisation applied only to the classifier's input copy.
    normalizer_clf = transforms.Compose([
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    print('set data loader')
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
    # Frozen feature extractor; NOTE(review): torch.load of a whole model
    # object (not a state_dict) — unpickles arbitrary code, trusted input only.
    classifier = torch.load(args.classifier_path)
    classifier.eval()
    classifier.to(device)
    features1_to_image_gen = Features1ToImage()
    features1_to_image_gen.load_state_dict(torch.load(os.path.join(args.features_gens_dir_path, 'features1_to_image')))
    features1_to_image_gen.eval()
    features1_to_image_gen.to(device)
    # One generator per feature level 2..4; levels below the trained block are
    # loaded from disk and frozen, the rest get fresh weights.
    features_generators = [LevelUpFeaturesGenerator(input_level_features=i) for i in range(2, 5)]
    for i, features_gen in enumerate(features_generators):
        input_level_features = i + 2
        features_gen.to(device)
        # weights init
        if input_level_features < args.train_block_input:
            features_gen.load_state_dict(torch.load(os.path.join(args.features_gens_dir_path, 'features{}_to_features{}'.format(input_level_features, input_level_features - 1))))
            features_gen.eval()
        else:
            features_gen.init_weights()
    discriminator = FeaturesDiscriminator(args.discriminator_norm, dis_type=args.gen_type, dis_level=args.train_block_input - 1)
    discriminator.to(device)
    discriminator.init_weights()
    # losses + optimizers (WGAN-style criteria, Adam with beta1=0.5 as is
    # conventional for GAN training)
    criterion_discriminator, criterion_generator = get_wgan_losses_fn()
    next_level_features_criterion = nn.L1Loss()
    criterion_features = nn.L1Loss()
    gen_optimizer = optim.Adam(features_generators[args.train_block_input - 2].parameters(), lr=args.lr, betas=(0.5, 0.999))
    discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=args.lr, betas=(0.5, 0.999))
    num_of_epochs = args.epochs
    starting_time = time.time()
    iterations = 0
    # Output layout: <outputs>/temp_results for sample grids,
    # <outputs>/models_checkpoint for weights, <outputs>/summaries for TB.
    outputs_dir = os.path.join('features_creation_models', args.model_name)
    if not os.path.isdir(outputs_dir):
        os.makedirs(outputs_dir, exist_ok=True)
    temp_results_dir = os.path.join(outputs_dir, 'temp_results')
    if not os.path.isdir(temp_results_dir):
        os.mkdir(temp_results_dir)
    models_dir = os.path.join(outputs_dir, 'models_checkpoint')
    if not os.path.isdir(models_dir):
        os.mkdir(models_dir)
    writer = tensorboardX.SummaryWriter(os.path.join(outputs_dir, 'summaries'))
    fixed_features = 0
    first_iter = True
    print("Starting Training Loop...")
    features_to_train = args.train_block_input
    for epoch in range(num_of_epochs):
        for data in train_loader:
            iterations += 1
            if iterations % 30 == 1:
                print('epoch:', epoch, ', iter', iterations, 'start, time =', time.time() - starting_time, 'seconds')
                starting_time = time.time()
            images, _ = data
            images = images.to(device)  # change to gpu tensor
            images_clf = normalizer_clf(images)
            _, features = classifier(images_clf)
            features = list(features)
            if first_iter:
                # Freeze one batch of features for consistent periodic sampling.
                first_iter = False
                fixed_features = [torch.clone(features[x]) for x in range(len(features))]
                grid = vutils.make_grid(images, padding=2, normalize=False, nrow=8)
                vutils.save_image(grid, os.path.join(temp_results_dir, 'original_images.jpg'))
            # Alternate: `discriminator_steps` discriminator updates per
            # generator update (generator trains when iterations ≡ 1 mod k+1).
            if iterations % (args.discriminator_steps + 1) != 1:
                discriminator_loss_dict = train_discriminator(features_generators[features_to_train - 2], discriminator, criterion_discriminator, discriminator_optimizer, features, features_to_train)
                for k, v in discriminator_loss_dict.items():
                    writer.add_scalar('D/%s' % k, v.data.cpu().numpy(), global_step=iterations)
                    if iterations % 30 == 1:
                        print('{}: {:.6f}'.format(k, v))
            else:
                generator_loss_dict = train_generator(features_generators[features_to_train - 2], discriminator, classifier, gen_optimizer, features, features_to_train, criterion_generator, criterion_features, next_level_features_criterion)
                for k, v in generator_loss_dict.items():
                    writer.add_scalar('G/f' + str(features_to_train) + '_%s' % k, v.data.cpu().numpy(), global_step=iterations//1 + 1)
                    if iterations % 30 == 1:
                        print('{}: {:.6f}'.format(k, v))
            # Sample/checkpoint every 2000 iters early on, then every 4000.
            # NOTE(review): relies on `and` binding tighter than `or` — i.e.
            # (early and %2000==1) or (%4000==1); confirm that is the intent.
            if iterations < 10000 and iterations % 2000 == 1 or iterations % 4000 == 1:
                for i, features_gen in enumerate(features_generators):
                    features_level = i + 2
                    if features_level == features_to_train:  # print only the given layer output (have option to generate from all layer by modifying the if)
                        torch.save(features_gen.state_dict(), models_dir + '/' + args.model_name + '_f{}_to_f{}'.format(features_level, features_level - 1))
                        # regular sampling (#batch_size different images)
                        fake_images = sample(features1_to_image_gen, features_generators, fixed_features, features_level)
                        grid = vutils.make_grid(fake_images, padding=2, normalize=True, nrow=8)
                        vutils.save_image(grid, os.path.join(temp_results_dir, 'res_iter_{}_origin_f{}.jpg'.format(iterations // 2000, features_level)))
            # Long-period numbered snapshots of every generator level.
            if iterations % 20000 == 1:
                for i, features_gen in enumerate(features_generators):
                    features_level = i + 2
                    torch.save(features_gen.state_dict(), models_dir + '/' + args.model_name + '_f{}_to_f{}_'.format(features_level, features_level - 1) + str(iterations // 20000))
# model_name = args.model or default_model_name # model_dir = utils.get_model_dir(model_name) results_dir = os.path.join(args.results_dir, args.env) model_dir = os.path.join(results_dir, 'models', 'seed' + str(args.seed)) if not os.path.exists(model_dir): try: os.makedirs(model_dir) except: pass output_performance_filename = os.path.join(results_dir, 'seed' + str(args.seed) + '.npz') # Load loggers and Tensorboard writer txt_logger = utils.get_txt_logger(model_dir) csv_file, csv_logger = utils.get_csv_logger(model_dir) tb_writer = tensorboardX.SummaryWriter(model_dir) # Log command and all script arguments txt_logger.info("{}\n".format(" ".join(sys.argv))) txt_logger.info("{}\n".format(args)) # Set seed for all randomness sources utils.seed(args.seed) # Set device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") txt_logger.info(f"Device: {device}\n")
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer,
             device, arguments, args):
    """Run the SSD training loop and return the trained model.

    Iterates once over ``data_loader`` starting from
    ``arguments["iteration"]``, stepping the LR scheduler each batch,
    logging to a text logger and (on rank 0, if enabled) tensorboardX,
    checkpointing every ``args.save_step`` iterations and evaluating every
    ``args.eval_step`` iterations.  ``arguments`` is mutated in place so
    the checkpointer persists the current iteration.
    """
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()
    model.train()
    # Only the rank-0 process writes tensorboard summaries.
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration
        # NOTE(review): scheduler stepped before optimizer.step(); presumably
        # intentional for this scheduler type — verify against the scheduler.
        scheduler.step()

        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        # Total loss is the sum over the model's individual loss terms.
        loss = sum(loss for loss in loss_dict.values())
        #print('loss dict = {}'.format(loss_dict))
        # Guard against corrupted/NaN input batches.
        assert torch.all(torch.isfinite(images.flatten()))

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            # ETA from the running average batch time.
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                ))
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss', losses_reduced,
                                          global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name),
                                              loss_item,
                                              global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)

        # Periodic evaluation (skipped on the final iteration — the final
        # checkpoint below covers it).
        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            if dist_util.get_rank() == 0 and summary_writer:
                eval_results = do_evaluation(cfg, model,
                                             distributed=args.distributed,
                                             iteration=iteration)
                print('Logging evaluation results...')
                for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset,
                                 summary_writer, iteration)
                    # For debugging
                    if 0:
                        print(
                            'writing backup accuracy to logger, eval_result = {}'
                            .format(eval_result))
                        summary_writer.add_scalar(
                            'accuracyBackup',
                            eval_result['metrics']['mAP'],
                            global_step=iteration)
                # important!!
                summary_writer.flush()
            model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
shuffle=True, batch_size=64, transform=train_transform) testloader = CIFAR10(train=False, shuffle=False, batch_size=100, transform=test_transform) model = get_model(args.model) optimizer = nn.SGD(parameters=model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=5e-4) summary_writer = tensorboardX.SummaryWriter(logdir=args.logdir) decay_lr_at = [int(args.epoch_num * i) for i in [0.25, 0.5, 0.75]] max_acc = 0. for epoch in range(args.epoch_num): if epoch in decay_lr_at: optimizer.lr *= 0.1 train_loss, train_acc = train(epoch, model, trainloader, optimizer) test_loss, test_acc = test(epoch, model, testloader) summary_writer.add_scalar('Train Loss', train_loss, epoch) summary_writer.add_scalar('Train Acc', train_acc, epoch) summary_writer.add_scalar('Test Loss', test_loss, epoch) summary_writer.add_scalar('Test Acc', test_acc, epoch)
def set_out_file(self, out_file_name: str, exp_name: str) -> None:
    """Record the output location and experiment name, and (re)open a
    tensorboardX SummaryWriter logging into ``out_file_name``.

    Args:
        out_file_name: Directory used as the tensorboard ``log_dir``.
        exp_name: Human-readable experiment name, stored for later use.
    """
    self.out_file_name = out_file_name
    self.exp_name = exp_name
    # ``out_file_name`` is already a str; the previous f"{out_file_name}"
    # wrapper was a no-op and has been dropped.
    self.writer = tensorboardX.SummaryWriter(log_dir=out_file_name)
def main():
    """Adversarial domain-adaptation training driver.

    Loads source/target datasets, builds the model with separate generator
    and classifier optimizers, optionally resumes from a checkpoint, then
    alternates supervised source training with adversarial discrepancy
    minimisation on the target set, logging to tensorboardX and saving
    per-epoch / best checkpoints.
    """
    configs = prepare()
    if configs.evaluate is not None:
        # Evaluation-only mode: run the configured eval function and exit.
        configs.evaluate.fn(configs)
        return

    import numpy as np
    import tensorboardX
    import torch
    import torch.backends.cudnn as cudnn
    from torch.utils.data import DataLoader
    from tqdm import tqdm

    ################################
    # Train / Eval Kernel Function #
    ################################

    def adjust_learning_rate(optimizer, epoch, args_lr):
        """Sets the learning rate to the initial LR decayed by half by every 5 or 10 epochs"""
        if epoch > 0:
            if epoch <= 30:
                lr = args_lr * (0.5**(epoch // 5))
            else:
                lr = args_lr * (0.5**(epoch // 10))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            # NOTE(review): `writer` is not bound in this scope at definition
            # time (it is created in the `with` block far below); this helper
            # appears unused in the visible code — verify before calling.
            writer.add_scalar('lr_dis', lr, epoch)

    # train kernel
    def train(model, source_loader, target_loader, criterion, optimizer_g,
              optimizer_cls, scheduler_g, scheduler_cls, current_step,
              writer, cons):
        """One epoch of adversarial training over paired source/target batches."""
        model.train()
        loss_total = 0
        loss_adv_total = 0
        data_total = 0
        # Infinite iterables zipped so source/target batches stay paired.
        batch_iterator = zip(loop_iterable(source_loader),
                             loop_iterable(target_loader))
        for _ in trange(len(source_loader)):
            (inputs, targets), (inputs_t, _) = next(batch_iterator)
            if isinstance(inputs, dict):
                for k, v in inputs.items():
                    batch_size = v.size(0)
                    inputs[k] = v.to(configs.device, non_blocking=True)
            else:
                batch_size = inputs.size(0)
                inputs = inputs.to(configs.device, non_blocking=True)
            if isinstance(inputs_t, dict):
                for k, v in inputs_t.items():
                    batch_size = v.size(0)
                    inputs_t[k] = v.to(configs.device, non_blocking=True)
            else:
                batch_size = inputs_t.size(0)
                inputs_t = inputs_t.to(configs.device, non_blocking=True)
            if isinstance(targets, dict):
                for k, v in targets.items():
                    targets[k] = v.to(configs.device, non_blocking=True)
            else:
                targets = targets.to(configs.device, non_blocking=True)

            # Supervised forward pass on source data.
            outputs = model(inputs)
            # Two classifier heads on the target batch (gradient-reversal
            # style adaptation via `constant`/`adaptation` flags).
            pred_t1, pred_t2 = model.module.inst_seg_net(
                {
                    'features': inputs_t['features'],
                    'one_hot_vectors': inputs_t['one_hot_vectors']
                },
                constant=cons,
                adaptation=True)
            loss_s = criterion(outputs, targets)
            # Adversarial loss: maximise classifier discrepancy (hence -1).
            loss_adv = -1 * discrepancy_loss(pred_t1, pred_t2)
            loss = loss_s + loss_adv
            loss.backward()
            optimizer_g.step()
            optimizer_cls.step()
            optimizer_g.zero_grad()
            optimizer_cls.zero_grad()
            loss_adv_total += loss_adv.item() * batch_size

            # Gen Training: extra generator-only steps to minimise discrepancy.
            for _ in range(configs.train.gen_num_train):
                pred_t1, pred_t2 = model.module.inst_seg_net(
                    {
                        'features': inputs_t['features'],
                        'one_hot_vectors': inputs_t['one_hot_vectors']
                    },
                    constant=cons,
                    adaptation=True)
                loss_adv = -1 * discrepancy_loss(pred_t1, pred_t2)
                loss_adv.backward()
                loss_adv_total += loss_adv.item() * batch_size
                optimizer_g.step()
                optimizer_g.zero_grad()

            loss_total += loss_s.item() * batch_size
            data_total += batch_size
            # Running means logged every batch at the advancing step counter.
            writer.add_scalar('loss_s/train', loss_total / data_total,
                              current_step)
            writer.add_scalar('loss_adv/train', loss_adv_total / data_total,
                              current_step)
            current_step += batch_size

        if scheduler_g is not None:
            scheduler_g.step()
        if scheduler_cls is not None:
            scheduler_cls.step()

    # evaluate kernel
    def evaluate(model, loader, split='test'):
        """Run configured meters over `loader`; returns {name: computed value}."""
        meters = {}
        for k, meter in configs.train.meters.items():
            meters[k.format(split)] = meter()
        model.eval()
        with torch.no_grad():
            for inputs, targets in tqdm(loader, desc=split, ncols=0):
                if isinstance(inputs, dict):
                    for k, v in inputs.items():
                        inputs[k] = v.to(configs.device, non_blocking=True)
                else:
                    inputs = inputs.to(configs.device, non_blocking=True)
                if isinstance(targets, dict):
                    for k, v in targets.items():
                        targets[k] = v.to(configs.device, non_blocking=True)
                else:
                    targets = targets.to(configs.device, non_blocking=True)
                outputs = model(inputs)
                for meter in meters.values():
                    meter.update(outputs, targets)
        for k, meter in meters.items():
            meters[k] = meter.compute()
        return meters

    ###########
    # Prepare #
    ###########

    if configs.device == 'cuda':
        cudnn.benchmark = True
        if configs.get('deterministic', False):
            cudnn.deterministic = True
            cudnn.benchmark = False
    if ('seed' not in configs) or (configs.seed is None):
        configs.seed = torch.initial_seed() % (2**32 - 1)
    seed = configs.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    print(configs)

    #####################################################################
    # Initialize DataLoaders, Model, Criterion, LRScheduler & Optimizer #
    #####################################################################

    print(f'\n==> loading source dataset "{configs.source_dataset}"')
    source_dataset = configs.source_dataset()
    source_loaders = {
        "train":
        DataLoader(
            source_dataset["train"],
            shuffle=True,
            batch_size=configs.train.batch_size,
            drop_last=True,
            num_workers=configs.data.num_workers,
            pin_memory=True,
            # Distinct deterministic seed per worker.
            worker_init_fn=lambda worker_id: np.random.seed(seed + worker_id))
    }

    print(f'\n==> loading target dataset "{configs.target_dataset}"')
    target_dataset = configs.target_dataset()
    target_loaders = {}
    for split in target_dataset:
        target_loaders[split] = DataLoader(
            target_dataset[split],
            shuffle=(split == 'train'),
            batch_size=configs.train.batch_size,
            drop_last=True,
            num_workers=configs.data.num_workers,
            pin_memory=True,
            worker_init_fn=lambda worker_id: np.random.seed(seed + worker_id))

    print(f'\n==> creating model "{configs.model}"')
    model = configs.model()
    if configs.device == 'cuda':
        model = torch.nn.DataParallel(model)
    model = model.to(configs.device)
    criterion = configs.train.criterion().to(configs.device)

    #params
    # Generator params exclude the offset-prediction head; classifier params
    # cover both heads plus the regression/estimation sub-nets.
    gen_params = [{
        'params': v
    } for k, v in model.module.inst_seg_net.g.named_parameters()
                  if 'pred_offset' not in k]
    cls_params = [{
        'params': model.module.inst_seg_net.c1.parameters()
    }, {
        'params': model.module.inst_seg_net.c2.parameters()
    }, {
        'params': model.module.center_reg_net.parameters()
    }, {
        'params': model.module.box_est_net.parameters()
    }]
    optimizer_g = configs.train.optimizer_g(gen_params)
    optimizer_cls = configs.train.optimizer_cls(cls_params)
    # optimizer_dis = configs.train.optimizer_dis(dis_params)

    last_epoch, best_metrics = -1, {m: None for m in configs.train.metrics}
    if os.path.exists(configs.train.checkpoint_path):
        print(f'==> loading checkpoint "{configs.train.checkpoint_path}"')
        checkpoint = torch.load(configs.train.checkpoint_path)
        print(' => loading model')
        model.load_state_dict(checkpoint.pop('model'))
        if 'optimizer_g' in checkpoint and checkpoint[
                'optimizer_g'] is not None:
            print(' => loading optimizer_g')
            optimizer_g.load_state_dict(checkpoint.pop('optimizer_g'))
        if 'optimizer_cls' in checkpoint and checkpoint[
                'optimizer_cls'] is not None:
            print(' => loading optimizer_cls')
            optimizer_cls.load_state_dict(checkpoint.pop('optimizer_cls'))
        last_epoch = checkpoint.get('epoch', last_epoch)
        meters = checkpoint.get('meters', {})
        for m in configs.train.metrics:
            best_metrics[m] = meters.get(m + '_best', best_metrics[m])
        del checkpoint

    if 'scheduler_g' in configs.train and configs.train.scheduler_g is not None:
        configs.train.scheduler_g.last_epoch = last_epoch
        print(f'==> creating scheduler "{configs.train.scheduler_g}"')
        scheduler_g = configs.train.scheduler_g(optimizer_g)
    else:
        scheduler_g = None
    if 'scheduler_c' in configs.train and configs.train.scheduler_c is not None:
        configs.train.scheduler_c.last_epoch = last_epoch
        print(f'==> creating scheduler "{configs.train.scheduler_c}"')
        scheduler_c = configs.train.scheduler_c(optimizer_cls)
    else:
        scheduler_c = None

    ############
    # Training #
    ############

    # Already fully trained: evaluate the non-train target splits and exit.
    if last_epoch >= configs.train.num_epochs:
        meters = dict()
        for split, loader in target_loaders.items():
            if split != 'train':
                meters.update(evaluate(model, loader=loader, split=split))
        for k, meter in meters.items():
            print(f'[{k}] = {meter:2f}')
        return

    with tensorboardX.SummaryWriter(configs.train.save_path) as writer:
        step_size = min(len(source_dataset['train']),
                        len(target_dataset['train']))
        for current_epoch in range(last_epoch + 1, configs.train.num_epochs):
            current_step = current_epoch * step_size
            # Adaptation constant ramps 0 -> 1 over training via a sine.
            cons = math.sin(
                (current_epoch + 1) / configs.train.num_epochs * math.pi / 2)

            writer.add_scalar('lr_g', scheduler_g.get_lr()[0], current_epoch)
            writer.add_scalar('lr_c', scheduler_c.get_lr()[0], current_epoch)

            # train
            print(
                f'\n==> training epoch {current_epoch}/{configs.train.num_epochs}'
            )
            train(model,
                  source_loader=source_loaders['train'],
                  target_loader=target_loaders['train'],
                  criterion=criterion,
                  optimizer_g=optimizer_g,
                  optimizer_cls=optimizer_cls,
                  scheduler_g=scheduler_g,
                  scheduler_cls=scheduler_c,
                  current_step=current_step,
                  writer=writer,
                  cons=cons)
            current_step += step_size

            # evaluate
            meters = dict()
            for split, loader in source_loaders.items():
                if split != 'train':
                    meters.update(evaluate(model, loader=loader, split=split))
            for k, meter in meters.items():
                print(f'Source [{k}] = {meter:2f}')
            meters = dict()
            for split, loader in target_loaders.items():
                if split != 'train':
                    meters.update(evaluate(model, loader=loader, split=split))

            # check whether it is the best
            best = {m: False for m in configs.train.metrics}
            for m in configs.train.metrics:
                if best_metrics[m] is None or best_metrics[m] < meters[m]:
                    best_metrics[m], best[m] = meters[m], True
                meters[m + '_best'] = best_metrics[m]

            # log in tensorboard
            for k, meter in meters.items():
                print(f'Target [{k}] = {meter:2f}')
                writer.add_scalar(k, meter, current_step)

            # save checkpoint
            torch.save(
                {
                    'epoch': current_epoch,
                    'model': model.state_dict(),
                    'optimizer_g': optimizer_g.state_dict(),
                    'optimizer_cls': optimizer_cls.state_dict(),
                    'meters': meters,
                    'configs': configs,
                }, configs.train.checkpoint_path)
            shutil.copyfile(
                configs.train.checkpoint_path,
                configs.train.checkpoints_path.format(current_epoch))
            for m in configs.train.metrics:
                if best[m]:
                    shutil.copyfile(configs.train.checkpoint_path,
                                    configs.train.best_checkpoint_paths[m])
            if best.get(configs.train.metric, False):
                shutil.copyfile(configs.train.checkpoint_path,
                                configs.train.best_checkpoint_path)
            print(f'[save_path] = {configs.train.save_path}')
def main():
    """Adversarial domain-adaptation training for semantic segmentation.

    Alternates: (1) segmentation loss on source (and optionally
    pseudo-labelled target) images with the discriminator frozen, (2) an
    adversarial generator loss fooling the discriminator, (3) discriminator
    updates on detached source/target score maps.  Snapshots and scalar
    logs are written under ``args.snapshot_dir``.
    """
    opt = TrainOptions()
    args = opt.initialize()

    _t = {'iter time': Timer()}

    model_name = args.source + '_to_' + args.target
    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)
        os.makedirs(os.path.join(args.snapshot_dir, 'logs'))
    opt.print_options(args)

    sourceloader, targetloader = CreateSrcDataLoader(
        args), CreateTrgDataLoader(args)
    targetloader_iter, sourceloader_iter = iter(targetloader), iter(
        sourceloader)

    model, optimizer = CreateModel(args)
    model_D, optimizer_D = CreateDiscriminator(args)

    start_iter = 0
    if args.restore_from is not None:
        # Recover iteration count from a checkpoint name like ".../src_1000.pth".
        start_iter = int(args.restore_from.rsplit('/', 1)[1].rsplit('_')[1])
    train_writer = tensorboardX.SummaryWriter(
        os.path.join(args.snapshot_dir, "logs", model_name))

    bce_loss = torch.nn.BCEWithLogitsLoss()

    cudnn.enabled = True
    cudnn.benchmark = True
    model.train()
    model.cuda()
    model_D.train()
    model_D.cuda()

    # Names of the loss locals logged each iteration (fetched via eval below).
    loss = [
        'loss_seg_src', 'loss_seg_trg', 'loss_D_trg_fake', 'loss_D_src_real',
        'loss_D_trg_real'
    ]
    _t['iter time'].tic()
    for i in range(start_iter, args.num_steps):

        model.adjust_learning_rate(args, optimizer, i)
        model_D.adjust_learning_rate(args, optimizer_D, i)

        optimizer.zero_grad()
        optimizer_D.zero_grad()

        # Phase 1: train segmentation net; discriminator frozen.
        for param in model_D.parameters():
            param.requires_grad = False

        src_img, src_lbl, _, _ = sourceloader_iter.next()
        src_img, src_lbl = Variable(src_img).cuda(), Variable(
            src_lbl.long()).cuda()
        src_seg_score = model(src_img, lbl=src_lbl)
        loss_seg_src = model.loss
        loss_seg_src.backward()

        if args.data_label_folder_target is not None:
            # Target pseudo-labels available: also supervise on target.
            trg_img, trg_lbl, _, _ = targetloader_iter.next()
            trg_img, trg_lbl = Variable(trg_img).cuda(), Variable(
                trg_lbl.long()).cuda()
            trg_seg_score = model(trg_img, lbl=trg_lbl)
            loss_seg_trg = model.loss
        else:
            trg_img, _, name = targetloader_iter.next()
            trg_img = Variable(trg_img).cuda()
            trg_seg_score = model(trg_img)
            loss_seg_trg = 0

        # Adversarial term: target predictions labelled as source (0).
        outD_trg = model_D(F.softmax(trg_seg_score), 0)
        loss_D_trg_fake = model_D.loss

        loss_trg = args.lambda_adv_target * loss_D_trg_fake + loss_seg_trg
        loss_trg.backward()

        # Phase 2: train discriminator on detached score maps.
        for param in model_D.parameters():
            param.requires_grad = True

        src_seg_score, trg_seg_score = src_seg_score.detach(
        ), trg_seg_score.detach()

        outD_src = model_D(F.softmax(src_seg_score), 0)
        loss_D_src_real = model_D.loss / 2
        loss_D_src_real.backward()

        outD_trg = model_D(F.softmax(trg_seg_score), 1)
        loss_D_trg_real = model_D.loss / 2
        loss_D_trg_real.backward()

        optimizer.step()
        optimizer_D.step()

        # Log each named loss local by evaluating its name.
        for m in loss:
            train_writer.add_scalar(m, eval(m), i + 1)

        if (i + 1) % args.save_pred_every == 0:
            print('taking snapshot ...')
            torch.save(
                model.state_dict(),
                os.path.join(args.snapshot_dir,
                             '%s_' % (args.source) + str(i + 1) + '.pth'))

        if (i + 1) % args.print_freq == 0:
            _t['iter time'].toc(average=False)
            print('[it %d][src seg loss %.4f][lr %.4f][%.2fs]' % \
                  (i + 1, loss_seg_src.data, optimizer.param_groups[0]['lr']*10000, _t['iter time'].diff))

        if i + 1 > args.num_steps_stop:
            print('finish training')
            break
        _t['iter time'].tic()
torch.nn.Linear(128, 64), torch.nn.CELU(0.1), torch.nn.Linear(64, 1)) return model nn = torchani.ANIModel([atomic() for _ in range(4)]) print(nn) if os.path.isfile(model_checkpoint): nn.load_state_dict(torch.load(model_checkpoint)) else: torch.save(nn.state_dict(), model_checkpoint) model = torch.nn.Sequential(aev_computer, nn).to(device) writer = tensorboardX.SummaryWriter(log_dir=log) training = torchani.data.BatchedANIDataset( training_path, consts.species_to_tensor, batch_size, device=device, transform=[energy_shifter.subtract_from_dataset]) print(training) validation = torchani.data.BatchedANIDataset( validation_path, consts.species_to_tensor, batch_size, device=device,
def main(_):
    """IMPALA entry point: runs either the learner or an actor process.

    The process role is chosen by ``FLAGS.job_name``.  The learner drains
    trajectories from a shared FIFO queue and trains; each actor plays
    Breakout, unrolls ``FLAGS.trajectory``-step fragments, and pushes them
    to the queue.  Both loops run forever and log to tensorboardX.
    """
    local_job_device = '/job:{}/task:{}'.format(FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'

    # One learner on port 8000; actors on consecutive ports from 8001.
    cluster = tf.train.ClusterSpec({
        'actor':
        ['localhost:{}'.format(8001 + i) for i in range(FLAGS.num_actors)],
        'learner': ['localhost:8000']
    })

    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task)

    filters = [shared_job_device, local_job_device]

    input_shape = [84, 84, 4]
    output_size = 4
    env_name = 'BreakoutDeterministic-v4'

    # Queue and model live on the shared learner device so all workers
    # address the same variables.
    with tf.device(shared_job_device):
        queue = buffer_queue.FIFOQueue(FLAGS.trajectory, input_shape,
                                       output_size, FLAGS.queue_size,
                                       FLAGS.batch_size, FLAGS.num_actors)
        learner = model.IMPALA(trajectory=FLAGS.trajectory,
                               input_shape=input_shape,
                               num_action=output_size,
                               discount_factor=FLAGS.discount_factor,
                               start_learning_rate=FLAGS.start_learning_rate,
                               end_learning_rate=FLAGS.end_learning_rate,
                               learning_frame=FLAGS.learning_frame,
                               baseline_loss_coef=FLAGS.baseline_loss_coef,
                               entropy_coef=FLAGS.entropy_coef,
                               gradient_clip_norm=FLAGS.gradient_clip_norm)

    sess = tf.Session(server.target)
    queue.set_session(sess)
    learner.set_session(sess)

    if is_learner:
        writer = tensorboardX.SummaryWriter('runs/learner')
        train_step = 0
        while True:
            size = queue.get_size()
            # Only train once enough trajectories are buffered.
            if size > 3 * FLAGS.batch_size:
                train_step += 1
                batch = queue.sample_batch()
                s = time.time()
                pi_loss, baseline_loss, entropy, learning_rate = learner.train(
                    state=np.stack(batch.state),
                    reward=np.stack(batch.reward),
                    action=np.stack(batch.action),
                    done=np.stack(batch.done),
                    behavior_policy=np.stack(batch.behavior_policy))
                writer.add_scalar('data/pi_loss', pi_loss, train_step)
                writer.add_scalar('data/baseline_loss', baseline_loss,
                                  train_step)
                writer.add_scalar('data/entropy', entropy, train_step)
                writer.add_scalar('data/learning_rate', learning_rate,
                                  train_step)
                writer.add_scalar('data/time', time.time() - s, train_step)
    else:
        trajectory_data = collections.namedtuple('trajectory_data', [
            'state', 'next_state', 'reward', 'done', 'action',
            'behavior_policy'
        ])

        env = wrappers.make_uint8_env(env_name)
        if FLAGS.task == 0:
            # Actor 0 records a video every 10th episode.
            env = gym.wrappers.Monitor(
                env,
                'save-mov',
                video_callable=lambda episode_id: episode_id % 10 == 0)
        state = env.reset()

        episode = 0
        score = 0
        episode_step = 0
        total_max_prob = 0
        lives = 5

        writer = tensorboardX.SummaryWriter('runs/actor_{}'.format(FLAGS.task))

        while True:
            unroll_data = trajectory_data([], [], [], [], [], [])

            for _ in range(FLAGS.trajectory):
                action, behavior_policy, max_prob = learner.get_policy_and_action(
                    state)

                episode_step += 1
                total_max_prob += max_prob

                next_state, reward, done, info = env.step(action)

                score += reward

                # Treat a lost life as a terminal step with -1 reward so
                # value bootstrapping does not leak across lives.
                if lives != info['ale.lives']:
                    r = -1
                    d = True
                else:
                    r = reward
                    d = False

                unroll_data.state.append(state)
                unroll_data.next_state.append(next_state)
                unroll_data.reward.append(r)
                unroll_data.done.append(d)
                unroll_data.action.append(action)
                unroll_data.behavior_policy.append(behavior_policy)

                state = next_state
                lives = info['ale.lives']

                if done:
                    print(episode, score)
                    writer.add_scalar('data/prob',
                                      total_max_prob / episode_step, episode)
                    writer.add_scalar('data/score', score, episode)
                    writer.add_scalar('data/episode_step', episode_step,
                                      episode)
                    episode += 1
                    score = 0
                    episode_step = 0
                    total_max_prob = 0
                    lives = 5
                    state = env.reset()

            queue.append_to_queue(
                task=FLAGS.task,
                unrolled_state=unroll_data.state,
                unrolled_next_state=unroll_data.next_state,
                unrolled_reward=unroll_data.reward,
                unrolled_done=unroll_data.done,
                unrolled_action=unroll_data.action,
                unrolled_behavior_policy=unroll_data.behavior_policy)
def main():
    """Train the morefusion pose-estimation model with Chainer.

    Parses CLI arguments, optionally sets up multi-node training via
    chainermn, builds the YCB-Video (+ synthetic) datasets, the model,
    the Adam optimizer and a Trainer with evaluation, snapshot and
    tensorboard-logging extensions, then runs training for
    ``--max-epoch`` epochs.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument("--multi-node", action="store_true", help="multi node")
    parser.add_argument("--out", help="output directory")
    parser.add_argument("--debug", action="store_true", help="debug mode")
    parser.add_argument("--gpu", type=int, default=0, help="gpu id")
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument(
        "--lr",
        type=float,
        default=0.0001,
        help="learning rate",
    )
    parser.add_argument(
        "--max-epoch",
        type=int,
        default=30,
        help="max epoch",
    )
    parser.add_argument(
        "--call-evaluation-before-training",
        action="store_true",
        help="call evaluation before training",
    )

    def argparse_type_class_ids(string):
        """Parse --class-ids: a named group or a comma-separated id list."""
        if string == "all":
            n_class = len(morefusion.datasets.ycb_video.class_names)
            # Index 0 is background, hence the [1:] slice.
            class_ids = np.arange(n_class)[1:].tolist()
        elif string == "asymmetric":
            class_ids = (
                morefusion.datasets.ycb_video.class_ids_asymmetric.tolist())
        elif string == "symmetric":
            class_ids = (
                morefusion.datasets.ycb_video.class_ids_symmetric.tolist())
        else:
            class_ids = [int(x) for x in string.split(",")]
        return class_ids

    parser.add_argument(
        "--class-ids",
        type=argparse_type_class_ids,
        default="all",
        help="class id (e.g., 'all', 'asymmetric', 'symmetric', '1,6,9')",
    )
    parser.add_argument(
        "--pretrained-model",
        help="pretrained model",
    )
    parser.add_argument(
        "--note",
        help="note",
    )
    parser.add_argument(
        "--pretrained-resnet18",
        action="store_true",
        help="pretrained resnet18",
    )
    parser.add_argument(
        "--centerize-pcd",
        action="store_true",
        help="centerize pcd",
    )
    parser.add_argument(
        "--resume",
        help="resume",
    )
    parser.add_argument(
        "--loss",
        choices=["add/add_s", "add->add/add_s|1"],
        default="add->add/add_s|1",
        help="loss",
    )
    args = parser.parse_args()

    chainer.global_config.debug = args.debug

    # -------------------------------------------------------------------------

    # device initialization
    if args.multi_node:
        import chainermn

        comm = chainermn.create_communicator("pure_nccl")
        device = comm.intra_rank
        n_gpu = comm.size
    else:
        device = args.gpu
        n_gpu = 1

    # Bookkeeping (timestamp, host, git hash) only on the root process.
    if not args.multi_node or comm.rank == 0:
        now = datetime.datetime.now(datetime.timezone.utc)
        args.timestamp = now.isoformat()
        args.hostname = socket.gethostname()
        args.githash = morefusion.utils.githash(__file__)

        termcolor.cprint("==> Started training", attrs={"bold": True})

    if args.out is None:
        # Root picks a timestamped log dir, then broadcasts it to all ranks.
        if not args.multi_node or comm.rank == 0:
            args.out = osp.join(here, "logs", now.strftime("%Y%m%d_%H%M%S.%f"))
        else:
            args.out = None
        if args.multi_node:
            args.out = comm.bcast_obj(args.out)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()

    # seed initialization
    random.seed(args.seed)
    np.random.seed(args.seed)
    if device >= 0:
        chainer.cuda.cupy.random.seed(args.seed)

    # dataset initialization
    data_train = None
    data_valid = None
    # Datasets are built on the root rank only; scatter distributes them.
    if not args.multi_node or comm.rank == 0:
        termcolor.cprint("==> Dataset size", attrs={"bold": True})

        data_ycb_trainreal = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "trainreal", class_ids=args.class_ids, augmentation=True)
        data_ycb_syn = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "syn", class_ids=args.class_ids, augmentation=True)
        # Subsample synthetic data to match the size of the real split.
        data_ycb_syn = morefusion.datasets.RandomSamplingDataset(
            data_ycb_syn, len(data_ycb_trainreal))
        data_my_train = morefusion.datasets.MySyntheticYCB20190916RGBDPoseEstimationDatasetReIndexed(  # NOQA
            "train", class_ids=args.class_ids, augmentation=True)
        data_train = chainer.datasets.ConcatenatedDataset(
            data_ycb_trainreal, data_ycb_syn, data_my_train)
        print(f"ycb_trainreal={len(data_ycb_trainreal)}, "
              f"ycb_syn={len(data_ycb_syn)}, my_train={len(data_my_train)}")
        del data_ycb_trainreal, data_ycb_syn, data_my_train

        data_ycb_val = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "val", class_ids=args.class_ids)
        data_my_val = morefusion.datasets.MySyntheticYCB20190916RGBDPoseEstimationDatasetReIndexed(  # NOQA
            "val", class_ids=args.class_ids)
        data_valid = chainer.datasets.ConcatenatedDataset(
            data_ycb_val,
            data_my_val,
        )
        print(f"ycb_val={len(data_ycb_val)}, my_val={len(data_my_val)}")
        del data_ycb_val, data_my_val

        data_train = chainer.datasets.TransformDataset(data_train, transform)
        data_valid = chainer.datasets.TransformDataset(data_valid, transform)

    if args.multi_node:
        data_train = chainermn.scatter_dataset(data_train,
                                               comm,
                                               shuffle=True,
                                               seed=args.seed)
        data_valid = chainermn.scatter_dataset(data_valid,
                                               comm,
                                               shuffle=False,
                                               seed=args.seed)

    args.class_names = morefusion.datasets.ycb_video.class_names.tolist()

    # The staged loss starts as plain "add"; an extension switches it to
    # "add/add_s" after the first epoch (see update_loss below).
    loss = args.loss
    if loss == "add->add/add_s|1":
        loss = "add"

    # model initialization
    model = contrib.models.Model(
        n_fg_class=len(args.class_names) - 1,
        centerize_pcd=args.centerize_pcd,
        pretrained_resnet18=args.pretrained_resnet18,
        loss=loss,
    )
    if args.pretrained_model is not None:
        chainer.serializers.load_npz(args.pretrained_model, model)
    if device >= 0:
        model.to_gpu()

    # optimizer initialization
    optimizer = chainer.optimizers.Adam(alpha=args.lr)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)

    # Freeze early ResNet blocks and all batch-norm layers.
    if args.pretrained_resnet18:
        model.resnet_extractor.init_block.disable_update()
        model.resnet_extractor.res2.disable_update()
    for link in model.links():
        if isinstance(link, chainer.links.BatchNormalization):
            link.disable_update()

    if not args.multi_node or comm.rank == 0:
        termcolor.cprint("==> Link update rules", attrs={"bold": True})
        for name, link in model.namedlinks():
            print(name, link.update_enabled)

    # iterator initialization
    # Per-process train batch size scales down with the number of GPUs.
    iter_train = chainer.iterators.MultiprocessIterator(
        data_train,
        batch_size=16 // n_gpu,
        repeat=True,
        shuffle=True,
    )
    iter_valid = chainer.iterators.MultiprocessIterator(
        data_valid,
        batch_size=16,
        repeat=False,
        shuffle=False,
    )

    updater = chainer.training.StandardUpdater(
        iterator=iter_train,
        optimizer=optimizer,
        device=device,
    )
    if not args.multi_node or comm.rank == 0:
        writer = tensorboardX.SummaryWriter(log_dir=args.out)
        writer_with_updater = morefusion.training.SummaryWriterWithUpdater(
            writer)
        writer_with_updater.setup(updater)

    # -------------------------------------------------------------------------

    trainer = chainer.training.Trainer(updater, (args.max_epoch, "epoch"),
                                       out=args.out)
    trainer.extend(E.FailOnNonNumber())

    @chainer.training.make_extension(trigger=(1, "iteration"))
    def update_loss(trainer):
        """Switch the model loss from 'add' to 'add/add_s' after epoch 1."""
        updater = trainer.updater
        optimizer = updater.get_optimizer("main")
        target = optimizer.target

        assert trainer.stop_trigger.unit == "epoch"
        if args.loss == "add->add/add_s|1":
            if updater.epoch_detail < 1:
                assert target._loss == "add"
            else:
                target._loss = "add/add_s"
        else:
            assert args.loss in ["add/add_s"]
        return

    trainer.extend(update_loss)

    log_interval = 10, "iteration"
    eval_interval = 0.25, "epoch"

    # evaluate
    evaluator = morefusion.training.extensions.PoseEstimationEvaluator(
        iterator=iter_valid,
        target=model,
        device=device,
        progress_bar=True,
    )
    if args.multi_node:
        evaluator.comm = comm
    trainer.extend(
        evaluator,
        trigger=eval_interval,
        call_before_training=args.call_evaluation_before_training,
    )

    # Reporting/snapshot extensions run on the root rank only.
    if not args.multi_node or comm.rank == 0:
        # print arguments
        msg = pprint.pformat(args.__dict__)
        msg = textwrap.indent(msg, prefix=" " * 2)
        termcolor.cprint("==> Arguments", attrs={"bold": True})
        print(f"\n{msg}\n")

        trainer.extend(
            morefusion.training.extensions.ArgsReport(args),
            call_before_training=True,
        )

        # snapshot
        trigger_best_add = chainer.training.triggers.MinValueTrigger(
            key="validation/main/add_or_add_s",
            trigger=eval_interval,
        )
        trigger_best_auc = chainer.training.triggers.MaxValueTrigger(
            key="validation/main/auc/add_or_add_s",
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot(filename="snapshot_trainer_latest.npz"),
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_latest.npz"),
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_best_add.npz"),
            trigger=trigger_best_add,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_best_auc.npz"),
            trigger=trigger_best_auc,
        )

        # log
        trainer.extend(
            morefusion.training.extensions.LogTensorboardReport(
                writer=writer,
                trigger=log_interval,
            ),
            call_before_training=True,
        )
        trainer.extend(
            E.PrintReport(
                [
                    "epoch",
                    "iteration",
                    "elapsed_time",
                    "main/loss",
                    "main/add_or_add_s",
                    "validation/main/auc/add_or_add_s",
                ],
                log_report="LogTensorboardReport",
            ),
            trigger=log_interval,
            call_before_training=True,
        )
        trainer.extend(E.ProgressBar(update_interval=1))

    # -------------------------------------------------------------------------

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def train(self):
    """Train the classifier selected by ``self.config`` and log to TensorBoard.

    Builds the dataset/loader pair named by ``config.data.dataset``
    (CIFAR10 / CIFAR100 / MNIST / CELEBA), optionally resumes from the latest
    checkpoint, then runs the loop: NLL loss on each training batch, one
    held-out batch evaluated per step, scalar logging, and periodic snapshots.

    Side effects: may download datasets under ``args.run/datasets``, deletes
    and recreates the tensorboard directory, and writes checkpoints under
    ``args.run/logs/args.doc``.

    Raises:
        ValueError: if ``config.data.dataset`` is not one of the supported names.
    """
    if 'CIFAR' in self.config.data.dataset:
        # Standard CIFAR channel statistics; random crop + flip only when
        # augmentation is enabled.
        if self.config.data.augmentation:
            transform_train = transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
            ])
        else:
            transform_train = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
            ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
        if self.config.data.dataset == 'CIFAR10':
            dataset = CIFAR10(os.path.join(self.args.run, 'datasets', 'cifar10'),
                              train=True, download=True,
                              transform=transform_train)
            test_dataset = CIFAR10(os.path.join(self.args.run, 'datasets', 'cifar10'),
                                   train=False, download=True,
                                   transform=transform_test)
        elif self.config.data.dataset == 'CIFAR100':
            dataset = CIFAR100(os.path.join(self.args.run, 'datasets', 'cifar100'),
                               train=True, download=True,
                               transform=transform_train)
            test_dataset = CIFAR100(os.path.join(self.args.run, 'datasets', 'cifar100'),
                                    train=False, download=True,
                                    transform=transform_test)
    elif self.config.data.dataset == 'MNIST':
        if self.config.data.augmentation:
            transform = transforms.Compose([
                transforms.RandomCrop(28, padding=2),
                transforms.ToTensor(),
                transforms.Normalize((0.5, ), (0.5, ))
            ])
        else:
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, ), (0.5, ))
            ])
        dataset = MNIST(os.path.join(self.args.run, 'datasets', 'mnist'),
                        train=True, download=True, transform=transform)
        test_dataset = MNIST(os.path.join(self.args.run, 'datasets', 'mnist_test'),
                             train=False, download=True, transform=transform)
    elif self.config.data.dataset == 'CELEBA':
        dataset = ImageFolder(
            root=os.path.join(self.args.run, 'datasets', 'celeba'),
            transform=transforms.Compose([
                transforms.CenterCrop(140),
                transforms.Resize(self.config.data.image_size),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ]))
        # Deterministic 70%/10% train/test index split: shuffle with a fixed
        # seed while preserving the caller's global NumPy RNG state.
        num_items = len(dataset)
        indices = list(range(num_items))
        random_state = np.random.get_state()
        np.random.seed(2019)
        np.random.shuffle(indices)
        np.random.set_state(random_state)
        train_indices, test_indices = (
            indices[:int(num_items * 0.7)],
            indices[int(num_items * 0.7):int(num_items * 0.8)])
        test_dataset = Subset(dataset, test_indices)
        dataset = Subset(dataset, train_indices)
    else:
        # FIX: an unrecognized dataset name previously fell through and
        # crashed later with a NameError on `dataset`; fail fast instead.
        raise ValueError(
            'Unknown dataset: {}'.format(self.config.data.dataset))

    dataloader = DataLoader(dataset,
                            batch_size=self.config.training.batch_size,
                            shuffle=True, num_workers=4, drop_last=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=self.config.training.batch_size,
                             shuffle=False, num_workers=4, drop_last=True)
    test_iter = iter(test_loader)

    net = Net(self.config).to(self.config.device)
    net = torch.nn.DataParallel(net)
    optimizer = self.get_optimizer(net.parameters())

    # Start TensorBoard logging from a clean directory.
    tb_path = os.path.join(self.args.run, 'tensorboard', self.args.doc)
    if os.path.exists(tb_path):
        shutil.rmtree(tb_path)
    tb_logger = tensorboardX.SummaryWriter(log_dir=tb_path)

    if self.args.resume_training:
        # Checkpoint layout: [net_state, optimizer_state, epoch, step].
        states = torch.load(os.path.join(self.args.run, 'logs', self.args.doc,
                                         'checkpoint.pth'),
                            map_location=self.config.device)
        net.load_state_dict(states[0])
        optimizer.load_state_dict(states[1])
        begin_epoch = states[2]
        step = states[3]
    else:
        step = 0
        begin_epoch = 0

    # Cosine-annealed LR over the full training run.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, self.config.training.n_epochs, eta_min=0.)

    for epoch in range(begin_epoch, self.config.training.n_epochs):
        # NOTE: stepping at the top of the epoch (before any optimizer.step())
        # is the original behavior; moving it would shift the LR schedule.
        scheduler.step()
        for batch_idx, (data, target) in enumerate(dataloader):
            net.train()
            data = data.to(device=self.config.device)
            target = target.to(device=self.config.device)

            output = net(data)
            loss = F.nll_loss(output, target)
            pred = torch.argmax(output, dim=1, keepdim=True)
            train_accuracy = float(
                pred.eq(target.data.view_as(pred)).sum()) / float(
                    target.shape[0])

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Evaluate one held-out batch per training step.
            net.eval()
            with torch.no_grad():
                try:
                    test_data, test_target = next(test_iter)
                except StopIteration:
                    # FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt and genuine errors. Only iterator
                    # exhaustion should restart the loader.
                    test_iter = iter(test_loader)
                    test_data, test_target = next(test_iter)
                test_data = test_data.to(device=self.config.device)
                test_target = test_target.to(device=self.config.device)
                test_output = net(test_data)
                test_loss = F.nll_loss(test_output, test_target)
                test_pred = torch.argmax(test_output, dim=1, keepdim=True)
                test_accuracy = float(
                    test_pred.eq(test_target.data.view_as(
                        test_pred)).sum()) / test_data.shape[0]

            tb_logger.add_scalar('training_loss', loss, global_step=step)
            tb_logger.add_scalar('training_accuracy', train_accuracy,
                                 global_step=step)
            tb_logger.add_scalar('test_loss', test_loss, global_step=step)
            tb_logger.add_scalar('test_accuracy', test_accuracy,
                                 global_step=step)

            if step % self.config.training.log_interval == 0:
                logging.info(
                    "epoch: {}, batch: {}, training_loss: {}, train_accuracy: {}, test_loss: {}, test_accuracy: {}"
                    .format(epoch, batch_idx, loss.item(), train_accuracy,
                            test_loss.item(), test_accuracy))
            step += 1

        if (epoch + 1) % self.config.training.snapshot_interval == 0:
            print(self.config.training.snapshot_interval)
            states = [
                net.state_dict(),
                optimizer.state_dict(),
                epoch + 1,
                step,
            ]
            torch.save(states,
                       os.path.join(self.args.run, 'logs', self.args.doc,
                                    'checkpoint_epoch_{}.pth'.format(epoch + 1)))
            torch.save(states,
                       os.path.join(self.args.run, 'logs', self.args.doc,
                                    'checkpoint.pth'))
actor_delay=1, save_interval=100_000, name="awac_run", render=False, save_to_disk=True, log_to_disk=True, verbosity=0, infinite_bootstrap=True, **kwargs, ): if save_to_disk or log_to_disk: save_dir = utils.make_process_dirs(name) if log_to_disk: # create tb writer, save hparams writer = tensorboardX.SummaryWriter(save_dir) writer.add_hparams(locals(), {}) ########### ## SETUP ## ########### agent.to(device) agent.train() # initialize target networks target_agent = copy.deepcopy(agent) target_agent.to(device) utils.hard_update(target_agent.critic1, agent.critic1) utils.hard_update(target_agent.critic2, agent.critic2) target_agent.train() # set up optimizers critic_optimizer = torch.optim.Adam(
def run_train(model, cfg): train_loader = DataLoader(cfg['train'], batch_size=cfg['batch'], shuffle=True, num_workers=cfg['nworker'], collate_fn=cfg['collate']) model_pth = os.path.join(cfg['model_dir'], "model.pth") writer = tensorboardX.SummaryWriter(cfg['model_dir']) cfg['writer'] = writer criterion = cfg['criterion'] optimizer = torch.optim.Adam( model.parameters(), lr=cfg['lr'], weight_decay=cfg['decay']) if cfg['scheduler']: scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, 'max', factor=cfg['factor'], patience=cfg['patience']) step = 0 for e in range(cfg['epochs']): print("----run train---", cfg['model'], e) model.train() st = time.time() cfg['step'] = e for i_batch, sample_batched in enumerate(train_loader): sgene, img, label = sample_batched inputs = torch.from_numpy(img).type(torch.cuda.FloatTensor) gt = torch.from_numpy(label).type(torch.cuda.FloatTensor) model.zero_grad() predict = model(inputs) loss = criterion(predict, gt) loss.backward() optimizer.step() writer.add_scalar("loss", loss, step) step += 1 et = time.time() writer.add_scalar("train time", et - st, e) val_loss, lab_f1_macro = run_val(model, cfg) print("val loss:", val_loss, "\tf1:", lab_f1_macro) if cfg['scheduler']: scheduler.step(lab_f1_macro) for g in optimizer.param_groups: writer.add_scalar("lr", g['lr'], e) if e == 0: start_loss = val_loss min_loss = start_loss max_f1 = 0.0 # if val_loss > 2 * min_loss: # print("early stopping at %d" % e) # break # run_test(model, cfg) if min_loss > val_loss or lab_f1_macro > max_f1: if min_loss > val_loss: min_loss = val_loss print("----save best epoch:%d, loss:%f---" % (e, val_loss)) if lab_f1_macro > max_f1: max_f1 = lab_f1_macro print("----save best epoch:%d, f1:%f---" % (e, max_f1)) torch.save(model.state_dict(), model_pth) run_test(model, cfg)
def __init__(self, path): self.global_step = 0 self.logger = tensorboardX.SummaryWriter(os.path.join(path, "log"))
print('Resuming from epoch %d at iteration %d' % (start_epoch, epoch_iter)) else: start_epoch, epoch_iter = 1, 0 if opt.debug: opt.display_freq = 1 opt.print_freq = 1 opt.niter = 1 opt.niter_decay = 0 opt.max_dataset_size = 10 data_loader = CreateFaceConDataLoader(opt) dataset = data_loader.load_data() dataset_size = len(data_loader) print('#training images = %d' % dataset_size) train_writer = tensorboardX.SummaryWriter(os.path.join('./logs', opt.name)) model = create_model(opt) visualizer = Visualizer(opt) total_steps = (start_epoch - 1) * dataset_size + epoch_iter display_delta = total_steps % opt.display_freq print_delta = total_steps % opt.print_freq save_delta = total_steps % opt.save_latest_freq for epoch in range(start_epoch, opt.niter + opt.niter_decay + 1): epoch_start_time = time.time() if epoch != start_epoch: epoch_iter = epoch_iter % dataset_size for i, data in enumerate(dataset, start=epoch_iter):
def _initialize_tensorboard(self): self.tbx = tensorboardX.SummaryWriter( self.paths['experiment_tensorboard'], flush_secs=9999)
else: device = torch.device('cpu') print('Using PyTorch version:', torch.__version__, ' Device:', device) assert (LV(torch.__version__) >= LV("1.0.0")) # TensorBoard is a tool for visualizing progress during training. Although TensorBoard was created for TensorFlow, it can also be used with PyTorch. It is easiest to use it with the tensorboardX module. try: import tensorboardX logdir = os.path.join( os.getcwd(), "logs", "gtsrb-" + datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) print('TensorBoard log directory:', logdir) os.makedirs(logdir) log = tensorboardX.SummaryWriter(logdir) except ImportError as e: log = None # ## Data # # The training dataset consists of 5535 images of traffic signs of varying size. There are 43 different types of traffic signs. # # The validation and test sets consist of 999 and 12630 images, respectively. # # ### Downloading the data datapath = os.getenv('DATADIR', '/scratch/project_2005299/data') datapath = os.path.join(datapath, 'gtsrb/train-5535') (nimages_train, nimages_validation, nimages_test) = (5535, 999, 12630)
exp = utils.ExperienceDataset() if loaded_from is not None: utils.load_checkpoint(loaded_from, dyn, pol, exp) # initialize dynamics optimizer opt1 = torch.optim.Adam(dyn.parameters(), args.dyn_lr) # initialize policy optimizer opt2 = torch.optim.Adam(pol.parameters(), args.pol_lr) if args.use_cuda and torch.cuda.is_available(): dyn = dyn.cuda() pol = pol.cuda() writer = tensorboardX.SummaryWriter( logdir=os.path.join(results_folder, "logs")) # callbacks def on_close(): writer.close() atexit.register(on_close) # initial experience data collection env.seed(args.seed) rnd = lambda x, t: env.action_space.sample() # noqa: E731 while exp.n_samples() < initial_experience: ret = utils.apply_controller( env, rnd, min(args.control_H, initial_experience - exp.n_samples() + 1),
def run():
    """GGCNN training entry point: parse args, train, validate, snapshot.

    Side effects: creates a timestamped output folder, writes the network
    architecture summary to ``arch.txt``, logs losses/IOU to TensorBoard, and
    saves model snapshots on improvement (and on epoch 0 / every 10th epoch).
    """
    args = parse_args()

    # Vis window
    if args.vis:
        cv2.namedWindow('Display', cv2.WINDOW_NORMAL)

    # Set-up output directories, named by timestamp + run description.
    dt = datetime.datetime.now().strftime('%y%m%d_%H%M')
    net_desc = '{}_{}'.format(dt, '_'.join(args.description.split()))

    save_folder = os.path.join(args.outdir, net_desc)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    tb = tensorboardX.SummaryWriter(os.path.join(args.logdir, net_desc))

    # Load Dataset
    logging.info('Loading {} Dataset...'.format(args.dataset.title()))
    Dataset = get_dataset(args.dataset)

    train_dataset = Dataset(args.dataset_path, start=0.0, end=args.split,
                            ds_rotate=args.ds_rotate,
                            random_rotate=True, random_zoom=True,
                            include_depth=args.use_depth,
                            include_rgb=args.use_rgb)
    train_data = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers
    )
    val_dataset = Dataset(args.dataset_path, start=args.split, end=1.0,
                          ds_rotate=args.ds_rotate,
                          random_rotate=True, random_zoom=True,
                          include_depth=args.use_depth,
                          include_rgb=args.use_rgb)
    val_data = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=args.num_workers
    )
    logging.info('Done')

    # Load the network
    logging.info('Loading Network...')
    input_channels = 1*args.use_depth + 3*args.use_rgb
    ggcnn = get_network(args.network)

    net = ggcnn(input_channels=input_channels)
    device = torch.device("cuda:0")
    net = net.to(device)
    optimizer = optim.Adam(net.parameters())
    logging.info('Done')

    # Print model architecture to stdout, then dump the same summary to
    # arch.txt by temporarily redirecting stdout.
    summary(net, (input_channels, 300, 300))
    # FIX: use a context manager + try/finally so the file is closed and
    # stdout restored even if summary() raises.
    with open(os.path.join(save_folder, 'arch.txt'), 'w') as f:
        sys.stdout = f
        try:
            summary(net, (input_channels, 300, 300))
        finally:
            sys.stdout = sys.__stdout__

    best_iou = 0.0
    for epoch in range(args.epochs):
        logging.info('Beginning Epoch {:02d}'.format(epoch))
        train_results = train(epoch, net, device, train_data, optimizer,
                              args.batches_per_epoch, vis=args.vis)

        # Log training losses to tensorboard
        tb.add_scalar('loss/train_loss', train_results['loss'], epoch)
        for n, l in train_results['losses'].items():
            tb.add_scalar('train_loss/' + n, l, epoch)

        # Run Validation
        logging.info('Validating...')
        test_results = validate(net, device, val_data, args.val_batches)
        logging.info('%d/%d = %f' % (
            test_results['correct'],
            test_results['correct'] + test_results['failed'],
            test_results['correct']/(test_results['correct']+test_results['failed'])))

        # Log validation results to tensorboard
        tb.add_scalar('loss/IOU',
                      test_results['correct'] /
                      (test_results['correct'] + test_results['failed']),
                      epoch)
        tb.add_scalar('loss/val_loss', test_results['loss'], epoch)
        for n, l in test_results['losses'].items():
            tb.add_scalar('val_loss/' + n, l, epoch)

        # Save best performing network (also snapshot on epoch 0 and every
        # 10th epoch for safety).
        iou = test_results['correct'] / (test_results['correct'] + test_results['failed'])
        if iou > best_iou or epoch == 0 or (epoch % 10) == 0:
            torch.save(net, os.path.join(
                save_folder, 'epoch_%02d_iou_%0.2f' % (epoch, iou)))
            torch.save(net.state_dict(), os.path.join(
                save_folder, 'epoch_%02d_iou_%0.2f_statedict.pt' % (epoch, iou)))
            # FIX: previously `best_iou = iou` ran unconditionally here, so a
            # periodic (epoch 0 / every-10th) snapshot with a *worse* IOU
            # lowered the best-so-far high-water mark. Only raise it on a
            # genuine improvement.
            if iou > best_iou:
                best_iou = iou
def train(opt):
    """Train an image-captioning model described by ``opt``.

    Alternates cross-entropy/CRF training with self-critical (RL) training
    once ``opt.self_critical_after`` epochs have elapsed; periodically logs
    scalars via ``add_summary_value`` and checkpoints model/optimizer/infos
    under ``opt.checkpoint_path``.
    """
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box:
        # Box features append 5 extra dims (coords + area) per region.
        opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    opt.pos_size = loader.pos_size

    # tb may be None when tensorboard is unavailable; `tb and ...` keeps the
    # writer None in that case.
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    # FIX: best_val_score was previously only bound when resuming with
    # load_best_score == 1; any other configuration raised a NameError at the
    # first `if best_val_score is None or ...` below.
    best_val_score = None
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    dp_model = torch.nn.DataParallel(model)

    update_lr_flag = True
    # Assure in training mode
    dp_model.train()

    crit = utils.CRFModelCriterion()
    rl_crit = utils.RewardCriterion()

    optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate (stepwise decay by epoch).
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(optimizer, opt.current_lr)
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['pos'],
            data['masks'], data['att_masks']
        ]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, pos, masks, att_masks = tmp

        optimizer.zero_grad()
        if not sc_flag:
            # Supervised phase: CRF loss; drop the BOS column from labels/masks.
            outputs, crfloss = dp_model(fc_feats, att_feats, labels,
                                        pos[:, 1:], masks[:, 1:], att_masks)
            loss = crit(crfloss, outputs, labels[:, 1:], masks[:, 1:])
        else:
            # Self-critical phase: sample, score against the baseline, and
            # optimize the policy-gradient reward criterion.
            gen_result, sample_logprobs = dp_model(fc_feats,
                                                   att_feats,
                                                   att_masks,
                                                   opt={'sample_max': 0},
                                                   mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                              att_masks, data, gen_result,
                                              opt)
            loss = rl_crit(sample_logprobs, gen_result.data,
                           torch.from_numpy(reward).float().cuda())

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag:
            print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            # FIX: removed the no-op `if True:` wrapper that used to enclose
            # the whole checkpointing section below.
            best_flag = False
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path,
                                          'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

            # Dump miscalleous informations
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()

            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['lr_history'] = lr_history
            histories['ss_prob_history'] = ss_prob_history
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'infos_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'histories_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)

            if best_flag:
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '-best.pkl'),
                        'wb') as f:
                    cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def main():
    """Render random Shepard-Metzler views, encode them with a trained GQN
    representation network, and export the embeddings (with thumbnail images
    and scene-id labels) to TensorBoard's projector."""
    # Device configuration
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.gpu_enable) else 'cpu')

    # seed
    torch.manual_seed(args.seed)
    if device == torch.device('cuda'):
        torch.cuda.manual_seed(args.seed)

    # define model: load a trained snapshot and freeze it for inference.
    model = GQN(gpu_enable=args.gpu_enable).to(device)
    model.load_state_dict(torch.load(args.snapshot_path)['state_dict'])
    model.eval()

    # define screen
    screen_size = model.image_size
    camera = gqn.three.PerspectiveCamera(eye=(3, 1, 0),
                                         center=(0, 0, 0),
                                         up=(0, 1, 0),
                                         fov_rad=math.pi / 2.0,
                                         aspect_ratio=screen_size[0] /
                                         screen_size[1],
                                         z_near=0.1,
                                         z_far=10)

    # prepare images: reusable buffers for the renderer output and the model
    # input tensors (batch of 1).
    raw_observed_images = np.zeros(screen_size + (3, ), dtype="uint32")
    observed_image = torch.from_numpy(
        np.zeros((1, 3) + screen_size, dtype="float32")).to(device)
    observed_viewpoint = torch.from_numpy(np.zeros((1, 7),
                                                   dtype="float32")).to(device)
    renderer = gqn.three.Renderer(screen_size[0], screen_size[1])

    features = []
    label_imgs = []
    label_meta = []
    with torch.no_grad():
        # 10 random scenes, 5 random viewpoints each.
        for scenenum in range(10):
            # NOTE(review): random.choice over range(7, 8) always yields 7 —
            # presumably a leftover from experimenting with block counts.
            scene, _ = gqn.environment.shepard_metzler.build_scene(
                num_blocks=random.choice([x for x in range(7, 8)]))
            renderer.set_scene(scene)
            for viewnum in range(5):
                # prepare renderer: random camera position on a radius-3 orbit.
                rad = random.uniform(0, math.pi * 2)
                rad2 = random.uniform(0, math.pi * 2)
                eye = (3.0 * math.cos(rad), 3.0 * math.sin(rad2),
                       3.0 * math.sin(rad))
                center = (0, 0, 0)
                yaw = gqn.math.yaw(eye, center)
                pitch = gqn.math.pitch(eye, center)
                camera.look_at(
                    eye=eye,
                    center=center,
                    up=(0.0, 1.0, 0.0),
                )
                renderer.render(camera, raw_observed_images)

                # [0, 255] -> [-1, 1]
                observed_image[0] = torch.from_numpy(
                    (raw_observed_images.transpose(
                        (2, 0, 1)) / 255 - 0.5) * 2.0).to(device)
                # Viewpoint encoding: (x, y, z, cos/sin yaw, cos/sin pitch).
                observed_viewpoint[0] = torch.from_numpy(
                    np.array((eye[0], eye[1], eye[2], math.cos(yaw),
                              math.sin(yaw), math.cos(pitch),
                              math.sin(pitch)),
                             dtype="float32")).to(device)

                # representation network
                tmp_r = model.compute_observation_representation(
                    torch.unsqueeze(observed_image, 0),
                    torch.unsqueeze(observed_viewpoint, 0))
                features.append(tmp_r.view(-1))
                label_imgs.append(observed_image[0].clone())
                label_meta.append(str(scenenum))

    features = torch.stack(features)
    # Thumbnails back to [0, 1] for the projector.
    label_imgs = (torch.stack(label_imgs) + 1.0) / 2.0

    # tensorboard
    writer = tbx.SummaryWriter()
    writer.add_embedding(features, metadata=label_meta, label_img=label_imgs)
    writer.close()
for try_epoch in range(args.epochs, 0, -1): if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)): resume_from_epoch = try_epoch break # Horovod: broadcast resume_from_epoch from rank 0 (which will have # checkpoints) to other ranks. resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0, name='resume_from_epoch').item() # Horovod: print logs on the first worker. verbose = 1 if hvd.rank() == 0 else 0 # Horovod: write TensorBoard logs on first worker. log_writer = tensorboardX.SummaryWriter( args.log_dir) if hvd.rank() == 0 else None kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {} train_dataset = \ datasets.ImageFolder(args.train_dir, transform=transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ])) # Horovod: use DistributedSampler to partition data among workers. Manually specify # `num_replicas=hvd.size()` and `rank=hvd.rank()`. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
def __init__(self,config:BasicConfig): self.writer = tb.SummaryWriter(logdir=config.log_root+"_tbx",comment=config.log_file)
import keyword import torch import tensorboardX as tbx import numpy as np import torchvision import PIL writer = tbx.SummaryWriter('runs/exp-1') #writer.add_embedding(torch.randn(100, 5), metadata=meta, label_img=label_img) writer.add_scalar('loss', torch.tensor([0.3]).item(), 1) writer.add_scalar('loss', torch.tensor([0.9]).item(), 2) #writer.add_embedding(torch.randn(100, 5), metadata=meta) writer.close()