def __init__(self):
    ## General parameters
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    autograd.set_detect_anomaly(True)
    torch.set_printoptions(precision=8)
def __init__(self):
    ## General parameters
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    autograd.set_detect_anomaly(True)
    torch.set_printoptions(precision=8)

    ## Set default parameters
    ## -> Can be changed during experiments
    self.scale_mode = 'rcwSM'
    self.scale_combo = 'comp_mult'
    self.rotation_type = 'qrotate'
    self.grad_bias_binding = 1.5
    self.grad_bias_rotation = 1.5
    self.grad_bias_translation = 1.5
    self.nxm = False
    self.additional_features = None
    self.nxm_enhance = 'square'  # trailing comma removed: it turned this into a one-element tuple
    self.nxm_last_line_scale = 0.1
    self.dummie_init = 0.1
    self.gestalten = False
    self.decay = False
    self.initial_angle = 0
def __init__(self):
    ## General parameters
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    autograd.set_detect_anomaly(True)
    torch.set_printoptions(precision=8)

    ## Set default parameters
    ## -> Can be changed during experiments
    self.grad_bias = 1.5
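# The snippets above flip anomaly detection on globally in __init__. For
# reference, a minimal standalone sketch (plain stock PyTorch, nothing from
# the snippets above) of both usage styles of torch.autograd.set_detect_anomaly:
# the global switch used here, and the scoped context manager.
import torch
from torch import autograd

autograd.set_detect_anomaly(True)        # global: every backward() from here on is checked
x = torch.randn(3, requires_grad=True)
x.sum().backward()                       # traced so NaN/Inf-producing ops raise with a stack trace
autograd.set_detect_anomaly(False)       # switch the (slow) checks back off

with autograd.set_detect_anomaly(True):  # scoped: checks apply only inside this block
    y = torch.randn(3, requires_grad=True)
    (y * 2).sum().backward()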
def train_network(config):
    """
    Initialise dataset and network.

    Parameters:
        config: map containing relevant parameters
    """
    dataset = KTHDataset(config['metadata_file'],
                         config['dataset_dir'],
                         use_confidence_scores=False)
    dataloader = DataLoader(dataset,
                            batch_size=config['batch_size'],
                            sampler=config['sampler'],
                            collate_fn=util.loopy_pad_collate_fn)
    model = stgcn.STGCN(config['C_in'],
                        config['gamma'],
                        config['nr_classes'],
                        edge_importance=config['edge_importance_weighting'])
    model = model.to(config['device'])  # keep the model on the same device as the data
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    autograd.set_detect_anomaly(True)  # enable anomaly detection

    # train
    for epoch in range(config['n_epochs']):
        for batch_idx, (data, label) in enumerate(dataloader):
            if batch_idx == len(dataset) // 2:
                break
            data, label = data.to(config['device']), label.to(config['device'])  # move to GPU
            # PyTorch accumulates gradients on every call to loss.backward(),
            # so zero them here to get the correct parameter update
            optimizer.zero_grad()
            output = model.forward(data.double())
            loss = criterion(output, label)
            if batch_idx == 0:
                loss.backward(retain_graph=True)
            else:
                loss.backward()  # backward pass
            optimizer.step()  # update the weights
            if epoch % 10 == 0:
                print('Epoch: ', epoch + 1, '\t loss: ', loss.item())
            if batch_idx == config['batch_size'] - 1:
                break
def main(argv=None):
    ''' Main entry point '''
    args = parse_args(argv)
    print(f'Running torch {torch.version.__version__}')

    profile_cuda_memory = args.config.cuda.profile_cuda_memory
    pin_memory = 'cuda' in args.device.type and not profile_cuda_memory
    dataloader = get_dataloader(args.config.data, args.seed_fn, pin_memory,
                                args.num_devices, shuffle=args.shuffle)
    print(dataloader.dataset.stats)

    model = args.model(args.config.model, dataloader.dataset)
    action = args.action(args.action_config, model, dataloader, args.device)
    if args.action_type == 'train' and args.action_config.early_stopping:
        args.config.data.split = 'valid'
        args.config.data.max_examples = 0
        action.validation_dataloader = get_dataloader(args.config.data,
                                                      args.seed_fn,
                                                      pin_memory,
                                                      args.num_devices,
                                                      shuffle=args.shuffle)

    if args.config.cuda.profile_cuda_memory:
        print('Profiling CUDA memory')
        memory_profiler = profile.CUDAMemoryProfiler(action.modules.values(),
                                                     filename=profile_cuda_memory)
        sys.settrace(memory_profiler)
        threading.settrace(memory_profiler)

    step = 0
    epoch = 0
    if args.restore:
        restore_modules = {
            module_name: module
            for module_name, module in action.modules.items()
            if module_name not in args.reset_parameters
        }
        epoch, step = restore(args.restore,
                              restore_modules,
                              num_checkpoints=args.average_checkpoints,
                              map_location=args.device.type,
                              strict=not args.reset_parameters)
        model.reset_named_parameters(args.reset_parameters)
        if 'step' in args.reset_parameters:
            step = 0
            epoch = 0

    args.experiment.set_step(step)
    with ExitStack() as stack:
        stack.enter_context(profiler.emit_nvtx(args.config.cuda.profile_cuda))
        stack.enter_context(set_detect_anomaly(args.detect_anomalies))
        action(epoch, args.experiment, args.verbose)
def instantiate_trainer(config):
    verbosity = config.get("verbosity", logging.INFO)
    t_logging.set_verbosity(verbosity)
    logger.setLevel(verbosity)

    # debug (see torch.autograd.detect_anomaly)
    set_detect_anomaly(bool(config.get("debug", False)))

    # model
    model_args = dict(name="ViT-B/32", jit=False, training=True, Class="CLIPDecoder")
    model_args.update(config.get("model", {}))
    model_args["Class"] = getattr(clip.model, model_args["Class"])
    logger.info(f"loading model from pre-trained CLIP {model_args}...")
    model, image_preprocess = load(**model_args)

    # data
    train_dataset, eval_dataset = get_datasets(image_preprocess=image_preprocess,
                                               **config.get("dataset", {}))

    # training
    criterion_args = config.get("criterion", {})
    # get criterion class (e.g. nn.NLLLoss) by name
    CriterionClass = getattr(nn, criterion_args.pop("Class", "NLLLoss"))
    criterion = CriterionClass(**criterion_args)
    learner_args = config.get("learner", {})
    LearnerClass = getattr(sys.modules[__name__], learner_args.pop("Class", "LanguageModel"))
    learner = LearnerClass(model, criterion)
    training_args = Seq2SeqTrainingArguments(**config.get("training", {}))
    trainer = CLIPTrainer(model=learner,
                          args=training_args,
                          data_collator=collate_batch,
                          train_dataset=train_dataset,
                          eval_dataset=eval_dataset,
                          compute_metrics=compute_metrics)

    # training callbacks
    for callback in config.get("callbacks", []):
        CallbackClass = getattr(trainer_callback, callback.pop("Class"))
        trainer.add_callback(CallbackClass(**callback))

    return trainer, training_args, config
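# A hedged example of the kind of config dict instantiate_trainer expects,
# inferred only from the keys it reads above; every value here is illustrative.
example_config = {
    "verbosity": logging.INFO,
    "debug": False,  # toggles torch.autograd.set_detect_anomaly
    "model": {"name": "ViT-B/32", "Class": "CLIPDecoder"},
    "dataset": {},
    "criterion": {"Class": "NLLLoss"},
    "learner": {"Class": "LanguageModel"},
    "training": {"output_dir": "runs/clip"},  # Seq2SeqTrainingArguments kwargs
    "callbacks": [],
}
trainer, training_args, config = instantiate_trainer(example_config)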
import torch
from torch import autograd
import matplotlib.pyplot as plt

# class imports
from BinAndPerspTaking.binding import Binder
from BinAndPerspTaking.binding_exmat import BinderExMat
from BinAndPerspTaking.perspective_taking import Perspective_Taker
from CoreLSTM.core_lstm import CORE_NET
from Data_Compiler.data_preparation import Preprocessor
from BAPTAT_evaluation import BAPTAT_evaluator

############################################################################
##########  PARAMETERS  ####################################################

## General parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
autograd.set_detect_anomaly(True)

## Define data parameters
num_frames = 500
num_input_features = 15
num_input_dimensions = 3
preprocessor = Preprocessor(num_input_features, num_input_dimensions)
evaluator = BAPTAT_evaluator(num_frames, preprocessor)
data_at_unlike_train = False  ## Note: sample needs to be changed in the future

# data paths
data_asf_path = 'Data_Compiler/S35T07.asf'
data_amc_path = 'Data_Compiler/S35T07.amc'

## Define model parameters
model_path = 'CoreLSTM/models/LSTM_23.pt'
""" Simulation optimization for finding bargaining policies (rather than bargaining updates). """ import numpy as np import matplotlib.pyplot as plt import torch as T from torch.autograd import grad, set_detect_anomaly import optim from abc import ABCMeta, abstractmethod from scipy.special import expit from scipy.stats import norm import pdb set_detect_anomaly(True) def defection_test(player_1_reward_history_, player_1_cooperative_reward_mean, cutoff): """ :param player_1_reward_history_: Player 1's observed rewards :param player_1_cooperative_reward_mean: Expected value under cooperation for player 1 :param cutoff: :return: """ player_2_test_statistic = (np.mean(player_1_reward_history_) - player_1_cooperative_reward_mean) / \ np.std(player_1_reward_history_ / len(player_1_reward_history_)) player_2_null_prob = norm.cdf(player_2_test_statistic) return player_2_null_prob < cutoff class IteratedGameLearner(metaclass=ABCMeta):
def test_model(N, Dc, Dd, Db, L, K, X_c=None, X_d=None, X_b=None):
    batch_size = int(N / 4)
    epsilon = 1e0

    # ----------- Model ------------
    gaussian = Gaussian(Dc, L, K)
    categorical = Categorical(Dd, L, K)
    bernoulli = Bernoulli(Db, L, K)
    likelihoods = [gaussian, bernoulli, categorical]
    model = Mixture_Model(N, L, likelihoods)
    optim = torch.optim.Adagrad(model.parameters(), lr=0.01)
    autograd.set_detect_anomaly(True)
    # optim = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    data_set = torch.utils.data.TensorDataset(torch.Tensor(X_c), torch.Tensor(X_d), torch.Tensor(X_b))
    # data_set = torch.utils.data.TensorDataset(torch.Tensor(X_c), torch.Tensor(X_b))
    data_loader = torch.utils.data.DataLoader(data_set, batch_size=batch_size, shuffle=False)  # shuffle=True?
    # data_loader = torch.utils.data.DataLoader(X_c, batch_size=batch_size, shuffle=False)  # shuffle=True?

    num_epochs = 100
    ll_list = []
    loss_list = []
    KL_z_list = []
    KL_s_list = []
    rik_epochs = []
    term_1_list = []
    term_2_list = []
    term_3_list = []
    past_loss = 0

    for epoch in range(num_epochs):
        loss_epoch = 0
        ll_epoch = 0
        KL_z_epoch = 0
        KL_s_epoch = 0
        term_1_epoch = 0
        term_2_epoch = 0
        term_3_epoch = 0
        # for x_batch_real, x_batch_discrete in data_loader:
        for index, x_batch in enumerate(data_loader):
            x_batch_real = x_batch[0]
            x_batch_disc = x_batch[1]
            x_batch_bin = x_batch[2]

            # ----- Variational E ----- fix θ
            optim.zero_grad()
            util.fix_model_params(likelihoods, set=False)
            util.fix_variational_params(model, set=True)
            loss, LL, KL_z, KL_s, rik, term_1, term_2, term_3 = model(
                index,
                X_c=x_batch_real.numpy(),
                X_d=x_batch_disc.numpy(),
                X_b=x_batch_bin.numpy())
            loss.backward()
            optim.step()

            # ----- Variational M ----- fix φ
            optim.zero_grad()
            util.fix_model_params(likelihoods, set=True)
            util.fix_variational_params(model, set=False)
            loss, LL, KL_z, KL_s, rik, term_1, term_2, term_3 = model(
                index,
                X_c=x_batch_real.numpy(),
                X_d=x_batch_disc.numpy(),
                X_b=x_batch_bin.numpy())
            loss.backward()
            optim.step()

            ll_epoch += LL
            KL_s_epoch += KL_s
            KL_z_epoch += KL_z
            loss_epoch += loss
            term_1_epoch += term_1
            term_2_epoch += term_2
            term_3_epoch += term_3

        # print(f"Epoch = {epoch}, Loglik = {ll_epoch}, -ELBO = {loss_epoch}")
        rik_epochs.append(rik)
        KL_z_list.append(KL_z_epoch)
        KL_s_list.append(KL_s_epoch)
        loss_list.append(loss_epoch)
        term_1_list.append(term_1_epoch)
        term_2_list.append(term_2_epoch)
        term_3_list.append(term_3_epoch)
        ll_list.append(ll_epoch)

    z_mean = model.q_z_mean
    W_c = model.gaussian.W_c
    var_c = model.gaussian.var_c
    W_b = model.bernoulli.W_d
    W_d = model.categorical.W_d
    # W_d = None
    mu_d = model.categorical.mu_d
    # mu_d = None
    mu_b = model.bernoulli.mu_d
    param = torch.nn.functional.softmax(model.q_s_param, dim=1).detach().numpy()
    # print(param)
    profiles = np.argmax(param, axis=1) + 1

    '''
    plt.figure()
    plt.plot(np.arange(num_epochs), KL_z_list)
    plt.title(f'Convergence of KL_z for K={K}')
    plt.xlabel('Epochs')
    plt.ylabel('Kullback-Leibler divergence')
    plt.savefig('KL_z_' + str(K) + '.png')

    plt.figure()
    plt.plot(np.arange(num_epochs), KL_s_list)
    plt.title(f'Convergence of KL_s for K={K}')
    plt.xlabel('Epochs')
    plt.ylabel('Kullback-Leibler divergence')
    plt.savefig('KL_s_' + str(K) + '.png')
    '''

    plt.figure()
    plt.plot(np.arange(num_epochs), term_1_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend(['Gaussian term'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('GaussianTerm_' + str(K) + '.png')

    plt.figure()
    plt.plot(np.arange(num_epochs), term_2_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend(['Bernoulli term'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('BernoulliTerm_' + str(K) + '.png')

    plt.figure()
    plt.plot(np.arange(num_epochs), term_3_list)
    plt.title(f'Convergence of ELBO terms for K={K}')
    plt.legend(['Categorical term'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('CategoricalTerm_' + str(K) + '.png')

    plt.figure()
    plt.plot(np.arange(num_epochs), ll_list)
    plt.plot(np.arange(num_epochs), loss_list)
    plt.title(f'Performance in epochs for K={K}')
    plt.legend(['Likelihood evolution', 'Loss evolution'])
    plt.xlabel('Epochs')
    plt.ylabel('Likelihood')
    plt.savefig('Convergence_' + str(K) + '.png')
    # plt.show()

    return ll_list[-1], z_mean, W_c, W_b, mu_b, mu_d, W_d, var_c, profiles
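# The E/M alternation above relies on util.fix_model_params /
# util.fix_variational_params to freeze one group of parameters while the
# other is updated. A minimal sketch of how such freezing is typically done
# with requires_grad (the helper below is illustrative, not the project's util):
def set_requires_grad(parameters, trainable):
    """Freeze (trainable=False) or unfreeze (trainable=True) a parameter group."""
    for p in parameters:
        p.requires_grad_(trainable)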
def run(self, epoch):
    args = self.args
    self.model.train()
    nbatches = self.nbatches
    interval = timer()
    percent_done = 0
    memory_gb = 0.0
    avg_losses = {}
    if self.args.nan_detection:
        autograd.set_detect_anomaly(True)
    for batch_idx, batch in enumerate(self.train_loader):
        self.batch_idx = batch_idx
        progress = epoch + batch_idx / nbatches
        logging_epoch = (batch_idx % args.log_interval == 0
                         or batch_idx == (nbatches - 1))
        self.start_of_batch_hook(progress, logging_epoch)
        if batch_idx == 0:
            logging.info("Starting batch 0")
            sys.stdout.flush()

        def batch_closure(subbatch):
            nonlocal memory_gb
            result = self.training_loss(subbatch)
            if isinstance(result, tuple):
                result, prediction, target = result
            else:
                prediction = None  # for backwards compatibility
                target = None
            if isinstance(result, torch.Tensor):
                # By default self.training_loss() returns a single tensor
                loss_dict = {'train_loss': result}
            else:
                # Perceptual loss will return a dict of losses where the main
                # loss is 'train_loss'. This is for easily logging the parts
                # composing the loss (e.g., perceptual loss + l1)
                loss_dict = result
            loss_dict, _, _, _ = self.additional_training_loss_terms(
                loss_dict, subbatch, prediction, target)
            loss = loss_dict['train_loss']
            # Memory usage is at its maximum right before backprop
            if logging_epoch and self.args.cuda:
                memory_gb = torch.cuda.memory_allocated() / 1000000000
            self.midbatch_hook(progress, logging_epoch)
            self.optimizer.zero_grad()
            self.backwards(loss)
            return loss, loss_dict

        if hasattr(self.optimizer, 'batch_step'):
            loss, loss_dict = self.optimizer.batch_step(batch, batch_closure=batch_closure)
        else:
            closure = lambda: batch_closure(batch)
            loss, loss_dict = self.optimizer.step(closure=closure)
        if args.debug:
            self.check_for_nan(loss)

        # Running average of all losses returned
        for name in loss_dict:
            loss_gpu = loss_dict[name]
            loss_cpu = loss_gpu.cpu().item()
            loss_dict[name] = loss_cpu
            if batch_idx == 0:
                avg_losses[name] = loss_cpu
            elif batch_idx < 50:
                avg_losses[name] = (batch_idx * avg_losses[name] + loss_cpu) / (batch_idx + 1)
            else:
                avg_losses[name] = 0.99 * avg_losses[name] + 0.01 * loss_cpu

        losses = {}
        for name in loss_dict:
            losses['instantaneous_' + name] = loss_dict[name]
            losses['average_' + name] = avg_losses[name]
        self.runinfo['train_fnames'].append(batch['fname'])
        self.training_loss_hook(progress, losses, logging_epoch)
        del losses

        if logging_epoch:
            mid = timer()
            new_percent_done = 100. * batch_idx / nbatches
            percent_change = new_percent_done - percent_done
            percent_done = new_percent_done
            if percent_done > 0:
                inst_estimate = math.ceil((mid - interval) / (percent_change / 100))
                inst_estimate = str(datetime.timedelta(seconds=inst_estimate))
            else:
                inst_estimate = "unknown"
            logging.info(
                'Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, inst: {} Mem: {:2.1f}gb'
                .format(epoch, batch_idx, nbatches, 100. * batch_idx / nbatches,
                        loss.item(), inst_estimate, memory_gb))
            interval = mid
        if self.args.break_early is not None and percent_done >= self.args.break_early:
            break
        if self.args.debug_epoch:
            break
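# The loss bookkeeping above warms up with a plain cumulative mean for the
# first 50 batches, then switches to an exponential moving average. A minimal
# self-contained sketch of that scheme (names are illustrative, not taken from
# the snippet above):
def update_running_loss(avg, new_value, batch_idx, warmup=50, decay=0.99):
    """Cumulative mean during warmup, then EMA with the given decay."""
    if batch_idx == 0:
        return new_value
    if batch_idx < warmup:
        return (batch_idx * avg + new_value) / (batch_idx + 1)
    return decay * avg + (1 - decay) * new_value

avg = 0.0
for i, v in enumerate([1.0, 2.0, 3.0, 4.0]):
    avg = update_running_loss(avg, v, i)
print(avg)  # 2.5: cumulative mean of the four values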
def run(dataset, save_folder, load_ckpt_sign=False):
    # Chinese fashion topics used for qualitative tests:
    # 'wide-leg pants', 'loose-fitting', 'lines'
    test_topics = ['阔腿裤', '宽松', '线条']
    dataset.build_vocab()
    if torch.cuda.is_available():
        device_num = 0
        deviceName = f"cuda:{device_num}"  # no space: "cuda: 0" is not a valid device string
        torch.cuda.set_device(device_num)
        print(f'Current device: {torch.cuda.current_device()}')
    else:
        deviceName = "cpu"
    device = torch.device(deviceName)

    vocab = torch.load(f"{save_folder}/vocab.pkl")
    word_vec = torch.load(f"{save_folder}/word_vec.pkl")
    model = MTALSTM(hidden_dim=config.hidden_dim,
                    embed_dim=config.embedding_dim,
                    num_keywords=config.num_keywords,
                    num_layers=config.num_layers,
                    num_labels=len(vocab),
                    weight=word_vec,
                    vocab_size=len(vocab),
                    bidirectional=config.bidirectional)
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    loss_function = nn.NLLLoss()

    if load_ckpt_sign:
        loss_values, epoch_values, bleu_values = load_ckpt_train(
            50, save_folder, model, device, optimizer)
    else:
        loss_values = []
        epoch_values = []
        bleu_values = []

    since = time.time()
    autograd.set_detect_anomaly(False)
    prev_epoch = 0 if not epoch_values else epoch_values[-1]
    best_bleu = 0 if not bleu_values else max(bleu_values)
    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = ReduceLROnPlateau(optimizer,
                                     mode='min',
                                     factor=0.4,
                                     patience=2,
                                     min_lr=1e-7,
                                     verbose=True)
    if config.use_gpu:
        model = model.to('cuda')
        print("Dump to cuda")
    model.apply(params_init_uniform)

    corpus_indice, topics_indice, corpus_test, topics_test, w2i, i2w = dataset.extract_sents(vocab)
    length = list(map(lambda x: len(x), corpus_indice))

    for epoch in range(config.num_epoch - prev_epoch):
        epoch += prev_epoch
        start = time.time()
        num, total_loss = 0, 0
        topics_indice, corpus_indice = dataset.shuffleData(
            topics_indice, corpus_indice)  # shuffle data at every epoch
        data = dataset.data_iterator(corpus_indice, topics_indice,
                                     config.batch_size, max(length) + 1)
        hidden = model.init_hidden(batch_size=config.batch_size)
        weight = torch.ones(len(vocab))
        weight[0] = 0
        num_iter = len(corpus_indice) // config.batch_size
        for X, Y, mask, topics in tqdm(data, total=num_iter):
            num += 1
            if config.use_gpu:
                X = X.to(device)
                Y = Y.to(device)
                mask = mask.to(device)
                topics = topics.to(device)
                loss_function = loss_function.to(device)
                weight = weight.to(device)
            optimizer.zero_grad()
            coverage_vector = model.init_coverage_vector(config.batch_size,
                                                         config.num_keywords)
            init_output = torch.zeros(config.batch_size, config.hidden_dim).to(device)
            output, _, hidden, _, _ = model(inputs=X,
                                            topics=topics,
                                            output=init_output,
                                            hidden=hidden,
                                            mask=mask,
                                            target=Y,
                                            coverage_vector=coverage_vector)
            hidden[0].detach_()
            hidden[1].detach_()
            # masked mean of the per-token negative log-likelihood
            loss = (-output).reshape((-1, config.batch_size)).t() * mask
            loss = loss.sum(dim=1) / mask.sum(dim=1)
            loss = loss.mean()
            loss.backward()
            norm = 0.0
            nn.utils.clip_grad_value_(model.parameters(), 1)
            optimizer.step()
            total_loss += float(loss.item())
            if np.isnan(total_loss):
                for name, p in model.named_parameters():
                    if p.grad is None:
                        continue
                    print(name, p)
                assert False, "Gradient explode"
        one_iter_loss = np.mean(total_loss)
        lr_scheduler.step(one_iter_loss)

        # validation
        num_test = 500
        bleu_score = evaluate_bleu(model,
                                   topics_test,
                                   corpus_test,
                                   num_test=num_test,
                                   method='predict_rnn',
                                   i2w=i2w,
                                   w2i=w2i)
        bleu_values.append(bleu_score)
        loss_values.append(total_loss / num)
        epoch_values.append(epoch + 1)

        # save checkpoint
        if ((epoch + 1) % config.check_point == 0) or (epoch == (config.num_epoch - 1)) \
                or epoch + 1 > 90 or bleu_score > 4:
            model_check_point = '%s/model_trainable_%d.pk' % (save_folder, epoch + 1)
            optim_check_point = '%s/optim_trainable_%d.pkl' % (save_folder, epoch + 1)
            loss_check_point = '%s/loss_trainable_%d.pkl' % (save_folder, epoch + 1)
            epoch_check_point = '%s/epoch_trainable_%d.pkl' % (save_folder, epoch + 1)
            bleu_check_point = '%s/bleu_trainable_%d.pkl' % (save_folder, epoch + 1)
            torch.save(model.state_dict(), model_check_point)
            torch.save(optimizer.state_dict(), optim_check_point)
            torch.save(loss_values, loss_check_point)
            torch.save(epoch_values, epoch_check_point)
            torch.save(bleu_values, bleu_check_point)

        # save current best result
        if bleu_score > best_bleu:
            best_bleu = bleu_score
            print('current best bleu: %.4f' % best_bleu)
            model_check_point = '%s/model_best_%d.pk' % (save_folder, epoch + 1)
            optim_check_point = '%s/optim_best_%d.pkl' % (save_folder, epoch + 1)
            loss_check_point = '%s/loss_best_%d.pkl' % (save_folder, epoch + 1)
            epoch_check_point = '%s/epoch_best_%d.pkl' % (save_folder, epoch + 1)
            bleu_check_point = '%s/bleu_best_%d.pkl' % (save_folder, epoch + 1)
            torch.save(model.state_dict(), model_check_point)
            torch.save(optimizer.state_dict(), optim_check_point)
            torch.save(loss_values, loss_check_point)
            torch.save(epoch_values, epoch_check_point)
            torch.save(bleu_values, bleu_check_point)

        end = time.time()
        s = end - since
        h = math.floor(s / 3600)
        m = s - h * 3600
        m = math.floor(m / 60)
        s -= (m * 60 + h * 3600)
        if ((epoch + 1) % config.verbose == 0) or (epoch == (config.num_epoch - 1)):
            print('epoch %d/%d, loss %.4f, norm %.4f, predict bleu: %.4f, time %.3fs, since %dh %dm %ds'
                  % (epoch + 1, config.num_epoch, total_loss / num, norm,
                     bleu_score, end - start, h, m, s))
        evaluateAndShowAttention(test_topics,
                                 model,
                                 i2w,
                                 w2i,
                                 epoch,
                                 dataset.dataname,
                                 method='beam_search')
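# The checkpoint blocks above write the model/optimizer state_dicts plus the
# metric histories as separate files. A minimal sketch of reloading one such
# checkpoint (a guess at what load_ckpt_train does; paths and the epoch number
# are illustrative):
import torch

def load_ckpt_sketch(epoch, save_folder, model, optimizer):
    model.load_state_dict(torch.load('%s/model_trainable_%d.pk' % (save_folder, epoch)))
    optimizer.load_state_dict(torch.load('%s/optim_trainable_%d.pkl' % (save_folder, epoch)))
    loss_values = torch.load('%s/loss_trainable_%d.pkl' % (save_folder, epoch))
    epoch_values = torch.load('%s/epoch_trainable_%d.pkl' % (save_folder, epoch))
    bleu_values = torch.load('%s/bleu_trainable_%d.pkl' % (save_folder, epoch))
    return loss_values, epoch_values, bleu_values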
from torch.backends import cudnn
from torch.autograd import set_detect_anomaly
from torch.autograd.profiler import profile, emit_nvtx


def init_torch():
    # fast defaults for production runs: let cuDNN auto-tune kernels and keep
    # the (expensive) autograd debugging facilities switched off
    cudnn.benchmark = True
    set_detect_anomaly(False)
    profile(False)
    emit_nvtx(False)
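# For completeness, a sketch of the opposite, debug-friendly setup. This
# inverse helper is illustrative, not part of the snippet above:
def init_torch_debug():
    # slow but diagnosable: disable cuDNN auto-tuning and make autograd raise
    # at the exact op that produced a NaN/Inf gradient
    cudnn.benchmark = False
    set_detect_anomaly(True)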
def train_meta_model(meta_model,
                     train_dataloader,
                     args,
                     reloaded=False,
                     epoch=0,
                     tuning_dataloader=None,
                     train_embedding_after=-1,
                     tqdm=tqdm):
    all_train_perfs, all_dev_perfs = [], []
    optimizer = torch.optim.Adam(
        meta_model.parameters,
        lr=args.learning_rate,
        weight_decay=args.weight_decay,
    )
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                args.learning_rate_step,
                                                args.learning_rate_decay)
    early_stop_count = 0
    prev_err = 10e9

    epoch_rng = range(epoch + 1 if reloaded else 0, args.epochs)
    if tqdm is not None:
        epoch_rng = tqdm(epoch_rng, desc='Epoch: N/A', leave=False)
    for epoch in epoch_rng:
        # This goes before any of the train/validation stuff b/c torch version is 1.0.1
        scheduler.step()
        if train_embedding_after >= epoch:
            # to ensure it is unfrozen after reloading
            meta_model.unfreeze_representation()
        meta_model.train()

        dataloader_rng = train_dataloader
        if tqdm is not None:
            dataloader_rng = tqdm(dataloader_rng,
                                  desc='Batch: N/A',
                                  total=len(train_dataloader),
                                  leave=False)
        optimizer.zero_grad()
        for i, batch in enumerate(dataloader_rng):
            if args.do_detect_anomaly:
                set_detect_anomaly(True)
            hidden_states, pooled_output, all_outputs, total_loss = meta_model.forward(batch)
            total_loss.backward()
            if i % args.batches_per_gradient == 0:
                optimizer.step()
                optimizer.zero_grad()
            if args.do_detect_anomaly:
                set_detect_anomaly(False)
            if tqdm is not None:
                dataloader_rng.set_description('Batch: %.2e' % total_loss)

        if tqdm is None:
            print("Epoch %d: %.2f" % (epoch, total_loss.item()))
        elif (tqdm is not None) and (tuning_dataloader is not None):
            pass
        else:
            epoch_rng.set_description("Epoch %d: %.2f" % (epoch, total_loss.item()))

        if epoch % args.train_save_every == 0:
            if tuning_dataloader is None:
                meta_model.save(epoch)
            else:
                # do eval to see if this is the best score
                dataloader_rng = tuning_dataloader
                if tqdm is not None:
                    dataloader_rng = tqdm(dataloader_rng,
                                          desc='Batch: N/A',
                                          total=len(tuning_dataloader),
                                          leave=False)
                meta_model.eval()
                tuning_losses = []
                for i, batch in enumerate(dataloader_rng):
                    hidden_states, pooled_output, all_outputs, total_loss = meta_model.forward(batch)
                    tuning_losses.append(total_loss.cpu().data.numpy().ravel())
                meta_model.train()

                total_err = np.mean(np.concatenate(tuning_losses))
                if total_err < prev_err:
                    # this is the best model so far; save it
                    meta_model.save(epoch)
                    # best_meta_model = meta_model.copy().cpu()
                    prev_err = total_err
                    early_stop_count = 0
                    if tqdm is None:
                        print("Epoch %d: %.2f" % (epoch, total_loss.item()))
                    else:
                        epoch_rng.set_description("Epoch %d: %.2f" % (epoch, total_loss.item()))
                else:
                    early_stop_count += 1
                    if early_stop_count == 10:
                        print(f"Early stopping at epoch {epoch} with best tuning loss {prev_err}")
                        break

    if tuning_dataloader is None:
        meta_model.save(epoch)
    # the best checkpoint was already written via meta_model.save above
    return meta_model
# if p.requires_grad:
#     print(name, p)
#     p.register_hook(lambda grad: torch.clamp(grad, -clip_value, clip_value))


def decay_lr(optimizer, epoch, factor=0.1, lr_decay_epoch=60):
    if epoch % lr_decay_epoch == 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * factor
        print('lr decayed to %.4f' % optimizer.param_groups[0]['lr'])
    return optimizer


since = time.time()
autograd.set_detect_anomaly(False)
prev_epoch = 0 if not epoch_values else epoch_values[-1]
best_bleu = 0 if not bleu_values else max(bleu_values)
for epoch in range(num_epoch - prev_epoch):
    epoch += prev_epoch
    start = time.time()
    num, total_loss = 0, 0
    # optimizer = decay_lr(optimizer=optimizer, epoch=epoch + 1)
    topics_indice, corpus_indice = shuffleData(
        topics_indice, corpus_indice)  # shuffle data at every epoch
    # max_length = max(length) + 1
    max_length = 140  # todo
    data = data_iterator(corpus_indice, topics_indice, batch_size, max_length)
    hidden = model.init_hidden(batch_size=batch_size)
    weight = torch.ones(len(vocab))
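# Usage sketch for decay_lr above (values are illustrative): multiply every
# param group's learning rate by `factor` once every `lr_decay_epoch` epochs.
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.SGD(params, lr=0.1)
opt = decay_lr(opt, epoch=60, factor=0.1, lr_decay_epoch=60)  # lr becomes 0.01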