def main(): # representations = get_embeddings(device).to(device) # f=0 # section: settings global best_bleu4, epochs_since_improvement, start_epoch, data_name, word_map # section: fine tune if args.fine_tune_encoder and args.fine_tune_epochs == -1: raise Exception( 'if "fine_tune_encoder" == true you must also specify "fine_tune_epochs" != -1' ) # section: word map if not args.run_local: data_f = '/yoav_stg/gshalev/image_captioning/output_folder' else: data_f = data_folder word_map_file = os.path.join(data_f, 'WORDMAP_' + data_name + '.json') print('word_map_file: {}'.format(word_map_file)) print('loading word map from path: {}'.format(word_map_file)) with open(word_map_file, 'r') as j: word_map = json.load(j) print('load word map COMPLETED') rev_word_map = {v: k for k, v in word_map.items()} # section: representation representations = get_embeddings(device).to(device) # section: not fixed if not args.fixed: representations.requires_grad = True # section: Initialization print('run a new model (No args.checkpoint)') decoder = DecoderWithoutAttention(attention_dim=300, embed_dim=300, decoder_dim=300, vocab_size=len(word_map), device=device, dropout=dropout, encoder_dim=300) decoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, decoder.parameters()), lr=decoder_lr) # section: not fixed if not args.fixed: decoder_optimizer.add_param_group({'params': representations}) encoder = Encoder(embeded_dim=300) #notice: fine to encoder encoder.fine_tune(True if args.fine_tune_encoder and args.fine_tune_epochs == 0 else False) encoder_optimizer = torch.optim.Adam( params=filter(lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr ) if args.fine_tune_encoder and args.fine_tune_epochs == 0 else None # section: Move to GPU, if available decoder = decoder.to(device) encoder = encoder.to(device) # section: wandb if not args.run_local: wandb.watch(decoder) # section: Loss function criterion = nn.CrossEntropyLoss().to(device) # section: dataloaders train_loader = torch.utils.data.DataLoader(CaptionDataset( data_f, data_name, 'TRAIN', transform=transforms.Compose([data_normalization])), batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(CaptionDataset( data_f, data_name, 'VAL', transform=transforms.Compose([data_normalization])), batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True) val_loader_for_val = torch.utils.data.DataLoader(CaptionDataset( data_f, data_name, 'VAL', transform=transforms.Compose([data_normalization])), batch_size=1, shuffle=True, num_workers=workers, pin_memory=True) # section: Epochs print('starting epochs') for epoch in range(start_epoch, epochs): # section: terminate training after 20 epochs without improvment if epochs_since_improvement == 20: print('break after : epochs_since_improvement == 20') break # section: fine tune encoder if epoch == args.fine_tune_epochs: print('fine tuning after epoch({}) == args.fine_tune_epochs({})'. format(epoch, args.fine_tune_epochs)) encoder.fine_tune(args.fine_tune_encoder) encoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr) # section: adjust LR after 8 epochs without improvment if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0: print('!!! 
ADJUST LR AFTER : epochs_since_improvement: {}'.format( epochs_since_improvement)) adjust_learning_rate(decoder_optimizer, 0.8) # section: train print( '--------------111111111-----------Start train----------epoch-{}'. format(epoch)) train(train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion, encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer, epoch=epoch, representations=representations) # section: eval print( '--------------2222222222-----------Start validation----------epoch-{}' .format(epoch)) recent_bleu4 = validate(val_loader=val_loader, encoder=encoder, decoder=decoder, criterion=criterion, rev_word_map=rev_word_map, representations=representations) print('9999999999999- recent blue {}'.format(recent_bleu4)) print( '--------------3333333333-----------Start val without teacher forcing----------epoch-{}' .format(epoch)) with torch.no_grad(): caption_image_beam_search(encoder, decoder, val_loader_for_val, word_map, rev_word_map, representations) print( '!@#!@!#!#@!#@!#@ DONE WITH TRAIN VAL AND VAL WITHOUT TEACHER FORCING FOR EPOCH :{}' .format(epoch)) # section: save model if there was an improvement is_best = recent_bleu4 > best_bleu4 best_bleu4 = max(recent_bleu4, best_bleu4) if not is_best: epochs_since_improvement += 1 print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, )) else: epochs_since_improvement = 0 save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer, recent_bleu4, is_best, representations=representations, runname=args.runname)
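# The loop above shrinks the decoder LR via adjust_learning_rate(decoder_optimizer, 0.8)
# whenever BLEU-4 has stalled for a multiple of 8 epochs, but the helper itself is not
# part of this snippet. A minimal sketch of what such a helper usually does (this
# implementation is an assumption, not taken from the source):
def adjust_learning_rate(optimizer, shrink_factor):
    # Scale every param group's learning rate by shrink_factor.
    print("\nDECAYING learning rate by a factor of {}".format(shrink_factor))
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("New learning rate: {}\n".format(optimizer.param_groups[0]['lr']))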
import copy

import gym
import torch
import torch.nn.functional as F
import wandb

# Q, Policy and ReplayBuffer are project-local classes (not shown in this snippet).


def train(config):
    wandb.init(project="lunar-lander", name="target")

    # EPISODE = 1_000
    STEP = 3_000_000
    EXPERIENCE_REPLAY = 1_000_000
    BATCH_SIZE = 32
    ENTROPY_TERM_COEFFICIENT = 0.2
    # ENTROPY_TERM_COEFFICIENT = 0.002
    GAMMA = 0.99
    POLYAK = 0.995
    LR = 0.001
    START_STEP = 10000

    env = gym.make(config.env)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    if not isinstance(env.action_space, gym.spaces.Box):
        raise RuntimeError("action space is not continuous")

    # Twin Q networks plus polyak-averaged target copies.
    q1 = Q(state_dim, action_dim).cuda()
    q2 = Q(state_dim, action_dim).cuda()
    q1_target = copy.deepcopy(q1)
    q2_target = copy.deepcopy(q2)
    for p in list(q1_target.parameters()) + list(q2_target.parameters()):
        p.requires_grad = False
    q_opt = torch.optim.Adam(list(q1.parameters()) + list(q2.parameters()), LR)

    policy = Policy(state_dim, action_dim, env.action_space.low,
                    env.action_space.high).cuda()
    policy_opt = torch.optim.Adam(policy.parameters(), LR)

    wandb.watch([q1, q2, policy], log="all", log_freq=10000)

    replay_buffer = ReplayBuffer(EXPERIENCE_REPLAY, state_dim, action_dim)

    episode = 0
    step = 0
    while True:
        episode += 1
        state = env.reset()
        episode_reward = 0
        is_done = False
        while (not is_done) and step < STEP:
            step += 1

            # get action from policy net
            state_tensor = (torch.from_numpy(state).type(
                torch.FloatTensor).unsqueeze(0).cuda())
            with torch.no_grad():
                action = policy(state_tensor)[0].cpu().numpy()

            # take action
            next_state, reward, is_done, _info = env.step(action)
            bonus = 0
            reward += bonus

            # record
            replay_buffer.add(
                state=state,
                action=action,
                reward=reward,
                next_state=next_state,
                is_done=is_done,
            )
            if config.render:
                env.render()
            episode_reward += reward

            # clean up
            state = next_state
            del action, state_tensor, next_state, _info

            # train from replay
            if step < START_STEP:
                continue
            batch = replay_buffer.sample(BATCH_SIZE)

            # update Q
            # target = r + gamma * (1 - done) * (min_i Q_target_i(s', a') - alpha * log pi(a'|s'))
            with torch.no_grad():
                next_action, logp = policy(batch.next_states, with_logprob=True)
                target = batch.rewards + GAMMA * (1 - batch.is_dones) * (torch.min(
                    q1_target(batch.next_states, next_action),
                    q2_target(batch.next_states, next_action),
                ) - ENTROPY_TERM_COEFFICIENT * logp)
                del next_action, logp
            # Ex_a'[Q(s',a')] = Sum_a'(Q(s',a')*Pr[a'] - alpha * log(Pr[a'|s']))
            # Since a' is continuous, we sample a single a' from the policy and use it.
            q_loss = F.mse_loss(q1(batch.states, batch.actions), target) + F.mse_loss(
                q2(batch.states, batch.actions), target)
            q_opt.zero_grad()
            q_loss.backward()
            q_opt.step()
            del target

            # update Policy, ascend on Q+H
            action, logp = policy(
                batch.states, with_logprob=True)  # at this time, differentiable action
            policy_profit = (
                torch.min(q1(batch.states, action), q2(batch.states, action)) -
                ENTROPY_TERM_COEFFICIENT * logp)
            policy_loss = -policy_profit.mean()
            policy_opt.zero_grad()
            policy_loss.backward()
            policy_opt.step()

            # Update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(q1.parameters(), q1_target.parameters()):
                    p_targ.data.mul_(POLYAK)
                    p_targ.data.add_((1 - POLYAK) * p.data)
                for p, p_targ in zip(q2.parameters(), q2_target.parameters()):
                    p_targ.data.mul_(POLYAK)
                    p_targ.data.add_((1 - POLYAK) * p.data)

            wandb.log(dict(reward=reward, q_loss=q_loss.item()), step=step)
            if step % 10000 == 0:
                torch.save(dict(policy=policy.state_dict()), "target.pt")

        print(episode, step, episode_reward)
        wandb.log(dict(episode_reward=episode_reward), step=step)
        if step >= STEP:  # stop once the step budget is exhausted
            break
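# The target-network update in the loop above can be factored into a small helper;
# a sketch under the same POLYAK convention (the helper name is an assumption):
import torch


def polyak_update(online_net, target_net, polyak=0.995):
    # target <- polyak * target + (1 - polyak) * online, without tracking gradients
    with torch.no_grad():
        for p, p_targ in zip(online_net.parameters(), target_net.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)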
def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args device = self.device tb_writer = SummaryWriter(logdir=args.tensorboard_dir) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=self.args.dataloader_num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [ p for n, p in model.named_parameters() if n in params ] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend([ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any( nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any( nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ]) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if (args.model_name and os.path.isfile( os.path.join(args.model_name, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name, "scheduler.pt"))) if args.n_gpu > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 training_progress_scores = None tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args.model_name and os.path.exists(args.model_name): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name.split("/")[-1].split("-") if len(checkpoint_suffix) > 2: checkpoint_suffix = 
checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // ( len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores( **kwargs) if args.wandb_project: wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs) wandb.watch(self.model) if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() for current_epoch in train_iterator: model.train() if epochs_trained > 0: epochs_trained -= 1 continue train_iterator.set_description( f"Epoch {epoch_number + 1} of {args.num_train_epochs}") batch_iterator = tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) if args.fp16: with amp.autocast(): outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] else: outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: batch_iterator.set_description( f"Epochs {epoch_number}/{args.num_train_epochs}. 
Running Loss: {current_loss:9.4f}" ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if args.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.wandb_project: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_last_lr()[0], "global_step": global_step, }) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) self.save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if args.save_eval_checkpoints: self.save_model(output_dir_current, optimizer, scheduler, model=model, results=results) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project: wandb.log( self._get_last_metrics( training_progress_scores)) if not best_eval_metric: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[ args. early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self. args.evaluate_during_training else training_progress_scores, ) else: if results[ args. 
early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self. args.evaluate_during_training else training_progress_scores, ) epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self.save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and args.evaluate_each_epoch: results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) self.save_model(output_dir_current, optimizer, scheduler, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False) if args.wandb_project: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[ args. early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping and args.early_stopping_consider_epochs: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if results[ args. 
early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping and args.early_stopping_consider_epochs: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, )
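# The update path above scales the loss by gradient_accumulation_steps and only steps
# the optimizer every N micro-batches. A self-contained sketch of that pattern with a
# dummy model (names are illustrative, not the simpletransformers API):
import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
accumulation_steps = 4
max_grad_norm = 1.0

model.zero_grad()
for step in range(8):
    x, y = torch.randn(16, 10), torch.randint(0, 2, (16,))
    loss = nn.functional.cross_entropy(model(x), y) / accumulation_steps
    loss.backward()  # gradients accumulate across the scaled micro-batches
    if (step + 1) % accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        model.zero_grad()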
config = wandb.config
config.dropout = 1.0
# writer = SummaryWriter()

env = SingleObservation()
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))
model = TD3("CnnPolicy",
            env,
            action_noise=action_noise,
            verbose=1,
            buffer_size=10000,
            tensorboard_log="./td3_learning_tensorboard")
model.learn(total_timesteps=10)
wandb.watch(model)

obs = env.reset()
for _ in range(10):
    print("running")
    action, _states = model.predict(obs)
    # action = env.action_space.sample()
    for t in range(10000):
        obs, rewards, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            obs = env.reset()
            break
        env.render()
# writer.flush()
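# wandb.watch hooks a torch.nn.Module, while the stable-baselines3 TD3 object above is a
# wrapper around one; if gradient/parameter logging is wanted here, watching the underlying
# policy network is the usual route (a sketch building on `model` from above; the keyword
# values are examples):
wandb.watch(model.policy, log="all", log_freq=100)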
    if name.split('.')[1].isdigit():
        if int(name.split('.')[1]) > wbconfig.training_depth:
            param.requires_grad = True

models[cv]._conv_head.weight.requires_grad = True
models[cv]._bn1.weight.requires_grad = True
models[cv]._bn1.bias.requires_grad = True
models[cv]._fc.weight.requires_grad = True
models[cv]._fc.bias.requires_grad = True

# collect parameters to be trained
params_to_update = []
for name, param in models[cv].named_parameters():
    if param.requires_grad:
        params_to_update.append(param)

'''Step 3: Build required stuff for training helper function'''
wandb.watch(models[cv], log='all')
models[cv].to(device)
dataloaders = dataloaders_dict
criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)
optimizer = optim.Adam(params_to_update,
                       lr=wbconfig.learning_rate,
                       betas=(wbconfig.betas1, wbconfig.betas2),
                       eps=wbconfig.eps,
                       amsgrad=wbconfig.amsgrad)
num_epochs = wbconfig.num_epochs
log_path = './MURA_Anim_Finetune/log/CV' + str(cv) + '_log.txt'
model_save_path = './MURA_Anim_Finetune/CV' + str(cv)

'''Step 4: Train model'''
trained_model, val_acc_history, train_acc_history = train_model(models[cv],
                                                                 dataloaders_dict,
def main(): best_accu = 0 # Training settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') parser.add_argument('--batch_size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser.add_argument('--epochs', type=int, default=14, metavar='N', help='number of epochs to train (default: 14)') parser.add_argument('--lr', type=float, default=0.8, metavar='LR', help='learning rate (default: 1.0)') parser.add_argument('--gamma', type=float, default=0.7, metavar='M', help='Learning rate step gamma (default: 0.7)') parser.add_argument('--wandb', action='store_true', default=False, help='For wandb logging') parser.add_argument('--train', action='store_false', default=True, help='Start training') parser.add_argument('--val', action='store_false', default=True, help='Start validation') parser.add_argument('--test', action='store_false', default=True, help='Start testing on MNIST test set') parser.add_argument('--per_class', action='store_false', default=True, help='Calulate accuracy per class') parser.add_argument('--saved_ckpt', type=str, default="./checkpoints", metavar='saved_ckpt', help='Path for saving the checkpoint') parser.add_argument('--load_ckpt', type=str, default="./checkpoints", metavar='load_ckpt', help='For loading checkpoint') parser.add_argument( '--path', type=str, default="./data", metavar='path', help='For Training the model on midas task 1 split set') args = parser.parse_args() if args.wandb: # wandb initalization print("==> wandb initalization of project") wandb.init(project="midas-tasks-solutions", reinit=True) wandb.config.update(args) device = 'cuda' if torch.cuda.is_available() else 'cpu' os.makedirs(f'{args.saved_ckpt}', exist_ok=True) fdir = f'{args.saved_ckpt}/run_with_epochs_{args.epochs}_LR_{args.lr}' args.load_ckpt = fdir os.makedirs(fdir, exist_ok=True) print("==> Loading dataset") train_kwargs = {'batch_size': args.batch_size} val_kwargs = {'batch_size': args.batch_size} test_kwargs = {'batch_size': args.batch_size} if torch.cuda.is_available(): cuda_kwargs = {'num_workers': 1, 'pin_memory': True, 'shuffle': True} train_kwargs.update(cuda_kwargs) print("==> Loading Midas dataset") midas_train, midas_val = midas_task1_split(args.path) print("==> Loading MNIST dataset") mnist_test = mnist_testloader() midas_train_loader = torch.utils.data.DataLoader(midas_train, **train_kwargs) midas_val_loader = torch.utils.data.DataLoader(midas_val, **val_kwargs) test_loader = torch.utils.data.DataLoader(mnist_test, **test_kwargs) print("==> Building model...") midas_model = Net().to(device) print(midas_model) mnist_model = Net().to(device) optimizer = optim.Adadelta(midas_model.parameters(), lr=args.lr) criterion = nn.CrossEntropyLoss() scheduler = CosineAnnealingWarmupRestarts(optimizer, first_cycle_steps=args.epochs, cycle_mult=1.0, max_lr=1.0, min_lr=args.lr, warmup_steps=5, gamma=1.0) if args.wandb: wandb.watch(midas_model) wandb.watch(mnist_model) print(f"==> Starting Learning Rate {args.lr}") for epoch in range(1, args.epochs + 1): print(f"==> Epoch {epoch}/{args.epochs + 1}") if args.train: print("==> Model training started") train(args, midas_model, device, midas_train_loader, optimizer, epoch, criterion) if args.val: print("==> Evaluating midas model on midas val") midas_accu = val_midas(args, midas_model, device, midas_val_loader, epoch, criterion) print(f"==> Saving model checkpoint at {fdir}") is_best = midas_accu > best_accu best_accu = max(midas_accu, best_accu) save_checkpoint( { 'epoch': epoch + 1, 
'state_dict': midas_model.state_dict(), 'best_accu': best_accu, 'optimizer': optimizer.state_dict(), }, is_best, fdir) if args.test: print("==> Loading model checkpoint") load_ckpt(mnist_model, args.load_ckpt) mnist_model.fc2.out_features = 10 print(mnist_model) print("==> Testing model on mnist") mnist_accu = test(args, mnist_model, device, test_loader, epoch, criterion) if args.per_class: print("==> Accuracy per class on midas val") accuracy_per_class(args, midas_model, device, midas_val_loader, epoch, midas_val.classes) scheduler.step() print("Lr after scheduler = ", optimizer.param_groups[0]['lr']) print(f"Best accuracy on testing set = {best_accu}")
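# Assigning to fc2.out_features above only changes the attribute; the layer's weight matrix
# keeps its original shape. When the output size really has to change, replacing the module
# is the usual approach (a sketch reusing mnist_model and device from above, assuming Net
# exposes its final layer as fc2):
import torch.nn as nn

mnist_model.fc2 = nn.Linear(mnist_model.fc2.in_features, 10).to(device)  # fresh 10-way head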
    drop_last=False,
    collate_fn=data_maker.generate_batch,
)

# load the model
encoder = BertModel.from_pretrained(config["bert_path"])
hidden_size = encoder.config.hidden_size
ent_extractor = TPLinkerPlusBert(encoder, tag_size,
                                 hyper_parameters["shaking_type"],
                                 hyper_parameters["inner_enc_type"],
                                 hyper_parameters["tok_pair_sample_rate"])
ent_extractor = ent_extractor.to(device)

if config["logger"] == "wandb":
    wandb.watch(ent_extractor)

# load the loss function
metrics = MetricsCalculator(handshaking_tagger)
loss_func = lambda y_pred, y_true: metrics.loss_func(
    y_pred, y_true, ghm=hyper_parameters["ghm"])


# train step
def train_step(batch_train_data, optimizer):
    sample_list, batch_input_ids, \
    batch_attention_mask, batch_token_type_ids, \
    tok2char_span_list, batch_shaking_tag = batch_train_data

    batch_input_ids, \
    batch_attention_mask, \
def train(self, output_dir, train_batch_size, gradient_accumulation_steps, seed, epochs, data_path, pretrained_path, valid_path=None, no_cuda=False, dropout=0.3, weight_decay=0.01, warmup_proportion=0.1, learning_rate=5e-5, adam_epsilon=1e-8, max_seq_length=128, squeeze=True, max_grad_norm=1.0, eval_batch_size=32, epoch_save_model=False, model_name='XLMR', embedding_path=None, split_train_data=False, data_divider=0.6, wandb=None, save=True, logger=None, json_dataset=False, label_file=None, xlm_dataset=False, div=None, div_2=None, motherfile=False, multi_source_labels=False, device=0): epoch_times = [] if wandb: import wandb print(wandb) wandb.init(project='ABOM-PolEmo', config={ "epochs": epochs, "language_model": pretrained_path, "batch_size": train_batch_size, "max_seq_length": max_seq_length, "warmup_proportion": warmup_proportion, "learning_rate": learning_rate, "gradient_accumulation_steps": gradient_accumulation_steps, "squeeze": squeeze, "dropout": dropout, "output_dit": output_dir }) if save and os.path.exists(output_dir) and os.listdir(output_dir): raise ValueError( "Output directory (%s) already exists and is not empty." % output_dir) if save and not os.path.exists(output_dir): os.makedirs(output_dir) if not logger: logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO, filename=os.path.join(output_dir, "log.txt")) logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) logger = logging.getLogger(__name__) if gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" % gradient_accumulation_steps) train_batch_size = train_batch_size // gradient_accumulation_steps random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if split_train_data: if json_dataset: examples, label_list = get_examples_from_json(data_path) elif motherfile: examples, label_list = get_examples_from_motherfile(data_path) elif xlm_dataset: examples, label_list = get_examples_from_xml(data_path) else: examples, label_list = get_examples(data_path, 'train') random.shuffle(examples) train_examples = examples[0:int(len(examples) * data_divider)] val_examples = examples[int(len(examples) * data_divider):] eval_examples = examples[( int(len(examples) * data_divider) + int(len(examples) * ((1 - data_divider) / 2))):] else: train_examples = None if json_dataset: examples, label_list = get_examples_from_json(data_path) elif motherfile: train_examples, train_label_list = get_examples_from_motherfile( data_path, 'train') val_examples, val_label_list = get_examples_from_motherfile( data_path, 'test') train_label_list.extend(val_label_list) label_list = list(set(train_label_list)) elif xlm_dataset: examples, label_list = get_examples_from_xml(data_path) else: train_examples, label_list = get_examples(data_path, 'train') logger.info("\nDATA SIZE\n") logger.info("\Train = %d\n" % len(train_examples)) logger.info("\Val = %d\n" % len(val_examples)) num_train_optimization_steps = 0 num_labels = len(label_list) + 1 num_train_optimization_steps = int( len(train_examples) / train_batch_size / gradient_accumulation_steps) * epochs hidden_size = 300 if pretrained_path == None else 768 if 'base' in pretrained_path else 1024 device = 'cuda:0' if (torch.cuda.is_available() and not no_cuda) else 'cpu' logger.info(device) if model_name == 'HERBERT': model = AutoTokenizerForTokenClassification( pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, 
dropout_p=dropout, device=device) elif model_name == 'BERT_MULTILINGUAL': model = BertBaseMultilingualCased(pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout, device=device) elif model_name == 'REFORMER': model = Reformer(n_labels=num_labels, hidden_size=512, dropout=dropout, device=device, max_seq_length=max_seq_length, batch_size=train_batch_size) elif model_name == 'POLISH_ROBERTA': model = PolishRoberta(pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout, device=device) else: model = XLMRForTokenClassification(pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout=dropout, device=device) model.to(device) if wandb: wandb.watch(model) no_decay = ['bias', 'final_layer_norm.weight'] params = list(model.named_parameters()) optimizer_grouped_parameters = [{ 'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay }, { 'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, model.encode_word) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = create_dataset(train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size) if not split_train_data and not val_examples and not motherfile: val_examples, _ = get_examples(valid_path, 'valid') val_features = convert_examples_to_features(val_examples, label_list, max_seq_length, model.encode_word) val_data = create_dataset(val_features) best_val_f1 = 0.0 best_precision = 0.0 best_recall = 0.0 for epoch_no in range(1, epochs + 1): start = timer() epoch_stats = {"epoch": epoch_no} logger.info("Epoch %d" % epoch_no) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 model.train() steps = len(train_dataloader) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, label_ids, l_mask, valid_ids, = batch loss = model(input_ids, label_ids, l_mask, valid_ids) if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) tr_loss += loss.item() epoch_stats["loss"] = loss if wandb: wandb.log({"loss": loss}) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if step % 5 == 0: logger.info('Step = %d/%d; Loss = %.4f' % (step + 1, steps, tr_loss / (step + 1))) if (step + 1) % gradient_accumulation_steps == 0: optimizer.step() scheduler.step() model.zero_grad() del batch logger.info("\nTesting on validation set...") f1, report, entity_scores, precision, recall = evaluate_model( model, val_data, label_list, eval_batch_size, device) epoch_stats["validation_F1"] = f1 print(report) if f1 > best_val_f1: best_val_f1 = f1 best_precision = precision best_recall = recall logger.info( "\nFound better f1=%.4f on validation set. 
Saving model\n" % f1) logger.info("%s\n" % report) if save: torch.save( model.state_dict(), open(os.path.join(output_dir, 'model.pt'), 'wb')) save_params(output_dir, dropout, num_labels, label_list) if save and epoch_save_model: epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no) os.makedirs(epoch_output_dir) if save: torch.save( model.state_dict(), open(os.path.join(epoch_output_dir, 'model.pt'), 'wb')) save_params(epoch_output_dir, dropout, num_labels, label_list) if wandb: wandb.log(epoch_stats) epoch_times.append(timer() - start) model.cpu() del model, logger torch.cuda.empty_cache() print("Avg. epoch time") print(np.mean(epoch_times, axis=0)) print(max_seq_length) return best_val_f1, entity_scores, best_precision, epoch_times, best_recall
def train( self, train_dataset, output_dir, multi_label=False, show_running_loss=True, eval_df=None, test_df=None, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) t_total = (len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args["weight_decay"], }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = (warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]) optimizer = AdamW( optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"], ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"]) epoch_number = 0 if args["evaluate_during_training"]: extra_metrics = {key: [] for key in kwargs} if multi_label: training_progress_scores = { "global_step": [], "LRAP": [], "train_loss": [], "eval_loss": [], **extra_metrics, } else: if self.model.num_labels == 2: training_progress_scores = { "global_step": [], "tp": [], "tn": [], "fp": [], "fn": [], "mcc": [], "train_loss": [], "eval_loss": [], **extra_metrics, } elif self.model.num_labels == 1: training_progress_scores = { "global_step": [], "train_loss": [], "eval_loss": [], **extra_metrics, } else: training_progress_scores = { "global_step": [], "mcc": [], "train_loss": [], "eval_loss": [], **extra_metrics, } if args["wandb_project"]: wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"]) wandb.watch(self.model) if args["faq_evaluate_during_training"]: write_progress_to_csv(output_dir, 'train_log.csv', write_header=True) model.train() for _ in train_iterator: train_start = time.time() # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate( tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])): batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args["n_gpu"] > 1: loss = ( loss.mean() ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( 
amp.master_params(optimizer), args["max_grad_norm"]) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if (args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0): # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar( "loss", (tr_loss - logging_loss) / args["logging_steps"], global_step, ) logging_loss = tr_loss if args["wandb_project"]: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, }) if args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) # Take care of distributed/parallel training model_to_save = (model.module if hasattr( model, "module") else model) model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) if args["evaluate_during_training"] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True, **kwargs) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) if args["save_eval_checkpoints"]: model_to_save = (model.module if hasattr( model, "module") else model) model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) output_eval_file = os.path.join( output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format( key, str(results[key]))) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( args["output_dir"] + "training_progress_scores.csv", index=False, ) if args["wandb_project"]: wandb.log( self._get_last_metrics( training_progress_scores)) epoch_number += 1 train_time = datetime.timedelta(seconds=int(time.time() - train_start)) save_start = time.time() output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if (args["save_model_every_epoch"] or args["evaluate_during_training"] ) and not os.path.exists(output_dir_current): os.makedirs(output_dir_current) if args["save_model_every_epoch"]: model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) save_time = datetime.timedelta(seconds=int(time.time() - save_start)) eval_start = time.time() if args["evaluate_during_training"]: results, _, _ = self.eval_model(eval_df, verbose=True, **kwargs) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format( key, str(results[key]))) 
training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(args["output_dir"] + "training_progress_scores.csv", index=False) if args["faq_evaluate_during_training"]: records = { 'epoch': epoch_number, 'ckpt': "checkpoint-{}-epoch-{}".format(global_step, epoch_number) } if eval_df is not None: eval_metrics, _, _ = faq_evaluate(self, eval_df) print_metrics(eval_metrics) records.update({('dev-' + k): v for k, v in eval_metrics.items()}) if test_df is not None: test_metrics, _, _ = faq_evaluate(self, test_df) print_metrics(test_metrics) records.update({('test-' + k): v for k, v in test_metrics.items()}) write_progress_to_csv(output_dir, 'train_log.csv', metrics=records) eval_time = datetime.timedelta(seconds=int(time.time() - eval_start)) print( f'Finished epoch {epoch_number} [train {train_time}, save {save_time}, eval {eval_time}]' ) return global_step, tr_loss / global_step
def watch(self, model):
    wandb.watch(model)
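# The wrapper above forwards to wandb.watch with its defaults; the call also accepts options
# controlling what is logged and how often. A minimal self-contained example (project name
# and frequency are placeholders):
import torch.nn as nn
import wandb

model = nn.Linear(4, 2)                      # dummy module for illustration
wandb.init(project="demo")                   # placeholder project name
wandb.watch(model, log="all", log_freq=100)  # log gradients and parameters every 100 steps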
def train_and_eval(args, recon_args, recon_model): """ Wrapper for training and evaluation of policy model. :param args: Argument object, containing hyperparameters for training and evaluation. :param recon_args: reconstruction model arguments. :param recon_model: reconstruction model. """ if args.resume: # Check that this works resumed = True new_run_dir = args.policy_model_checkpoint.parent data_path = args.data_path # In case models have been moved to a different machine, make sure the path to the recon model is the # path provided. recon_model_checkpoint = args.recon_model_checkpoint model, args, start_epoch, optimiser = load_policy_model(pathlib.Path( args.policy_model_checkpoint), optim=True) args.old_run_dir = args.run_dir args.old_recon_model_checkpoint = args.recon_model_checkpoint args.old_data_path = args.data_path args.recon_model_checkpoint = recon_model_checkpoint args.run_dir = new_run_dir args.data_path = data_path args.resume = True else: resumed = False # Improvement model to train model = build_policy_model(args) # Add mask parameters for training args = add_mask_params(args) if args.data_parallel: model = torch.nn.DataParallel(model) optimiser = build_optim(args, model.parameters()) start_epoch = 0 # Create directory to store results in savestr = '{}_res{}_al{}_accel{}_k{}_{}_{}'.format( args.dataset, args.resolution, args.acquisition_steps, args.accelerations, args.num_trajectories, datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), ''.join(choice(ascii_uppercase) for _ in range(5))) args.run_dir = args.exp_dir / savestr args.run_dir.mkdir(parents=True, exist_ok=False) args.resumed = resumed if args.wandb: allow_val_change = args.resumed # only allow changes if resumed: otherwise something is wrong. wandb.config.update(args, allow_val_change=allow_val_change) wandb.watch(model, log='all') # Logging logging.info(recon_model) logging.info(model) # Save arguments for bookkeeping args_dict = { key: str(value) for key, value in args.__dict__.items() if not key.startswith('__') and not callable(key) } save_json(args.run_dir / 'args.json', args_dict) # Initialise summary writer writer = SummaryWriter(log_dir=args.run_dir / 'summary') # Parameter counting logging.info( 'Reconstruction model parameters: total {}, of which {} trainable and {} untrainable' .format(count_parameters(recon_model), count_trainable_parameters(recon_model), count_untrainable_parameters(recon_model))) logging.info( 'Policy model parameters: total {}, of which {} trainable and {} untrainable' .format(count_parameters(model), count_trainable_parameters(model), count_untrainable_parameters(model))) if args.scheduler_type == 'step': scheduler = torch.optim.lr_scheduler.StepLR(optimiser, args.lr_step_size, args.lr_gamma) elif args.scheduler_type == 'multistep': if not isinstance(args.lr_multi_step_size, list): args.lr_multi_step_size = [args.lr_multi_step_size] scheduler = torch.optim.lr_scheduler.MultiStepLR( optimiser, args.lr_multi_step_size, args.lr_gamma) else: raise ValueError( "{} is not a valid scheduler choice ('step', 'multistep')".format( args.scheduler_type)) # Create data loaders train_loader = create_data_loader(args, 'train', shuffle=True) dev_loader = create_data_loader(args, 'val', shuffle=False) train_data_range_dict = create_data_range_dict(args, train_loader) dev_data_range_dict = create_data_range_dict(args, dev_loader) if not args.resume: if args.do_train_ssim: do_and_log_evaluation(args, -1, recon_model, model, train_loader, writer, 'Train', train_data_range_dict) 
do_and_log_evaluation(args, -1, recon_model, model, dev_loader, writer, 'Val', dev_data_range_dict) for epoch in range(start_epoch, args.num_epochs): train_loss, train_time = train_epoch(args, epoch, recon_model, model, train_loader, optimiser, writer, train_data_range_dict) logging.info( f'Epoch = [{epoch+1:3d}/{args.num_epochs:3d}] TrainLoss = {train_loss:.3g} TrainTime = {train_time:.2f}s ' ) if args.do_train_ssim: do_and_log_evaluation(args, epoch, recon_model, model, train_loader, writer, 'Train', train_data_range_dict) do_and_log_evaluation(args, epoch, recon_model, model, dev_loader, writer, 'Val', dev_data_range_dict) scheduler.step() save_policy_model(args, args.run_dir, epoch, model, optimiser) writer.close()
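# The scheduler branches above map onto torch.optim.lr_scheduler.StepLR and MultiStepLR;
# a small self-contained illustration of the MultiStepLR case, which needs its milestones
# as a list (values are examples):
import torch
import torch.nn as nn

optimiser = torch.optim.Adam(nn.Linear(4, 2).parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimiser, milestones=[30, 45], gamma=0.1)
for epoch in range(50):
    # train_epoch(...) would run here
    scheduler.step()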
def main( num_epochs=50, batch_size=64, D=18, N=50000, w_lr=1e-4, w_momentum=0.9, w_weight_decay=0, a_lr=3e-4, a_momentum=0.9, a_weight_decay=0, T=10, grad_clip=1, logging_freq=200, w_checkpoint_freq=1, max_order_y=7, noise_var=0.25, featurize_type="fourier", initial_degree=100, hvp="finite_diff", arch_train_data="val", normalize_a_lr=True, w_warm_start=0, extra_weight_decay=0.5, grad_inner_loop_order=-1, grad_outer_loop_order=-1, ): config = locals() wandb_auth() wandb.init(project="NAS", group=f"Linear_SOTL", config=config) ### MODEL INIT # x, y = data_generator(N, max_order_generated=D, max_order_y=[(5,7), (9,13)], noise_var=0.25, featurize_type='fourier') # x, y = get_datasets("songs") dset_train, dset_val = get_datasets(name="MNIST", data_size=N, max_order_generated=D, max_order_y=max_order_y, noise_var=noise_var, featurize_type=featurize_type) model = SoTLNet(num_features=int(len(dset_train[0][0])), layer_type="MNIST", degree=-1, weight_decay=extra_weight_decay) criterion = get_criterion(model_type) w_optimizer = SGD(model.weight_params(), lr=w_lr, momentum=w_momentum, weight_decay=w_weight_decay) a_optimizer = SGD(model.arch_params(), lr=a_lr, momentum=a_momentum, weight_decay=a_weight_decay) wandb.watch(model, log="all") train_bptt(num_epochs=num_epochs, model=model, criterion=criterion, w_optimizer=w_optimizer, a_optimizer=a_optimizer, dset_train=dset_train, dset_val=dset_val, logging_freq=logging_freq, batch_size=batch_size, T=T, grad_clip=grad_clip, w_lr=w_lr, w_checkpoint_freq=w_checkpoint_freq, grad_inner_loop_order=grad_inner_loop_order, grad_outer_loop_order=grad_outer_loop_order, hvp=hvp, arch_train_data=arch_train_data, normalize_a_lr=normalize_a_lr, log_grad_norm=True, log_alphas=True, w_warm_start=w_warm_start, extra_weight_decay=extra_weight_decay) # train_normal(num_epochs=num_epochs, model=model, dset_train=dset_train, # logging_freq=logging_freq, batch_size=batch_size, grad_clip=grad_clip, optim="sgd") lapack_solution, res, eff_rank, sing_values = scipy.linalg.lstsq(x, y) print(f"Cond number:{abs(sing_values.max()/sing_values.min())}") val_meter = valid_func(model=model, dset_val=dset_val, criterion=criterion) model.fc1.weight = torch.nn.Parameter(torch.tensor(lapack_solution)) val_meter2 = valid_func(model=model, dset_val=dset_val, criterion=criterion) print( f"Trained val loss: {val_meter.avg}, SciPy solver val loss: {val_meter2.avg}, difference: {val_meter.avg - val_meter2.avg} (ie. {(val_meter.avg/val_meter2.avg-1)*100}% more)" ) true_degree = max_order_y / 2 trained_degree = model.fc1.alphas.item() print( f"True degree: {true_degree}, trained degree: {trained_degree}, difference: {abs(true_degree - trained_degree)}" ) wandb.run.summary["degree_mismatch"] = abs(true_degree - trained_degree)
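# The least-squares baseline above relies on scipy.linalg.lstsq, which returns the solution,
# residues, effective rank and singular values in that order; a self-contained sketch with
# random data showing the condition-number computation used in the snippet:
import numpy as np
from scipy import linalg

x = np.random.randn(100, 5)
y = x @ np.random.randn(5) + 0.1 * np.random.randn(100)
solution, res, eff_rank, sing_values = linalg.lstsq(x, y)
print("Cond number: {}".format(abs(sing_values.max() / sing_values.min())))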
def train_detector(args): os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu trainset = KittiDataset(args.data_dir, args.seq, args.npoints) trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, \ shuffle=True, num_workers=args.num_workers, drop_last=True) model = Detector(args) model = model.cuda() if args.use_wandb: wandb.watch(model) chamfer_criterion = ChamferLoss() point_criterion = Point2PointLoss() optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9) best_epoch_loss = float("inf") for epoch in range(args.epoch): torch.cuda.empty_cache() model.train() epoch_loss = 0 epoch_chamfer_loss = 0 epoch_point_loss = 0 count = 0 pbar = tqdm(enumerate(trainloader)) for i, data in pbar: src_pc, src_sn, dst_pc, dst_sn, T = data src = torch.cat((src_pc, src_sn), dim=-1) dst = torch.cat((dst_pc, dst_sn), dim=-1) src = src.cuda() dst = dst.cuda() src_pc = src_pc.cuda() dst_pc = dst_pc.cuda() T = T.cuda() R = T[:, :3, :3].contiguous() t = T[:, :3, 3].unsqueeze(1).contiguous() src_kp, src_sigma, _, _ = model(src) dst_kp, dst_sigma, _, _ = model(dst) src_kp_trans = (torch.matmul(R, src_kp).permute(0, 2, 1) + t).permute(0, 2, 1).contiguous() chamfer_loss = chamfer_criterion(src_kp_trans, dst_kp, src_sigma, dst_sigma) point_loss = point_criterion( src_kp, src_pc.permute(0, 2, 1).contiguous()) + point_criterion( dst_kp, dst_pc.permute(0, 2, 1).contiguous()) loss = chamfer_loss + args.alpha * point_loss epoch_loss = epoch_loss + float(loss) epoch_chamfer_loss = epoch_chamfer_loss + float(chamfer_loss) epoch_point_loss = epoch_point_loss + float(point_loss) count += 1 optimizer.zero_grad() loss.backward() optimizer.step() if i % 100 == 0: pbar.set_description( 'Train Epoch:{}[{}/{}({:.0f}%)]\tLoss: {:.6f}'.format( epoch + 1, i, len(trainloader), 100. * i / len(trainloader), loss.item())) epoch_loss = epoch_loss / count epoch_chamfer_loss = epoch_chamfer_loss / count epoch_point_loss = epoch_point_loss / count print('Epoch {} finished. Loss: {:.3f} Chamfer loss: {:.3f} Point loss: {:.3f}'.\ format(epoch+1, epoch_loss, epoch_chamfer_loss, epoch_point_loss)) if args.use_wandb: wandb.log({ "loss": epoch_loss, "chamfer loss": epoch_chamfer_loss, "point loss": epoch_point_loss }) if not os.path.exists(args.ckpt_dir): os.makedirs(args.ckpt_dir) if epoch_loss < best_epoch_loss: torch.save(model.state_dict(), os.path.join(args.ckpt_dir, 'best_detector.pth'))
def run_train(self, train_data, dev_data): self.print_model_parameters() import wandb wandb.init(project='smore-{}-group-{}-final'.format( self.args.dataset_name, get_no_join_tag(self.args, separator_in_front=True)), group=get_wandb_group(self.args), name=get_wandb_tag(self.args)) os.environ["WANDB_RUN_GROUP"] = get_wandb_group(self.args) wandb.watch(self) if self.args.augment_with_wikisql: train_data_, train_data_augment = [], [] for example in train_data: if example.dataset_id == WIKISQL: train_data_augment.append(example) else: train_data_.append(example) train_data = train_data_ train_batch_size = round(self.train_batch_size * 0.7) train_augment_batch_size = self.train_batch_size - train_batch_size dev_data_, dev_data_augment = [], [] for example in dev_data: if example.dataset_id == WIKISQL: dev_data_augment.append(example) else: dev_data_.append(example) dev_data = dev_data_ print('**************************') print('{} training examples'.format(len(train_data))) print('{} augmented training examples'.format( len(train_data_augment))) print('train batch size = {}'.format(train_batch_size)) print('train augment batch size = {}'.format( train_augment_batch_size)) print('{} dev examples'.format(len(dev_data))) print('{} augmented dev examples'.format(len(dev_data_augment))) print('**************************') else: train_batch_size = self.train_batch_size train_augment_batch_size = 0 # Track training losses dev metrics changes ############################ epoch_losses = [] best_dev_metrics = 0 dev_metrics_history = [] ############################ all_train_data = copy.deepcopy(train_data) # Curriculum learning (start from easy category) if self.args.curriculum_interval > 0: # assert(self.args.curriculum_interval % self.args.num_peek_steps == 0) train_data = [ exp for exp in all_train_data if exp.hardness in ['easy', 'medium'] ] print('Curriculumn: [easy, medium] ({}) ------'.format( len(train_data))) num_steps = self.num_steps * self.num_accumulation_steps num_peek_steps = self.num_peek_steps * self.num_accumulation_steps curriculum_interval = self.args.curriculum_interval * self.num_accumulation_steps random.shuffle(train_data) if self.args.augment_with_wikisql: random.shuffle(train_data_augment) augment_example_id = 0 step_id, example_id = 0, 0 self.optim.zero_grad() self.train() for interval_step_id in range(self.start_step, num_steps, num_peek_steps): # Update model parameters self.train() for s_id in tqdm(range(num_peek_steps)): step_id = interval_step_id + s_id if self.log_in_wandb(step_id / self.num_accumulation_steps): wandb.log({ 'learning_rate/{}'.format(self.dataset): self.optim.param_groups[0]['lr'] }) wandb.log({ 'fine_tuning_rate/{}'.format(self.dataset): self.optim.param_groups[1]['lr'] }) batch_end = example_id + train_batch_size if curriculum_interval > 0 and step_id % curriculum_interval == 0 and \ 0 < step_id / curriculum_interval <= 2: if float(step_id) / curriculum_interval == 1: train_data = [ exp for exp in all_train_data if exp.hardness in ['easy', 'medium', 'hard'] ] print('Curriculumn: [easy, medium, hard] ({}) ------'. 
format(len(train_data))) elif float(step_id) / curriculum_interval == 2: train_data = all_train_data print( 'Curriculumn: [easy, medium, hard, extra] ({}) ------' .format(len(train_data))) random.shuffle(train_data) example_id, batch_end = 0, train_batch_size if batch_end > len(train_data): random.shuffle(train_data) example_id, batch_end = 0, train_batch_size mini_batch = train_data[example_id:batch_end] example_id = batch_end if self.args.augment_with_wikisql: augment_batch_end = augment_example_id + train_augment_batch_size if augment_batch_end > len(train_data_augment): random.shuffle(train_data_augment) augment_example_id, augment_batch_end = 0, train_augment_batch_size mini_batch += train_data_augment[ augment_example_id:augment_batch_end] augment_example_id = augment_batch_end formatted_batch = self.format_batch(mini_batch) loss = self.loss(formatted_batch) loss.backward() epoch_losses.append(float(loss) * self.num_accumulation_steps) if (step_id + 1) % self.num_accumulation_steps == 0: # Gradient clipping if self.grad_norm > 0: nn.utils.clip_grad_norm_(self.parameters(), self.grad_norm) # Update learning rate scheduler self.lr_scheduler.step() # Update parameters self.optim.step() self.optim.zero_grad() # Check training statistics if step_id > 0 and (step_id + 1) % num_peek_steps == 0: stdout_msg = 'Step {}: average training loss = {}'.format( step_id / self.num_accumulation_steps, np.mean(epoch_losses)) print(stdout_msg) wandb.log({ 'cross_entropy_loss/{}'.format(self.dataset): np.mean(epoch_losses) }) epoch_losses = [] # Check model performance if step_id > 0 and (step_id + 1) % num_peek_steps == 0: self.eval() if self.args.process_sql_in_execution_order: pred_restored_cache = self.load_pred_restored_cache() pred_restored_cache_size = sum( len(v) for v in pred_restored_cache.values()) else: pred_restored_cache = None engine_path = os.path.join( self.args.data_dir, 'dev.db') if self.args.dataset_name == 'wikisql' else None engine = DBEngine(engine_path) if engine_path else None output_dict = self.inference( dev_data, restore_clause_order=self.args. 
process_sql_in_execution_order, pred_restored_cache=pred_restored_cache, check_schema_consistency_=self.args.sql_consistency_check, engine=engine, inline_eval=True, verbose=False) metrics = eval_tools.get_exact_match_metrics( dev_data, output_dict['pred_decoded'], engine=engine) dev_metrics_history.append(metrics) # eval_metrics = metrics['top_1_ex'] if self.args.dataset_name == 'wikisql' else metrics['top_1_em'] eval_metrics_em = metrics['top_1_em'] eval_metrics_exe = metrics['top_1_ex'] wandb.log({ 'dev_exact_match/{}'.format(self.dataset): eval_metrics_em }) wandb.log({ 'dev_execution/{}'.format(self.dataset): eval_metrics_exe }) print('Dev set performance:') print('Top-1 exact match: {}'.format(metrics['top_1_em'])) print('Top-3 exact match: {}'.format(metrics['top_3_em'])) if self.args.dataset_name == 'wikisql': print('Top-1 exe acc: {}'.format(metrics['top_1_ex'])) print('Top-3 exe acc: {}'.format(metrics['top_3_ex'])) if eval_metrics_exe >= best_dev_metrics: best_dev_metrics = eval_metrics_exe self.save_checkpoint(step_id, step_id / num_peek_steps, output_dict['pred_decoded'], is_best=True) if self.args.augment_with_wikisql and (step_id + 1) % ( num_peek_steps * 3) == 0: wikisql_output_dict = self.inference(dev_data_augment, inline_eval=True, verbose=False) wikisql_metrics = eval_tools.get_exact_match_metrics( dev_data_augment, wikisql_output_dict['pred_decoded']) wandb.log({ 'wikisql_dev_exact_match/{}'.format(self.dataset): wikisql_metrics['top_1_em'] }) print('WikiSQL dev set performance:') print('Top-1 exact match: {}'.format( wikisql_metrics['top_1_em'])) print('Top-3 exact match: {}'.format( wikisql_metrics['top_3_em'])) if self.args.process_sql_in_execution_order: new_pred_restored_cache_size = sum( len(v) for v in output_dict['pred_restored_cache'].values()) newly_cached_size = new_pred_restored_cache_size - pred_restored_cache_size if newly_cached_size > 0: self.save_pred_restored_cache( output_dict['pred_restored_cache'], newly_cached_size)
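# --- Illustrative sketch (not part of the trainer above): the gradient-accumulation
# pattern it relies on, where loss.backward() runs every micro-batch but the optimizer,
# gradient clipping, and LR scheduler only advance every `accumulation_steps` steps.
# The toy model, data, and hyperparameters below are assumptions for demonstration.
import torch
import torch.nn as nn

model = nn.Linear(16, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100)
criterion = nn.CrossEntropyLoss()
accumulation_steps, grad_norm = 4, 1.0

optimizer.zero_grad()
for step_id in range(32):
    x = torch.randn(8, 16)
    y = torch.randint(0, 2, (8,))
    # scale so the accumulated gradient averages over the accumulation window
    loss = criterion(model(x), y) / accumulation_steps
    loss.backward()
    if (step_id + 1) % accumulation_steps == 0:
        nn.utils.clip_grad_norm_(model.parameters(), grad_norm)  # clip before updating
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()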
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) wandb.init(project="qpic-project", entity="sangbaeklee", group="experiment_qpic") wandb.config = { "learning_rate": args.lr, "epochs": args.epochs, "batch_size": args.batch_size, } if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) wandb.watch(model) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if not args.hoi: if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 elif args.pretrained: checkpoint = torch.load(args.pretrained, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model'], strict=False) if args.eval: if args.hoi: test_stats = evaluate_hoi(args.dataset_file, model, postprocessors, data_loader_val, args.subject_category_id, device) return else: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) 
if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) if args.hoi: test_stats = evaluate_hoi(args.dataset_file, model, postprocessors, data_loader_val, args.subject_category_id, device) coco_evaluator = None else: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } #import pdb; pdb.set_trace() if args.dataset_file == 'hico' or args.dataset_file == 'hico_second': wandb.log({ "loss": train_stats['loss'], "mAP": test_stats['mAP'], "mAP rare": test_stats['mAP rare'], "mAP non-rare": test_stats['mAP non-rare'], "mean max recall": test_stats['mean max recall'] }) elif args.dataset_file == 'vcoco': wandb.log({ "mAP_all": test_stats['mAP_all'], "mAP_thesis": test_stats['mAP_thesis'], "AP_hold_obj": test_stats['AP_hold_obj'], "AP_stand": test_stats['AP_stand'], "AP_sit_instr": test_stats['AP_sit_instr'], "AP_ride_instr": test_stats['AP_ride_instr'], "AP_walk": test_stats['AP_walk'], "AP_look_obj": test_stats['AP_look_obj'], "AP_hit_instr": test_stats['AP_hit_instr'], "AP_hit_obj": test_stats['AP_hit_obj'], "AP_eat_obj": test_stats['AP_eat_obj'], "AP_eat_instr": test_stats['AP_eat_instr'], "AP_jump_instr": test_stats['AP_jump_instr'], "AP_lay_instr": test_stats['AP_lay_instr'], "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'], "AP_carry_obj": test_stats['AP_carry_obj'], "AP_throw_obj": test_stats['AP_throw_obj'], "AP_catch_obj": test_stats['AP_catch_obj'], "AP_cut_instr": test_stats['AP_cut_instr'], "AP_cut_obj": test_stats['AP_cut_obj'], "AP_run": test_stats['AP_run'], "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'], "AP_ski_instr": test_stats['AP_ski_instr'], "AP_surf_instr": test_stats['AP_surf_instr'], "AP_skateboard_instr": test_stats['AP_skateboard_instr'], "AP_smile": test_stats['AP_smile'], "AP_drink_instr": test_stats['AP_drink_instr'], "AP_kick_obj": test_stats['AP_kick_obj'], "AP_point_instr": test_stats['AP_point_instr'], "AP_read_obj": test_stats['AP_read_obj'], "AP_snowboard_instr": test_stats['AP_snowboard_instr'],\ "loss" : train_stats['loss'] }) else: continue if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pth') for name in filenames: torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / 
name) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
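# --- Illustrative sketch (not from the script above): the minimal wandb bookkeeping
# pattern these training scripts share — init a run, watch the model for gradient and
# parameter histograms, then log per-epoch scalars. Project name, config, and the
# logged numbers are placeholder assumptions; mode="offline" just lets the sketch run
# without a wandb account.
import torch.nn as nn
import wandb

model = nn.Linear(8, 1)
wandb.init(project="example-project", config={"lr": 1e-4, "epochs": 3}, mode="offline")
wandb.watch(model, log="all")  # track gradients and parameters during training
for epoch in range(3):
    train_loss, val_loss = 0.1 / (epoch + 1), 0.2 / (epoch + 1)  # placeholder metrics
    wandb.log({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss})
wandb.finish()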
def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) t_total = len(train_dataloader) // args[ "gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args["weight_decay"], }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args[ "warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW( optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"], ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"]) epoch_number = 0 best_eval_loss = None early_stopping_counter = 0 if args["evaluate_during_training"]: training_progress_scores = self._create_training_progress_scores() if args["wandb_project"]: wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"]) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate( tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])): batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args["n_gpu"] > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # torch.nn.utils.clip_grad_norm_( # amp.master_params(optimizer), args["max_grad_norm"] # ) else: loss.backward() # torch.nn.utils.clip_grad_norm_( # model.parameters(), args["max_grad_norm"] # ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: if args["fp16"]: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args["max_grad_norm"]) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args[ "logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) 
tb_writer.add_scalar( "loss", (tr_loss - logging_loss) / args["logging_steps"], global_step, ) logging_loss = tr_loss if args["wandb_project"]: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, }) if args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) self._save_model(output_dir_current, model=model) if args["evaluate_during_training"] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): # Only evaluate when single GPU otherwise metrics may not average well results, _ = self.eval_model(eval_data, verbose=True) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if args["save_eval_checkpoints"]: self._save_model(output_dir_current, model=model, results=results) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( args["output_dir"] + "training_progress_scores.csv", index=False, ) if args["wandb_project"]: wandb.log( self._get_last_metrics( training_progress_scores)) if not best_eval_loss: best_eval_loss = results["eval_loss"] self._save_model(args["best_model_dir"], model=model, results=results) elif results["eval_loss"] - best_eval_loss < args[ "early_stopping_delta"]: best_eval_loss = results["eval_loss"] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: print() print( f"No improvement in eval_loss for {early_stopping_counter} steps." ) print( f"Training will stop at {args['early_stopping_patience']} steps." ) print() else: if verbose: print() print( f"Patience of {args['early_stopping_patience']} steps reached." 
) print("Training terminated.") print() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args["save_model_every_epoch"] or args[ "evaluate_during_training"]: os.makedirs(output_dir_current, exist_ok=True) if args["save_model_every_epoch"]: self._save_model(output_dir_current, model=model) if args["evaluate_during_training"]: results, _ = self.eval_model(eval_data, verbose=True) self._save_model(output_dir_current, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(args["output_dir"] + "training_progress_scores.csv", index=False) if not best_eval_loss: best_eval_loss = results["eval_loss"] self._save_model(args["best_model_dir"], model=model, results=results) elif results["eval_loss"] - best_eval_loss < args[ "early_stopping_delta"]: best_eval_loss = results["eval_loss"] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: print() print( f"No improvement in eval_loss for {early_stopping_counter} steps." ) print( f"Training will stop at {args['early_stopping_patience']} steps." ) print() else: if verbose: print() print( f"Patience of {args['early_stopping_patience']} steps reached." ) print("Training terminated.") print() return global_step, tr_loss / global_step return global_step, tr_loss / global_step
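# --- Illustrative sketch (separate from the method above): the early-stopping
# bookkeeping it implements — track the best eval loss, reset the patience counter
# when the new loss improves on the best (by margin `delta`), and stop once `patience`
# non-improving evaluations accumulate. The eval losses below are made-up values.
best_eval_loss = None
early_stopping_counter = 0
patience, delta = 3, 0.0

for eval_loss in [0.90, 0.85, 0.86, 0.87, 0.88]:  # stand-ins for per-checkpoint results
    if best_eval_loss is None or eval_loss - best_eval_loss < delta:
        best_eval_loss = eval_loss      # improvement: this is where a checkpoint is saved
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print("Patience reached, stopping training.")
            break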
def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ tokenizer = self.tokenizer device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_folder"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) t_total = len(train_dataloader) // args[ "gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [{ "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args["weight_decay"] }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args[ "warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args['silent']) epoch_number = 0 if args['evaluate_during_training']: training_progress_scores = { 'global_step': [], 'precision': [], 'recall': [], 'f1_score': [], 'train_loss': [], 'eval_loss': [], } if args['wandb_project']: argwandb.init(project=args['wandb_project'], config={**args}, **args['wandb_kwargs']) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate( tqdm(train_dataloader, desc="Current iteration", disable=args['silent'])): batch = tuple(t.to(device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } # XLM and RoBERTa don"t use segment_ids if args['model_type'] in ["bert", "xlnet"]: inputs["token_type_ids"] = batch[2] outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args['n_gpu'] > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: logger.info("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args["max_grad_norm"]) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args[ "logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) 
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step) logging_loss = tr_loss if args['wandb_project']: wandb.log({ 'Training loss': current_loss, 'lr': scheduler.get_lr()[0], 'global_step': global_step }) if args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) if args['evaluate_during_training'] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) if args['save_eval_checkpoints']: model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) output_eval_file = os.path.join( output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format( key, str(results[key]))) training_progress_scores['global_step'].append( global_step) training_progress_scores['train_loss'].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) if args['wandb_project']: wandb.log( self._get_last_metrics( training_progress_scores)) epoch_number += 1 output_dir_current = os.path.join(output_dir, "epoch-{}".format(epoch_number)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) if args['evaluate_during_training']: results, _, _ = self.eval_model(eval_df, verbose=True) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format( key, str(results[key]))) return global_step, tr_loss / global_step
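# --- Illustrative sketch (not the trainer's code): the AdamW + linear-warmup schedule
# both train() methods above construct, assuming the HuggingFace
# get_linear_schedule_with_warmup helper. The model, step counts, and learning rate
# are placeholder assumptions.
import torch.nn as nn
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

model = nn.Linear(32, 2)
t_total = 1000                       # total number of optimizer steps for the run
warmup_steps = int(0.06 * t_total)   # e.g. warm up over the first 6% of steps
optimizer = AdamW(model.parameters(), lr=4e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
# inside the loop: optimizer.step(); scheduler.step(); model.zero_grad()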
def train_transformer_style( model: PyTorchForecast, training_params: Dict, takes_target=False, forward_params: Dict = {}, model_filepath: str = "model_save") -> None: """ Function to train any PyTorchForecast model :model The initialized PyTorchForecastModel :training_params_dict A dictionary of the parameters needed to train model :takes_target boolean: Determines whether to pass target during training :forward_params: A dictionary for additional forward parameters (for instance target) """ use_wandb = model.wandb es = None if "early_stopping" in model.params: es = EarlyStopper(model.params["early_stopping"]['patience']) opt = pytorch_opt_dict[training_params["optimizer"]]( model.model.parameters(), **training_params["optim_params"]) criterion_init_params = {} if "criterion_params" in training_params: criterion_init_params = training_params["criterion_params"] criterion = pytorch_criterion_dict[training_params["criterion"]](**criterion_init_params) max_epochs = training_params["epochs"] data_loader = DataLoader( model.training, batch_size=training_params["batch_size"], shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None) validation_data_loader = DataLoader( model.validation, batch_size=training_params["batch_size"], shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None) test_data_loader = DataLoader(model.test_data, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None) meta_model = None meta_representation = None if model.params.get("meta_data") is None: model.params["meta_data"] = False if model.params["meta_data"]: with open(model.params["meta_data"]["path"]) as f: json_data = json.load(f) dataset_params2 = json_data["dataset_params"] training_path = dataset_params2["training_path"] valid_path = dataset_params2["validation_path"] meta_name = json_data["model_name"] meta_model = PyTorchForecast(meta_name, training_path, valid_path, dataset_params2["test_path"], json_data) meta_representation = get_meta_representation(model.params["meta_data"]["column_id"], model.params["meta_data"]["uuid"], meta_model) if use_wandb: wandb.watch(model.model) session_params = [] for epoch in range(max_epochs): total_loss = torch_single_train( model, opt, criterion, data_loader, takes_target, meta_model, meta_representation, forward_params) print("The loss for epoch " + str(epoch)) print(total_loss) use_decoder = False if "use_decoder" in model.params: use_decoder = True valid = compute_validation( validation_data_loader, model.model, epoch, model.params["dataset_params"]["forecast_length"], criterion, model.device, meta_model=meta_model, decoder_structure=use_decoder, use_wandb=use_wandb) if valid < 0.01: raise("Error validation loss is zero there is a problem with the validator.") if use_wandb: wandb.log({'epoch': epoch, 'loss': total_loss}) epoch_params = { "epoch": epoch, "train_loss": str(total_loss), "validation_loss": str(valid)} session_params.append(epoch_params) if es: if not es.check_loss(model.model, valid): print("Stopping model now") model.model.load_state_dict(torch.load("checkpoint.pth")) break decoder_structure = True if model.params["dataset_params"]["class"] != "default": decoder_structure = False test = compute_validation( test_data_loader, model.model, epoch, 
model.params["dataset_params"]["forecast_length"], criterion, model.device, meta_model=meta_model, decoder_structure=decoder_structure, use_wandb=use_wandb, val_or_test="test_loss") print("test loss:", test) model.params["run"] = session_params model.save_model(model_filepath, max_epochs)
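# --- Assumed sketch: EarlyStopper is not defined in this snippet, so a plausible
# minimal version matching its usage above (es.check_loss(model, valid) returns False
# to stop, best weights restored from "checkpoint.pth") is given here. This is a
# guess at the interface, not the library's actual implementation.
import torch
import torch.nn as nn

class SimpleEarlyStopper:
    def __init__(self, patience: int):
        self.patience = patience
        self.best_loss = float("inf")
        self.bad_epochs = 0

    def check_loss(self, model: nn.Module, validation_loss: float) -> bool:
        if validation_loss < self.best_loss:
            self.best_loss = validation_loss
            self.bad_epochs = 0
            torch.save(model.state_dict(), "checkpoint.pth")  # snapshot the best weights
        else:
            self.bad_epochs += 1
        return self.bad_epochs < self.patience  # False signals "stop training"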
def train_model(self, epochs, dataset, save_folder, batch_size=1, cache=False, epochs_per_checkpoint=5, dis_train_amount=3, iters=None,wdb=True, tb=True, ray=False,local_dir="../"): self.local_dir = local_dir # Make a writer for Tensorboard if tb: writer = SummaryWriter() # Use wandb for watching the model if wdb: wandb.init(project="retrogan") wandb.run.name = self.name wandb.watch(self, criterion="simlex") wandb.run.save() res = [] self.set_fp16() self.to_device(self.device) class RetroPairsDataset(Dataset): """Dataset of pairs of embeddings consisting of the distributional and its retrofitted counterpart.""" def __init__(self, original_dataset, retrofitted_dataset, save_folder, cache): # Load the data. X_train, Y_train = helpertools.load_all_words_dataset_final(original_dataset, retrofitted_dataset, save_folder=save_folder, cache=cache) print("Shapes of training data:", X_train.shape, Y_train.shape) print(X_train) print(Y_train) print("*" * 100) self.x = X_train self.y = Y_train def __len__(self): return self.x.shape[0] def __getitem__(self, idx): # We normalize the embeddings that we utilize imgs_A = np.array(self.x.iloc[idx], dtype=np.float) imgs_B = np.array(self.y.iloc[idx], dtype=np.float) imgs_A /= np.linalg.norm(imgs_A) imgs_B /= np.linalg.norm(imgs_B) return torch.from_numpy(imgs_A), torch.from_numpy(imgs_B) # Initialize the dataset ds = RetroPairsDataset(dataset["original"], dataset["retrofitted"], save_folder=save_folder, cache=cache) # Create our data loader dataloader = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=0) # Initialize our models optimizers self.compile_all() def train_step(self, batch_i, imgs_A, imgs_B, epoch, count, training_epochs): if imgs_A.shape[0] == 1: print("Batch is equal to 1 in training.") return a = datetime.datetime.now() imgs_A = imgs_A.to(self.device) imgs_B = imgs_B.to(self.device) imgs_A = imgs_A.half() if self.fp16 else imgs_A.float() imgs_B = imgs_B.half() if self.fp16 else imgs_B.float() with torch.cuda.amp.autocast(): fake_B = self.g_AB(imgs_A) fake_A = self.g_BA(imgs_B) # Train the discriminators (original images = real / translated = Fake) dA_loss = None dB_loss = None valid = torch.ones((imgs_A.shape[0], 1)).to(self.device) # *noisy_entries_num,) ) fake = torch.zeros((imgs_A.shape[0], 1)).to(self.device) # *noisy_entries_num,) ) # accs = [] b = datetime.datetime.now() # print("Data prep time",b-a) # TRAIN THE DISCRIMINATORS a = datetime.datetime.now() if False: for _ in range(int(dis_train_amount)): if _ % 2 == 0: # print("Adding noise") i_A = imgs_A + torch.tensor( np.random.uniform(low=-1, size=(imgs_A.shape[0], self.word_vector_dimensions)), device=imgs_A.device).half() i_B = imgs_B + torch.tensor( np.random.uniform(low=-1, size=(imgs_A.shape[0], self.word_vector_dimensions)), device=imgs_B.device).half() f_A = fake_A + torch.tensor( np.random.uniform(low=-1, size=(imgs_A.shape[0], self.word_vector_dimensions)), device=fake_A.device).half() f_B = fake_B + torch.tensor( np.random.uniform(low=-1, size=(imgs_A.shape[0], self.word_vector_dimensions)), device=fake_B.device).half() else: i_A = imgs_A i_B = imgs_B f_B = fake_B f_A = fake_A # with torch.no_grad(): # TRAIN ON BATCH VALID self.dA_optimizer.zero_grad() dA = self.d_A(i_A) dA_loss_real = nn.BCEWithLogitsLoss()(dA, valid) if self.fp16: self.dA_optimizerscaler.scale(dA_loss_real).backward() self.dA_optimizerscaler.step(self.dA_optimizer) self.dA_optimizerscaler.update() else: dA_loss_real.backward(retain_graph=True) self.dA_optimizer.step() # TRAIN ON BATCH FAKE 
self.dA_optimizer.zero_grad() dA_f = self.d_A(f_A) dA_loss_fake = nn.BCEWithLogitsLoss()(dA_f, fake) if self.fp16: self.dA_optimizerscaler.scale(dA_loss_fake).backward(retain_graph=True) self.dA_optimizerscaler.step(self.dA_optimizer) self.dA_optimizerscaler.update() else: dA_loss_fake.backward(retain_graph=True) self.dA_optimizer.step() if dA_loss is None: dA_loss = 0.5 * (float(dA_loss_real) + float(dA_loss_fake)) else: dA_loss += 0.5 * (float(dA_loss_real) + float(dA_loss_fake)) # TRAIN ON BATCH VALID self.dB_optimizer.zero_grad() dB = self.d_B(i_B) dB_loss_real = nn.BCEWithLogitsLoss()(dB, valid) if self.fp16: self.dB_optimizerscaler.scale(dB_loss_real).backward() self.dB_optimizerscaler.step(self.dB_optimizer) self.dB_optimizerscaler.update() else: dB_loss_real.backward(retain_graph=True) self.dB_optimizer.step() # TRAIN ON BATCH FAKE self.dB_optimizer.zero_grad() dB_f = self.d_B(f_B) dB_loss_fake = nn.BCEWithLogitsLoss()(dB_f, fake) if self.fp16: self.dB_optimizerscaler.scale(dB_loss_fake).backward(retain_graph=True) self.dB_optimizerscaler.step(self.dB_optimizer) self.dB_optimizerscaler.update() else: dB_loss_fake.backward(retain_graph=True) self.dB_optimizer.step() # dB_loss_real = self.d_B.train_on_batch(retrofitted_embeddings, valid) # dB_loss_fake = self.d_B.train_on_batch(fake_B, fake) if dB_loss is None: dB_loss = 0.5 * (dB_loss_real.item() + dB_loss_fake.item()) else: dB_loss += 0.5 * (dB_loss_real.item() + dB_loss_fake.item()) else: dA_loss = 0 dB_loss = 0 # ABBA b = datetime.datetime.now() d_loss = (1.0 / dis_train_amount) * 0.5 * np.add(dA_loss, dB_loss) # print("Dis train time", b - a) # TRAIN THE CYCLE DISCRIMINATORS if self.cycle_dis: a = datetime.datetime.now() with torch.cuda.amp.autocast(): fake_ABBA = self.g_BA(fake_B) fake_BAAB = self.g_AB(fake_A) self.dABBA_optimizer.zero_grad() with torch.cuda.amp.autocast(): dA = self.d_ABBA(torch.cat([fake_B, imgs_A], 1)) dA_r = self.d_ABBA(torch.cat([fake_B, fake_ABBA], 1)) dABBA_loss_real = CycleCond_Loss()(dA, dA_r) # dABBA_loss_real = nn.BCEWithLogitsLoss()(dA, valid) if self.fp16: self.dABBA_optimizerscaler.scale(dABBA_loss_real).backward() self.dABBA_optimizerscaler.step(self.dABBA_optimizer) self.dABBA_optimizerscaler.update() else: dABBA_loss_real.backward() self.dABBA_optimizer.step() self.dBAAB_optimizer.zero_grad() with torch.cuda.amp.autocast(): dB = self.d_BAAB(torch.cat([fake_A, imgs_B], 1)) dB_r = self.d_BAAB(torch.cat([fake_A, fake_BAAB], 1)) dBAAB_loss_real = CycleCond_Loss()(dB, dB_r) # dABBA_loss_real = nn.BCEWithLogitsLoss()(dA, valid) if self.fp16: self.dBAAB_optimizerscaler.scale(dBAAB_loss_real).backward() self.dBAAB_optimizerscaler.step(self.dBAAB_optimizer) self.dBAAB_optimizerscaler.update() else: dBAAB_loss_real.backward() self.dBAAB_optimizer.step() d_cycle_loss = 0.5 * (dBAAB_loss_real.item() + dABBA_loss_real.item()) b = datetime.datetime.now() # print("Cycle discriminator train time", b - a) else: d_cycle_loss = 0 # Calculate the max margin loss for A->B, B->A ## Max margin AB and BA if self.one_way_mm: self.g_AB_optimizer.zero_grad() a = datetime.datetime.now() with torch.cuda.amp.autocast(): mm_a = self.g_AB(imgs_A) mm_a_loss = MaxMargin_Loss(batch_size=imgs_A.shape[0])(mm_a, imgs_B) # Calling the step function on an Optimizer makes an update to its # parameters if self.fp16: self.g_AB_optimizerscaler.scale(mm_a_loss).backward() self.g_AB_optimizerscaler.step(self.g_AB_optimizer) self.g_AB_optimizerscaler.update() else: mm_a_loss.backward(retain_graph=True) self.g_AB_optimizer.step() mm_a_loss = 
mm_a_loss.item() self.g_BA_optimizer.zero_grad() with torch.cuda.amp.autocast(): mm_b = self.g_BA(imgs_B) mm_b_loss = MaxMargin_Loss(batch_size=imgs_A.shape[0])(mm_b, imgs_A) if self.fp16: self.g_BA_optimizerscaler.scale(mm_b_loss).backward() self.g_BA_optimizerscaler.step(self.g_BA_optimizer) self.g_BA_optimizerscaler.update() else: mm_b_loss.backward() self.g_BA_optimizer.step() mm_b_loss = mm_b_loss.item() b = datetime.datetime.now() # print("MM one way discriminator train time", b - a) else: mm_a_loss = mm_b_loss = 0 # Calculate the cycle A->B->A, B->A->B with max margin, and mae a = datetime.datetime.now() self.combined_optimizer.zero_grad() with torch.cuda.amp.autocast(): fake_B = self.g_AB(imgs_A) fake_A = self.g_BA(imgs_B) # with torch.no_grad(): valid_A = self.d_A(fake_A) valid_B = self.d_B(fake_B) valid_A_loss = nn.BCEWithLogitsLoss()(valid_A, valid) valid_B_loss = nn.BCEWithLogitsLoss()(valid_B, valid) id_a = fake_B id_b = fake_A if self.id_loss: gamma = 1.0 mae_id_abba = gamma * torch.nn.L1Loss()(id_a, imgs_A) mae_id_baab = gamma * torch.nn.L1Loss()(id_b, imgs_B) else: mae_id_abba = mae_id_baab = 0 with torch.cuda.amp.autocast(): fake_ABBA = self.g_BA(fake_B) fake_BAAB = self.g_AB(fake_A) if self.cycle_mm: mm_abba = MaxMargin_Loss(batch_size=imgs_A.shape[0])(fake_ABBA, imgs_A) mm_baab = MaxMargin_Loss(batch_size=imgs_A.shape[0])(fake_BAAB, imgs_B) else: mm_abba = mm_baab = 0 if self.cycle_loss: mae_abba = torch.nn.L1Loss()(fake_ABBA, imgs_A) mae_baab = torch.nn.L1Loss()(fake_BAAB, imgs_B) else: mae_abba = 0 mae_baab = 0 if self.cycle_dis: with torch.cuda.amp.autocast(): dA = self.d_ABBA(torch.cat([fake_B, imgs_A], 1)) dA_r = self.d_ABBA(torch.cat([fake_B, fake_ABBA], 1)) dABBA_loss_real = CycleCond_Loss()(dA, dA_r) dB = self.d_BAAB(torch.cat([fake_A, imgs_B], 1)) dB_r = self.d_BAAB(torch.cat([fake_A, fake_BAAB], 1)) dBAAB_loss_real = CycleCond_Loss()(dB, dB_r) else: dABBA_loss_real = 0 dBAAB_loss_real = 0 g_loss = valid_A_loss + valid_B_loss + \ self.cycle_mm_weight * mm_abba + self.cycle_mm_weight * mm_baab + \ mae_abba + mae_baab + \ self.id_loss_weight * mae_id_abba + self.id_loss_weight * mae_id_baab + \ dBAAB_loss_real + dABBA_loss_real if self.fp16: self.combined_optimizerscaler.scale(g_loss).backward() self.combined_optimizerscaler.step(self.combined_optimizer) self.combined_optimizerscaler.update() else: g_loss.backward() self.combined_optimizer.step() b = datetime.datetime.now() # print("Combined gen train time", b - a) if batch_i % 50 == 0 and batch_i != 0: print( "Epoch", epoch, "/", training_epochs, "Batch:", batch_i, len(dataloader), "Global Step", count, "Discriminator loss:", d_loss, # "Discriminator acc:", "{:.2f}".format(100 * np.mean(accs)), "Combined loss:", "{:.2f}".format(g_loss.item()), "MM_ABBA_CYCLE:", "{:.2f}".format(mm_abba.item() if self.cycle_mm else 0), "MM_BAAB_CYCLE:", "{:.2f}".format(mm_baab.item() if self.cycle_mm else 0), "abba acc:", "{:.2f}".format(mae_abba.item() if self.cycle_loss else 0), "baab acc:", "{:.2f}".format(mae_baab.item() if self.cycle_loss else 0), "idloss ab:", "{:.2f}".format(mae_id_abba.item() if self.id_loss else 0), "idloss ba:", "{:.2f}".format(mae_id_baab.item() if self.id_loss else 0), "mm ab loss:", "{:.2f}".format(mm_a_loss if self.one_way_mm else 0), "mm ba loss:", "{:.2f}".format(mm_b_loss if self.one_way_mm else 0), "discriminator cycle loss:", "{:.2f}".format(d_cycle_loss), ) scalars = { "epoch": epoch, # "batch": batch_i, "global_step": count, "discriminator_loss": d_loss, # "discriminator_acc": np.mean(accs), 
"combined_loss": g_loss.item(), "loss": g_loss.item() + d_loss, "MM_ABBA_CYCLE": mm_abba.item() if self.cycle_mm else 0, "MM_BAAB_CYCLE": mm_baab.item() if self.cycle_mm else 0, "abba_mae": mae_abba.item() if self.cycle_loss else 0, "baab_mae": mae_baab.item() if self.cycle_loss else 0, "cycle_da": valid_A_loss.item(), "cycle_db": valid_B_loss.item(), "idloss_ab": mae_id_abba.item() if self.id_loss else 0, "idloss_ba": mae_id_baab.item() if self.id_loss else 0, "mm_ab_loss": mm_a_loss if self.one_way_mm else 0, "mm_ba_loss": mm_b_loss if self.one_way_mm else 0, "discriminator_cycle_loss": d_cycle_loss } if wdb: wandb.log(scalars, step=count) if tb: writer.add_scalars("run", tag_scalar_dict=scalars, global_step=count) writer.flush() def train_loop(training_epochs, iters=None): count = 0 # We gave a specific amount of epochs if iters is None: for epoch in range(training_epochs): for batch_i, (distributional_embeddings, retrofitted_embeddings) in enumerate(dataloader): train_step(self, batch_i, distributional_embeddings, retrofitted_embeddings, epoch, count, training_epochs) count += 1 print("\n") sl, sv, c = self.test(dataset) print(sl, sv, c) print("Saving our results.") # Save to tensorboard if tb: writer.add_scalar("simlex", sl, global_step=count) writer.add_scalar("simverb", sv, global_step=count) writer.add_scalar("card", c, global_step=count) writer.flush() # Save them also to wandb if wdb: wandb.log({"simlex": sl, "card": c, "simverb": sv, "epoch": epoch}, step=count) if ray: tune.report(**{"simlex": sl, "card": c, "simverb": sv, "epoch": epoch}) # Save a checkpoint if epochs_per_checkpoint is not None: if epoch % epochs_per_checkpoint == 0 and epoch != 0: self.save_model(name="checkpoint") print("\n") res.append((sl, sv, c)) print(res) print("\n") else: epoch = 0 running = True while running: for batch_i, (distributional_embeddings, retrofitted_embeddings) in enumerate(dataloader): if count >= iters: running = False break train_step(self, batch_i, distributional_embeddings, retrofitted_embeddings, epoch, count, iters / len(dataloader)) count += 1 epoch += 1 print("\n") sl, sv, c = self.test(dataset) print(sl, sv, c) # Save to tensorboard if tb: writer.add_scalar("simlex", sl, global_step=count) writer.add_scalar("simverb", sv, global_step=count) writer.add_scalar("card", c, global_step=count) writer.flush() # Save to wandb if wdb: wandb.log({"simlex": sl, "simverb": sv, "card": c}, step=count) # Save the checkpoint if epochs_per_checkpoint is not None: if epoch % epochs_per_checkpoint == 0 and epoch != 0: self.save_model(name="checkpoint") print('\n') res.append((sl, sv, c)) print(res) print("\n") # Start the training loop train_loop(epochs, iters=iters) print("Final performance") sl, sv, c = self.test(dataset) print(sl, sv, c) res.append((sl, sv, c)) print('\n') return res
) args = args.parse_args() # set seed and ensure everything is properly split set_seed(args.seed) folder_path = f"./transVAE_{args.res}_{args.n_embd}_{args.batch_size}" print(f":: Will Save data in {folder_path}") os.makedirs(folder_path, exist_ok=True) # define the model model = TransformerVAE(n_embd=args.n_embd, n_head=args.n_head, res=args.res) print(":: Number of params:", sum(p.numel() for p in model.parameters())) if WANDB: wandb.init(project="vq-vae") wandb.watch(model) # watch the model metrics # define the dataset and goooo train = DSWrapper(train=True) test = DSWrapper(train=False) trainer = DiscreteVAETrainer(model, train, test) trainer.train( bs = args.batch_size, lr = args.lr, folder_path=folder_path, test_every=args.test_every, save_every=args.save_every, n_epochs=args.n_epochs, skip_steps=None, gradient_accumulation_steps=args.gradient_accumulation_steps )
shuffle=False, pin_memory=True) test_dataloader = DataLoader(test_dataset, batch_size=6, num_workers=8, pin_memory=True) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = WaveNet() optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001) criterion = nn.CrossEntropyLoss() featurizer = MelSpectrogram(MelSpectrogramConfig()).to(device) model.to(device) wandb.watch(model, log="all") N_EPOCHS = 14 for epoch in tqdm(range(N_EPOCHS)): train_loss, train_acc = train_model(model, train_dataloader, optimizer, criterion) test_loss, test_acc = evaluate(model, test_dataloader, criterion) wandb.log({ "learning_rate": 0.0001, "model": 'wavenet', "optimizer": 'AdamW', "train_loss": train_loss, "train_accuracy": train_acc, "test_loss": test_loss,
def train_transformer_style(model: PyTorchForecast, training_params: Dict, takes_target=False, forward_params: Dict = {}, model_filepath: str = "model_save") -> None: """Function to train any PyTorchForecast model :param model: A properly wrapped PyTorchForecast model :type model: PyTorchForecast :param training_params: A dictionary of the necessary parameters for training. :type training_params: Dict :param takes_target: A parameter to determine whether a model requires the target, defaults to False :type takes_target: bool, optional :param forward_params: [description], defaults to {} :type forward_params: Dict, optional :param model_filepath: The file path to load modeel weights from, defaults to "model_save" :type model_filepath: str, optional :raises ValueError: [description] """ use_wandb = model.wandb es = None worker_num = 1 pin_memory = False dataset_params = model.params["dataset_params"] num_targets = 1 if "n_targets" in model.params: num_targets = model.params["n_targets"] if "num_workers" in dataset_params: worker_num = dataset_params["num_workers"] print("using " + str(worker_num)) if "pin_memory" in dataset_params: pin_memory = dataset_params["pin_memory"] print("Pin memory set to true") if "early_stopping" in model.params: es = EarlyStopper(model.params["early_stopping"]['patience']) opt = pytorch_opt_dict[training_params["optimizer"]]( model.model.parameters(), **training_params["optim_params"]) criterion_init_params = {} if "criterion_params" in training_params: criterion_init_params = training_params["criterion_params"] criterion = pytorch_criterion_dict[training_params["criterion"]]( **criterion_init_params) if "probabilistic" in model.params[ "model_params"] or "probabilistic" in model.params: probabilistic = True else: probabilistic = False max_epochs = training_params["epochs"] data_loader = DataLoader(model.training, batch_size=training_params["batch_size"], shuffle=False, sampler=None, batch_sampler=None, num_workers=worker_num, collate_fn=None, pin_memory=pin_memory, drop_last=False, timeout=0, worker_init_fn=None) validation_data_loader = DataLoader( model.validation, batch_size=training_params["batch_size"], shuffle=False, sampler=None, batch_sampler=None, num_workers=worker_num, collate_fn=None, pin_memory=pin_memory, drop_last=False, timeout=0, worker_init_fn=None) # TODO support batch_size > 1 test_data_loader = DataLoader(model.test_data, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=worker_num, collate_fn=None, pin_memory=pin_memory, drop_last=False, timeout=0, worker_init_fn=None) meta_model = None meta_representation = None meta_loss = None if model.params.get("meta_data") is None: model.params["meta_data"] = False if model.params["meta_data"]: meta_model, meta_representation, meta_loss = handle_meta_data(model) if use_wandb: wandb.watch(model.model) session_params = [] for epoch in range(max_epochs): total_loss = torch_single_train(model, opt, criterion, data_loader, takes_target, meta_model, meta_representation, meta_loss, multi_targets=num_targets, forward_params=forward_params.copy()) print("The loss for epoch " + str(epoch)) print(total_loss) use_decoder = False if "use_decoder" in model.params: use_decoder = True valid = compute_validation( validation_data_loader, model.model, epoch, model.params["dataset_params"]["forecast_length"], model.crit, model.device, multi_targets=num_targets, meta_model=meta_model, decoder_structure=use_decoder, use_wandb=use_wandb, probabilistic=probabilistic) if valid == 0.0: raise 
ValueError( "Error: validation loss is zero; there is a problem with the validator." ) if use_wandb: wandb.log({'epoch': epoch, 'loss': total_loss}) epoch_params = { "epoch": epoch, "train_loss": str(total_loss), "validation_loss": str(valid) } session_params.append(epoch_params) if es: if not es.check_loss(model.model, valid): print("Stopping model now") model.model.load_state_dict(torch.load("checkpoint.pth")) break decoder_structure = True if model.params["dataset_params"]["class"] != "default": decoder_structure = False test = compute_validation( test_data_loader, model.model, epoch, model.params["dataset_params"]["forecast_length"], model.crit, model.device, meta_model=meta_model, multi_targets=num_targets, decoder_structure=decoder_structure, use_wandb=use_wandb, val_or_test="test_loss", probabilistic=probabilistic) print("test loss:", test) model.params["run"] = session_params model.save_model(model_filepath, max_epochs)
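# --- Illustrative sketch (not the function above): reading DataLoader options such as
# num_workers and pin_memory from a params dict, as train_transformer_style does with
# dataset_params. The dataset and the option values are placeholder assumptions.
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset_params = {"num_workers": 2, "pin_memory": True, "batch_size": 32}  # illustrative
dataset = TensorDataset(torch.randn(128, 10), torch.randn(128, 1))
loader = DataLoader(dataset,
                    batch_size=dataset_params["batch_size"],
                    shuffle=False,
                    num_workers=dataset_params.get("num_workers", 0),
                    pin_memory=dataset_params.get("pin_memory", False),
                    drop_last=False)
for x, y in loader:  # one pass just to show the loader yields batches
    break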
def main(): ON_SERVER = False parser = argparse.ArgumentParser(description='SfSNet - Residual') parser.add_argument('--local_rank', type=int, default=0, help='input batch size for training (default: 8)') parser.add_argument('--batch_size', type=int, default=8, metavar='N', help='input batch size for training (default: 8)') parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)') parser.add_argument('--lr', type=float, default=0.001, metavar='LR', help='learning rate (default: 0.001)') parser.add_argument('--wt_decay', type=float, default=0.0005, metavar='W', help='SGD momentum (default: 0.0005)') parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--read_first', type=int, default=-1, help='read first n rows (default: -1)') parser.add_argument('--details', type=str, default=None, help='Explaination of the run') if ON_SERVER: parser.add_argument('--syn_data', type=str, default='/nfs/bigdisk/bsonawane/sfsnet_data/', help='Synthetic Dataset path') parser.add_argument( '--celeba_data', type=str, default= '/nfs/bigdisk/bsonawane/CelebA-dataset/CelebA_crop_resize_128/', help='CelebA Dataset path') parser.add_argument('--log_dir', type=str, default='./results/', help='Log Path') else: parser.add_argument('--syn_data', type=str, default='../data/full_syn/', help='Synthetic Dataset path') parser.add_argument('--celeba_data', type=str, default='../data/ffhq_pipeline_test/', help='FFHQ Dataset path') parser.add_argument('--log_dir', type=str, default='../results/', help='Log Path') parser.add_argument('--load_model', type=str, default=None, help='load model from') args = parser.parse_args() use_cuda = not args.no_cuda and torch.cuda.is_available() torch.manual_seed(args.seed) # initialization syn_data = args.syn_data celeba_data = args.celeba_data batch_size = args.batch_size lr = args.lr wt_decay = args.wt_decay log_dir = args.log_dir epochs = args.epochs model_dir = args.load_model read_first = args.read_first if read_first == -1: read_first = None # Debugging and check working # syn_train_csv = syn_data + '/train.csv' # train_dataset, _ = get_sfsnet_dataset(syn_dir=syn_data+'train/', read_from_csv=syn_train_csv, read_celeba_csv=None, read_first=read_first, validation_split=5) # train_dl = DataLoader(train_dataset, batch_size=10, shuffle=False) # validate_shading_method(train_dl) # return # Init WandB for logging wandb.init(project='SfSNet-CelebA-Baseline-V3-SkipNetBased') wandb.log({'lr': lr, 'weight decay': wt_decay}) # Initialize models skipnet_model = SkipNet() if use_cuda: skipnet_model = skipnet_model.cuda() # .to(args.local_rank) if model_dir is not None: skipnet_model.load_state_dict( torch.load(model_dir + 'skipnet_model.pkl')) else: print('Initializing weights') skipnet_model.apply(weights_init) os.system('mkdir -p {}'.format(args.log_dir)) with open(args.log_dir + '/details.txt', 'w') as f: f.write(args.details) wandb.watch(skipnet_model) # 1. Train on Synthetic data train_synthetic(skipnet_model, syn_data, celeba_data = celeba_data, read_first=read_first, \ batch_size=batch_size, num_epochs=epochs, log_path=log_dir+'Synthetic_Train/', use_cuda=use_cuda, wandb=wandb, \ lr=lr, wt_decay=wt_decay, training_syn=True) # 2. 
Generate Pseudo-Training information for CelebA dataset # Load CelebA dataset celeba_train_csv = celeba_data + '/train.csv' celeba_test_csv = celeba_data + '/test.csv' train_dataset, _ = get_celeba_dataset(read_from_csv=celeba_train_csv, read_first=read_first, validation_split=0) test_dataset, _ = get_celeba_dataset(read_from_csv=celeba_test_csv, read_first=read_first, validation_split=0) celeba_train_dl = DataLoader(train_dataset, batch_size=1, shuffle=True) celeba_test_dl = DataLoader(test_dataset, batch_size=1, shuffle=True) out_celeba_images_dir = celeba_data + 'synthesized_data_skip_net/' out_train_celeba_images_dir = out_celeba_images_dir + 'train/' out_test_celeba_images_dir = out_celeba_images_dir + 'test/' os.system('mkdir -p {}'.format(out_train_celeba_images_dir)) os.system('mkdir -p {}'.format(out_test_celeba_images_dir)) # Dump normal, albedo, shading, face and sh for celeba dataset generate_celeba_synthesize(skipnet_model, celeba_train_dl, train_epoch_num=epochs, use_cuda=use_cuda, out_folder=out_train_celeba_images_dir, wandb=wandb) generate_celeba_synthesize(skipnet_model, celeba_test_dl, train_epoch_num=epochs, use_cuda=use_cuda, out_folder=out_test_celeba_images_dir, wandb=wandb) # generate CSV for images generated above generate_celeba_synthesize_data_csv(out_train_celeba_images_dir, out_celeba_images_dir + '/train.csv') generate_celeba_synthesize_data_csv(out_test_celeba_images_dir, out_celeba_images_dir + '/test.csv')
def train( run_name: str, # Data train_filepath: str = CSNJS_TRAIN_FILEPATH, eval_filepath: str = CSNJS_VALID_FILEPATH, spm_filepath: str = SPM_UNIGRAM_FILEPATH, program_mode="identity", eval_program_mode="identity", label_mode="identifier", num_workers=1, limit_dataset_size=-1, # Model model_type="transformer", n_decoder_layers=4, d_model: int = 512, resume_path: str = "", resume_encoder_name: str = "encoder_q", # encoder_q, encoder_k, encoder resume_project: bool = False, # Optimization train_decoder_only: bool = False, num_epochs: int = 50, save_every: int = 2, batch_size: int = 256, lr: float = 8e-4, adam_beta1: float = 0.9, adam_beta2: float = 0.98, use_lr_warmup: bool = True, loss_type="nll_token", # nll_token or nll_sequence # Loss subword_regularization_alpha: float = 0, # Computational use_cuda: bool = True, auto_test: bool = True, seed: int = 0, ): """Train model""" torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) run_dir = RUN_DIR / run_name run_dir.mkdir(exist_ok=True, parents=True) logger.add(str((run_dir / "train.log").resolve())) logger.info(f"Saving logs, model checkpoints to {run_dir}") config = locals() logger.info(f"Config: {config}") wandb.init(name=run_name, config=config, job_type="training", project="identifier-prediction", entity="ml4code") if use_cuda: assert torch.cuda.is_available( ), "CUDA not available. Check env configuration, or pass --use_cuda False" train_augmentations = [ { "fn": "sample_lines", "line_length_pct": 0.5, }, # WARN: this is a no-op because the arguments for sample_lines are prob and prob_keep_line # Also need to have options under an "options" key { "fn": "insert_var_declaration", "prob": 0.5 }, { "fn": "rename_variable", "prob": 0.5 }, ] sp = spm.SentencePieceProcessor() sp.Load(spm_filepath) pad_id = sp.PieceToId("[PAD]") # Create training dataset and dataloader logger.info(f"Training data path {train_filepath}") train_dataset = get_csnjs_dataset(train_filepath, label_mode=label_mode, limit_size=limit_dataset_size) logger.info(f"Training dataset size: {len(train_dataset)}") train_loader = javascript_dataloader( train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, augmentations=train_augmentations, sp=sp, program_mode=program_mode, subword_regularization_alpha=subword_regularization_alpha, ) # Create eval dataset and dataloader logger.info(f"Eval data path {eval_filepath}") eval_dataset = get_csnjs_dataset(eval_filepath, label_mode=label_mode, limit_size=limit_dataset_size) logger.info(f"Eval dataset size: {len(eval_dataset)}") eval_loader = javascript_dataloader( eval_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, augmentations=[], sp=sp, program_mode=eval_program_mode, subword_regularization_alpha=subword_regularization_alpha, ) # Create model pad_id = sp.PieceToId("[PAD]") if model_type == "transformer": model = TransformerModel(n_tokens=sp.GetPieceSize(), pad_id=pad_id, n_decoder_layers=n_decoder_layers, d_model=d_model) logger.info( f"Created TransformerModel with {count_parameters(model)} params") elif model_type == "lstm": model = Seq2SeqLSTM(n_tokens=sp.GetPieceSize(), pad_id=pad_id, d_model=d_model) logger.info( f"Created Seq2SeqLSTM with {count_parameters(model)} params") # Set up optimizer model = nn.DataParallel(model) model = model.cuda() if use_cuda else model wandb.watch(model, log="all") params = model.module.decoder.parameters( ) if train_decoder_only else model.parameters() optimizer = torch.optim.Adam(params, lr=lr, betas=(adam_beta1, adam_beta2), eps=1e-9) if 
use_lr_warmup: scheduler = get_linear_schedule_with_warmup( optimizer, 5000, len(train_loader) * num_epochs) else: scheduler = LambdaLR(optimizer, lr_lambda=lambda x: 1.0) # Load checkpoint start_epoch = 1 global_step = 0 min_eval_loss = float("inf") if resume_path: logger.info( f"Resuming training from checkpoint {resume_path}, resume_encoder_name={resume_encoder_name}" ) checkpoint = torch.load(resume_path) assert resume_encoder_name in [ "encoder_k", "encoder_q", "encoder", "supervised" ] if resume_encoder_name == "supervised": # This checkpoint is the result of training with this script, not pretraining model.module.load_state_dict(checkpoint["model_state_dict"]) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) min_eval_loss = checkpoint.get("min_eval_loss", checkpoint["eval_loss"]) start_epoch = checkpoint["epoch"] + 1 global_step = checkpoint["global_step"] for _ in range(global_step): scheduler.step() else: pretrained_state_dict = checkpoint["model_state_dict"] encoder_state_dict = {} for key, value in pretrained_state_dict.items(): if key.startswith(resume_encoder_name + ".") and "project_layer" not in key: remapped_key = key[len(resume_encoder_name + "."):] logger.debug( f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}" ) encoder_state_dict[remapped_key] = value if key.startswith( resume_encoder_name + ".") and "project_layer.0." in key and resume_project: remapped_key = key[len(resume_encoder_name + "."):] logger.debug( f"Remapping checkpoint project key {key} to {remapped_key}. Value mean: {value.mean().item()}" ) encoder_state_dict[remapped_key] = value model.encoder.load_state_dict(encoder_state_dict, strict=False) logger.info(f"Loaded keys: {encoder_state_dict.keys()}") logger.info(f"Loaded state dict from {resume_path}") for epoch in tqdm.trange(start_epoch, num_epochs + 1, desc="training", unit="epoch", leave=False): logger.info(f"Starting epoch {epoch}\n") if train_decoder_only: model.module.encoder.eval() model.module.decoder.train() else: model.train() pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}") for X, Y, X_lengths, Y_lengths in pbar: if use_cuda: X = X.cuda() Y = Y.cuda() X_lengths, Y_lengths = X_lengths.cuda(), Y_lengths.cuda() optimizer.zero_grad() # NOTE: X and Y are [B, max_seq_len] tensors (batch first) logits = model(X, Y[:, :-1], X_lengths, Y_lengths) if loss_type == "nll_sequence": loss = F.cross_entropy(logits.transpose(1, 2), Y[:, 1:], ignore_index=pad_id, reduction="sum") loss = loss / X.size( 0 ) # Average over num sequences, not target sequence lengths # Thus, minimize bits per sequence. 
elif loss_type == "nll_token": loss = F.cross_entropy( logits.transpose(1, 2), Y[:, 1:], ignore_index=pad_id, ) loss.backward() optimizer.step() scheduler.step() # Log loss global_step += 1 wandb.log( { "epoch": epoch, f"label-{label_mode}/train_loss": loss.item(), "lr": scheduler.get_last_lr()[0] }, step=global_step, ) pbar.set_description(f"epoch {epoch} loss {loss.item():.4f}") # Evaluate logger.info( f"Evaluating model after epoch {epoch} ({global_step} steps)...") max_decode_len = 20 if label_mode == "identifier" else 200 eval_loss = _evaluate(model, eval_loader, sp, use_cuda=use_cuda, max_decode_len=max_decode_len, loss_type=loss_type) logger.info( f"Evaluation loss after epoch {epoch} ({global_step} steps): {eval_loss:.4f}" ) wandb.log({ "epoch": epoch, f"label-{label_mode}/eval_loss": eval_loss }, step=global_step) # Save checkpoint if save_every and epoch % save_every == 0 or eval_loss < min_eval_loss: if eval_loss < min_eval_loss: logger.info( f"New best evaluation loss: prev {min_eval_loss:.4f} > new {eval_loss:.4f}" ) min_eval_loss = eval_loss model_file = run_dir / "ckpt_best.pth" else: model_file = run_dir / f"ckpt_ep{epoch:04d}.pth" checkpoint = { "model_state_dict": model.module.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "epoch": epoch, "global_step": global_step, "config": config, "eval_loss": eval_loss, "min_eval_loss": min_eval_loss, } logger.info(f"Saving checkpoint to {model_file}...") torch.save(checkpoint, str(model_file.resolve())) wandb.save(str(model_file.resolve())) logger.info("Done.") if auto_test: best_ckpt = run_dir / "ckpt_best.pth" test( str(best_ckpt.resolve()), CSNJS_TEST_FILEPATH, spm_filepath, program_mode, label_mode, num_workers, -1, n_decoder_layers=n_decoder_layers, )
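# The loss_type switch above implements two reductions of the same token-level
# cross entropy: "nll_token" averages over non-pad target tokens, while
# "nll_sequence" sums over tokens and divides by the number of sequences.
# A minimal, self-contained sketch with dummy tensors (shapes and pad_id are illustrative):
import torch
import torch.nn.functional as F

B, T, V, pad_id = 4, 9, 100, 0                 # batch, target length, vocab size, pad token id
logits = torch.randn(B, T, V)                  # decoder output, batch first
targets = torch.randint(1, V, (B, T))          # shifted target tokens

# "nll_token": mean negative log-likelihood per (non-pad) token.
loss_token = F.cross_entropy(logits.transpose(1, 2), targets, ignore_index=pad_id)

# "nll_sequence": total NLL summed over tokens, averaged over sequences only,
# so longer targets contribute proportionally more to the gradient.
loss_sequence = F.cross_entropy(logits.transpose(1, 2), targets,
                                ignore_index=pad_id, reduction="sum") / B
print(loss_token.item(), loss_sequence.item())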
def main(): global best_bleu4, epochs_since_improvement, start_epoch, data_name, word_map if args.fine_tune_encoder and args.fine_tune_epochs == -1: raise Exception( 'if "fine_tune_encoder" == true you must also specify "fine_tune_epochs" != -1' ) # Read word map if not args.run_local: data_f = '/yoav_stg/gshalev/image_captioning/output_folder' else: data_f = data_folder word_map_file = os.path.join(data_f, 'WORDMAP_' + data_name + '.json') print('word_map_file: {}'.format(word_map_file)) print('loading word map from path: {}'.format(word_map_file)) with open(word_map_file, 'r') as j: word_map = json.load(j) print('load word map COMPLETED') # rev word map rev_word_map = {v: k for k, v in word_map.items()} # Initialize checkpoint if args.checkpoint is None: print('run a new model (No args.checkpoint)') decoder = DecoderWithAttention(attention_dim=attention_dim, embed_dim=emb_dim, decoder_dim=decoder_dim, vocab_size=len(word_map), device=device, dropout=dropout) decoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, decoder.parameters()), lr=decoder_lr) encoder = Encoder() encoder.fine_tune(True if args.fine_tune_encoder and args.fine_tune_epochs == 0 else False) encoder_optimizer = torch.optim.Adam( params=filter(lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr ) if args.fine_tune_encoder and args.fine_tune_epochs == 0 else None # load checkpoint else: print('run a model loaded from args.checkpoint') checkpoint = torch.load(args.checkpoint) start_epoch = checkpoint['epoch'] + 1 epochs_since_improvement = 0 # epochs_since_improvement = checkpoint['epochs_since_improvement'] best_bleu4 = checkpoint['bleu-4'] decoder = checkpoint['decoder'] decoder_optimizer = checkpoint['decoder_optimizer'] encoder = checkpoint['encoder'] encoder_optimizer = checkpoint['encoder_optimizer'] if args.fine_tune_encoder and encoder_optimizer is None: print('----------loading model without encoder optimizer') encoder.fine_tune(args.fine_tune_encoder) encoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr) elif args.fine_tune_encoder and encoder_optimizer is not None: raise Exception('you are loading a model with encoder optimizer') # Move to GPU, if available decoder = decoder.to(device) encoder = encoder.to(device) # wandb if not args.run_local: wandb.watch(decoder) # Loss function criterion = nn.CrossEntropyLoss().to(device) # Custom dataloaders train_loader = torch.utils.data.DataLoader(CaptionDataset( data_f, data_name, 'TRAIN', transform=transforms.Compose([data_normalization])), batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(CaptionDataset( data_f, data_name, 'VAL', transform=transforms.Compose([data_normalization])), batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True) val_loader_for_val = torch.utils.data.DataLoader(CaptionDataset( data_f, data_name, 'VAL', transform=transforms.Compose([data_normalization])), batch_size=1, shuffle=True, num_workers=workers, pin_memory=True) # Epochs print('starting epochs') for epoch in range(start_epoch, epochs): # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20 if epochs_since_improvement == 20: print('break after : epochs_since_improvement == 20') break if epoch == args.fine_tune_epochs: print('fine tuning after epoch({}) == args.fine_tune_epochs({})'. 
format(epoch, args.fine_tune_epochs)) encoder.fine_tune(args.fine_tune_encoder) encoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr) # Change batch size to 32 train_loader = torch.utils.data.DataLoader(CaptionDataset( data_f, data_name, 'TRAIN', transform=transforms.Compose([data_normalization])), batch_size=32, shuffle=True, num_workers=workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(CaptionDataset( data_f, data_name, 'VAL', transform=transforms.Compose([data_normalization])), batch_size=32, shuffle=True, num_workers=workers, pin_memory=True) if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0: print( 'adjust lr after : epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0' ) adjust_learning_rate(decoder_optimizer, 0.8) if args.checkpoint is not None: adjust_learning_rate(encoder_optimizer, 0.8) elif args.fine_tune_encoder and epoch > args.fine_tune_epochs: print( '------------------------------------epoch: {} fine tune lr encoder' .format(epoch)) adjust_learning_rate(encoder_optimizer, 0.8) print( '--------------111111111-----------Start train----------epoch-{}'. format(epoch)) # One epoch's training train(train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion, encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer, epoch=epoch) print( '--------------2222222222-----------Start validation----------epoch-{}' .format(epoch)) # One epoch's validation recent_bleu4 = validate(val_loader=val_loader, encoder=encoder, decoder=decoder, criterion=criterion, rev_word_map=rev_word_map) print('9999999999999- recent bleu {}'.format(recent_bleu4)) print( '--------------3333333333-----------Start val without teacher forcing----------epoch-{}' .format(epoch)) caption_image_beam_search(encoder, decoder, val_loader_for_val, word_map, rev_word_map) print( '!@#!@!#!#@!#@!#@ DONE WITH TRAIN VAL AND VAL WITHOUT TEACHER FORCING FOR EPOCH :{}' .format(epoch)) # Check if there was an improvement is_best = recent_bleu4 > best_bleu4 best_bleu4 = max(recent_bleu4, best_bleu4) if not is_best: epochs_since_improvement += 1 print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, )) else: epochs_since_improvement = 0 # Save checkpoint save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer, recent_bleu4, is_best, args.runname)
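# adjust_learning_rate is called above with a shrink factor of 0.8 but is not
# defined in this snippet. A minimal sketch of such a helper, assuming it simply
# scales every parameter group's learning rate in place:
def adjust_learning_rate(optimizer, shrink_factor):
    """Multiply the learning rate of each param group by shrink_factor (e.g. 0.8)."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("New learning rate: {:.6f}".format(optimizer.param_groups[0]['lr']))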
def wandb_init(self, model): if not self._init: self._init = True wandb.init(project="videowalk", group="release", config=self.args) wandb.watch(model)
def training_ms(model, config, train_loader, val_loader): wandb.watch(model, log="all") train_iter = 0 best_accuracy = 0 for epoch in range(config.epochs): epoch_loss_rgb = 0 epoch_loss_ms = 0 epoch_loss_ordering = 0 num_corrects_rgb = 0 num_corrects_ms = 0 trainSamples = 0 map_pixel_samples = 0 iterPerEpoch = 0 model.lstm_cell.train(True) model.classifier.train(True) model.resNet.layer4[0].conv1.train(True) model.resNet.layer4[0].conv2.train(True) model.resNet.layer4[1].conv1.train(True) model.resNet.layer4[1].conv2.train(True) model.resNet.layer4[2].conv1.train(True) model.resNet.layer4[2].conv2.train(True) model.resNet.fc.train(True) model.ms_conv.train(True) model.ms_classifier.train(True) # display_ms = True for inputs_rgb, map_labels, labels in train_loader: num_samples = inputs_rgb.size(0) trainSamples += num_samples train_iter += 1 iterPerEpoch += 1 optimizer_fn.zero_grad() inputs_rgb = inputs_rgb.permute(1, 0, 2, 3, 4).to(config.device) # but why? labels = labels.to(config.device) # map_labels = map_labels.to(config.device) map_labels = map_labels.to(config.device).permute( 0, 2, 1, 3, 4).squeeze() # BSxseq_lenx7x7 output_label, _, output_map, order_labels, order_feats = model( inputs_rgb) # output_map is BSx2 # if display_ms: # display_map_prediction(map_labels[0].clone(), output_map[0].clone(), loss_fn_ms, config.ms_task) # display_ms = False map_pixel_samples += num_samples * config.seq_len * 7 * 7 loss_rgb = loss_fn_rgb(output_label, labels) loss_ms = loss_fn_ms(output_map.squeeze(), map_labels) loss_ordering = loss_fn_ordering(order_feats, order_labels.to(config.device)) # loss_ratio = loss_rgb.item()/loss_ms.item() loss = loss_rgb + loss_ms + loss_ordering loss.backward() optimizer_fn.step() _, predicted_rgb = torch.max(output_label.data, 1) epoch_loss_rgb += loss_rgb.item() predicted_rgb = predicted_rgb.to(config.device) num_corrects_rgb += torch.sum(predicted_rgb == labels).data.item() if config.ms_task == "classifier": _, predicted_ms = torch.max(output_map.data, 1) predicted_ms = predicted_ms.to(config.device) num_corrects_ms += torch.sum( predicted_ms == map_labels).data.item() epoch_loss_ms += loss_ms.item() epoch_loss_ordering += loss_ordering.item() optim_scheduler.step() avg_loss_rgb = epoch_loss_rgb / iterPerEpoch train_accuracy_rgb = (num_corrects_rgb / trainSamples) avg_loss_ms = epoch_loss_ms / iterPerEpoch avg_loss_ordering = epoch_loss_ordering / iterPerEpoch if config.ms_task == "classifier": train_accuracy_ms = (num_corrects_ms / map_pixel_samples) print('Train: Epoch = {}/{} | Loss = {} | Accuracy = {}'.format( epoch + 1, config.epochs, avg_loss_rgb, train_accuracy_rgb)) max_loss = 6 avg_loss_normalized_rgb = avg_loss_rgb if avg_loss_rgb < max_loss else max_loss avg_loss_normalized_ms = avg_loss_ms if avg_loss_ms < max_loss else max_loss if config.ms_task == "classifier": wandb.log({ "train_loss_rgb": avg_loss_normalized_rgb, "train_loss_ms": avg_loss_normalized_ms, "train_accuracy_rgb": train_accuracy_rgb, "train_accuracy_ms": train_accuracy_ms, "avg_loss_ordering": avg_loss_ordering, "eopch": (epoch + 1) }) else: wandb.log({ "train_loss_rgb": avg_loss_normalized_rgb, "train_loss_ms": avg_loss_normalized_ms, "train_accuracy_rgb": train_accuracy_rgb, "avg_loss_ordering": avg_loss_ordering, "eopch": (epoch + 1) }) if (epoch + 1) % config.val_frequency == 0: with torch.no_grad(): model.eval() val_loss_epoch_rgb = 0 val_loss_epoch_ms = 0 val_iter = 0 val_samples = 0 num_corrects_rgb = 0 num_corrects_ms = 0 map_pixel_samples = 0 val_loss_epoch_ordering = 0 for 
inputs_rgb, map_labels, labels in val_loader: val_iter += 1 num_samples = inputs_rgb.size(0) val_samples += num_samples inputs_rgb = inputs_rgb.permute(1, 0, 2, 3, 4).to(config.device) labels = labels.to(config.device) map_labels = map_labels.to(config.device).view( num_samples, config.seq_len, 7, 7) output_label, _, output_map, order_labels, order_feats = model( inputs_rgb) map_pixel_samples += num_samples * config.seq_len * 7 * 7 val_loss_rgb = loss_fn_rgb(output_label, labels) val_loss_ms = loss_fn_ms(output_map.squeeze(), map_labels) loss_ordering = loss_fn_ordering( order_feats, order_labels.to(config.device)) val_loss_epoch_rgb += val_loss_rgb.item() val_loss_epoch_ms += val_loss_ms.item() val_loss_epoch_ordering += loss_ordering.item() _, predicted_rgb = torch.max(output_label.data, 1) num_corrects_rgb += torch.sum( predicted_rgb == labels).data.item() if config.ms_task == "classifier": _, predicted_ms = torch.max(output_map.data, 1) num_corrects_ms += torch.sum( predicted_ms == map_labels).data.item() val_accuracy_rgb = (num_corrects_rgb / val_samples) avg_val_loss_rgb = val_loss_epoch_rgb / val_iter avg_val_loss_ms = val_loss_epoch_ms / val_iter avg_val_loss_ordering = val_loss_epoch_ordering / iterPerEpoch if config.ms_task == "classifier": val_accuracy_ms = (num_corrects_ms / map_pixel_samples) print('***** Val: Epoch = {} | Loss {} | Accuracy = {} *****'. format(epoch + 1, avg_val_loss_rgb, val_accuracy_rgb)) avg_val_loss_normalized_rgb = avg_val_loss_rgb if avg_val_loss_rgb < max_loss else max_loss avg_val_loss_normalized_ms = avg_val_loss_ms if avg_val_loss_ms < max_loss else max_loss if config.ms_task == "classifier": wandb.log({ "valid_loss_rgb": avg_val_loss_normalized_rgb, "valid_loss_ms": avg_val_loss_normalized_ms, "valid_accuracy_rgb": val_accuracy_rgb, "valid_accuracy_ms": val_accuracy_ms, "avg_val_loss_ordering": avg_val_loss_ordering, "eopch": (epoch + 1) }) else: wandb.log({ "valid_loss_rgb": avg_val_loss_normalized_rgb, "valid_loss_ms": avg_val_loss_normalized_ms, "valid_accuracy_rgb": val_accuracy_rgb, "avg_val_loss_ordering": avg_val_loss_ordering, "eopch": (epoch + 1) }) if val_accuracy_rgb > best_accuracy: save_path_model = (config.models_dir + '/best_model_ms_state_dict.pth') torch.save(model.state_dict(), save_path_model) best_accuracy = val_accuracy_rgb wandb.run.summary["best_valid_accuracy"] = best_accuracy else: if (epoch + 1) % 10 == 0: save_path_model = (config.models_dir + '/best_model_ms_state_dict' + str(epoch + 1) + '.pth') # torch.save(model.state_dict(), save_path_model) wandb.run.summary["best_valid_accuracy"] = best_accuracy return
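# When ms_task == "classifier" the map head above is scored per pixel over a
# seq_len x 7 x 7 grid (num_corrects_ms / map_pixel_samples). A self-contained
# sketch of that accuracy computation; the tensor shapes here are illustrative
# assumptions, not taken from the model definition:
import torch

batch, seq_len, n_classes = 2, 4, 2
output_map = torch.randn(batch, n_classes, seq_len, 7, 7)        # per-pixel logits
map_labels = torch.randint(0, n_classes, (batch, seq_len, 7, 7)) # per-pixel targets

_, predicted_ms = torch.max(output_map, 1)                       # argmax over the class dim
num_corrects_ms = torch.sum(predicted_ms == map_labels).item()
map_pixel_samples = batch * seq_len * 7 * 7
print(num_corrects_ms / map_pixel_samples)                       # pixel-level accuracy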
def main(): global best_prec1, evaluate, args wandb.init(project='RC2020-att_sweep', name = args.name, group = args.group) if args.arch == 0: if args.version == 0: model = resnet18(att = 'Vanilla', num_classes = 10) elif args.version == 1: model = resnet18(att = 'GCT', num_classes = 10) elif args.version == 2: model = resnet18(att = 'Strip Pool', num_classes = 10) elif args.version == 3: model = resnet18(att = 'ECA', num_classes = 10) else: model = resnet18(att = 'Triplet', num_classes = 10) elif args.arch == 1: if args.version == 0: model = resnet34(att = 'Vanilla', num_classes = 10) elif args.version == 1: model = resnet34(att = 'GCT', num_classes = 10) elif args.version == 2: model = resnet34(att = 'Strip Pool', num_classes = 10) elif args.version == 3: model = resnet34(att = 'ECA', num_classes = 10) else: model = resnet34(att = 'Triplet', num_classes = 10) else: if args.version == 0: model = resnet50(att = 'Vanilla', num_classes = 10) elif args.version == 1: model = resnet50(att = 'GCT', num_classes = 10) elif args.version == 2: model = resnet50(att = 'Strip Pool', num_classes = 10) elif args.version == 3: model = resnet50(att = 'ECA', num_classes = 10) else: model = resnet50(att = 'Triplet', num_classes = 10) wandb.watch(model) model = model.cuda() print('Number of model parameters: {}'.format( sum([p.data.nelement() for p in model.parameters()]))) wandb.config.update({'Parameters':sum([p.data.nelement() for p in model.parameters()]), 'Batch_Size':128}) normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_loader = torch.utils.data.DataLoader( datasets.CIFAR10(root='./data', train=True, transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4), transforms.ToTensor(), normalize, ]), download=True), batch_size=128, shuffle=True, num_workers=4, pin_memory=True) val_loader = torch.utils.data.DataLoader( datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([ transforms.ToTensor(), normalize, ])), batch_size=128, shuffle=False, num_workers=4, pin_memory=True) # define loss function (criterion) and pptimizer criterion = nn.CrossEntropyLoss() criterion = criterion.cuda() optimizer = torch.optim.SGD(model.parameters(), 0.1, momentum=0.9, weight_decay=5e-4) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], last_epoch=0 - 1) max_epoch = 50 for epoch in range(0, max_epoch): print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr'])) wandb.log({'lr': optimizer.param_groups[0]['lr']}) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) lr_scheduler.step() # evaluate on validation set prec1 = validate(val_loader, model, criterion, epoch) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if epoch > 0 and epoch % 20 == 0: save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, }, is_best, filename=os.path.join('./', 'vanilla_checkpoint.th')) save_checkpoint({ 'state_dict': model.state_dict(), 'best_prec1': best_prec1, }, is_best, filename=os.path.join('./', 'vanilla_model.th')) wandb.run.finish()
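# save_checkpoint is called above with (state, is_best, filename=...) but is not
# defined in this snippet. A common implementation, assumed here, writes the state
# dict to disk and keeps a separate copy whenever the model is the current best:
import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.th'):
    """Persist a training state; duplicate it as *_best.th when is_best is True."""
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, filename.replace('.th', '_best.th'))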
def train_cycle(use_wandb=True): print("%s: Training the model" % (time.strftime("%Y/%m/%d-%H:%M:%S"))) # n_iters = 100000 # print_every = 5000 # plot_every = 1000 print_every = 50 plot_every = 500 embedding_size = 32 num_epochs = 30 margin = 0.05 train_size = None evaluate_size = 100 save_path = './unif_model.ckpt' # Keep track of losses for plotting current_print_loss = 0 current_plot_loss = 0 all_losses = [] start = time.time() code_snippets_file = './data/parallel_bodies_n1000' descriptions_file = './data/parallel_desc_n1000' dataset = CodeDescDataset(code_snippets_file, descriptions_file, train_size) num_iters = len(dataset) # model = UNIF(dataset.code_vocab_size, dataset.desc_vocab_size, embedding_size) model = UNIFNoAttention(dataset.code_vocab_size, dataset.desc_vocab_size, embedding_size) cosine_similarity_function = nn.CosineSimilarity() loss_function = nn.CosineEmbeddingLoss(margin=margin) learning_rate = 0.05 # If you set this too high, it might explode. If too low, it might not learn optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate) if use_wandb: wandb.init(project='code-search', name='unif-cosine-pos', reinit=True) config = wandb.config config.learning_rate = learning_rate config.embedding_size = embedding_size config.evaluate_size = evaluate_size config.margin = margin config.num_epochs = num_epochs config.train_size = len(dataset) wandb.watch(model, log_freq=plot_every) metrics = evaluate_top_n(model, evaluate_size) if use_wandb: wandb.log(metrics) for epoch in range(num_epochs): print('Epoch: ', epoch) for iter in range(num_iters): # print(iter) tokenized_code, tokenized_positive_desc, tokenized_negative_desc =\ dataset[iter] code_embedding, desc_embedding, loss = train( model, loss_function, optimiser, tokenized_code, tokenized_positive_desc) current_print_loss += loss current_plot_loss += loss # Print iter number, loss, name and guess if (iter + 1) % print_every == 0: print('%d %d%% (%s) %.4f' % (iter + 1, (iter + 1) / num_iters * 100, timeSince(start), current_print_loss / print_every)) cosine_similarity = cosine_similarity_function( code_embedding, desc_embedding).item() print('Cosine similarity:', cosine_similarity) # print('Cosine similarity:', cosine_similarity, code_embedding, desc_embedding) current_print_loss = 0 # Add current loss avg to list of losses if (iter + 1) % plot_every == 0: torch.save(model.state_dict(), save_path) metrics = evaluate_top_n(model, evaluate_size) metrics.update({'loss': current_plot_loss / plot_every}) all_losses.append(current_plot_loss / plot_every) current_plot_loss = 0 if use_wandb: wandb.log(metrics) return model, current_print_loss, all_losses
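# The train() helper used in the inner loop above is not shown. A minimal sketch
# of one optimisation step under nn.CosineEmbeddingLoss, assuming the model maps a
# (code, description) pair to two equally sized embedding batches; the model call
# signature and the name train_step are assumptions, not taken from UNIFNoAttention:
import torch

def train_step(model, loss_function, optimiser, tokenized_code, tokenized_desc):
    """One step pulling a matching code/description pair together in embedding space."""
    optimiser.zero_grad()
    code_embedding, desc_embedding = model(tokenized_code, tokenized_desc)
    target = torch.ones(code_embedding.size(0))        # +1 marks a positive (matching) pair
    loss = loss_function(code_embedding, desc_embedding, target)
    loss.backward()
    optimiser.step()
    return code_embedding, desc_embedding, loss.item()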
def train( self, train_dataloader, output_dir, show_running_loss=True, eval_dataloader=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args.tensorboard_dir) t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [ p for n, p in model.named_parameters() if n in params ] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend([ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any( nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any( nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ]) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores( **kwargs) if args.wandb_project: wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs) wandb.watch(self.model) if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() model.train() for _ in train_iterator: train_iterator.set_description( f"Epoch {epoch_number + 1} of {args.num_train_epochs}") batch_iterator = tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch if args.fp16: with amp.autocast(): (lm_loss), (mc_loss), *_ = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, lm_labels=lm_labels, ) # model outputs are always tuple in pytorch-transformers (see doc) loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef else: (lm_loss), (mc_loss), *_ = model( input_ids, 
token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, lm_labels=lm_labels, ) # model outputs are always tuple in pytorch-transformers (see doc) loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % current_loss, end="") if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if args.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.wandb_project: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, }) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) self.save_model(output_dir_current, model=model) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model( eval_dataloader, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if args.save_eval_checkpoints: self.save_model(output_dir_current, model=model, results=results) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project: wandb.log( self._get_last_metrics( training_progress_scores)) if not best_eval_metric: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[ args. 
early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if results[ args. early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[ args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self.save_model(output_dir_current, model=model) if args.evaluate_during_training: results, _, _ = self.eval_model( eval_dataloader, verbose=verbose and args.evaluate_during_training_verbose, silent=True, **kwargs, ) self.save_model(output_dir_current, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False) if args.wandb_project: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[ args. early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if results[ args. early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 return global_step, tr_loss / global_step
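# The early-stopping bookkeeping above is duplicated for the minimize and maximize
# cases (and again at epoch end). A compact sketch of the same rule factored into
# one helper; the name and signature are illustrative, not part of the library:
def update_early_stopping(current, best, counter, minimize=True, delta=0.0, patience=3):
    """Return (new_best, new_counter, should_stop) after one evaluation."""
    if best is None:
        return current, 0, False
    improved = (current - best < delta) if minimize else (current - best > delta)
    if improved:
        return current, 0, False          # new best metric: reset the patience counter
    counter += 1
    return best, counter, counter >= patience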