def main(cfg: DictConfig) -> None:
    """
    Run the full training pipeline for a given configuration.

    Sets the seed, builds the train/validation datasets and loaders,
    initializes the model (optionally DataParallel / CUDA), trains it,
    and reports metrics and hyper-parameters to tensorboard.

    :param cfg: configuration object retrieved from the hydra framework
    """
    main_utils.init(cfg)
    logger = TrainLogger(exp_name_prefix=cfg['main']['experiment_name_prefix'],
                         logs_dir=cfg['main']['paths']['logs'])
    logger.write(OmegaConf.to_yaml(cfg))

    # Set seed for results reproduction
    main_utils.set_seed(cfg['main']['seed'])

    # Load dataset
    train_dataset = MyDataset(path=cfg['main']['paths']['train'])
    val_dataset = MyDataset(path=cfg['main']['paths']['validation'])

    train_loader = DataLoader(train_dataset, cfg['train']['batch_size'], shuffle=True,
                              num_workers=cfg['main']['num_workers'])
    # Fix: do not shuffle the validation set -- evaluation metrics are
    # order-independent, so shuffling only adds overhead and non-determinism.
    eval_loader = DataLoader(val_dataset, cfg['train']['batch_size'], shuffle=False,
                             num_workers=cfg['main']['num_workers'])

    # Init model
    model = MyModel(num_hid=cfg['train']['num_hid'], dropout=cfg['train']['dropout'])

    # TODO: Add gpus_to_use
    if cfg['main']['parallel']:
        model = torch.nn.DataParallel(model)

    if torch.cuda.is_available():
        model = model.cuda()

    logger.write(main_utils.get_model_string(model))

    # Run model
    train_params = train_utils.get_train_params(cfg)

    # Report metrics and hyper parameters to tensorboard
    metrics = train(model, train_loader, eval_loader, train_params, logger)
    hyper_parameters = main_utils.get_flatten_dict(cfg['train'])

    logger.report_metrics_hyper_params(hyper_parameters, metrics)
def train(model: nn.Module, train_loader: DataLoader, eval_loader: DataLoader, train_params: TrainParams,
          logger: TrainLogger) -> Metrics:
    """
    Training procedure. Change each part if needed (optimizer, loss, etc.)

    :param model: model to train (moved to CUDA by the caller when available)
    :param train_loader: loader over the training set
    :param eval_loader: loader over the validation set
    :param train_params: hyper-parameters (lr, scheduler, grad clip, epochs, ...)
    :param logger: tensorboard / file logger
    :return: Metrics built from the best eval score, last eval score and last train loss
    """
    metrics = train_utils.get_zeroed_metrics_dict()
    best_eval_score = 0

    # Create optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params.lr)

    # Create learning rate scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=train_params.lr_step_size,
                                                gamma=train_params.lr_gamma)

    for epoch in tqdm(range(train_params.num_epochs)):
        t = time.time()
        metrics = train_utils.get_zeroed_metrics_dict()

        for i, (x, y) in enumerate(train_loader):
            if torch.cuda.is_available():
                x = x.cuda()
                y = y.cuda()

            y_hat = model(x)

            loss = nn.functional.binary_cross_entropy_with_logits(y_hat, y)

            # Optimization step
            optimizer.zero_grad()
            loss.backward()

            # Fix: gradients must be clipped BEFORE optimizer.step(); the
            # original clipped after the step, which cannot affect the update
            # (grads are zeroed on the next iteration). The returned value is
            # the pre-clipping total norm, used below as a training metric.
            metrics['total_norm'] += nn.utils.clip_grad_norm_(model.parameters(), train_params.grad_clip)
            metrics['count_norm'] += 1

            optimizer.step()

            # Calculate metrics
            # NOTE! This function compute scores correctly only for one hot
            # encoding representation of the logits
            batch_score = train_utils.compute_score_with_logits(y_hat, y.data).sum()
            metrics['train_score'] += batch_score.item()

            # loss is mean-reduced, so weight it by the batch size before the
            # dataset-size normalization below
            metrics['train_loss'] += loss.item() * x.size(0)

            # Report model to tensorboard
            if epoch == 0 and i == 0:
                logger.report_graph(model, x)

        # Learning rate scheduler step
        scheduler.step()

        # Calculate metrics
        metrics['train_loss'] /= len(train_loader.dataset)
        metrics['train_score'] /= len(train_loader.dataset)
        metrics['train_score'] *= 100

        norm = metrics['total_norm'] / metrics['count_norm']

        # Switch to eval mode only for the evaluation pass
        model.train(False)
        metrics['eval_score'], metrics['eval_loss'] = evaluate(model, eval_loader)
        model.train(True)

        epoch_time = time.time() - t
        logger.write_epoch_statistics(epoch, epoch_time, metrics['train_loss'], norm,
                                      metrics['train_score'], metrics['eval_score'])

        scalars = {'Accuracy/Train': metrics['train_score'],
                   'Accuracy/Validation': metrics['eval_score'],
                   'Loss/Train': metrics['train_loss'],
                   'Loss/Validation': metrics['eval_loss']}

        logger.report_scalars(scalars, epoch)

        if metrics['eval_score'] > best_eval_score:
            best_eval_score = metrics['eval_score']
            if train_params.save_model:
                logger.save_model(model, epoch, optimizer)

    return get_metrics(best_eval_score, metrics['eval_score'], metrics['train_loss'])
def run(self):
    """
    Main training loop.

    Trains ``self.model`` + ``self.head`` for ``config.epochs`` epochs,
    logging the running loss to tensorboard every ``tensorboard_loss_every``
    steps, evaluating / checkpointing every ``evaluate_every`` steps, and
    applying lr scheduling and optional early stopping per configuration.
    """
    self.model.train()
    self.head.train()
    running_loss = 0.
    step = 0
    val_acc = 0.
    val_loss = 0.
    best_step = 0
    best_acc = float('Inf')
    # For 'max' metrics start at -inf so the first evaluation becomes the best.
    if self.config.max_or_min == 'max':
        best_acc *= -1

    for epoch in range(self.config.epochs):
        train_logger = TrainLogger(self.config.batch_size, self.config.frequency_log)

        if epoch + 1 in self.config.reduce_lr and not self.config.lr_plateau:
            self.reduce_lr()

        for idx, data in enumerate(self.train_loader):
            imgs, labels = data
            imgs = imgs.to(self.config.device)
            labels = labels.to(self.config.device)

            self.optimizer.zero_grad()

            embeddings = self.model(imgs)
            # Recognition heads (e.g. margin-based ones) need labels at forward time.
            if self.config.attribute == 'recognition':
                outputs = self.head(embeddings, labels)
            else:
                outputs = self.head(embeddings)

            if self.weights is not None:
                loss = self.config.loss(outputs, labels, weight=self.weights)
            else:
                loss = self.config.loss(outputs, labels)

            loss.backward()
            running_loss += loss.item()

            self.optimizer.step()

            if step % self.tensorboard_loss_every == 0:
                loss_board = running_loss / self.tensorboard_loss_every
                self.writer.add_scalar('train_loss', loss_board, step)
                running_loss = 0.

            if step % self.evaluate_every == 0 and step != 0:
                if self.config.val_source is not None:
                    val_acc, val_loss = self.evaluate(step)
                    # evaluate() switches to eval mode; restore training mode.
                    self.model.train()
                    self.head.train()
                    best_acc, best_step = self.save_model(val_acc, best_acc, step, best_step)
                    print(f'Best accuracy: {best_acc:.5f} at step {best_step}')
                else:
                    save_state(self.model, self.head, self.optimizer, self.config, 0, step)

            train_logger(epoch, self.config.epochs, idx, len(self.train_loader), loss.item())
            step += 1

        if self.config.lr_plateau:
            self.scheduler.step(val_acc)

        if self.config.early_stop:
            self.early_stop(val_acc)
            if self.early_stop.stop:
                print("Early stopping model...")
                break

    # Final evaluation / checkpoint after training ends.
    val_acc, val_loss = self.evaluate(step)
    # Fix: save_model returns (best_acc, best_step); the original assigned the
    # whole tuple to best_acc, printing a tuple and losing the final best_step.
    best_acc, best_step = self.save_model(val_acc, best_acc, step, best_step)
    print(f'Best accuracy: {best_acc} at step {best_step}')
def main(cfg: DictConfig, preprocess_data=True, create_images_h5_file=True):
    """
    Run the VQA training pipeline following a given configuration.

    :param cfg: configuration file retrieved from hydra framework
    :param preprocess_data: if True, build the datasets from scratch and save
                            them; otherwise load previously saved datasets
    :param create_images_h5_file: if True, (re)create the image h5 files --
                                  only needs to run once
    """
    main_utils.init(cfg)
    logger = TrainLogger(exp_name_prefix=cfg['main']['experiment_name_prefix'],
                         logs_dir=cfg['main']['paths']['logs'])
    logger.write(OmegaConf.to_yaml(cfg))

    # Set seed for results reproduction
    main_utils.set_seed(cfg['main']['seed'])

    # ------Only run 1 time in order to create the h5 file-----:
    # create h5 file with the images, separate files for train, val
    if create_images_h5_file:
        logger.write('--------creating vision files--------')
        start = time.time()
        vision_utils.create_vision_files(cfg)
        logger.write(f'time of creating images files: {time.time()-start}')

    if preprocess_data:
        logger.write('--------preprocess data--------')
        # Load dataset; the train vocabulary (w2idx/idx2w) is reused for val
        # so both splits share the same word indexing.
        train_dataset = MyDataset(cfg, 'train', is_padding=True)
        w2idx, idx2w = train_dataset.w2idx, train_dataset.idx2w
        val_dataset = MyDataset(cfg, 'val', w2idx, idx2w, is_padding=True)
        # save a cPickle
        # with open(cfg['main']["paths"]['train_dataset'], 'wb') as f:
        #     cPickle.dump(train_dataset, f)
        # with open(cfg['main']["paths"]['val_dataset'], 'wb') as f:
        #     cPickle.dump(val_dataset, f)
        # save as torch pth
        train_dataset._save()
        val_dataset._save()
    else:
        logger.write("--------loading datasets--------")
        # load as cPickle
        # train_dataset = cPickle.load(open(cfg['main']["paths"]['train_dataset'], 'rb'))
        # val_dataset = cPickle.load(open(cfg['main']["paths"]['val_dataset'], 'rb'))
        # load as torch pth
        # NOTE(review): torch.load unpickles arbitrary objects -- only load
        # dataset files produced by this pipeline, never untrusted input.
        train_dataset = torch.load(cfg['main']["paths"]['train_dataset'])
        val_dataset = torch.load(cfg['main']["paths"]['val_dataset'])

    logger.write('--------create data loaders--------')
    train_loader = DataLoader(train_dataset, cfg['train']['batch_size'], shuffle=True,
                              num_workers=cfg['main']['num_workers'], collate_fn=main_utils.collate_fn)
    val_loader = DataLoader(val_dataset, cfg['train']['batch_size'], shuffle=True,
                            num_workers=cfg['main']['num_workers'], collate_fn=main_utils.collate_fn)
    # logger.write(f'len of train loader: {len(train_loader) * cfg["train"]["batch_size"]}, '
    #              f'len of val loader: {len(val_loader) * cfg["train"]["batch_size"]}')
    # 2127 val samples dont have answers, train num samples: 443760, val num samples: 214368

    # Init model
    logger.write(f'--------init model---------')
    max_q_len = train_loader.dataset.max_q_length
    num_ans = train_loader.dataset.num_of_ans
    q_name, v_name, vqa_name = cfg['main']['model_names'].values()

    # Train one model per configuration variant.
    # NOTE(review): the original file is collapsed onto single lines, so the
    # exact extent of this loop body is reconstructed -- confirm that all of
    # the setup/training below is intended to run once per model_name.
    for model_name in ['no_pretrain', 'pretrain_4_layers', 'pretrain_8_layers']:
        model = main_utils.init_models(q_name, v_name, vqa_name, cfg, max_q_len, num_ans, model_name).model

        # Add gpus_to_use in cfg- not relevant, we have 1 GPU
        if cfg['main']['parallel']:
            model = torch.nn.DataParallel(model)

        if torch.cuda.is_available():
            model = model.cuda()

        logger.write(main_utils.get_model_string(model))

        # Run model
        logger.write(f'--------train model---------')
        train_params = train_utils.get_train_params(cfg)

        # Report metrics and hyper parameters to tensorboard
        metrics = train(model, train_loader, val_loader, train_params, logger, model_name)
        hyper_parameters = main_utils.get_flatten_dict(cfg['train'])

        logger.report_metrics_hyper_params(hyper_parameters, metrics)
def train(model: nn.Module, train_loader: DataLoader, eval_loader: DataLoader, train_params: TrainParams,
          logger: TrainLogger) -> Metrics:
    """
    Training procedure. Change each part if needed (optimizer, loss, etc.)

    :param model: VQA model taking (image, question, question_len) batches
    :param train_loader: loader over the training set
    :param eval_loader: loader over the validation set
    :param train_params: hyper-parameters (lr, scheduler, grad clip, epochs, ...)
    :param logger: tensorboard / file logger
    :return: Metrics built from the best eval score, last eval score and last train loss
    """
    metrics = train_utils.get_zeroed_metrics_dict()
    best_eval_score = 0

    # Create optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params.lr)

    # Create learning rate scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=train_params.lr_step_size,
                                                gamma=train_params.lr_gamma)

    for epoch in range(train_params.num_epochs):
        print(f"#######epoch {epoch+1}##########")
        t = time.time()
        metrics = train_utils.get_zeroed_metrics_dict()
        # Sum reduction: per-sample losses are normalized by dataset size below.
        bce_loss = nn.BCEWithLogitsLoss(reduction='sum')

        # disable_tqdm is expected to be defined at module level.
        for i, (image, question, question_len, label) in tqdm(enumerate(train_loader), disable=disable_tqdm,
                                                              total=len(train_loader)):
            if torch.cuda.is_available():
                image = image.cuda()
                question = question.cuda()
                question_len = question_len.cuda()
                label = label.cuda()

            y_hat = model(image, question, question_len)
            # Explicit dim avoids the deprecated implicit-dim log_softmax;
            # for the 2-D (batch, num_answers) logits this matches dim=1.
            y_hat_probs = nn.functional.log_softmax(y_hat, dim=-1)
            # target_probs = nn.functional.softmax(label)

            loss = bce_loss(y_hat, label)

            # Optimization step
            optimizer.zero_grad()
            loss.backward()

            # Fix: gradients must be clipped BEFORE optimizer.step(); the
            # original clipped after the step, which cannot affect the update
            # (grads are zeroed on the next iteration). The returned value is
            # the pre-clipping total norm, used below as a training metric.
            metrics['total_norm'] += nn.utils.clip_grad_norm_(model.parameters(), train_params.grad_clip)
            metrics['count_norm'] += 1

            optimizer.step()

            # Calculate accuracy
            batch_score = train_utils.compute_soft_accuracy(y_hat_probs, label)
            metrics['train_score'] += batch_score.item()

            # loss already uses sum reduction, so no batch-size weighting needed
            metrics['train_loss'] += loss.item()

            # Report model to tensorboard
            if epoch == 0 and i == 0:
                logger.report_graph(model, (image, question, question_len))

        # Learning rate scheduler step
        scheduler.step()

        # Calculate metrics
        metrics['train_loss'] /= len(train_loader.dataset)
        metrics['train_score'] /= len(train_loader.dataset)
        metrics['train_score'] *= 100

        norm = metrics['total_norm'] / metrics['count_norm']

        # Switch to eval mode only for the evaluation pass
        model.train(False)
        metrics['eval_score'], metrics['eval_loss'] = evaluate(model, eval_loader)
        model.train(True)

        epoch_time = time.time() - t
        logger.write_epoch_statistics(epoch, epoch_time, metrics['train_loss'], norm,
                                      metrics['train_score'], metrics['eval_score'], metrics['eval_loss'])

        scalars = {'Accuracy/Train': metrics['train_score'],
                   'Accuracy/Validation': metrics['eval_score'],
                   'Loss/Train': metrics['train_loss'],
                   'Loss/Validation': metrics['eval_loss']}

        logger.report_scalars(scalars, epoch)

        if metrics['eval_score'] > best_eval_score:
            best_eval_score = metrics['eval_score']
            if train_params.save_model:
                logger.save_model(model, epoch, optimizer)

    return get_metrics(best_eval_score, metrics['eval_score'], metrics['train_loss'])
def main(cfg: DictConfig) -> None:
    """
    Run the VQA training pipeline following a given configuration.

    Builds the COCO-based train/validation VQA datasets (sharing the train
    vocabulary), creates the loaders and the VQAModel, and trains it,
    reporting metrics and hyper-parameters to tensorboard.

    :param cfg: configuration object retrieved from the hydra framework
    """
    main_utils.init(cfg)
    logger = TrainLogger(exp_name_prefix=cfg['main']['experiment_name_prefix'],
                         logs_dir=cfg['main']['paths']['logs'])
    logger.write(OmegaConf.to_yaml(cfg))

    # Set seed for results reproduction
    main_utils.set_seed(cfg['main']['seed'])

    # Load dataset
    # NOTE(review): image/question paths are hard-coded to the /datashare
    # course environment -- consider moving them into cfg['main']['paths'].
    path_image_train = '/datashare/train2014/COCO_train2014_'
    path_question_train = '/datashare/v2_OpenEnded_mscoco_train2014_questions.json'
    train_dataset = VQADataset(path_answers=cfg['main']['paths']['train'],
                               path_image=path_image_train,
                               path_questions=path_question_train)

    path_image_val = '/datashare/val2014/COCO_val2014_'
    path_question_val = '/datashare/v2_OpenEnded_mscoco_val2014_questions.json'
    # Reuse the train vocabulary so both splits share the same word indexing.
    val_dataset = VQADataset(path_answers=cfg['main']['paths']['validation'],
                             path_image=path_image_val,
                             path_questions=path_question_val,
                             word_dict=train_dataset.word_dict)

    train_loader = DataLoader(train_dataset, cfg['train']['batch_size'], shuffle=True,
                              num_workers=cfg['main']['num_workers'])
    # Fix: do not shuffle the validation set -- evaluation metrics are
    # order-independent, so shuffling only adds overhead and non-determinism.
    eval_loader = DataLoader(val_dataset, cfg['train']['batch_size'], shuffle=False,
                             num_workers=cfg['main']['num_workers'])

    image_dim = train_dataset.pic_size
    output_dim = 2410  # number of possible answers (answer-vocabulary size)
    model = VQAModel(batch_size=cfg['train']['batch_size'],
                     word_vocab_size=train_dataset.vocab_size,
                     lstm_hidden=cfg['train']['num_hid'],
                     output_dim=output_dim,
                     dropout=cfg['train']['dropout'],
                     word_embedding_dim=cfg['train']['word_embedding_dim'],
                     question_output_dim=cfg['train']['question_output_dim'],
                     image_dim=image_dim,
                     last_hidden_fc_dim=cfg['train']['last_hidden_fc_dim'])

    if cfg['main']['parallel']:
        model = torch.nn.DataParallel(model)

    if torch.cuda.is_available():
        model = model.cuda()

    logger.write(main_utils.get_model_string(model))

    # Run model
    train_params = train_utils.get_train_params(cfg)

    # Report metrics and hyper parameters to tensorboard
    metrics = train(model, train_loader, eval_loader, train_params, logger)
    hyper_parameters = main_utils.get_flatten_dict(cfg['train'])

    logger.report_metrics_hyper_params(hyper_parameters, metrics)