model_name = f'models/CUT{n_epoch}.pt'
for epoch in range(start_epoch, n_epoch + 1):
    start_time = time.time()
    train_loss, train_loss_G, train_loss_D = train(
        G, H, D, optimizer_G, optimizer_H, optimizer_D, criterion_GAN,
        criterion_NCE, layers_nce, train_loader, device, epoch,
        args.log_interval)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # print statistics and update training progress
    print(f'Epoch: {epoch:02} | Time: {epoch_mins}m {epoch_secs}s')
    im_fake_B = visualize(G)

    if args.tensorboard:
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalars('Loss/G_D', {
            'G': train_loss_G,
            'D': train_loss_D
        }, epoch)
        writer.add_image('Image/Fake Dog', im_fake_B, epoch)

    # log results to model dictionary
    model_dict['train_loss']['G'].append(train_loss_G)
    model_dict['train_loss']['D'].append(train_loss_D)
    model_dict['train_loss']['total'].append(train_loss)
    model_dict['metrics']['last']['loss'] = train_loss
    model_dict['metrics']['last']['epoch'] = epoch

    if epoch == 1 or train_loss < model_dict['metrics']['best']['loss']:
        model_dict['model_state_dict'] = G.state_dict()
        model_dict['optimizer_state_dict'] = optimizer_G.state_dict()
        model_dict['metrics']['best']['epoch'] = epoch
        model_dict['metrics']['best']['loss'] = train_loss

    if args.save:
        torch.save(model_dict, model_name)
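# The loop above relies on an epoch_time() helper that is not shown here. A
# minimal sketch consistent with its usage (whole minutes plus leftover whole
# seconds) might look like this -- an assumption, not the original helper:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time                   # wall-clock seconds
    elapsed_mins = int(elapsed_time / 60)                  # whole minutes
    elapsed_secs = int(elapsed_time - elapsed_mins * 60)   # remaining seconds
    return elapsed_mins, elapsed_secs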
def main():
    writer = SummaryWriter()
    data_loader = configure_data()
    g = GeneratorNet().to(device)
    d = DiscriminatorNet().to(device)
    loss_func = nn.BCELoss().to(device)
    optimizer_g = opt.Adam(g.parameters(), lr=args.g_lr, betas=(args.b1, args.b2))
    optimizer_d = opt.Adam(d.parameters(), lr=args.d_lr, betas=(args.b1, args.b2))

    for e in range(args.epoch):
        total_d_loss = torch.tensor(0.0).to(device)
        total_r_loss = torch.tensor(0.0).to(device)
        total_f_loss = torch.tensor(0.0).to(device)
        total_g_loss = torch.tensor(0.0).to(device)
        for i, (imgs, _) in enumerate(data_loader):
            fake = torch.zeros(imgs.shape[0], 1).to(device)
            real = torch.ones(imgs.shape[0], 1).to(device)
            noise_a = torch.randn(imgs.shape[0], args.noise_dim).to(device)
            noise_b = torch.randn(imgs.shape[0], args.noise_dim).to(device)
            # samples for the discriminator update do not need generator gradients
            with torch.no_grad():
                gen_pictures_a = g(noise_a)
            # samples for the generator update must keep the graph through g
            gen_pictures_b = g(noise_b)
            real_pictures = imgs.to(device)

            # Train D
            gen_scores = d(gen_pictures_a)
            real_scores = d(real_pictures)
            optimizer_d.zero_grad()
            r_loss = loss_func(real_scores, real)
            f_loss = loss_func(gen_scores, fake)
            d_loss = r_loss + f_loss
            # detach before accumulating so the running totals do not retain graphs
            total_d_loss += d_loss.detach()
            total_r_loss += r_loss.detach()
            total_f_loss += f_loss.detach()
            d_loss.backward()
            optimizer_d.step()

            # Train G
            optimizer_g.zero_grad()
            # g_loss = -loss_func(d(gen_pictures_b.detach()), fake)
            g_loss = loss_func(d(gen_pictures_b), real)
            total_g_loss += g_loss.detach()
            g_loss.backward()
            optimizer_g.step()

            print(f"[Epoch:{e+1}] [Batch:{i+1}] "
                  f"[D loss:{d_loss.item()}] [G loss:{g_loss.item()}]")

            batches_done = e * len(data_loader) + i
            if batches_done % args.save_interval == 0:
                save_image(gen_pictures_a[np.random.randint(0, args.batch_size // 2, size=25)],
                           f"images/{batches_done}.png", nrow=5, normalize=True)

        writer.add_scalars('loss', {
            'd_loss_expectation': total_d_loss / len(data_loader),
            'real_loss_expectation': total_r_loss / len(data_loader),
            'fake_loss_expectation': total_f_loss / len(data_loader),
            'g_loss_expectation': total_g_loss / len(data_loader)
        }, e)
        if (e + 1) % 50 == 0:
            torch.save(g.state_dict(), f"models/g{e+1}.pth")
            torch.save(d.state_dict(), f"models/d{e+1}.pth")
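# configure_data() is referenced above but not defined in this snippet. Since
# the loop unpacks (imgs, _) batches and flattens args.noise_dim noise, a
# plausible MNIST-style loader (purely an assumption, not the original code) is:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

def configure_data():
    tfm = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,))])
    dataset = datasets.MNIST('data', train=True, download=True, transform=tfm)
    # drop_last keeps every batch at args.batch_size, matching the save_image indexing
    return DataLoader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)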
class ExperimentBuilder(object): def __init__(self, args, data, model, device): """ Initializes an experiment builder using a named tuple (args), a data provider (data), a meta learning system (model) and a device (e.g. gpu/cpu/n) :param args: A namedtuple containing all experiment hyperparameters :param data: A data provider of instance MetaLearningSystemDataLoader :param model: A meta learning system instance :param device: Device/s to use for the experiment """ self.args, self.device = args, device self.model = model ( self.saved_models_filepath, self.logs_filepath, self.samples_filepath, ) = build_experiment_folder(experiment_name=self.args.experiment_name) self.per_task_performance = defaultdict(lambda: 0) self.total_losses = dict() self.state = dict() self.state["best_val_loss"] = 10**6 self.state["best_val_accuracy"] = 0 self.state["best_val_iter"] = 0 self.state["current_iter"] = 0 self.start_epoch = 0 self.num_epoch_no_improvements = 0 self.patience = args.patience self.create_summary_csv = False self.writer = SummaryWriter("runs/{}".format( self.args.experiment_name)) if self.args.continue_from_epoch == "from_scratch": self.create_summary_csv = True elif self.args.continue_from_epoch == "latest": checkpoint = os.path.join(self.saved_models_filepath, "train_model_latest") print("attempting to find existing checkpoint", ) if os.path.exists(checkpoint): self.state = self.model.load_model( model_save_dir=self.saved_models_filepath, model_name="train_model", model_idx="latest", ) self.start_epoch = int(self.state["current_iter"] / self.args.total_iter_per_epoch) else: self.args.continue_from_epoch = "from_scratch" self.create_summary_csv = True elif int(self.args.continue_from_epoch) >= 0: self.state = self.model.load_model( model_save_dir=self.saved_models_filepath, model_name="train_model", model_idx=self.args.continue_from_epoch, ) self.start_epoch = int(self.state["current_iter"] / self.args.total_iter_per_epoch) self.data = data(args=args, current_iter=self.state["current_iter"]) self.idx_to_class_name = self.data.dataset.load_from_json( self.data.dataset.index_to_label_name_dict_file) print("train_seed {}, val_seed: {}, at start time".format( self.data.dataset.seed["train"], self.data.dataset.seed["val"])) self.total_epochs_before_pause = self.args.total_epochs_before_pause self.state["best_epoch"] = int(self.state["best_val_iter"] / self.args.total_iter_per_epoch) self.epoch = int(self.state["current_iter"] / self.args.total_iter_per_epoch) self.start_time = time.time() self.epochs_done_in_this_run = 0 print( self.state["current_iter"], int(self.args.total_iter_per_epoch * self.args.total_epochs), ) if self.epoch == 0: for param_name, param in self.model.named_parameters(): self.writer.add_histogram(param_name, param, 0) self.writer.flush() def build_summary_dict(self, total_losses, phase, summary_losses=None): """ Builds/Updates a summary dict directly from the metric dict of the current iteration. :param total_losses: Current dict with total losses (not aggregations) from experiment :param phase: Current training phase :param summary_losses: Current summarised (aggregated/summarised) losses stats means, stdv etc. :return: A new summary dict with the updated summary statistics information. 
""" if summary_losses is None: summary_losses = dict() for key in total_losses: summary_losses["{}_{}_mean".format(phase, key)] = np.mean( total_losses[key]) summary_losses["{}_{}_std".format(phase, key)] = np.std(total_losses[key]) return summary_losses def build_loss_summary_string(self, summary_losses): """ Builds a progress bar summary string given current summary losses dictionary :param summary_losses: Current summary statistics :return: A summary string ready to be shown to humans. """ output_update = "" for key, value in zip(list(summary_losses.keys()), list(summary_losses.values())): if "loss" in key or "accuracy" in key: value = float(value) output_update += "{}: {:.4f}, ".format(key, value) return output_update def merge_two_dicts(self, first_dict, second_dict): """Given two dicts, merge them into a new dict as a shallow copy.""" z = first_dict.copy() z.update(second_dict) return z def write_task_lang_log(self, log): """ Writes the log from a train iteration in tidy format to the task/lang log file :param log: list containing [task name, language, iteration, support loss, support accuracy, query loss, query accuracy] :return: """ for line in log: save_statistics(self.logs_filepath, line, filename="task_lang_log.csv", create=False) def train_iteration( self, train_sample, sample_idx, epoch_idx, total_losses, current_iter, pbar_train, ): """ Runs a training iteration, updates the progress bar and returns the total and current epoch train losses. :param train_sample: A sample from the data provider :param sample_idx: The index of the incoming sample, in relation to the current training run. :param epoch_idx: The epoch index. :param total_losses: The current total losses dictionary to be updated. :param current_iter: The current training iteration in relation to the whole experiment. :param pbar_train: The progress bar of the training. 
        :return: Updates total_losses, train_losses, current_iter
        """
        (
            x_support_set,
            len_support_set,
            x_target_set,
            len_target_set,
            y_support_set,
            y_target_set,
            selected_classes,
            seed,
        ) = train_sample
        # Get teacher names and languages
        teacher_names, langs = zip(*[t.split("_") for t in selected_classes])
        data_batch = (
            x_support_set,
            len_support_set,
            x_target_set,
            len_target_set,
            y_support_set,
            y_target_set,
            selected_classes,
        )
        losses, task_lang_log = self.model.run_train_iter(
            data_batch=data_batch, epoch=epoch_idx)
        for log, lang in zip(task_lang_log, langs):
            log.insert(1, lang)
        self.write_task_lang_log(task_lang_log)
        for key, value in zip(list(losses.keys()), list(losses.values())):
            if key not in total_losses:
                total_losses[key] = [float(value)]
            else:
                total_losses[key].append(float(value))
        train_losses = self.build_summary_dict(total_losses=total_losses,
                                               phase="train")
        train_output_update = self.build_loss_summary_string(losses)
        pbar_train.update(1)
        pbar_train.set_description("training phase {} -> {}".format(
            self.epoch, train_output_update))
        current_iter += 1
        return train_losses, total_losses, current_iter

    def full_task_set_evaluation(self, epoch, set_name="val", **kwargs):
        if set_name == "test":
            print("Loading best model for evaluation..")
            self.model.load_model(
                model_save_dir=self.saved_models_filepath,
                model_name="train_model",
                model_idx="best",
            )
        set_meta_loss_back = False
        if self.model.meta_loss.lower() == "kl" and self.args.val_using_cross_entropy:
            # Use cross entropy on gold labels as no teacher encoding is available
            self.model.meta_loss = "ce"
            set_meta_loss_back = True
        # list sets in dev set
        val_tasks = list(self.data.dataset.task_set_sizes[set_name].keys())
        # generate seeds
        seeds = [42 + i for i in range(self.args.num_evaluation_seeds)]
        per_val_set_performance = {k: [] for k in val_tasks}
        # perform finetuning and evaluation
        result = {}
        losses = []
        accuracies = []
        saved_already = False
        for task_name in val_tasks:
            for seed in seeds:
                print("Evaluating {} with seed {}...".format(task_name, seed))
                train_dataloader, dev_dataloader = self.data.get_finetune_dataloaders(
                    task_name, 0, seed)
                _, best_loss, curr_loss, accuracy = self.model.finetune_epoch(
                    None,
                    self.model.classifier.config,
                    train_dataloader,
                    dev_dataloader,
                    task_name=task_name,
                    epoch=epoch,
                    eval_every=1,
                    model_save_dir=self.saved_models_filepath,
                    best_loss=0,
                )
                per_val_set_performance[task_name].append(accuracy)
                accuracies.append(accuracy)
                losses.append(curr_loss)
            # Store and compare performance per validation task
            avg_accuracy = np.mean(per_val_set_performance[task_name])
            if avg_accuracy > self.per_task_performance[task_name]:
                print("New best performance for task", task_name)
                self.per_task_performance[task_name] = avg_accuracy
                self.state["best_epoch_{}".format(task_name)] = int(
                    self.state["current_iter"] / self.args.total_iter_per_epoch)
        result["{}_accuracy_mean".format(set_name)] = np.mean(accuracies)
        result["{}_accuracy_std".format(set_name)] = np.std(accuracies)
        result["{}_loss_mean".format(set_name)] = np.mean(losses)
        result["{}_loss_std".format(set_name)] = np.std(losses)
        if set_meta_loss_back:
            self.model.meta_loss = "kl"
        return result

    def evaluation_iteration(self, val_sample, total_losses, pbar_val, phase):
        """
        Runs a validation iteration, updates the progress bar and returns the total and current epoch val losses.
        :param val_sample: A sample from the data provider
        :param total_losses: The current total losses dictionary to be updated.
        :param pbar_val: The progress bar of the val stage.
:return: The updated val_losses, total_losses """ ( x_support_set, len_support_set, x_target_set, len_target_set, y_support_set, y_target_set, selected_classes, seed, ) = val_sample # Convert selected_classes to their pretrained directories if self.args.sets_are_pre_split: teacher_names = [t.split("_")[0] for t in selected_classes] else: teacher_names = [ self.idx_to_class_name[selected_class].split("_")[0] for selected_class in selected_classes ] data_batch = ( x_support_set, len_support_set, x_target_set, len_target_set, y_support_set, y_target_set, teacher_names, ) losses = self.model.run_validation_iter(data_batch=data_batch) for key, value in losses.items(): if key not in total_losses: total_losses[key] = [float(value)] else: total_losses[key].append(float(value)) val_losses = self.build_summary_dict(total_losses=total_losses, phase=phase) val_output_update = self.build_loss_summary_string(losses) pbar_val.update(1) pbar_val.set_description("val_phase {} -> {}".format( self.epoch, val_output_update)) return val_losses, total_losses def test_evaluation_iteration(self, val_sample, pbar_test): """ Runs a validation iteration, updates the progress bar and returns the total and current epoch val losses. :param val_sample: A sample from the data provider :param total_losses: The current total losses dictionary to be updated. :param pbar_test: The progress bar of the val stage. :return: The updated val_losses, total_losses """ ( x_support_set, len_support_set, x_target_set, len_target_set, y_support_set, y_target_set, selected_classes, seed, ) = val_sample # Convert selected_classes to their pretrained directories if self.args.sets_are_pre_split: teacher_names = [t.split("_")[0] for t in selected_classes] else: teacher_names = [ self.idx_to_class_name[selected_class].split("_")[0] for selected_class in selected_classes ] data_batch = ( x_support_set, len_support_set, x_target_set, len_target_set, y_support_set, y_target_set, teacher_names, ) losses = self.model.run_validation_iter(data_batch=data_batch) test_output_update = self.build_loss_summary_string(losses) pbar_test.update(1) pbar_test.set_description("test_phase {} -> {}".format( self.epoch, test_output_update)) return losses def save_models(self, model, epoch, state, new_best): """ Saves two separate instances of the current model. One to be kept for history and reloading later and another one marked as "latest" to be used by the system for the next epoch training. Useful when the training/val process is interrupted or stopped. Leads to fault tolerant training and validation systems that can continue from where they left off before. :param model: Current meta learning model of any instance within the few_shot_learning_system.py :param epoch: Current epoch :param state: Current model and experiment state dict. :param new best: Only save double copy of model when it performs better than all previous models """ print("New best: ", new_best) if new_best: model.save_model( model_save_dir=os.path.join(self.saved_models_filepath, "train_model_best"), state=state, ) model.save_model( model_save_dir=os.path.join(self.saved_models_filepath, "train_model_latest"), state=state, ) print("saved models to", self.saved_models_filepath) def pack_and_save_metrics(self, start_time, create_summary_csv, train_losses, val_losses, state): """ Given current epochs start_time, train losses, val losses and whether to create a new stats csv file, pack stats and save into a statistics csv file. Return a new start time for the new epoch. 
:param start_time: The start time of the current epoch :param create_summary_csv: A boolean variable indicating whether to create a new statistics file or append results to existing one :param train_losses: A dictionary with the current train losses :param val_losses: A dictionary with the currrent val loss :return: The current time, to be used for the next epoch. """ epoch_summary_losses = self.merge_two_dicts(first_dict=train_losses, second_dict=val_losses) if "per_epoch_statistics" not in state: state["per_epoch_statistics"] = dict() for key, value in epoch_summary_losses.items(): if key not in state["per_epoch_statistics"]: state["per_epoch_statistics"][key] = [value] else: state["per_epoch_statistics"][key].append(value) epoch_summary_string = self.build_loss_summary_string( epoch_summary_losses) epoch_summary_losses["epoch"] = self.epoch epoch_summary_losses["epoch_run_time"] = time.time() - start_time if create_summary_csv: self.summary_statistics_filepath = save_statistics( self.logs_filepath, list(epoch_summary_losses.keys()), create=True) self.create_summary_csv = False start_time = time.time() print("epoch {} -> {}".format(epoch_summary_losses["epoch"], epoch_summary_string)) self.summary_statistics_filepath = save_statistics( self.logs_filepath, list(epoch_summary_losses.values())) return start_time, state def evaluate_test_set_using_the_best_models(self, top_n_models): per_epoch_statistics = self.state["per_epoch_statistics"] val_acc = np.copy(per_epoch_statistics["val_loss_mean"]) val_idx = np.array([i for i in range(len(val_acc))]) sorted_idx = np.argsort(val_acc, axis=0).astype(dtype=np.int32)[:top_n_models] sorted_val_acc = val_acc[sorted_idx] val_idx = val_idx[sorted_idx] print(sorted_idx) print(sorted_val_acc) top_n_idx = val_idx[:top_n_models] per_model_per_batch_loss = [[] for i in range(top_n_models)] # per_model_per_batch_targets = [[] for i in range(top_n_models)] test_losses = [dict() for i in range(top_n_models)] for idx, model_idx in enumerate(top_n_idx): self.state = self.model.load_model( model_save_dir=self.saved_models_filepath, model_name="train_model", model_idx=model_idx + 1, ) with tqdm.tqdm(total=int(self.args.num_evaluation_tasks / self.args.batch_size)) as pbar_test: for sample_idx, test_sample in enumerate( self.data.get_test_batches( total_batches=int(self.args.num_evaluation_tasks / self.args.batch_size), augment_images=False, )): # print(test_sample[4]) # per_model_per_batch_targets[idx].extend(np.array(test_sample[3])) per_model_per_batch_loss = self.test_evaluation_iteration( val_sample=test_sample, sample_idx=sample_idx, model_idx=idx, per_model_per_batch_preds=per_model_per_batch_loss, pbar_test=pbar_test, ) per_batch_loss = np.mean(per_model_per_batch_loss, axis=0) loss = np.mean(per_batch_loss) loss_std = np.std(per_batch_loss) test_losses = {"test_loss_mean": loss, "test_loss_std": loss_std} _ = save_statistics( self.logs_filepath, list(test_losses.keys()), create=True, filename="test_summary.csv", ) summary_statistics_filepath = save_statistics( self.logs_filepath, list(test_losses.values()), create=False, filename="test_summary.csv", ) print(test_losses) print("saved test performance at", summary_statistics_filepath) def prep_finetuning( self, task_name, is_baseline, percentage_train, seed, ): """ Takes the best performing model and fine-tunes it using all available data for a task :param task_name: :return: """ # Get dataloader with all task data train_dataloader, dev_dataloader = self.data.get_finetune_dataloaders( task_name, 
percentage_train, seed) ############################# # Load the model to finetune ############################# if is_baseline: teacher_name = (task_name.split("_")[0].replace("val/", "").replace( "train/", "")) model = AutoModelForSequenceClassification.from_pretrained( os.path.join(self.args.teacher_dir, teacher_name), output_hidden_states=False, ) return train_dataloader, dev_dataloader, model else: per_epoch_statistics = self.state["per_epoch_statistics"] val_acc = np.copy(per_epoch_statistics["val_loss_mean"]) # Load the best scoring model model_idx = np.argsort(val_acc, axis=0).astype(dtype=np.int32)[0] sorted_val_acc = val_acc[model_idx] print("Loading model {} with validation loss {}".format( model_idx, sorted_val_acc)) self.state = self.model.load_model( model_save_dir=self.saved_models_filepath, model_name="train_model", model_idx="best", # model_idx + 1, ) del self.state return train_dataloader, dev_dataloader, self.model.classifier def run_experiment(self): """ Runs a full training experiment with evaluations of the model on the val set at every epoch. Furthermore, will return the test set evaluation results on the best performing validation model. """ # pr = cProfile.Profile() # pr.enable() with tqdm.tqdm( initial=self.state["current_iter"], total=int(self.args.total_iter_per_epoch * self.args.total_epochs), ) as pbar_train: while (self.state["current_iter"] < (self.args.total_epochs * self.args.total_iter_per_epoch) ) and (self.args.evaluate_on_test_set_only == False): for train_sample_idx, train_sample in enumerate( self.data.get_train_batches( total_batches=int(self.args.total_iter_per_epoch * self.args.total_epochs) - self.state["current_iter"])): ( train_losses, total_losses, self.state["current_iter"], ) = self.train_iteration( train_sample=train_sample, total_losses=self.total_losses, epoch_idx=(self.state["current_iter"] / self.args.total_iter_per_epoch), pbar_train=pbar_train, current_iter=self.state["current_iter"], sample_idx=self.state["current_iter"], ) if self.state[ "current_iter"] % self.args.total_iter_per_epoch == 0: # pr.disable() # pr.print_stats() epoch = (self.state["current_iter"] // self.args.total_iter_per_epoch) total_losses = dict() val_losses = dict() new_best = False if (self.args.eval_using_full_task_set ): # evaluate on the whole available task set val_losses = self.full_task_set_evaluation( epoch=epoch) else: # evaluate in few-shot fashion/ on query set only with tqdm.tqdm(total=int( self.args.num_evaluation_tasks / self.args.batch_size)) as pbar_val: for _, val_sample in enumerate( self.data. 
get_val_batches(total_batches=int(
                                        self.args.num_evaluation_tasks /
                                        self.args.batch_size))):
                                (
                                    val_losses,
                                    total_losses,
                                ) = self.evaluation_iteration(
                                    val_sample=val_sample,
                                    total_losses=total_losses,
                                    pbar_val=pbar_val,
                                    phase="val",
                                )

                        # Write metrics to tensorboard
                        self.writer.add_scalars(
                            "loss",
                            {
                                "train": train_losses["train_loss_mean"],
                                "val": val_losses["val_loss_mean"],
                            },
                            epoch,
                        )
                        self.writer.add_scalars(
                            "Accuracy",
                            {
                                "train": train_losses["train_accuracy_mean"],
                                "val": val_losses["val_accuracy_mean"],
                            },
                            epoch,
                        )
                        # log weight distributions and gradients of slow weights
                        for param_name, param in self.model.named_parameters():
                            self.writer.add_histogram(param_name, param, epoch)
                        self.writer.flush()

                        if (val_losses["val_accuracy_mean"] >
                                self.state["best_val_accuracy"]):
                            self.num_epoch_no_improvements = 0
                            new_best = True
                            print(
                                "Best validation accuracy",
                                val_losses["val_accuracy_mean"],
                                "with loss",
                                val_losses["val_loss_mean"],
                            )
                            self.state["best_val_accuracy"] = val_losses[
                                "val_accuracy_mean"]
                            self.state["best_val_iter"] = self.state["current_iter"]
                            self.state["best_epoch"] = int(
                                self.state["best_val_iter"] /
                                self.args.total_iter_per_epoch)
                        else:
                            self.num_epoch_no_improvements += 1
                        self.epoch += 1

                        self.state = self.merge_two_dicts(
                            first_dict=self.merge_two_dicts(
                                first_dict=self.state, second_dict=train_losses),
                            second_dict=val_losses,
                        )
                        self.save_models(
                            model=self.model,
                            epoch=self.epoch,
                            state=self.state,
                            new_best=new_best,
                        )
                        self.start_time, self.state = self.pack_and_save_metrics(
                            start_time=self.start_time,
                            create_summary_csv=self.create_summary_csv,
                            train_losses=train_losses,
                            val_losses=val_losses,
                            state=self.state,
                        )
                        self.total_losses = dict()
                        self.epochs_done_in_this_run += 1

                        save_to_json(
                            filename=os.path.join(self.logs_filepath,
                                                  "summary_statistics.json"),
                            dict_to_store=self.state["per_epoch_statistics"],
                        )

                        if (self.epochs_done_in_this_run >=
                                self.total_epochs_before_pause):
                            print("Pause time, evaluating on test set...")
                            print(
                                self.full_task_set_evaluation(set_name="test",
                                                              epoch=self.epoch))
                            print("train_seed {}, val_seed: {}, at pause time".format(
                                self.data.dataset.seed["train"],
                                self.data.dataset.seed["val"],
                            ))
                            sys.exit()

                        if self.num_epoch_no_improvements > self.patience:
                            print("{} epochs no improvement, early stopping applied."
                                  .format(self.num_epoch_no_improvements))
                            print(
                                self.full_task_set_evaluation(set_name="test",
                                                              epoch=self.epoch))
                            print("train_seed {}, val_seed: {}, at pause time".format(
                                self.data.dataset.seed["train"],
                                self.data.dataset.seed["val"],
                            ))
                            sys.exit()

        print(self.full_task_set_evaluation(epoch=self.epoch, set_name="test"))
        test_avg_loss += avg1_loss.item() * batch_size
        test_max_loss += max1_loss.item() * batch_size
        test_concat_loss += concat_loss.item() * batch_size
        test_metric_loss += metric_loss.item() * batch_size

    test_acc = float(test_correct) / total
    test_loss = test_loss / total
    test_avg_loss = test_avg_loss / total
    test_max_loss = test_max_loss / total
    test_concat_loss = test_concat_loss / total
    test_metric_loss = 5.0 * test_metric_loss / total
    print("epoch:{} - test loss: {:.3f} and test acc: {:.3f} total sample:{}".format(
        epoch, test_loss, test_acc, total))
    write.add_scalars("Loss", {'train': train_loss, "test": test_loss}, epoch)
    write.add_scalars("AVG_loss", {'train': train_avg_loss, "test": test_avg_loss}, epoch)
    write.add_scalars("MAX_loss", {'train': train_max_loss, "test": test_max_loss}, epoch)
    write.add_scalars("Cat_loss", {'train': train_concat_loss, "test": test_concat_loss}, epoch)
    write.add_scalars("Metric_loss", {'train': train_metric_loss, "test": test_metric_loss}, epoch)
def train(model, training_data, validation_data, optimizer, device, opt):
    ''' Start training '''
    # Use tensorboard to plot curves, e.g. perplexity, accuracy, learning rate
    if opt.use_tb:
        from torch.utils.tensorboard import SummaryWriter
        tb_writer = SummaryWriter(
            log_dir=os.path.join(opt.output_dir, 'tensorboard'))

    log_train_file = os.path.join(opt.output_dir, 'train.log')
    log_valid_file = os.path.join(opt.output_dir, 'valid.log')

    print('[Info] Training performance will be written to file: {} and {}'.format(
        log_train_file, log_valid_file))

    with open(log_train_file, 'w') as log_tf, open(log_valid_file, 'w') as log_vf:
        log_tf.write('epoch,loss,ppl,accuracy\n')
        log_vf.write('epoch,loss,ppl,accuracy\n')

    def print_performances(header, ppl, accu, start_time, lr, num_parameters):
        print('  - {header:12} ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, lr: {lr:8.5f}, '
              'elapse: {elapse:3.3f} min, ParameterNumber: {num_parameters: 8.2f}'.format(
                  header=f"({header})", ppl=ppl, accu=100 * accu,
                  elapse=(time.time() - start_time) / 60, lr=lr,
                  num_parameters=num_parameters))

    # valid_accus = []
    valid_losses = []
    for epoch_i in range(opt.epoch):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu = train_epoch(model, training_data, optimizer,
                                             opt, device, smoothing=opt.label_smoothing)
        train_ppl = math.exp(min(train_loss, 100))
        # Current learning rate
        lr = optimizer._optimizer.param_groups[0]['lr']
        # Calculate the number of model parameters
        num_parameters = count_parameters(model)
        print_performances('Training', train_ppl, train_accu, start, lr, num_parameters)

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, device, opt)
        valid_ppl = math.exp(min(valid_loss, 100))
        # Calculate the number of model parameters
        num_parameters = count_parameters(model)
        print_performances('Validation', valid_ppl, valid_accu, start, lr, num_parameters)

        valid_losses += [valid_loss]

        checkpoint = {'epoch': epoch_i, 'settings': opt, 'model': model.state_dict()}

        if opt.save_mode == 'all':
            model_name = 'model_accu_{accu:3.3f}.chkpt'.format(accu=100 * valid_accu)
            torch.save(checkpoint, os.path.join(opt.output_dir, model_name))
        elif opt.save_mode == 'best':
            model_name = 'model.chkpt'
            if valid_loss <= min(valid_losses):
                torch.save(checkpoint, os.path.join(opt.output_dir, model_name))
                print('  - [Info] The checkpoint file has been updated.')

        with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
            log_tf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=train_loss, ppl=train_ppl, accu=100 * train_accu))
            log_vf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=valid_loss, ppl=valid_ppl, accu=100 * valid_accu))

        if opt.use_tb:
            tb_writer.add_scalars('ppl', {'train': train_ppl, 'val': valid_ppl}, epoch_i)
            tb_writer.add_scalars('accuracy', {'train': train_accu * 100,
                                               'val': valid_accu * 100}, epoch_i)
            tb_writer.add_scalar('learning_rate', lr, epoch_i)
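# count_parameters(model) is called above but not defined in this snippet. A
# common implementation, scaled to millions to match the '{: 8.2f}' format used
# in print_performances (an assumption about the original helper):
def count_parameters(model):
    # count only trainable parameters, reported in millions
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6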
class Dense_U_Net_lidar_Agent: def __init__(self, config=None, torchvision_init=True): ''' Handles everything - training, validation testing - checkpoint loading and saving - logging | tensorboard summaries Accordingly everything is specified here - model - loss - optimizer - lr scheduling Arguments: torchvision_init: boolean - True: load densenet state dict from torchvision - False: load checkpoint; if no checkpoint just normal init ''' self.logger = logging.getLogger('Agent') # model and config if lazy self.model = densenet121_u_lidar(pretrained=torchvision_init, config=config) # in case config is empty it is created in model self.config = self.model.config # dataloader self.data_loader = WaymoDataset_Loader(self.config) # pixel-wise cross-entropy loss self.loss = torch.nn.BCEWithLogitsLoss(reduction='none').cuda() # optimizer self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.optimizer.learning_rate, betas=(self.config.optimizer.beta1, self.config.optimizer.beta2), eps=self.config.optimizer.eps, weight_decay=self.config.optimizer.weight_decay, amsgrad=self.config.optimizer.amsgrad) # learning rate decay scheduler if self.config.optimizer.lr_scheduler.want: self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=self.config.optimizer.lr_scheduler.every_n_epochs, gamma=self.config.optimizer.lr_scheduler.gamma) # initialize counters; updated in load_checkpoint self.current_epoch = 0 self.current_train_iteration = 0 self.current_val_iteration = 0 self.best_val_iou = 0 # if cuda is available export model to gpu self.cuda = torch.cuda.is_available() if self.cuda: self.device = torch.device('cuda') torch.cuda.manual_seed_all(self.config.agent.seed) self.logger.info('Operation will be on *****GPU-CUDA***** ') else: self.device = torch.device('cpu') torch.manual_seed(self.config.agent.seed) self.logger.info('Operation will be on *****CPU***** ') self.model = self.model.to(self.device) self.loss = self.loss.to(self.device) if not torchvision_init: self.load_checkpoint() # Tensorboard Writers Path(self.config.dir.current_run.summary).mkdir(exist_ok=True, parents=True) self.train_summary_writer = SummaryWriter(log_dir=self.config.dir.current_run.summary, comment='Dense_U_Net') self.val_summary_writer = SummaryWriter(log_dir=self.config.dir.current_run.summary, comment='Dense_U_Net') def save_checkpoint(self, filename='checkpoint.pth.tar', is_best=False): ''' Saving the latest checkpoint of the training Arguments: filename: filename which will contain the state is_best: flag is it is the best model ''' #aggregate important data state = { self.config.agent.checkpoint.epoch: self.current_epoch, self.config.agent.checkpoint.train_iteration: self.current_train_iteration, self.config.agent.checkpoint.val_iteration: self.current_val_iteration, self.config.agent.checkpoint.best_val_iou: self.best_val_iou, self.config.agent.checkpoint.state_dict: self.model.state_dict(), self.config.agent.checkpoint.optimizer: self.optimizer.state_dict() } if is_best: filename = self.config.agent.best_checkpoint_name # create dir if not exists Path(self.config.dir.current_run.checkpoints).mkdir(exist_ok=True, parents=True) # Save the state torch.save(state, os.path.join(self.config.dir.current_run.checkpoints, filename)) def load_checkpoint(self, filename=None): ''' load checkpoint from file should contain following keys: 'epoch', 'iteration', 'best_val_iou', 'state_dict', 'optimizer' where state_dict is model statedict and optimizer is optimizer statesict Arguments: filename: 
        only name with file type extension | path in config.dir.current_run.checkpoints
        '''
        # use best if not specified
        if filename is None:
            filename = self.config.agent.best_checkpoint_name

        # load according to key
        filepath = os.path.join(self.config.dir.current_run.checkpoints, filename)
        try:
            self.logger.info('Loading checkpoint {}'.format(filename))
            checkpoint = torch.load(filepath)

            self.current_epoch = checkpoint[self.config.agent.checkpoint.epoch]
            self.current_train_iteration = checkpoint[
                self.config.agent.checkpoint.train_iteration]
            self.current_val_iteration = checkpoint[
                self.config.agent.checkpoint.val_iteration]
            self.best_val_iou = checkpoint[
                self.config.agent.checkpoint.best_val_iou]
            self.model.load_state_dict(checkpoint[
                self.config.agent.checkpoint.state_dict])
            self.optimizer.load_state_dict(checkpoint[
                self.config.agent.checkpoint.optimizer])

            self.logger.info('Checkpoint loaded successfully from {} at (epoch {}) at (iteration {})\n'
                             .format(self.config.dir.current_run.checkpoints,
                                     checkpoint['epoch'], checkpoint['train_iteration']))
        except OSError:
            warnings.warn('No checkpoint exists from {}. Skipping...'.format(filepath))
            self.logger.info('No checkpoint exists from {}. Skipping...'.format(filepath))
            self.logger.info('**First time to train**')

    def run(self):
        '''
        starts training or testing: specify under config.loader.mode
        can handle keyboard interrupt
        '''
        print('starting ' + self.config.loader.mode + ' at ' + str(datetime.now()))
        try:
            if self.config.loader.mode == 'test':
                with torch.no_grad():
                    self.validate()
            else:
                self.train()
        except KeyboardInterrupt:
            self.logger.info('You have entered CTRL+C.. Wait to finalize')

    def train(self):
        '''
        training one epoch at a time
        validating after each epoch
        saving checkpoint after each epoch
        check if val acc is best and store separately
        '''
        # add selected loss and optimizer to config | not added in init as they may be changed before training
        self.config.loss.func = str(self.loss)
        self.config.optimizer.func = str(self.optimizer)

        # make sure to remember the hyper params
        self.add_hparams_summary_writer()
        self.save_hparams_json()

        # Iterate epochs | train one epoch | validate | save checkpoint
        for epoch in range(self.current_epoch, self.config.agent.max_epoch):
            self.current_epoch = epoch
            self.train_one_epoch()

            with torch.no_grad():
                avg_val_iou_per_class = self.validate()
            val_iou = sum(avg_val_iou_per_class) / len(avg_val_iou_per_class)

            is_best = val_iou > self.best_val_iou
            if is_best:
                self.best_val_iou = val_iou
            self.save_checkpoint(is_best=is_best)

        self.train_summary_writer.close()
        self.val_summary_writer.close()

    def train_one_epoch(self):
        '''
        One epoch training function
        '''
        # Initialize progress visualization and get batch
        tqdm_batch = tqdm(self.data_loader.train_loader,
                          total=self.data_loader.train_iterations,
                          desc='Epoch-{}-'.format(self.current_epoch))

        # Set the model to be in training mode
        self.model.train()

        # metric counters
        current_batch = 0
        number_of_batches = self.data_loader.train_loader.dataset.__len__()
        epoch_loss = torch.zeros((number_of_batches, self.config.model.num_classes)).to(self.device)
        epoch_iou = torch.zeros((number_of_batches, self.config.model.num_classes))
        epoch_iou_nans = torch.zeros((number_of_batches, self.config.model.num_classes))
        epoch_acc = torch.zeros((number_of_batches, self.config.model.num_classes)).to(self.device)

        for image, lidar, ht_map in tqdm_batch:
            # push to gpu if possible
            if self.cuda:
                image = image.cuda(non_blocking=self.config.loader.async_loading)
                lidar = lidar.cuda(non_blocking=self.config.loader.async_loading)
                ht_map = ht_map.cuda(non_blocking=self.config.loader.async_loading)

            # forward pass
            prediction = self.model(image, lidar)

            # pixel-wise loss
            current_loss = self.loss(prediction, ht_map)
            loss_per_class = torch.sum(current_loss.detach(), dim=(0, 2, 3))
            epoch_loss[current_batch, :] = loss_per_class

            # whole image IoU per class; NaNs are excluded from the mean and counted separately
            iou_per_instance_per_class = utils.compute_IoU_whole_img_batch(
                prediction.detach(), ht_map.detach(), self.config.agent.iou_threshold)
            iou_per_class = torch.tensor(np.nanmean(iou_per_instance_per_class, axis=0))
            iou_per_class[torch.isnan(iou_per_class)] = 0
            epoch_iou[current_batch, :] = iou_per_class
            epoch_iou_nans[current_batch, :] = torch.sum(
                torch.isnan(iou_per_instance_per_class), axis=0)

            # compute class-wise accuracy of current batch
            acc_per_class = utils.compute_accuracy(ht_map.detach(), prediction.detach(),
                                                   self.config.agent.iou_threshold)
            epoch_acc[current_batch, :] = acc_per_class

            # backprop
            self.optimizer.zero_grad()
            current_loss.backward(torch.ones_like(current_loss.detach(),
                                                  device=self.device))  # , retain_graph=True?
            self.optimizer.step()

            # logging for visualization during training: separate plots for loss, acc, iou | class-wise + overall
            loss_dict = {
                'Vehicle': loss_per_class[0],
                'Pedestrian': loss_per_class[1],
                'Cyclist': loss_per_class[2],
                'Overall': torch.mean(loss_per_class)
            }
            self.train_summary_writer.add_scalars('Training/Loss', loss_dict,
                                                  self.current_train_iteration)
            acc_dict = {
                'Vehicle': acc_per_class[0],
                'Pedestrian': acc_per_class[1],
                'Cyclist': acc_per_class[2],
                'Overall': torch.mean(acc_per_class)
            }
            self.train_summary_writer.add_scalars('Training/Accuracy', acc_dict,
                                                  self.current_train_iteration)
            iou_dict = {
                'Vehicle': iou_per_class[0],
                'Pedestrian': iou_per_class[1],
                'Cyclist': iou_per_class[2],
                'Overall': torch.mean(iou_per_class)
            }
            self.train_summary_writer.add_scalars('Training/IoU', iou_dict,
                                                  self.current_train_iteration)

            # counters
            self.current_train_iteration += 1
            current_batch += 1

        tqdm_batch.close()

        # learning rate decay update after each epoch
        if self.config.optimizer.lr_scheduler.want:
            self.lr_scheduler.step()

        # log
        avg_epoch_loss = torch.mean(epoch_loss, axis=0).tolist()
        avg_epoch_iou = torch.mean(epoch_iou, axis=0).tolist()
        cum_epoch_nans = torch.sum(epoch_iou_nans, axis=0).tolist()
        avg_epoch_acc = torch.mean(epoch_acc, axis=0).tolist()
        self.logger.info('Training at Epoch-' + str(self.current_epoch) + ' | ' +
                         'Average Loss: ' + str(avg_epoch_loss) + ' | ' +
                         'Average IoU: ' + str(avg_epoch_iou) + ' | ' +
                         'Number of NaNs: ' + str(cum_epoch_nans) + ' | ' +
                         'Average Accuracy: ' + str(avg_epoch_acc))

    def validate(self):
        '''
        One epoch validation
        return: average IoU per class
        '''
        # Initialize progress visualization and get batch
        # !self.data_loader.valid_loader works for both valid and test
        tqdm_batch = tqdm(self.data_loader.valid_loader,
                          total=self.data_loader.valid_iterations,
                          desc='Validation at -{}-'.format(self.current_epoch))

        # set the model to evaluation mode
        self.model.eval()

        # metric counters
        current_batch = 0
        number_of_batches = self.data_loader.valid_loader.dataset.__len__()
        epoch_loss = torch.zeros((number_of_batches, self.config.model.num_classes)).to(self.device)
        epoch_iou = torch.zeros((number_of_batches, self.config.model.num_classes))
        epoch_iou_nans = torch.zeros((number_of_batches, self.config.model.num_classes))
        epoch_acc = torch.zeros((number_of_batches, self.config.model.num_classes)).to(self.device)

        for image, lidar, ht_map in tqdm_batch:
            # push to gpu if possible
            if self.cuda:
                image = image.cuda(non_blocking=self.config.loader.async_loading)
                lidar = lidar.cuda(non_blocking=self.config.loader.async_loading)
                ht_map = ht_map.cuda(non_blocking=self.config.loader.async_loading)

            # forward pass
            prediction = self.model(image, lidar)

            # pixel-wise loss
            current_loss = self.loss(prediction, ht_map)
            loss_per_class = torch.sum(current_loss.detach(), dim=(0, 2, 3))
            epoch_loss[current_batch, :] = loss_per_class

            # whole image IoU per class; NaNs are excluded from the mean and counted separately
            iou_per_instance_per_class = utils.compute_IoU_whole_img_batch(
                prediction.detach(), ht_map.detach(), self.config.agent.iou_threshold)
            iou_per_class = torch.tensor(np.nanmean(iou_per_instance_per_class, axis=0))
            iou_per_class[torch.isnan(iou_per_class)] = 0
            epoch_iou[current_batch, :] = iou_per_class
            epoch_iou_nans[current_batch, :] = torch.sum(
                torch.isnan(iou_per_instance_per_class), axis=0)

            # compute class-wise accuracy of current batch
            acc_per_class = utils.compute_accuracy(ht_map.detach(), prediction.detach(),
                                                   self.config.agent.iou_threshold)
            epoch_acc[current_batch, :] = acc_per_class

            # logging for visualization during validation: separate plots for loss, acc, iou | class-wise + overall
            loss_dict = {
                'Vehicle': loss_per_class[0],
                'Pedestrian': loss_per_class[1],
                'Cyclist': loss_per_class[2],
                'Overall': torch.mean(loss_per_class)
            }
            self.val_summary_writer.add_scalars('Validation/Loss', loss_dict,
                                                self.current_val_iteration)
            acc_dict = {
                'Vehicle': acc_per_class[0],
                'Pedestrian': acc_per_class[1],
                'Cyclist': acc_per_class[2],
                'Overall': torch.mean(acc_per_class)
            }
            self.val_summary_writer.add_scalars('Validation/Accuracy', acc_dict,
                                                self.current_val_iteration)
            iou_dict = {
                'Vehicle': iou_per_class[0],
                'Pedestrian': iou_per_class[1],
                'Cyclist': iou_per_class[2],
                'Overall': torch.mean(iou_per_class)
            }
            self.val_summary_writer.add_scalars('Validation/IoU', iou_dict,
                                                self.current_val_iteration)

            # counters
            self.current_val_iteration += 1
            current_batch += 1

        # log
        avg_epoch_loss = torch.mean(epoch_loss, axis=0).tolist()
        avg_epoch_iou = torch.mean(epoch_iou, axis=0).tolist()
        cum_epoch_nans = torch.sum(epoch_iou_nans, axis=0).tolist()
        avg_epoch_acc = torch.mean(epoch_acc, axis=0).tolist()
        self.logger.info('Validation at Epoch-' + str(self.current_epoch) + ' | ' +
                         'Average Loss: ' + str(avg_epoch_loss) + ' | ' +
                         'Average IoU: ' + str(avg_epoch_iou) + ' | ' +
                         'Number of NaNs: ' + str(cum_epoch_nans) + ' | ' +
                         'Average Accuracy: ' + str(avg_epoch_acc))
        tqdm_batch.close()

        return avg_epoch_iou

    def add_hparams_summary_writer(self):
        '''
        Add Hyperparameters to tensorboard summary writers using .add_hparams
        Can be accessed under the Hyperparameter tab in Tensorboard
        '''
        hyper_params = {
            'loss_func': self.config.loss.func,
            'loss_alpha': torch.tensor(self.config.loss.alpha),
            'loss_gamma': torch.tensor(self.config.loss.gamma),
            'loss_skip_v_every_n_its': self.config.loss.skip_v_every_n_its,
            'loss_skip_p_every_n_its': self.config.loss.skip_p_every_n_its,
            'loss_skip_b_every_n_its': self.config.loss.skip_b_every_n_its,
            'optimizer': self.config.optimizer.func,
            'learning_rate': self.config.optimizer.learning_rate,
            'beta1': self.config.optimizer.beta1,
            'beta2': self.config.optimizer.beta2,
            'eps': self.config.optimizer.eps,
            'amsgrad': self.config.optimizer.amsgrad,
            'weight_decay': self.config.optimizer.weight_decay,
            'lr_scheduler': self.config.optimizer.lr_scheduler.want,
            'lr_scheduler_every_n_epochs':
self.config.optimizer.lr_scheduler.every_n_epochs, 'lr_scheduler_gamma': self.config.optimizer.lr_scheduler.gamma, } self.train_summary_writer.add_hparams(hyper_params, {}) self.val_summary_writer.add_hparams(hyper_params, {}) def save_hparams_json(self): ''' Uses config information to generate a hyperparameter dict and saves it as a json file into the current_run directory ''' hparams = { 'loss': self.config.loss.__dict__, 'optimizer': self.config.optimizer.__dict__ } utils.save_json_file(os.path.join(self.config.dir.current_run.summary, 'hyperparams.json'), hparams , indent=4) def finalize(self): ''' Close all Writers and print time ''' self.logger.info('Please wait while finalizing the operation.. Thank you') self.train_summary_writer.close() self.val_summary_writer.close() print('ending ' + self.config.loader.mode + ' at ' + str(datetime.now()))
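# Usage sketch (the entry point is not part of this snippet; the driver shown
# here is an assumption): construct the agent, run the mode configured under
# config.loader.mode, and finalize to close both SummaryWriters.
if __name__ == '__main__':
    agent = Dense_U_Net_lidar_Agent(config=None, torchvision_init=True)
    agent.run()       # dispatches to train() or validate() and catches CTRL+C
    agent.finalize()  # closes the train/val writers and prints the end time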
# set new states to current states for determining next actions states = next_states # Update episode score for each agent agent_scores += reward if iteration_step % iteration_interval == 0: buildings_reward_dict = {} building_idx = 1 for building in reward: buildings_reward_dict["Building {}".format( building_idx)] = building building_idx += 1 # Building reward writer.add_scalars("Reward/Buildings", buildings_reward_dict, iteration_step) agent_scores_dict = {} agent_idx = 1 for agentS in agent_scores: agent_scores_dict["Agent {}".format(agent_idx)] = agentS agent_idx += 1 # Agent scores #writer.add_scalars("Scores/Agents", agent_scores_dict, iteration_step) # Plot losses for critic and actor if agent.critic_loss is not None: writer.add_scalar("Losses/Critic Loss", agent.critic_loss, iteration_step) if agent.actor_loss is not None: writer.add_scalar("Losses/Actor Loss", agent.actor_loss,
def train(data_root, epochs, log_dir):
    input_transforms = torchvision.transforms.Compose([
        lambda x: x / 255.0,
        torchvision.transforms.ToTensor(),
        lambda x: x.type(torch.FloatTensor)
    ])
    # Skip the first 2 elements in the label (the ID and the age) and subtract 1
    # so the 1/2-coded attributes become binary 0/1 targets
    target_transforms = torchvision.transforms.Compose([lambda x: x[2:] - 1])
    loaders = {}
    for mode in _modes():
        dataset = Market1501Dataset(root=data_root,
                                    train=mode == 'train',
                                    input_transforms=input_transforms,
                                    target_transforms=target_transforms)
        loaders[mode] = torch.utils.data.DataLoader(dataset=dataset,
                                                    batch_size=_batch(mode),
                                                    drop_last=True)

    net = EDNet(input_shape=(3, 128, 64), num_classes=26, num_downsamples=3)
    print(net)
    net = net.to(DEVICE)

    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)

    global_step = 0
    writer = SummaryWriter(
        log_dir=os.path.join(log_dir, datetime.now().strftime('%Y-%m-%d-%H:%M:%S')),
        flush_secs=20,
        filename_suffix=datetime.now().strftime('%Y-%m-%d-%H:%M:%S'))

    for epoch in range(epochs):
        running_loss = {'train': 0.0, 'test': 0.0}
        for mode in _modes():
            print("Running {} on {} batches".format(mode, len(loaders[mode])))
            if mode == "test":
                net.eval()
            else:
                net.train()
            for i, data in enumerate(loaders[mode]):
                inputs, labels = data
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                running_loss[mode] += loss.item()
                if mode == "train":
                    loss.backward()
                    optimizer.step()
                if mode == "train" and i % 500 == 0:
                    writer.add_scalar('loss/500th_iter_train_loss', loss.item(),
                                      global_step)
                    print("Training loss, iter {}: {}".format(
                        i, running_loss['train'] / (i + 1)))
            if mode == 'train':
                global_step += 1
            writer.add_scalars('loss/epoch_loss', {
                '{}_loss'.format(mode): running_loss[mode] / len(loaders[mode])
            }, global_step)
        print("Epoch {}: Train Loss {}, Validation Loss {}.".format(
            epoch,
            running_loss['train'] / len(loaders['train']),
            running_loss['test'] / len(loaders['test'])))
    writer.close()
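# _modes() and _batch(mode) are used above but not shown. Given how they are
# consumed (iterating 'train'/'test' and choosing a per-mode batch size), a
# plausible definition -- an assumption, not the original code -- is:
def _modes():
    return ['train', 'test']

def _batch(mode):
    # evaluation can afford larger batches since no gradients are kept
    return 32 if mode == 'train' else 64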
class Trainer: def __init__(self, kwargs): kwargs["env_cls"] = Atari env = kwargs["env_cls"](kwargs["env_id"]) kwargs["state_shape"] = env.observation_space.shape kwargs["state_dtype"] = np.uint8 kwargs["n_actions"] = env.action_space.n kwargs["device"] = torch.device(kwargs["device_id"]) env.close() self.__dict__.update(kwargs) self.agent = DQNAgent(**kwargs) self.writer = SummaryWriter("./log/") self.cuda_eval = torch.cuda.Stream(self.device) mem_kwargs = dict( capacity=self.mem_capacity, history_len=self.history_len, state_shape=self.state_shape, state_dtype=self.state_dtype, batch_sz=self.batch_sz, alpha=self.mem_alpha, beta=LinearScheduler(self.mem_beta, 1., self.train_steps), priority_eps=self.mem_priority_eps, priority_upper=self.mem_priority_upper, prioritized_replay=self.prioritized_replay, device=self.device, ) mem_cls = PrioritizedReplayMemory if self.prioritized_replay else UniformReplayMemory self.mem = mem_cls(**mem_kwargs) self.mem_lock = Lock() self.sync = Queue(maxsize=1) self.sync.put(None) def play_thread(self): env = self.env_cls(self.env_id) terminal = True eps = LinearScheduler(self.eps_init, self.eps_final, self.eps_steps) behavior = list() with torch.cuda.stream(torch.cuda.Stream(self.device)): for global_step in range(-self.mem_init_sz, self.train_steps + 1): if terminal: state = env.reset() actions, mu, sigma = self.agent.policy( np.expand_dims(state, 0), training=True, eps=eps.get() if global_step > 0 else 1., return_streams=True, ) action = actions[0] if mu is not None and sigma is not None: mu = mu.cpu()[0] behavior.append(mu.argmax(0).item() != action) state, reward, terminal, lost_live = env.step(action) with self.mem_lock: self.mem.put(state[-2], action, np.sign(reward), terminal or lost_live) if global_step < 0: continue eps.step() if global_step % self.optimize_freq == 0: try: self.sync.get(block=True, timeout=10.) except Empty: continue if len(behavior) > 0: if self.adaptive_eps is not None and global_step % self.adaptive_freq == 0: real_eps = np.mean(behavior[-self.adaptive_freq:]) self.agent.c += 0.01 * np.sign(self.adaptive_eps - real_eps) self.agent.c = max(0.01, self.agent.c) if global_step % self.log_freq == 0: if self.adaptive_eps is not None: self.write(self.agent.c, "c", global_step) self.write(np.mean(behavior), "behavior", global_step) behavior = list() env.close() def train(self): Thread(target=self.play_thread, ).start() self.sync.put(None) start_t = datetime.now() for global_step in range(0, self.train_steps + 1): if global_step % self.print_freq == 0: step_time = (datetime.now() - start_t) / self.print_freq start_t = datetime.now() print( "every {} steps {}\t4M {}\t200M {}\tremain {}M,{}".format( self.optimize_freq, step_time * self.optimize_freq, step_time * 10**6, step_time * (50 * 10**6), (self.train_steps - global_step) * 4 // 10**6, step_time * (self.train_steps - global_step), )) if global_step % self.update_target_freq == 0: self.agent.update_target() if global_step % self.eval_freq == 0: self.agent.update_eval() eval_thread = Thread(target=self.eval, args=(global_step, )) eval_thread.start() if global_step % self.optimize_freq == 0: try: self.sync.put(None, block=True, timeout=10.) 
                except Full:
                    continue
                with self.mem_lock:
                    batch = self.mem.sample()
                idx, td_err = self.agent.optimize(*batch)
                if self.prioritized_replay:
                    with self.mem_lock:
                        self.mem.update_priority(idx, np.abs(td_err.cpu().numpy()))
                self.sync.task_done()
        eval_thread.join()
        return

    def eval(self, global_step):
        eval_func = dict(
            frames=self.eval_by_frames,
            episodes=self.eval_by_episodes,
        )[self.eval_method]
        reward = eval_func()
        self.write(reward, "reward", global_step)
        self.writer.flush()
        return

    def eval_by_episodes(self):
        n_trials = self.eval_episodes
        envs = [Atari(self.env_id) for _ in range(n_trials)]
        states = np.stack([u.reset() for u in envs])
        actions = np.empty(n_trials, dtype=np.int64)
        reward = np.zeros(n_trials, dtype=np.float32)
        terminal = np.zeros(n_trials, dtype=bool)
        with torch.cuda.stream(self.cuda_eval):
            while not terminal.all():
                not_t = ~terminal
                actions[not_t] = self.agent.policy(
                    states=states[not_t],
                    training=False,
                    eps=self.eps_eval,
                    return_streams=False,
                )
                for i, nt in enumerate(not_t):
                    if nt:
                        states[i], r, terminal[i], _ = envs[i].step(actions[i])
                        reward[i] += r
        for e in envs:
            e.close()
        return np.mean(reward)

    def eval_by_frames(self):
        rewards = list()
        reward = 0.
        env = Atari(self.env_id)
        state = env.reset()
        with torch.cuda.stream(self.cuda_eval):
            for step in range(self.eval_frames // 4):
                action = self.agent.policy(
                    np.expand_dims(state, 0),
                    training=False,
                    eps=self.eps_eval,
                    return_streams=False,
                )[0]
                state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    rewards.append(reward)
                    reward = 0.
                    state = env.reset()
        env.close()
        return np.mean(rewards)

    def write(self, value, category, step):
        frm_idx = step * 4
        self.writer.add_scalars(
            main_tag="{}/{}".format(category, self.env_id),
            tag_scalar_dict={self.label: value},
            global_step=frm_idx,
        )
        if not os.path.exists(CSV_FOLDER):
            os.makedirs(CSV_FOLDER)
        path = os.path.join(
            CSV_FOLDER,
            "{}--{}--{}.csv".format(category, self.env_id, self.label),
        )
        has_header = os.path.exists(path)
        with open(path, "a") as fp:
            if not has_header:
                fp.write("frame (millions), {}\n".format(category))
            fp.write("{:.2f}, {:.3f}\n".format(frm_idx / 10**6, value))
        return
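# LinearScheduler is used above for the exploration eps and the replay beta but
# is not defined in this snippet. A minimal version consistent with the observed
# calls (constructor(start, end, steps), .get(), .step()) -- an assumption:
class LinearScheduler:
    def __init__(self, start, end, steps):
        self.value = start
        self.end = end
        self.delta = (end - start) / steps  # per-step increment

    def get(self):
        return self.value

    def step(self):
        # move linearly toward the end value and clamp once it is reached
        self.value += self.delta
        if (self.delta > 0 and self.value > self.end) or \
           (self.delta < 0 and self.value < self.end):
            self.value = self.end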
class SelfPlay:
    """
    Class which runs in a dedicated thread to play games and save them to the replay buffer.
    """
    def __init__(self, initial_weights, game, config, test=False, idx=-1, render=False):
        self.config: MuZeroConfigBase = config
        self.game = game
        self.idx = idx
        self.episode = 0
        self.render = render
        self.writer = SummaryWriter(self.config.results_path / f"self_play_{idx}")

        # Initialize the network
        self.model = models.MuZeroExtendedNetwork(
            self.config.observation_shape,
            len(self.config.action_space),
            self.config.encoding_size,
            self.config.hidden_size,
        )
        self.model.set_weights(initial_weights)
        self.model.to(torch.device("cpu"))
        self.model.eval()

        self.continuous_self_play(test)

    def continuous_self_play(self, test_mode=False):
        while True:
            if self.config.v_self_play_count.value > 0:
                # Update the model if the trainer is running
                self.model.set_weights(self.config.q_weights.get())

            # Take the best action (no exploration) in test mode
            temperature = (
                0 if test_mode else self.config.visit_softmax_temperature_fn(
                    trained_steps=self.config.v_training_step.value))

            game_history = self.play_game(temperature, False)

            # Save to the shared storage
            score = sum(game_history.rewards)
            self.writer.add_scalars(
                f"1.Total reward/{'test' if test_mode else 'train'}",
                {f"env_{self.idx}": score},
                global_step=self.episode,
            )
            self.episode += 1
            if test_mode:
                self.config.v_total_reward.value = int(score)
            if not test_mode:
                self.config.q_save_game.put(game_history)

            if not test_mode and self.config.self_play_delay:
                time.sleep(self.config.self_play_delay)

    def play_game(self, temperature, render: bool = None):
        """
        Play one game with actions based on the Monte Carlo tree search at each move.
        """
        if render is None:
            render = self.render

        game_history = GameHistory()
        observation = self.game.reset()
        game_history.observation_history.append(observation)
        done = False

        with torch.no_grad():
            while not done and len(game_history.action_history) < self.config.max_moves:
                root = MCTS(self.config).run(
                    self.model, observation, self.game.to_play(),
                    True if temperature else False, self.game)
                action = select_action(root, temperature, self.game)
                observation, reward, done = self.game.step(action)

                if render:
                    self.game.render()

                game_history.observation_history.append(observation)
                game_history.rewards.append(reward)
                game_history.action_history.append(action)
                game_history.store_search_statistics(root, self.config.action_space)

        self.game.close()
        return game_history
def train_model(model, dataloaders, criterion, optimizer, num_epochs, scheduler, string_name, device): logger = SummaryWriter() best_model_wts = copy.deepcopy(model.state_dict()) best_rank1_v, best_rank5_v = 0.0, 0.0 feats_val, labels_val = torch.ones(len(dataloaders['val'].dataset), 1024), torch.ones( len(dataloaders['val'].dataset)) for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) for phase in ['train', 'val']: # configure model functionality - train/val if phase == 'train': model.train() running_loss, nr_batch_triplets = (0.0, 0) else: model.eval() for index, (inputs, labels) in enumerate(dataloaders[phase]): inputs, labels = inputs.to(device), labels.to(device) optimizer.zero_grad() embeddings = F.adaptive_avg_pool2d( F.relu(model.features(inputs), inplace=True), (1, 1)).view(inputs.size(0), -1) # for each batch update if phase == 'train': with torch.set_grad_enabled(phase == 'train'): batch_triplets = triplet_selector.get_triplets( embeddings, labels) nr_batch_triplets += batch_triplets.size(0) loss = criterion(embeddings[batch_triplets[:, 0]], embeddings[batch_triplets[:, 1]], embeddings[batch_triplets[:, 2]]) running_loss += batch_triplets.size(0) * loss.item() loss.backward() optimizer.step() else: with torch.no_grad(): feats_val[index * dataloaders[phase]. batch_size:dataloaders[phase].batch_size * (index + 1)] = embeddings labels_val[index * dataloaders[phase]. batch_size:dataloaders[phase].batch_size * (index + 1)] = labels # for each epoch if phase == 'train': epoch_loss = running_loss / nr_batch_triplets print('{} Triplet Loss: {:.4f} Informative triplets: {:.4f}'. format( phase, epoch_loss, round(nr_batch_triplets / len(dataloaders[phase])))) else: rank1_v, rank5_v = compute_rank(feats_val, labels_val, dataloaders[phase], device) print('{} Rank-1: {:.4f} Rank-5: {:.4f}'.format( phase, rank1_v, rank5_v)) scheduler.step() # deep copy the model if phase == 'val' and rank1_v > best_rank1_v: best_rank1_v, best_rank5_v = rank1_v, rank5_v best_model_wts = copy.deepcopy(model.state_dict()) torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict() }, string_name + '.pt') logger.add_scalars( string_name, { 'train_loss': epoch_loss, 'nr_triplets': nr_batch_triplets, 'rank1_val': rank1_v, 'rank5_val': rank5_v }, epoch + 1) logger.close() print( 'Training Finished. Best Validation Rank-1: {:4f} and Best val Rank-5: {:4f} ' .format(best_rank1_v, best_rank5_v)) # load best model weights model.load_state_dict(best_model_wts) return model
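# compute_rank(feats, labels, loader, device) is called above but not defined in
# this snippet. A minimal rank-1/rank-5 retrieval metric over the validation
# embeddings (assumption: each sample queries all others, excluding itself):
import torch

def compute_rank(feats, labels, loader=None, device=None):
    dist = torch.cdist(feats, feats)                # pairwise Euclidean distances
    dist.fill_diagonal_(float('inf'))               # a query must not match itself
    _, nn_idx = dist.topk(5, dim=1, largest=False)  # five nearest neighbours
    hits = labels[nn_idx] == labels.unsqueeze(1)    # label agreement per neighbour
    rank1 = hits[:, 0].float().mean().item()        # nearest neighbour correct
    rank5 = hits.any(dim=1).float().mean().item()   # any of top-5 correct
    return rank1, rank5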
class BaseSolver():
    '''
    Prototype Solver for all kinds of tasks
    Arguments
        config - yaml-styled config
        paras  - argparse outcome
    '''
    def __init__(self, config, paras, mode):
        # General Settings
        self.config = config
        self.paras = paras
        self.mode = mode
        for k, v in default_hparas.items():
            setattr(self, k, v)
        self.device = torch.device('cuda') if self.paras.gpu and \
            torch.cuda.is_available() else torch.device('cpu')
        self.amp = paras.amp

        # Name experiment
        self.exp_name = paras.name
        if self.exp_name is None:
            # By default, exp is named after config file
            self.exp_name = paras.config.split('/')[-1].replace('.yaml', '')
            if mode == 'train':
                self.exp_name += '_sd{}'.format(paras.seed)

        # Plugin list
        self.emb_decoder = None

        if mode == 'train':
            # Filepath setup
            os.makedirs(paras.ckpdir, exist_ok=True)
            self.ckpdir = os.path.join(paras.ckpdir, self.exp_name)
            os.makedirs(self.ckpdir, exist_ok=True)

            # Logger settings
            self.logdir = os.path.join(paras.logdir, self.exp_name)
            self.log = SummaryWriter(self.logdir, flush_secs=self.TB_FLUSH_FREQ)
            self.timer = Timer()

            # Hyperparameters
            self.step = 0
            self.valid_step = config['hparas']['valid_step']
            self.max_step = config['hparas']['max_step']

            self.verbose('Exp. name : {}'.format(self.exp_name))
            self.verbose('Loading data... a large corpus may take a while.')

        elif mode == 'test':
            # Output path
            os.makedirs(paras.outdir, exist_ok=True)
            self.ckpdir = os.path.join(paras.outdir, self.exp_name)

            # Load training config to get acoustic feat, text encoder and build model
            self.src_config = yaml.load(open(config['src']['config'], 'r'),
                                        Loader=yaml.FullLoader)
            self.paras.load = config['src']['ckpt']

            self.verbose('Evaluating result of tr. config @ {}'.format(
                config['src']['config']))

    def backward(self, loss):
        '''
        Standard backward step with self.timer and debugger
        Arguments
            loss - the loss to perform loss.backward()
        '''
        self.timer.set()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.GRAD_CLIP)
        if math.isnan(grad_norm):
            self.verbose('Error : grad norm is NaN @ step ' + str(self.step))
        else:
            self.optimizer.step()
        self.timer.cnt('bw')
        return grad_norm

    def load_ckpt(self):
        ''' Load ckpt if --load option is specified '''
        if self.paras.load:
            # Load weights
            ckpt = torch.load(
                self.paras.load,
                map_location=self.device if self.mode == 'train' else 'cpu')
            self.model.load_state_dict(ckpt['model'])
            # if self.emb_decoder is not None:
            #     self.emb_decoder.load_state_dict(ckpt['emb_decoder'])
            # if self.amp:
            #     amp.load_state_dict(ckpt['amp'])

            # Load task-dependent items
            if self.mode == 'train':
                self.step = ckpt['global_step']
                self.optimizer.load_opt_state_dict(ckpt['optimizer'])
                self.verbose('Load ckpt from {}, restarting at step {}'.format(
                    self.paras.load, self.step))
            else:
                for k, v in ckpt.items():
                    if type(v) is float:
                        metric, score = k, v
                self.model.eval()
                if self.emb_decoder is not None:
                    self.emb_decoder.eval()
                self.verbose(
                    'Evaluation target = {} (recorded {} = {:.2f} %)'.format(
                        self.paras.load, metric, score))

    def verbose(self, msg):
        ''' Verbose function for printing information to stdout '''
        if self.paras.verbose:
            if type(msg) == list:
                for m in msg:
                    print('[INFO]', m.ljust(100))
            else:
                print('[INFO]', msg.ljust(100))

    def progress(self, msg):
        ''' Verbose function for updating progress on stdout (do not include newline) '''
        if self.paras.verbose:
            sys.stdout.write("\033[K")  # Clear line
            print('[{}] {}'.format(human_format(self.step), msg), end='\r')

    def write_log(self, log_name, log_dict):
        '''
        Write log to TensorBoard
            log_name - <str> Name of tensorboard
variable log_value - <dict>/<array> Value of variable (e.g. dict of losses), passed if value = None ''' if type(log_dict) is dict: log_dict = { key: val for key, val in log_dict.items() if (val is not None and not math.isnan(val)) } if log_dict is None: pass elif len(log_dict) > 0: if 'align' in log_name or 'spec' in log_name: img, form = log_dict self.log.add_image(log_name, img, global_step=self.step, dataformats=form) elif 'text' in log_name or 'hyp' in log_name: self.log.add_text(log_name, log_dict, self.step) else: self.log.add_scalars(log_name, log_dict, self.step) def save_checkpoint(self, f_name, metric, score, show_msg=True): '''' Ckpt saver f_name - <str> the name phnof ckpt file (w/o prefix) to store, overwrite if existed score - <float> The value of metric used to evaluate model ''' ckpt_path = os.path.join(self.ckpdir, f_name) full_dict = { "model": self.model.state_dict(), "optimizer": self.optimizer.get_opt_state_dict(), "global_step": self.step, metric: score } # Additional modules to save # if self.amp: # full_dict['amp'] = self.amp_lib.state_dict() if self.emb_decoder is not None: full_dict['emb_decoder'] = self.emb_decoder.state_dict() torch.save(full_dict, ckpt_path) if show_msg: self.verbose( "Saved checkpoint (step = {}, {} = {:.2f}) and status @ {}". format(human_format(self.step), metric, score, ckpt_path)) def enable_apex(self): if self.amp: # Enable mixed precision computation (ToDo: Save/Load amp) from apex import amp self.amp_lib = amp self.verbose( "AMP enabled (check https://github.com/NVIDIA/apex for more details)." ) self.model, self.optimizer.opt = self.amp_lib.initialize( self.model, self.optimizer.opt, opt_level='O1') # ----------------------------------- Abtract Methods ------------------------------------------ # @abc.abstractmethod def load_data(self): ''' Called by main to load all data After this call, data related attributes should be setup (e.g. self.tr_set, self.dev_set) No return value ''' raise NotImplementedError @abc.abstractmethod def set_model(self): ''' Called by main to set models After this call, model related attributes should be setup (e.g. self.l2_loss) The followings MUST be setup - self.model (torch.nn.Module) - self.optimizer (src.Optimizer), init. w/ self.optimizer = src.Optimizer(self.model.parameters(),**self.config['hparas']) Loading pre-trained model should also be performed here No return value ''' raise NotImplementedError @abc.abstractmethod def exec(self): ''' Called by main to execute training/inference ''' raise NotImplementedError
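# A minimal, hypothetical subclass showing the contract that BaseSolver's
# abstract methods impose (the real solvers wrap the optimizer in
# src.Optimizer; that is simplified to a plain torch optimizer here):
class ToySolver(BaseSolver):
    def load_data(self):
        # Data-related attributes (e.g. self.tr_set, self.dev_set) must exist
        # after this call.
        self.tr_set, self.dev_set = [], []

    def set_model(self):
        # self.model and self.optimizer must be set up here; pre-trained
        # weights would also be loaded at this point via self.load_ckpt().
        self.model = torch.nn.Linear(4, 2).to(self.device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1e-3)

    def exec(self):
        self.verbose('Training/inference loop would run here.')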
class GpuPynvmlLogger(Thread):
    """ Logger for GPU resources: GPU utilization and GPU RAM.

    The logger runs on a separate thread because any attachment to an event
    would effectively measure the GPU/CPU utilization of the downtime: events
    are not fired while `Engine().process()` is running, which is exactly when
    the GPU/CPU is in use. Triggering the logging independently randomizes the
    measurements across _up times_ (while `Engine().process()` is running) and
    _down times_.

    Args:
        logger_directory: directory for tensorboard logs
        logger_name: name of logger
        log_interval_seconds: logging interval in seconds. Decreasing it below
            `0.1` may noticeably increase (> ~5-30%) the GPU utilization caused
            by the measurement task itself
        unit (['KB', 'MB', 'GB']): logging unit, defaults to `'GB'`
    """

    def __init__(self,
                 logger_directory,
                 logger_name='GPULogger',
                 log_interval_seconds=1,
                 unit='GB'):
        super(GpuPynvmlLogger, self).__init__(name=logger_name, daemon=True)
        # CAUTION: Always avoid more than one `SummaryWriter` logging to the
        # same directory because this will lead to log file losses
        self.logger_directory = logger_directory
        self.log_interval_seconds = log_interval_seconds

        nvmlInit()
        self.gpu_count = nvmlDeviceGetCount()
        self.gpu_handles = {}
        for gpu_idx in range(self.gpu_count):
            hdl = nvmlDeviceGetHandleByIndex(gpu_idx)
            name = nvmlDeviceGetName(hdl).decode('ascii').replace(' ', '_')
            self.gpu_handles['GPU{}_{}'.format(gpu_idx, name)] = hdl

        self._tb_logger = SummaryWriter(logdir=self.logger_directory)
        self._memory_stats_to_log = ['total', 'used', 'free']
        self._log_gpu = True
        self._unit = unit
        self._units = {'KB': 1024, 'MB': 1024**2, 'GB': 1024**3}
        self._start_time = None

    def run(self):
        """ Target function of the thread; logs GPU resources to tensorboard
        until the logger is closed.

        CAUTION: DO NOT CALL `self.run()` on its own, but CALL `self.start()`
        inherited from `Thread`. Otherwise `self.run()` will simply be executed
        in the `MainThread` instead of being passed as the target function of
        the new thread.
        """
        self._start_time = time()
        while self._log_gpu:
            self._log_gpu_utilization()
            self._log_gpu_memory()
            sleep(self.log_interval_seconds)

    def _log_gpu_memory(self):
        # Get memory statistics for each GPU
        for gpu_name, gpu_hdl in self.gpu_handles.items():
            # Get current memory stats
            memory_sizes = nvmlDeviceGetMemoryInfo(handle=gpu_hdl)
            memory_stats = {}
            # Select memory statistics to be logged and convert units
            for mem_stat in self._memory_stats_to_log:
                memory_stats[mem_stat] = memory_sizes.__getattribute__(
                    mem_stat) / self._units[self._unit]
            # Log memory statistics to tensorboard
            self._tb_logger.add_scalars(main_tag='{}_memory_{}'.format(
                gpu_name, self._unit),
                                        tag_scalar_dict=memory_stats,
                                        global_step=time() - self._start_time)

    def _log_gpu_utilization(self):
        gpu_utilizations = {}
        # Get current GPU utilizations in percent
        for gpu_name, gpu_hdl in self.gpu_handles.items():
            gpu_percentage = nvmlDeviceGetUtilizationRates(handle=gpu_hdl).gpu
            gpu_utilizations[gpu_name] = gpu_percentage
        # Log GPU utilization to tensorboard
        self._tb_logger.add_scalars(main_tag='GPUs_utilization_percentage',
                                    tag_scalar_dict=gpu_utilizations,
                                    global_step=time() - self._start_time)

    def close(self):
        # Quit while-loop in `self.run()`
        self._log_gpu = False
        # Close tensorboard logger
        self._tb_logger.close()
        # Join thread
        self.join()
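# Hedged usage sketch for GpuPynvmlLogger: start it before the work to be
# measured and close it afterwards (the directory name and `run_training`
# are illustrative, not from the original code):
gpu_logger = GpuPynvmlLogger(logger_directory='runs/gpu_stats',
                             log_interval_seconds=1,
                             unit='GB')
gpu_logger.start()  # spawns the daemon thread; do NOT call run() directly
try:
    run_training()  # stand-in for the actual work being measured
finally:
    gpu_logger.close()  # stops the loop, closes the writer, joins the thread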
class train_config(ut_cfg.config):
    def __init__(self):
        super(train_config, self).__init__(pBs=32, pWn=4, p_force_cpu=False)
        self.path_save_mdroot = self.check_path_valid(
            os.path.join(ROOT, "outputs"))
        localtime = time.localtime(time.time())
        self.path_save_mdid = "alexMNIST" + "%02d%02d" % (localtime.tm_mon,
                                                          localtime.tm_mday)
        self.save_epoch_begin = 50
        self.save_epoch_interval = 25

        self.log_epoch_txt = open(
            os.path.join(self.path_save_mdroot, "conv_epoch_loss_log.txt"),
            'a+')
        self.writer = SummaryWriter(
            log_dir=os.path.join(self.path_save_mdroot, "board"))

        self.height_in = 28
        self.width_in = 28
        self.class_num = 10

        self.method_init = "xavier"  # "preTrain" / "kaiming" / "xavier"
        self.preTrain_model_path = None  # must be set when method_init == "preTrain"
        self.training_epoch_amount = 150

        self.dtroot = os.path.join(ROOT, "datasets")

        self.opt_baseLr = 1e-3
        self.opt_bata1 = 0.9
        self.opt_weightdecay = 3e-6

    def init_net(self, pNet):
        if self.method_init == "xavier":
            ut_init.init_xavier(pNet)
        elif self.method_init == "kaiming":
            ut_init.init_kaiming(pNet)
        elif self.method_init == "preTrain":
            assert self.preTrain_model_path is not None, "weight path ungiven"
            pNet.load_state_dict(torch.load(self.preTrain_model_path))
        pNet.to(self.device).train()

    def create_dataset(self, istrain):
        if istrain:
            imgUbyte_absfilename = r"datasets\MNIST\train-images-idx3-ubyte.gz"
            labelUbyte_absfilename = r"datasets\MNIST\train-labels-idx1-ubyte.gz"
        else:
            imgUbyte_absfilename = r"datasets\MNIST\t10k-images-idx3-ubyte.gz"
            labelUbyte_absfilename = r"datasets\MNIST\t10k-labels-idx1-ubyte.gz"
        q_dataset = mnstld.minist_Loader(imgUbyte_absfilename,
                                         labelUbyte_absfilename)
        return q_dataset

    def name_save_model(self, save_mode, epochX=None):
        model_filename = self.path_save_mdid
        if save_mode == "processing":
            assert epochX is not None, "miss the epoch info"
            model_filename += "_%03d" % (epochX) + ".pth"
        elif save_mode == "ending":
            model_filename += "_%03d" % (self.training_epoch_amount) + ".pth"
        elif save_mode == "interrupt":
            model_filename += "_interrupt" + ".pth"
        assert os.path.splitext(model_filename)[-1] == ".pth"
        q_abs_path = os.path.join(self.path_save_mdroot, model_filename)
        return q_abs_path

    def log_in_file(self, *print_paras):
        for para_i in print_paras:
            print(para_i, end="")
            print(para_i, end="", file=self.log_epoch_txt)
        print("")
        print("", file=self.log_epoch_txt)

    def log_in_board(self, chartname, data_Dic, epoch):
        # for key_i, val_i in data_Dic:
        self.writer.add_scalars(chartname, data_Dic, epoch)

    def validate(self, pNet):
        # use the classification accuracy to validate the convNet performance
        valid_dataset = self.create_dataset(istrain=False)
        validloader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                                  batch_size=self.ld_batchsize,
                                                  shuffle=True,
                                                  num_workers=self.ld_workers)
        acc_Lst = []
        # len(validloader) = 313;
        for iter_idx, (img_Tsor_bacth_i,
                       label_Tsor_bacth_i) in enumerate(validloader):
            img_Tsor_bacth_i = img_Tsor_bacth_i.to(self.device)
            label_Tsor_bacth_i = label_Tsor_bacth_i.to(self.device)
            pred_Tsor_bacth_i = pNet(img_Tsor_bacth_i)
            max_likeli_pred_bacth_i = torch.argmax(pred_Tsor_bacth_i, dim=-1)
            error_num = (max_likeli_pred_bacth_i -
                         label_Tsor_bacth_i).nonzero().shape[0]
            cur_acc = 1 - error_num / label_Tsor_bacth_i.shape[0]
            acc_Lst.append(cur_acc)
        return sum(acc_Lst) / len(acc_Lst)
def train(appliance_name, model, mains, appliance, epochs, batch_size,
          pretrain=False, checkpoint_interval=None, train_patience=3):
    # Model configuration
    if USE_CUDA:
        model = model.cuda()
    if not pretrain:
        model.apply(initialize)
    summary(model, (1, mains.shape[1]))

    # Split the train and validation set
    train_mains, valid_mains, train_appliance, valid_appliance = train_test_split(
        mains, appliance, test_size=.2, random_state=random_seed)

    # Create optimizer, loss function, and dataloaders
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.MSELoss(reduction='mean')

    train_dataset = TensorDataset(
        torch.from_numpy(train_mains).float().permute(0, 2, 1),
        torch.from_numpy(train_appliance).float())
    train_loader = tud.DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, num_workers=0, drop_last=True)
    valid_dataset = TensorDataset(
        torch.from_numpy(valid_mains).float().permute(0, 2, 1),
        torch.from_numpy(valid_appliance).float())
    valid_loader = tud.DataLoader(valid_dataset, batch_size=batch_size,
                                  shuffle=True, num_workers=0, drop_last=True)

    writer = SummaryWriter(comment='train_visual')
    patience, best_loss = 0, None

    for epoch in range(epochs):
        # Early stopping
        if patience == train_patience:
            print("val_loss did not improve after {} epochs, stopping early".format(train_patience))
            break

        # Train the model
        model.train()
        st = time.time()
        for i, (batch_mains, batch_appliance) in enumerate(train_loader):
            if USE_CUDA:
                batch_mains = batch_mains.cuda()
                batch_appliance = batch_appliance.cuda()
            batch_pred = model(batch_mains)
            loss = loss_fn(batch_appliance, batch_pred)
            model.zero_grad()
            loss.backward()
            optimizer.step()
        ed = time.time()

        # Evaluate the model
        model.eval()
        with torch.no_grad():
            cnt, loss_sum = 0, 0.0
            for i, (batch_mains, batch_appliance) in enumerate(valid_loader):
                if USE_CUDA:
                    batch_mains = batch_mains.cuda()
                    batch_appliance = batch_appliance.cuda()
                batch_pred = model(batch_mains)
                loss = loss_fn(batch_appliance, batch_pred)
                loss_sum += loss.item()
                cnt += 1

        final_loss = loss_sum / cnt
        # Save best only
        if best_loss is None or final_loss < best_loss:
            best_loss = final_loss
            patience = 0
            net_state_dict = model.state_dict()
            path_state_dict = "./" + appliance_name + "_AttentionCNN_best_state_dict.pt"
            torch.save(net_state_dict, path_state_dict)
        else:
            patience = patience + 1

        print("Epoch: {}, Valid_Loss: {}, Time consumption: {}s.".format(
            epoch, final_loss, ed - st))

        # For the visualization of the training process
        for name, param in model.named_parameters():
            writer.add_histogram(name + '_grad', param.grad, epoch)
            writer.add_histogram(name + '_data', param, epoch)
        writer.add_scalars("MSELoss", {"Valid": final_loss}, epoch)

        # Save checkpoint
        if (checkpoint_interval is not None) and ((epoch + 1) % checkpoint_interval == 0):
            checkpoint = {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch
            }
            path_checkpoint = "./" + appliance_name + "_AttentionCNN_{}_epoch.pkl".format(epoch)
            torch.save(checkpoint, path_checkpoint)
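# A hypothetical invocation of `train` above with synthetic data. The shapes
# follow the permute(0, 2, 1) and summary(model, (1, mains.shape[1])) calls,
# i.e. mains windows of shape (N, seq_len, 1); the target shape is assumed to
# match the model output. `AttentionCNN` is inferred from the checkpoint names
# used in the function and is an assumption, not a confirmed class.
import numpy as np

mains = np.random.rand(1024, 99, 1).astype('float32')   # aggregate power windows
appliance = np.random.rand(1024, 99).astype('float32')  # per-appliance targets
model = AttentionCNN()  # assumed model class matching the (1, seq_len) input
train('kettle', model, mains, appliance, epochs=10, batch_size=128,
      checkpoint_interval=5)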
class Train:
    def __init__(self, root):
        self.summaryWriter = SummaryWriter("./logs")

        # Load training data
        self.train_dataset = datasets.CIFAR10(root, True,
                                              transform=transforms.ToTensor(),
                                              download=True)
        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=100,
            shuffle=True,
        )

        # Load test data
        self.test_dataset = datasets.CIFAR10(root, False,
                                             transform=transforms.ToTensor(),
                                             download=True)
        self.test_dataloader = DataLoader(
            self.test_dataset,
            100,
        )

        # Build the model
        self.net = NetV2()
        # self.net.load_state_dict(torch.load("./checkpoint/2.t"))
        # self.net.to(DEVICE)

        # Create the optimizer and the loss
        self.opt = optim.Adam(self.net.parameters())
        self.loss_fn = nn.CrossEntropyLoss()

    # Training loop
    def __call__(self):
        for epoch in range(100000):
            self.net.train()
            sum_loss = 0.
            for i, (imgs, tags) in enumerate(self.train_dataloader):
                y = self.net(imgs)
                # Regularization (kept for reference)
                # L2 = []
                # for param in self.net.parameters():
                #     L2 += torch.sum(param ** 2)
                # loss = torch.mean((tags - y) ** 2)
                loss = self.loss_fn(y, tags)
                # loss = self.loss_fn(y, tags) + 0.01 * L2

                self.opt.zero_grad()
                loss.backward()
                self.opt.step()
                sum_loss += loss.cpu().detach().item()
            avg_loss = sum_loss / len(self.train_dataloader)

            self.net.eval()
            sum_score = 0.
            test_sum_loss = 0.
            for i, (imgs, tags) in enumerate(self.test_dataloader):
                # imgs, tags = imgs.to(DEVICE), tags.to(DEVICE)
                test_y = self.net(imgs)
                # test_loss = torch.mean((tags - test_y) ** 2)
                test_loss = self.loss_fn(test_y, tags)
                test_sum_loss += test_loss.cpu().detach().item()

                pred_tags = torch.argmax(test_y, dim=1)
                # label_tags = torch.argmax(tags, dim=1)
                sum_score += torch.sum(torch.eq(
                    pred_tags, tags).float()).cpu().detach().item()
                # Log a sample of test images
                self.summaryWriter.add_images("imgs", imgs[:10], epoch)
            test_avg_loss = test_sum_loss / len(self.test_dataloader)
            score = sum_score / len(self.test_dataset)

            self.summaryWriter.add_scalars("loss", {
                "train_loss": avg_loss,
                "test_loss": test_avg_loss
            }, epoch)
            self.summaryWriter.add_scalar("score", score, epoch)

            layer1_weight = self.net.seq[1].weight
            layer2_weight = self.net.seq[5].weight
            layer3_weight = self.net.seq[9].weight
            self.summaryWriter.add_histogram("layer1", layer1_weight, epoch)
            self.summaryWriter.add_histogram("layer2", layer2_weight, epoch)
            self.summaryWriter.add_histogram("layer3", layer3_weight, epoch)

            print(epoch, avg_loss, test_avg_loss, score)
            torch.save(self.net.state_dict(), f"./checkpoint/{epoch}.t")
class Run():
    def __init__(self,
                 modeln="MyModel",
                 val_length=10,
                 batch_size=2,
                 classifications_file="classifications.pkl",
                 learning_rate=3e-2):
        # SAMPLE FOR VALIDATION AND TEST SETS
        self.val_length = val_length  # , self.test_length = 20, 0
        self.batch_size = batch_size
        self.classifications_file = classifications_file
        self.lr = learning_rate

        sample = random.sample(range(0, len(master_list)),
                               k=self.val_length)  # + self.test_length
        # sample = sample_by_label(master_list, val_size=self.val_length, n_min=2)
        # self.test_list = [e for i, e in enumerate(master_list) if i in sample[self.val_length:]]
        self.val_list = [e for i, e in enumerate(master_list) if i in sample]
        self.train_list = [
            e for i, e in enumerate(master_list) if i not in sample
        ]
        print("train length: %s \t val length: %s" %
              (len(self.train_list), len(self.val_list)))

        self.train_dataset = PET_CT_Dataset(self.train_list)
        self.val_dataset = PET_CT_Dataset(self.val_list)
        # self.test_dataset = PET_CT_Dataset(self.test_list)

        self.train_loader = DataLoader(self.train_dataset,
                                       batch_size=self.batch_size,
                                       num_workers=4,
                                       shuffle=True)
        self.val_loader = DataLoader(self.val_dataset,
                                     batch_size=self.batch_size,
                                     num_workers=4,
                                     shuffle=False,
                                     drop_last=False)
        # self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

        self.writer = SummaryWriter()

        self.modeln = modeln
        self.model = self._init_model(model_name=self.modeln)
        self.model = self.model.to(device)

        self.loss_ce = nn.BCELoss()
        # self.loss_ce = nn.BCEWithLogitsLoss()
        # BCE loss is supposed to be used with a sigmoid, not with a softmax!!

        # both optimizers are pretty cool :)
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.lr,
                                         weight_decay=5e-3,
                                         momentum=0.9)  # works better?
        # self.optimizer = torch.optim.Adam(self.model.parameters(), lr=3e-3)  # weight_decay=5e-3, momentum=0.9)

        self.global_step = 0
        self.val_top_loss = 1e5
        self.train_top_loss = 1e5

    def _init_model(self, model_name):
        if model_name == "MyModel":
            return MyModel(num_classes=5)
        if model_name == 'resnet10':
            return resnet10(num_classes=5, activation="softmax")
        else:
            return None

    # TODO: implement forward function
    def forward(self, *inputs):
        raise NotImplementedError

    def epoch_train(self):
        self.model = self.model.train()
        epoch_loss = 0
        for ct, pet, merged, label, _ in self.train_loader:
            self.optimizer.zero_grad()
            inp = torch.Tensor(merged.float())
            inp = inp.to(device)  # no schema error!!
            label = label.to(device)
            otpt = self.model(inp)
            loss = self.loss_ce(otpt, label)
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.sum().detach().cpu()
        epoch_loss /= len(self.train_list)
        self.writer.add_scalar("train_loss", epoch_loss,
                               global_step=self.global_step)
        return epoch_loss

    def epoch_val(self):
        self.model = self.model.eval()
        epoch_loss = 0
        log_txt = ""
        for ct, pet, merged, label, _ in self.val_loader:
            inp = torch.Tensor(merged.float())
            inp = inp.to(device)
            label = label.to(device)
            otpt = self.model(inp)
            loss = self.loss_ce(otpt, label)
            epoch_loss += loss.sum().detach().cpu()
            # otpt = F.sigmoid(otpt)
            log_txt += f'truth: \t{str(label.detach().cpu().numpy())} output: \t{str(otpt.detach().cpu().numpy())}\n'
        epoch_loss /= len(self.val_list)
        self.writer.add_text("val_", text_string=log_txt,
                             global_step=self.global_step)
        self.writer.add_scalar("val_loss", epoch_loss,
                               global_step=self.global_step)
        return epoch_loss

    def evaluate_classification(self):
        self.model = torch.load(
            os.path.join(self.writer.log_dir, "best_val.pth"))
        label_list = m_list_settings['encoding'][1]
        try:
            with open(self.classifications_file, "rb") as f:
                classifications = pickle.load(f)
        except (FileNotFoundError, EOFError):
            classifications = dict()
            # this is run at the end anyway, once the model is already optimized..
            classifications['val_loss'] = list()
            classifications['model_version'] = list()
            classifications['truth'] = dict()
            classifications['pred'] = dict()
            classifications['CT_dirs'] = list()
            classifications['PET_dirs'] = list()
            for l in label_list:
                classifications['truth'][l] = list()
                classifications['pred'][l] = list()

        self.model = self.model.eval()
        val_loss = 0
        for ct, pet, merged, label, entry in self.val_loader:
            inp = torch.Tensor(merged.float())
            inp = inp.to(device)
            label = label.to(device)
            otpt = self.model(inp)
            loss = self.loss_ce(otpt, label)
            val_loss += loss.sum().detach().cpu()
            otpt = otpt.detach().cpu().numpy()
            label = label.detach().cpu().numpy()
            for b in range(otpt.shape[0]):  # for each example in the batch
                classifications['model_version'].append(self.writer.log_dir)
                classifications['CT_dirs'].append(entry['CT_dir'])
                classifications['PET_dirs'].append(entry['PET_dir'])
                for il, l in enumerate(label_list):
                    classifications['truth'][l].append(label[b, il])
                    classifications['pred'][l].append(otpt[b, il])
        classifications['val_loss'].append(val_loss)

        with open(self.classifications_file, "wb") as f:
            pickle.dump(classifications, f)

    def train(self, no_epochs=10):
        for i in range(no_epochs):
            t0 = time()
            self.global_step += 1
            tr = self.epoch_train()
            val = self.epoch_val()
            self.writer.add_scalars(main_tag="losses",
                                    tag_scalar_dict={
                                        'train_loss': tr,
                                        "val_loss": val
                                    },
                                    global_step=self.global_step)
            if val < self.val_top_loss:
                torch.save(self.model,
                           os.path.join(self.writer.log_dir, "best_val.pth"))
                self.val_top_loss = val
                print("saved_top_model_val")
            if tr < self.train_top_loss:
                torch.save(self.model,
                           os.path.join(self.writer.log_dir, "best_tr.pth"))
                self.train_top_loss = tr
                print("saved_top_model_tr")
            print(f"STEP: {i} TRAINLOSS: {tr} VALLOSS {val} dt {time() - t0}")
        self.writer.close()
def main():
    epoch = 500
    history = {
        'train_loss': [],
        'test_loss': [],
        'train_acc': [],
        'test_acc': []
    }

    loader = load_cifar10()
    net = CNN()
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    net.to(device)
    print(device)

    writer = SummaryWriter(log_dir='./logs')

    for e in range(epoch):
        net.train()
        loss = None
        for i, (images, labels) in enumerate(loader['train']):
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            output = net(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            if i % 10 == 0:
                print(
                    f'Training log: {e+1:03} epoch ({(i+1)*128:05} / 50000 train. data). Loss: {loss.item()}'
                )
        history['train_loss'].append(loss.item())

        net.eval()
        correct = 0
        with torch.no_grad():
            for i, (images, labels) in enumerate(loader['train']):
                images = images.to(device)
                labels = labels.to(device)
                outputs = net(images)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels).sum().item()
        acc = correct / 50000
        history['train_acc'].append(acc)
        print(f'Accuracy on train. data: {acc}')

        correct = 0
        loss_test_sum = 0.0
        with torch.no_grad():
            for i, (images, labels) in enumerate(loader['test']):
                images = images.to(device)
                labels = labels.to(device)
                outputs = net(images)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels).sum().item()
                # accumulate the loss over all test batches instead of
                # keeping only the value of the last batch
                loss_test_sum += criterion(outputs, labels).item()
        acc_test = correct / 10000
        loss_test = loss_test_sum / len(loader['test'])
        history['test_acc'].append(acc_test)
        history['test_loss'].append(loss_test)
        print(f'Accuracy on test data: {acc_test}')
        print(f'Loss on test: {loss_test}')

        writer.add_scalars('Loss', {
            'train': loss.item(),
            'test': loss_test
        }, e)
        writer.add_scalars('Accuracy', {'train': acc, 'test': acc_test}, e)

    print(history)
    writer.close()
class Trainer(object): def __init__(self, model, train_loader, val_loader, args, device, logging): self.train_loader = train_loader self.val_loader = val_loader self.args = args self.device = device self.logging = logging self.criterion = nn.CrossEntropyLoss() self.optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=0) self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, mode='max', factor=0.5, patience=3, verbose=True, min_lr=1e-5) if args.action == 'train': self.writer = SummaryWriter(log_dir=args.tensorboard_dir) self.inputs = next(iter(train_loader))[0] self.writer.add_graph(model, self.inputs.to(device, dtype=torch.float32)) if args.DataParallel: self.model = torch.nn.DataParallel(model) else: self.model = model def train(self): epochs = self.args.epochs n_train = len(self.train_loader.dataset) step = 0 best_acc = 0. accs = AverageMeter() for epoch in range(epochs): self.model.train() epoch_loss = 0 # training with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img') as pbar: for batch in self.train_loader: images, labels = batch[0], batch[1] images = images.to(device=self.device, dtype=torch.float32) labels = labels.to(device=self.device, dtype=torch.long) preds = self.model(images) loss = self.criterion(preds, labels) epoch_loss += loss.item() self.optimizer.zero_grad() loss.backward() self.optimizer.step() accs.update((preds.argmax(1) == labels).sum().item() / images.size(0), images.size(0)) pbar.set_postfix(**{'loss': loss.item(), 'acc': accs.avg}) self.writer.add_scalar('acc/train', accs.avg, step) self.writer.add_scalar('Loss/train', loss.item(), step) pbar.update(images.shape[0]) step = step + 1 # eval if (epoch + 1) % self.args.val_epoch == 0: acc = self.test(mode='val') if acc > best_acc: best_acc = acc if self.args.save_path: if not os.path.exists(self.args.save_path): os.makedirs(self.args.save_path) torch.save(self.model.state_dict(), f'{self.args.save_path}/best_model.pth') self.logging.info( char_color(f'best model saved !', word=33)) self.logging.info(f'acc: {acc}') self.writer.add_scalars('Valid', {'acc': acc}, step) self.writer.add_scalar('learning_rate', self.optimizer.param_groups[0]['lr'], step) self.scheduler.step(acc) if (epoch + 1) % self.args.save_model_epoch == 0: if self.args.save_path: if not os.path.exists(self.args.save_path): os.makedirs(self.args.save_path) model_name = f'{self.args.task}_' torch.save( self.model.state_dict(), f'{self.args.save_path}/{model_name}{epoch + 1}.pth') self.logging.info( char_color(f'Checkpoint {epoch + 1} saved !')) self.writer.close() def test(self, mode='val', model_path=None, aug=False): self.model.train(False) self.model.eval() accs = AverageMeter() test_len = len(self.val_loader) step = 0 with torch.no_grad(): with tqdm(total=test_len, desc=f'{mode}', unit='batch') as pbar: for batch in self.val_loader: images, labels = batch[0], batch[1] images = images.to(device=self.device, dtype=torch.float32) labels = labels.to(device=self.device, dtype=torch.long) preds = self.model(images) accs.update((preds.argmax(1) == labels).sum().item() / images.size(0), images.size(0)) pbar.set_postfix(**{'acc': accs.avg}) pbar.update(images.shape[0]) step = step + 1 return accs.avg
class Train:
    def __init__(self, root):
        self.epoch = 100000
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.param_path = 'param/params.pt'

        # Load the training set
        self.train_dataset = MyDataset(root)
        self.train_dataloader = DataLoader(self.train_dataset,
                                           batch_size=100,
                                           shuffle=True,
                                           num_workers=8)
        # Load the validation set
        self.test_dataset = MyDataset(root, is_train=False)
        self.test_dataloader = DataLoader(self.test_dataset,
                                          batch_size=10,
                                          shuffle=False,
                                          num_workers=0)

        # Define the network
        # self.net = RnnNet().to(self.device)
        self.net = Seq2seqNet().to(self.device)

        # Load saved parameters if they exist
        if os.path.exists(self.param_path):
            self.net.load_state_dict(torch.load(self.param_path))

        # Define the optimizer and the loss
        self.optim = torch.optim.Adam(self.net.parameters())
        self.loss_func = nn.MSELoss()

    def __call__(self):
        self.summ = SummaryWriter('./logs')
        for epoch in range(self.epoch):
            loss_train_sum = 0.
            for i, (img, label) in enumerate(tqdm(self.train_dataloader)):
                img = img.to(self.device)
                label = label.to(self.device)
                out = self.net(img)
                loss = self.loss_func(out, label)

                self.optim.zero_grad()
                loss.backward()
                self.optim.step()
                loss_train_sum += loss.detach().cpu().item()
            loss_train_avg = loss_train_sum / len(self.train_dataloader)

            loss_test_sum = 0.
            acc = 0.
            for i, (img, tag) in enumerate(tqdm(self.test_dataloader)):
                # Move the data and labels to the GPU for computation
                input, test_tag = img.to(self.device), tag.to(self.device)
                test_output = self.net(input)
                pred = torch.argmax(test_output, 2).detach().cpu().numpy()
                label = torch.argmax(tag, 2).detach().cpu().numpy()

                loss = self.loss_func(test_output, test_tag)
                loss_test_sum += loss.cpu().item()
                acc += np.mean(np.all(pred == label, axis=1))
            loss_test_avg = loss_test_sum / len(self.test_dataloader)
            acc_avg = acc / len(self.test_dataloader)

            # add_scalars stores several values at once; add_scalar stores only one
            self.summ.add_scalars("loss", {
                "train_avg_loss": loss_train_avg,
                "test_avg_loss": loss_test_avg
            }, epoch)
            self.summ.add_scalar("acc", acc_avg, epoch)

            # Save the network parameters (w, b). The directory is not created
            # automatically, so it has to exist first; parameters are saved
            # every epoch as a binary file (e.g. .pkl or .t).
            # Keep the parameters in case something unexpected happens.
            torch.save(self.net.state_dict(), self.param_path)
            print(epoch, "train loss", loss_train_avg, "test loss",
                  loss_test_avg, "score", acc_avg)
for epoch in range(20):
    total_train_loss, train_avg_auc, train_auc, train_data_pr, train_duration = one_epoch_train(
        model,
        train_loader,
        optimizer,
        criterion,
        device,
        scaler,
        iters_to_accumulate=8,
        clip_grads=False)
    total_val_loss, val_avg_auc, val_auc, val_data_pr, val_duration = eval_model(
        model, val_loader, device, criterion, scaler)
    scheduler.step()

    writer.add_scalars('avg/loss', {
        'train': total_train_loss,
        'val': total_val_loss
    }, epoch)
    writer.add_scalars('avg/auc', {
        'train': train_avg_auc,
        'val': val_avg_auc
    }, epoch)
    writer.flush()

    print(
        'EPOCH %d:\tTRAIN [duration %.3f sec, loss: %.3f, avg auc: %.3f]\t\t'
        'VAL [duration %.3f sec, loss: %.3f, avg auc: %.3f]\tCurrent time %s'
        % (epoch + 1, train_duration, total_train_loss, train_avg_auc,
           val_duration, total_val_loss, val_avg_auc,
           str(datetime.now(timezone('Europe/Moscow')))))

    # checkpoint payload and path are assumed (illustrative)
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }, f'checkpoint_epoch_{epoch + 1}.pt')
def train_gan(training_config): writer = SummaryWriter() device = torch.device("cpu") # Download MNIST dataset in the directory data mnist_data_loader = utils.get_mnist_data_loader( training_config['batch_size']) discriminator_net, generator_net = utils.get_gan(device, GANType.CLASSIC.name) discriminator_opt, generator_opt = utils.get_optimizers( discriminator_net, generator_net) adversarial_loss = nn.BCELoss() real_image_gt = torch.ones((training_config['batch_size'], 1), device=device) fake_image_gt = torch.zeros((training_config['batch_size'], 1), device=device) ref_batch_size = 16 ref_noise_batch = utils.get_gaussian_latent_batch(ref_batch_size, device) discriminator_loss_values = [] generator_loss_values = [] img_cnt = 0 ts = time.time() utils.print_training_info_to_console(training_config) for epoch in range(training_config['num_epochs']): for batch_idx, (real_images, _) in enumerate(mnist_data_loader): real_images = real_images.to(device) # Train discriminator discriminator_opt.zero_grad() real_discriminator_loss = adversarial_loss( discriminator_net(real_images), real_image_gt) fake_images = generator_net( utils.get_gaussian_latent_batch(training_config['batch_size'], device)) fake_images_predictions = discriminator_net(fake_images.detach()) fake_discriminator_loss = adversarial_loss(fake_images_predictions, fake_image_gt) discriminator_loss = real_discriminator_loss + fake_discriminator_loss discriminator_loss.backward() discriminator_opt.step() # Train generator generator_opt.zero_grad() generated_images_prediction = discriminator_net( generator_net( utils.get_gaussian_latent_batch( training_config['batch_size'], device))) generator_loss = adversarial_loss(generated_images_prediction, real_image_gt) generator_loss.backward() generator_opt.step() # Logging and checkpoint creation generator_loss_values.append(generator_loss.item()) discriminator_loss_values.append(discriminator_loss.item()) if training_config['enable_tensorboard']: writer.add_scalars( 'Losses/g-and-d', { 'g': generator_loss.item(), 'd': discriminator_loss.item() }, len(mnist_data_loader) * epoch + batch_idx + 1) if training_config[ 'debug_imagery_log_freq'] is not None and batch_idx % training_config[ 'debug_imagery_log_freq'] == 0: with torch.no_grad(): log_generated_images = generator_net(ref_noise_batch) log_generated_images_resized = nn.Upsample( scale_factor=2, mode='nearest')(log_generated_images) intermediate_imagery_grid = make_grid( log_generated_images_resized, nrow=int(np.sqrt(ref_batch_size)), normalize=True) writer.add_image( 'intermediate generated imagery', intermediate_imagery_grid, len(mnist_data_loader) * epoch + batch_idx + 1) if training_config[ 'console_log_freq'] is not None and batch_idx % training_config[ 'console_log_freq'] == 0: print( f'GAN training: time elapsed = {(time.time() - ts):.2f} [s] | epoch={epoch + 1} | batch= [{batch_idx + 1}/{len(mnist_data_loader)}]' ) # Save intermediate generator images if training_config[ 'debug_imagery_log_freq'] is not None and batch_idx % training_config[ 'debug_imagery_log_freq'] == 0: with torch.no_grad(): log_generated_images = generator_net(ref_noise_batch) log_generated_images_resized = nn.Upsample( scale_factor=2, mode='nearest')(log_generated_images) save_image(log_generated_images_resized, os.path.join(training_config['debug_path'], f'{str(img_cnt).zfill(6)}.jpg'), nrow=int(np.sqrt(ref_batch_size)), normalize=True) img_cnt += 1 # Save generator checkpoint if training_config['checkpoint_freq'] is not None and ( epoch + 1 ) % 
training_config['checkpoint_freq'] == 0 and batch_idx == 0: ckpt_model_name = f"Classic_ckpt_epoch_{epoch + 1}_batch_{batch_idx + 1}.pth" torch.save( utils.get_training_state(generator_net, GANType.CLASSIC.name), os.path.join(CHECKPOINTS_PATH, ckpt_model_name)) torch.save(utils.get_training_state(generator_net, GANType.CLASSIC.name), os.path.join(BINARIES_PATH, utils.get_available_binary_name()))
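# The configuration keys consumed by `train_gan` above, gathered into one
# example call (all values are illustrative):
training_config = {
    'num_epochs': 100,
    'batch_size': 128,
    'enable_tensorboard': True,
    'debug_imagery_log_freq': 100,  # batches between image dumps (None disables)
    'console_log_freq': 50,         # batches between console prints (None disables)
    'checkpoint_freq': 10,          # epochs between checkpoints (None disables)
    'debug_path': 'debug_imagery',
}
train_gan(training_config)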
        mb_loss = loss_fn(y_hat_mb, y_mb)
        mb_loss.backward()
        opt.step()

        with torch.no_grad():
            mb_acc = acc(y_hat_mb, y_mb)

        tr_loss += mb_loss.item()
        tr_acc += mb_acc.item()

        if (epoch * len(tr_dl) + step) % model_config.summary_step == 0:
            val_loss = evaluate(model, val_dl, {'loss': loss_fn},
                                device)['loss']
            writer.add_scalars('loss', {
                'train': tr_loss / (step + 1),
                'val': val_loss
            }, epoch * len(tr_dl) + step)
            model.train()
    else:
        # for/else: this branch runs once the batch loop finishes
        tr_loss /= (step + 1)
        tr_acc /= (step + 1)

        tr_summary = {'loss': tr_loss, 'acc': tr_acc}
        val_summary = evaluate(model, val_dl, {
            'loss': loss_fn,
            'acc': acc
        }, device)
        scheduler.step(val_summary['loss'])
        tqdm.write('epoch : {}, tr_loss: {:.3f}, val_loss: '
                   '{:.3f}, tr_acc: {:.2%}, val_acc: {:.2%}'.format(
                       epoch + 1, tr_loss, val_summary['loss'], tr_acc,
                       val_summary['acc']))
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
    tr_loss /= (step + 1)

    model.eval()
    val_loss = 0
    for step, batch in enumerate(val_dl):
        h, t, r = map(lambda elm: elm.to(device), batch)
        n_h, n_t = sampler.corrupt_batch(h, t, r)

        with torch.no_grad():
            pos, neg = model(h, t, n_h, n_t, r)
            loss = criterion(pos, neg)
            val_loss += loss.item()
    val_loss /= (step + 1)

    writer.add_scalars('loss', {'train': tr_loss, 'val': val_loss}, epoch)

    if (epoch + 1) % args.summary_step == 0:
        tqdm.write(
            'Epoch {} | train loss: {:.5f}, valid loss: {:.5f}'.format(
                epoch + 1, tr_loss, val_loss))

    model.normalize_parameters()

    is_best = val_loss < best_val_loss
    if is_best:
        state = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        summary = {
            'training loss': round(tr_loss, 4),
            'validation loss': round(val_loss, 4)
        }
def main():
    env = gym.make(args.environment)
    agent_ = getattr(Agent, args.agent.replace(' ', '') + 'Agent')
    if args.test:
        args.load_models = True
        args.render = True
    print(args)

    if args.agent == 'DDPG':
        max_action = float(env.action_space.high[0])
        agent = agent_(state_dim=env.observation_space.shape,
                       action_dim=env.action_space.shape,
                       hidden_dims=args.hidden_dims,
                       max_action=max_action,
                       gamma=args.gamma,
                       tau=args.tau,
                       critic_lr=args.critic_lr,
                       critic_wd=args.critic_wd,
                       actor_lr=args.actor_lr,
                       actor_wd=args.actor_wd,
                       batch_size=args.batch_size,
                       final_init=args.final_init,
                       maxsize=int(args.maxsize),
                       sigma=args.sigma,
                       theta=args.theta,
                       dt=args.dt,
                       checkpoint=args.checkpoint)
    elif args.agent == 'TD3':
        max_action = float(env.action_space.high[0])
        agent = agent_(state_dim=env.observation_space.shape,
                       action_dim=env.action_space.shape,
                       hidden_dims=args.hidden_dims,
                       max_action=max_action,
                       gamma=args.gamma,
                       tau=args.tau,
                       critic_lr=args.critic_lr,
                       critic_wd=args.critic_wd,
                       actor_lr=args.actor_lr,
                       actor_wd=args.actor_wd,
                       batch_size=args.batch_size,
                       final_init=args.final_init,
                       maxsize=int(args.maxsize),
                       sigma=args.sigma,
                       theta=args.theta,
                       dt=args.dt,
                       checkpoint=args.checkpoint,
                       actor_update_iter=args.actor_update_iter,
                       action_sigma=args.action_sigma,
                       action_clip=args.action_clip)
    elif args.agent == 'SAC':
        max_action = float(env.action_space.high[0])
        agent = agent_(
            state_dim=env.observation_space.shape,
            action_dim=env.action_space.shape,
            hidden_dims=args.hidden_dims,
            max_action=max_action,
            gamma=args.gamma,
            tau=args.tau,
            alpha=args.alpha,
            lr=args.critic_lr,
            batch_size=args.batch_size,
            maxsize=int(args.maxsize),
            log_std_min=args.log_std_min,
            log_std_max=args.log_std_max,
            epsilon=args.epsilon,
            checkpoint=args.checkpoint,
        )
    else:
        agent = agent_(state_dim=env.observation_space.shape,
                       action_dim=env.action_space.n,
                       hidden_dims=args.hidden_dims,
                       gamma=args.gamma,
                       lr=args.lr)

    Path(args.logdir).mkdir(parents=True, exist_ok=True)
    Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(args.logdir)
    if args.load_models:
        agent.load_models(args.agent + '_' + args.environment)

    pbar = tqdm(range(args.n_episodes))
    score_history = deque(maxlen=args.window_legnth)
    best_score = -np.inf
    for e in pbar:
        done, score, observation = False, 0, env.reset()

        # reset DDPG OU (Ornstein-Uhlenbeck) noise and keep track of
        # actor/critic losses
        if args.agent in ['DDPG', 'TD3', 'SAC']:
            if args.agent == 'DDPG':
                agent.noise.reset()
            actor_losses, critic_losses = [], []
        while not done:
            if args.render:
                env.render(mode='human')
            action = agent.choose_action(observation, args.test)
            next_observation, reward, done, _ = env.step(action)
            score += reward

            # update for td methods, recording for mc methods
            if args.test:
                continue
            elif args.agent == 'Actor Critic':
                agent.update(reward, next_observation, done)
            elif args.agent in ['DDPG', 'TD3', 'SAC']:
                agent.store_transition(observation, action, reward,
                                       next_observation, done)
                # if memory holds fewer samples than the batch size, do not update
                if agent.memory.idx < args.batch_size or (
                        args.agent == 'TD3'
                        and agent.ctr < args.warmup_steps):
                    continue
                else:
                    actor_loss, critic_loss = agent.update()
                    actor_losses.append(actor_loss)
                    critic_losses.append(critic_loss)
                    pbar.set_postfix({
                        'Reward': reward,
                        'Actor Loss': actor_loss,
                        'Critic Loss': critic_loss
                    })
            else:
                agent.store_reward(reward)
            observation = next_observation

        score_history.append(score)
        if args.test:
            continue
        # update for mc methods w/ full trajectory
        elif args.agent == 'Policy Gradient':
            agent.update()

        # logging & saving
        elif args.agent in ['DDPG', 'TD3', 'SAC']:
            writer.add_scalars('Scores', {
                'Episodic': score,
                'Windowed Average': np.mean(score_history)
            },
                               global_step=e)
            if actor_losses:
                loss_dict = {
                    'Actor': np.mean(actor_losses),
                    'Critic': np.mean(critic_losses)
                }
                writer.add_scalars('Losses', loss_dict, global_step=e)
                actor_losses, critic_losses = [], []

        if np.mean(score_history) > best_score:
            best_score = np.mean(score_history)
            agent.save_models(args.agent + '_' + args.environment)
            tqdm.write(
                f'Episode: {e + 1}/{args.n_episodes}, Score: {score}, Average Score: {np.mean(score_history)}'
            )
def main(_A: argparse.Namespace): apex = False is_cpu = False if _A.num_gpus_per_machine == 0: # Set device as CPU if num_gpus_per_machine = 0. device = torch.device("cpu") is_cpu = True else: # Get the current device as set for current distributed process. # Check `launch` function in `virtex.utils.distributed` module. device = torch.cuda.current_device() # Create a config object (this will be immutable) and perform common setup # such as logging and setting up serialization directory. _C = Config(_A.config, _A.config_override) common_setup(_C, _A) # ------------------------------------------------------------------------- # INSTANTIATE DATALOADER, MODEL, OPTIMIZER # ------------------------------------------------------------------------- tokenizer = TokenizerFactory.from_config(_C) train_dataset = PretrainingDatasetFactory.from_config(_C, split="train", csv=_A.train_csv) val_dataset = PretrainingDatasetFactory.from_config(_C, split="val", csv=_A.val_csv) train_dataloader = DataLoader( train_dataset, batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(), #sampler= Sampler(train_dataset), sampler=DistributedSampler(train_dataset, shuffle=True), num_workers=_A.cpu_workers, pin_memory=True, drop_last=True, collate_fn=train_dataset.collate_fn, ) val_dataloader = DataLoader( val_dataset, batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(), # sampler = Sampler(val_dataset), sampler=DistributedSampler(val_dataset, shuffle=False), num_workers=_A.cpu_workers, pin_memory=True, drop_last=False, collate_fn=val_dataset.collate_fn, ) model = PretrainingModelFactory.from_config(_C).to(device) optimizer = OptimizerFactory.from_config(_C, model.named_parameters()) scheduler = LRSchedulerFactory.from_config(_C, optimizer) # ------------------------------------------------------------------------- # BEFORE TRAINING STARTS # ------------------------------------------------------------------------- # Load checkpoint to resume training if specified. if _A.resume_from is not None: start_iteration = CheckpointManager(model=model, optimizer=optimizer, scheduler=scheduler).load( _A.resume_from) else: start_iteration = 0 # Keep track of time per iteration and ETA. timer = Timer( start_from=start_iteration + 1, total_iterations=_C.OPTIM.NUM_ITERATIONS, ) # Create an iterator from dataloader to sample batches perpetually. train_dataloader_iter = cycle(train_dataloader, device, start_iteration) if (not is_cpu): # Wrap model and optimizer using NVIDIA Apex for mixed precision training. # NOTE: Always do this before wrapping model with DistributedDataParallel. if apex: if _C.FP16_OPT > 0: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level=f"O{_C.FP16_OPT}") # Wrap model in DDP if using more than one processes. if dist.get_world_size() > 1: dist.synchronize() model = nn.parallel.DistributedDataParallel( model, device_ids=[device], find_unused_parameters=True) # Create checkpoint manager and tensorboard writer (only in master process). 
if dist.is_master_process(): checkpoint_manager = CheckpointManager( _A.serialization_dir, model=model, optimizer=optimizer, scheduler=scheduler, ) tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir) tensorboard_writer.add_text("config", f"```\n{_C}\n```") # ------------------------------------------------------------------------- # TRAINING LOOP # ------------------------------------------------------------------------- for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1): timer.tic() optimizer.zero_grad() batch_loss = torch.tensor(0.0, device=device) batch = next(train_dataloader_iter) output_dict = model(batch) loss = output_dict["loss"] batch_loss += loss.item() # Perform dynamic scaling of loss to adjust for mixed precision. if apex and _C.FP16_OPT > 0: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Clip norm of gradients before optimizer step. torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer) if apex and _C.FP16_OPT > 0 else model.parameters(), _C.OPTIM.CLIP_GRAD_NORM, ) optimizer.step() scheduler.step(iteration) timer.toc() # --------------------------------------------------------------------- # TENSORBOARD LOGGING # --------------------------------------------------------------------- if iteration % _A.log_every == 0 and dist.is_master_process(): logger.info(f"{timer.stats} | Loss: {batch_loss:.3f} | " f"GPU mem: {dist.gpu_mem_usage()} MB") tensorboard_writer.add_scalars( "learning_rate", { "visual": optimizer.param_groups[0]["lr"], "common": optimizer.param_groups[-1]["lr"], }, iteration, ) tensorboard_writer.add_scalars("train", output_dict["loss_components"], iteration) # --------------------------------------------------------------------- # VALIDATION # --------------------------------------------------------------------- if iteration % _A.checkpoint_every == 0: if dist.is_master_process(): checkpoint_manager.step(iteration) torch.set_grad_enabled(False) model.eval() # Accumulate different val loss components according to the type of # pretraining model. val_loss_counter: Counter = Counter() for val_iteration, val_batch in enumerate(val_dataloader, start=1): for key in val_batch: val_batch[key] = val_batch[key].to(device) output_dict = model(val_batch) val_loss_counter.update(output_dict["loss_components"]) # Divide each loss component by number of val batches per GPU. val_loss_dict = { k: v / val_iteration for k, v in dict(val_loss_counter).items() } dist.average_across_processes(val_loss_dict) torch.set_grad_enabled(True) model.train() if iteration % _A.checkpoint_every == 0 and dist.is_master_process(): logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}") tensorboard_writer.add_scalars("val", val_loss_dict, iteration) # All processes will wait till master process is done logging. dist.synchronize()
class Trainer(object): def __init__(self, dataset, model, optimizer, batch_size=64, annealing_strategy='logistic', kl_anneal_rate=0.01, kl_anneal_time=100, kl_anneal_target=1., label_anneal_rate=0.01, label_anneal_time=100, label_anneal_target=1., add_bow_loss=False, force_cpu=False, run_dir=None, alpha=1.): self.force_cpu = force_cpu self.dataset = dataset self.model = model self.optimizer = optimizer self.i2w = dataset.i2w self.i2int = dataset.i2int self.w2i = dataset.w2i self.int2i = dataset.int2i self.batch_size = batch_size self.annealing_strategy = annealing_strategy self.kl_anneal_time = kl_anneal_time self.kl_anneal_rate = kl_anneal_rate self.kl_anneal_target = kl_anneal_target self.label_anneal_time = label_anneal_time self.label_anneal_rate = label_anneal_rate self.label_anneal_target = label_anneal_target self.add_bow_loss = add_bow_loss self.alpha = alpha self.epoch = 0 self.step = 0 self.latent_rep = {intent: [] for intent in self.i2int} self.run_logs = { 'train': { 'recon_loss': [], 'kl_losses': [[] for _ in range(self.model.z_size)], 'conditioning_accuracy': [], 'total_loss': [], 'classifications': { real_intent: {pred_intent: 0 for pred_intent in self.i2int} for real_intent in self.i2int }, 'transfer': { real_intent: torch.zeros(model.cat_size) for real_intent in self.i2int } }, 'dev': { 'recon_loss': [], 'kl_loss': [], 'conditioning_accuracy': [], 'total_loss': [], 'classifications': { real_intent: {pred_intent: 0 for pred_intent in self.i2int} for real_intent in self.i2int }, 'transfer': { real_intent: torch.zeros(model.cat_size) for real_intent in self.i2int } } } self.summary_writer = SummaryWriter(log_dir=run_dir) def run(self, n_epochs, dev_step_every_n_epochs=1): train_iter, val_iter = self.dataset.get_iterators( batch_size=self.batch_size) for idx in range(n_epochs): import gc gc.collect() torch.cuda.empty_cache() self.epoch += 1 is_last_epoch = self.epoch == n_epochs - 1 train_loss, train_recon_loss, train_kl_loss, train_acc = self.do_one_sweep( train_iter, is_last_epoch, "train") # logging LOGGER.info('Training loss after epoch %d: %f', self.epoch, train_loss.item()) LOGGER.info('Training reconstruction loss after epoch %d: %f', self.epoch, train_recon_loss.item()) LOGGER.info('Training KL loss after epoch %d: %f', self.epoch, train_kl_loss.item()) LOGGER.info('Training accuracy after epoch %d: %f', self.epoch, train_acc) self.summary_writer.add_scalar( 'train/total-loss', train_loss.cpu().detach().numpy().item(), self.epoch) self.run_logs['train']['total_loss'].append( train_loss.cpu().detach().numpy().item()) if (idx + 1) % dev_step_every_n_epochs == 0: dev_loss, dev_recon_loss, dev_kl_loss, dev_acc = self.do_one_sweep( val_iter, is_last_epoch, "dev") # logging LOGGER.info('Dev loss after epoch %d: %f', self.epoch, dev_loss) LOGGER.info('Dev recon loss after epoch %d: %f', self.epoch, dev_recon_loss) LOGGER.info('Dev KL loss after epoch %d: %f', self.epoch, dev_kl_loss) LOGGER.info('Dev acc after epoch %d: %f', self.epoch, dev_acc) # summaries self.summary_writer.add_scalar( 'dev/recon-loss', dev_recon_loss.cpu().detach().numpy().item(), self.epoch) self.run_logs['dev']['recon_loss'].append( dev_recon_loss.cpu().detach().numpy().item()) self.summary_writer.add_scalar( 'dev/kl-loss', dev_kl_loss.detach().cpu().detach().numpy(), self.epoch) self.run_logs['dev']['kl_loss'].append( dev_kl_loss.cpu().detach().numpy().item()) self.summary_writer.add_scalar( 'dev/total-loss', dev_loss.detach().cpu().detach().numpy(), self.epoch) 
self.run_logs['dev']['total_loss'].append( dev_loss.cpu().detach().numpy().item()) def do_one_sweep(self, iter, is_last_epoch, train_or_dev): if train_or_dev not in ['train', 'dev']: raise TypeError("train_or_dev should be either train or dev") if train_or_dev == "train": self.model.train() else: self.model.eval() sweep_loss = 0 sweep_recon_loss = 0 sweep_kl_loss = 0 sweep_accuracy = 0 n_batches = 0 for iteration, batch in enumerate(tqdm(iter)): # if len(batch) < self.batch_size and : # continue if train_or_dev == "train": self.step += 1 self.optimizer.zero_grad() # forward pass x, lengths = getattr(batch, self.dataset.input_type) input = x[:, :-1] # remove <eos> target = x[:, 1:] # remove <sos> lengths -= 1 # account for the removal input, target = to_device(input, self.force_cpu), to_device( target, self.force_cpu) y = None if self.model.conditional is not None: y = batch.intent.squeeze() y = to_device(y, self.force_cpu) sorted_lengths, sorted_idx = torch.sort(lengths, descending=True) y = y[sorted_idx] logp, mean, logv, logc, z, bow = self.model(input, lengths) if is_last_epoch: _, reversed_idx = torch.sort(sorted_idx) y = y[reversed_idx] logc = logc[reversed_idx] real_labels = [self.i2int[label] for label in y] pred_labels = [ self.i2int[label] if label < len(self.i2int) else 'None' for label in logc.max(1)[1] ] for real_label, pred_label in zip(real_labels, pred_labels): self.run_logs[train_or_dev]['classifications'][real_label][ pred_label] += 1 for real_label in real_labels: self.run_logs[train_or_dev]['transfer'][ real_label] += logc.sum(dim=0).cpu().detach() # save latent representation if train_or_dev == "train" and self.model.conditional: for i, intent in enumerate(y): self.latent_rep[self.i2int[intent]].append( z[i].cpu().detach().numpy()) # loss calculation loss, recon_loss, kl_loss, accuracy = self.compute_loss( logp, bow, target, lengths, mean, logv, logc, y, train_or_dev) sweep_loss += loss sweep_recon_loss += recon_loss sweep_kl_loss += kl_loss sweep_accuracy += accuracy n_batches += 1 if train_or_dev == "train": loss.backward() self.optimizer.step() if is_last_epoch: for intent1 in self.i2int: n_sentences = sum(self.run_logs[train_or_dev] ['classifications'][intent1].values()) self.run_logs[train_or_dev]['transfer'][intent1] /= n_sentences for intent2 in self.i2int: self.run_logs[train_or_dev]['classifications'][intent1][ intent2] /= n_sentences return sweep_loss / n_batches, sweep_recon_loss / n_batches, \ sweep_kl_loss / n_batches, sweep_accuracy / n_batches def compute_loss(self, logp, bow, target, length, mean, logv, logc, y, train_or_dev): batch_size, seqlen, vocab_size = logp.size() target = target.view(batch_size, -1) # reconstruction loss recon_loss = compute_recon_loss(self.dataset.pad_idx, vocab_size, length, logp, target) # kl loss kl_weight, kl_losses = compute_kl_loss(logv, mean, self.annealing_strategy, self.step, self.kl_anneal_rate, self.kl_anneal_time, self.kl_anneal_target) kl_loss = torch.sum(kl_losses) total_loss = (recon_loss + kl_weight * kl_loss) # bow loss if self.add_bow_loss: total_loss += compute_bow_loss(batch_size, bow, target) # labels loss if self.model.conditional == 'supervised': if 'None' in self.i2int: none_idx = self.int2i['None'] else: none_idx = -100 label_loss, label_weight = compute_label_loss( logc, y, self.annealing_strategy, self.step, self.label_anneal_time, self.label_anneal_rate, self.label_anneal_target, none_idx, self.alpha) total_loss += label_weight * label_loss elif self.model.conditional == 'unsupervised': entropy = 
torch.sum(
                torch.exp(logc) *
                torch.log(self.model.n_classes * torch.exp(logc)))
            total_loss += entropy

        # summaries
        if train_or_dev == "train":
            self.summary_writer.add_scalar(
                train_or_dev + '/recon-loss',
                recon_loss.detach().cpu().numpy() / batch_size, self.step)
            self.run_logs[train_or_dev]['recon_loss'].append(
                recon_loss.detach().cpu().numpy() / batch_size)
            for i in range(self.model.z_size):
                self.summary_writer.add_scalars(
                    train_or_dev + '/kl-losses', {
                        str(i):
                        kl_losses[i].detach().cpu().numpy().item() / batch_size
                    }, self.step)
                self.run_logs[train_or_dev]['kl_losses'][i].append(
                    kl_losses[i].detach().cpu().numpy().item() / batch_size)

        n_correct = 0
        accuracy = 0.0
        if self.model.conditional is not None:
            mask = y != self.int2i['None']  # ignore nones
            pred_labels = logc[mask].data.max(1)[1].long()
            true_labels = y[mask].data
            n_correct = pred_labels.eq(true_labels).cpu().sum().float().item()
            # guard against a batch that contains only 'None' labels
            accuracy = (n_correct / len(true_labels)
                        if len(true_labels) > 0 else 0.0)
            self.summary_writer.add_scalar(
                train_or_dev + '/conditioning-accuracy', accuracy, self.step)
            self.run_logs[train_or_dev]['conditioning_accuracy'].append(
                accuracy)

        return total_loss / batch_size, recon_loss / batch_size, \
            kl_loss / batch_size, accuracy
'''
Test whether scalars can be added after installing tensorboard.
'''
import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(comment='test_tensorboard')

for x in range(100):
    writer.add_scalar('y=2x', x * 2, x)
    writer.add_scalar('y=pow(2, x)', 2**x, x)
    writer.add_scalars('data/scalar_group', {
        "xsinx": x * np.sin(x),
        "xcosx": x * np.cos(x),
        "arctanx": np.arctan(x)
    }, x)

writer.close()
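# To verify the scalars above were actually written, the event files can be
# read back with TensorBoard's EventAccumulator. A sketch: the run directory
# is located by glob because the `comment=` suffix is appended to a
# timestamped name, and add_scalars() writes its grouped curves into
# subdirectories of the run.
import glob
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

run_dir = sorted(glob.glob('runs/*test_tensorboard*'))[-1]
ea = EventAccumulator(run_dir)
ea.Reload()
print(ea.Tags()['scalars'])    # expect 'y=2x' and 'y=pow(2, x)' here
print(ea.Scalars('y=2x')[:3])  # first few (wall_time, step, value) events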
    # accumulators for masked-set predictions/labels (cum_pred was missing)
    cum_pred = torch.Tensor().to(device)
    cum_labels = torch.Tensor().to(device)
    for batch_n, batch in enumerate(masked_loader):
        batch = batch.to(device)
        out, _ = model(batch)
        labels = batch.y.to(device)
        weights = generate_weights(labels).to(device)
        te_loss = F.binary_cross_entropy(out, target=labels, weight=weights)
        pred = out.detach().round().to(device)
        cum_labels = torch.cat((cum_labels, labels.clone().detach()), dim=0)
        cum_pred = torch.cat((cum_pred, pred.clone().detach()), dim=0)
    roc_auc_masked = roc_auc_score(cum_labels.cpu(), cum_pred.cpu())

    writer.add_scalars('Loss', {
        'train': tr_loss,
        'test': te_loss
    }, epoch)
    writer.add_scalars('ROC AUC', {
        'train': roc_auc,
        'test': roc_auc_te,
        'masked': roc_auc_masked
    }, epoch)
    writer.add_scalar('learning rate', learn_rate, epoch)

    print("---- Round {}: tr_loss={:.4f} te_roc_auc:{:.4f} lr:{:.6f}".format(
        epoch, tr_loss, roc_auc_te, learn_rate))

    # -------------- MODEL SAVING ------------------------
    if roc_auc_te > max_roc_auc:
        max_roc_auc = roc_auc_te
        path = './{}/best_{}.pt'.format(modelpath, model_n)
class TensorBoardVisualize(): def __init__(self, experiment_name, logdir, dic, hyperparam={"hyper":1}): current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") experiment_name = experiment_name+"_"+current_time self.tensorboard_writer = SummaryWriter( log_dir=pth.join(logdir,experiment_name), filename_suffix=experiment_name) self.comet_exp = comet_ml.Experiment(project_name="masterthesis") self.comet_exp.log_parameters(hyperparam) self.comet_exp.log_asset("train.py") self.comet_exp.log_asset("visualize.py") self.comet_exp.log_asset("dataset.py") self.comet_exp.log_asset("model.py") self.word_dic = {v: k for k, v in dic['word_dic'].items()} self.answer_dic = {v: k for k, v in dic['answer_dic'].items()} self.word_vect = np.vectorize(lambda x: self.word_dic[x] if x > 0 else "") self.answer_vect = np.vectorize(lambda x: self.answer_dic[x]) self.hooks = {} self.epoch = 0 self.step = 0 def set_epoch_step(self,epoch,step): self.epoch = epoch self.step = step def register_hook(self,key,hook): self.hooks[key] = hook def append_histogram(self, x, y, chart): self.tensorboard_writer.add_histogram(chart, y, x) #self.tensorboard_writer.close() def append_line(self, x, y_dic, chart): self.tensorboard_writer.add_scalars(chart, y_dic, x) #self.tensorboard_writer.close() def comet_line(self,y_dic,prefix): self.comet_exp.log_metrics(y_dic,prefix=prefix,epoch=self.epoch,step=self.step) def comet_image(self,images,chart): for i,comet_image in enumerate(images): self.comet_exp.log_image( comet_image.squeeze(0), name=f"{chart}_{i}", image_format="png", image_channels="first", step=self.step) def add_images(self,x,images,chart): self.tensorboard_writer.add_images( chart, images, global_step=x, walltime=None, dataformats='NCHW') self.comet_image(images,chart) # def add_conv2(self,x,module,chart,hook_name,mask,n_act,suffix=""): # #weights and gradients # weights = module.weight.data.cpu().numpy() # gradients = module.weight.grad.cpu().numpy() # self.append_histogram(x, weights.reshape(-1), f"{chart}_weights") # self.append_histogram(x, gradients.reshape(-1), f"{chart}_gradients") # #need hook # act_hook = self.hooks[hook_name] # act = act_hook.get_features()[mask][:n_act].mean(1,keepdim=True).cpu() # self.add_images( # x, # act, # f"{chart}_activations{suffix}") def add_conv2(self,x,module,chart,hook_name,mask,n_act,suffix=""): #weights and gradients if isinstance(module,Conv2dBatchAct): module = module.conv2d_batch_act[0] weights = module.weight.data.cpu().numpy() gradients = module.weight.grad.cpu().numpy() self.comet_exp.log_histogram_3d(weights, name=f"{chart}_weights", step=self.step) self.comet_exp.log_histogram_3d(gradients, name=f"{chart}_gradients", step=self.step) self.append_histogram(x, gradients.reshape(-1), f"{chart}_gradients") self.append_histogram(x, weights.reshape(-1), f"{chart}_weights") #need hook act_hook = self.hooks[hook_name] act = act_hook.get_features()[mask][0].unsqueeze(1).cpu() act = act - act.min() act = act / (act.max() - act.min()) self.add_images( x, act, f"{chart}_act_first_image{suffix}") def add_figure_with_question(self,x,image,question,answer,output,index,chart,suffix=""): norm_img = mpl.colors.Normalize(vmin=-1,vmax=1) visu_question = self.word_vect(question) visu_answer = self.answer_vect(answer) visu_output = self.answer_vect(output) figures = [] for idx in range(image.shape[0]): fig = plt.figure() a = fig.add_subplot(111) plt.imshow( norm_img(np.transpose(image[idx],[1,2,0])), vmin=0.,vmax=1.) 
            a.text(0, 0,
                   textwrap.fill(
                       f"{index[idx]}: " + " ".join(visu_question[idx]) +
                       f" Answer/Output: {visu_answer[idx]}/{visu_output[idx]}",
                       60),
                   wrap=True, ha='left', va='bottom')
            figures.append(fig)
            self.comet_exp.log_figure(
                figure_name=f"{chart}/sample{suffix}_{idx}",
                figure=fig,
                overwrite=False,
                step=self.step)
        self.tensorboard_writer.add_figure(f"{chart}/sample{suffix}",
                                           figures, x)

    def close(self):
        self.tensorboard_writer.close()
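# Hypothetical construction of the visualizer above. It assumes a configured
# comet_ml API key and the asset files referenced in __init__; `dic` must map
# words/answers to integer ids, mirroring the inversion in the constructor:
dic = {
    'word_dic': {'what': 1, 'color': 2},
    'answer_dic': {'red': 0, 'blue': 1},
}
viz = TensorBoardVisualize('clevr_baseline', 'logs', dic,
                           hyperparam={'lr': 1e-3, 'batch_size': 64})
viz.set_epoch_step(epoch=0, step=0)
viz.append_line(0, {'train': 1.25, 'val': 1.31}, 'loss')  # wraps add_scalars
viz.close()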