import os
import time
import logging
import sys
import copy

import numpy as np
import torch as th
import tensorboard
import tqdm
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils import clip_grad_norm_

from utils import compute_sdr, MAX_INT16, center_trim
from preprocess import Prep
from conv_tasnet import TasNet
from torch.nn import MSELoss

# Writer will output to ./runs/ directory by default; here an explicit directory is used.
writer = SummaryWriter("vis")

n_spks = 3


def load_obj(obj, device):
    """
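# --- Illustrative sketch (not part of the original script) ---
# Minimal SummaryWriter usage assumed throughout the snippets in this file: scalars logged
# under a tag end up in the chosen log_dir and can be viewed with
# `tensorboard --logdir demo_runs`. The "demo_runs" directory name is an assumption made
# only for this example.
from torch.utils.tensorboard import SummaryWriter

demo_writer = SummaryWriter(log_dir="demo_runs")
for step in range(10):
    demo_writer.add_scalar("demo/loss", 1.0 / (step + 1), global_step=step)
demo_writer.close()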
dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
opt.world_size = dist.get_world_size()
assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
opt.batch_size = opt.total_batch_size // opt.world_size
print(opt)

# Train
if not opt.evolve:
    tb_writer = None
    if opt.local_rank in [-1, 0]:
        print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
        tb_writer = SummaryWriter(log_dir=increment_dir('runs/exp', opt.name))
    train(hyp, opt, device, tb_writer)

# Evolve hyperparameters (optional)
else:
    # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
    meta = {
        'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
        'momentum': (0.1, 0.6, 0.98),  # SGD momentum/Adam beta1
        'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
        'giou': (1, 0.02, 0.2),  # GIoU loss gain
        'cls': (1, 0.2, 4.0),  # cls loss gain
        'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
        'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
def main(args): np.set_printoptions(precision=3) save_dir = os.getcwd() log = os.path.join(save_dir, "log.txt") # Setup SummaryWriter summary_dir = os.path.join(save_dir, "summary") if not os.path.exists(summary_dir): os.mkdir(summary_dir) writer = SummaryWriter(summary_dir) if args.run.s3_bucket is not None: aws_utils.download_from_s3(log, args.run.s3_bucket, log) train_utils.copy_code_to_experiment_dir("/code/nas-theory/cnn", save_dir) aws_utils.upload_directory(os.path.join(save_dir, "scripts"), args.run.s3_bucket) train_utils.set_up_logging(log) if not torch.cuda.is_available(): logging.info("no gpu device available") sys.exit(1) torch.cuda.set_device(args.run.gpu) logging.info("gpu device = %d" % args.run.gpu) logging.info("args = %s", args.pretty()) rng_seed = train_utils.RNGSeed(args.run.seed) if args.search.method in ["edarts", "gdarts", "eedarts"]: if args.search.fix_alphas: from architect.architect_edarts_edge_only import ( ArchitectEDARTS as Architect, ) else: from architect.architect_edarts import ArchitectEDARTS as Architect elif args.search.method in ["darts", "fdarts"]: from architect.architect_darts import ArchitectDARTS as Architect elif args.search.method == "egdas": from architect.architect_egdas import ArchitectEGDAS as Architect else: raise NotImplementedError if args.search.search_space in ["darts", "darts_small"]: from search_spaces.darts.model_search import DARTSNetwork as Network elif "nas-bench-201" in args.search.search_space: from search_spaces.nasbench_201.model_search import ( NASBENCH201Network as Network, ) elif args.search.search_space == "pcdarts": from search_spaces.pc_darts.model_search import PCDARTSNetwork as Network else: raise NotImplementedError if args.train.smooth_cross_entropy: criterion = train_utils.cross_entropy_with_label_smoothing else: criterion = nn.CrossEntropyLoss() num_train, num_classes, train_queue, valid_queue = train_utils.create_data_queues( args) print("dataset: {}, num_classes: {}".format(args.run.dataset, num_classes)) model = Network( args.train.init_channels, num_classes, args.search.nodes, args.train.layers, criterion, **{ "auxiliary": args.train.auxiliary, "search_space_name": args.search.search_space, "exclude_zero": args.search.exclude_zero, "track_running_stats": args.search.track_running_stats, }) model = model.cuda() logging.info("param size = %fMB", train_utils.count_parameters_in_MB(model)) optimizer, scheduler = train_utils.setup_optimizer(model, args) # TODO: separate args by model, architect, etc # TODO: look into using hydra for config files architect = Architect(model, args, writer) # Try to load a previous checkpoint try: start_epochs, history = train_utils.load(save_dir, rng_seed, model, optimizer, architect, args.run.s3_bucket) scheduler.last_epoch = start_epochs - 1 ( num_train, num_classes, train_queue, valid_queue, ) = train_utils.create_data_queues(args) except Exception as e: logging.info(e) start_epochs = 0 best_valid = 0 for epoch in range(start_epochs, args.run.epochs): lr = scheduler.get_lr()[0] logging.info("epoch %d lr %e", epoch, lr) model.drop_path_prob = args.train.drop_path_prob * epoch / args.run.epochs # training train_acc, train_obj = train( args, train_queue, valid_queue, model, architect, criterion, optimizer, lr, ) architect.baseline = train_obj architect.update_history() architect.log_vars(epoch, writer) if "update_lr_state" in dir(scheduler): scheduler.update_lr_state(train_obj) logging.info("train_acc %f", train_acc) # History tracking for vs in [("alphas", architect.alphas), ("edges", 
architect.edges)]: for ct in vs[1]: v = vs[1][ct] logging.info("{}-{}".format(vs[0], ct)) logging.info(v) # Calling genotypes sets alphas to best arch for EGDAS and MEGDAS # so calling here before infer. genotype = architect.genotype() logging.info("genotype = %s", genotype) if not args.search.single_level: valid_acc, valid_obj = train_utils.infer( valid_queue, model, criterion, report_freq=args.run.report_freq, discrete=args.search.discrete, ) if valid_acc > best_valid: best_valid = valid_acc best_genotype = architect.genotype() logging.info("valid_acc %f", valid_acc) train_utils.save( save_dir, epoch + 1, rng_seed, model, optimizer, architect, save_history=True, s3_bucket=args.run.s3_bucket, ) scheduler.step() valid_acc, valid_obj = train_utils.infer( valid_queue, model, criterion, report_freq=args.run.report_freq, discrete=args.search.discrete, ) if valid_acc > best_valid: best_valid = valid_acc best_genotype = architect.genotype() logging.info("valid_acc %f", valid_acc) if args.run.s3_bucket is not None: filename = "cnn_genotypes.txt" aws_utils.download_from_s3(filename, args.run.s3_bucket, filename) with open(filename, "a+") as f: f.write("\n") f.write("{}{}{}{} = {}".format( args.search.search_space, args.search.method, args.run.dataset.replace("-", ""), args.run.seed, best_genotype, )) aws_utils.upload_to_s3(filename, args.run.s3_bucket, filename) aws_utils.upload_to_s3(log, args.run.s3_bucket, log)
def train(model, training_data, validation_data, optimizer, device, opt):
    ''' Start training '''

    # Use tensorboard to plot curves, e.g. perplexity, accuracy, learning rate
    if opt.use_tb:
        from torch.utils.tensorboard import SummaryWriter
        tb_writer = SummaryWriter(log_dir=os.path.join(opt.output_dir, 'tensorboard'))

    log_train_file = os.path.join(opt.output_dir, 'train.log')
    log_valid_file = os.path.join(opt.output_dir, 'valid.log')

    print('[Info] Training performance will be written to file: {} and {}'.format(
        log_train_file, log_valid_file))

    with open(log_train_file, 'w') as log_tf, open(log_valid_file, 'w') as log_vf:
        log_tf.write('epoch,loss,ppl,accuracy\n')
        log_vf.write('epoch,loss,ppl,accuracy\n')

    def print_performances(header, ppl, accu, start_time, lr):
        print(' - {header:12} ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, lr: {lr:8.5f}, '
              'elapse: {elapse:3.3f} min'.format(
                  header=f"({header})", ppl=ppl, accu=100 * accu,
                  elapse=(time.time() - start_time) / 60, lr=lr))

    #valid_accus = []
    valid_losses = []
    for epoch_i in range(opt.epoch):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu = train_epoch(
            model, training_data, optimizer, opt, device, smoothing=opt.label_smoothing)
        train_ppl = math.exp(min(train_loss, 100))
        # Current learning rate
        lr = optimizer._optimizer.param_groups[0]['lr']
        print_performances('Training', train_ppl, train_accu, start, lr)

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, device, opt)
        valid_ppl = math.exp(min(valid_loss, 100))
        print_performances('Validation', valid_ppl, valid_accu, start, lr)

        valid_losses += [valid_loss]

        checkpoint = {'epoch': epoch_i, 'settings': opt, 'model': model.state_dict()}

        if opt.save_mode == 'all':
            model_name = 'model_accu_{accu:3.3f}.chkpt'.format(accu=100 * valid_accu)
            torch.save(checkpoint, model_name)
        elif opt.save_mode == 'best':
            model_name = 'model.chkpt'
            if valid_loss <= min(valid_losses):
                torch.save(checkpoint, os.path.join(opt.output_dir, model_name))
                print(' - [Info] The checkpoint file has been updated.')

        with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
            log_tf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=train_loss, ppl=train_ppl, accu=100 * train_accu))
            log_vf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=valid_loss, ppl=valid_ppl, accu=100 * valid_accu))

        if opt.use_tb:
            tb_writer.add_scalars('ppl', {'train': train_ppl, 'val': valid_ppl}, epoch_i)
            tb_writer.add_scalars('accuracy', {'train': train_accu * 100, 'val': valid_accu * 100}, epoch_i)
            tb_writer.add_scalar('learning_rate', lr, epoch_i)
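# --- Illustrative sketch (not from the training loop above) ---
# The loop above groups train/val curves under one tag with add_scalars; this standalone
# example repeats the same pattern with made-up values, so the resulting TensorBoard layout
# (one chart per tag, one line per sub-key) is easy to reproduce in isolation.
from torch.utils.tensorboard import SummaryWriter

sketch_writer = SummaryWriter(log_dir="demo_runs/add_scalars")
for epoch in range(5):
    sketch_writer.add_scalars('ppl', {'train': 10.0 - epoch, 'val': 11.0 - epoch}, epoch)
sketch_writer.close()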
class PIRL(Task): def __init__(self, backbone: BackboneBase, projector: HeadBase, memory: MemoryBank, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler._LRScheduler, loss_function: PIRLLoss, loss_weight: float, num_negatives: int, metrics: dict, checkpoint_dir: str, write_summary: bool ): super(PIRL, self).__init__() assert isinstance(memory, MemoryBank) assert isinstance(loss_function, PIRLLoss) self.backbone = backbone self.projector = projector self.memory = memory self.optimizer = optimizer self.scheduler = scheduler self.loss_function = loss_function self.loss_weight = loss_weight self.num_negatives = num_negatives self.metrics = metrics if isinstance(metrics, dict) else None self.checkpoint_dir = checkpoint_dir os.makedirs(self.checkpoint_dir, exist_ok=True) self.writer = SummaryWriter(log_dir=self.checkpoint_dir) if write_summary else None def run(self, train_set, valid_set, epochs, batch_size, num_workers=0, device='cuda', **kwargs): # pylint: disable=unused-argument assert isinstance(train_set, torch.utils.data.Dataset) assert isinstance(valid_set, torch.utils.data.Dataset) assert isinstance(epochs, int) assert isinstance(batch_size, int) assert isinstance(num_workers, int) assert device.startswith('cuda') or device == 'cpu' logger = kwargs.get('logger', None) self.backbone = self.backbone.to(device) self.projector = self.projector.to(device) train_loader = get_dataloader(train_set, batch_size, num_workers=num_workers) valid_loader = get_dataloader(valid_set, batch_size, num_workers=num_workers) # Initialize training memory if not self.memory.initialized: self.memory.initialize(self.backbone, self.projector, train_loader) with tqdm.tqdm(**get_tqdm_config(total=epochs, leave=True, color='blue')) as pbar: best_valid_loss = float('inf') best_epoch = 0 for epoch in range(1, epochs + 1): # 0. Train & evaluate train_history = self.train(train_loader, device=device) valid_history = self.evaluate(valid_loader, device=device) # 1. Epoch history (loss) epoch_history = { 'loss': { 'train': train_history.get('loss'), 'valid': valid_history.get('loss') }, } # 2. Epoch history (other metrics if provided) if self.metrics is not None: assert isinstance(self.metrics, dict) for metric_name, _ in self.metrics.items(): epoch_history[metric_name] = { 'train': train_history.get(metric_name), 'valid': valid_history.get(metric_name), } # 3. Tensorboard if self.writer is not None: for metric_name, metric_dict in epoch_history.items(): self.writer.add_scalars( main_tag=metric_name, tag_scalar_dict=metric_dict, global_step=epoch ) if self.scheduler is not None: self.writer.add_scalar( tag='lr', scalar_value=self.scheduler.get_last_lr()[0], global_step=epoch ) # 4-1. Save model if it is the current best valid_loss = epoch_history['loss']['valid'] if valid_loss < best_valid_loss: best_valid_loss = valid_loss best_epoch = epoch self.save_checkpoint(self.best_ckpt, epoch=epoch, **epoch_history) self.memory.save(os.path.join(os.path.dirname(self.best_ckpt), 'best_memory.pt'), epoch=epoch) # 4-2. Save intermediate models if isinstance(kwargs.get('save_every'), int): if epoch % kwargs.get('save_every') == 0: new_ckpt = os.path.join(self.checkpoint_dir, f'epoch_{epoch:04d}.loss_{valid_loss:.4f}.pt') # No need to save memory self.save_checkpoint(new_ckpt, epoch=epoch, **epoch_history) # 5. Update learning rate scheduler if self.scheduler is not None: self.scheduler.step() # 6. 
Logging desc = make_epoch_description( history=epoch_history, current=epoch, total=epochs, best=best_epoch ) pbar.set_description_str(desc) pbar.update(1) if logger is not None: logger.info(desc) # 7. Save last model self.save_checkpoint(self.last_ckpt, epoch=epoch, **epoch_history) self.memory.save(os.path.join(os.path.dirname(self.last_ckpt), 'last_memory.pt'), epoch=epoch) # 8. Test model (optional) if 'test_set' in kwargs.keys(): test_loader = get_dataloader(kwargs.get('test_set'), batch_size, num_workers=num_workers) self.test(test_loader, device=device, logger=logger) def train(self, data_loader: torch.utils.data.DataLoader, device: str, **kwargs): # pylint: disable=unused-argument """Train function defined for a single epoch.""" preds = [] train_loss = 0. steps_per_epoch = len(data_loader) self._set_learning_phase(train=True) with tqdm.tqdm(**get_tqdm_config(steps_per_epoch, leave=False, color='green')) as pbar: for i, batch in enumerate(data_loader): j = batch['idx'] x = batch['x'].to(device) x_t = batch['x_t'].to(device) z = self.predict(x) z_t = self.predict(x_t) m = self.memory.get_representations(j).to(device) negatives = self.memory.get_negatives(self.num_negatives, exclude=j) # Calculate loss loss_z, _ = self.loss_function( anchors=m, positives=z, negatives=negatives, ) loss_z_t, logits = self.loss_function( anchors=m, positives=z_t, negatives=negatives, ) loss = (1 - self.loss_weight) * loss_z + self.loss_weight * loss_z_t # Backpropagation & update loss.backward() self.optimizer.step() self.optimizer.zero_grad() self.memory.update(j, values=z.detach()) train_loss += loss.detach().item() preds += [logits.detach().cpu()] desc = f" Batch: [{i+1:>4}/{steps_per_epoch:>4}]" desc += f" Loss: {train_loss/(i+1):.4f} " pbar.set_description_str(desc) pbar.update(1) out = {'loss': train_loss / steps_per_epoch} if self.metrics is not None: assert isinstance(self.metrics, dict) with torch.no_grad(): preds = torch.cat(preds, dim=0) # (N, 1+ num_negatives) trues = torch.zeros(preds.size(0), device=preds.device) # (N, ) for metric_name, metric_function in self.metrics.items(): out[metric_name] = metric_function(preds, trues).item() return out def evaluate(self, data_loader: torch.utils.data.DataLoader, device: str, **kwargs): # pylint: disable=unused-argument """Evaluate current model. A single pass through the given dataset.""" preds = [] valid_loss = 0. steps_per_epoch = len(data_loader) self._set_learning_phase(train=False) with torch.no_grad(): for _, batch in enumerate(data_loader): j = batch['idx'] x = batch['x'].to(device) x_t = batch['x_t'].to(device) z = self.predict(x) z_t = self.predict(x_t) negatives = self.memory.get_negatives(self.num_negatives, exclude=j) # Note that no memory representation (m) exists for the validation data. 
loss, logits = self.loss_function( anchors=z, positives=z_t, negatives=negatives, ) valid_loss += loss.item() preds += [logits.detach().cpu()] out = {'loss': valid_loss / steps_per_epoch} if self.metrics is not None: assert isinstance(self.metrics, dict) preds = torch.cat(preds, dim=0) # (N, 1+ num_negatives) trues = torch.zeros(preds.size(0), device=preds.device) # (N, ) for metric_name, metric_function in self.metrics.items(): out[metric_name] = metric_function(preds, trues).item() return out def predict(self, x: torch.Tensor): return self.projector(self.backbone(x)) def test(self, data_loader: torch.utils.data.DataLoader, device: str, logger = None): """Evaluate best model on test data.""" def test_on_ckpt(ckpt: str): """Load checkpoint history and add test metric values.""" self.load_model_from_checkpoint(ckpt) ckpt_history = self.load_history_from_checkpoint(ckpt) test_history = self.evaluate(data_loader, device) for metric_name, metric_val in test_history.items(): ckpt_history[metric_name]['test'] = metric_val return ckpt_history def make_description(history: dict, prefix: str = ''): desc = f" {prefix} ({history['epoch']:>4d}): " for metric_name, metric_dict in history.items(): if metric_name == 'epoch': continue for k, v in metric_dict.items(): desc += f" {k}_{metric_name}: {v:.4f} |" return desc # 1. Best model best_history = test_on_ckpt(self.best_ckpt) desc = make_description(best_history, prefix='Best model') print(desc) if logger is not None: logger.info(desc) with open(os.path.join(self.checkpoint_dir, 'best_history.json'), 'w') as fp: json.dump(best_history, fp, indent=2) # 2. Last model last_history = test_on_ckpt(self.last_ckpt) desc = make_description(last_history, prefix='Last model') print(desc) if logger is not None: logger.info(desc) with open(os.path.join(self.checkpoint_dir, 'last_history.json'), 'w') as fp: json.dump(best_history, fp, indent=2) def _set_learning_phase(self, train=False): if train: self.backbone.train() self.projector.train() else: self.backbone.eval() self.projector.eval() def save_checkpoint(self, path: str, **kwargs): ckpt = { 'backbone': self.backbone.state_dict(), 'projector': self.projector.state_dict(), 'optimizer': self.optimizer.state_dict(), 'scheduler': self.scheduler.state_dict() if \ self.scheduler is not None else None } if kwargs: ckpt.update(kwargs) torch.save(ckpt, path) def load_model_from_checkpoint(self, path: str): ckpt = torch.load(path) self.backbone.load_state_dict(ckpt['backbone']) self.projector.load_state_dict(ckpt['projector']) self.optimizer.load_state_dict(ckpt['optimizer']) if self.scheduler is not None: self.scheduler.load_state_dict(ckpt['scheduler']) def load_history_from_checkpoint(self, path: str): ckpt = torch.load(path) del ckpt['backbone'] del ckpt['projector'] del ckpt['optimizer'] del ckpt['scheduler'] return ckpt
def train_engine(__C):
    # define network
    net = get_network(__C)
    net = net.cuda()

    __C.batch_size = __C.batch_size // __C.gradient_accumulation_steps

    # define dataloader
    train_loader = get_train_loader(__C)
    test_loader = get_test_loader(__C)

    # define loss function
    if __C.label_smoothing:
        loss_function = LabelSmoothingCrossEntropy(__C.smoothing)
    else:
        loss_function = nn.CrossEntropyLoss()

    # define optimizer and training parameters
    if __C.no_bias_decay:
        params = split_weights(net)
    else:
        params = net.parameters()
    optimizer = optim.SGD(params, lr=__C.lr, momentum=0.9, weight_decay=5e-4)

    # define optimizer scheduler
    # len(train_loader) is the number of steps in one epoch
    warmup_steps = __C.warmup_steps
    total_steps = __C.num_steps

    # change epoch milestones into steps
    __C.milestones = [m * len(train_loader) for m in __C.milestones]

    if __C.decay_type == 'multi_step':
        train_scheduler = WarmupMultiStepSchedule(__C, optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    elif __C.decay_type == 'cosine':
        train_scheduler = WarmupCosineSchedule(optimizer,
                                               warmup_steps=warmup_steps,
                                               t_total=total_steps)
    elif __C.decay_type == 'linear':
        train_scheduler = WarmupLinearSchedule(optimizer,
                                               warmup_steps=warmup_steps,
                                               t_total=total_steps)

    # define tensorboard writer
    writer = SummaryWriter(
        log_dir=os.path.join(__C.tensorboard_log_dir, __C.model, __C.version))

    # define model save dir
    checkpoint_path = os.path.join(__C.ckpts_dir, __C.model, __C.version)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{global_step}-{type}.pth')

    # define log save dir
    log_path = os.path.join(__C.result_log_dir, __C.model)
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_path = os.path.join(log_path, __C.version + '.txt')

    # write the hyper parameters to log
    logfile = open(log_path, 'a+')
    logfile.write(str(__C))
    logfile.close()

    # Train!
    logger.info(" ***** Running training *****")
    logger.info(" Total optimization steps = %d", __C.num_steps)
    logger.info(" Instantaneous batch size per GPU = %d", __C.batch_size)
    logger.info(" Gradient Accumulation steps = %d", __C.gradient_accumulation_steps)

    net.zero_grad()
    losses = AverageMeter()
    global_step, best_acc = 0, 0

    while True:
        net.train()
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True)
        for step, (images, labels) in enumerate(epoch_iterator):
            images = images.cuda()
            labels = labels.cuda()

            train_outputs = net(images)
            loss = loss_function(train_outputs, labels)
            if __C.gradient_accumulation_steps > 1:
                loss = loss / __C.gradient_accumulation_steps
            loss.backward()

            if (step + 1) % __C.gradient_accumulation_steps == 0:
                losses.update(loss.item() * __C.gradient_accumulation_steps)
                torch.nn.utils.clip_grad_norm_(net.parameters(), __C.max_grad_norm)
                train_scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                epoch_iterator.set_description(
                    "Training (%d / %d Steps) (loss=%2.5f)" %
                    (global_step, total_steps, losses.val))
                writer.add_scalar("[Step] Train/loss",
                                  scalar_value=losses.val,
                                  global_step=global_step)
                writer.add_scalar("[Step] Train/lr",
                                  scalar_value=train_scheduler.get_lr()[0],
                                  global_step=global_step)

                if global_step % __C.eval_every == 0:
                    accuracy = valid(__C,
                                     model=net,
                                     writer=writer,
                                     test_loader=test_loader,
                                     global_step=global_step,
                                     loss_function=loss_function)
                    if best_acc < accuracy:
                        torch.save(
                            net.state_dict(),
                            checkpoint_path.format(net=__C.model,
                                                   global_step=global_step,
                                                   type='best'))
                        best_acc = accuracy
                    net.train()

                if global_step % total_steps == 0:
                    break
        losses.reset()
        if global_step % total_steps == 0:
            break

    writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
def main(argv):
    writer = SummaryWriter()

    torch.manual_seed(FLAGS.random_seed)
    np.random.seed(FLAGS.random_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(FLAGS.random_seed)
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

    device = torch.device(FLAGS.device)
    kwargs = {"num_workers": 1, "pin_memory": True} if FLAGS.device == "cuda" else {}

    train_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST(
            root=".",
            train=True,
            download=True,
            transform=torchvision.transforms.Compose([
                # torchvision.transforms.RandomCrop(size=[28,28], padding=4),
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.1307,), (0.3081,)),
            ]),
        ),
        batch_size=FLAGS.batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST(
            root=".",
            train=False,
            transform=torchvision.transforms.Compose([
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.1307,), (0.3081,)),
            ]),
        ),
        batch_size=FLAGS.batch_size,
        **kwargs,
    )

    label = os.environ.get("SLURM_JOB_ID", str(uuid.uuid4()))
    if FLAGS.prefix:
        path = f"runs/mnist/{FLAGS.prefix}/{label}"
    else:
        path = f"runs/mnist/{label}"
    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    FLAGS.append_flags_into_file("flags.txt")

    input_features = 28 * 28
    output_features = 10

    model = LIFConvNet(
        input_features,
        FLAGS.seq_length,
        model=FLAGS.model,
        device=device,
        refrac=FLAGS.refrac,
        only_first_spike=FLAGS.only_first_spike,
    ).to(device)

    if FLAGS.optimizer == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=FLAGS.learning_rate)
    elif FLAGS.optimizer == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=FLAGS.learning_rate)
    if FLAGS.only_output:
        optimizer = torch.optim.Adam(model.out.parameters(), lr=FLAGS.learning_rate)

    training_losses = []
    mean_losses = []
    test_losses = []
    accuracies = []

    for epoch in range(FLAGS.epochs):
        training_loss, mean_loss = train(model, device, train_loader, optimizer, epoch, writer=writer)
        test_loss, accuracy = test(model, device, test_loader, epoch, writer=writer)

        training_losses += training_loss
        mean_losses.append(mean_loss)
        test_losses.append(test_loss)
        accuracies.append(accuracy)

        max_accuracy = np.max(np.array(accuracies))
        if (epoch % FLAGS.model_save_interval == 0) and FLAGS.save_model:
            model_path = f"mnist-{epoch}.pt"
            save(
                model_path,
                model=model,
                optimizer=optimizer,
                epoch=epoch,
                is_best=accuracy > max_accuracy,
            )

    np.save("training_losses.npy", np.array(training_losses))
    np.save("mean_losses.npy", np.array(mean_losses))
    np.save("test_losses.npy", np.array(test_losses))
    np.save("accuracies.npy", np.array(accuracies))

    model_path = "mnist-final.pt"
    save(
        model_path,
        epoch=epoch,
        model=model,
        optimizer=optimizer,
        is_best=accuracy > max_accuracy,
    )
    writer.close()
def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
        permute_masks=None, logger=None):
    batch_size = 30
    losses, accs, losses_wo, accs_wo = [], [], [], []
    perm = torch.randperm(dataset[0].num_nodes)

    for k in range(runs):
        model.to(device).reset_parameters()
        optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        best_val_perf = test_perf = 0
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        writer = SummaryWriter('runs/{}_{}'.format(k, tt))

        data = dataset[0]
        data = data.to(device)
        num_nodes = data.num_nodes

        if os.path.isfile('{}_{}.pkl'.format(str(dataset)[:-2], k)):
            data = pickle.load(open('{}_{}.pkl'.format(str(dataset)[:-2], k), 'rb'))
        else:
            pivot = int(num_nodes * 0.1)
            cold_mask_node = perm[list(range(k * pivot, (k + 1) * pivot))]
            data.test_masked_nodes = cold_mask_node
            train_node = range(num_nodes)
            train_node = [e for e in train_node if e not in cold_mask_node]  # or unknown
            data = test_edges(data, cold_mask_node)

            val_mask_node = random.sample(train_node, int(pivot * 0.5))
            data.val_masked_nodes = torch.tensor(val_mask_node)
            data = val_edges(data, val_mask_node)
            train_node = [e for e in train_node if e not in val_mask_node]  # or unknown
            data.train_nodes = train_node

            data.train_masked_nodes = torch.tensor(random.sample(train_node, int(num_nodes * 0.1)))
            data = train_edges(data, data.train_masked_nodes)

            with open('{}_{}.pkl'.format(str(dataset)[:-2], k), 'wb') as f:
                pickle.dump(data, f)

        print("{}-fold Result".format(k))
        train_node = data.train_nodes

        loss_wo, acc_wo = run_(data, dataset, data.train_edge_index, train_node, writer)
        losses_wo.append(loss_wo)
        accs_wo.append(acc_wo)

        scheduler = StepLR(optimizer, step_size=2000, gamma=0.5)
        for epoch in range(2000):
            with torch.autograd.set_detect_anomaly(True):
                train_gan(dataset, data, writer)
        for epoch in range(5000):
            with torch.autograd.set_detect_anomaly(True):
                train_loss = train(model, optimizer, data, epoch)
                scheduler.step()

        if torch.cuda.is_available():
            torch.cuda.synchronize()

        loss, acc = evaluate(model, data)
        losses.append(loss)
        accs.append(acc)
        print('Val Loss: {:.4f}, Test Accuracy: {:.3f}'.format(loss, acc))

    losses, accs, losses_wo, accs_wo = tensor(losses), tensor(accs), tensor(losses_wo), tensor(accs_wo)
    print('w/o Mean Val Loss: {:.4f}, Mean Test Accuracy: {:.3f} ± {:.3f}'.format(
        losses_wo.mean().item(), accs_wo.mean().item(), accs_wo.std().item()))
    print('Mean Val Loss: {:.4f}, Mean Test Accuracy: {:.3f} ± {:.3f}'.format(
        losses.mean().item(), accs.mean().item(), accs.std().item()))
    def forward(self, octree):
        depth = self.depth
        data = ocnn.octree_property(octree, 'feature', depth)
        assert data.size(1) == self.channel_in

        pool_idx = [None] * (depth + 1)
        for i, d in enumerate(range(depth, 2, -1)):
            data = self.convs[i](data, octree)
            data, pool_idx[d] = self.pools[i](data, octree)

        for i, d in enumerate(range(2, depth)):
            data = self.deconvs[i](data, octree)
            data = self.unpools[i](data, pool_idx[d + 1], octree)

        data = self.deconv(data, octree)
        data = self.header(data)
        return data


if __name__ == '__main__':
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter('logs/segnet')

    octree = ocnn.octree_batch(ocnn.octree_samples(['octree_1', 'octree_2']))
    model = SegNet(depth=5, channel_in=3, nout=4)
    print(model)

    octree = octree.cuda()
    model = model.cuda()
    writer.add_graph(model, octree)
    writer.flush()
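# --- Illustrative sketch (not from the ocnn example above) ---
# add_graph traces the model's forward pass with the example input it is given, so the input
# only has to be something the model accepts; here a tiny Linear model and a random tensor
# stand in for SegNet and the octree batch.
import torch
from torch.utils.tensorboard import SummaryWriter

graph_writer = SummaryWriter('logs/graph_demo')
toy_model = torch.nn.Linear(8, 2)
graph_writer.add_graph(toy_model, torch.randn(1, 8))
graph_writer.flush()
graph_writer.close()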
def train(self, dataset): if self.torch_manual_seed: torch.random.manual_seed(self.torch_manual_seed) # create PyTorch datasets dataset_train = dataset.create_torch_dataset( part='train', reshape=((1, ) + dataset.space[0].shape, (1, ) + dataset.space[1].shape)) dataset_validation = dataset.create_torch_dataset( part='validation', reshape=((1, ) + dataset.space[0].shape, (1, ) + dataset.space[1].shape)) # reset model before training self.init_model() criterion = torch.nn.MSELoss() self.init_optimizer(dataset_train=dataset_train) # create PyTorch dataloaders data_loaders = { 'train': DataLoader(dataset_train, batch_size=self.batch_size, num_workers=self.num_data_loader_workers, shuffle=True, pin_memory=True), 'validation': DataLoader(dataset_validation, batch_size=self.batch_size, num_workers=self.num_data_loader_workers, shuffle=True, pin_memory=True) } dataset_sizes = { 'train': len(dataset_train), 'validation': len(dataset_validation) } self.init_scheduler(dataset_train=dataset_train) if self.scheduler is not None: schedule_every_batch = isinstance(self.scheduler, (CyclicLR, OneCycleLR)) best_model_wts = deepcopy(self.model.state_dict()) best_psnr = 0 if self.log_dir is not None: writer = SummaryWriter(log_dir=self.log_dir, max_queue=0) validation_samples = dataset.get_data_pairs( 'validation', self.log_num_validation_samples) self.model.to(self.device) self.model.train() for epoch in range(self.epochs): # Each epoch has a training and validation phase for phase in ['train', 'validation']: if phase == 'train': self.model.train() # Set model to training mode else: self.model.eval() # Set model to evaluate mode running_psnr = 0.0 running_loss = 0.0 running_size = 0 with tqdm(data_loaders[phase], desc='epoch {:d}'.format(epoch + 1), disable=not self.show_pbar) as pbar: for inputs, labels in pbar: if self.normalize_by_opnorm: inputs = (1. 
/ self.opnorm) * inputs inputs = inputs.to(self.device) labels = labels.to(self.device) # zero the parameter gradients self.optimizer.zero_grad() # forward # track gradients only if in train phase with torch.set_grad_enabled(phase == 'train'): outputs = self.model(inputs) loss = criterion(outputs, labels) # backward + optimize only if in training phase if phase == 'train': loss.backward() torch.nn.utils.clip_grad_norm_( self.model.parameters(), max_norm=1) self.optimizer.step() if (self.scheduler is not None and schedule_every_batch): self.scheduler.step() for i in range(outputs.shape[0]): labels_ = labels[i, 0].detach().cpu().numpy() outputs_ = outputs[i, 0].detach().cpu().numpy() running_psnr += PSNR(outputs_, labels_) # statistics running_loss += loss.item() * outputs.shape[0] running_size += outputs.shape[0] pbar.set_postfix({ 'phase': phase, 'loss': running_loss / running_size, 'psnr': running_psnr / running_size }) if self.log_dir is not None and phase == 'train': step = (epoch * ceil( dataset_sizes['train'] / self.batch_size) + ceil(running_size / self.batch_size)) writer.add_scalar( 'loss/{}'.format(phase), torch.tensor(running_loss / running_size), step) writer.add_scalar( 'psnr/{}'.format(phase), torch.tensor(running_psnr / running_size), step) if self.scheduler is not None and not schedule_every_batch: self.scheduler.step() epoch_loss = running_loss / dataset_sizes[phase] epoch_psnr = running_psnr / dataset_sizes[phase] if self.log_dir is not None and phase == 'validation': step = (epoch + 1) * ceil( dataset_sizes['train'] / self.batch_size) writer.add_scalar('loss/{}'.format(phase), epoch_loss, step) writer.add_scalar('psnr/{}'.format(phase), epoch_psnr, step) # deep copy the model (if it is the best one seen so far) if phase == 'validation' and epoch_psnr > best_psnr: best_psnr = epoch_psnr best_model_wts = deepcopy(self.model.state_dict()) if self.save_best_learned_params_path is not None: self.save_learned_params( self.save_best_learned_params_path) if (phase == 'validation' and self.log_dir is not None and self.log_num_validation_samples > 0): with torch.no_grad(): val_images = [] for (y, x) in validation_samples: y = torch.from_numpy(np.asarray(y))[None, None].to( self.device) x = torch.from_numpy(np.asarray(x))[None, None].to( self.device) reco = self.model(y) reco -= torch.min(reco) reco /= torch.max(reco) val_images += [reco, x] writer.add_images( 'validation_samples', torch.cat(val_images), (epoch + 1) * (ceil(dataset_sizes['train'] / self.batch_size)), dataformats='NCWH') print('Best val psnr: {:4f}'.format(best_psnr)) self.model.load_state_dict(best_model_wts)
def run(data_path, model_path, stump_type, gpu_name, batch_size, num_epochs, num_workers):
    """
    Main method to train, evaluate and test the multiple-instance-learning approach to
    classify the Paxos dataset into refer- and nonreferable retinopathy.
    :param data_path: Absolute path to the dataset. The folder should have folders for training (train),
     evaluation (val) and corresponding label files
    :param model_path: Absolute path to the pretrained model
    :param gpu_name: ID of the gpu (e.g. cuda0)
    :param batch_size: Batch size
    :param num_epochs: Maximum number of training epochs
    :param num_workers: Number of threads used for data loading
    :return: f1-score for the evaluation (or test) set
    """
    device = torch.device(gpu_name if torch.cuda.is_available() else "cpu")
    print(f'Using device {device}')

    hyperparameter = {
        'data': os.path.basename(os.path.normpath(data_path)),
        'learning_rate': 1e-4,
        'weight_decay': 1e-3,
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'optimizer': optim.Adam.__name__,
        'freeze': 0.0,
        'balance': 0.35,
        'image_size': 450,
        'crop_size': 399,
        'pretraining': True,
        'preprocessing': False,
        'stump': stump_type,
        'attention_neurons': 738,
        'bag_size': 75,
        'attention': 'normal',  # normal / gated
        'pooling': 'max'        # avg / max / none
    }
    os.mkdir(RES_PATH)
    with open(os.path.join(RES_PATH, 'hp.txt'), 'w') as f:
        print(hyperparameter, file=f)

    aug_pipeline_train = get_training_pipeline(hyperparameter['image_size'], hyperparameter['crop_size'])
    aug_pipeline_val = get_validation_pipeline(hyperparameter['image_size'], hyperparameter['crop_size'])

    hyperparameter_str = str(hyperparameter).replace(', \'', ',\n \'')[1:-1]
    print(f'Hyperparameter info:\n {hyperparameter_str}')

    loaders = prepare_dataset(data_path, hyperparameter, aug_pipeline_train, aug_pipeline_val, num_workers)
    net = prepare_network(model_path, hyperparameter, device)

    optimizer_ft = optim.Adam([{'params': net.feature_extractor_part1.parameters(), 'lr': 1e-5},
                               {'params': net.feature_extractor_part2.parameters()},  # , 'lr': 1e-5},
                               {'params': net.attention.parameters()},
                               {'params': net.att_v.parameters()},
                               {'params': net.att_u.parameters()},
                               {'params': net.att_weights.parameters()},
                               {'params': net.classifier.parameters()}],
                              lr=hyperparameter['learning_rate'],
                              weight_decay=hyperparameter['weight_decay'])
    # optimizer_ft = optim.Adam(net.parameters(), lr=hyperparameter['learning_rate'], weight_decay=hyperparameter['weight_decay'])
    criterion = nn.CrossEntropyLoss()
    plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_ft, mode='min', factor=0.1, patience=15, verbose=True)

    desc = f'_paxos_mil_{str("_".join([k[0] + str(hp) for k, hp in hyperparameter.items()]))}'
    writer = SummaryWriter(comment=desc)

    best_model = train_model(net, criterion, optimizer_ft, plateau_scheduler, loaders, device, writer,
                             hyperparameter, num_epochs=hyperparameter['num_epochs'], description=desc)
    _, f1 = validate(best_model, criterion, loaders[1], device, writer, hyperparameter,
                     hyperparameter['num_epochs'], calc_roc=True)
    return f1
def main(): parser = argparse.ArgumentParser( description='Test', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('data', metavar='DATA', help='path to file') parser.add_argument('--tb-save-path', dest='tb_save_path', metavar='PATH', default='../checkpoints/', help='tensorboard checkpoints path') parser.add_argument('--lgd-weight', dest='lgd_weight', metavar='PATH', default=None, help='pretrained weight for LGD model') parser.add_argument('--sdf-weight', dest='sdf_weight', metavar='PATH', default=None, help='pretrained weight for SDF model') parser.add_argument('--batchsize', dest='batchsize', type=int, metavar='BATCHSIZE', default=1, help='batch size') parser.add_argument('--epoch', dest='epoch', type=int, metavar='EPOCH', default=200, help='epochs for adam and lgd') parser.add_argument('--width', dest='width', type=int, metavar='WIDTH', default=128, help='width for rendered image') parser.add_argument('--height', dest='height', type=int, metavar='HEIGHT', default=128, help='height for rendered image') parser.add_argument('--outfile', dest='outfile', metavar='OUTFILE', help='output file') args = parser.parse_args() width = args.width height = args.height epoch = args.epoch writer = SummaryWriter(args.tb_save_path) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # create models model = Siren(in_features=3, out_features=1, hidden_features=256, hidden_layers=5, outermost_linear=True).to(device) if args.sdf_weight != None: try: model.load_state_dict(torch.load(args.sdf_weight)) except: print("Couldn't load pretrained weight: " + args.sdf_weight) model.eval() for param in model.parameters(): param.requires_grad = False # load mm = torch.tensor([-0.1, -0.1, 0.1], device=device, dtype=torch.float) mx = torch.tensor([0.1, 0.1, 0.1], device=device, dtype=torch.float) wh = torch.tensor([width, height, 1], device=device, dtype=torch.int) rot = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]], device=device, dtype=torch.float) trans = torch.tensor([[0, 0, -0.8]], device=device, dtype=torch.float) p_distribution = GridDataset(mm, mx, wh) d = torch.zeros((width * height, 1), device=device, dtype=torch.float).requires_grad_(True) sampler = nn.Sequential(UniformSample(width * height), PointTransform(rot)) p = sampler(p_distribution) ds = ObjDataset(args.data) objsampler = ObjUniformSample(1000) x_preview = (objsampler(ds)['p']).to(device) d2_eval = lambda d: torch.pow(d, 2).mean() sdf_eval = lambda d: torch.pow(model(d * ray_n + p + trans)[0], 2).sum( dim=1).mean() d_eval = lambda d: (torch.tanh(d) - 1.).mean() * 0.5 d2_eval_list = lambda d: d2_eval(d[0]) sdf_eval_list = lambda d: sdf_eval(d[0]) d_eval_list = lambda d: d_eval(d[0]) writer.add_mesh("preview", torch.cat([(p + trans), x_preview]).unsqueeze(0), global_step=0) print("lgd") hidden = None lgd = LGD(1, 3, k=10).to(device) if args.lgd_weight != None: try: lgd.load_state_dict(torch.load(args.lgd_weight)) except: print("Couldn't load pretrained weight: " + args.lgd_weight) ray_n = torch.tensor([[0, 0, 1]], device=device, dtype=torch.float).repeat(width * height, 1) writer.add_mesh("raymarch_LGD", torch.cat([(d * ray_n + trans + p), x_preview]).unsqueeze(0), global_step=0) # test LGD lgd.eval() for i in range(epoch): # evaluate losses #loss = sdf_eval(x).mean() # update x [d], hidden = lgd.step(d, [d2_eval_list, sdf_eval_list, d_eval_list], hidden, width * height) d = detach_var(d) hidden = detach_var(hidden) if i % 5 == 0: writer.add_mesh("raymarch_LGD", torch.cat([(d * ray_n + trans + p), 
x_preview]).unsqueeze(0), global_step=i + 1) writer.close()
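# --- Illustrative sketch (not from the ray-marching script above) ---
# add_mesh expects vertices shaped (batch, num_points, 3); the script above concatenates the
# marched points with a preview point cloud before logging. This toy example logs a random
# point cloud so the call signature is easy to check in isolation.
import torch
from torch.utils.tensorboard import SummaryWriter

mesh_writer = SummaryWriter('logs/mesh_demo')
points = torch.rand(1, 100, 3)  # (batch, num_points, xyz)
mesh_writer.add_mesh('random_points', vertices=points, global_step=0)
mesh_writer.close()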
class DefaultTrainer: """ Implements general image classification with pruning """ def __init__(self, model: GeneralModel, loss: GeneralModel, optimizer: Optimizer, device, arguments: argparse.Namespace, train_loader: DataLoader, test_loader: DataLoader, metrics: Metrics, criterion: GeneralModel): self._test_loader = test_loader self._train_loader = train_loader self._loss_function = loss self._model = model self._arguments = arguments self._optimizer = optimizer self._device = device self._global_steps = 0 self.out = metrics.log_line DATA_MANAGER.set_date_stamp(addition=arguments.run_name) self._writer = SummaryWriter( os.path.join(DATA_MANAGER.directory, RESULTS_DIR, DATA_MANAGER.stamp, SUMMARY_DIR)) self._metrics: Metrics = metrics self._metrics.init_training(self._writer) self._acc_buffer = [] self._loss_buffer = [] self._elapsed_buffer = [] self._criterion = criterion self.ts = None batch = next(iter(self._test_loader)) self.saliency = Saliency(model, device, batch[0][:8]) self._metrics.write_arguments(arguments) self._flopcounter = FLOPCounter(model, batch[0][:8], self._arguments.batch_size, device=device) self._metrics.model_to_tensorboard(model, timestep=-1) def _batch_iteration(self, x: torch.Tensor, y: torch.Tensor, train: bool = True): """ one iteration of forward-backward """ # unpack x, y = x.to(self._device).float(), y.to(self._device) # update metrics self._metrics.update_batch(train) # record time if "cuda" in str(self._device): start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) start.record() # forward pass accuracy, loss, out = self._forward_pass(x, y, train=train) # backward pass if train: self._backward_pass(loss) # record time if "cuda" in str(self._device): end.record() torch.cuda.synchronize(self._device) time = start.elapsed_time(end) else: time = 0 # free memory for tens in [out, y, x, loss]: tens.detach() return accuracy, loss.item(), time def _forward_pass(self, x: torch.Tensor, y: torch.Tensor, train: bool = True): """ implementation of a forward pass """ if train: self._optimizer.zero_grad() if self._model.is_maskable: self._model.apply_weight_mask() out = self._model(x).squeeze() loss = self._loss_function(output=out, target=y, weight_generator=self._model.parameters(), model=self._model, criterion=self._criterion) accuracy = self._get_accuracy(out, y) return accuracy, loss, out def _backward_pass(self, loss): """ implementation of a backward pass """ loss.backward() self._model.insert_noise_for_gradient(self._arguments.grad_noise) if self._arguments.grad_clip > 0: torch.nn.utils.clip_grad_norm_(self._model.parameters(), self._arguments.grad_clip) self._optimizer.step() if self._model.is_maskable: self._model.apply_weight_mask() def _epoch_iteration(self): """ implementation of an epoch """ self.out("\n") self._acc_buffer, self._loss_buffer = self._metrics.update_epoch() for batch_num, batch in enumerate(self._train_loader): self.out(f"\rTraining... 
{batch_num}/{len(self._train_loader)}", end='') if self._model.is_tracking_weights: self._model.save_prev_weights() acc, loss, elapsed = self._batch_iteration(*batch, self._model.training) if self._model.is_tracking_weights: self._model.update_tracked_weights(self._metrics.batch_train) self._acc_buffer.append(acc) self._loss_buffer.append(loss) self._elapsed_buffer.append(elapsed) self._log(batch_num) self._check_exit_conditions_epoch_iteration() self.out("\n") def _log(self, batch_num: int): """ logs to terminal and tensorboard if the time is right""" if (batch_num % self._arguments.eval_freq) == 0: # validate on test and train set train_acc, train_loss = np.mean(self._acc_buffer), np.mean( self._loss_buffer) test_acc, test_loss, test_elapsed = self.validate() self._elapsed_buffer += test_elapsed # log metrics self._add_metrics(test_acc, test_loss, train_acc, train_loss) # reset for next log self._acc_buffer, self._loss_buffer, self._elapsed_buffer = [], [], [] # print to terminal self.out(self._metrics.printable_last) def validate(self): """ validates the model on test set """ self.out("\n") # init test mode self._model.eval() cum_acc, cum_loss, cum_elapsed = [], [], [] with torch.no_grad(): for batch_num, batch in enumerate(self._test_loader): acc, loss, elapsed = self._batch_iteration( *batch, self._model.training) cum_acc.append(acc) cum_loss.append(loss), cum_elapsed.append(elapsed) self.out( f"\rEvaluating... {batch_num}/{len(self._test_loader)}", end='') self.out("\n") # put back into train mode self._model.train() return float(np.mean(cum_acc)), float(np.mean(cum_loss)), cum_elapsed def _add_metrics(self, test_acc, test_loss, train_acc, train_loss): """ save metrics """ sparsity = self._model.pruned_percentage spasity_index = 2 * ((sparsity * test_acc) / (1e-8 + sparsity + test_acc)) flops_per_sample, total_seen = self._flopcounter.count_flops( self._metrics.batch_train) self._metrics.add(train_acc, key="acc/train") self._metrics.add(train_loss, key="loss/train") self._metrics.add(test_loss, key="loss/test") self._metrics.add(test_acc, key="acc/test") self._metrics.add(sparsity, key="sparse/weight") self._metrics.add(self._model.structural_sparsity, key="sparse/node") self._metrics.add(spasity_index, key="sparse/hm") self._metrics.add(np.log(self._model.compressed_size), key="sparse/log_disk_size") self._metrics.add(np.mean(self._elapsed_buffer), key="time/gpu_time") self._metrics.add(int(flops_per_sample), key="time/flops_per_sample") self._metrics.add(np.log10(total_seen), key="time/flops_log_cum") if torch.cuda.is_available(): self._metrics.add(torch.cuda.memory_allocated(0), key="cuda/ram_footprint") self._metrics.timeit() def train(self): """ main training function """ # setup data output directories: setup_directories() save_codebase_of_run(self._arguments) DATA_MANAGER.write_to_file( os.path.join(RESULTS_DIR, DATA_MANAGER.stamp, OUTPUT_DIR, "calling_command.txt"), str(" ".join(sys.argv))) # data gathering epoch = self._metrics._epoch self._model.train() try: self.out(f"{PRINTCOLOR_BOLD}Started training{PRINTCOLOR_END}") if self._arguments.skip_first_plot: self._metrics.handle_weight_plotting(0, trainer_ns=self) # if snip we prune before training if self._arguments.prune_criterion in SINGLE_SHOT: self._criterion.prune(self._arguments.pruning_limit, train_loader=self._train_loader, manager=DATA_MANAGER) if self._arguments.prune_criterion in STRUCTURED_SINGLE_SHOT: self._optimizer = find_right_model( OPTIMS, self._arguments.optimizer, params=self._model.parameters(), 
lr=self._arguments.learning_rate, weight_decay=self._arguments.l2_reg) self._metrics.model_to_tensorboard(self._model, timestep=epoch) # do training for epoch in range(epoch, self._arguments.epochs + epoch): self.out( f"\n\n{PRINTCOLOR_BOLD}EPOCH {epoch} {PRINTCOLOR_END} \n\n" ) # do epoch self._epoch_iteration() # plotting if (epoch % self._arguments.plot_weights_freq ) == 0 and self._arguments.plot_weights_freq > 0: self._metrics.handle_weight_plotting(epoch, trainer_ns=self) # do all related to pruning self._handle_pruning(epoch) # save what needs to be saved self._handle_backing_up(epoch) if self._arguments.skip_first_plot: self._metrics.handle_weight_plotting(epoch + 1, trainer_ns=self) # example last save save_models([self._model, self._metrics], "finished") except KeyboardInterrupt as e: self.out(f"Killed by user: {e} at {time.time()}") save_models([self._model, self._metrics], f"KILLED_at_epoch_{epoch}") sys.stdout.flush() DATA_MANAGER.write_to_file( os.path.join(RESULTS_DIR, DATA_MANAGER.stamp, OUTPUT_DIR, "log.txt"), self._metrics.log) self._writer.close() exit(69) except Exception as e: self._writer.close() report_error(e, self._model, epoch, self._metrics) # flush prints sys.stdout.flush() DATA_MANAGER.write_to_file( os.path.join(RESULTS_DIR, DATA_MANAGER.stamp, OUTPUT_DIR, "log.txt"), self._metrics.log) self._writer.close() def _handle_backing_up(self, epoch): if (epoch % self._arguments.save_freq) == 0 and epoch > 0: self.out("\nSAVING...\n") save_models([self._model, self._metrics], f"save_at_epoch_{epoch}") sys.stdout.flush() DATA_MANAGER.write_to_file( os.path.join(RESULTS_DIR, DATA_MANAGER.stamp, OUTPUT_DIR, "log.txt"), self._metrics.log) def _handle_pruning(self, epoch): if self._is_pruning_time(epoch): if self._is_not_finished_pruning(): self.out("\nPRUNING...\n") self._criterion.prune(percentage=self._arguments.pruning_rate, train_loader=self._train_loader, manager=DATA_MANAGER) if self._arguments.prune_criterion in DURING_TRAINING: self._optimizer = find_right_model( OPTIMS, self._arguments.optimizer, params=self._model.parameters(), lr=self._arguments.learning_rate, weight_decay=self._arguments.l2_reg) self._metrics.model_to_tensorboard(self._model, timestep=epoch) if self._model.is_rewindable: self.out("rewinding weights to checkpoint...\n") self._model.do_rewind() if self._model.is_growable: self.out("growing too...\n") self._criterion.grow(self._arguments.growing_rate) if self._is_checkpoint_time(epoch): self.out(f"\nCreating weights checkpoint at epoch {epoch}\n") self._model.save_rewind_weights() def _is_not_finished_pruning(self): return self._arguments.pruning_limit > self._model.pruned_percentage \ or \ ( self._arguments.prune_criterion in DURING_TRAINING and self._arguments.pruning_limit > self._model.structural_sparsity ) @staticmethod def _get_accuracy(output, y): predictions = output.argmax(dim=-1, keepdim=True).view_as(y) correct = y.eq(predictions).sum().item() return correct / output.shape[0] def _is_checkpoint_time(self, epoch: int): return epoch == self._arguments.rewind_to and self._model.is_rewindable def _is_pruning_time(self, epoch: int): if self._arguments.prune_criterion == "EmptyCrit": return False epoch -= self._arguments.prune_delay return (epoch % self._arguments.prune_freq) == 0 and \ epoch > 0 and \ self._model.is_maskable and \ self._arguments.prune_criterion not in SINGLE_SHOT def _check_exit_conditions_epoch_iteration(self, patience=1): time_passed = datetime.now() - DATA_MANAGER.actual_date # check if runtime is expired if 
(time_passed.total_seconds() > (self._arguments.max_training_minutes * 60)) \ and \ self._arguments.max_training_minutes > 0: raise KeyboardInterrupt( f"Process killed because {self._arguments.max_training_minutes} minutes passed " f"since {DATA_MANAGER.actual_date}. Time now is {datetime.now()}" ) if patience == 0: raise NotImplementedError( "feature to implement", KeyboardInterrupt("Process killed because patience is zero"))
def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [{ "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = batch[2] if args.model_type in [ "bert", "xlnet" ] else None # XLM and RoBERTa don"t use segment_ids outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() # Update learning rate schedule optimizer.step() model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, "module" ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
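# --- Illustrative sketch (not from the NER training loop above) ---
# The evaluation block above writes each metric of the results dict under an "eval_" prefix;
# the same pattern in isolation, with made-up metric values and step:
from torch.utils.tensorboard import SummaryWriter

eval_writer = SummaryWriter()
results = {"precision": 0.91, "recall": 0.88, "f1": 0.895}
global_step = 500
for key, value in results.items():
    eval_writer.add_scalar("eval_{}".format(key), value, global_step)
eval_writer.close()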
def main(): global best_prec1, args args = parse() print("opt_level = {}".format(args.opt_level)) print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32), type(args.keep_batchnorm_fp32)) print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale)) print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version())) cudnn.benchmark = True best_prec1 = 0 args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.gpu = 0 args.world_size = 1 if args.distributed: # this will be 0-3 if you have 4 GPUs on curr node args.gpu = args.local_rank torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') # this is the total # of GPUs across all nodes # if using 2 nodes with 4 GPUs each, world size is 8 args.world_size = torch.distributed.get_world_size() print("### global rank of curr node: {}".format( torch.distributed.get_rank())) assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." if args.channels_last: memory_format = torch.channels_last else: memory_format = torch.contiguous_format # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() if args.sync_bn: print("using apex synced BN") model = apex.parallel.convert_syncbn_model(model) model = model.cuda() # initialize tb logging, you don't want to "double log" # so only allow GPU0 to launch tb if torch.distributed.get_rank() == 0: writer = SummaryWriter(comment="_{}_gpux{}_b{}_cpu{}_opt{}".format( args.arch, args.world_size, args.batch_size, args.workers, args.opt_level)) # Scale init learning rate based on global batch size args.lr = args.lr * float(args.batch_size * args.world_size) / 256. optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Initialize Amp. Amp accepts either values or strings for the optional override arguments, # for convenient interoperation with argparse. model, optimizer = amp.initialize( model, optimizer, opt_level=args.opt_level, keep_batchnorm_fp32=args.keep_batchnorm_fp32, loss_scale=args.loss_scale) # For distributed training, wrap the model with apex.parallel.DistributedDataParallel. # This must be done AFTER the call to amp.initialize. If model = DDP(model) is called # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks. if args.distributed: # By default, apex.parallel.DistributedDataParallel overlaps communication with # computation in the backward pass. # model = DDP(model) # delay_allreduce delays all communication to the end of the backward pass. 
model = DDP(model, delay_allreduce=True) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() # Optionally resume from a checkpoint if args.resume: # Use a local scope to avoid dangling references def resume(): if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) resume() # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') if args.arch == "inception_v3": raise RuntimeError( "Currently, inception_v3 is not supported by this example.") else: crop_size = 224 val_size = 256 train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip(), # transforms.ToTensor(), Too slow # normalize, ])) val_dataset = datasets.ImageFolder( valdir, transforms.Compose([ transforms.Resize(val_size), transforms.CenterCrop(crop_size), ])) # makes sure that each process gets a different slice of the training data # during distributed training train_sampler = None val_sampler = None if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset) collate_fn = lambda b: fast_collate(b, memory_format) # notice we turn off shuffling and use distributed data sampler train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=collate_fn) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, collate_fn=collate_fn) if args.evaluate: validate(val_loader, model, criterion) return if torch.distributed.get_rank() == 0: start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) # train for one epoch train_throughput, train_batch_time, train_losses, train_top1, train_top5, train_lr = train( train_loader, model, criterion, optimizer, epoch) # evaluate on validation set val_throughput, val_batch_time, val_losses, val_top1, val_top5 = validate( val_loader, model, criterion) # remember best prec@1 and save checkpoint # only allow GPU0 to print training states to prevent double logging if torch.distributed.get_rank() == 0: is_best = val_top1 > best_prec1 best_prec1 = max(val_top1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, writer.log_dir) # log train and val states to tensorboard writer.add_scalar('Throughput/train', train_throughput, epoch + 1) writer.add_scalar('Throughput/val', val_throughput, epoch + 1) writer.add_scalar('Time/train', train_batch_time, epoch + 1) writer.add_scalar('Time/val', val_batch_time, epoch + 1) writer.add_scalar('Loss/train', train_losses, epoch + 1) writer.add_scalar('Loss/val', val_losses, epoch + 1) writer.add_scalar('Top1/train', 
train_top1, epoch + 1) writer.add_scalar('Top1/val', val_top1, epoch + 1) writer.add_scalar('Top5/train', train_top5, epoch + 1) writer.add_scalar('Top5/val', val_top5, epoch + 1) writer.add_scalar('Lr', train_lr, epoch + 1) if torch.distributed.get_rank() == 0: writer.close() time_elapse = time.time() - start_time mins, secs = divmod(time_elapse, 60) hrs, mins = divmod(mins, 60) print( '### Training Time: {:.2f} hrs {:.2f} mins {:.2f} secs | {:.2f} secs' .format(hrs, mins, secs, time_elapse)) print('### All Arguments:') print(args) return
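The ImageNet example above creates and closes its SummaryWriter only on global rank 0 so multi-GPU runs do not double-log. A minimal sketch of that guard, assuming the process group may or may not be initialised:

import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter

def is_main_process():
    # rank 0 (or a single-process run) owns all logging
    return not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0

writer = SummaryWriter() if is_main_process() else None

def log_scalar(tag, value, step):
    if writer is not None:
        writer.add_scalar(tag, value, step)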
def train(args, train_dataset, model, tokenizer, criterion): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate_fn, num_workers=args.num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_f1, n_no_improve = 0, 0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) labels = batch[5] inputs = { "input_ids": batch[0], "input_modal": batch[2], "attention_mask": batch[1], "modal_start_tokens": batch[3], "modal_end_tokens": batch[4], } outputs = model(**inputs) logits = outputs[0] # model outputs are always tuple in transformers (see doc) loss = criterion(logits, labels) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, criterion) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME)) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank == -1: results = evaluate(args, model, tokenizer, criterion) if results["micro_f1"] > best_f1: best_f1 = results["micro_f1"] n_no_improve = 0 else: n_no_improve += 1 if n_no_improve > args.patience: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
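The loop above stops once dev micro-F1 has not improved for args.patience evaluations. The same patience counter as a standalone sketch, with a hypothetical evaluate_f1 callable:

def train_with_early_stopping(evaluate_f1, max_epochs, patience):
    """Run epochs until micro-F1 stops improving for `patience` epochs (sketch)."""
    best_f1, n_no_improve = 0.0, 0
    for epoch in range(max_epochs):
        micro_f1 = evaluate_f1(epoch)      # e.g. results["micro_f1"] from evaluate()
        if micro_f1 > best_f1:
            best_f1, n_no_improve = micro_f1, 0
        else:
            n_no_improve += 1
        if n_no_improve > patience:
            break
    return best_f1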
        # Load image
        img_id = self.images[idx]
        image = Image.open(os.path.join(self.root_dir, img_id + ".jpg"))

        # Apply transform
        if self.transform is not None:
            image = self.transform(image)
        return {'image': image, 'label': self.image_labels[idx]}


data_path = "/home/kevin/deep_learning/OpenImages/"
eval_freq = 50
writer = SummaryWriter(log_dir=os.path.join(data_path, "models"))

transform = transforms.Compose([
    transforms.Resize((299, 299)),  # transforms.Scale is deprecated; Resize is the current API
    transforms.Grayscale(3),
    transforms.ToTensor()
])

root_dir = os.path.join(data_path, "pics")
csv_path = os.path.join(data_path, "open_image_labels_formatted.csv")
label_name_path = os.path.join(data_path, "label_names.csv")
dataset = ImageDataset(label_file=csv_path,
                       root_dir=root_dir,
                       label_name_path=label_name_path,
                       transform=transform)
dist.init_process_group(backend='nccl', init_method='env://') # distributed backend assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.total_batch_size // opt.world_size # Hyperparameters with open(opt.hyp) as f: hyp = yaml.safe_load(f) # load hyps # Train logger.info(opt) if not opt.evolve: tb_writer = None # init loggers if opt.global_rank in [-1, 0]: prefix = colorstr('tensorboard: ') logger.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/") tb_writer = SummaryWriter(opt.save_dir) # Tensorboard train(hyp, opt, device, tb_writer) # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight
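Each entry in the meta dict above is (mutation scale, lower limit, upper limit). A rough sketch, not the actual YOLOv5 evolution code, of applying bounded multiplicative mutation to a hyperparameter dict:

import numpy as np

def mutate(hyp, meta, sigma=0.2, mutation_prob=0.9):
    """Randomly perturb hyperparameters within the (scale, low, high) bounds in meta (sketch)."""
    rng = np.random.default_rng()
    new_hyp = dict(hyp)
    for key, (scale, low, high) in meta.items():
        if key in new_hyp and rng.random() < mutation_prob:
            factor = 1.0 + rng.normal(0.0, sigma) * scale   # per-key multiplicative noise
            new_hyp[key] = float(np.clip(new_hyp[key] * factor, low, high))
    return new_hyp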
# dataset = datasets.ImageFolder(root="celeb_dataset", transform=transforms)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
gen = Generator(NOISE_DIM, CHANNELS_IMG, FEATURES_GEN).to(device)
disc = Discriminator(CHANNELS_IMG, FEATURES_DISC).to(device)
initialize_weights(gen)
initialize_weights(disc)
# print(gen)   # print the generator architecture
# print(disc)

opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
opt_disc = optim.Adam(disc.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
criterion = nn.BCELoss()

fixed_noise = torch.randn(32, NOISE_DIM, 1, 1).to(device)
writer_real = SummaryWriter(f"logs/real")
writer_fake = SummaryWriter(f"logs/fake")
step = 0

gen.train()
disc.train()

G_losses = []  # kept for plotting the loss curves
D_losses = []
img_list = []

for epoch in range(NUM_EPOCHS):
    # target labels are not needed: GAN training is unsupervised
    for batch_id, (real, _) in enumerate(dataloader):
        real = real.to(device)
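A condensed sketch of the discriminator/generator step and fixed-noise image logging this setup leads into; it follows standard DCGAN training with BCE loss, and the function body is an assumption rather than the script's own loop:

import torch
import torchvision

def dcgan_step(gen, disc, opt_gen, opt_disc, criterion, real,
               noise_dim, device, writer_real, writer_fake, step):
    """One discriminator + generator update with TensorBoard image logging (sketch)."""
    noise = torch.randn(real.size(0), noise_dim, 1, 1, device=device)
    fake = gen(noise)

    # Discriminator: maximise log D(real) + log(1 - D(G(z)))
    disc_real = disc(real).reshape(-1)
    disc_fake = disc(fake.detach()).reshape(-1)
    loss_disc = (criterion(disc_real, torch.ones_like(disc_real)) +
                 criterion(disc_fake, torch.zeros_like(disc_fake))) / 2
    disc.zero_grad()
    loss_disc.backward()
    opt_disc.step()

    # Generator: maximise log D(G(z))
    output = disc(fake).reshape(-1)
    loss_gen = criterion(output, torch.ones_like(output))
    gen.zero_grad()
    loss_gen.backward()
    opt_gen.step()

    # Log grids of real and generated samples under separate writers
    with torch.no_grad():
        writer_real.add_image("Real", torchvision.utils.make_grid(real[:32], normalize=True), step)
        writer_fake.add_image("Fake", torchvision.utils.make_grid(fake[:32], normalize=True), step)
    return loss_disc.item(), loss_gen.item()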
class TensorboardWriter(): def __init__(self, args): name_model = args.model + "_" + args.dataset_name + "_" + datestr() self.writer = SummaryWriter(log_dir=args.tb_log_dir, comment=name_model) # self.step = 0 # self.mode = '' self.csv_train, self.csv_val = self.create_stats_files(args.save) self.dataset_name = args.dataset_name self.label_names = dict_class_names[args.dataset_name] self.data = { "train": dict((label, 0.0) for label in self.label_names), "val": dict((label, 0.0) for label in self.label_names) } self.data['train']['loss'] = 0.0 self.data['val']['loss'] = 0.0 self.data['train']['count'] = 1 self.data['val']['count'] = 1 self.data['train']['dsc'] = 0.0 self.data['val']['dsc'] = 0.0 # self.tb_writer_ftns = { # 'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio', # 'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding' # } # # self.timer = datetime.now() def display_terminal(self, iter, epoch, mode='train', summary=False): """ :param iter: iteration or partial epoch :param epoch: epoch of training :param loss: any loss numpy :param mode: train or val ( for training and validation) :param summary: to print total statistics at the end of epoch """ if summary: info_print = "\n Epoch {} : {} summary Loss : {}".format( epoch, mode, self.data[mode]['loss'] / self.data[mode]['count']) for i in range(len(self.label_names)): info_print += " {} : {}".format( self.label_names[i], self.data[mode][self.label_names[i]] / self.data[mode]['count']) print(info_print) else: info_print = "partial epoch: {} Loss:{}".format( iter, self.data[mode]['loss'] / self.data[mode]['count']) for i in range(len(self.label_names)): info_print += " {} : {}".format( self.label_names[i], self.data[mode][self.label_names[i]] / self.data[mode]['count']) print(info_print) def create_stats_files(self, path): train_f = open(os.path.join(path, 'train.csv'), 'w') val_f = open(os.path.join(path, 'val.csv'), 'w') return train_f, val_f def reset(self, mode): self.data[mode]['dsc'] = 0.0 self.data[mode]['loss'] = 0.0 self.data[mode]['count'] = 1 for i in range(len(self.label_names)): self.data[mode][self.label_names[i]] = 0.0 def update_scores(self, iter, loss, channel_score, mode, writer_step): """ :param iter: iteration or partial epoch :param loss: any loss torch.tensor.item() :param channel_score: per channel score or dice coef :param mode: train or val ( for training and validation) :param writer_step: tensorboard writer step """ ## WARNING ASSUMING THAT CHANNELS IN SAME ORDER AS DICTIONARY ########### dice_coeff = np.mean(channel_score) * 100 num_channels = len(channel_score) self.data[mode]['dsc'] += dice_coeff self.data[mode]['loss'] += loss self.data[mode]['count'] = iter for i in range(num_channels): self.data[mode][self.label_names[i]] += channel_score[i] if self.writer != None: self.writer.add_scalar(mode + '/' + self.label_names[i], channel_score[i], global_step=writer_step) def _write_end_of_epoch(self, epoch): self.writer.add_scalars( 'DSC/', { 'train': self.data['train']['dsc'] / self.data['train']['count'], 'val': self.data['val']['dsc'] / self.data['val']['count'], }, epoch) self.writer.add_scalars( 'Loss/', { 'train': self.data['train']['loss'] / self.data['train']['count'], 'val': self.data['val']['loss'] / self.data['val']['count'], }, epoch) for i in range(len(self.label_names)): self.writer.add_scalars( self.label_names[i], { 'train': self.data['train'][self.label_names[i]] / self.data['train']['count'], 'val': self.data['val'][self.label_names[i]] / self.data['train']['count'], 
}, epoch)
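A compact sketch of the end-of-epoch per-label logging the class above performs, with each split normalised by its own running count:

def write_end_of_epoch(writer, data, label_names, epoch):
    """Log train/val averages per label at the end of an epoch (sketch)."""
    train_count = max(data["train"]["count"], 1)
    val_count = max(data["val"]["count"], 1)
    writer.add_scalars("Loss/", {
        "train": data["train"]["loss"] / train_count,
        "val": data["val"]["loss"] / val_count,
    }, epoch)
    for label in label_names:
        writer.add_scalars(label, {
            "train": data["train"][label] / train_count,
            "val": data["val"][label] / val_count,   # val normalised by the val count
        }, epoch)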
def train(self, summary=False): # build model netG, netD = self.build_model() # just summary model if summary: self.model_summary(netG, netD) return # define optimizerss optimizerG, optimizerD = self.define_optimizers(netG, netD) scheduler_G = ExponentialLR(optimizerG, gamma=0.95) # create log directory self.set_logdir() # tensorboard writer = SummaryWriter() total_d_loss = 0. step = 1 for epoch in range(self.epochs): # train netD.train() netG.train() for i, data in enumerate(self.dataloader): fake, mat_real, mis_real = data[0], data[1], data[2] true_voxel, fake_embedding = fake[0].to( self.device, dtype=torch.float), fake[1].to(self.device, dtype=torch.float) mat_voxel, mat_embedding = mat_real[0].to( self.device, dtype=torch.float), mat_real[1].to(self.device, dtype=torch.float) mis_voxel, mis_embedding = mis_real[0].to( self.device, dtype=torch.float), mis_real[1].to(self.device, dtype=torch.float) # train Discriminator netD.zero_grad() # get G output noise = torch.from_numpy( np.random.uniform( low=-self.noise_unif_abs_max, high=self.noise_unif_abs_max, size=[mat_embedding.size(0), self.noise_size])).to(self.device, dtype=torch.float) fake_voxel = netG(noise, fake_embedding) _, fake_logits = netD(fake_voxel.detach(), fake_embedding) _, mat_real_logits = netD(mat_voxel, mat_embedding) _, mis_real_logits = netD(mis_voxel, mis_embedding) d_loss = losses.wasserstein_loss( fake_logits, 'dis_fake') + 2.0 * losses.wasserstein_loss( mat_real_logits, 'dis_real') + losses.wasserstein_loss( mis_real_logits, 'dis_fake') d_gp = losses.gradient_penalty(fake_voxel, mat_voxel, fake_embedding, mat_embedding, netD, self.device) d_loss += d_gp total_d_loss = total_d_loss + d_loss.item() d_loss.backward() optimizerD.step() # train G if step % self.dstep == 0: # print('g time', step) netG.zero_grad() _, fake_logits = netD(fake_voxel, fake_embedding) g_loss = losses.wasserstein_loss(fake_logits, 'gen') g_loss.backward() optimizerG.step() # tensorboard writer.add_scalar('d_loss/train', d_loss.item(), step // self.dstep) writer.add_scalar('d_gp/train', d_gp.item(), step // self.dstep) writer.add_scalar('g_loss/train', g_loss.item(), step // self.dstep) if step % (self.decay_step * self.dstep) == 0: print('g_lr is decay!') scheduler_G.step() if step % (self.print_step * self.dstep) == 0: print('global step:', step) print(f'train--->step:{step//self.dstep}') print(f'd_loss:{d_loss.item()}') print(f'd_gp:{d_gp.item()}') print(f'g_loss:{g_loss.item()}') # checkpoint if (step % self.cpkt_step * self.dstep) == 0: self.save_cpkt(epoch, step, optimizerG, optimizerD, netG, netD, d_loss, g_loss) # generated shape save if step % (self.save_voxel_step * self.dstep) == 0: # print(fake_voxel.size()) self.save_voxel(fake_voxel, true_voxel, step // self.dstep) step = step + 1
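losses.gradient_penalty(...) above is added to a Wasserstein objective; a rough sketch of what such a penalty typically computes for a conditional discriminator like netD (interpolating between real and fake voxels, with the condition held fixed):

import torch

def gradient_penalty(fake_voxel, real_voxel, fake_emb, real_emb, netD, device, weight=10.0):
    """WGAN-GP style penalty: gradient norm at interpolated samples pushed towards 1 (sketch)."""
    batch = real_voxel.size(0)
    alpha = torch.rand(batch, *([1] * (real_voxel.dim() - 1)), device=device)
    interp_voxel = (alpha * real_voxel + (1 - alpha) * fake_voxel).requires_grad_(True)
    interp_emb = real_emb  # condition kept fixed; interpolating embeddings is also common

    _, logits = netD(interp_voxel, interp_emb)
    grads = torch.autograd.grad(outputs=logits.sum(), inputs=interp_voxel,
                                create_graph=True, retain_graph=True)[0]
    grad_norm = grads.view(batch, -1).norm(2, dim=1)
    return weight * ((grad_norm - 1) ** 2).mean()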
opt.global_rank = dist.get_rank() assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.total_batch_size // opt.world_size print(opt) with open(opt.hyp) as f: hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps # Train if not opt.evolve: tb_writer = None if opt.global_rank in [-1, 0]: print( 'Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir) tb_writer = SummaryWriter(log_dir=increment_dir( Path(opt.logdir) / 'exp', opt.name)) # runs/exp train(hyp, opt, device, tb_writer, wandb) # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = { 'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'momentum': (0.1, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'giou': (1, 0.02, 0.2), # GIoU loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels)
        x = self.classifier(x)
        if self.training:
            return [x, out1, out2]
        else:
            return x


if __name__ == '__main__':
    # Temporarily define data and target
    batch_size = 5
    x = torch.randn((batch_size, 3, 224, 224))
    y = torch.randint(0, 1000, (batch_size, ))
    num_classes = 1000

    # Add the model graph to tensorboard
    writer = SummaryWriter(log_dir='logs/googlenet')
    m = MyGoogleNet()
    # print(m)
    # In training mode we have x, o1, o2 = m(x):
    # m(x)[0] is x; m(x)[1] is o1; m(x)[2] is o2
    # o1 and o2 are the outputs of the auxiliary classifiers
    print(m(x)[0].shape)
    m.eval()
    print(m.training)
    writer.add_graph(m, x)
    writer.close()

    # Note: when you train this network, feed these loss values
    # into the training step of your model
    m.train()
    loss = nn.CrossEntropyLoss()
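The trailing comment above says to move the loss into the training step; a short sketch of how the main and auxiliary outputs of a GoogLeNet-style model are usually combined (the 0.3 weights follow the original paper, not this script):

import torch.nn as nn

criterion = nn.CrossEntropyLoss()

def googlenet_loss(model, x, y):
    """Combine main and auxiliary classifier losses in training mode (sketch)."""
    main_out, aux1, aux2 = model(x)          # model must be in .train() mode
    loss = (criterion(main_out, y)
            + 0.3 * criterion(aux1, y)
            + 0.3 * criterion(aux2, y))
    return loss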
def train(self, epochs=100, learningRate=0.005, dataset="Coco", useDatabase=True, printUpdateEvery=40, visualize=False, tensorboard=False): self._training = True self._initTraining(learningRate, dataset, useDatabase) # Deal with tensorboard if tensorboard or type(tensorboard) == str: from torch.utils.tensorboard import SummaryWriter if type(tensorboard) == str: writer = SummaryWriter("./data/tensorboard/" + tensorboard) else: writer = SummaryWriter("./data/tensorboard/") tensorboard = True def findBestROI(ROIs, label): bestMatch = 0 bestIndex = -1 for i, ROI in enumerate(ROIs): lbox = np.array(label["bbox"]) larea = lbox[2:] - lbox[:2] larea = larea[0] * larea[1] rbox = ROI.bounds rarea = rbox[2:] - rbox[:2] rarea = rarea[0] * rarea[1] SI = np.maximum(0, np.minimum(lbox[2], rbox[2]) - np.maximum(lbox[0], rbox[0])) * \ np.maximum(0, np.minimum(lbox[3], rbox[3]) - np.maximum(lbox[1], rbox[1])) SU = larea + rarea - SI overlap = SI / SU if bestMatch < overlap and SU != 0: bestMatch = overlap bestIndex = i return bestIndex Iterations = len(self.dataset) print("Starting training") for epoch in range(epochs): epochLoss = np.float64(0) for i in range(Iterations): ROIs, peopleTextures, labels = self._load(i) # Figure out what ROI belongs to what label groundtruth = np.zeros((len(ROIs), 14), dtype=np.float32) for label in labels: mostMatching = findBestROI(ROIs, label) if mostMatching != -1: groundtruth[mostMatching][label["category_id"]] = 1 # Most items in this dataset will be bypassed because no people were found or overlapping with gt if len(ROIs) == 0 or not np.any(groundtruth != 0): continue groundtruth = torch.from_numpy(groundtruth).to(device) # Apply noise to peopleTextures noise = np.random.randn(*peopleTextures.shape) * 5 b = peopleTextures.astype(np.int32) peopleTextures = peopleTextures.astype( np.int32) + noise.astype(np.int32) peopleTextures = np.clip(peopleTextures, 0, 255) peopleTextures = peopleTextures.astype(np.uint8) peopleTextures = torch.Tensor(peopleTextures).to(device) predictions = self.classifier.forward(peopleTextures) print(groundtruth) print(predictions) print("\n") lossSize = self.lossFunction(predictions, groundtruth) lossSize.backward() self.optimizer.step() self.optimizer.zero_grad() lossSize = lossSize.cpu().item() epochLoss += lossSize / Iterations if (i - 1) % printUpdateEvery == 0: print("Iteration {} / {}, epoch {} / {}".format( i, Iterations, epoch, epochs)) print("Loss size: {}\n".format(lossSize / printUpdateEvery)) if tensorboard: absI = i + epoch * Iterations writer.add_scalar("Loss size", lossSize, absI) # Show visualization if visualize: pass # TODO """ image = self.renderDebug(image) plt.ion() plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) plt.draw() plt.pause(4) """ print("Finished epoch {} / {}. Loss size:".format( epoch, epochs, epochLoss)) self.saveModel(self.modelPath) self._training = False
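findBestROI above matches each label to the region with the highest intersection-over-union; the same overlap computation as a standalone helper for [x1, y1, x2, y2] boxes:

import numpy as np

def iou(box_a, box_b):
    """Intersection-over-union of two [x1, y1, x2, y2] boxes (sketch)."""
    box_a, box_b = np.asarray(box_a, float), np.asarray(box_b, float)
    inter_w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0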
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) # Added here for reproductibility set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in [ "xlm", "roberta", "distilbert", "camembert" ]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr( model.config, "lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device) }) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): 
tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
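The SQuAD loop above resumes by parsing the global step out of a checkpoint directory name such as checkpoint-500; a compact sketch of that bookkeeping (helper name is hypothetical):

def resume_position(checkpoint_dir, num_batches, accumulation_steps):
    """Recover epoch/step offsets from a 'checkpoint-<global_step>' path (sketch)."""
    global_step = int(checkpoint_dir.rstrip("/").split("-")[-1])
    updates_per_epoch = num_batches // accumulation_steps
    epochs_trained = global_step // updates_per_epoch
    steps_trained_in_current_epoch = global_step % updates_per_epoch
    return global_step, epochs_trained, steps_trained_in_current_epoch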
def main(): # Load dataset print('Loading dataset ...\n') dataset_train = Dataset(train=True) dataset_val = Dataset(train=False) loader_train = DataLoader(dataset=dataset_train, num_workers=0, batch_size=opt.batchSize, shuffle=True) print("# of training samples: %d\n" % int(len(dataset_train))) # Build model # net = DnCNN(channels=1, num_of_layers=opt.num_of_layers) net = NoiseNetwork(4, 4, True) # net.apply(weights_init_kaiming) criterion = nn.MSELoss(size_average=False) # Move to GPU device_ids = [0] model = nn.DataParallel(net, device_ids=device_ids).cuda() criterion.cuda() # Optimizer optimizer = optim.AdamW(model.parameters(), lr=opt.lr) # training writer = SummaryWriter(opt.outf) step = 0 noiseL_B = [0, 55] # ingnored when opt.mode=='S' ep = [] ps = [] for epoch in range(opt.epochs): if epoch < opt.milestone: current_lr = opt.lr else: current_lr = opt.lr / 10. # set learning rate for param_group in optimizer.param_groups: param_group["lr"] = current_lr print('learning rate %f' % current_lr) # train for i, data in enumerate(loader_train, 0): # training step model.train() model.zero_grad() optimizer.zero_grad() img_train = data if opt.mode == 'S': noise = torch.FloatTensor(img_train.size()).normal_( mean=0, std=opt.noiseL / 255.) if opt.mode == 'B': noise = torch.zeros(img_train.size()) stdN = np.random.uniform(noiseL_B[0], noiseL_B[1], size=noise.size()[0]) for n in range(noise.size()[0]): sizeN = noise[0, :, :, :].size() noise[n, :, :, :] = torch.FloatTensor(sizeN).normal_( mean=0, std=stdN[n] / 255.) imgn_train = img_train + noise # ================dwt================ imgn_train = tensor_dwt(imgn_train) img_train = tensor_dwt(img_train) noise = tensor_dwt(noise) # ==================================== img_train, imgn_train = Variable(img_train.cuda()), Variable( imgn_train.cuda()) noise = Variable(noise.cuda()) # out_train = torch.clamp(model(imgn_train), 0., 1.) out_train = model(imgn_train) # out_train = torch.sigmoid(model(imgn_train)) loss = criterion(out_train, imgn_train) / (imgn_train.size()[0]**2) loss.backward() #torch.nn.utils.clip_grad_value_(model.parameters(), 0.5) torch.nn.utils.clip_grad_norm_(model.parameters(), 1, norm_type=2) optimizer.step() # results model.eval() out_train = model(imgn_train) out_train = tensor_idwt(out_train) out_train = torch.clamp(out_train, 0., 1.) img_train = tensor_idwt(img_train) # out_train = torch.sigmoid(model(imgn_train)) psnr_train = batch_PSNR(out_train, img_train, 1.) print( "[epoch %d][%d/%d] loss: %.4f PSNR_train: %.4f" % (epoch + 1, i + 1, len(loader_train), loss.item(), psnr_train)) # if you are using older version of PyTorch, you may need to change loss.item() to loss.data[0] if step % 10 == 0: # Log the scalar values writer.add_scalar('loss', loss.item(), step) writer.add_scalar('PSNR on training data', psnr_train, step) step += 1 ## the end of each epoch model.eval() # validate psnr_val = 0 with torch.no_grad(): for k in range(len(dataset_val)): img_val = torch.unsqueeze(dataset_val[k], 0) noise = torch.FloatTensor(img_val.size()).normal_( mean=0, std=opt.val_noiseL / 255.) imgn_val = img_val + noise #========================dwt======================= imgn_val = tensor_dwt(imgn_val) img_val = tensor_dwt(img_val) noise = tensor_dwt(noise) # ========================dwt======================= img_val, imgn_val = Variable(img_val.cuda()), Variable( imgn_val.cuda()) out_val = model(imgn_val) out_val = tensor_idwt(out_val) out_val = torch.clamp(out_val, 0., 1.) 
img_val = tensor_idwt(img_val) # out_val = torch.sigmoid(model(imgn_val)) psnr_val += batch_PSNR(out_val, img_val, 1.) psnr_val /= len(dataset_val) print("\n[epoch %d] PSNR_val: %.4f" % (epoch + 1, psnr_val)) ep.append(epoch + 1) ps.append(psnr_val) writer.add_scalar('PSNR on validation data', psnr_val, epoch) # log the images out_train = torch.clamp(model(imgn_train), 0., 1.) # out_train = torch.sigmoid(model(imgn_train)) Img = utils.make_grid(img_train.data, nrow=8, normalize=True, scale_each=True) Imgn = utils.make_grid(imgn_train.data, nrow=8, normalize=True, scale_each=True) Irecon = utils.make_grid(out_train.data, nrow=8, normalize=True, scale_each=True) writer.add_image('clean image', Img, epoch) writer.add_image('noisy image', Imgn, epoch) writer.add_image('reconstructed image', Irecon, epoch) # save model name = "net_epoch%d_PSNR%.4f.pth" % (epoch + 1, psnr_val) torch.save(model.state_dict(), os.path.join(opt.outf, name)) # save chart plt.plot(ep, ps) plt.gca().xaxis.set_major_locator(mticker.MultipleLocator(2)) plt.xlabel('epoch') plt.ylabel('PSNR') plt.title("PSNR values during training") plt.savefig('./psnr_val.jpg') plt.show()
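batch_PSNR above compares the reconstruction against the clean batch for a given data range; the exact helper is not shown, so this is only a minimal per-batch PSNR sketch:

import torch

def batch_psnr(pred, target, data_range=1.0):
    """Mean PSNR over a batch of image tensors (sketch)."""
    pred = torch.clamp(pred, 0.0, data_range)
    mse = ((pred - target) ** 2).flatten(1).mean(dim=1)       # per-image MSE
    psnr = 10.0 * torch.log10(data_range ** 2 / (mse + 1e-12))
    return psnr.mean().item()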
(3 - len(opt.img_size))) # extend to 3 sizes (min, max, test) device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size) if device.type == 'cpu': mixed_precision = False # scale hyp['obj'] by img_size (evolved at 320) # hyp['obj'] *= opt.img_size[0] / 320. tb_writer = None if not opt.evolve: # Train normally print( 'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/' ) tb_writer = SummaryWriter(comment=opt.name) train(hyp) # train normally else: # Evolve hyperparameters (optional) opt.notest, opt.nosave = True, True # only test/save final epoch if opt.bucket: os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists for _ in range(1): # generations to evolve if os.path.exists( 'evolve.txt' ): # if evolve.txt exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt('evolve.txt', ndmin=2)
def train( self, base_path: Union[Path, str], learning_rate: float = 0.1, mini_batch_size: int = 32, mini_batch_chunk_size: int = None, max_epochs: int = 100, scheduler=AnnealOnPlateau, anneal_factor: float = 0.5, patience: int = 3, initial_extra_patience=0, min_learning_rate: float = 0.0001, train_with_dev: bool = False, monitor_train: bool = False, monitor_test: bool = False, embeddings_storage_mode: str = "cpu", checkpoint: bool = False, save_final_model: bool = True, anneal_with_restarts: bool = False, anneal_with_prestarts: bool = False, batch_growth_annealing: bool = False, shuffle: bool = True, param_selection_mode: bool = False, num_workers: int = 6, sampler=None, use_amp: bool = False, amp_opt_level: str = "O1", eval_on_train_fraction=0.0, eval_on_train_shuffle=False, **kwargs, ) -> dict: """ Trains any class that implements the flair.nn.Model interface. :param base_path: Main path to which all output during training is logged and models are saved :param learning_rate: Initial learning rate :param mini_batch_size: Size of mini-batches during training :param mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. :param anneal_factor: The factor by which the learning rate is annealed :param patience: Patience is the number of epochs with no improvement the Trainer waits until annealing the learning rate :param min_learning_rate: If the learning rate falls below this threshold, training terminates :param train_with_dev: If True, training is performed using both train+dev data :param monitor_train: If True, training data is evaluated at end of each epoch :param monitor_test: If True, test data is evaluated at end of each epoch :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) :param checkpoint: If True, a full checkpoint is saved at end of each epoch :param save_final_model: If True, final model is saved :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate :param shuffle: If True, data is shuffled during training :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing parameter selection. :param num_workers: Number of workers in your data loader. :param sampler: You can pass a data sampler here for special sampling of data. :param eval_on_train_fraction: the fraction of train data to do the evaluation on, if 0. the evaluation is not performed on fraction of training data, if 'dev' the size is determined from dev set size :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training and kept fixed during training, otherwise it's sampled at beginning of each epoch :param kwargs: Other arguments for the Optimizer :return: """ if self.use_tensorboard: try: from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() except: log_line(log) log.warning( "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!" ) log_line(log) self.use_tensorboard = False pass if use_amp: if sys.version_info < (3, 0): raise RuntimeError( "Apex currently only supports Python 3. Aborting.") if amp is None: raise RuntimeError( "Failed to import apex. 
Please install apex from https://www.github.com/nvidia/apex " "to enable mixed-precision training.") if mini_batch_chunk_size is None: mini_batch_chunk_size = mini_batch_size if learning_rate < min_learning_rate: min_learning_rate = learning_rate / 10 initial_learning_rate = learning_rate # cast string to Path if type(base_path) is str: base_path = Path(base_path) log_handler = add_file_handler(log, base_path / "training.log") log_line(log) log.info(f'Model: "{self.model}"') log_line(log) log.info(f'Corpus: "{self.corpus}"') log_line(log) log.info("Parameters:") log.info(f' - learning_rate: "{learning_rate}"') log.info(f' - mini_batch_size: "{mini_batch_size}"') log.info(f' - patience: "{patience}"') log.info(f' - anneal_factor: "{anneal_factor}"') log.info(f' - max_epochs: "{max_epochs}"') log.info(f' - shuffle: "{shuffle}"') log.info(f' - train_with_dev: "{train_with_dev}"') log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"') log_line(log) log.info(f'Model training base path: "{base_path}"') log_line(log) log.info(f"Device: {flair.device}") log_line(log) log.info(f"Embeddings storage mode: {embeddings_storage_mode}") if isinstance(self.model, SequenceTagger ) and self.model.weight_dict and self.model.use_crf: log_line(log) log.warning( f'WARNING: Specified class weights will not take effect when using CRF' ) # determine what splits (train, dev, test) to evaluate and log log_train = True if monitor_train else False log_test = (True if (not param_selection_mode and self.corpus.test and monitor_test) else False) log_dev = False log_train_part = (True if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0) else False) if log_train_part: train_part_size = (len( self.corpus.dev) if eval_on_train_fraction == "dev" else int( len(self.corpus.train) * eval_on_train_fraction)) assert train_part_size > 0 if not eval_on_train_shuffle: train_part_indices = list(range(train_part_size)) train_part = torch.utils.data.dataset.Subset( self.corpus.train, train_part_indices) # prepare loss logging file and set up header loss_txt = init_output_file(base_path, "loss.tsv") weight_extractor = WeightExtractor(base_path) optimizer: torch.optim.Optimizer = self.optimizer( self.model.parameters(), lr=learning_rate, **kwargs) if use_amp: self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=amp_opt_level) # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" lr_scheduler = scheduler( optimizer, factor=anneal_factor, patience=patience, initial_extra_patience=initial_extra_patience, mode=anneal_mode, verbose=True, ) train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data = ConcatDataset([self.corpus.train, self.corpus.dev]) # initialize sampler if provided if sampler is not None: # init with default values if only class is provided if inspect.isclass(sampler): sampler = sampler() # set dataset to sample from sampler.set_dataset(train_data) shuffle = False dev_score_history = [] dev_loss_history = [] train_loss_history = [] micro_batch_size = mini_batch_chunk_size # At any point you can hit Ctrl + C to break out of training early. 
try: previous_learning_rate = learning_rate for self.epoch in range(self.epoch + 1, max_epochs + 1): log_line(log) if anneal_with_prestarts: last_epoch_model_state_dict = copy.deepcopy( self.model.state_dict()) if eval_on_train_shuffle: train_part_indices = list(range(self.corpus.train)) random.shuffle(train_part_indices) train_part_indices = train_part_indices[:train_part_size] train_part = torch.utils.data.dataset.Subset( self.corpus.train, train_part_indices) # get new learning rate for group in optimizer.param_groups: learning_rate = group["lr"] if learning_rate != previous_learning_rate and batch_growth_annealing: mini_batch_size *= 2 # reload last best model if annealing with restarts is enabled if ((anneal_with_restarts or anneal_with_prestarts) and learning_rate != previous_learning_rate and (base_path / "best-model.pt").exists()): if anneal_with_restarts: log.info("resetting to best model") self.model.load_state_dict( self.model.load(base_path / "best-model.pt").state_dict()) if anneal_with_prestarts: log.info("resetting to pre-best model") self.model.load_state_dict( self.model.load(base_path / "pre-best-model.pt").state_dict()) previous_learning_rate = learning_rate # stop training if learning rate becomes too small if learning_rate < min_learning_rate: log_line(log) log.info("learning rate too small - quitting training!") log_line(log) break batch_loader = DataLoader( train_data, batch_size=mini_batch_size, shuffle=shuffle, num_workers=num_workers, sampler=sampler, ) self.model.train() train_loss: float = 0 seen_batches = 0 total_number_of_batches = len(batch_loader) modulo = max(1, int(total_number_of_batches / 10)) # process mini-batches batch_time = 0 for batch_no, batch in enumerate(batch_loader): start_time = time.time() # zero the gradients on the model and optimizer self.model.zero_grad() optimizer.zero_grad() # if necessary, make batch_steps batch_steps = [batch] if len(batch) > micro_batch_size: batch_steps = [ batch[x:x + micro_batch_size] for x in range(0, len(batch), micro_batch_size) ] # forward and backward for batch for batch_step in batch_steps: # forward pass loss = self.model.forward_loss(batch_step) # Backward if use_amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # do the optimizer step torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() seen_batches += 1 train_loss += loss.item() # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(batch, embeddings_storage_mode) batch_time += time.time() - start_time if seen_batches % modulo == 0: log.info( f"epoch {self.epoch} - iter {seen_batches}/{total_number_of_batches} - loss " f"{train_loss / seen_batches:.8f} - samples/sec: {mini_batch_size * modulo / batch_time:.2f}" ) batch_time = 0 iteration = self.epoch * total_number_of_batches + batch_no if not param_selection_mode: weight_extractor.extract_weights( self.model.state_dict(), iteration) train_loss /= seen_batches self.model.eval() log_line(log) log.info( f"EPOCH {self.epoch} done: loss {train_loss:.4f} - lr {learning_rate:.7f}" ) if self.use_tensorboard: writer.add_scalar("train_loss", train_loss, self.epoch) # anneal against train loss if training with dev, otherwise anneal against dev score current_score = train_loss # evaluate on train / dev / test split depending on training settings result_line: str = "" if log_train: train_eval_result, train_loss = self.model.evaluate( self.corpus.train, mini_batch_size=mini_batch_chunk_size, 
num_workers=num_workers, embedding_storage_mode=embeddings_storage_mode, ) result_line += f"\t{train_eval_result.log_line}" # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.train, embeddings_storage_mode) if log_train_part: train_part_eval_result, train_part_loss = self.model.evaluate( train_part, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, embedding_storage_mode=embeddings_storage_mode, ) result_line += ( f"\t{train_part_loss}\t{train_part_eval_result.log_line}" ) log.info( f"TRAIN_SPLIT : loss {train_part_loss} - score {round(train_part_eval_result.main_score, 4)}" ) if log_dev: dev_eval_result, dev_loss = self.model.evaluate( self.corpus.dev, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, embedding_storage_mode=embeddings_storage_mode, ) result_line += f"\t{dev_loss}\t{dev_eval_result.log_line}" log.info( f"DEV : loss {dev_loss} - score {round(dev_eval_result.main_score, 4)}" ) # calculate scores using dev data if available # append dev score to score history dev_score_history.append(dev_eval_result.main_score) dev_loss_history.append(dev_loss.item()) current_score = dev_eval_result.main_score # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.dev, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("dev_loss", dev_loss, self.epoch) writer.add_scalar("dev_score", dev_eval_result.main_score, self.epoch) if log_test: test_eval_result, test_loss = self.model.evaluate( self.corpus.test, mini_batch_size=mini_batch_chunk_size, num_workers=num_workers, out_path=base_path / "test.tsv", embedding_storage_mode=embeddings_storage_mode, ) result_line += f"\t{test_loss}\t{test_eval_result.log_line}" log.info( f"TEST : loss {test_loss} - score {round(test_eval_result.main_score, 4)}" ) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.test, embeddings_storage_mode) if self.use_tensorboard: writer.add_scalar("test_loss", test_loss, self.epoch) writer.add_scalar("test_score", test_eval_result.main_score, self.epoch) # determine learning rate annealing through scheduler. Use auxiliary metric for AnnealOnPlateau #if not train_with_dev and isinstance(lr_scheduler, AnnealOnPlateau): if False: lr_scheduler.step(current_score, dev_loss) else: lr_scheduler.step(current_score) train_loss_history.append(train_loss) # determine bad epoch number try: bad_epochs = lr_scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: new_learning_rate = group["lr"] if new_learning_rate != previous_learning_rate: bad_epochs = patience + 1 if previous_learning_rate == initial_learning_rate: bad_epochs += initial_extra_patience # log bad epochs log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") # output log file with open(loss_txt, "a") as f: # make headers on first epoch if self.epoch == 1: f.write( f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS" ) if log_train: f.write("\tTRAIN_" + "\tTRAIN_".join( train_eval_result.log_header.split("\t"))) if log_train_part: f.write("\tTRAIN_PART_LOSS\tTRAIN_PART_" + "\tTRAIN_PART_".join( train_part_eval_result.log_header. 
split("\t"))) if log_dev: f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join( dev_eval_result.log_header.split("\t"))) if log_test: f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join( test_eval_result.log_header.split("\t"))) f.write( f"\n{self.epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}" ) f.write(result_line) # if checkpoint is enabled, save model at each epoch if checkpoint and not param_selection_mode: self.save_checkpoint(base_path / "checkpoint.pt") # if we use dev data, remember best model based on dev evaluation score if ((not train_with_dev or anneal_with_restarts or anneal_with_prestarts) and not param_selection_mode and current_score == lr_scheduler.best and bad_epochs == 0): print("saving best model") self.model.save(base_path / "best-model.pt") if anneal_with_prestarts: current_state_dict = self.model.state_dict() self.model.load_state_dict(last_epoch_model_state_dict) self.model.save(base_path / "pre-best-model.pt") self.model.load_state_dict(current_state_dict) # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt") except KeyboardInterrupt: log_line(log) log.info("Exiting from training early.") if self.use_tensorboard: writer.close() if not param_selection_mode: log.info("Saving model ...") self.model.save(base_path / "final-model.pt") log.info("Done.") # test best model if test data is present #if self.corpus.test: if True: final_score = self.final_test(base_path, mini_batch_chunk_size, num_workers) else: final_score = 0 from flair.data import Sentence sentence: Sentence = Sentence( "George Washington went to Washington .") self.model.predict(sentence) print("Analysing %s" % sentence) print("\nThe following NER tags are found: \n") print(sentence) print(sentence.to_tagged_string()) log.info("Test data not provided setting final score to 0") log.removeHandler(log_handler) if self.use_tensorboard: writer.close() return { "test_score": final_score, "dev_score_history": dev_score_history, "train_loss_history": train_loss_history, "dev_loss_history": dev_loss_history, }
class Agent(): def __init__(self, state_size, action_size, action_dim, config): self.state_size = state_size self.action_size = action_size self.action_dim = action_dim self.seed = 0 self.device = 'cuda' self.batch_size = config["batch_size"] self.lr = 0.005 self.gamma = 0.99 self.q_shift_local = QNetwork(state_size, action_size, self.seed).to(self.device) self.q_shift_target = QNetwork(state_size, action_size, self.seed).to(self.device) self.Q_local = QNetwork(state_size, action_size, self.seed).to(self.device) self.Q_target = QNetwork(state_size, action_size, self.seed).to(self.device) self.R_local = RNetwork(state_size, action_size, self.seed).to(self.device) self.R_target = RNetwork(state_size, action_size, self.seed).to(self.device) self.policy = PolicyNetwork(state_size, action_size, self.seed).to(self.device) self.predicter = Classifier(state_size, action_dim, self.seed).to(self.device) #self.criterion = nn.CrossEntropyLoss() # optimizer self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr) self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr) self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr) self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr) self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr) pathname = "lr {} batch_size {} seed {}".format( self.lr, self.batch_size, self.seed) tensorboard_name = str(config["locexp"]) + '/runs/' + pathname self.writer = SummaryWriter(tensorboard_name) self.steps = 0 self.ratio = 1. / action_dim self.all_actions = [] for a in range(self.action_dim): action = torch.Tensor(1) * 0 + a self.all_actions.append(action.to(self.device)) def act(self, state): dis, action, log_probs, ent = self.policy.sample_action( torch.Tensor(state).unsqueeze(0)) return dis, action, log_probs, ent def learn(self, memory): states, next_states, actions = memory.expert_policy(self.batch_size) # actions = actions[0] # print("states ", states) self.state_action_frq(states, actions) self.get_action_prob(states, actions) self.compute_r_function(states, actions) return # compute difference between Q_shift and y_sh q_sh_value = self.q_shift_local(next_states, actions) y_sh = np.empty((self.batch_size, 1), dtype=np.float32) for idx, s in enumerate(next_states): q = [] for action in self.all_actions: q.append(Q_target(s.unsqueeze(0), action.unsqueeze(0))) q_max = max(q) np.copyto(y_sh[idx], q_max.detach().numpy()) y_sh = torch.Tensor(y_sh) y_sh *= self.gamma q_shift_loss = F.mse_loss(y_sh, q_shift_values) # Minimize the loss self.optimizer.zero_grad() q_shift_loss.backward() self.optimizer.step() #minimize MSE between pred Q and y = r'(s,a) + gama * max Q'(s',a) q_current = self.Q_local(states, actions) r_hat = self.R_target(states, actions) # use y_sh as target y_q = r_hat + y_sh q_loss = F.mse_loss(q_current, y_q) # Minimize the loss self.optimizer.zero_grad() q_loss.backward() self.optimizer.step() # get predicted reward r = self.R_local(states, actions) def state_action_frq(self, states, action): """ Train classifer to compute state action freq """ self.steps += 1 output = self.predicter(states) # create one hot encode y from actions y = action.type(torch.long) y = y.squeeze(1) loss = nn.CrossEntropyLoss()(output, y) self.optimizer_pre.zero_grad() loss.backward() self.optimizer_pre.step() self.writer.add_scalar('Predict_loss', loss, self.steps) def get_action_prob(self, states, actions, dim=False): """ """ if dim: output = self.predicter(states) action_prob = output.gather(1, 
actions.type(torch.long)) action_prob = torch.log(action_prob) return action_prob output = self.predicter(states) print("Output prob ", output) action_prob = output.gather(1, actions.type(torch.long)) print("action prob ", action_prob) action_prob = torch.log(action_prob) print("action prob ", action_prob) return action_prob def compute_r_function(self, states, actions): """ """ actions = actions.type(torch.float) y = self.R_local(states, actions) y_shift = self.q_shift_target(states, actions) y_r_part1 = self.get_action_prob(states, actions) - y_shift print("ratio ", self.ratio) # sum all other actions y_r_part2 = torch.empty((self.batch_size, 1), dtype=torch.float32) idx = 0 for a, s in zip(actions, states): y_h = 0 for b in self.all_actions: if torch.eq(a, b): continue print("diff ac ", b) r_hat = self.R_target(s.unsqueeze(0), b.unsqueeze(0)) n_b = self.get_action_prob(s.unsqueeze(0), b.unsqueeze(0), True) - self.q_shift_target( s.unsqueeze(0), b.unsqueeze(0)) y_h += (r_hat - n_b) y_h = self.ratio * y_h y_r_part2[idx] = y_h idx += 1 print("shape of r y ", y.shape) print("y r part 1 ", y_r_part1.shape) print("y r part 2 ", y_r_part2.shape)
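The statements after the early return in learn() above sketch the Q-shift update but reference a few undefined names; a self-contained version of that target computation, using the agent's own attributes where the snippet's names are ambiguous:

import torch
import torch.nn.functional as F

def q_shift_update(agent, states, next_states, actions):
    """Fit Q_shift towards gamma * max_a' Q_target(s', a') (sketch)."""
    q_sh_value = agent.q_shift_local(next_states, actions)
    with torch.no_grad():
        targets = []
        for s in next_states:
            q = [agent.Q_target(s.unsqueeze(0), a.unsqueeze(0)) for a in agent.all_actions]
            targets.append(torch.stack(q).max())
        y_sh = agent.gamma * torch.stack(targets).unsqueeze(1)
    loss = F.mse_loss(q_sh_value, y_sh)
    agent.optimizer_q_shift.zero_grad()
    loss.backward()
    agent.optimizer_q_shift.step()
    return loss.item()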
import random
from pprint import pprint

import numpy as np
from torch.utils.tensorboard import SummaryWriter

# TabularQFunction is the project's own tabular Q-value container and is
# assumed to be defined elsewhere in the repository.


class TabularQLearningAgent:
    """A tabular, epsilon-greedy Q-Learning agent."""

    def __init__(self, env, seed=None, lr=0.001, training_steps=10000,
                 final_epsilon=0.05, exploration_steps=10000, gamma=0.99,
                 verbose=True, **kwargs):
        # This implementation only works for flat actions
        assert env.flat_actions
        self.verbose = verbose
        if self.verbose:
            print("\nRunning Tabular Q-Learning with config:")
            pprint(locals())

        # set seeds
        self.seed = seed
        if self.seed is not None:
            np.random.seed(self.seed)

        # environment setup
        self.env = env
        self.num_actions = self.env.action_space.n
        self.obs_dim = self.env.observation_space.shape

        # logger setup
        self.logger = SummaryWriter()

        # training related attributes
        self.lr = lr
        self.exploration_steps = exploration_steps
        self.final_epsilon = final_epsilon
        self.epsilon_schedule = np.linspace(1.0, self.final_epsilon,
                                            self.exploration_steps)
        self.discount = gamma
        self.training_steps = training_steps
        self.steps_done = 0

        # Q-function
        self.qfunc = TabularQFunction(self.num_actions)

    def get_epsilon(self):
        if self.steps_done < self.exploration_steps:
            return self.epsilon_schedule[self.steps_done]
        return self.final_epsilon

    def get_egreedy_action(self, o, epsilon):
        if random.random() > epsilon:
            return self.qfunc.get_action(o)
        return random.randint(0, self.num_actions - 1)

    def optimize(self, s, a, next_s, r, done):
        # get q-value for the state and the action performed in that state
        q_vals_raw = self.qfunc.forward(s)
        q_val = q_vals_raw[a]

        # target q-value = reward + discounted max value of next state
        target_q_val = self.qfunc.forward(next_s).max()
        target = r + self.discount * (1 - done) * target_q_val

        # calculate error and update
        td_error = target - q_val
        td_delta = self.lr * td_error

        # optimize the model
        self.qfunc.update(s, a, td_delta)

        s_value = q_vals_raw.max()
        return td_error, s_value

    def train(self):
        if self.verbose:
            print("\nStarting training")

        num_episodes = 0
        training_steps_remaining = self.training_steps

        while self.steps_done < self.training_steps:
            ep_results = self.run_train_episode(training_steps_remaining)
            ep_return, ep_steps, goal = ep_results
            num_episodes += 1
            training_steps_remaining -= ep_steps

            self.logger.add_scalar("episode", num_episodes, self.steps_done)
            self.logger.add_scalar("epsilon", self.get_epsilon(), self.steps_done)
            self.logger.add_scalar("episode_return", ep_return, self.steps_done)
            self.logger.add_scalar("episode_steps", ep_steps, self.steps_done)
            self.logger.add_scalar("episode_goal_reached", int(goal), self.steps_done)

            if num_episodes % 10 == 0 and self.verbose:
                print(f"\nEpisode {num_episodes}:")
                print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
                print(f"\treturn = {ep_return}")
                print(f"\tgoal = {goal}")

        self.logger.close()
        if self.verbose:
            print("Training complete")
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")

    def run_train_episode(self, step_limit):
        s = self.env.reset()
        done = False

        steps = 0
        episode_return = 0

        while not done and steps < step_limit:
            a = self.get_egreedy_action(s, self.get_epsilon())
            next_s, r, done, _ = self.env.step(a)
            self.steps_done += 1
            td_error, s_value = self.optimize(s, a, next_s, r, done)
            self.logger.add_scalar("td_error", td_error, self.steps_done)
            self.logger.add_scalar("s_value", s_value, self.steps_done)

            s = next_s
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()

    def run_eval_episode(self, env=None, render=False, eval_epsilon=0.05,
                         render_mode="readable"):
        if env is None:
            env = self.env
        s = env.reset()
        done = False

        steps = 0
        episode_return = 0

        line_break = "=" * 60
        if render:
            print("\n" + line_break)
            print(f"Running EVALUATION using epsilon = {eval_epsilon:.4f}")
            print(line_break)
            env.render(render_mode)
            input("Initial state. Press enter to continue..")

        while not done:
            a = self.get_egreedy_action(s, eval_epsilon)
            next_s, r, done, _ = env.step(a)
            s = next_s
            episode_return += r
            steps += 1
            if render:
                print("\n" + line_break)
                print(f"Step {steps}")
                print(line_break)
                print(f"Action Performed = {env.action_space.get_action(a)}")
                env.render(render_mode)
                print(f"Reward = {r}")
                print(f"Done = {done}")
                input("Press enter to continue..")
                if done:
                    print("\n" + line_break)
                    print("EPISODE FINISHED")
                    print(line_break)
                    print(f"Goal reached = {env.goal_reached()}")
                    print(f"Total steps = {steps}")
                    print(f"Total reward = {episode_return}")

        return episode_return, steps, env.goal_reached()
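# --- Usage sketch (not part of the original code) ---------------------------
# A minimal example of running the tabular agent end to end, assuming an
# environment that satisfies the interface used above: a boolean flat_actions
# attribute, a gym-style reset/step API, goal_reached(), and render(mode).
# `make_env()` is a hypothetical factory, not something defined in this code.

if __name__ == "__main__":
    env = make_env()  # hypothetical; must expose flat_actions, goal_reached(), etc.
    agent = TabularQLearningAgent(env, seed=0, lr=0.001,
                                  training_steps=10000, gamma=0.99)
    agent.train()
    # Near-greedy evaluation with per-step rendering:
    agent.run_eval_episode(render=True, eval_epsilon=0.05)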